{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 3860, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 56.375, "epoch": 0.0025906735751295338, "grad_norm": 102.78443638392365, "kl": 0.0, "learning_rate": 9.99740932642487e-07, "loss": -0.0, "reward": 0.9124050736427307, "reward_std": 0.756333202123642, "rewards/format_reward_rec": 0.6875, "rewards/point_reward": 0.5686550736427307, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 56.5, "epoch": 0.0051813471502590676, "grad_norm": 9.448633977734644, "kl": 0.00024187564849853516, "learning_rate": 9.99481865284974e-07, "loss": 0.0, "reward": 1.4197708368301392, "reward_std": 0.5856578946113586, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 0.9510208964347839, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 33.375, "epoch": 0.007772020725388601, "grad_norm": 46.44795961542426, "kl": 0.0009088516235351562, "learning_rate": 9.992227979274612e-07, "loss": 0.0, "reward": 0.8207422494888306, "reward_std": 0.6994703561067581, "rewards/format_reward_rec": 0.75, "rewards/point_reward": 0.44574226438999176, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 0.010362694300518135, "grad_norm": 22.039562329707408, "kl": 0.00015461444854736328, "learning_rate": 9.989637305699482e-07, "loss": 0.0, "reward": 1.768720269203186, "reward_std": 0.49991145730018616, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.2999702095985413, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 0.012953367875647668, "grad_norm": 7.433242298010772, "kl": 0.0002903938293457031, "learning_rate": 9.987046632124352e-07, "loss": 0.0, "reward": 1.7803658843040466, "reward_std": 0.636418879032135, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.3116158843040466, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.015544041450777202, "grad_norm": 20.83373786370868, "kl": 0.0012612342834472656, "learning_rate": 9.984455958549224e-07, "loss": 0.0, "reward": 1.7586567997932434, "reward_std": 0.6922780275344849, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.2899067997932434, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.018134715025906734, "grad_norm": 28.683603051863027, "kl": 0.0006504058837890625, "learning_rate": 9.981865284974092e-07, "loss": 0.0, "reward": 1.5135871767997742, "reward_std": 0.5269960463047028, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.0448371767997742, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.02072538860103627, "grad_norm": 10.128159772588388, "kl": 0.00034332275390625, "learning_rate": 9.979274611398964e-07, "loss": 0.0, "reward": 1.9344244003295898, "reward_std": 0.6239902973175049, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4344244003295898, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.023316062176165803, "grad_norm": 36.03327479266432, "kl": 0.0025310516357421875, "learning_rate": 9.976683937823834e-07, "loss": 0.0, "reward": 1.598103404045105, "reward_std": 0.32715106941759586, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.098103404045105, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 0.025906735751295335, "grad_norm": 13.470375613885656, "kl": 0.015668869018554688, "learning_rate": 9.974093264248704e-07, "loss": 0.0001, "reward": 1.4637945890426636, "reward_std": 0.2723078988492489, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.963794618844986, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 0.02849740932642487, "grad_norm": 20.677556302120895, "kl": 0.003204345703125, "learning_rate": 9.971502590673576e-07, "loss": 0.0, "reward": 1.8317521810531616, "reward_std": 0.7209424078464508, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3317522406578064, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 0.031088082901554404, "grad_norm": 21.87278738790615, "kl": 0.004302978515625, "learning_rate": 9.968911917098446e-07, "loss": 0.0, "reward": 1.686286211013794, "reward_std": 0.8876461386680603, "rewards/format_reward_rec": 0.875, "rewards/point_reward": 1.248786211013794, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 38.0, "epoch": 0.03367875647668394, "grad_norm": 27.3130645702305, "kl": 0.00260162353515625, "learning_rate": 9.966321243523316e-07, "loss": 0.0, "reward": 1.3790531754493713, "reward_std": 0.44522781670093536, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 0.9103032350540161, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 0.03626943005181347, "grad_norm": 9.015755735529073, "kl": 0.002445220947265625, "learning_rate": 9.963730569948186e-07, "loss": 0.0, "reward": 1.4819737672805786, "reward_std": 0.5076836049556732, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9819737374782562, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.038860103626943004, "grad_norm": 27.95356340106226, "kl": 0.006580352783203125, "learning_rate": 9.961139896373056e-07, "loss": 0.0, "reward": 1.83109050989151, "reward_std": 0.6424896717071533, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.33109050989151, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 0.04145077720207254, "grad_norm": 19.368134071735668, "kl": 0.0023174285888671875, "learning_rate": 9.958549222797928e-07, "loss": 0.0, "reward": 1.7067553997039795, "reward_std": 0.7389237880706787, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.2380053997039795, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 0.04404145077720207, "grad_norm": 18.241894673479052, "kl": 0.0026702880859375, "learning_rate": 9.955958549222798e-07, "loss": 0.0, "reward": 1.6574268341064453, "reward_std": 0.5008782595396042, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.1886768341064453, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.046632124352331605, "grad_norm": 9.540356216906547, "kl": 0.001827239990234375, "learning_rate": 9.953367875647668e-07, "loss": 0.0, "reward": 1.6734890937805176, "reward_std": 0.3350725844502449, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1734891831874847, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.04922279792746114, "grad_norm": 5.826187158620456, "kl": 0.0049591064453125, "learning_rate": 9.950777202072538e-07, "loss": -0.0004, "reward": 2.374372363090515, "reward_std": 0.23256518263588077, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8743725419044495, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.05181347150259067, "grad_norm": 22.95343976117933, "kl": 0.0100860595703125, "learning_rate": 9.948186528497408e-07, "loss": 0.0, "reward": 1.301570177078247, "reward_std": 0.42649850249290466, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 0.8328201770782471, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 0.054404145077720206, "grad_norm": 6.772878748156653, "kl": 0.00823974609375, "learning_rate": 9.94559585492228e-07, "loss": 0.0, "reward": 1.33074951171875, "reward_std": 0.3832203894853592, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8307496011257172, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 39.4375, "epoch": 0.05699481865284974, "grad_norm": 10.297679199454947, "kl": 0.00865936279296875, "learning_rate": 9.94300518134715e-07, "loss": 0.0, "reward": 1.5798376202583313, "reward_std": 0.4465784877538681, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0798377394676208, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 0.05958549222797927, "grad_norm": 13.551649937641264, "kl": 0.0035247802734375, "learning_rate": 9.94041450777202e-07, "loss": 0.0, "reward": 1.4647013545036316, "reward_std": 0.04671426024287939, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9647012948989868, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.06217616580310881, "grad_norm": 10.228081761806076, "kl": 0.0051422119140625, "learning_rate": 9.937823834196892e-07, "loss": 0.0, "reward": 2.090973377227783, "reward_std": 0.5235144793987274, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5909733176231384, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 0.06476683937823834, "grad_norm": 12.670354401567412, "kl": 0.01373291015625, "learning_rate": 9.93523316062176e-07, "loss": 0.0001, "reward": 1.5279790163040161, "reward_std": 0.34837983548641205, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0279790163040161, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.06735751295336788, "grad_norm": 85.73393583985431, "kl": 0.0098876953125, "learning_rate": 9.932642487046632e-07, "loss": 0.0, "reward": 1.7200268507003784, "reward_std": 0.5373799502849579, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.220026820898056, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 0.06994818652849741, "grad_norm": 20.70548425239945, "kl": 0.03607940673828125, "learning_rate": 9.930051813471502e-07, "loss": 0.0001, "reward": 2.3122777938842773, "reward_std": 0.34807332935633895, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122778534889221, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.07253886010362694, "grad_norm": 5.908075087974139, "kl": 0.009857177734375, "learning_rate": 9.927461139896372e-07, "loss": -0.0001, "reward": 2.2271978855133057, "reward_std": 0.294845873087354, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7271978855133057, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.07512953367875648, "grad_norm": 3.9199487254359524, "kl": 0.00604248046875, "learning_rate": 9.924870466321244e-07, "loss": 0.0002, "reward": 1.9747270345687866, "reward_std": 0.028876740972918924, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4747270345687866, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.07772020725388601, "grad_norm": 65.80625397311022, "kl": 0.021881103515625, "learning_rate": 9.922279792746114e-07, "loss": 0.0001, "reward": 1.6788696646690369, "reward_std": 0.48825588822364807, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1788696646690369, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 42.25, "epoch": 0.08031088082901554, "grad_norm": 12.073697470308701, "kl": 0.051910400390625, "learning_rate": 9.919689119170984e-07, "loss": 0.0002, "reward": 1.504340648651123, "reward_std": 0.7002298533916473, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.0355906188488007, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.08290155440414508, "grad_norm": 13.663647643926925, "kl": 0.0283203125, "learning_rate": 9.917098445595854e-07, "loss": 0.0001, "reward": 1.8799924850463867, "reward_std": 0.5280212461948395, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3799926042556763, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.08549222797927461, "grad_norm": 13.1644848063147, "kl": 0.010101318359375, "learning_rate": 9.914507772020724e-07, "loss": 0.0, "reward": 2.283276081085205, "reward_std": 0.4688723385334015, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7832762002944946, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 33.6875, "epoch": 0.08808290155440414, "grad_norm": 3.916899897262447, "kl": 0.0069026947021484375, "learning_rate": 9.911917098445596e-07, "loss": -0.0007, "reward": 1.9066423177719116, "reward_std": 0.1210946045447372, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4066424369812012, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.09067357512953368, "grad_norm": 32.54094334602466, "kl": 0.0145263671875, "learning_rate": 9.909326424870466e-07, "loss": 0.0008, "reward": 2.2481777667999268, "reward_std": 0.2691532947064843, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7481777667999268, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.09326424870466321, "grad_norm": 16.203970112412247, "kl": 0.0537109375, "learning_rate": 9.906735751295336e-07, "loss": 0.0002, "reward": 2.374782085418701, "reward_std": 0.3541204035282135, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747820258140564, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 0.09585492227979274, "grad_norm": 7.346267303565529, "kl": 0.02984619140625, "learning_rate": 9.904145077720206e-07, "loss": 0.0001, "reward": 1.9984049797058105, "reward_std": 0.4639572650194168, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984049201011658, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 0.09844559585492228, "grad_norm": 5.458969176953973, "kl": 0.06634521484375, "learning_rate": 9.901554404145076e-07, "loss": 0.0003, "reward": 2.358880877494812, "reward_std": 0.26305398192755547, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8588809370994568, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 39.5, "epoch": 0.10103626943005181, "grad_norm": 6.3765801392331944, "kl": 0.0377197265625, "learning_rate": 9.898963730569949e-07, "loss": 0.0002, "reward": 2.3735098838806152, "reward_std": 0.35733506083488464, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.87350994348526, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 0.10362694300518134, "grad_norm": 6.168304150936274, "kl": 0.0550689697265625, "learning_rate": 9.896373056994819e-07, "loss": -0.0004, "reward": 2.3628244400024414, "reward_std": 0.2539816516989504, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.862824559211731, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.10621761658031088, "grad_norm": 10.706190009666566, "kl": 0.030609130859375, "learning_rate": 9.893782383419688e-07, "loss": 0.0001, "reward": 2.1231403946876526, "reward_std": 0.5498570650815964, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6231403946876526, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.10880829015544041, "grad_norm": 29.043125215673257, "kl": 0.0101776123046875, "learning_rate": 9.89119170984456e-07, "loss": -0.0006, "reward": 2.148629665374756, "reward_std": 0.2971586900384864, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6486298441886902, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.11139896373056994, "grad_norm": 8.490661241148981, "kl": 0.0391845703125, "learning_rate": 9.888601036269428e-07, "loss": 0.0002, "reward": 1.9060485363006592, "reward_std": 0.20844853296875954, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.406048595905304, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.11398963730569948, "grad_norm": 9.724094268852028, "kl": 0.021392822265625, "learning_rate": 9.8860103626943e-07, "loss": 0.0009, "reward": 2.056355655193329, "reward_std": 0.17936077756348823, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5563557147979736, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.11658031088082901, "grad_norm": 15.685740211027266, "kl": 0.05206298828125, "learning_rate": 9.88341968911917e-07, "loss": 0.0002, "reward": 2.121419072151184, "reward_std": 0.43889446556568146, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6214191913604736, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.11917098445595854, "grad_norm": 11.953779879531842, "kl": 0.03326416015625, "learning_rate": 9.88082901554404e-07, "loss": 0.0001, "reward": 1.9242934584617615, "reward_std": 0.1779791955195833, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.424293577671051, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 0.12176165803108809, "grad_norm": 17.840764937299017, "kl": 0.0562744140625, "learning_rate": 9.878238341968913e-07, "loss": 0.0002, "reward": 1.5369911193847656, "reward_std": 0.19212525617331266, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0369911193847656, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 0.12435233160621761, "grad_norm": 65.66576263295809, "kl": 0.0738525390625, "learning_rate": 9.875647668393783e-07, "loss": 0.0003, "reward": 1.8490688800811768, "reward_std": 0.2483662785962224, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3490687608718872, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 0.12694300518134716, "grad_norm": 21.433893109599417, "kl": 0.074462890625, "learning_rate": 9.873056994818653e-07, "loss": 0.0003, "reward": 1.8652429580688477, "reward_std": 0.6188614070415497, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3652429580688477, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 0.12953367875647667, "grad_norm": 17.295195700121205, "kl": 0.02862548828125, "learning_rate": 9.870466321243523e-07, "loss": 0.0001, "reward": 1.694675326347351, "reward_std": 0.4057391732931137, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1946754157543182, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 0.13212435233160622, "grad_norm": 7.316841823241248, "kl": 0.011077880859375, "learning_rate": 9.867875647668393e-07, "loss": -0.0003, "reward": 1.8684223890304565, "reward_std": 0.003720223583513871, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3684225678443909, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.13471502590673576, "grad_norm": 0.4917079772357874, "kl": 0.016571044921875, "learning_rate": 9.865284974093265e-07, "loss": -0.0, "reward": 2.4999821186065674, "reward_std": 1.0357146493333858e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982237815857, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 38.875, "epoch": 0.13730569948186527, "grad_norm": 9.978800421940269, "kl": 0.09149169921875, "learning_rate": 9.862694300518135e-07, "loss": 0.0001, "reward": 2.249968647956848, "reward_std": 0.2672910816257854, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499686479568481, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 0.13989637305699482, "grad_norm": 8.84133750825491, "kl": 0.010711669921875, "learning_rate": 9.860103626943005e-07, "loss": 0.0001, "reward": 2.000650644302368, "reward_std": 0.2072200019902084, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5006505250930786, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.14248704663212436, "grad_norm": 4.771244705555047, "kl": 0.02789306640625, "learning_rate": 9.857512953367875e-07, "loss": 0.0008, "reward": 2.499993324279785, "reward_std": 3.1860309945841436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.14507772020725387, "grad_norm": 3.844994525557949, "kl": 0.03631591796875, "learning_rate": 9.854922279792745e-07, "loss": 0.0008, "reward": 2.4991708993911743, "reward_std": 0.00035420662584328966, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9991707801818848, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 0.14766839378238342, "grad_norm": 16.06769473398353, "kl": 0.03460693359375, "learning_rate": 9.852331606217617e-07, "loss": 0.0001, "reward": 2.1552769541740417, "reward_std": 0.29821854834733585, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6552769541740417, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.15025906735751296, "grad_norm": 1.2411027099249696, "kl": 0.07916259765625, "learning_rate": 9.849740932642487e-07, "loss": -0.0004, "reward": 2.4999871253967285, "reward_std": 3.0301206493277277e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999873042106628, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.15284974093264247, "grad_norm": 4.491392544341068, "kl": 0.03009033203125, "learning_rate": 9.847150259067357e-07, "loss": 0.0009, "reward": 1.997864007949829, "reward_std": 0.00034692228433641503, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978638589382172, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.15544041450777202, "grad_norm": 0.23798557165187859, "kl": 0.0177001953125, "learning_rate": 9.844559585492227e-07, "loss": -0.0007, "reward": 2.4999938011169434, "reward_std": 5.458761336285534e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 0.15803108808290156, "grad_norm": 39.10769476863789, "kl": 0.0384521484375, "learning_rate": 9.841968911917097e-07, "loss": 0.0002, "reward": 1.3119145035743713, "reward_std": 0.37560027092695236, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8119145631790161, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.16062176165803108, "grad_norm": 18.33334629217925, "kl": 0.0511474609375, "learning_rate": 9.83937823834197e-07, "loss": 0.0005, "reward": 1.835119366645813, "reward_std": 0.06800985410086469, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.335119366645813, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 0.16321243523316062, "grad_norm": 7.985533496360222, "kl": 0.085205078125, "learning_rate": 9.83678756476684e-07, "loss": 0.0012, "reward": 2.4024648666381836, "reward_std": 0.2758014367409487, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9024646878242493, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.16580310880829016, "grad_norm": 10.328464049853949, "kl": 0.0159912109375, "learning_rate": 9.83419689119171e-07, "loss": 0.0002, "reward": 2.4373693466186523, "reward_std": 0.17690205269900616, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373695254325867, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.16839378238341968, "grad_norm": 3.737569530507125, "kl": 0.010406494140625, "learning_rate": 9.831606217616581e-07, "loss": -0.0005, "reward": 2.4998233318328857, "reward_std": 0.00017982780809688848, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998233318328857, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.17098445595854922, "grad_norm": 35.09194181457652, "kl": 0.08062744140625, "learning_rate": 9.829015544041451e-07, "loss": 0.0003, "reward": 1.2884096503257751, "reward_std": 0.18556920439004898, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7884096801280975, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.17357512953367876, "grad_norm": 4.4742606069747985, "kl": 0.02557373046875, "learning_rate": 9.826424870466321e-07, "loss": 0.0002, "reward": 2.0064504742622375, "reward_std": 0.22039345638245322, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5064502954483032, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.17616580310880828, "grad_norm": 13.738664040449144, "kl": 0.09814453125, "learning_rate": 9.823834196891191e-07, "loss": 0.0004, "reward": 2.2496520280838013, "reward_std": 0.4632646292448044, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7496519684791565, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.17875647668393782, "grad_norm": 1.0286939436763218, "kl": 0.031524658203125, "learning_rate": 9.821243523316061e-07, "loss": -0.0, "reward": 2.499985098838806, "reward_std": 8.088442243092686e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999852180480957, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.18134715025906736, "grad_norm": 13.372342861956389, "kl": 0.06988525390625, "learning_rate": 9.818652849740933e-07, "loss": 0.0003, "reward": 2.3748332262039185, "reward_std": 0.3540114015340805, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748334050178528, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.18393782383419688, "grad_norm": 9.859774379863193, "kl": 0.144287109375, "learning_rate": 9.816062176165803e-07, "loss": 0.0011, "reward": 2.4373730421066284, "reward_std": 0.17707084878566093, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373729825019836, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.18652849740932642, "grad_norm": 17.217347767832006, "kl": 0.042816162109375, "learning_rate": 9.813471502590673e-07, "loss": 0.0001, "reward": 2.4374468326568604, "reward_std": 0.17688091165473452, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374468922615051, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 0.18911917098445596, "grad_norm": 80.46695086162809, "kl": 0.03826904296875, "learning_rate": 9.810880829015543e-07, "loss": 0.0002, "reward": 1.610620379447937, "reward_std": 0.32785288244485855, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.110620379447937, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.19170984455958548, "grad_norm": 2.5897332971003006, "kl": 0.02685546875, "learning_rate": 9.808290155440413e-07, "loss": 0.0003, "reward": 2.4999903440475464, "reward_std": 7.873922641010722e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.19430051813471502, "grad_norm": 50.61743007949566, "kl": 0.0472412109375, "learning_rate": 9.805699481865285e-07, "loss": 0.0002, "reward": 1.435221254825592, "reward_std": 0.14075116362801054, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9352213144302368, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.19689119170984457, "grad_norm": 10.481833772251237, "kl": 0.02294921875, "learning_rate": 9.803108808290155e-07, "loss": 0.0011, "reward": 1.998408854007721, "reward_std": 0.00015119295005661115, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498408854007721, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.19948186528497408, "grad_norm": 12.730467915995241, "kl": 0.1314697265625, "learning_rate": 9.800518134715025e-07, "loss": 0.0011, "reward": 1.998818814754486, "reward_std": 2.660337872839591e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988188743591309, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.20207253886010362, "grad_norm": 11.814774431735474, "kl": 0.02392578125, "learning_rate": 9.797927461139895e-07, "loss": 0.0007, "reward": 1.9988856315612793, "reward_std": 0.0004637845554498199, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988854825496674, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.20466321243523317, "grad_norm": 11.32405188454657, "kl": 0.108642578125, "learning_rate": 9.795336787564765e-07, "loss": 0.0011, "reward": 2.1285080909729004, "reward_std": 0.3215496615994198, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6285080909729004, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 0.20725388601036268, "grad_norm": 93.06758798781671, "kl": 0.0654296875, "learning_rate": 9.792746113989637e-07, "loss": -0.0001, "reward": 1.872558832168579, "reward_std": 0.11530058777498198, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3725588917732239, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 34.375, "epoch": 0.20984455958549222, "grad_norm": 10.648000849028326, "kl": 0.31884765625, "learning_rate": 9.790155440414507e-07, "loss": 0.0013, "reward": 1.8860008716583252, "reward_std": 0.45801009237766266, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.38600093126297, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.21243523316062177, "grad_norm": 13.66639702733694, "kl": 0.0579833984375, "learning_rate": 9.787564766839377e-07, "loss": -0.0001, "reward": 2.437170624732971, "reward_std": 0.17767984258659908, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9371704459190369, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.21502590673575128, "grad_norm": 14.383985101842805, "kl": 0.031982421875, "learning_rate": 9.784974093264247e-07, "loss": 0.0001, "reward": 2.0623351335525513, "reward_std": 0.1768428385257721, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5623350143432617, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.21761658031088082, "grad_norm": 0.1389676875098764, "kl": 0.036376953125, "learning_rate": 9.78238341968912e-07, "loss": -0.0013, "reward": 2.499998092651367, "reward_std": 1.0862225678920367e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.22020725388601037, "grad_norm": 9.77432571226228, "kl": 0.0509033203125, "learning_rate": 9.77979274611399e-07, "loss": 0.0006, "reward": 2.4998271465301514, "reward_std": 0.00018300341616850346, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998271465301514, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.22279792746113988, "grad_norm": 0.33833466598602246, "kl": 0.073486328125, "learning_rate": 9.77720207253886e-07, "loss": -0.0004, "reward": 2.4999457597732544, "reward_std": 2.931680228357436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999458193778992, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.22538860103626943, "grad_norm": 0.28784132781290656, "kl": 0.042236328125, "learning_rate": 9.77461139896373e-07, "loss": 0.0006, "reward": 2.499996781349182, "reward_std": 2.5204562916769646e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 0.22797927461139897, "grad_norm": 58.73064903983638, "kl": 0.15966796875, "learning_rate": 9.772020725388602e-07, "loss": 0.0006, "reward": 1.5121837854385376, "reward_std": 0.386112280189991, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0121837854385376, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.23056994818652848, "grad_norm": 55.61984201254667, "kl": 0.12158203125, "learning_rate": 9.769430051813472e-07, "loss": 0.0004, "reward": 1.942750096321106, "reward_std": 0.041752650431590155, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4427500069141388, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.23316062176165803, "grad_norm": 10.53859501254366, "kl": 0.157958984375, "learning_rate": 9.766839378238342e-07, "loss": 0.0006, "reward": 1.5449185967445374, "reward_std": 0.21881897374987602, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.044918715953827, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.23575129533678757, "grad_norm": 80.09334191534732, "kl": 0.175537109375, "learning_rate": 9.764248704663212e-07, "loss": -0.0002, "reward": 1.9802252650260925, "reward_std": 0.016688236203890483, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4802253544330597, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.23834196891191708, "grad_norm": 7.657382761835424, "kl": 0.093994140625, "learning_rate": 9.761658031088082e-07, "loss": 0.0004, "reward": 1.7322908639907837, "reward_std": 0.14614096486081962, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2322908788919449, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.24093264248704663, "grad_norm": 5.257843085920056, "kl": 0.0897216796875, "learning_rate": 9.759067357512954e-07, "loss": 0.0003, "reward": 2.499901533126831, "reward_std": 0.0001717846425890457, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999014735221863, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 0.24352331606217617, "grad_norm": 15.26643561770162, "kl": 0.0888671875, "learning_rate": 9.756476683937824e-07, "loss": 0.0005, "reward": 2.3111801147460938, "reward_std": 0.2606041412608988, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.811180055141449, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 41.1875, "epoch": 0.24611398963730569, "grad_norm": 1.3490938995303823, "kl": 0.0394287109375, "learning_rate": 9.753886010362694e-07, "loss": 0.0007, "reward": 2.4999839067459106, "reward_std": 9.999583255648758e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999837279319763, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 0.24870466321243523, "grad_norm": 59.022090006407026, "kl": 0.0853271484375, "learning_rate": 9.751295336787564e-07, "loss": 0.0003, "reward": 2.0621442794799805, "reward_std": 0.4084620773792267, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562144160270691, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.25129533678756477, "grad_norm": 12.294296583702906, "kl": 0.116943359375, "learning_rate": 9.748704663212434e-07, "loss": 0.0012, "reward": 1.9784838557243347, "reward_std": 0.003630782363416074, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4784838557243347, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 0.2538860103626943, "grad_norm": 21.1199053214945, "kl": 0.2490234375, "learning_rate": 9.746113989637306e-07, "loss": 0.0014, "reward": 2.1795830726623535, "reward_std": 0.26560253942892587, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6795830130577087, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 0.25647668393782386, "grad_norm": 33.10002746999616, "kl": 0.394287109375, "learning_rate": 9.743523316062176e-07, "loss": 0.0017, "reward": 2.320215582847595, "reward_std": 0.3332075335565605, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8202155232429504, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.25906735751295334, "grad_norm": 189.16592808724997, "kl": 0.1123046875, "learning_rate": 9.740932642487046e-07, "loss": 0.0005, "reward": 2.4200730323791504, "reward_std": 0.22603996885300148, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9200730323791504, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.2616580310880829, "grad_norm": 1.495490033471356, "kl": 0.039306640625, "learning_rate": 9.738341968911916e-07, "loss": -0.0004, "reward": 2.4999409914016724, "reward_std": 1.6437497833976522e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999411702156067, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.26424870466321243, "grad_norm": 10.65546773504425, "kl": 0.031829833984375, "learning_rate": 9.735751295336788e-07, "loss": -0.0006, "reward": 2.499818205833435, "reward_std": 6.320671693060831e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998184442520142, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.266839378238342, "grad_norm": 82.19765220899964, "kl": 0.083984375, "learning_rate": 9.733160621761658e-07, "loss": 0.001, "reward": 2.3058135509490967, "reward_std": 0.268088152033215, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8058134317398071, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.2694300518134715, "grad_norm": 5.089169033737183, "kl": 0.089111328125, "learning_rate": 9.730569948186528e-07, "loss": 0.001, "reward": 1.9871313571929932, "reward_std": 7.813290170588516e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4871313273906708, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.27202072538860106, "grad_norm": 4.5539652227793495, "kl": 0.10791015625, "learning_rate": 9.727979274611398e-07, "loss": 0.0006, "reward": 1.769575834274292, "reward_std": 0.00033584438506295555, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.269575834274292, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 0.27461139896373055, "grad_norm": 25.004526255440048, "kl": 0.0772705078125, "learning_rate": 9.725388601036268e-07, "loss": 0.0003, "reward": 2.0617693662643433, "reward_std": 0.6375812590122223, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5617693066596985, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.2772020725388601, "grad_norm": 19.925211971061326, "kl": 0.079833984375, "learning_rate": 9.72279792746114e-07, "loss": -0.0002, "reward": 1.9989428520202637, "reward_std": 0.0004973846025677631, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498943030834198, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.27979274611398963, "grad_norm": 14.317061568250736, "kl": 0.095703125, "learning_rate": 9.72020725388601e-07, "loss": -0.0, "reward": 2.499958872795105, "reward_std": 1.515552415298771e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999588131904602, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.2823834196891192, "grad_norm": 20.40592105569183, "kl": 0.058258056640625, "learning_rate": 9.71761658031088e-07, "loss": 0.0008, "reward": 1.8940714597702026, "reward_std": 0.24569877097846415, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3940714597702026, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.2849740932642487, "grad_norm": 230.18149524419584, "kl": 0.0628662109375, "learning_rate": 9.71502590673575e-07, "loss": 0.0003, "reward": 1.7844058871269226, "reward_std": 0.19000897742807865, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.284405767917633, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.28756476683937826, "grad_norm": 73.15143001154128, "kl": 0.423095703125, "learning_rate": 9.712435233160622e-07, "loss": 0.0017, "reward": 0.9695435762405396, "reward_std": 0.1753087192773819, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.4695435240864754, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.29015544041450775, "grad_norm": 3.725501941307994, "kl": 0.1109619140625, "learning_rate": 9.709844559585492e-07, "loss": 0.0008, "reward": 1.9989084005355835, "reward_std": 0.00028391299019858707, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989084005355835, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 0.2927461139896373, "grad_norm": 28.638828428541117, "kl": 0.09619140625, "learning_rate": 9.707253886010362e-07, "loss": 0.0004, "reward": 1.8261573910713196, "reward_std": 0.2956121563911438, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3261573910713196, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.29533678756476683, "grad_norm": 5.560865105392929, "kl": 0.134765625, "learning_rate": 9.704663212435232e-07, "loss": 0.0012, "reward": 1.998631477355957, "reward_std": 0.0001192707695736317, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498631328344345, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.2979274611398964, "grad_norm": 0.137458738940309, "kl": 0.04571533203125, "learning_rate": 9.702072538860102e-07, "loss": -0.0003, "reward": 2.4999951124191284, "reward_std": 2.0688578672434232e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.3005181347150259, "grad_norm": 0.10096002483151793, "kl": 0.0291748046875, "learning_rate": 9.699481865284974e-07, "loss": 0.0007, "reward": 2.4999988079071045, "reward_std": 1.0496823961148039e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 0.30310880829015546, "grad_norm": 40.866315923761796, "kl": 0.0638427734375, "learning_rate": 9.696891191709844e-07, "loss": 0.0004, "reward": 1.9141007661819458, "reward_std": 0.06377019267540618, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4141008257865906, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.30569948186528495, "grad_norm": 29.267532103833616, "kl": 0.07904052734375, "learning_rate": 9.694300518134714e-07, "loss": 0.0, "reward": 1.7782161235809326, "reward_std": 0.011168545060229462, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2782160639762878, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.3082901554404145, "grad_norm": 3.2452592539248033, "kl": 0.079345703125, "learning_rate": 9.691709844559584e-07, "loss": 0.0007, "reward": 2.4999226331710815, "reward_std": 8.360885274782959e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999224543571472, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.31088082901554404, "grad_norm": 3.3701739106858293, "kl": 0.095703125, "learning_rate": 9.689119170984456e-07, "loss": 0.0016, "reward": 2.4999860525131226, "reward_std": 1.803977755798769e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999858736991882, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.3134715025906736, "grad_norm": 0.4306910685004529, "kl": 0.0399169921875, "learning_rate": 9.686528497409326e-07, "loss": 0.0014, "reward": 2.499981164932251, "reward_std": 5.114542318551685e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999981164932251, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 0.3160621761658031, "grad_norm": 0.6165614525019705, "kl": 0.0792236328125, "learning_rate": 9.683937823834196e-07, "loss": 0.0001, "reward": 2.499987483024597, "reward_std": 8.934968491303152e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999873638153076, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.31865284974093266, "grad_norm": 43.78654189917861, "kl": 0.0859375, "learning_rate": 9.681347150259066e-07, "loss": 0.0003, "reward": 2.4349666833877563, "reward_std": 0.17870346421841532, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9349667429924011, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.32124352331606215, "grad_norm": 9.576574378863842, "kl": 0.069091796875, "learning_rate": 9.678756476683936e-07, "loss": -0.0006, "reward": 1.786772608757019, "reward_std": 0.008558189496397972, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2867727875709534, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.3238341968911917, "grad_norm": 0.5672974342919754, "kl": 0.109375, "learning_rate": 9.676165803108809e-07, "loss": -0.0003, "reward": 2.4999914169311523, "reward_std": 3.607425810514542e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914765357971, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.32642487046632124, "grad_norm": 0.45849603087536406, "kl": 0.0281829833984375, "learning_rate": 9.673575129533679e-07, "loss": 0.0001, "reward": 2.499946355819702, "reward_std": 6.464975044195853e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999463558197021, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.3290155440414508, "grad_norm": 0.10684558291271581, "kl": 0.0191650390625, "learning_rate": 9.670984455958549e-07, "loss": 0.0004, "reward": 2.499997854232788, "reward_std": 9.635996036649885e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.3316062176165803, "grad_norm": 1.6672301678592432, "kl": 0.0277099609375, "learning_rate": 9.668393782383419e-07, "loss": -0.0007, "reward": 2.4999799728393555, "reward_std": 1.7622547019868762e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999802112579346, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.33419689119170987, "grad_norm": 14.564917648332004, "kl": 0.06689453125, "learning_rate": 9.665803108808289e-07, "loss": 0.0003, "reward": 2.4364627599716187, "reward_std": 0.17969927358228688, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9364628195762634, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.33678756476683935, "grad_norm": 0.21128881747789507, "kl": 0.024688720703125, "learning_rate": 9.66321243523316e-07, "loss": -0.0, "reward": 2.4999969005584717, "reward_std": 1.539935226446687e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.3393782383419689, "grad_norm": 0.1661997432961643, "kl": 0.041259765625, "learning_rate": 9.66062176165803e-07, "loss": 0.0006, "reward": 2.49999737739563, "reward_std": 1.0921688158305187e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.34196891191709844, "grad_norm": 11.077352201415549, "kl": 0.09423828125, "learning_rate": 9.6580310880829e-07, "loss": 0.0001, "reward": 2.40717351436615, "reward_std": 0.2625342948043681, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.90717351436615, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 0.344559585492228, "grad_norm": 43.09364834527359, "kl": 0.060546875, "learning_rate": 9.655440414507773e-07, "loss": 0.0008, "reward": 1.7857298254966736, "reward_std": 0.000609212564313566, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2857298254966736, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.3471502590673575, "grad_norm": 13.739199546064242, "kl": 0.06396484375, "learning_rate": 9.652849740932643e-07, "loss": 0.0002, "reward": 1.487752914428711, "reward_std": 0.0015800511628185632, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9877530038356781, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.34974093264248707, "grad_norm": 0.9475437626482274, "kl": 0.08642578125, "learning_rate": 9.650259067357513e-07, "loss": 0.0002, "reward": 2.4999953508377075, "reward_std": 4.093372297120368e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.35233160621761656, "grad_norm": 0.08830749903842879, "kl": 0.0357666015625, "learning_rate": 9.647668393782383e-07, "loss": 0.001, "reward": 2.499996304512024, "reward_std": 1.367066943203099e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.3549222797927461, "grad_norm": 16.296471141831603, "kl": 0.0745849609375, "learning_rate": 9.645077720207253e-07, "loss": 0.0006, "reward": 1.8740895986557007, "reward_std": 0.23158495993175165, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3740895986557007, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.35751295336787564, "grad_norm": 1.4798000022055835, "kl": 0.02935791015625, "learning_rate": 9.642487046632125e-07, "loss": -0.0002, "reward": 2.4999786615371704, "reward_std": 2.0193642058075056e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999788403511047, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.3601036269430052, "grad_norm": 22.08781398117867, "kl": 0.123779296875, "learning_rate": 9.639896373056995e-07, "loss": 0.0011, "reward": 2.4374067783355713, "reward_std": 0.17703431779892753, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374065399169922, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.3626943005181347, "grad_norm": 10.824279159577653, "kl": 0.041748046875, "learning_rate": 9.637305699481865e-07, "loss": 0.0011, "reward": 2.499959945678711, "reward_std": 4.6924909895551536e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999959647655487, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.36528497409326427, "grad_norm": 2.8302928857015397, "kl": 0.0526123046875, "learning_rate": 9.634715025906735e-07, "loss": 0.0009, "reward": 1.997397243976593, "reward_std": 2.503079991811319e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4973973035812378, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.36787564766839376, "grad_norm": 2.284220801583633, "kl": 0.04046630859375, "learning_rate": 9.632124352331605e-07, "loss": 0.0009, "reward": 2.4998464584350586, "reward_std": 1.869342008831154e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998464584350586, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.3704663212435233, "grad_norm": 4.851253192919188, "kl": 0.0435791015625, "learning_rate": 9.629533678756477e-07, "loss": -0.0001, "reward": 1.9625054001808167, "reward_std": 0.269062704847741, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.462505429983139, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.37305699481865284, "grad_norm": 8.323749221884357, "kl": 0.045135498046875, "learning_rate": 9.626943005181347e-07, "loss": -0.0006, "reward": 2.4999040365219116, "reward_std": 0.00011651427837477968, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999040365219116, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.3756476683937824, "grad_norm": 62.99828543989011, "kl": 0.1173095703125, "learning_rate": 9.624352331606217e-07, "loss": 0.0006, "reward": 2.370848774909973, "reward_std": 0.2392801402755822, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.870848834514618, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.37823834196891193, "grad_norm": 35.45704518430419, "kl": 0.1943359375, "learning_rate": 9.621761658031087e-07, "loss": 0.0009, "reward": 1.332327127456665, "reward_std": 0.03815057013707701, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8323271870613098, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.38082901554404147, "grad_norm": 15.83723257614323, "kl": 0.140869140625, "learning_rate": 9.619170984455957e-07, "loss": 0.0006, "reward": 2.0642071962356567, "reward_std": 0.5120111899973949, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.5954571962356567, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.38341968911917096, "grad_norm": 2.1547226634750576, "kl": 0.06689453125, "learning_rate": 9.61658031088083e-07, "loss": 0.0001, "reward": 2.499987244606018, "reward_std": 2.1742151872672366e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874830245972, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.3860103626943005, "grad_norm": 6.158429008428887, "kl": 0.22662353515625, "learning_rate": 9.6139896373057e-07, "loss": 0.001, "reward": 2.4999027252197266, "reward_std": 0.0002664233661562321, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999902606010437, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.38860103626943004, "grad_norm": 14.626430330180554, "kl": 0.0928955078125, "learning_rate": 9.61139896373057e-07, "loss": 0.0004, "reward": 1.9991881847381592, "reward_std": 0.00034865460474975407, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991881847381592, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.3911917098445596, "grad_norm": 4.4445929973162155, "kl": 0.1058349609375, "learning_rate": 9.608808290155441e-07, "loss": -0.0005, "reward": 2.4999340772628784, "reward_std": 3.253828839433481e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999342560768127, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.39378238341968913, "grad_norm": 7.162114332238333, "kl": 0.041748046875, "learning_rate": 9.60621761658031e-07, "loss": -0.0007, "reward": 1.9944767951965332, "reward_std": 0.00012321585373342714, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.494476854801178, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.3963730569948187, "grad_norm": 3.9669637574981986, "kl": 0.04388427734375, "learning_rate": 9.603626943005181e-07, "loss": -0.0, "reward": 2.4999598264694214, "reward_std": 2.1484502212842926e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999597072601318, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 0.39896373056994816, "grad_norm": 8.854710434075713, "kl": 0.113525390625, "learning_rate": 9.601036269430051e-07, "loss": 0.0005, "reward": 1.740896463394165, "reward_std": 0.14869303407704137, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2408965229988098, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.4015544041450777, "grad_norm": 2.2968149994613536, "kl": 0.02081298828125, "learning_rate": 9.598445595854921e-07, "loss": 0.0007, "reward": 2.499956965446472, "reward_std": 2.407233478152193e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999568462371826, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.40414507772020725, "grad_norm": 4.6370754624889905, "kl": 0.06298828125, "learning_rate": 9.595854922279793e-07, "loss": 0.0003, "reward": 1.999412477016449, "reward_std": 3.453648241702467e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994125366210938, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.4067357512953368, "grad_norm": 29.51123237328384, "kl": 0.093505859375, "learning_rate": 9.593264248704663e-07, "loss": 0.0004, "reward": 1.5522821545600891, "reward_std": 0.18402792094275355, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0522821843624115, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.40932642487046633, "grad_norm": 11.494280425813823, "kl": 0.08056640625, "learning_rate": 9.590673575129533e-07, "loss": 0.001, "reward": 2.4998749494552612, "reward_std": 0.0001450816967007995, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999875009059906, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4119170984455959, "grad_norm": 5.6088117778874285, "kl": 0.103759765625, "learning_rate": 9.588082901554403e-07, "loss": 0.0004, "reward": 1.2630306482315063, "reward_std": 0.000123612725417388, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7630305588245392, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.41450777202072536, "grad_norm": 21.424009264626743, "kl": 0.0565185546875, "learning_rate": 9.585492227979273e-07, "loss": 0.0002, "reward": 1.8036848306655884, "reward_std": 0.013577054726852111, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3036848902702332, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.4170984455958549, "grad_norm": 115.661575666202, "kl": 0.3123779296875, "learning_rate": 9.582901554404145e-07, "loss": 0.0013, "reward": 1.496459722518921, "reward_std": 0.0006840773348812945, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9964596629142761, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.41968911917098445, "grad_norm": 11.130225495012816, "kl": 0.084228515625, "learning_rate": 9.580310880829015e-07, "loss": 0.0004, "reward": 1.9993212223052979, "reward_std": 2.3810795255485573e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499321162700653, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.422279792746114, "grad_norm": 0.9513904777992606, "kl": 0.03668212890625, "learning_rate": 9.577720207253885e-07, "loss": 0.0001, "reward": 1.9998791217803955, "reward_std": 8.183985613641198e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499879240989685, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.42487046632124353, "grad_norm": 14.867278786760952, "kl": 0.0399169921875, "learning_rate": 9.575129533678755e-07, "loss": 0.0006, "reward": 1.936882734298706, "reward_std": 0.17682716632043594, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4368827939033508, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4274611398963731, "grad_norm": 23.8103102756272, "kl": 0.02960205078125, "learning_rate": 9.572538860103625e-07, "loss": 0.0001, "reward": 1.3677114248275757, "reward_std": 0.0053491308353841305, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8677114248275757, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.43005181347150256, "grad_norm": 12.844052718055307, "kl": 0.056396484375, "learning_rate": 9.569948186528497e-07, "loss": -0.0001, "reward": 1.93603515625, "reward_std": 0.00016410122043453157, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.43603515625, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.4326424870466321, "grad_norm": 22.58584711705445, "kl": 0.044769287109375, "learning_rate": 9.567357512953367e-07, "loss": -0.0004, "reward": 1.9570363759994507, "reward_std": 0.008587222369897063, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.457036554813385, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 0.43523316062176165, "grad_norm": 17.616513920020026, "kl": 0.0389404296875, "learning_rate": 9.564766839378237e-07, "loss": 0.0002, "reward": 2.0078753232955933, "reward_std": 0.2208762717918944, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5078753232955933, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4378238341968912, "grad_norm": 36.387893164904625, "kl": 0.0811767578125, "learning_rate": 9.56217616580311e-07, "loss": 0.0003, "reward": 1.412998616695404, "reward_std": 0.25945562243578024, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9129986315965652, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 0.44041450777202074, "grad_norm": 190.2019975603722, "kl": 0.07177734375, "learning_rate": 9.559585492227977e-07, "loss": -0.0, "reward": 2.1324145793914795, "reward_std": 0.30438611496308, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6324146389961243, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4430051813471503, "grad_norm": 0.3142419652096299, "kl": 0.061279296875, "learning_rate": 9.55699481865285e-07, "loss": 0.0005, "reward": 2.4999970197677612, "reward_std": 1.8965409935844946e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.44559585492227977, "grad_norm": 27.980496321559347, "kl": 0.093017578125, "learning_rate": 9.55440414507772e-07, "loss": 0.0004, "reward": 1.5251266956329346, "reward_std": 0.43262910449993797, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0251267850399017, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4481865284974093, "grad_norm": 80.56612648287063, "kl": 0.0677490234375, "learning_rate": 9.55181347150259e-07, "loss": 0.0003, "reward": 2.499842405319214, "reward_std": 0.00043706211971539233, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999842643737793, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.45077720207253885, "grad_norm": 25.788920218188494, "kl": 0.10986328125, "learning_rate": 9.549222797927462e-07, "loss": 0.0003, "reward": 1.5545368790626526, "reward_std": 0.17708226561080664, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0545367896556854, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.4533678756476684, "grad_norm": 12.089137385762902, "kl": 0.12579345703125, "learning_rate": 9.546632124352332e-07, "loss": 0.0008, "reward": 1.895274043083191, "reward_std": 0.032199050390772754, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3952739834785461, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.45595854922279794, "grad_norm": 10.582577503457859, "kl": 0.0333251953125, "learning_rate": 9.544041450777202e-07, "loss": 0.0003, "reward": 2.437143325805664, "reward_std": 0.17744470021716552, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9371434450149536, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4585492227979275, "grad_norm": 48.327024431471294, "kl": 0.091796875, "learning_rate": 9.541450777202072e-07, "loss": 0.0004, "reward": 2.2498220801353455, "reward_std": 0.2674347456622854, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749821960926056, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.46113989637305697, "grad_norm": 0.5286362200031707, "kl": 0.0750732421875, "learning_rate": 9.538860103626942e-07, "loss": -0.0001, "reward": 2.499991297721863, "reward_std": 5.015401029595523e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4637305699481865, "grad_norm": 32.41112421030689, "kl": 0.0633544921875, "learning_rate": 9.536269430051813e-07, "loss": 0.0007, "reward": 2.1240649223327637, "reward_std": 0.23203402315498067, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6240649223327637, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.46632124352331605, "grad_norm": 25.966780755507866, "kl": 0.0693359375, "learning_rate": 9.533678756476683e-07, "loss": 0.0007, "reward": 1.8902241587638855, "reward_std": 0.03994759791311253, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3902240097522736, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.4689119170984456, "grad_norm": 37.81218897173825, "kl": 0.1064453125, "learning_rate": 9.531088082901554e-07, "loss": 0.0004, "reward": 1.9350528120994568, "reward_std": 0.17979187262244523, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4350528717041016, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.47150259067357514, "grad_norm": 15.086951394500765, "kl": 0.169921875, "learning_rate": 9.528497409326425e-07, "loss": 0.0014, "reward": 1.7499852180480957, "reward_std": 0.462918080507734, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2499852180480957, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.4740932642487047, "grad_norm": 7.892375066373191, "kl": 0.0791015625, "learning_rate": 9.525906735751295e-07, "loss": 0.0002, "reward": 1.498442828655243, "reward_std": 8.984662508737529e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9984428882598877, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 0.47668393782383417, "grad_norm": 20.27023159720772, "kl": 0.087738037109375, "learning_rate": 9.523316062176166e-07, "loss": 0.0001, "reward": 1.5829168260097504, "reward_std": 0.1536550578173319, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.08291694521904, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4792746113989637, "grad_norm": 0.17595850225718468, "kl": 0.042236328125, "learning_rate": 9.520725388601036e-07, "loss": 0.0, "reward": 2.4999961853027344, "reward_std": 1.2840036731631699e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 48.25, "epoch": 0.48186528497409326, "grad_norm": 0.40461139036671895, "kl": 0.039520263671875, "learning_rate": 9.518134715025906e-07, "loss": 0.0004, "reward": 2.499995231628418, "reward_std": 4.404245203204482e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 0.4844559585492228, "grad_norm": 12.380701805072905, "kl": 0.0531005859375, "learning_rate": 9.515544041450777e-07, "loss": 0.0007, "reward": 1.9987742900848389, "reward_std": 0.0008791745219838276, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987742602825165, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.48704663212435234, "grad_norm": 11.823680571216041, "kl": 0.0537109375, "learning_rate": 9.512953367875647e-07, "loss": 0.0003, "reward": 2.4999101161956787, "reward_std": 4.3073491724499036e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999102354049683, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 34.0625, "epoch": 0.4896373056994819, "grad_norm": 7.044800704684001, "kl": 0.084716796875, "learning_rate": 9.510362694300518e-07, "loss": -0.0007, "reward": 1.9992892742156982, "reward_std": 5.6545551615272416e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499289333820343, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 0.49222797927461137, "grad_norm": 74.0119987865906, "kl": 0.326171875, "learning_rate": 9.507772020725389e-07, "loss": 0.0013, "reward": 1.5072201490402222, "reward_std": 0.5940889120101929, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0072201192378998, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4948186528497409, "grad_norm": 37.64276190821093, "kl": 0.150390625, "learning_rate": 9.505181347150258e-07, "loss": 0.0007, "reward": 2.374457597732544, "reward_std": 0.2324506744629673, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8744576573371887, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.49740932642487046, "grad_norm": 15.555389313056926, "kl": 0.0870361328125, "learning_rate": 9.502590673575129e-07, "loss": 0.0005, "reward": 2.4265120029449463, "reward_std": 0.20781972532222426, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.926512062549591, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.5, "grad_norm": 19.4756111799575, "kl": 0.26513671875, "learning_rate": 9.499999999999999e-07, "loss": 0.0014, "reward": 2.4092931747436523, "reward_std": 0.25654956975799337, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.909293293952942, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5025906735751295, "grad_norm": 5.178014531324024, "kl": 0.0919189453125, "learning_rate": 9.49740932642487e-07, "loss": 0.0006, "reward": 2.499848246574402, "reward_std": 5.712374422728317e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998482465744019, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.5051813471502591, "grad_norm": 12.198803097484618, "kl": 0.06866455078125, "learning_rate": 9.494818652849741e-07, "loss": 0.0004, "reward": 1.9853832721710205, "reward_std": 0.0004887606264674105, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4853830933570862, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.5077720207253886, "grad_norm": 0.3244974806076809, "kl": 0.0340576171875, "learning_rate": 9.492227979274611e-07, "loss": 0.0007, "reward": 2.4999890327453613, "reward_std": 3.4051186048600357e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999889731407166, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.5103626943005182, "grad_norm": 2.1556294809377716, "kl": 0.049560546875, "learning_rate": 9.489637305699481e-07, "loss": 0.0006, "reward": 2.499929428100586, "reward_std": 2.7586033411353128e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999293088912964, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5129533678756477, "grad_norm": 45.64552233968355, "kl": 0.03009033203125, "learning_rate": 9.487046632124351e-07, "loss": -0.0003, "reward": 2.4373308420181274, "reward_std": 0.1772478182147097, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937330961227417, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5155440414507773, "grad_norm": 4.284872116233782, "kl": 0.06439208984375, "learning_rate": 9.484455958549222e-07, "loss": 0.0001, "reward": 2.4999619722366333, "reward_std": 4.201222407118621e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999961793422699, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 0.5181347150259067, "grad_norm": 40.295813516934885, "kl": 0.079833984375, "learning_rate": 9.481865284974093e-07, "loss": 0.0002, "reward": 1.9017527103424072, "reward_std": 0.05226568686339306, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4017528295516968, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 0.5207253886010362, "grad_norm": 32.64195006506234, "kl": 0.093505859375, "learning_rate": 9.479274611398963e-07, "loss": 0.0004, "reward": 2.3739689588546753, "reward_std": 0.23335007277461273, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8739689588546753, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.5233160621761658, "grad_norm": 40.57780012874333, "kl": 0.0743408203125, "learning_rate": 9.476683937823834e-07, "loss": -0.0004, "reward": 2.437275528907776, "reward_std": 0.17739782775328194, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937275469303131, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 0.5259067357512953, "grad_norm": 0.5033898025506102, "kl": 0.212890625, "learning_rate": 9.474093264248703e-07, "loss": 0.0006, "reward": 2.499997615814209, "reward_std": 1.1682932381518185e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.5284974093264249, "grad_norm": 19.798513522945562, "kl": 0.16162109375, "learning_rate": 9.471502590673574e-07, "loss": 0.0008, "reward": 2.405739188194275, "reward_std": 0.26654690777104406, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9057391285896301, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5310880829015544, "grad_norm": 18.766191281557987, "kl": 0.047607421875, "learning_rate": 9.468911917098445e-07, "loss": -0.0003, "reward": 2.4999096393585205, "reward_std": 0.00010371945518272696, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999096393585205, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.533678756476684, "grad_norm": 0.8376760627223832, "kl": 0.0599365234375, "learning_rate": 9.466321243523315e-07, "loss": -0.0005, "reward": 2.499987006187439, "reward_std": 2.613274205032212e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999869465827942, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.5362694300518135, "grad_norm": 20.680836317165262, "kl": 0.089111328125, "learning_rate": 9.463730569948186e-07, "loss": -0.0001, "reward": 1.9333288669586182, "reward_std": 0.17510401243907836, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.433328926563263, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.538860103626943, "grad_norm": 22.66875582077499, "kl": 0.053955078125, "learning_rate": 9.461139896373057e-07, "loss": 0.0, "reward": 1.9965955018997192, "reward_std": 0.00012420560778991785, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4965955018997192, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.5414507772020726, "grad_norm": 0.13609180115292815, "kl": 0.039794921875, "learning_rate": 9.458549222797926e-07, "loss": -0.0006, "reward": 2.49999463558197, "reward_std": 9.232799129677005e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 0.5440414507772021, "grad_norm": 31.359195596096093, "kl": 0.16064453125, "learning_rate": 9.455958549222797e-07, "loss": 0.0006, "reward": 1.4549378752708435, "reward_std": 0.018142134184017777, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9549379050731659, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5466321243523317, "grad_norm": 67.54038074170582, "kl": 0.31689453125, "learning_rate": 9.453367875647667e-07, "loss": 0.001, "reward": 2.431976556777954, "reward_std": 0.1921473663933284, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9319766163825989, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5492227979274611, "grad_norm": 11.71282694587892, "kl": 0.092529296875, "learning_rate": 9.450777202072539e-07, "loss": 0.0009, "reward": 2.0621068477630615, "reward_std": 0.1769405230324992, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5621068477630615, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.5518134715025906, "grad_norm": 20.61117697687836, "kl": 0.0509033203125, "learning_rate": 9.44818652849741e-07, "loss": 0.0002, "reward": 2.4997026920318604, "reward_std": 0.00029530325332416396, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99970281124115, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.5544041450777202, "grad_norm": 23.527289999832895, "kl": 0.15863037109375, "learning_rate": 9.44559585492228e-07, "loss": 0.0008, "reward": 1.9992926716804504, "reward_std": 0.00015641727713955333, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992926120758057, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 0.5569948186528497, "grad_norm": 69.58966702007723, "kl": 0.070068359375, "learning_rate": 9.44300518134715e-07, "loss": -0.0002, "reward": 1.9246259927749634, "reward_std": 0.06947185431999969, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4246260225772858, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 0.5595854922279793, "grad_norm": 11.86978468331808, "kl": 0.08746337890625, "learning_rate": 9.44041450777202e-07, "loss": 0.0001, "reward": 1.9520050287246704, "reward_std": 0.08759284867301176, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4520050287246704, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 0.5621761658031088, "grad_norm": 27.251641281394978, "kl": 0.046142578125, "learning_rate": 9.437823834196891e-07, "loss": 0.0002, "reward": 1.9846946001052856, "reward_std": 0.008279969051727676, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4846945703029633, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.5647668393782384, "grad_norm": 41.65977365876782, "kl": 0.1307373046875, "learning_rate": 9.435233160621762e-07, "loss": 0.0002, "reward": 1.9886304140090942, "reward_std": 0.21296574726588346, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4886305034160614, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 0.5673575129533679, "grad_norm": 19.083579649949037, "kl": 0.11279296875, "learning_rate": 9.432642487046632e-07, "loss": 0.0005, "reward": 1.6808223724365234, "reward_std": 0.26438943332664167, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1808224022388458, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.5699481865284974, "grad_norm": 37.535938543585765, "kl": 0.03765869140625, "learning_rate": 9.430051813471503e-07, "loss": 0.001, "reward": 1.9849122762680054, "reward_std": 0.0022335679154821264, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4849122166633606, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.572538860103627, "grad_norm": 3.817424137813881, "kl": 0.0626220703125, "learning_rate": 9.427461139896372e-07, "loss": -0.0002, "reward": 2.4999735355377197, "reward_std": 3.330709364490758e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999734163284302, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5751295336787565, "grad_norm": 32.329894788730776, "kl": 0.132080078125, "learning_rate": 9.424870466321243e-07, "loss": 0.0006, "reward": 1.2147775888442993, "reward_std": 0.005103324925585184, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7147775590419769, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5777202072538861, "grad_norm": 5.316831600071935, "kl": 0.02362060546875, "learning_rate": 9.422279792746114e-07, "loss": 0.0005, "reward": 2.4999879598617554, "reward_std": 1.2397496902849525e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879002571106, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 0.5803108808290155, "grad_norm": 7.69632525933819, "kl": 0.1533203125, "learning_rate": 9.419689119170984e-07, "loss": 0.0002, "reward": 2.498706102371216, "reward_std": 4.3258162804704625e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9987059831619263, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.582901554404145, "grad_norm": 2.5390973305159865, "kl": 0.14013671875, "learning_rate": 9.417098445595855e-07, "loss": 0.0002, "reward": 2.4999914169311523, "reward_std": 1.481319878848808e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914765357971, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 0.5854922279792746, "grad_norm": 105.9745098669422, "kl": 0.072265625, "learning_rate": 9.414507772020725e-07, "loss": 0.0007, "reward": 1.861237645149231, "reward_std": 0.2548731706574472, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3612376153469086, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5880829015544041, "grad_norm": 8.802559439556907, "kl": 0.07220458984375, "learning_rate": 9.411917098445595e-07, "loss": 0.0003, "reward": 1.9982655048370361, "reward_std": 4.2557023789413506e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982656240463257, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.5906735751295337, "grad_norm": 1.8800332037848513, "kl": 0.08642578125, "learning_rate": 9.409326424870466e-07, "loss": 0.0007, "reward": 2.4999865293502808, "reward_std": 7.314209142350592e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986469745636, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5932642487046632, "grad_norm": 45.910301878876524, "kl": 0.04852294921875, "learning_rate": 9.406735751295336e-07, "loss": 0.0, "reward": 2.2490041851997375, "reward_std": 0.268184903877966, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749004065990448, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5958549222797928, "grad_norm": 67.87463248194385, "kl": 0.130615234375, "learning_rate": 9.404145077720207e-07, "loss": 0.0005, "reward": 1.9348987340927124, "reward_std": 0.17891097848769277, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4348988831043243, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 34.1875, "epoch": 0.5984455958549223, "grad_norm": 139.60814619798214, "kl": 0.07861328125, "learning_rate": 9.401554404145078e-07, "loss": -0.0003, "reward": 1.7711799144744873, "reward_std": 0.0027494705482808968, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2711800038814545, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6010362694300518, "grad_norm": 0.9793997681297096, "kl": 0.113037109375, "learning_rate": 9.398963730569948e-07, "loss": 0.0006, "reward": 2.4999979734420776, "reward_std": 1.4335365676743095e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 0.6036269430051814, "grad_norm": 7.194179929331639, "kl": 0.067596435546875, "learning_rate": 9.396373056994819e-07, "loss": 0.0006, "reward": 1.7761932015419006, "reward_std": 0.0039824671922588095, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.276193231344223, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.6062176165803109, "grad_norm": 8.180830502177496, "kl": 0.0389404296875, "learning_rate": 9.393782383419688e-07, "loss": 0.0005, "reward": 1.9539988040924072, "reward_std": 0.014520528495438612, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4539988040924072, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 0.6088082901554405, "grad_norm": 182.15129466978613, "kl": 0.1429443359375, "learning_rate": 9.391191709844559e-07, "loss": 0.0004, "reward": 2.2972477674484253, "reward_std": 0.2822528837136815, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.79724782705307, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6113989637305699, "grad_norm": 17.08462006857316, "kl": 0.08203125, "learning_rate": 9.38860103626943e-07, "loss": 0.0005, "reward": 1.9986051321029663, "reward_std": 0.00019978535419795662, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986051619052887, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 0.6139896373056994, "grad_norm": 21.76024485647235, "kl": 0.0634765625, "learning_rate": 9.3860103626943e-07, "loss": 0.0004, "reward": 1.9594929218292236, "reward_std": 0.049739725039898985, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.459492951631546, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.616580310880829, "grad_norm": 26.42248341675574, "kl": 0.04876708984375, "learning_rate": 9.383419689119171e-07, "loss": 0.0002, "reward": 1.8738928437232971, "reward_std": 0.2314888799082837, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3738927841186523, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6191709844559585, "grad_norm": 54.059187823103954, "kl": 0.11602783203125, "learning_rate": 9.38082901554404e-07, "loss": 0.0011, "reward": 2.499852776527405, "reward_std": 0.00018792226643427057, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998528361320496, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6217616580310881, "grad_norm": 0.3050202626143976, "kl": 0.09619140625, "learning_rate": 9.378238341968911e-07, "loss": -0.0001, "reward": 2.499997138977051, "reward_std": 2.272139994374811e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6243523316062176, "grad_norm": 1.6682737065538857, "kl": 0.0596923828125, "learning_rate": 9.375647668393782e-07, "loss": 0.0006, "reward": 2.4999693632125854, "reward_std": 1.2420740858942736e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999969244003296, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 0.6269430051813472, "grad_norm": 563.0429167784401, "kl": 0.13055419921875, "learning_rate": 9.373056994818652e-07, "loss": 0.0014, "reward": 1.8475852012634277, "reward_std": 0.043247136728453484, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3475850820541382, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.6295336787564767, "grad_norm": 55.56084960659313, "kl": 0.14013671875, "learning_rate": 9.370466321243523e-07, "loss": 0.0006, "reward": 1.3245500922203064, "reward_std": 0.02424245560541749, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8245501220226288, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6321243523316062, "grad_norm": 53.49170948539851, "kl": 0.22998046875, "learning_rate": 9.367875647668393e-07, "loss": 0.0009, "reward": 1.999035358428955, "reward_std": 0.3539975881576538, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499035358428955, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6347150259067358, "grad_norm": 0.15796069062473897, "kl": 0.01751708984375, "learning_rate": 9.365284974093264e-07, "loss": 0.0007, "reward": 2.4999918937683105, "reward_std": 1.9037836409552256e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.6373056994818653, "grad_norm": 4.431958292704458, "kl": 0.0311279296875, "learning_rate": 9.362694300518134e-07, "loss": 0.0008, "reward": 2.499981164932251, "reward_std": 1.4701588952448219e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999809861183167, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6398963730569949, "grad_norm": 262.33231360423315, "kl": 0.115234375, "learning_rate": 9.360103626943004e-07, "loss": 0.0007, "reward": 1.931067705154419, "reward_std": 0.020982795310828806, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4310676455497742, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6424870466321243, "grad_norm": 0.5367421705388409, "kl": 0.0479736328125, "learning_rate": 9.357512953367875e-07, "loss": 0.0006, "reward": 2.4999868869781494, "reward_std": 6.835135621940935e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999867677688599, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6450777202072538, "grad_norm": 72.95876918994013, "kl": 0.17822265625, "learning_rate": 9.354922279792745e-07, "loss": 0.001, "reward": 1.9957296252250671, "reward_std": 0.005071526973097207, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4957295656204224, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6476683937823834, "grad_norm": 76.81352940203844, "kl": 0.1767578125, "learning_rate": 9.352331606217616e-07, "loss": 0.0007, "reward": 1.799617886543274, "reward_std": 0.281535180285573, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.299617886543274, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6502590673575129, "grad_norm": 27.796569009678898, "kl": 0.0718994140625, "learning_rate": 9.349740932642487e-07, "loss": -0.0003, "reward": 2.24941349029541, "reward_std": 0.2678882962031821, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7494134902954102, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6528497409326425, "grad_norm": 53.78272145335576, "kl": 0.0439453125, "learning_rate": 9.347150259067356e-07, "loss": -0.0006, "reward": 1.9910848140716553, "reward_std": 0.0020881170991913223, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4910849928855896, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.655440414507772, "grad_norm": 15.671794176682647, "kl": 0.119873046875, "learning_rate": 9.344559585492227e-07, "loss": 0.0005, "reward": 2.3747057914733887, "reward_std": 0.2319982796907425, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874705970287323, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6580310880829016, "grad_norm": 0.39096522832753317, "kl": 0.0428466796875, "learning_rate": 9.341968911917099e-07, "loss": 0.0006, "reward": 2.499985456466675, "reward_std": 4.736155688078725e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998539686203, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 0.6606217616580311, "grad_norm": 69.11722019783949, "kl": 0.072998046875, "learning_rate": 9.339378238341969e-07, "loss": 0.0001, "reward": 1.917561948299408, "reward_std": 0.0965556811188435, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4175621271133423, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6632124352331606, "grad_norm": 4.917823500717499, "kl": 0.047607421875, "learning_rate": 9.33678756476684e-07, "loss": 0.0012, "reward": 1.9998875260353088, "reward_std": 2.051514871936888e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998875260353088, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6658031088082902, "grad_norm": 1.5014354934201366, "kl": 0.1126708984375, "learning_rate": 9.33419689119171e-07, "loss": 0.0005, "reward": 2.499976634979248, "reward_std": 1.032855533367183e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999766945838928, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6683937823834197, "grad_norm": 0.4386385162817569, "kl": 0.113525390625, "learning_rate": 9.33160621761658e-07, "loss": -0.0001, "reward": 2.4999969005584717, "reward_std": 1.4673130976916582e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6709844559585493, "grad_norm": 8.943288947354556, "kl": 0.06591796875, "learning_rate": 9.329015544041451e-07, "loss": 0.0005, "reward": 2.499526262283325, "reward_std": 2.3864250351834926e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995264410972595, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.6735751295336787, "grad_norm": 63.34069072866932, "kl": 0.114013671875, "learning_rate": 9.326424870466321e-07, "loss": 0.0005, "reward": 1.6249992847442627, "reward_std": 0.35355334668014393, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1249994114041328, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6761658031088082, "grad_norm": 24.923249939208088, "kl": 0.103759765625, "learning_rate": 9.323834196891192e-07, "loss": 0.0007, "reward": 1.8188714981079102, "reward_std": 0.0004906635687405014, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3188716173171997, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6787564766839378, "grad_norm": 0.1935786435863623, "kl": 0.09814453125, "learning_rate": 9.321243523316062e-07, "loss": 0.0008, "reward": 2.4999982118606567, "reward_std": 1.3507603568996274e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6813471502590673, "grad_norm": 10.442448034621787, "kl": 0.0745849609375, "learning_rate": 9.318652849740933e-07, "loss": 0.0011, "reward": 2.3122979402542114, "reward_std": 0.2588302083981944, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812297761440277, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6839378238341969, "grad_norm": 0.23193648450621937, "kl": 0.07281494140625, "learning_rate": 9.316062176165803e-07, "loss": -0.0, "reward": 2.499997138977051, "reward_std": 1.8553428162704222e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6865284974093264, "grad_norm": 2.7200278160432054, "kl": 0.0462646484375, "learning_rate": 9.313471502590673e-07, "loss": -0.0007, "reward": 2.49998140335083, "reward_std": 3.378462224645773e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998140335083, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.689119170984456, "grad_norm": 0.06016279165742596, "kl": 0.0496826171875, "learning_rate": 9.310880829015544e-07, "loss": 0.0002, "reward": 2.4999642372131348, "reward_std": 9.812092685024254e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999642372131348, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 38.375, "epoch": 0.6917098445595855, "grad_norm": 2.764616910718669, "kl": 0.08203125, "learning_rate": 9.308290155440414e-07, "loss": -0.0, "reward": 2.499989628791809, "reward_std": 1.2161759329387678e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999897480010986, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.694300518134715, "grad_norm": 102.29706871094685, "kl": 0.083984375, "learning_rate": 9.305699481865285e-07, "loss": 0.0014, "reward": 1.9987385869026184, "reward_std": 0.0017034186852811217, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987384676933289, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 0.6968911917098446, "grad_norm": 19.458021285760132, "kl": 0.114501953125, "learning_rate": 9.303108808290156e-07, "loss": 0.0004, "reward": 1.9169889688491821, "reward_std": 0.0004150967740770284, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4169889688491821, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6994818652849741, "grad_norm": 3.0104536103053934, "kl": 0.033203125, "learning_rate": 9.300518134715025e-07, "loss": 0.0002, "reward": 1.9989683628082275, "reward_std": 3.013064247170405e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989683330059052, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7020725388601037, "grad_norm": 5.320232771119414, "kl": 0.0733642578125, "learning_rate": 9.297927461139896e-07, "loss": 0.0009, "reward": 1.8857821226119995, "reward_std": 0.00020022162902932905, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3857821226119995, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7046632124352331, "grad_norm": 0.5782355592333644, "kl": 0.047607421875, "learning_rate": 9.295336787564766e-07, "loss": 0.0008, "reward": 2.4999942779541016, "reward_std": 5.719464525100193e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7072538860103627, "grad_norm": 2.4793475433302414, "kl": 0.10791015625, "learning_rate": 9.292746113989637e-07, "loss": 0.0007, "reward": 2.49995756149292, "reward_std": 9.19318586056761e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999573230743408, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 0.7098445595854922, "grad_norm": 14.34374003217959, "kl": 0.0460205078125, "learning_rate": 9.290155440414508e-07, "loss": 0.0002, "reward": 2.3124663829803467, "reward_std": 0.25880904911264224, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812466323375702, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.7124352331606217, "grad_norm": 6.575679330675983, "kl": 0.072509765625, "learning_rate": 9.287564766839378e-07, "loss": 0.0009, "reward": 2.4998632669448853, "reward_std": 7.547130098828347e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998632073402405, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 0.7150259067357513, "grad_norm": 50.75737719245101, "kl": 0.123291015625, "learning_rate": 9.284974093264248e-07, "loss": 0.0005, "reward": 1.6007416248321533, "reward_std": 0.4657672494649887, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.100741684436798, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7176165803108808, "grad_norm": 0.17214845446927296, "kl": 0.0433349609375, "learning_rate": 9.282383419689118e-07, "loss": -0.0012, "reward": 2.4999988079071045, "reward_std": 9.736103834256937e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 49.5, "epoch": 0.7202072538860104, "grad_norm": 11.73346501514433, "kl": 0.224609375, "learning_rate": 9.279792746113989e-07, "loss": 0.0008, "reward": 1.486747682094574, "reward_std": 0.0015261696025845595, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9867476522922516, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.7227979274611399, "grad_norm": 46.90697695319266, "kl": 0.0372314453125, "learning_rate": 9.27720207253886e-07, "loss": 0.0002, "reward": 2.3738441467285156, "reward_std": 0.23252329300521524, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8738441467285156, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 0.7253886010362695, "grad_norm": 56.25343895730956, "kl": 0.139892578125, "learning_rate": 9.27461139896373e-07, "loss": 0.0011, "reward": 1.8741596937179565, "reward_std": 0.09873420094663743, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3741595149040222, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.727979274611399, "grad_norm": 18.03440165791779, "kl": 0.322265625, "learning_rate": 9.272020725388601e-07, "loss": 0.001, "reward": 1.4974006414413452, "reward_std": 6.577471867785789e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.99740070104599, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.7305699481865285, "grad_norm": 13.544696618730867, "kl": 0.109375, "learning_rate": 9.269430051813471e-07, "loss": 0.0, "reward": 1.9989688396453857, "reward_std": 0.000128183211927535, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989688992500305, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7331606217616581, "grad_norm": 36.22425821848427, "kl": 0.275146484375, "learning_rate": 9.266839378238341e-07, "loss": 0.0013, "reward": 1.8731261491775513, "reward_std": 0.23227407510421472, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3731261789798737, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7357512953367875, "grad_norm": 0.22075762681954414, "kl": 0.04034423828125, "learning_rate": 9.264248704663212e-07, "loss": 0.0003, "reward": 2.499996304512024, "reward_std": 1.3656868418365775e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.7383419689119171, "grad_norm": 1.4442619899592837, "kl": 0.1011962890625, "learning_rate": 9.261658031088082e-07, "loss": -0.0001, "reward": 2.498896598815918, "reward_std": 1.3903473700338509e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.998896837234497, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.7409326424870466, "grad_norm": 16.345628525045232, "kl": 0.04931640625, "learning_rate": 9.259067357512953e-07, "loss": -0.0001, "reward": 2.061937093734741, "reward_std": 0.1770040218208635, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5619369745254517, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 49.0, "epoch": 0.7435233160621761, "grad_norm": 8.201261334236742, "kl": 0.0888671875, "learning_rate": 9.256476683937824e-07, "loss": 0.0007, "reward": 2.4992451667785645, "reward_std": 7.46283676562598e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9992449879646301, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7461139896373057, "grad_norm": 1.0104520027333654, "kl": 0.0472412109375, "learning_rate": 9.253886010362693e-07, "loss": -0.0005, "reward": 2.4999842643737793, "reward_std": 4.736931657589594e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999845623970032, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7487046632124352, "grad_norm": 24.661482373161597, "kl": 0.099365234375, "learning_rate": 9.251295336787564e-07, "loss": 0.0008, "reward": 1.998279094696045, "reward_std": 0.0005421763530648605, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982790350914001, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7512953367875648, "grad_norm": 9.606892227935449, "kl": 0.0792236328125, "learning_rate": 9.248704663212434e-07, "loss": 0.0002, "reward": 1.9960808753967285, "reward_std": 2.517347229513689e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4960809350013733, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7538860103626943, "grad_norm": 3.1298812359166868, "kl": 0.06787109375, "learning_rate": 9.246113989637305e-07, "loss": 0.0005, "reward": 1.9997079372406006, "reward_std": 2.085572123178281e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499707818031311, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7564766839378239, "grad_norm": 23.96211939574321, "kl": 0.0517578125, "learning_rate": 9.243523316062176e-07, "loss": 0.0, "reward": 2.374557614326477, "reward_std": 0.23211884864213062, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8745576739311218, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 0.7590673575129534, "grad_norm": 23.905465397205766, "kl": 0.77984619140625, "learning_rate": 9.240932642487046e-07, "loss": 0.0028, "reward": 1.4529399275779724, "reward_std": 0.06377134055492206, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.952940046787262, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7616580310880829, "grad_norm": 12.981492401637952, "kl": 0.066162109375, "learning_rate": 9.238341968911916e-07, "loss": 0.0006, "reward": 1.962188482284546, "reward_std": 0.014669846681499621, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4621884226799011, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7642487046632125, "grad_norm": 47.56722486479132, "kl": 0.091064453125, "learning_rate": 9.235751295336786e-07, "loss": 0.0004, "reward": 1.5587114691734314, "reward_std": 0.2839438980445266, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0587115287780762, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7668393782383419, "grad_norm": 2.1764530014467596, "kl": 0.03558349609375, "learning_rate": 9.233160621761657e-07, "loss": -0.0001, "reward": 1.9992074966430664, "reward_std": 5.13399518240476e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499207615852356, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.7694300518134715, "grad_norm": 22.359922973020144, "kl": 0.084228515625, "learning_rate": 9.230569948186529e-07, "loss": 0.0009, "reward": 1.6246170401573181, "reward_std": 0.23074987732877617, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1246169358491898, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.772020725388601, "grad_norm": 0.25775221772410295, "kl": 0.071044921875, "learning_rate": 9.227979274611399e-07, "loss": -0.0007, "reward": 2.4999910593032837, "reward_std": 3.6018820992467226e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.7746113989637305, "grad_norm": 3.4255490101836483, "kl": 0.03717041015625, "learning_rate": 9.22538860103627e-07, "loss": 0.0007, "reward": 2.499966025352478, "reward_std": 1.9960130998697423e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999966025352478, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7772020725388601, "grad_norm": 13.974903172317482, "kl": 0.0545654296875, "learning_rate": 9.222797927461139e-07, "loss": 0.0004, "reward": 2.4995501041412354, "reward_std": 0.00022598833857045975, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999549925327301, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.7797927461139896, "grad_norm": 5.084860804899374, "kl": 2.0999755859375, "learning_rate": 9.22020725388601e-07, "loss": 0.0077, "reward": 2.4998668432235718, "reward_std": 0.0003563241192523492, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998669624328613, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.7823834196891192, "grad_norm": 29.24754676631399, "kl": 0.10406494140625, "learning_rate": 9.217616580310881e-07, "loss": 0.0003, "reward": 2.34384286403656, "reward_std": 0.2891426747557375, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8438429832458496, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.7849740932642487, "grad_norm": 0.09043587328237046, "kl": 0.082275390625, "learning_rate": 9.215025906735751e-07, "loss": 0.0002, "reward": 2.499998927116394, "reward_std": 5.115822361290157e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.7875647668393783, "grad_norm": 48.39699554524291, "kl": 0.081298828125, "learning_rate": 9.212435233160622e-07, "loss": 0.0, "reward": 1.7611334323883057, "reward_std": 0.02202786858721595, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2611334323883057, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.7901554404145078, "grad_norm": 29.90360677176637, "kl": 0.0712890625, "learning_rate": 9.209844559585493e-07, "loss": 0.0003, "reward": 1.445801556110382, "reward_std": 0.25903499030391686, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9458015859127045, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7927461139896373, "grad_norm": 4.9189957322946265, "kl": 0.0728759765625, "learning_rate": 9.207253886010362e-07, "loss": 0.0003, "reward": 1.143511950969696, "reward_std": 0.0002724595287872944, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.6435119062662125, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7953367875647669, "grad_norm": 0.9267659390144185, "kl": 0.02655029296875, "learning_rate": 9.204663212435233e-07, "loss": 0.0008, "reward": 2.4999918937683105, "reward_std": 9.939834967553907e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7979274611398963, "grad_norm": 25.999525264555484, "kl": 0.0281982421875, "learning_rate": 9.202072538860103e-07, "loss": 0.0002, "reward": 2.4996529817581177, "reward_std": 0.0005068558443781512, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996527433395386, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.8005181347150259, "grad_norm": 60.67457118827194, "kl": 0.219970703125, "learning_rate": 9.199481865284974e-07, "loss": 0.0009, "reward": 1.8351448774337769, "reward_std": 0.013795074075460434, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3351448774337769, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8031088082901554, "grad_norm": 0.3610197400427877, "kl": 0.14111328125, "learning_rate": 9.196891191709845e-07, "loss": -0.0003, "reward": 2.4999955892562866, "reward_std": 2.1781631858175388e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.805699481865285, "grad_norm": 0.2280375142958015, "kl": 0.1131591796875, "learning_rate": 9.194300518134715e-07, "loss": 0.0, "reward": 2.4999958276748657, "reward_std": 3.821065433839976e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.8082901554404145, "grad_norm": 1.7031960441810114, "kl": 0.0545654296875, "learning_rate": 9.191709844559585e-07, "loss": -0.0001, "reward": 2.499996066093445, "reward_std": 3.5297505291964626e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.810880829015544, "grad_norm": 22.147269010453286, "kl": 0.060791015625, "learning_rate": 9.189119170984455e-07, "loss": 0.0003, "reward": 1.3742315769195557, "reward_std": 0.5175782124861144, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8742315471172333, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8134715025906736, "grad_norm": 10.517255841692602, "kl": 0.084228515625, "learning_rate": 9.186528497409326e-07, "loss": 0.0009, "reward": 2.4998375177383423, "reward_std": 0.00011021030059055192, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998374581336975, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.8160621761658031, "grad_norm": 12.01351049164203, "kl": 0.119140625, "learning_rate": 9.183937823834197e-07, "loss": 0.0005, "reward": 2.437287211418152, "reward_std": 0.1772405478404835, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372870922088623, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 0.8186528497409327, "grad_norm": 5.305793416265261, "kl": 0.045166015625, "learning_rate": 9.181347150259067e-07, "loss": 0.0006, "reward": 1.952026903629303, "reward_std": 0.00013917574415245326, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4520266950130463, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8212435233160622, "grad_norm": 0.20086730907884917, "kl": 0.10986328125, "learning_rate": 9.178756476683938e-07, "loss": 0.0004, "reward": 2.4999959468841553, "reward_std": 2.379997454227123e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.8238341968911918, "grad_norm": 12.725253000105752, "kl": 0.51318359375, "learning_rate": 9.176165803108807e-07, "loss": 0.0018, "reward": 2.432328224182129, "reward_std": 0.1913791030401626, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.932328224182129, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8264248704663213, "grad_norm": 26.768779954127723, "kl": 0.05810546875, "learning_rate": 9.173575129533678e-07, "loss": 0.0005, "reward": 1.6163683533668518, "reward_std": 0.2367670061712488, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.116368293762207, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8290155440414507, "grad_norm": 35.01852542005629, "kl": 0.0499267578125, "learning_rate": 9.170984455958549e-07, "loss": 0.0002, "reward": 1.4518651962280273, "reward_std": 0.018155442754505202, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9518651962280273, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 0.8316062176165803, "grad_norm": 77.32382641505498, "kl": 0.03369140625, "learning_rate": 9.168393782383419e-07, "loss": 0.0006, "reward": 2.437354564666748, "reward_std": 0.17717695192504834, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373546242713928, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8341968911917098, "grad_norm": 12.238968475516936, "kl": 0.081298828125, "learning_rate": 9.16580310880829e-07, "loss": 0.0001, "reward": 1.9985675811767578, "reward_std": 7.426684015854335e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985676407814026, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8367875647668394, "grad_norm": 3.7281682165940166, "kl": 0.27294921875, "learning_rate": 9.16321243523316e-07, "loss": 0.0011, "reward": 1.490460216999054, "reward_std": 6.572147367478465e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9904601275920868, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 0.8393782383419689, "grad_norm": 84.3996637190592, "kl": 0.2391357421875, "learning_rate": 9.16062176165803e-07, "loss": 0.001, "reward": 1.5886932015419006, "reward_std": 0.34788138791918755, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0886932015419006, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8419689119170984, "grad_norm": 117.46267679146419, "kl": 0.06640625, "learning_rate": 9.158031088082901e-07, "loss": -0.0003, "reward": 1.9880843758583069, "reward_std": 0.012375812484606286, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4880844056606293, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 0.844559585492228, "grad_norm": 0.8858184417723093, "kl": 0.04827880859375, "learning_rate": 9.155440414507771e-07, "loss": 0.0009, "reward": 2.4999940395355225, "reward_std": 2.7375037916499423e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 0.8471502590673575, "grad_norm": 26.98644813071061, "kl": 0.054443359375, "learning_rate": 9.152849740932642e-07, "loss": 0.0002, "reward": 1.8707313537597656, "reward_std": 0.23359641691786237, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3707314729690552, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 0.8497409326424871, "grad_norm": 36.72672796348208, "kl": 0.1064453125, "learning_rate": 9.150259067357513e-07, "loss": 0.0006, "reward": 1.3625949621200562, "reward_std": 0.11323657091634232, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8625949621200562, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8523316062176166, "grad_norm": 5.137149185226111, "kl": 0.05126953125, "learning_rate": 9.147668393782383e-07, "loss": 0.0001, "reward": 2.4999788999557495, "reward_std": 1.1832885547846672e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999789595603943, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8549222797927462, "grad_norm": 0.4237015474697751, "kl": 0.08935546875, "learning_rate": 9.145077720207253e-07, "loss": 0.0021, "reward": 2.499995708465576, "reward_std": 1.5240586890286068e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 38.125, "epoch": 0.8575129533678757, "grad_norm": 8.945859966594083, "kl": 0.14453125, "learning_rate": 9.142487046632123e-07, "loss": 0.0013, "reward": 2.498598337173462, "reward_std": 0.0004084099932697427, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9985982775688171, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8601036269430051, "grad_norm": 0.07478515294457982, "kl": 0.05804443359375, "learning_rate": 9.139896373056994e-07, "loss": 0.0005, "reward": 2.499998688697815, "reward_std": 9.422803088909859e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8626943005181347, "grad_norm": 4.879098612483906, "kl": 0.03436279296875, "learning_rate": 9.137305699481865e-07, "loss": 0.0014, "reward": 2.4998931884765625, "reward_std": 3.5479511325320345e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998931288719177, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.8652849740932642, "grad_norm": 35.89000234165128, "kl": 0.081298828125, "learning_rate": 9.134715025906735e-07, "loss": -0.0, "reward": 1.5267971754074097, "reward_std": 0.0759749440530868, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0267971772700548, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8678756476683938, "grad_norm": 22.782994989306655, "kl": 0.0550537109375, "learning_rate": 9.132124352331606e-07, "loss": 0.0002, "reward": 1.4980176091194153, "reward_std": 0.000535443930857582, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9980175495147705, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.8704663212435233, "grad_norm": 29.76855081461592, "kl": 0.0360107421875, "learning_rate": 9.129533678756475e-07, "loss": 0.0002, "reward": 1.6836384534835815, "reward_std": 0.2612959434190998, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1836384534835815, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8730569948186528, "grad_norm": 0.14439518079930413, "kl": 0.0252685546875, "learning_rate": 9.126943005181346e-07, "loss": 0.0011, "reward": 2.499996781349182, "reward_std": 2.6027656758742523e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.8756476683937824, "grad_norm": 0.06086039943664083, "kl": 0.03094482421875, "learning_rate": 9.124352331606217e-07, "loss": 0.0004, "reward": 2.499994993209839, "reward_std": 2.0081938032490143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8782383419689119, "grad_norm": 24.147484684638414, "kl": 0.082763671875, "learning_rate": 9.121761658031087e-07, "loss": 0.0002, "reward": 1.4440079927444458, "reward_std": 0.00016752214560256107, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9440080523490906, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 0.8808290155440415, "grad_norm": 1.0896323663507075, "kl": 0.107177734375, "learning_rate": 9.119170984455959e-07, "loss": 0.0011, "reward": 2.4999934434890747, "reward_std": 1.1774988422530441e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.883419689119171, "grad_norm": 0.13185178316333193, "kl": 0.0364990234375, "learning_rate": 9.116580310880829e-07, "loss": 0.0, "reward": 2.4999972581863403, "reward_std": 1.9932980990233773e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.8860103626943006, "grad_norm": 12.177965582845715, "kl": 0.07470703125, "learning_rate": 9.113989637305699e-07, "loss": 0.0007, "reward": 2.4994239807128906, "reward_std": 0.0001067378640300376, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9994239211082458, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8886010362694301, "grad_norm": 1.0483897043946357, "kl": 0.0330810546875, "learning_rate": 9.11139896373057e-07, "loss": -0.0006, "reward": 2.499993085861206, "reward_std": 5.50139787947046e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8911917098445595, "grad_norm": 0.6691869385301727, "kl": 0.043212890625, "learning_rate": 9.10880829015544e-07, "loss": 0.0006, "reward": 2.4999890327453613, "reward_std": 7.612593435624149e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999889731407166, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.8937823834196891, "grad_norm": 17.38049804246634, "kl": 0.0634765625, "learning_rate": 9.106217616580311e-07, "loss": 0.0003, "reward": 1.726312279701233, "reward_std": 0.29936864227056503, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2263123989105225, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8963730569948186, "grad_norm": 0.1271669080651781, "kl": 0.043212890625, "learning_rate": 9.103626943005181e-07, "loss": 0.0003, "reward": 2.4999985694885254, "reward_std": 1.8901986038599716e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 0.8989637305699482, "grad_norm": 49.738039278062146, "kl": 0.0728759765625, "learning_rate": 9.101036269430052e-07, "loss": 0.0003, "reward": 1.595479667186737, "reward_std": 0.3625297471880913, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0954797267913818, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.9015544041450777, "grad_norm": 41.30099057507056, "kl": 0.0562744140625, "learning_rate": 9.098445595854922e-07, "loss": 0.0002, "reward": 1.4993904829025269, "reward_std": 0.5242236405611038, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9993905127048492, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.9041450777202072, "grad_norm": 0.15547808117105286, "kl": 0.03607177734375, "learning_rate": 9.095854922279792e-07, "loss": -0.0007, "reward": 2.49999737739563, "reward_std": 1.6273463927518605e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9067357512953368, "grad_norm": 0.3172936487529762, "kl": 0.0712890625, "learning_rate": 9.093264248704663e-07, "loss": 0.0006, "reward": 1.999813437461853, "reward_std": 4.252203154919698e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998133778572083, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.9093264248704663, "grad_norm": 12.13965472832464, "kl": 0.075439453125, "learning_rate": 9.090673575129534e-07, "loss": -0.0003, "reward": 1.8559105396270752, "reward_std": 0.05643878592195506, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.35591059923172, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9119170984455959, "grad_norm": 0.09064879968770556, "kl": 0.04052734375, "learning_rate": 9.088082901554404e-07, "loss": 0.0013, "reward": 2.4999964237213135, "reward_std": 1.9779182309775933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 43.5, "epoch": 0.9145077720207254, "grad_norm": 16.083020115031385, "kl": 0.0640869140625, "learning_rate": 9.085492227979275e-07, "loss": -0.0005, "reward": 1.990911066532135, "reward_std": 0.0013948471358844472, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4909111559391022, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.917098445595855, "grad_norm": 0.04306009543693447, "kl": 0.080810546875, "learning_rate": 9.082901554404144e-07, "loss": 0.0002, "reward": 2.499998927116394, "reward_std": 6.712469371450425e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9196891191709845, "grad_norm": 0.191597588676775, "kl": 0.039215087890625, "learning_rate": 9.080310880829015e-07, "loss": 0.0006, "reward": 2.4999966621398926, "reward_std": 2.228439825557871e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9222797927461139, "grad_norm": 0.2077425574267357, "kl": 0.0894775390625, "learning_rate": 9.077720207253886e-07, "loss": 0.0018, "reward": 2.4999983310699463, "reward_std": 8.821554047244717e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.9248704663212435, "grad_norm": 82.25939811962819, "kl": 0.069580078125, "learning_rate": 9.075129533678756e-07, "loss": -0.0006, "reward": 1.9606314897537231, "reward_std": 0.06992258400725859, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4606316089630127, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.927461139896373, "grad_norm": 2.129048577534351, "kl": 0.2626953125, "learning_rate": 9.072538860103627e-07, "loss": 0.0015, "reward": 2.4993492364883423, "reward_std": 2.5001405248303854e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9993492364883423, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9300518134715026, "grad_norm": 0.1776565962125698, "kl": 0.031982421875, "learning_rate": 9.069948186528497e-07, "loss": -0.0006, "reward": 2.4999948740005493, "reward_std": 2.2983322907066395e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 45.6875, "epoch": 0.9326424870466321, "grad_norm": 9.093432127379609, "kl": 0.1053466796875, "learning_rate": 9.067357512953367e-07, "loss": -0.0, "reward": 1.9990997314453125, "reward_std": 7.091586337537592e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990998208522797, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 70.8125, "epoch": 0.9352331606217616, "grad_norm": 1.9247135660301367, "kl": 0.110107421875, "learning_rate": 9.064766839378238e-07, "loss": 0.0004, "reward": 1.9623119831085205, "reward_std": 0.0003404405206310912, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4623119533061981, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 39.5, "epoch": 0.9378238341968912, "grad_norm": 9.640169112816553, "kl": 0.136962890625, "learning_rate": 9.062176165803108e-07, "loss": 0.0017, "reward": 1.984513282775879, "reward_std": 9.006850166315417e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4845131933689117, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9404145077720207, "grad_norm": 0.440919595343132, "kl": 0.117919921875, "learning_rate": 9.059585492227979e-07, "loss": 0.0005, "reward": 2.4999969005584717, "reward_std": 2.6379036057733174e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 69.0, "epoch": 0.9430051813471503, "grad_norm": 6.162753095320263, "kl": 0.2236328125, "learning_rate": 9.056994818652849e-07, "loss": 0.0012, "reward": 1.9994291067123413, "reward_std": 2.7912211749026028e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499429076910019, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.9455958549222798, "grad_norm": 10.584178951891278, "kl": 0.0870361328125, "learning_rate": 9.05440414507772e-07, "loss": -0.0005, "reward": 1.9742512702941895, "reward_std": 0.019844198538294222, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4742513597011566, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 0.9481865284974094, "grad_norm": 12.382828976830927, "kl": 0.35009765625, "learning_rate": 9.051813471502591e-07, "loss": 0.0013, "reward": 2.3749510049819946, "reward_std": 0.23152522772352313, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749509453773499, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 0.9507772020725389, "grad_norm": 30.874312148427126, "kl": 0.1866455078125, "learning_rate": 9.04922279792746e-07, "loss": 0.0005, "reward": 2.4374895095825195, "reward_std": 0.1767997801786123, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374893307685852, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9533678756476683, "grad_norm": 23.28510947812026, "kl": 0.10302734375, "learning_rate": 9.046632124352331e-07, "loss": 0.0007, "reward": 2.312163472175598, "reward_std": 0.2592373712662379, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8121635913848877, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 54.125, "epoch": 0.9559585492227979, "grad_norm": 71.63604226718203, "kl": 0.375, "learning_rate": 9.044041450777201e-07, "loss": 0.0018, "reward": 1.9997397661209106, "reward_std": 0.00011051230831071734, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997397661209106, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 130.75, "epoch": 0.9585492227979274, "grad_norm": 32.84832752416035, "kl": 0.30029296875, "learning_rate": 9.041450777202072e-07, "loss": 0.0009, "reward": 2.2491402626037598, "reward_std": 0.2681685869995363, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7491403818130493, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 38.1875, "epoch": 0.961139896373057, "grad_norm": 2.5840078247475637, "kl": 0.462890625, "learning_rate": 9.038860103626943e-07, "loss": 0.0026, "reward": 2.4999678134918213, "reward_std": 1.611490870345733e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999678134918213, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 0.9637305699481865, "grad_norm": 29.078033392596925, "kl": 0.5625, "learning_rate": 9.036269430051813e-07, "loss": 0.003, "reward": 2.374996304512024, "reward_std": 0.3535534933473059, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749961853027344, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 38.125, "epoch": 0.966321243523316, "grad_norm": 1.1845156336318436, "kl": 0.51171875, "learning_rate": 9.033678756476683e-07, "loss": 0.0033, "reward": 2.4999794960021973, "reward_std": 1.1057275060011307e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999793767929077, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 36.75, "epoch": 0.9689119170984456, "grad_norm": 6.777450373373138, "kl": 0.5087890625, "learning_rate": 9.031088082901554e-07, "loss": 0.002, "reward": 1.26322603225708, "reward_std": 0.0005494275537785143, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7632260620594025, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 0.9715025906735751, "grad_norm": 23.262872541009806, "kl": 0.380859375, "learning_rate": 9.028497409326424e-07, "loss": 0.0009, "reward": 2.4999603033065796, "reward_std": 1.9874356780746893e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999604225158691, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 0.9740932642487047, "grad_norm": 21.620167505249153, "kl": 0.4990234375, "learning_rate": 9.025906735751295e-07, "loss": 0.0021, "reward": 1.999347448348999, "reward_std": 0.00039217513040057383, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993475079536438, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 0.9766839378238342, "grad_norm": 4.297592416675427, "kl": 0.427734375, "learning_rate": 9.023316062176165e-07, "loss": 0.0011, "reward": 2.498944878578186, "reward_std": 5.1105109150739736e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9989449977874756, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 0.9792746113989638, "grad_norm": 0.10963193187441017, "kl": 0.58203125, "learning_rate": 9.020725388601036e-07, "loss": 0.0029, "reward": 2.4999935626983643, "reward_std": 1.9313748111926543e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 125.0, "epoch": 0.9818652849740933, "grad_norm": 0.18289344838834914, "kl": 0.4990234375, "learning_rate": 9.018134715025906e-07, "loss": 0.0033, "reward": 2.4999974966049194, "reward_std": 1.3051088956217427e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 0.9844559585492227, "grad_norm": 33.261525665083816, "kl": 0.48828125, "learning_rate": 9.015544041450776e-07, "loss": 0.0026, "reward": 2.0619019865989685, "reward_std": 0.17701144925376866, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5619019269943237, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.9870466321243523, "grad_norm": 49.81181552790332, "kl": 0.533203125, "learning_rate": 9.012953367875647e-07, "loss": 0.0029, "reward": 1.9975403547286987, "reward_std": 0.0008709488830618284, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975402057170868, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 59.625, "epoch": 0.9896373056994818, "grad_norm": 9.32163881763997, "kl": 0.50390625, "learning_rate": 9.010362694300517e-07, "loss": 0.0022, "reward": 2.4362341165542603, "reward_std": 0.1787196119776695, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.936234176158905, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 51.1875, "epoch": 0.9922279792746114, "grad_norm": 0.2710241267034931, "kl": 0.3740234375, "learning_rate": 9.007772020725389e-07, "loss": 0.0015, "reward": 2.4999834299087524, "reward_std": 4.688734179580933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999833703041077, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.9948186528497409, "grad_norm": 66.49114504862428, "kl": 0.4873046875, "learning_rate": 9.00518134715026e-07, "loss": 0.0012, "reward": 1.8325021862983704, "reward_std": 0.01290575864567245, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.33250230550766, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 0.9974093264248705, "grad_norm": 0.1810698960485993, "kl": 0.650390625, "learning_rate": 9.002590673575129e-07, "loss": 0.0023, "reward": 2.4999977350234985, "reward_std": 1.1945232358812063e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.0, "grad_norm": 29.707008159584515, "kl": 0.57421875, "learning_rate": 9e-07, "loss": 0.0025, "reward": 1.4975524544715881, "reward_std": 0.00045794826291967183, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9975524842739105, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 1.0025906735751295, "grad_norm": 45.469961368385135, "kl": 0.6484375, "learning_rate": 8.99740932642487e-07, "loss": 0.0026, "reward": 1.8603761792182922, "reward_std": 0.24354034196585417, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3603761792182922, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 1.005181347150259, "grad_norm": 53.72483151818067, "kl": 0.625, "learning_rate": 8.994818652849741e-07, "loss": 0.0022, "reward": 1.9684478044509888, "reward_std": 0.002413144509318954, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4684478044509888, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.0077720207253886, "grad_norm": 9.945812544065877, "kl": 0.705078125, "learning_rate": 8.992227979274612e-07, "loss": 0.0021, "reward": 2.4996620416641235, "reward_std": 0.00017333832738586352, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999662160873413, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.0103626943005182, "grad_norm": 1.5420717265631745, "kl": 0.564453125, "learning_rate": 8.989637305699482e-07, "loss": 0.0023, "reward": 2.499967575073242, "reward_std": 1.4706164165545488e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999675750732422, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 1.0129533678756477, "grad_norm": 14.531248838126801, "kl": 0.7265625, "learning_rate": 8.987046632124352e-07, "loss": 0.0034, "reward": 1.9970236420631409, "reward_std": 4.911084261038923e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970235526561737, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 37.25, "epoch": 1.0155440414507773, "grad_norm": 0.45996348441778995, "kl": 0.517578125, "learning_rate": 8.984455958549222e-07, "loss": 0.0031, "reward": 2.4999932050704956, "reward_std": 3.846457047984586e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.0181347150259068, "grad_norm": 11.459073491028992, "kl": 0.623046875, "learning_rate": 8.981865284974093e-07, "loss": 0.0017, "reward": 1.9960216879844666, "reward_std": 0.0004672125055549259, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4960218369960785, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.0207253886010363, "grad_norm": 22.455212038246163, "kl": 0.61328125, "learning_rate": 8.979274611398964e-07, "loss": 0.003, "reward": 2.4990986585617065, "reward_std": 0.0003687833091134962, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9990987181663513, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 41.625, "epoch": 1.0233160621761659, "grad_norm": 5.905953781526967, "kl": 0.46875, "learning_rate": 8.976683937823834e-07, "loss": 0.0018, "reward": 1.9978543519973755, "reward_std": 5.816556586069055e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978542923927307, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.0259067357512954, "grad_norm": 0.04104486481078809, "kl": 0.455078125, "learning_rate": 8.974093264248705e-07, "loss": 0.0015, "reward": 2.4999979734420776, "reward_std": 1.2768821306963218e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.028497409326425, "grad_norm": 2.788787039774178, "kl": 0.4814453125, "learning_rate": 8.971502590673574e-07, "loss": 0.0026, "reward": 1.8587602376937866, "reward_std": 0.00023226516964314214, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3587602376937866, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.0310880829015545, "grad_norm": 7.050968321704338, "kl": 0.537109375, "learning_rate": 8.968911917098445e-07, "loss": 0.0021, "reward": 1.9524061679840088, "reward_std": 0.0005537799224839546, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.452406108379364, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 1.0336787564766838, "grad_norm": 9.786721511451008, "kl": 0.552734375, "learning_rate": 8.966321243523316e-07, "loss": 0.0014, "reward": 1.9385772347450256, "reward_std": 0.020249723491588156, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4385771751403809, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0362694300518134, "grad_norm": 1.1174958808199673, "kl": 0.685546875, "learning_rate": 8.963730569948186e-07, "loss": 0.0026, "reward": 2.499992251396179, "reward_std": 6.725646358063386e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 1.038860103626943, "grad_norm": 10.420625106914656, "kl": 0.5654296875, "learning_rate": 8.961139896373057e-07, "loss": 0.0018, "reward": 1.9085689783096313, "reward_std": 0.08361113999103509, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.408569097518921, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.0414507772020725, "grad_norm": 16.661140490263776, "kl": 0.5419921875, "learning_rate": 8.958549222797928e-07, "loss": 0.0014, "reward": 2.249204397201538, "reward_std": 0.26810313327683843, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7492044568061829, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 37.0625, "epoch": 1.044041450777202, "grad_norm": 4.294950913797158, "kl": 0.716796875, "learning_rate": 8.955958549222797e-07, "loss": 0.0031, "reward": 2.4998929500579834, "reward_std": 2.334502892153978e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998928904533386, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 1.0466321243523315, "grad_norm": 27.148195390421172, "kl": 0.62109375, "learning_rate": 8.953367875647668e-07, "loss": 0.0025, "reward": 1.8907556533813477, "reward_std": 0.6998912990093231, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.39075568318367, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.049222797927461, "grad_norm": 7.5745641112574775, "kl": 0.615234375, "learning_rate": 8.950777202072538e-07, "loss": 0.0034, "reward": 2.499971628189087, "reward_std": 3.966803825505849e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999716877937317, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.0518134715025906, "grad_norm": 8.574687735517635, "kl": 0.5458984375, "learning_rate": 8.948186528497409e-07, "loss": 0.0016, "reward": 1.9763970971107483, "reward_std": 0.0007562593152670161, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4763973355293274, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0544041450777202, "grad_norm": 12.381607101703585, "kl": 0.6328125, "learning_rate": 8.94559585492228e-07, "loss": 0.0019, "reward": 1.999618113040924, "reward_std": 6.319640033325413e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996181726455688, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.0569948186528497, "grad_norm": 30.649173320963342, "kl": 0.60546875, "learning_rate": 8.94300518134715e-07, "loss": 0.0023, "reward": 2.374837040901184, "reward_std": 0.23174897913486348, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874836802482605, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 1.0595854922279793, "grad_norm": 53.58033242243849, "kl": 0.544921875, "learning_rate": 8.94041450777202e-07, "loss": 0.0016, "reward": 1.5641858577728271, "reward_std": 0.11900429461229578, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0641858726739883, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 1.0621761658031088, "grad_norm": 16.981917867834103, "kl": 0.51953125, "learning_rate": 8.93782383419689e-07, "loss": 0.0023, "reward": 2.4373676776885986, "reward_std": 0.1771187965750869, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937367558479309, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.0647668393782384, "grad_norm": 59.632350131557324, "kl": 0.61328125, "learning_rate": 8.935233160621761e-07, "loss": 0.0023, "reward": 2.312239170074463, "reward_std": 0.25913637741905404, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812239170074463, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.067357512953368, "grad_norm": 19.746766655771594, "kl": 0.609375, "learning_rate": 8.932642487046632e-07, "loss": 0.0023, "reward": 1.5079607367515564, "reward_std": 0.21990992432984058, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0079606771469116, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 39.0625, "epoch": 1.0699481865284974, "grad_norm": 271.43743837629995, "kl": 0.759765625, "learning_rate": 8.930051813471502e-07, "loss": 0.0038, "reward": 1.9984516501426697, "reward_std": 0.0011134283909086662, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984516203403473, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.072538860103627, "grad_norm": 14.557600871190271, "kl": 0.630859375, "learning_rate": 8.927461139896373e-07, "loss": 0.0023, "reward": 2.4998329877853394, "reward_std": 0.0004553854503228649, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998328685760498, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.0751295336787565, "grad_norm": 0.24086475817146616, "kl": 0.5439453125, "learning_rate": 8.924870466321242e-07, "loss": 0.0023, "reward": 2.499993324279785, "reward_std": 5.703721171812504e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.077720207253886, "grad_norm": 3.0941833131651415, "kl": 0.53515625, "learning_rate": 8.922279792746113e-07, "loss": 0.0014, "reward": 2.4999892711639404, "reward_std": 1.0688217798815458e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.0803108808290156, "grad_norm": 1.9684168216872286, "kl": 0.59375, "learning_rate": 8.919689119170984e-07, "loss": 0.0026, "reward": 1.998124897480011, "reward_std": 3.422750296522281e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981248378753662, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.0829015544041452, "grad_norm": 0.5378599916450096, "kl": 0.537109375, "learning_rate": 8.917098445595854e-07, "loss": 0.0018, "reward": 2.499990701675415, "reward_std": 7.378631266874436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999907612800598, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.0854922279792747, "grad_norm": 357.337501055031, "kl": 0.583984375, "learning_rate": 8.914507772020725e-07, "loss": 0.0023, "reward": 1.866849660873413, "reward_std": 0.23362560383975506, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3668497204780579, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.0880829015544042, "grad_norm": 0.8050240333605199, "kl": 0.4716796875, "learning_rate": 8.911917098445595e-07, "loss": 0.0026, "reward": 2.499972105026245, "reward_std": 6.230813937690982e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999719262123108, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.0906735751295338, "grad_norm": 0.4140165665242631, "kl": 0.60546875, "learning_rate": 8.909326424870465e-07, "loss": 0.0031, "reward": 2.4999914169311523, "reward_std": 7.973819720064057e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.093264248704663, "grad_norm": 0.4750945620029414, "kl": 0.587890625, "learning_rate": 8.906735751295336e-07, "loss": 0.0011, "reward": 2.4999895095825195, "reward_std": 6.6680502186500235e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999897480010986, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 52.375, "epoch": 1.0958549222797926, "grad_norm": 0.41839544167048026, "kl": 0.4140625, "learning_rate": 8.904145077720206e-07, "loss": 0.0014, "reward": 2.4999955892562866, "reward_std": 3.7826586094524828e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 1.0984455958549222, "grad_norm": 4.789196290460638, "kl": 0.564453125, "learning_rate": 8.901554404145077e-07, "loss": 0.0017, "reward": 1.998329520225525, "reward_std": 4.748090645989578e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983296990394592, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 1.1010362694300517, "grad_norm": 72.60514767800034, "kl": 0.6005859375, "learning_rate": 8.898963730569949e-07, "loss": 0.0024, "reward": 1.993331789970398, "reward_std": 0.005676061989561276, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4933317303657532, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 1.1036269430051813, "grad_norm": 8.374351546892886, "kl": 0.677734375, "learning_rate": 8.896373056994819e-07, "loss": 0.0033, "reward": 2.222361743450165, "reward_std": 0.38330446022882825, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7223615646362305, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.1062176165803108, "grad_norm": 25.997912166297766, "kl": 0.5625, "learning_rate": 8.893782383419689e-07, "loss": 0.0021, "reward": 2.249489903450012, "reward_std": 0.2678053895274104, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7494899034500122, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.1088082901554404, "grad_norm": 4.553423918142021, "kl": 0.564453125, "learning_rate": 8.891191709844559e-07, "loss": 0.0022, "reward": 1.9992722272872925, "reward_std": 0.00012518943003669847, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499272346496582, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.11139896373057, "grad_norm": 16.31867375989807, "kl": 0.58203125, "learning_rate": 8.88860103626943e-07, "loss": 0.0017, "reward": 2.0617632269859314, "reward_std": 0.17707666491787677, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5617634057998657, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 1.1139896373056994, "grad_norm": 4.314569456559592, "kl": 0.6328125, "learning_rate": 8.886010362694301e-07, "loss": 0.002, "reward": 2.499899983406067, "reward_std": 4.3456856190005055e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999900221824646, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.116580310880829, "grad_norm": 0.9252262601395185, "kl": 0.57421875, "learning_rate": 8.883419689119171e-07, "loss": 0.0009, "reward": 2.499980926513672, "reward_std": 1.0860051816052874e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999810457229614, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 1.1191709844559585, "grad_norm": 275.2594357969682, "kl": 0.578125, "learning_rate": 8.880829015544042e-07, "loss": 0.0023, "reward": 1.5035042762756348, "reward_std": 0.24922202248126268, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0035044252872467, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 38.0, "epoch": 1.121761658031088, "grad_norm": 13.419891914638564, "kl": 0.6796875, "learning_rate": 8.878238341968911e-07, "loss": 0.0039, "reward": 2.3748362064361572, "reward_std": 0.23171381004135583, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748360872268677, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.1243523316062176, "grad_norm": 0.15272716501907646, "kl": 0.615234375, "learning_rate": 8.875647668393782e-07, "loss": 0.0014, "reward": 2.4999972581863403, "reward_std": 2.363373369007604e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.1269430051813472, "grad_norm": 1.0453922050355886, "kl": 0.685546875, "learning_rate": 8.873056994818653e-07, "loss": 0.0023, "reward": 1.9999256134033203, "reward_std": 7.939465618278518e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499925673007965, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 1.1295336787564767, "grad_norm": 21.78759291570729, "kl": 0.669921875, "learning_rate": 8.870466321243523e-07, "loss": 0.0028, "reward": 2.4374330043792725, "reward_std": 0.1769484130942942, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937432885169983, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 1.1321243523316062, "grad_norm": 19.59687312597213, "kl": 0.650390625, "learning_rate": 8.867875647668394e-07, "loss": 0.0028, "reward": 2.047846496105194, "reward_std": 0.18229154073196696, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5478463768959045, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.1347150259067358, "grad_norm": 1.139946823581814, "kl": 0.6640625, "learning_rate": 8.865284974093264e-07, "loss": 0.0015, "reward": 2.4999805688858032, "reward_std": 1.1830166840809397e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999806880950928, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1373056994818653, "grad_norm": 19.897935930384396, "kl": 0.625, "learning_rate": 8.862694300518134e-07, "loss": 0.0026, "reward": 1.4810757040977478, "reward_std": 0.0005743358087784145, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9810757637023926, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.1398963730569949, "grad_norm": 3.985587001020023, "kl": 0.568359375, "learning_rate": 8.860103626943005e-07, "loss": 0.0018, "reward": 1.9168751239776611, "reward_std": 0.0002372907304106775, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4168752431869507, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.1424870466321244, "grad_norm": 0.27211269883772987, "kl": 0.55859375, "learning_rate": 8.857512953367875e-07, "loss": 0.0022, "reward": 2.49999463558197, "reward_std": 5.12884531644886e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.145077720207254, "grad_norm": 0.21559155144648293, "kl": 0.578125, "learning_rate": 8.854922279792746e-07, "loss": 0.0025, "reward": 2.499993681907654, "reward_std": 2.7186021043235087e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.1476683937823835, "grad_norm": 3.0237862827404274, "kl": 0.58203125, "learning_rate": 8.852331606217616e-07, "loss": 0.0024, "reward": 2.4987964630126953, "reward_std": 3.004723203048343e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9987964630126953, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.150259067357513, "grad_norm": 0.08148788614493192, "kl": 0.5859375, "learning_rate": 8.849740932642487e-07, "loss": 0.0026, "reward": 2.499997854232788, "reward_std": 1.4008281254973554e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 1.1528497409326426, "grad_norm": 17.111288091510296, "kl": 0.6064453125, "learning_rate": 8.847150259067357e-07, "loss": 0.0024, "reward": 2.218728244304657, "reward_std": 0.4519421654002116, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.749978244304657, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1554404145077721, "grad_norm": 8.381948229299203, "kl": 0.59375, "learning_rate": 8.844559585492227e-07, "loss": 0.0028, "reward": 1.8481711745262146, "reward_std": 0.00017754426380633959, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.348171055316925, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.1580310880829017, "grad_norm": 30.074596470120774, "kl": 0.63671875, "learning_rate": 8.841968911917098e-07, "loss": 0.0023, "reward": 1.4861799478530884, "reward_std": 0.00011329373228363693, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9861800372600555, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 41.875, "epoch": 1.160621761658031, "grad_norm": 0.11263967947740311, "kl": 0.4619140625, "learning_rate": 8.839378238341969e-07, "loss": 0.0023, "reward": 2.4999964237213135, "reward_std": 1.8359672822043649e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.1632124352331605, "grad_norm": 22.290196316670027, "kl": 0.6484375, "learning_rate": 8.836787564766839e-07, "loss": 0.0025, "reward": 2.1241907477378845, "reward_std": 0.2319577410335114, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6241907477378845, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.16580310880829, "grad_norm": 38.84084459393819, "kl": 0.560546875, "learning_rate": 8.83419689119171e-07, "loss": 0.0028, "reward": 1.9990530610084534, "reward_std": 0.000834117460442485, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990528523921967, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 49.875, "epoch": 1.1683937823834196, "grad_norm": 0.25641148307535555, "kl": 0.501953125, "learning_rate": 8.831606217616579e-07, "loss": 0.0004, "reward": 2.499998450279236, "reward_std": 9.724700760216365e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 1.1709844559585492, "grad_norm": 0.2697129682195001, "kl": 0.5361328125, "learning_rate": 8.82901554404145e-07, "loss": 0.0024, "reward": 2.4999947547912598, "reward_std": 3.5958451007900294e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.1735751295336787, "grad_norm": 67.80492797364819, "kl": 0.572265625, "learning_rate": 8.826424870466321e-07, "loss": 0.003, "reward": 1.8329634070396423, "reward_std": 0.013958233583821311, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3329634070396423, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.1761658031088082, "grad_norm": 31.966179255023054, "kl": 0.53515625, "learning_rate": 8.823834196891191e-07, "loss": 0.002, "reward": 2.096955180168152, "reward_std": 0.248760860066227, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5969551801681519, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.1787564766839378, "grad_norm": 8.800900750686791, "kl": 0.5263671875, "learning_rate": 8.821243523316062e-07, "loss": 0.0024, "reward": 1.9851295948028564, "reward_std": 0.0001500832782994621, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4851295053958893, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.1813471502590673, "grad_norm": 2.4012922862010755, "kl": 0.609375, "learning_rate": 8.818652849740932e-07, "loss": 0.0024, "reward": 2.4999899864196777, "reward_std": 5.653961466123292e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999900460243225, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 1.1839378238341969, "grad_norm": 14.266569532549147, "kl": 0.625, "learning_rate": 8.816062176165802e-07, "loss": 0.0034, "reward": 1.9816468358039856, "reward_std": 0.009951436694336735, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.481646716594696, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 1.1865284974093264, "grad_norm": 3.3648038092910295, "kl": 0.80078125, "learning_rate": 8.813471502590673e-07, "loss": 0.004, "reward": 2.499940872192383, "reward_std": 2.0321308625170786e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999408721923828, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.189119170984456, "grad_norm": 0.3459096375428303, "kl": 0.5625, "learning_rate": 8.810880829015543e-07, "loss": 0.003, "reward": 2.4999977350234985, "reward_std": 2.304723551560528e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.1917098445595855, "grad_norm": 18.534741740371537, "kl": 0.603515625, "learning_rate": 8.808290155440414e-07, "loss": 0.0023, "reward": 2.2494969367980957, "reward_std": 0.2677713489587177, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7494969964027405, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.194300518134715, "grad_norm": 0.44586623272369225, "kl": 0.67578125, "learning_rate": 8.805699481865284e-07, "loss": 0.0024, "reward": 2.4999945163726807, "reward_std": 4.996632696929737e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 36.8125, "epoch": 1.1968911917098446, "grad_norm": 62.052420630776396, "kl": 0.794921875, "learning_rate": 8.803108808290155e-07, "loss": 0.0032, "reward": 1.9643994569778442, "reward_std": 0.026802264872515025, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4643995761871338, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1994818652849741, "grad_norm": 0.14513054602257722, "kl": 0.4697265625, "learning_rate": 8.800518134715025e-07, "loss": 0.0013, "reward": 2.499996542930603, "reward_std": 1.136028657811039e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2020725388601037, "grad_norm": 0.47465832674344294, "kl": 0.552734375, "learning_rate": 8.797927461139895e-07, "loss": 0.0018, "reward": 2.4999970197677612, "reward_std": 1.81452435299434e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2046632124352332, "grad_norm": 0.033592290839851095, "kl": 0.61328125, "learning_rate": 8.795336787564766e-07, "loss": 0.0028, "reward": 2.499999523162842, "reward_std": 5.936531266570455e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999995827674866, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.2072538860103628, "grad_norm": 0.09835597607462437, "kl": 0.57421875, "learning_rate": 8.792746113989636e-07, "loss": 0.0025, "reward": 2.4999938011169434, "reward_std": 2.209989517609756e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.2098445595854923, "grad_norm": 10.49853774579234, "kl": 0.6953125, "learning_rate": 8.790155440414507e-07, "loss": 0.0025, "reward": 2.124144971370697, "reward_std": 0.2319839366459746, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6241450905799866, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2124352331606219, "grad_norm": 23.538863181556497, "kl": 0.734375, "learning_rate": 8.787564766839379e-07, "loss": 0.0029, "reward": 1.5312804579734802, "reward_std": 0.1817591809667647, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0312804579734802, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.2150259067357512, "grad_norm": 0.17823457849634575, "kl": 0.51953125, "learning_rate": 8.784974093264247e-07, "loss": 0.0028, "reward": 2.499996304512024, "reward_std": 2.998301226853073e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2176165803108807, "grad_norm": 0.1522134884560947, "kl": 0.591796875, "learning_rate": 8.782383419689119e-07, "loss": 0.0019, "reward": 2.4999972581863403, "reward_std": 1.3038910537943593e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2202072538860103, "grad_norm": 7.2806138913905025, "kl": 0.509765625, "learning_rate": 8.77979274611399e-07, "loss": 0.0018, "reward": 1.9776080250740051, "reward_std": 0.00011458025335286948, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4776080250740051, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2227979274611398, "grad_norm": 0.08450419896830544, "kl": 0.5732421875, "learning_rate": 8.77720207253886e-07, "loss": 0.0029, "reward": 2.4999983310699463, "reward_std": 1.1394291732358397e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.2253886010362693, "grad_norm": 2.526372335200411, "kl": 0.642578125, "learning_rate": 8.774611398963731e-07, "loss": 0.0026, "reward": 2.4999881982803345, "reward_std": 8.376503785711975e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2279792746113989, "grad_norm": 2.862368796034517, "kl": 0.4736328125, "learning_rate": 8.772020725388601e-07, "loss": 0.0024, "reward": 1.9990586042404175, "reward_std": 2.572512323695264e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990585148334503, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.2305699481865284, "grad_norm": 12.073158142023667, "kl": 0.546875, "learning_rate": 8.769430051813471e-07, "loss": 0.0025, "reward": 2.06146103143692, "reward_std": 0.17719211281200842, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.561461091041565, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.233160621761658, "grad_norm": 0.3280514305180032, "kl": 0.623046875, "learning_rate": 8.766839378238342e-07, "loss": 0.0025, "reward": 2.499995470046997, "reward_std": 2.629326104397478e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.2357512953367875, "grad_norm": 16.00236452166824, "kl": 0.505859375, "learning_rate": 8.764248704663212e-07, "loss": 0.0019, "reward": 2.4372899532318115, "reward_std": 0.17736581388699335, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937290072441101, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.238341968911917, "grad_norm": 49.00956074530929, "kl": 0.638671875, "learning_rate": 8.761658031088083e-07, "loss": 0.0026, "reward": 1.99703848361969, "reward_std": 0.0035216359392507, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970384240150452, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2409326424870466, "grad_norm": 29.841107084135555, "kl": 0.591796875, "learning_rate": 8.759067357512953e-07, "loss": 0.0025, "reward": 2.2133166790008545, "reward_std": 0.306481953538821, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.713316559791565, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2435233160621761, "grad_norm": 3.6449346905102846, "kl": 0.7109375, "learning_rate": 8.756476683937824e-07, "loss": 0.0028, "reward": 2.499991774559021, "reward_std": 2.7494734240463004e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.2461139896373057, "grad_norm": 0.8438294247120333, "kl": 0.58203125, "learning_rate": 8.753886010362695e-07, "loss": 0.0023, "reward": 2.4999972581863403, "reward_std": 1.6779780480646878e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2487046632124352, "grad_norm": 0.1371898226035424, "kl": 0.5302734375, "learning_rate": 8.751295336787564e-07, "loss": 0.0017, "reward": 2.4999977350234985, "reward_std": 1.5734124900745883e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.2512953367875648, "grad_norm": 0.27941417018387393, "kl": 0.671875, "learning_rate": 8.748704663212435e-07, "loss": 0.0028, "reward": 2.4999961853027344, "reward_std": 1.7125852878052683e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2538860103626943, "grad_norm": 2.6050909877486794, "kl": 0.634765625, "learning_rate": 8.746113989637305e-07, "loss": 0.0034, "reward": 2.4999419450759888, "reward_std": 1.2323869782449037e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999941885471344, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.2564766839378239, "grad_norm": 17.068698259955003, "kl": 0.646484375, "learning_rate": 8.743523316062176e-07, "loss": 0.0027, "reward": 1.9983042478561401, "reward_std": 6.174851114337798e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983042478561401, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2590673575129534, "grad_norm": 0.42066089459394507, "kl": 0.607421875, "learning_rate": 8.740932642487047e-07, "loss": 0.004, "reward": 2.499998927116394, "reward_std": 9.661517736958558e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.261658031088083, "grad_norm": 12.386392606268613, "kl": 0.48046875, "learning_rate": 8.738341968911916e-07, "loss": 0.0021, "reward": 1.9986199140548706, "reward_std": 2.6706332391768228e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986199140548706, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2642487046632125, "grad_norm": 0.9413582185518181, "kl": 0.51171875, "learning_rate": 8.735751295336787e-07, "loss": 0.002, "reward": 1.9993279576301575, "reward_std": 5.824303059398517e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993278980255127, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.266839378238342, "grad_norm": 4.272618000592426, "kl": 0.5546875, "learning_rate": 8.733160621761657e-07, "loss": 0.0016, "reward": 1.9996135234832764, "reward_std": 4.621013397354545e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996135532855988, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.2694300518134716, "grad_norm": 0.11713704968480952, "kl": 0.4609375, "learning_rate": 8.730569948186528e-07, "loss": 0.0024, "reward": 2.499995708465576, "reward_std": 2.034455633292964e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2720207253886011, "grad_norm": 4.542216806242224, "kl": 0.625, "learning_rate": 8.727979274611399e-07, "loss": 0.0025, "reward": 2.4999414682388306, "reward_std": 3.6450136576604564e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999414682388306, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 1.2746113989637307, "grad_norm": 23.230839322441255, "kl": 0.603515625, "learning_rate": 8.725388601036269e-07, "loss": 0.0024, "reward": 2.1241281032562256, "reward_std": 0.2319880039477482, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6241281628608704, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.2772020725388602, "grad_norm": 5.3260742439247055, "kl": 0.640625, "learning_rate": 8.72279792746114e-07, "loss": 0.0032, "reward": 1.9734973907470703, "reward_std": 0.00010697755791966301, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4734971225261688, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 1.2797927461139897, "grad_norm": 33.69591275200276, "kl": 0.478515625, "learning_rate": 8.720207253886009e-07, "loss": 0.001, "reward": 1.9605590105056763, "reward_std": 0.07004326995550514, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4605591893196106, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.2823834196891193, "grad_norm": 45.0222961673678, "kl": 0.611328125, "learning_rate": 8.71761658031088e-07, "loss": 0.0023, "reward": 2.374676823616028, "reward_std": 0.23204107198716883, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8746768832206726, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 1.2849740932642488, "grad_norm": 15.722600614184417, "kl": 0.544921875, "learning_rate": 8.715025906735751e-07, "loss": 0.0026, "reward": 1.9896942377090454, "reward_std": 0.001354626134798309, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4896942973136902, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.2875647668393784, "grad_norm": 10.770511710967282, "kl": 0.640625, "learning_rate": 8.712435233160621e-07, "loss": 0.003, "reward": 1.4990655183792114, "reward_std": 8.783236262388527e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9990654587745667, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2901554404145077, "grad_norm": 0.4393583407255433, "kl": 0.6015625, "learning_rate": 8.709844559585492e-07, "loss": 0.0021, "reward": 2.499993681907654, "reward_std": 3.075938337815387e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993622303009, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2927461139896372, "grad_norm": 0.35648075882810437, "kl": 0.548828125, "learning_rate": 8.707253886010363e-07, "loss": 0.002, "reward": 2.4999959468841553, "reward_std": 3.798614443439874e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2953367875647668, "grad_norm": 0.07936942305806453, "kl": 0.57421875, "learning_rate": 8.704663212435232e-07, "loss": 0.003, "reward": 2.4999983310699463, "reward_std": 7.136892890002855e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.2979274611398963, "grad_norm": 1.446875220700201, "kl": 0.580078125, "learning_rate": 8.702072538860103e-07, "loss": 0.0029, "reward": 2.499969244003296, "reward_std": 7.677880830669892e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999691247940063, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.3005181347150259, "grad_norm": 8.54978563235663, "kl": 0.599609375, "learning_rate": 8.699481865284973e-07, "loss": 0.0017, "reward": 1.9988346099853516, "reward_std": 3.518123412504792e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988347291946411, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.3031088082901554, "grad_norm": 0.45581560690399275, "kl": 0.58984375, "learning_rate": 8.696891191709844e-07, "loss": 0.0022, "reward": 2.4999970197677612, "reward_std": 1.6745199786782905e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.305699481865285, "grad_norm": 0.04467758285422141, "kl": 0.50390625, "learning_rate": 8.694300518134715e-07, "loss": 0.0019, "reward": 2.4999990463256836, "reward_std": 6.303886550540483e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.3082901554404145, "grad_norm": 0.06947867151443883, "kl": 0.525390625, "learning_rate": 8.691709844559585e-07, "loss": 0.003, "reward": 2.499996542930603, "reward_std": 1.7946805712654168e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.310880829015544, "grad_norm": 0.2980369041674649, "kl": 0.564453125, "learning_rate": 8.689119170984455e-07, "loss": 0.0025, "reward": 2.499996542930603, "reward_std": 2.154090282147081e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.3134715025906736, "grad_norm": 24.759761961946634, "kl": 0.64453125, "learning_rate": 8.686528497409325e-07, "loss": 0.0031, "reward": 1.9898204803466797, "reward_std": 0.0001572022486016067, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.489820420742035, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.3160621761658031, "grad_norm": 16.256309876146926, "kl": 0.61328125, "learning_rate": 8.683937823834196e-07, "loss": 0.0025, "reward": 1.9966996908187866, "reward_std": 0.00012456769127311418, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4966996610164642, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.3186528497409327, "grad_norm": 6.991762346904131, "kl": 0.599609375, "learning_rate": 8.681347150259068e-07, "loss": 0.0025, "reward": 1.9964443445205688, "reward_std": 0.0011499571760396066, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4964445233345032, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.3212435233160622, "grad_norm": 0.5995898829493257, "kl": 0.560546875, "learning_rate": 8.678756476683938e-07, "loss": 0.002, "reward": 2.499982476234436, "reward_std": 4.166895962498529e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999827146530151, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 1.3238341968911918, "grad_norm": 19.47851117072531, "kl": 0.560546875, "learning_rate": 8.676165803108809e-07, "loss": 0.0022, "reward": 1.208937406539917, "reward_std": 0.14962222333997488, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7089374363422394, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.3264248704663213, "grad_norm": 47.12850220404575, "kl": 0.693359375, "learning_rate": 8.673575129533677e-07, "loss": 0.0028, "reward": 1.8592469692230225, "reward_std": 0.23720298008993268, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3592469096183777, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 1.3290155440414508, "grad_norm": 37.414641996967816, "kl": 0.623046875, "learning_rate": 8.670984455958549e-07, "loss": 0.0028, "reward": 2.301492691040039, "reward_std": 0.2752895653525229, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8014929294586182, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 1.3316062176165804, "grad_norm": 0.21666667892284558, "kl": 0.662109375, "learning_rate": 8.66839378238342e-07, "loss": 0.0031, "reward": 2.4999942779541016, "reward_std": 2.754374321511932e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.33419689119171, "grad_norm": 0.37155277980157536, "kl": 0.697265625, "learning_rate": 8.66580310880829e-07, "loss": 0.0027, "reward": 2.499998092651367, "reward_std": 2.483462040459017e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.3367875647668392, "grad_norm": 1.9472748295902245, "kl": 0.5361328125, "learning_rate": 8.663212435233161e-07, "loss": 0.0017, "reward": 2.4999769926071167, "reward_std": 1.0777632894587441e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999771118164062, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.3393782383419688, "grad_norm": 0.19383666203584884, "kl": 0.6484375, "learning_rate": 8.660621761658031e-07, "loss": 0.0026, "reward": 2.499995470046997, "reward_std": 2.0626290790914936e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 1.3419689119170983, "grad_norm": 0.23332847249990504, "kl": 0.5654296875, "learning_rate": 8.658031088082901e-07, "loss": 0.0011, "reward": 2.499992609024048, "reward_std": 2.636243777942582e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 45.6875, "epoch": 1.3445595854922279, "grad_norm": 3.914646892014181, "kl": 0.41796875, "learning_rate": 8.655440414507772e-07, "loss": 0.0006, "reward": 2.499976396560669, "reward_std": 6.120356260908011e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976396560669, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.3471502590673574, "grad_norm": 22.511110900659286, "kl": 0.60546875, "learning_rate": 8.652849740932642e-07, "loss": 0.002, "reward": 1.9996398091316223, "reward_std": 0.00011680582065309864, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996399283409119, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.349740932642487, "grad_norm": 5.669669627096847, "kl": 0.54296875, "learning_rate": 8.650259067357513e-07, "loss": 0.0021, "reward": 1.497983455657959, "reward_std": 4.6898274376871996e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9979834854602814, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.3523316062176165, "grad_norm": 1.3136329334226196, "kl": 0.62109375, "learning_rate": 8.647668393782384e-07, "loss": 0.0033, "reward": 2.4999947547912598, "reward_std": 2.5336748308291135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.354922279792746, "grad_norm": 43.04303973678258, "kl": 0.513671875, "learning_rate": 8.645077720207254e-07, "loss": 0.0017, "reward": 2.186347484588623, "reward_std": 0.2591854878514823, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6863476037979126, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 49.875, "epoch": 1.3575129533678756, "grad_norm": 15.454257247206165, "kl": 0.4384765625, "learning_rate": 8.642487046632124e-07, "loss": 0.0019, "reward": 1.9239393472671509, "reward_std": 0.0253527188492626, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4239393770694733, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3601036269430051, "grad_norm": 6.634574213902974, "kl": 0.685546875, "learning_rate": 8.639896373056994e-07, "loss": 0.0033, "reward": 1.9995331168174744, "reward_std": 6.290257567798108e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49953293800354, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.3626943005181347, "grad_norm": 17.452722157009752, "kl": 0.556640625, "learning_rate": 8.637305699481865e-07, "loss": 0.0021, "reward": 2.3744312524795532, "reward_std": 0.23249074837985972, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8744314312934875, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3652849740932642, "grad_norm": 1.78256921218075, "kl": 0.5625, "learning_rate": 8.634715025906736e-07, "loss": 0.0022, "reward": 1.9988274574279785, "reward_std": 1.7268625356336997e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988275170326233, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.3678756476683938, "grad_norm": 6.6237721347641685, "kl": 0.638671875, "learning_rate": 8.632124352331606e-07, "loss": 0.0029, "reward": 2.4998873472213745, "reward_std": 2.7250078801444033e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998871684074402, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 1.3704663212435233, "grad_norm": 17.34261023701216, "kl": 2.048828125, "learning_rate": 8.629533678756477e-07, "loss": 0.008, "reward": 2.409585118293762, "reward_std": 0.255618360049084, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9095849394798279, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 1.3730569948186528, "grad_norm": 17.860459283242225, "kl": 0.67578125, "learning_rate": 8.626943005181346e-07, "loss": 0.0018, "reward": 2.238621711730957, "reward_std": 0.2808477502805431, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7386218905448914, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.3756476683937824, "grad_norm": 0.41303407149081817, "kl": 0.7890625, "learning_rate": 8.624352331606217e-07, "loss": 0.0025, "reward": 2.4999873638153076, "reward_std": 1.1987276707259298e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874234199524, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.378238341968912, "grad_norm": 0.12288930595573583, "kl": 0.580078125, "learning_rate": 8.621761658031088e-07, "loss": 0.0019, "reward": 2.4999948740005493, "reward_std": 7.171377092163311e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.3808290155440415, "grad_norm": 247.34445574106437, "kl": 0.51171875, "learning_rate": 8.619170984455958e-07, "loss": 0.002, "reward": 1.9598177671432495, "reward_std": 0.0012843398504571724, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4598178565502167, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.383419689119171, "grad_norm": 2.6063730956114215, "kl": 0.6015625, "learning_rate": 8.616580310880829e-07, "loss": 0.0026, "reward": 1.9964655637741089, "reward_std": 4.188081356915063e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4964656233787537, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 1.3860103626943006, "grad_norm": 14.602450519610512, "kl": 0.556640625, "learning_rate": 8.613989637305699e-07, "loss": 0.0019, "reward": 2.2257866859436035, "reward_std": 0.3784497884051632, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7257866859436035, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 96.5, "epoch": 1.38860103626943, "grad_norm": 22.10788659481788, "kl": 0.4912109375, "learning_rate": 8.611398963730569e-07, "loss": 0.0021, "reward": 2.4372280836105347, "reward_std": 0.17753830705669316, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372280836105347, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 51.0, "epoch": 1.3911917098445596, "grad_norm": 0.13620758604248384, "kl": 0.517578125, "learning_rate": 8.60880829015544e-07, "loss": 0.001, "reward": 2.499997615814209, "reward_std": 1.0291777527982049e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.3937823834196892, "grad_norm": 0.49435136817104774, "kl": 0.65234375, "learning_rate": 8.60621761658031e-07, "loss": 0.0028, "reward": 2.4999942779541016, "reward_std": 1.694198488166876e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 1.3963730569948187, "grad_norm": 56.529739975188036, "kl": 0.68359375, "learning_rate": 8.603626943005181e-07, "loss": 0.0028, "reward": 1.9373018741607666, "reward_std": 0.17690018775010685, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4373019337654114, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 1.3989637305699483, "grad_norm": 0.6636558505664327, "kl": 0.59375, "learning_rate": 8.601036269430051e-07, "loss": 0.0022, "reward": 2.4999836683273315, "reward_std": 1.075003638106864e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999837279319763, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.4015544041450778, "grad_norm": 21.24985694373075, "kl": 0.5966796875, "learning_rate": 8.598445595854922e-07, "loss": 0.0017, "reward": 2.499105930328369, "reward_std": 0.0003413122630036014, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9991058707237244, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 48.6875, "epoch": 1.4041450777202074, "grad_norm": 3.4641772706403366, "kl": 0.5625, "learning_rate": 8.595854922279792e-07, "loss": 0.0028, "reward": 2.374996304512024, "reward_std": 0.35355379785801233, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749964237213135, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.406735751295337, "grad_norm": 153.87330989747602, "kl": 0.564453125, "learning_rate": 8.593264248704662e-07, "loss": 0.0025, "reward": 1.8745841979980469, "reward_std": 0.09209507029297015, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3745842278003693, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4093264248704664, "grad_norm": 0.11631000251700786, "kl": 0.5166015625, "learning_rate": 8.590673575129533e-07, "loss": 0.0009, "reward": 2.4999964237213135, "reward_std": 1.7403886545253044e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 50.75, "epoch": 1.411917098445596, "grad_norm": 11.084036691025215, "kl": 0.4609375, "learning_rate": 8.588082901554404e-07, "loss": 0.0021, "reward": 2.4358290433883667, "reward_std": 0.18149413354001354, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9358289241790771, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 1.4145077720207253, "grad_norm": 38.8711323506586, "kl": 0.55859375, "learning_rate": 8.585492227979274e-07, "loss": 0.0019, "reward": 1.8141435384750366, "reward_std": 0.1593217192511247, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3141435384750366, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.4170984455958548, "grad_norm": 1.8137997440054463, "kl": 0.572265625, "learning_rate": 8.582901554404145e-07, "loss": 0.0022, "reward": 2.4999887943267822, "reward_std": 9.104116003300078e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887943267822, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.4196891191709844, "grad_norm": 16.22298151779105, "kl": 0.61328125, "learning_rate": 8.580310880829014e-07, "loss": 0.002, "reward": 1.9997638463974, "reward_std": 1.6737004671085742e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997639060020447, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.422279792746114, "grad_norm": 10.467033569969603, "kl": 0.591796875, "learning_rate": 8.577720207253885e-07, "loss": 0.0018, "reward": 2.0617659091949463, "reward_std": 0.17707010858262606, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5617661476135254, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.4248704663212435, "grad_norm": 0.3112781948701516, "kl": 0.6171875, "learning_rate": 8.575129533678756e-07, "loss": 0.0022, "reward": 2.499996304512024, "reward_std": 2.8055086431777454e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.427461139896373, "grad_norm": 15.665902286253937, "kl": 0.560546875, "learning_rate": 8.572538860103626e-07, "loss": 0.002, "reward": 1.993625283241272, "reward_std": 4.525115221554188e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4936253428459167, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 1.4300518134715026, "grad_norm": 23.41429133662071, "kl": 0.59375, "learning_rate": 8.569948186528498e-07, "loss": 0.0027, "reward": 1.882952868938446, "reward_std": 0.046902431076716766, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3829528391361237, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.432642487046632, "grad_norm": 8.987315557421686, "kl": 0.59765625, "learning_rate": 8.567357512953368e-07, "loss": 0.0021, "reward": 1.9986605644226074, "reward_std": 1.8590238767046685e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498660683631897, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.4352331606217616, "grad_norm": 46.845804119363514, "kl": 0.5224609375, "learning_rate": 8.564766839378238e-07, "loss": 0.0014, "reward": 1.9995248317718506, "reward_std": 0.00012432434778020252, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995249807834625, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.4378238341968912, "grad_norm": 0.3050211417656412, "kl": 0.595703125, "learning_rate": 8.562176165803109e-07, "loss": 0.0029, "reward": 2.499995231628418, "reward_std": 2.9612097591780184e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.4404145077720207, "grad_norm": 58.1642346866013, "kl": 0.6015625, "learning_rate": 8.559585492227979e-07, "loss": 0.0014, "reward": 2.437396287918091, "reward_std": 0.177044933285174, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.93739652633667, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.4430051813471503, "grad_norm": 7.5412535498441295, "kl": 0.474609375, "learning_rate": 8.55699481865285e-07, "loss": 0.0017, "reward": 1.999861478805542, "reward_std": 1.6503423012181884e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998615384101868, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 1.4455958549222798, "grad_norm": 3.9746579805075317, "kl": 0.51171875, "learning_rate": 8.55440414507772e-07, "loss": 0.0017, "reward": 1.9900933504104614, "reward_std": 4.736338996735867e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.490093320608139, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 1.4481865284974094, "grad_norm": 53.88710300132334, "kl": 0.658203125, "learning_rate": 8.551813471502591e-07, "loss": 0.0025, "reward": 1.7350038290023804, "reward_std": 0.3651373956381576, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2350038290023804, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 50.25, "epoch": 1.450777202072539, "grad_norm": 69.70438947157771, "kl": 0.43310546875, "learning_rate": 8.549222797927461e-07, "loss": 0.0016, "reward": 1.9831583499908447, "reward_std": 0.0034443593758624047, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4831584095954895, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 1.4533678756476685, "grad_norm": 42.26819556798007, "kl": 0.5546875, "learning_rate": 8.546632124352331e-07, "loss": 0.0032, "reward": 1.9047209024429321, "reward_std": 0.0753998734015795, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4047207236289978, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.455958549222798, "grad_norm": 33.073914814216636, "kl": 0.576171875, "learning_rate": 8.544041450777202e-07, "loss": 0.0024, "reward": 1.8826268911361694, "reward_std": 0.007231505942399963, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3826269209384918, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4585492227979275, "grad_norm": 6.41389116407757, "kl": 0.560546875, "learning_rate": 8.541450777202072e-07, "loss": 0.002, "reward": 1.9992753267288208, "reward_std": 5.61330285790973e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992753267288208, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.4611398963730569, "grad_norm": 3.7024580845299453, "kl": 0.6171875, "learning_rate": 8.538860103626943e-07, "loss": 0.0026, "reward": 1.998275637626648, "reward_std": 3.522502174746478e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982757270336151, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.4637305699481864, "grad_norm": 14.577589032448673, "kl": 0.5947265625, "learning_rate": 8.536269430051814e-07, "loss": 0.0018, "reward": 1.9945787191390991, "reward_std": 0.0003757158296622265, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4945788383483887, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.466321243523316, "grad_norm": 0.2539754840298736, "kl": 0.513671875, "learning_rate": 8.533678756476683e-07, "loss": 0.0015, "reward": 2.4999979734420776, "reward_std": 1.3269381327063456e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.4689119170984455, "grad_norm": 1.1902587400332436, "kl": 0.498046875, "learning_rate": 8.531088082901554e-07, "loss": 0.0014, "reward": 1.999872863292694, "reward_std": 8.49820055748296e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998729526996613, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.471502590673575, "grad_norm": 4.187539050599338, "kl": 0.552734375, "learning_rate": 8.528497409326425e-07, "loss": 0.0015, "reward": 1.8861017227172852, "reward_std": 0.00033210853226250947, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3861018419265747, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 1.4740932642487046, "grad_norm": 24.591999064387085, "kl": 0.4521484375, "learning_rate": 8.525906735751295e-07, "loss": 0.0024, "reward": 2.4373717308044434, "reward_std": 0.17713080061616893, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937371850013733, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.4766839378238341, "grad_norm": 0.5688787579612354, "kl": 0.53515625, "learning_rate": 8.523316062176166e-07, "loss": 0.0021, "reward": 2.4999924898147583, "reward_std": 2.1595516841443896e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926090240479, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 1.4792746113989637, "grad_norm": 15.29466972041705, "kl": 0.5341796875, "learning_rate": 8.520725388601036e-07, "loss": 0.0029, "reward": 2.0622364282608032, "reward_std": 0.17686682704083978, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5622363686561584, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.4818652849740932, "grad_norm": 19.023250225920926, "kl": 0.55859375, "learning_rate": 8.518134715025906e-07, "loss": 0.0023, "reward": 1.6861881613731384, "reward_std": 0.2591725378388219, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1861882209777832, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.4844559585492227, "grad_norm": 5.755423571938906, "kl": 0.60546875, "learning_rate": 8.515544041450777e-07, "loss": 0.002, "reward": 1.9973472952842712, "reward_std": 0.00013764822915618424, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4973473250865936, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 1.4870466321243523, "grad_norm": 91.90671730182062, "kl": 0.61328125, "learning_rate": 8.512953367875647e-07, "loss": 0.0025, "reward": 1.5483200550079346, "reward_std": 0.2582144923508167, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0483201146125793, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4896373056994818, "grad_norm": 15.370805671351556, "kl": 0.654296875, "learning_rate": 8.510362694300518e-07, "loss": 0.0026, "reward": 1.9986218214035034, "reward_std": 6.388745077856584e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986218214035034, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 1.4922279792746114, "grad_norm": 20.670444895369016, "kl": 0.5625, "learning_rate": 8.507772020725388e-07, "loss": 0.0025, "reward": 2.406248450279236, "reward_std": 0.2651615071664537, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9062485694885254, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 1.494818652849741, "grad_norm": 7.25093856693366, "kl": 0.74609375, "learning_rate": 8.505181347150259e-07, "loss": 0.0026, "reward": 2.4544568061828613, "reward_std": 0.05893796041209498, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.954456865787506, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.4974093264248705, "grad_norm": 10.342316218734036, "kl": 0.5859375, "learning_rate": 8.502590673575129e-07, "loss": 0.0021, "reward": 1.936093270778656, "reward_std": 0.17680319944520306, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4360932111740112, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5, "grad_norm": 25.117057921460443, "kl": 0.57421875, "learning_rate": 8.499999999999999e-07, "loss": 0.0023, "reward": 2.1866241097450256, "reward_std": 0.25947116597740205, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6866240501403809, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.5025906735751295, "grad_norm": 1.2006737240390517, "kl": 0.533203125, "learning_rate": 8.49740932642487e-07, "loss": 0.0022, "reward": 1.9997755289077759, "reward_std": 1.1002382848346315e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997754096984863, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.505181347150259, "grad_norm": 1.99567278031156, "kl": 0.455078125, "learning_rate": 8.49481865284974e-07, "loss": 0.001, "reward": 1.9986823201179504, "reward_std": 3.0938876761865686e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49868243932724, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 1.5077720207253886, "grad_norm": 0.3903268086173615, "kl": 0.54296875, "learning_rate": 8.492227979274611e-07, "loss": 0.0014, "reward": 2.499998092651367, "reward_std": 1.7405783694357524e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5103626943005182, "grad_norm": 15.918323704168337, "kl": 0.583984375, "learning_rate": 8.489637305699482e-07, "loss": 0.0017, "reward": 2.311471939086914, "reward_std": 0.25982454147646195, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8114719986915588, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.5129533678756477, "grad_norm": 5.504934188932366, "kl": 0.951171875, "learning_rate": 8.487046632124351e-07, "loss": 0.0037, "reward": 1.4473223686218262, "reward_std": 7.590972745674662e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9473223686218262, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 46.6875, "epoch": 1.5155440414507773, "grad_norm": 0.22756862462905705, "kl": 0.4248046875, "learning_rate": 8.484455958549222e-07, "loss": 0.0023, "reward": 2.4999831914901733, "reward_std": 1.374030262013548e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999831318855286, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5181347150259068, "grad_norm": 22.49588871677703, "kl": 0.63671875, "learning_rate": 8.481865284974092e-07, "loss": 0.0023, "reward": 1.9353505969047546, "reward_std": 0.17692831628119166, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4353505373001099, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.5207253886010363, "grad_norm": 0.8375372519919833, "kl": 0.599609375, "learning_rate": 8.479274611398963e-07, "loss": 0.003, "reward": 2.4999901056289673, "reward_std": 4.094197265658295e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990165233612, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 1.5233160621761659, "grad_norm": 0.6413716165348334, "kl": 0.603515625, "learning_rate": 8.476683937823834e-07, "loss": 0.0017, "reward": 2.4999550580978394, "reward_std": 4.155054057264351e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999550580978394, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5259067357512954, "grad_norm": 1.34150800406126, "kl": 0.580078125, "learning_rate": 8.474093264248704e-07, "loss": 0.0027, "reward": 2.4999938011169434, "reward_std": 7.260470454184542e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.528497409326425, "grad_norm": 0.45892241671222617, "kl": 0.603515625, "learning_rate": 8.471502590673574e-07, "loss": 0.0027, "reward": 2.4999929666519165, "reward_std": 4.291471896067378e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 1.5310880829015545, "grad_norm": 10.72181078288636, "kl": 0.61328125, "learning_rate": 8.468911917098444e-07, "loss": 0.0027, "reward": 1.9988294839859009, "reward_std": 0.001390402227116283, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988293051719666, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.533678756476684, "grad_norm": 1.1741308145001224, "kl": 0.5654296875, "learning_rate": 8.466321243523315e-07, "loss": 0.0021, "reward": 2.4999804496765137, "reward_std": 4.6286670425388365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999804496765137, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 37.25, "epoch": 1.5362694300518136, "grad_norm": 177.54942153376635, "kl": 0.568359375, "learning_rate": 8.463730569948186e-07, "loss": 0.0014, "reward": 2.1878397464752197, "reward_std": 0.34647539447155395, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6878398060798645, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.5388601036269431, "grad_norm": 0.49458218522748953, "kl": 0.58984375, "learning_rate": 8.461139896373056e-07, "loss": 0.0029, "reward": 2.499992847442627, "reward_std": 2.4279542003569077e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5414507772020727, "grad_norm": 0.23253768512039127, "kl": 0.59375, "learning_rate": 8.458549222797928e-07, "loss": 0.0024, "reward": 2.499993920326233, "reward_std": 2.8567873187057558e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.5440414507772022, "grad_norm": 0.3118479396370552, "kl": 0.4990234375, "learning_rate": 8.455958549222799e-07, "loss": 0.002, "reward": 2.499994397163391, "reward_std": 4.18366740007059e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 37.3125, "epoch": 1.5466321243523318, "grad_norm": 85.17562514809535, "kl": 0.560546875, "learning_rate": 8.453367875647668e-07, "loss": 0.0022, "reward": 1.1941595673561096, "reward_std": 0.01267794001614675, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.6941596865653992, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.549222797927461, "grad_norm": 0.6574820618018961, "kl": 0.640625, "learning_rate": 8.450777202072539e-07, "loss": 0.0028, "reward": 2.499974250793457, "reward_std": 4.362286517789471e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999741315841675, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.5518134715025906, "grad_norm": 13.617286704280167, "kl": 0.5087890625, "learning_rate": 8.448186528497409e-07, "loss": 0.0022, "reward": 1.4990816116333008, "reward_std": 0.0001413195550412638, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9990816414356232, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.5544041450777202, "grad_norm": 0.3575208436959802, "kl": 0.640625, "learning_rate": 8.44559585492228e-07, "loss": 0.0033, "reward": 2.499996542930603, "reward_std": 1.823465481720632e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5569948186528497, "grad_norm": 0.3477217808806215, "kl": 0.619140625, "learning_rate": 8.443005181347151e-07, "loss": 0.0039, "reward": 2.499996304512024, "reward_std": 2.7382543521525804e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5595854922279793, "grad_norm": 1028.6707492639712, "kl": 0.533203125, "learning_rate": 8.44041450777202e-07, "loss": 0.0021, "reward": 1.685690462589264, "reward_std": 0.26005479259765707, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1856905817985535, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5621761658031088, "grad_norm": 3.9016819608777, "kl": 0.775390625, "learning_rate": 8.437823834196891e-07, "loss": 0.0027, "reward": 1.997941017150879, "reward_std": 6.452282764257689e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4979411363601685, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5647668393782384, "grad_norm": 1.0348487332390275, "kl": 0.634765625, "learning_rate": 8.435233160621761e-07, "loss": 0.0025, "reward": 2.4999901056289673, "reward_std": 3.7165237927183625e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.567357512953368, "grad_norm": 3.5054547601429777, "kl": 0.638671875, "learning_rate": 8.432642487046632e-07, "loss": 0.0023, "reward": 1.999354600906372, "reward_std": 2.056602784250572e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499354600906372, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5699481865284974, "grad_norm": 4.180741402794605, "kl": 0.646484375, "learning_rate": 8.430051813471503e-07, "loss": 0.0027, "reward": 1.9984654188156128, "reward_std": 5.189365811020252e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984654784202576, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.572538860103627, "grad_norm": 6.511333949335672, "kl": 0.564453125, "learning_rate": 8.427461139896373e-07, "loss": 0.0015, "reward": 1.9713775515556335, "reward_std": 0.00032928195008707917, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4713777601718903, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.5751295336787565, "grad_norm": 0.851368865259266, "kl": 0.578125, "learning_rate": 8.424870466321244e-07, "loss": 0.0016, "reward": 2.4999951124191284, "reward_std": 2.6659653826754948e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.577720207253886, "grad_norm": 2.401313060538958, "kl": 0.59765625, "learning_rate": 8.422279792746113e-07, "loss": 0.0025, "reward": 2.499996542930603, "reward_std": 3.17682274442177e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5803108808290154, "grad_norm": 22.408335305134244, "kl": 0.625, "learning_rate": 8.419689119170984e-07, "loss": 0.0024, "reward": 1.6848644614219666, "reward_std": 0.2596064087629202, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1848644614219666, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.582901554404145, "grad_norm": 0.7934342596367173, "kl": 0.6328125, "learning_rate": 8.417098445595855e-07, "loss": 0.0018, "reward": 2.4999955892562866, "reward_std": 3.369080786796985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.5854922279792745, "grad_norm": 1.2190930607788675, "kl": 0.6171875, "learning_rate": 8.414507772020725e-07, "loss": 0.0044, "reward": 2.499992847442627, "reward_std": 1.0025270285041188e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.588082901554404, "grad_norm": 1.9788631502470624, "kl": 0.580078125, "learning_rate": 8.411917098445596e-07, "loss": 0.0029, "reward": 2.49992573261261, "reward_std": 1.739458775773528e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999257326126099, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5906735751295336, "grad_norm": 66.4636776773515, "kl": 0.61328125, "learning_rate": 8.409326424870465e-07, "loss": 0.0026, "reward": 1.844853699207306, "reward_std": 0.008113989167839009, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3448537588119507, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 1.593264248704663, "grad_norm": 4.5449407661284615, "kl": 0.548828125, "learning_rate": 8.406735751295336e-07, "loss": 0.0015, "reward": 2.4998890161514282, "reward_std": 2.2897173948877025e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998891353607178, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 1.5958549222797926, "grad_norm": 4.889260865450319, "kl": 0.57421875, "learning_rate": 8.404145077720207e-07, "loss": 0.0031, "reward": 2.4999812841415405, "reward_std": 1.585746349519468e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999812245368958, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5984455958549222, "grad_norm": 3.879513218357302, "kl": 0.591796875, "learning_rate": 8.401554404145077e-07, "loss": 0.0021, "reward": 2.4374709129333496, "reward_std": 0.17677977942935286, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374709129333496, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.6010362694300517, "grad_norm": 0.9193491077956003, "kl": 0.6015625, "learning_rate": 8.398963730569948e-07, "loss": 0.0018, "reward": 2.4999923706054688, "reward_std": 5.7667124337967834e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 1.6036269430051813, "grad_norm": 59.638266860081764, "kl": 0.57421875, "learning_rate": 8.396373056994819e-07, "loss": 0.0022, "reward": 1.9537792205810547, "reward_std": 0.3371285290177184, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4537791907787323, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 36.8125, "epoch": 1.6062176165803108, "grad_norm": 12.293198905068529, "kl": 0.61328125, "learning_rate": 8.393782383419689e-07, "loss": 0.0019, "reward": 1.7936373949050903, "reward_std": 0.05805445067352366, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2936375141143799, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 1.6088082901554404, "grad_norm": 0.12750041877494056, "kl": 0.603515625, "learning_rate": 8.391191709844559e-07, "loss": 0.002, "reward": 1.9993314743041992, "reward_std": 9.451725162534785e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993314445018768, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.61139896373057, "grad_norm": 0.4071358932768193, "kl": 0.5556640625, "learning_rate": 8.388601036269429e-07, "loss": 0.0021, "reward": 2.4999969005584717, "reward_std": 1.5008283753559226e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 1.6139896373056994, "grad_norm": 25.8855488230219, "kl": 0.60546875, "learning_rate": 8.3860103626943e-07, "loss": 0.0026, "reward": 2.19793039560318, "reward_std": 0.3322799834838861, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6979303359985352, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 36.8125, "epoch": 1.616580310880829, "grad_norm": 43.373640455494865, "kl": 0.5615234375, "learning_rate": 8.383419689119171e-07, "loss": 0.0022, "reward": 2.184620440006256, "reward_std": 0.2611318569241803, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6846204996109009, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6191709844559585, "grad_norm": 15.306328056590775, "kl": 0.62890625, "learning_rate": 8.380829015544041e-07, "loss": 0.0034, "reward": 1.9979455471038818, "reward_std": 0.0009181858374347485, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497945487499237, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.621761658031088, "grad_norm": 1.4332009488081985, "kl": 0.6640625, "learning_rate": 8.378238341968912e-07, "loss": 0.0031, "reward": 2.49998939037323, "reward_std": 4.605430149240419e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6243523316062176, "grad_norm": 25.219623304383635, "kl": 0.50390625, "learning_rate": 8.375647668393781e-07, "loss": 0.0021, "reward": 1.9362186789512634, "reward_std": 0.17727814394856978, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4362186193466187, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6269430051813472, "grad_norm": 86.77118077040384, "kl": 0.6484375, "learning_rate": 8.373056994818652e-07, "loss": 0.0026, "reward": 1.998164415359497, "reward_std": 0.001018396318613668, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498164415359497, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6295336787564767, "grad_norm": 0.08239146420183055, "kl": 0.625, "learning_rate": 8.370466321243523e-07, "loss": 0.0015, "reward": 2.4999974966049194, "reward_std": 1.5192312048384338e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.6321243523316062, "grad_norm": 27.08370081276134, "kl": 0.517578125, "learning_rate": 8.367875647668393e-07, "loss": 0.0021, "reward": 1.8729895949363708, "reward_std": 0.4998658746480942, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3729895949363708, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6347150259067358, "grad_norm": 3.6378703630436315, "kl": 0.66015625, "learning_rate": 8.365284974093264e-07, "loss": 0.0023, "reward": 2.499932289123535, "reward_std": 1.5095150331490004e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999324083328247, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 1.6373056994818653, "grad_norm": 7.7524113772314855, "kl": 0.59765625, "learning_rate": 8.362694300518134e-07, "loss": 0.0024, "reward": 2.3437399864196777, "reward_std": 0.4419422846224279, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.8749899864196777, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.6398963730569949, "grad_norm": 1.2742117365728107, "kl": 0.541015625, "learning_rate": 8.360103626943004e-07, "loss": 0.0016, "reward": 2.4999890327453613, "reward_std": 4.4807430299442785e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999890327453613, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.6424870466321244, "grad_norm": 30.88037223653013, "kl": 0.60546875, "learning_rate": 8.357512953367875e-07, "loss": 0.0021, "reward": 1.7766960859298706, "reward_std": 0.0010048328070411117, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2766961455345154, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.645077720207254, "grad_norm": 18.748018791159073, "kl": 0.60546875, "learning_rate": 8.354922279792745e-07, "loss": 0.0027, "reward": 1.8667698502540588, "reward_std": 0.0005063568282821507, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3667697310447693, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.6476683937823835, "grad_norm": 23.874944479136335, "kl": 0.59375, "learning_rate": 8.352331606217616e-07, "loss": 0.0022, "reward": 2.248118817806244, "reward_std": 0.26940431451657787, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7481189966201782, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.650259067357513, "grad_norm": 13.50305636835652, "kl": 0.609375, "learning_rate": 8.349740932642486e-07, "loss": 0.0016, "reward": 1.9997544288635254, "reward_std": 5.519778505913564e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997544586658478, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.6528497409326426, "grad_norm": 17.680072186111005, "kl": 0.66015625, "learning_rate": 8.347150259067358e-07, "loss": 0.0028, "reward": 2.124287247657776, "reward_std": 0.23187686055484846, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6242871284484863, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6554404145077721, "grad_norm": 53.82368945759585, "kl": 0.642578125, "learning_rate": 8.344559585492228e-07, "loss": 0.0025, "reward": 1.8095591068267822, "reward_std": 0.2592014985420974, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3095592260360718, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.6580310880829017, "grad_norm": 1.130993242484934, "kl": 0.5859375, "learning_rate": 8.341968911917098e-07, "loss": 0.0017, "reward": 2.4999841451644897, "reward_std": 7.279661076609045e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984323978424, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 1.6606217616580312, "grad_norm": 15.657937073279085, "kl": 0.6484375, "learning_rate": 8.339378238341969e-07, "loss": 0.0027, "reward": 1.3255118131637573, "reward_std": 0.010213490386377089, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8255117535591125, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6632124352331608, "grad_norm": 15.209816734977053, "kl": 0.57421875, "learning_rate": 8.33678756476684e-07, "loss": 0.0024, "reward": 1.9970781803131104, "reward_std": 6.014580503688194e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970781803131104, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6658031088082903, "grad_norm": 2.486500399388777, "kl": 0.63671875, "learning_rate": 8.33419689119171e-07, "loss": 0.0018, "reward": 2.4999316930770874, "reward_std": 2.5750330905793817e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999318718910217, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.6683937823834198, "grad_norm": 1.9605886709540565, "kl": 0.630859375, "learning_rate": 8.331606217616581e-07, "loss": 0.0019, "reward": 1.9997612237930298, "reward_std": 9.97581923911639e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997612833976746, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6709844559585494, "grad_norm": 17.669175577158406, "kl": 0.7265625, "learning_rate": 8.32901554404145e-07, "loss": 0.0038, "reward": 1.9966441988945007, "reward_std": 4.05157540512846e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4966441094875336, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6735751295336787, "grad_norm": 3.0751383814063216, "kl": 0.626953125, "learning_rate": 8.326424870466321e-07, "loss": 0.0028, "reward": 2.499972343444824, "reward_std": 1.445753969164798e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99997216463089, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6761658031088082, "grad_norm": 2.498666732653749, "kl": 0.595703125, "learning_rate": 8.323834196891192e-07, "loss": 0.0025, "reward": 1.9984981417655945, "reward_std": 2.5950153940357268e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984982013702393, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.6787564766839378, "grad_norm": 12.779323722981246, "kl": 0.5, "learning_rate": 8.321243523316062e-07, "loss": 0.0014, "reward": 1.998361885547638, "reward_std": 4.844013420779447e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983620047569275, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.6813471502590673, "grad_norm": 0.28171589718986967, "kl": 0.615234375, "learning_rate": 8.318652849740933e-07, "loss": 0.0026, "reward": 2.499994158744812, "reward_std": 1.472791609558044e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.6839378238341969, "grad_norm": 1.735549508677203, "kl": 0.65625, "learning_rate": 8.316062176165803e-07, "loss": 0.0025, "reward": 2.499990940093994, "reward_std": 1.0059645319415722e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 1.6865284974093264, "grad_norm": 22.66343339006925, "kl": 0.703125, "learning_rate": 8.313471502590673e-07, "loss": 0.0026, "reward": 1.4446939826011658, "reward_std": 0.08488998702750905, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9446939826011658, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 1.689119170984456, "grad_norm": 11.64408893025107, "kl": 0.50390625, "learning_rate": 8.310880829015544e-07, "loss": 0.0026, "reward": 1.9878321886062622, "reward_std": 0.009167013271905944, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4878322184085846, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6917098445595855, "grad_norm": 21.356379942423843, "kl": 0.59375, "learning_rate": 8.308290155440414e-07, "loss": 0.0023, "reward": 1.9300598502159119, "reward_std": 0.17694006842793897, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.430059790611267, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.694300518134715, "grad_norm": 78.97221210244034, "kl": 3.44921875, "learning_rate": 8.305699481865285e-07, "loss": 0.0132, "reward": 1.9834641218185425, "reward_std": 0.006136150237807669, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4834641218185425, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.6968911917098446, "grad_norm": 7.926414758645502, "kl": 0.611328125, "learning_rate": 8.303108808290155e-07, "loss": 0.002, "reward": 1.8061879873275757, "reward_std": 0.00031112637086039285, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.306188017129898, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6994818652849741, "grad_norm": 1.4828220302007602, "kl": 0.541015625, "learning_rate": 8.300518134715026e-07, "loss": 0.0025, "reward": 1.9996954798698425, "reward_std": 1.2413017088874767e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996954798698425, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.7020725388601037, "grad_norm": 1.1348769920748236, "kl": 0.61328125, "learning_rate": 8.297927461139896e-07, "loss": 0.0011, "reward": 2.4999961853027344, "reward_std": 2.2066390101826983e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.704663212435233, "grad_norm": 3.7032659847421954, "kl": 0.599609375, "learning_rate": 8.295336787564766e-07, "loss": 0.0022, "reward": 1.996187150478363, "reward_std": 3.3818785141193075e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.496187150478363, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.7072538860103625, "grad_norm": 27.647358429080917, "kl": 0.6953125, "learning_rate": 8.292746113989637e-07, "loss": 0.0037, "reward": 2.4373772144317627, "reward_std": 0.1770834387718878, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937377154827118, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.709844559585492, "grad_norm": 0.15743287811442236, "kl": 0.548828125, "learning_rate": 8.290155440414507e-07, "loss": 0.0019, "reward": 2.499997854232788, "reward_std": 1.6300011793646263e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7124352331606216, "grad_norm": 0.7806528042730063, "kl": 0.5, "learning_rate": 8.287564766839378e-07, "loss": 0.0016, "reward": 1.9994524121284485, "reward_std": 1.3068570865470974e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499452382326126, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 1.7150259067357512, "grad_norm": 29.768721608361396, "kl": 0.6640625, "learning_rate": 8.284974093264249e-07, "loss": 0.0028, "reward": 2.371861219406128, "reward_std": 0.23726117547050762, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8718610405921936, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7176165803108807, "grad_norm": 31.649926525379318, "kl": 0.572265625, "learning_rate": 8.282383419689118e-07, "loss": 0.0023, "reward": 1.2046794891357422, "reward_std": 0.1789806168526411, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7046795785427094, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.7202072538860103, "grad_norm": 0.2244304484062769, "kl": 0.599609375, "learning_rate": 8.279792746113989e-07, "loss": 0.0033, "reward": 2.4999929666519165, "reward_std": 2.0725569811474998e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7227979274611398, "grad_norm": 7.01014629833475, "kl": 0.533203125, "learning_rate": 8.27720207253886e-07, "loss": 0.0027, "reward": 2.4997899532318115, "reward_std": 6.764185604879458e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999789834022522, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.7253886010362693, "grad_norm": 15.760154880969477, "kl": 0.671875, "learning_rate": 8.27461139896373e-07, "loss": 0.0028, "reward": 1.9953019618988037, "reward_std": 0.0006968316768052318, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4953020811080933, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7279792746113989, "grad_norm": 0.31248497492048627, "kl": 0.5302734375, "learning_rate": 8.272020725388601e-07, "loss": 0.0021, "reward": 2.4999974966049194, "reward_std": 1.4822262528468855e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.7305699481865284, "grad_norm": 64.1506927075546, "kl": 0.564453125, "learning_rate": 8.269430051813471e-07, "loss": 0.0017, "reward": 2.4324063062667847, "reward_std": 0.19116630883092967, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9324064254760742, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.733160621761658, "grad_norm": 0.17218979874925464, "kl": 0.58203125, "learning_rate": 8.266839378238341e-07, "loss": 0.0019, "reward": 2.4999979734420776, "reward_std": 1.6239664546446875e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 1.7357512953367875, "grad_norm": 14.040651711442532, "kl": 0.568359375, "learning_rate": 8.264248704663212e-07, "loss": 0.0019, "reward": 2.42636501789093, "reward_std": 0.2082480076533102, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9263650178909302, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.738341968911917, "grad_norm": 20.045273396079637, "kl": 0.623046875, "learning_rate": 8.261658031088082e-07, "loss": 0.0023, "reward": 1.936579704284668, "reward_std": 0.22765135316913643, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4365798234939575, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7409326424870466, "grad_norm": 1.7913381267709598, "kl": 0.6015625, "learning_rate": 8.259067357512953e-07, "loss": 0.0013, "reward": 2.4999923706054688, "reward_std": 3.97106066429842e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926090240479, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 1.7435233160621761, "grad_norm": 23.710853282717732, "kl": 0.5693359375, "learning_rate": 8.256476683937823e-07, "loss": 0.0023, "reward": 2.374288558959961, "reward_std": 0.35555823147296906, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8742887377738953, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 1.7461139896373057, "grad_norm": 2.110534045510839, "kl": 0.67578125, "learning_rate": 8.253886010362694e-07, "loss": 0.0021, "reward": 2.4999728202819824, "reward_std": 7.013269510025566e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999972939491272, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7487046632124352, "grad_norm": 0.4158337490090829, "kl": 0.5859375, "learning_rate": 8.251295336787564e-07, "loss": 0.0025, "reward": 2.4999959468841553, "reward_std": 2.575144947059016e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.7512953367875648, "grad_norm": 35.02252842531428, "kl": 0.59375, "learning_rate": 8.248704663212434e-07, "loss": 0.0025, "reward": 2.374793529510498, "reward_std": 0.2318252045324698, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747936487197876, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7538860103626943, "grad_norm": 0.22756447443275393, "kl": 0.5703125, "learning_rate": 8.246113989637305e-07, "loss": 0.0024, "reward": 2.499995231628418, "reward_std": 2.384291434509578e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 1.7564766839378239, "grad_norm": 69.73333803744084, "kl": 0.546875, "learning_rate": 8.243523316062175e-07, "loss": 0.0028, "reward": 2.3573787212371826, "reward_std": 0.26407604158316644, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8573787212371826, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7590673575129534, "grad_norm": 1.4075711674617677, "kl": 0.7734375, "learning_rate": 8.240932642487046e-07, "loss": 0.0035, "reward": 1.999743640422821, "reward_std": 2.7027926904565902e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997436106204987, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 1.761658031088083, "grad_norm": 242.29087741836534, "kl": 0.6015625, "learning_rate": 8.238341968911918e-07, "loss": 0.0024, "reward": 1.5806587934494019, "reward_std": 0.36365649849176407, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.080658733844757, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 1.7642487046632125, "grad_norm": 30.80165611505171, "kl": 0.556640625, "learning_rate": 8.235751295336786e-07, "loss": 0.0022, "reward": 1.7493666410446167, "reward_std": 0.267767902304513, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2493665218353271, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 1.766839378238342, "grad_norm": 0.6802771241772709, "kl": 0.548828125, "learning_rate": 8.233160621761658e-07, "loss": 0.0018, "reward": 2.499991536140442, "reward_std": 7.139059619021282e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991536140442, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.7694300518134716, "grad_norm": 0.5972560446823737, "kl": 0.638671875, "learning_rate": 8.230569948186528e-07, "loss": 0.0025, "reward": 2.4999879598617554, "reward_std": 4.720328661278472e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7720207253886011, "grad_norm": 4.852487048526476, "kl": 0.591796875, "learning_rate": 8.227979274611399e-07, "loss": 0.0025, "reward": 1.9979513883590698, "reward_std": 3.152483554913488e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4979512691497803, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7746113989637307, "grad_norm": 5.103295836413389, "kl": 0.5625, "learning_rate": 8.22538860103627e-07, "loss": 0.0026, "reward": 1.8882436752319336, "reward_std": 0.0003013908669515786, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3882436454296112, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7772020725388602, "grad_norm": 0.8063302441872162, "kl": 0.666015625, "learning_rate": 8.22279792746114e-07, "loss": 0.0038, "reward": 2.4999953508377075, "reward_std": 1.2017776498396415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.7797927461139897, "grad_norm": 10.103751318668621, "kl": 0.509765625, "learning_rate": 8.22020725388601e-07, "loss": 0.0024, "reward": 2.3649821281433105, "reward_std": 0.00039959421758339886, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8649820685386658, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7823834196891193, "grad_norm": 0.48264374471583615, "kl": 0.6015625, "learning_rate": 8.217616580310881e-07, "loss": 0.0017, "reward": 2.499995231628418, "reward_std": 1.9699483573276666e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.7849740932642488, "grad_norm": 13.01977308590633, "kl": 0.775390625, "learning_rate": 8.215025906735751e-07, "loss": 0.0024, "reward": 2.437427520751953, "reward_std": 0.1769776159570995, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937427580356598, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 1.7875647668393784, "grad_norm": 9.775085281586096, "kl": 0.578125, "learning_rate": 8.212435233160622e-07, "loss": 0.0016, "reward": 1.802764654159546, "reward_std": 0.0016438082975582802, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.302764743566513, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.790155440414508, "grad_norm": 26.522804969666854, "kl": 0.56640625, "learning_rate": 8.209844559585492e-07, "loss": 0.0018, "reward": 1.8690840005874634, "reward_std": 0.022832446065081058, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3690840005874634, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 47.5, "epoch": 1.7927461139896375, "grad_norm": 0.34716305899708544, "kl": 0.4287109375, "learning_rate": 8.207253886010363e-07, "loss": 0.002, "reward": 2.499994397163391, "reward_std": 3.507246901790495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.795336787564767, "grad_norm": 0.5286347159683991, "kl": 0.5380859375, "learning_rate": 8.204663212435233e-07, "loss": 0.0019, "reward": 2.4999889135360718, "reward_std": 7.429944048453763e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999889135360718, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.7979274611398963, "grad_norm": 0.16191972736046176, "kl": 0.4736328125, "learning_rate": 8.202072538860103e-07, "loss": 0.0029, "reward": 2.4999992847442627, "reward_std": 8.061048930585457e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999993443489075, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8005181347150259, "grad_norm": 14.763086907792246, "kl": 0.556640625, "learning_rate": 8.199481865284974e-07, "loss": 0.0018, "reward": 2.437382936477661, "reward_std": 0.1770987726915223, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373830556869507, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.8031088082901554, "grad_norm": 3.680544045217989, "kl": 0.5546875, "learning_rate": 8.196891191709844e-07, "loss": 0.0022, "reward": 2.4998674392700195, "reward_std": 2.0750182670781214e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999867558479309, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.805699481865285, "grad_norm": 3.8281358260527893, "kl": 0.6328125, "learning_rate": 8.194300518134715e-07, "loss": 0.0034, "reward": 2.4998855590820312, "reward_std": 2.8580336220329627e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998854398727417, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8082901554404145, "grad_norm": 3.4194790044347974, "kl": 0.5546875, "learning_rate": 8.191709844559586e-07, "loss": 0.0018, "reward": 2.499988555908203, "reward_std": 8.464232564620033e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999886751174927, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.810880829015544, "grad_norm": 17.858110572812368, "kl": 0.525390625, "learning_rate": 8.189119170984455e-07, "loss": 0.0014, "reward": 2.4985694885253906, "reward_std": 0.00041193802704242444, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9985695481300354, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.8134715025906736, "grad_norm": 4.441066752689226, "kl": 0.578125, "learning_rate": 8.186528497409326e-07, "loss": 0.0026, "reward": 2.499459147453308, "reward_std": 3.754328997729317e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9994592666625977, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.8160621761658031, "grad_norm": 47.791585129133274, "kl": 0.595703125, "learning_rate": 8.183937823834196e-07, "loss": 0.0022, "reward": 1.990900456905365, "reward_std": 0.010563275308982156, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4909004867076874, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.8186528497409327, "grad_norm": 7.867443418627512, "kl": 0.568359375, "learning_rate": 8.181347150259067e-07, "loss": 0.002, "reward": 1.9997710585594177, "reward_std": 2.4340854906768072e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997711777687073, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8212435233160622, "grad_norm": 9.009191459614398, "kl": 0.5390625, "learning_rate": 8.178756476683938e-07, "loss": 0.0018, "reward": 2.4996525049209595, "reward_std": 0.00016829496944126277, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996525049209595, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8238341968911918, "grad_norm": 1.4868957936740022, "kl": 0.546875, "learning_rate": 8.176165803108808e-07, "loss": 0.0025, "reward": 2.4998859167099, "reward_std": 3.3098233302553126e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998857378959656, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8264248704663213, "grad_norm": 0.06231822675496508, "kl": 0.49609375, "learning_rate": 8.173575129533678e-07, "loss": 0.001, "reward": 2.4999890327453613, "reward_std": 1.3851810081177973e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999891519546509, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8290155440414506, "grad_norm": 15.480611204211383, "kl": 1.220703125, "learning_rate": 8.170984455958548e-07, "loss": 0.0044, "reward": 2.4987872838974, "reward_std": 0.0006092726238193791, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9987873435020447, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8316062176165802, "grad_norm": 2.5397628950859605, "kl": 0.58984375, "learning_rate": 8.168393782383419e-07, "loss": 0.0023, "reward": 1.999273657798767, "reward_std": 4.6511597759035794e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992738366127014, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.8341968911917097, "grad_norm": 22.045909737066598, "kl": 0.61328125, "learning_rate": 8.16580310880829e-07, "loss": 0.0025, "reward": 1.6844506859779358, "reward_std": 0.2596860003141046, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.184450775384903, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.8367875647668392, "grad_norm": 0.12039819111499356, "kl": 0.576171875, "learning_rate": 8.16321243523316e-07, "loss": 0.0032, "reward": 2.4999966621398926, "reward_std": 1.9631219174698344e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8393782383419688, "grad_norm": 4.620698144157835, "kl": 0.548828125, "learning_rate": 8.160621761658031e-07, "loss": 0.0027, "reward": 2.4999728202819824, "reward_std": 1.6227220385189867e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999727606773376, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 1.8419689119170983, "grad_norm": 15.869518616944253, "kl": 0.50390625, "learning_rate": 8.1580310880829e-07, "loss": 0.0019, "reward": 2.4992637634277344, "reward_std": 0.00045814631971552444, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9992636442184448, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.8445595854922279, "grad_norm": 2.863430734996328, "kl": 0.60546875, "learning_rate": 8.155440414507771e-07, "loss": 0.003, "reward": 1.9972261190414429, "reward_std": 2.650837490136837e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4972259998321533, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8471502590673574, "grad_norm": 1.2929975278205454, "kl": 0.61328125, "learning_rate": 8.152849740932642e-07, "loss": 0.0025, "reward": 1.999918520450592, "reward_std": 8.15740736470616e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999185502529144, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.849740932642487, "grad_norm": 53.206169785879396, "kl": 0.53515625, "learning_rate": 8.150259067357512e-07, "loss": 0.0022, "reward": 1.2649214267730713, "reward_std": 0.0005708700628019869, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7649214267730713, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8523316062176165, "grad_norm": 6.537547043956284, "kl": 0.58203125, "learning_rate": 8.147668393782383e-07, "loss": 0.002, "reward": 1.9985774755477905, "reward_std": 4.4287794878528075e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985776245594025, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.854922279792746, "grad_norm": 4.538356724778373, "kl": 0.59765625, "learning_rate": 8.145077720207254e-07, "loss": 0.0024, "reward": 1.4910039901733398, "reward_std": 0.0003661315213321359, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9910040199756622, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8575129533678756, "grad_norm": 42.275932389319, "kl": 0.67578125, "learning_rate": 8.142487046632123e-07, "loss": 0.0027, "reward": 2.4373152256011963, "reward_std": 0.17726546007565958, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373154044151306, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8601036269430051, "grad_norm": 2.9756618304332925, "kl": 0.580078125, "learning_rate": 8.139896373056994e-07, "loss": 0.0026, "reward": 1.9997833967208862, "reward_std": 2.3093944946595002e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997832775115967, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.8626943005181347, "grad_norm": 33.93579058053243, "kl": 0.458984375, "learning_rate": 8.137305699481864e-07, "loss": 0.0021, "reward": 2.2492282390594482, "reward_std": 0.2680792650683088, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7492281794548035, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8652849740932642, "grad_norm": 7.075566446698302, "kl": 0.498046875, "learning_rate": 8.134715025906735e-07, "loss": 0.0021, "reward": 1.9999392628669739, "reward_std": 1.8548518482930376e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999392628669739, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.8678756476683938, "grad_norm": 2.391574654196397, "kl": 0.556640625, "learning_rate": 8.132124352331606e-07, "loss": 0.0033, "reward": 2.4999818801879883, "reward_std": 8.343676427102764e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999981701374054, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8704663212435233, "grad_norm": 31.853808184400428, "kl": 0.595703125, "learning_rate": 8.129533678756476e-07, "loss": 0.0023, "reward": 2.499494433403015, "reward_std": 0.0003628747596167159, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9994944334030151, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8730569948186528, "grad_norm": 45.13193051581315, "kl": 0.560546875, "learning_rate": 8.126943005181348e-07, "loss": 0.0032, "reward": 1.8006942868232727, "reward_std": 0.00784747799411889, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3006941676139832, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8756476683937824, "grad_norm": 3.326362727573432, "kl": 0.70703125, "learning_rate": 8.124352331606216e-07, "loss": 0.0024, "reward": 1.4977238774299622, "reward_std": 7.014654693193734e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9977238476276398, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 1.878238341968912, "grad_norm": 83.37003472135653, "kl": 0.572265625, "learning_rate": 8.121761658031088e-07, "loss": 0.0024, "reward": 1.9929113388061523, "reward_std": 0.015552189870504662, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4929113388061523, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.8808290155440415, "grad_norm": 23.66461117647561, "kl": 0.64453125, "learning_rate": 8.119170984455959e-07, "loss": 0.0024, "reward": 1.9044480323791504, "reward_std": 0.0004517716801615279, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.404447853565216, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.883419689119171, "grad_norm": 0.0736816707541898, "kl": 0.658203125, "learning_rate": 8.116580310880829e-07, "loss": 0.0028, "reward": 2.499998450279236, "reward_std": 9.05689489627548e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8860103626943006, "grad_norm": 3.234301167317128, "kl": 0.5185546875, "learning_rate": 8.1139896373057e-07, "loss": 0.0021, "reward": 1.9990112781524658, "reward_std": 3.794973963522352e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990113377571106, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 1.88860103626943, "grad_norm": 26.555682495306996, "kl": 0.642578125, "learning_rate": 8.111398963730569e-07, "loss": 0.0027, "reward": 2.435392379760742, "reward_std": 0.1780978873539425, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9353923797607422, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 1.8911917098445596, "grad_norm": 5.128765188182223, "kl": 0.591796875, "learning_rate": 8.10880829015544e-07, "loss": 0.0024, "reward": 1.9369257688522339, "reward_std": 0.176560147498094, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4369257986545563, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8937823834196892, "grad_norm": 44.34844756818657, "kl": 0.5546875, "learning_rate": 8.106217616580311e-07, "loss": 0.0023, "reward": 1.190453439950943, "reward_std": 0.009079714611289091, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.6904534250497818, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.8963730569948187, "grad_norm": 5.2120547188707675, "kl": 0.482421875, "learning_rate": 8.103626943005181e-07, "loss": 0.0031, "reward": 2.499936103820801, "reward_std": 3.565089173207525e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999359250068665, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 1.8989637305699483, "grad_norm": 2.7958397144535154, "kl": 0.52734375, "learning_rate": 8.101036269430052e-07, "loss": 0.0026, "reward": 2.4998265504837036, "reward_std": 5.272549549317773e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998264908790588, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.9015544041450778, "grad_norm": 0.5922452456459791, "kl": 0.498046875, "learning_rate": 8.098445595854922e-07, "loss": 0.0019, "reward": 2.4999955892562866, "reward_std": 3.5370176192373037e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9041450777202074, "grad_norm": 26.000994314733713, "kl": 0.576171875, "learning_rate": 8.095854922279793e-07, "loss": 0.0023, "reward": 1.5052390694618225, "reward_std": 0.18136432068422437, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0052390694618225, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.906735751295337, "grad_norm": 14.671203122580629, "kl": 0.53515625, "learning_rate": 8.093264248704663e-07, "loss": 0.003, "reward": 2.4999961853027344, "reward_std": 2.7046322657042765e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9093264248704664, "grad_norm": 1.3839527351795466, "kl": 0.591796875, "learning_rate": 8.090673575129533e-07, "loss": 0.0022, "reward": 2.4999693632125854, "reward_std": 3.5315530908519577e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999693036079407, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 1.911917098445596, "grad_norm": 55.765962332469584, "kl": 0.576171875, "learning_rate": 8.088082901554404e-07, "loss": 0.0017, "reward": 2.4373066425323486, "reward_std": 0.17727701491094194, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373065829277039, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9145077720207255, "grad_norm": 0.258727550261043, "kl": 0.572265625, "learning_rate": 8.085492227979275e-07, "loss": 0.0016, "reward": 2.499997615814209, "reward_std": 1.8633427316672169e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 1.917098445595855, "grad_norm": 26.34406770832871, "kl": 0.54296875, "learning_rate": 8.082901554404145e-07, "loss": 0.0025, "reward": 1.9417505264282227, "reward_std": 0.22676452300891015, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4417505264282227, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 1.9196891191709846, "grad_norm": 16.88161013006616, "kl": 0.65625, "learning_rate": 8.080310880829016e-07, "loss": 0.0025, "reward": 2.320062279701233, "reward_std": 0.33316629852470214, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8200623989105225, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.922279792746114, "grad_norm": 0.6486570270134164, "kl": 0.63671875, "learning_rate": 8.077720207253885e-07, "loss": 0.0028, "reward": 2.4999921321868896, "reward_std": 4.1400835470994934e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9248704663212435, "grad_norm": 68.03534067419625, "kl": 0.658203125, "learning_rate": 8.075129533678756e-07, "loss": 0.0026, "reward": 1.317706286907196, "reward_std": 0.0020105853909626603, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8177063465118408, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 1.927461139896373, "grad_norm": 0.15581105407140633, "kl": 0.548828125, "learning_rate": 8.072538860103627e-07, "loss": 0.0015, "reward": 2.4999983310699463, "reward_std": 1.900022652989719e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.9300518134715026, "grad_norm": 0.8443281235330804, "kl": 0.5205078125, "learning_rate": 8.069948186528497e-07, "loss": 0.0019, "reward": 2.499994158744812, "reward_std": 5.579298886004835e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.932642487046632, "grad_norm": 26.579123046214256, "kl": 0.625, "learning_rate": 8.067357512953368e-07, "loss": 0.0031, "reward": 1.9990614652633667, "reward_std": 9.150606774710468e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990612864494324, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9352331606217616, "grad_norm": 5.575930857953386, "kl": 0.646484375, "learning_rate": 8.064766839378238e-07, "loss": 0.0029, "reward": 1.8152194619178772, "reward_std": 0.00010451917040654735, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.31521937251091, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.9378238341968912, "grad_norm": 0.25329748173728833, "kl": 0.6171875, "learning_rate": 8.062176165803108e-07, "loss": 0.0023, "reward": 2.4999948740005493, "reward_std": 2.2240097479198084e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.9404145077720207, "grad_norm": 22.16678345663416, "kl": 0.53515625, "learning_rate": 8.059585492227979e-07, "loss": 0.003, "reward": 2.3748111724853516, "reward_std": 0.23180074050910093, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748111724853516, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.9430051813471503, "grad_norm": 19.398680041862207, "kl": 0.724609375, "learning_rate": 8.056994818652849e-07, "loss": 0.0027, "reward": 1.9980978965759277, "reward_std": 8.672712692714413e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4980978965759277, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9455958549222798, "grad_norm": 11.199594090078307, "kl": 0.69921875, "learning_rate": 8.05440414507772e-07, "loss": 0.0035, "reward": 1.9988819360733032, "reward_std": 3.6573262832462206e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988816976547241, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.9481865284974094, "grad_norm": 0.8566052800839844, "kl": 0.640625, "learning_rate": 8.05181347150259e-07, "loss": 0.0031, "reward": 2.499992251396179, "reward_std": 3.572612513380591e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.950777202072539, "grad_norm": 55.64761148442921, "kl": 0.591796875, "learning_rate": 8.049222797927461e-07, "loss": 0.0026, "reward": 2.3113327026367188, "reward_std": 0.26038133179690703, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8113325834274292, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9533678756476682, "grad_norm": 29.650299843977272, "kl": 0.609375, "learning_rate": 8.046632124352331e-07, "loss": 0.0023, "reward": 1.7489295601844788, "reward_std": 0.26742117578396574, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2489296197891235, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9559585492227978, "grad_norm": 47.4950924328193, "kl": 0.67578125, "learning_rate": 8.044041450777201e-07, "loss": 0.0027, "reward": 2.18611478805542, "reward_std": 0.2591387484353618, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.68611478805542, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.9585492227979273, "grad_norm": 11.098529129198631, "kl": 0.5625, "learning_rate": 8.041450777202072e-07, "loss": 0.002, "reward": 2.4999654293060303, "reward_std": 3.133619793516118e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999653100967407, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.9611398963730569, "grad_norm": 0.5701525791317523, "kl": 0.69140625, "learning_rate": 8.038860103626942e-07, "loss": 0.0039, "reward": 2.499998927116394, "reward_std": 1.0851132401512587e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.9637305699481864, "grad_norm": 0.592427461968302, "kl": 0.654296875, "learning_rate": 8.036269430051813e-07, "loss": 0.0041, "reward": 2.4999935626983643, "reward_std": 3.969579438489745e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.966321243523316, "grad_norm": 18.83475165860954, "kl": 0.625, "learning_rate": 8.033678756476684e-07, "loss": 0.0029, "reward": 2.0611624121665955, "reward_std": 0.1773168448753495, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5611623525619507, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 1.9689119170984455, "grad_norm": 42.4104450561026, "kl": 0.54296875, "learning_rate": 8.031088082901553e-07, "loss": 0.0027, "reward": 1.8590643405914307, "reward_std": 0.4068869238071784, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3590644598007202, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.971502590673575, "grad_norm": 37.77476449164945, "kl": 0.599609375, "learning_rate": 8.028497409326424e-07, "loss": 0.0029, "reward": 1.999179482460022, "reward_std": 0.0010529774735914543, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991796016693115, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9740932642487046, "grad_norm": 0.41947983855717186, "kl": 0.63671875, "learning_rate": 8.025906735751295e-07, "loss": 0.0021, "reward": 2.499996542930603, "reward_std": 2.264656700390333e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9766839378238341, "grad_norm": 0.12322951730019832, "kl": 0.4921875, "learning_rate": 8.023316062176165e-07, "loss": 0.0023, "reward": 2.499997615814209, "reward_std": 1.2305677046242636e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 1.9792746113989637, "grad_norm": 0.3267865971002745, "kl": 0.619140625, "learning_rate": 8.020725388601036e-07, "loss": 0.0025, "reward": 2.499995708465576, "reward_std": 3.0971226578913047e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9818652849740932, "grad_norm": 4.95349720724852, "kl": 0.55859375, "learning_rate": 8.018134715025906e-07, "loss": 0.0028, "reward": 1.9981555342674255, "reward_std": 3.5713949728233274e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981553852558136, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.9844559585492227, "grad_norm": 0.5046062887915987, "kl": 0.509765625, "learning_rate": 8.015544041450776e-07, "loss": 0.0014, "reward": 2.4999808073043823, "reward_std": 5.605218547088953e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999810457229614, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9870466321243523, "grad_norm": 5.196539571241836, "kl": 0.5400390625, "learning_rate": 8.012953367875648e-07, "loss": 0.0021, "reward": 1.3832663893699646, "reward_std": 0.0005237822897470323, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8832663893699646, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.9896373056994818, "grad_norm": 1.909006902131869, "kl": 0.5361328125, "learning_rate": 8.010362694300518e-07, "loss": 0.0031, "reward": 1.9999504685401917, "reward_std": 2.635537163087065e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499950349330902, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9922279792746114, "grad_norm": 8.944113086540824, "kl": 0.666015625, "learning_rate": 8.007772020725389e-07, "loss": 0.002, "reward": 2.3124144077301025, "reward_std": 0.2588784824954473, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124145865440369, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 1.994818652849741, "grad_norm": 2.9594339166477925, "kl": 0.630859375, "learning_rate": 8.005181347150259e-07, "loss": 0.0023, "reward": 1.999068558216095, "reward_std": 2.0738410086096337e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990684986114502, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9974093264248705, "grad_norm": 1.5903235116848464, "kl": 0.5244140625, "learning_rate": 8.00259067357513e-07, "loss": 0.0018, "reward": 1.9997640252113342, "reward_std": 2.4475769123455393e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997640252113342, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.0, "grad_norm": 1.0764099522749564, "kl": 0.572265625, "learning_rate": 8e-07, "loss": 0.0019, "reward": 2.499993324279785, "reward_std": 3.9984249724511756e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.0025906735751295, "grad_norm": 21.535071843436462, "kl": 0.83984375, "learning_rate": 7.99740932642487e-07, "loss": 0.0031, "reward": 2.1870230436325073, "reward_std": 0.25916092243414823, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6870230436325073, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.005181347150259, "grad_norm": 6.61214261050889, "kl": 0.634765625, "learning_rate": 7.994818652849741e-07, "loss": 0.0025, "reward": 1.4979920387268066, "reward_std": 0.00011432188330218196, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9979920387268066, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.0077720207253886, "grad_norm": 0.2470574852533565, "kl": 0.49609375, "learning_rate": 7.992227979274611e-07, "loss": 0.0012, "reward": 2.499996542930603, "reward_std": 3.3122202012236812e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.010362694300518, "grad_norm": 0.7750112622004939, "kl": 0.568359375, "learning_rate": 7.989637305699482e-07, "loss": 0.0007, "reward": 2.4999899864196777, "reward_std": 5.532025056709244e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902844429016, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.0129533678756477, "grad_norm": 0.3445187156463046, "kl": 0.4638671875, "learning_rate": 7.987046632124353e-07, "loss": 0.0011, "reward": 2.499998092651367, "reward_std": 1.262218091824252e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.0155440414507773, "grad_norm": 10.906443565134559, "kl": 0.564453125, "learning_rate": 7.984455958549222e-07, "loss": 0.0025, "reward": 1.388745665550232, "reward_std": 0.00024291040608659387, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8887457251548767, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.018134715025907, "grad_norm": 2.3768600314541697, "kl": 0.58203125, "learning_rate": 7.981865284974093e-07, "loss": 0.0021, "reward": 1.9986672401428223, "reward_std": 2.871683568628214e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498667299747467, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.0207253886010363, "grad_norm": 0.1883228385444539, "kl": 0.640625, "learning_rate": 7.979274611398963e-07, "loss": 0.0037, "reward": 2.4999955892562866, "reward_std": 1.7697889234113973e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.023316062176166, "grad_norm": 1.6456110579176388, "kl": 0.59375, "learning_rate": 7.976683937823834e-07, "loss": 0.0024, "reward": 2.499976873397827, "reward_std": 8.440048077318352e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999768733978271, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.0259067357512954, "grad_norm": 0.23890489376663165, "kl": 0.541015625, "learning_rate": 7.974093264248705e-07, "loss": 0.0033, "reward": 2.4999927282333374, "reward_std": 3.4055315154546406e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.028497409326425, "grad_norm": 4.609523384213504, "kl": 0.53125, "learning_rate": 7.971502590673575e-07, "loss": 0.0021, "reward": 1.1445733904838562, "reward_std": 0.00039083947740436997, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.6445733904838562, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 2.0310880829015545, "grad_norm": 233.42797386159765, "kl": 0.58203125, "learning_rate": 7.968911917098445e-07, "loss": 0.0016, "reward": 1.9590036273002625, "reward_std": 0.02842423763269153, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4590036869049072, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.033678756476684, "grad_norm": 69.46770154247524, "kl": 0.513671875, "learning_rate": 7.966321243523316e-07, "loss": 0.0015, "reward": 1.9838842153549194, "reward_std": 0.007355917569270787, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4838842451572418, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.0362694300518136, "grad_norm": 10.228313819238066, "kl": 0.634765625, "learning_rate": 7.963730569948186e-07, "loss": 0.0025, "reward": 1.306801199913025, "reward_std": 0.0005683594372385414, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8068011403083801, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.038860103626943, "grad_norm": 0.5123848370891608, "kl": 0.669921875, "learning_rate": 7.961139896373057e-07, "loss": 0.0038, "reward": 2.4999910593032837, "reward_std": 3.6072258353669895e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990999698639, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.0414507772020727, "grad_norm": 0.2892134185657239, "kl": 0.587890625, "learning_rate": 7.958549222797927e-07, "loss": 0.0021, "reward": 2.499995470046997, "reward_std": 4.599880867317552e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.0440414507772022, "grad_norm": 5.558692360697497, "kl": 0.5517578125, "learning_rate": 7.955958549222798e-07, "loss": 0.0022, "reward": 1.9989656209945679, "reward_std": 9.891260742733721e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989655017852783, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 2.0466321243523318, "grad_norm": 0.25565994476153525, "kl": 0.5625, "learning_rate": 7.953367875647668e-07, "loss": 0.0018, "reward": 2.4999958276748657, "reward_std": 3.3991085501838825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.0492227979274613, "grad_norm": 13.271429943444872, "kl": 0.59765625, "learning_rate": 7.950777202072538e-07, "loss": 0.0029, "reward": 2.3119280338287354, "reward_std": 0.2588019425757011, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8119279742240906, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.051813471502591, "grad_norm": 2.209746211778665, "kl": 0.677734375, "learning_rate": 7.948186528497409e-07, "loss": 0.0025, "reward": 1.9983933568000793, "reward_std": 2.272693163263284e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498393476009369, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 2.0544041450777204, "grad_norm": 60.61438257011553, "kl": 0.66796875, "learning_rate": 7.945595854922279e-07, "loss": 0.0025, "reward": 1.9486241340637207, "reward_std": 0.09229949381591496, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4486242234706879, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 2.05699481865285, "grad_norm": 25.941411875732253, "kl": 0.6484375, "learning_rate": 7.94300518134715e-07, "loss": 0.002, "reward": 1.9187315702438354, "reward_std": 0.02273695889090277, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4187317192554474, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.0595854922279795, "grad_norm": 0.5938527053808671, "kl": 0.53125, "learning_rate": 7.940414507772021e-07, "loss": 0.0021, "reward": 2.4999942779541016, "reward_std": 4.630309547337674e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.062176165803109, "grad_norm": 23.91034449982263, "kl": 0.640625, "learning_rate": 7.93782383419689e-07, "loss": 0.0026, "reward": 1.785571575164795, "reward_std": 0.016452619644951483, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.285571575164795, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.064766839378238, "grad_norm": 41.479908085524684, "kl": 0.599609375, "learning_rate": 7.935233160621761e-07, "loss": 0.0024, "reward": 1.6222277879714966, "reward_std": 0.35418426990509033, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1222279071807861, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.0673575129533677, "grad_norm": 0.36452760361448755, "kl": 0.640625, "learning_rate": 7.932642487046631e-07, "loss": 0.0033, "reward": 2.499997615814209, "reward_std": 2.238552895050816e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 2.069948186528497, "grad_norm": 14.200733764297604, "kl": 0.630859375, "learning_rate": 7.930051813471502e-07, "loss": 0.003, "reward": 2.4199795722961426, "reward_std": 0.2263178822717009, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9199795126914978, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.0725388601036268, "grad_norm": 0.030481212423631374, "kl": 0.48046875, "learning_rate": 7.927461139896373e-07, "loss": 0.001, "reward": 2.499998688697815, "reward_std": 7.423436159115226e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.0751295336787563, "grad_norm": 4.450593836974067, "kl": 0.587890625, "learning_rate": 7.924870466321243e-07, "loss": 0.0015, "reward": 1.992353618144989, "reward_std": 0.00016525026057934156, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4923538565635681, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.077720207253886, "grad_norm": 165.21044548755225, "kl": 0.61328125, "learning_rate": 7.922279792746113e-07, "loss": 0.0017, "reward": 1.8504005074501038, "reward_std": 0.00899917268873196, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3504004180431366, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.0803108808290154, "grad_norm": 16.74365954809602, "kl": 0.576171875, "learning_rate": 7.919689119170983e-07, "loss": 0.0024, "reward": 1.992065966129303, "reward_std": 0.0003917174958587566, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4920660257339478, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.082901554404145, "grad_norm": 0.16168432626585913, "kl": 0.4658203125, "learning_rate": 7.917098445595854e-07, "loss": 0.0016, "reward": 2.499997138977051, "reward_std": 2.0040702679580136e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.0854922279792745, "grad_norm": 0.34190800151108586, "kl": 0.62890625, "learning_rate": 7.914507772020725e-07, "loss": 0.0018, "reward": 2.499996542930603, "reward_std": 2.5806822492313586e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.088082901554404, "grad_norm": 3.354784241957146, "kl": 0.595703125, "learning_rate": 7.911917098445595e-07, "loss": 0.0032, "reward": 2.4999955892562866, "reward_std": 5.423235506896162e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 2.0906735751295336, "grad_norm": 68.74951150048851, "kl": 0.5859375, "learning_rate": 7.909326424870466e-07, "loss": 0.0024, "reward": 1.6623141169548035, "reward_std": 0.2599085367983207, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1623140573501587, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.093264248704663, "grad_norm": 6.769194342591474, "kl": 0.62890625, "learning_rate": 7.906735751295335e-07, "loss": 0.0024, "reward": 1.3610073328018188, "reward_std": 0.0004084521933691576, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8610073328018188, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 2.0958549222797926, "grad_norm": 7.667496497453053, "kl": 0.576171875, "learning_rate": 7.904145077720206e-07, "loss": 0.0022, "reward": 1.9675925970077515, "reward_std": 0.08720768589819272, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4675925374031067, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.098445595854922, "grad_norm": 5.083641018130945, "kl": 0.6875, "learning_rate": 7.901554404145078e-07, "loss": 0.003, "reward": 1.4983093738555908, "reward_std": 7.869107321312185e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9983093738555908, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1010362694300517, "grad_norm": 2.479779862979899, "kl": 0.6181640625, "learning_rate": 7.898963730569948e-07, "loss": 0.0023, "reward": 2.4999831914901733, "reward_std": 1.0685473057492345e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999983310699463, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1036269430051813, "grad_norm": 17.515375750390636, "kl": 0.619140625, "learning_rate": 7.896373056994819e-07, "loss": 0.0028, "reward": 1.989603877067566, "reward_std": 6.074985935811128e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4896038174629211, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.106217616580311, "grad_norm": 14.101353716959524, "kl": 0.615234375, "learning_rate": 7.89378238341969e-07, "loss": 0.0022, "reward": 1.474350392818451, "reward_std": 0.019159212168233353, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9743503332138062, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1088082901554404, "grad_norm": 0.2261605963122978, "kl": 0.56640625, "learning_rate": 7.891191709844559e-07, "loss": 0.0014, "reward": 2.4999969005584717, "reward_std": 1.814804477362486e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.11139896373057, "grad_norm": 90.9345996464154, "kl": 0.5087890625, "learning_rate": 7.88860103626943e-07, "loss": 0.0026, "reward": 2.499596357345581, "reward_std": 0.0001757368721655439, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995962977409363, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1139896373056994, "grad_norm": 0.7219789209229946, "kl": 0.560546875, "learning_rate": 7.8860103626943e-07, "loss": 0.0036, "reward": 2.499991774559021, "reward_std": 5.073510976671969e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.116580310880829, "grad_norm": 23.076465097836696, "kl": 0.564453125, "learning_rate": 7.883419689119171e-07, "loss": 0.0022, "reward": 1.9980223178863525, "reward_std": 0.0011134111555293202, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4980222582817078, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1191709844559585, "grad_norm": 0.19719546684933917, "kl": 0.5693359375, "learning_rate": 7.880829015544042e-07, "loss": 0.0017, "reward": 2.4999979734420776, "reward_std": 1.3139845975729258e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.121761658031088, "grad_norm": 30.747245983503426, "kl": 0.619140625, "learning_rate": 7.878238341968912e-07, "loss": 0.0032, "reward": 2.1865060329437256, "reward_std": 0.2595980352398328, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.686505913734436, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1243523316062176, "grad_norm": 0.16926215423129531, "kl": 0.62890625, "learning_rate": 7.875647668393782e-07, "loss": 0.0015, "reward": 2.499985098838806, "reward_std": 3.3252969160457724e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999852776527405, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.126943005181347, "grad_norm": 0.2275083148854889, "kl": 0.548828125, "learning_rate": 7.873056994818652e-07, "loss": 0.0021, "reward": 2.4999977350234985, "reward_std": 1.5297295021809987e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.1295336787564767, "grad_norm": 0.5853329897965538, "kl": 0.53515625, "learning_rate": 7.870466321243523e-07, "loss": 0.0029, "reward": 2.4999947547912598, "reward_std": 3.2832916758707142e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 2.1321243523316062, "grad_norm": 68.30677443433447, "kl": 0.6328125, "learning_rate": 7.867875647668394e-07, "loss": 0.0026, "reward": 1.402154564857483, "reward_std": 0.12056825850595487, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9021545350551605, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.134715025906736, "grad_norm": 0.16050881628502192, "kl": 0.572265625, "learning_rate": 7.865284974093264e-07, "loss": 0.0024, "reward": 2.499994993209839, "reward_std": 2.240067260572687e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.1373056994818653, "grad_norm": 153.3431333137575, "kl": 0.52734375, "learning_rate": 7.862694300518135e-07, "loss": 0.002, "reward": 2.371320128440857, "reward_std": 0.23827327637576445, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.871320128440857, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.139896373056995, "grad_norm": 38.17198315484388, "kl": 0.646484375, "learning_rate": 7.860103626943004e-07, "loss": 0.0026, "reward": 2.499499201774597, "reward_std": 0.00021818420077579503, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999499261379242, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.1424870466321244, "grad_norm": 0.21556778856768938, "kl": 0.681640625, "learning_rate": 7.857512953367875e-07, "loss": 0.0029, "reward": 2.49999737739563, "reward_std": 2.5992444534495007e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.145077720207254, "grad_norm": 0.30386406283654527, "kl": 0.39453125, "learning_rate": 7.854922279792746e-07, "loss": 0.0017, "reward": 2.499995231628418, "reward_std": 2.789265465708013e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1476683937823835, "grad_norm": 0.5760296398845397, "kl": 0.708984375, "learning_rate": 7.852331606217616e-07, "loss": 0.0024, "reward": 2.4999988079071045, "reward_std": 5.240717797505567e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.150259067357513, "grad_norm": 274.9889572165265, "kl": 0.5068359375, "learning_rate": 7.849740932642487e-07, "loss": 0.0021, "reward": 1.4226058721542358, "reward_std": 0.17791467247297987, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9226058423519135, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1528497409326426, "grad_norm": 3.300881626242842, "kl": 0.638671875, "learning_rate": 7.847150259067357e-07, "loss": 0.0026, "reward": 1.992441713809967, "reward_std": 6.37705257986454e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4924417734146118, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.155440414507772, "grad_norm": 0.04528244762387948, "kl": 0.53515625, "learning_rate": 7.844559585492227e-07, "loss": 0.0018, "reward": 2.499997854232788, "reward_std": 1.1732090285931918e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.1580310880829017, "grad_norm": 0.5468898821542946, "kl": 0.5859375, "learning_rate": 7.841968911917098e-07, "loss": 0.0023, "reward": 2.4999738931655884, "reward_std": 5.9463459365360904e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999739527702332, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.160621761658031, "grad_norm": 6.4943264072981375, "kl": 0.59765625, "learning_rate": 7.839378238341968e-07, "loss": 0.0034, "reward": 1.8086551427841187, "reward_std": 0.00026101190155713994, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3086549937725067, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.1632124352331608, "grad_norm": 369.295631756484, "kl": 0.638671875, "learning_rate": 7.836787564766839e-07, "loss": 0.0026, "reward": 2.3740105628967285, "reward_std": 0.23149993296465254, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8740106225013733, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1658031088082903, "grad_norm": 2.522209272346679, "kl": 0.658203125, "learning_rate": 7.83419689119171e-07, "loss": 0.0029, "reward": 2.4984259605407715, "reward_std": 3.0620788379565056e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9984257817268372, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.16839378238342, "grad_norm": 0.8867791795386545, "kl": 0.587890625, "learning_rate": 7.83160621761658e-07, "loss": 0.002, "reward": 1.9989197850227356, "reward_std": 2.2387875787899247e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49891996383667, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.1709844559585494, "grad_norm": 1.24121723925803, "kl": 0.642578125, "learning_rate": 7.829015544041451e-07, "loss": 0.0031, "reward": 2.499969005584717, "reward_std": 9.61785235631396e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999690055847168, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 2.173575129533679, "grad_norm": 28.2135929826487, "kl": 0.67578125, "learning_rate": 7.82642487046632e-07, "loss": 0.0031, "reward": 2.312175989151001, "reward_std": 0.25909132075685193, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812175989151001, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.1761658031088085, "grad_norm": 23.39143343009809, "kl": 0.67578125, "learning_rate": 7.823834196891191e-07, "loss": 0.003, "reward": 2.4372611045837402, "reward_std": 0.17734522199134517, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372608661651611, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.178756476683938, "grad_norm": 32.53815111620874, "kl": 0.681640625, "learning_rate": 7.821243523316062e-07, "loss": 0.0029, "reward": 1.8744940757751465, "reward_std": 0.23185647589343716, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.374493956565857, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.1813471502590676, "grad_norm": 0.5414600686631816, "kl": 0.642578125, "learning_rate": 7.818652849740932e-07, "loss": 0.0021, "reward": 2.4999923706054688, "reward_std": 3.967311428709763e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1839378238341967, "grad_norm": 0.1044831952864383, "kl": 0.576171875, "learning_rate": 7.816062176165803e-07, "loss": 0.0022, "reward": 2.4999985694885254, "reward_std": 1.0980425031448249e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.186528497409326, "grad_norm": 0.1997986768071452, "kl": 0.583984375, "learning_rate": 7.813471502590672e-07, "loss": 0.0023, "reward": 2.4999969005584717, "reward_std": 2.6972466002916917e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.1891191709844557, "grad_norm": 0.19148120659457804, "kl": 0.603515625, "learning_rate": 7.810880829015543e-07, "loss": 0.0024, "reward": 2.4999945163726807, "reward_std": 1.8547192439655191e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.1917098445595853, "grad_norm": 1.485052398455108, "kl": 0.4892578125, "learning_rate": 7.808290155440414e-07, "loss": 0.0028, "reward": 1.9999026656150818, "reward_std": 1.3065275652479613e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999023973941803, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.194300518134715, "grad_norm": 0.2525856583231932, "kl": 0.54296875, "learning_rate": 7.805699481865284e-07, "loss": 0.0008, "reward": 2.4999983310699463, "reward_std": 1.5716885854999418e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.1968911917098444, "grad_norm": 1.5774881240681569, "kl": 0.5703125, "learning_rate": 7.803108808290155e-07, "loss": 0.0026, "reward": 2.4999877214431763, "reward_std": 4.972859869667445e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999878406524658, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.199481865284974, "grad_norm": 2.421342959766688, "kl": 0.671875, "learning_rate": 7.800518134715025e-07, "loss": 0.0035, "reward": 2.499977231025696, "reward_std": 7.521375664509833e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999771118164062, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.2020725388601035, "grad_norm": 7.336019005365122, "kl": 0.6328125, "learning_rate": 7.797927461139896e-07, "loss": 0.0018, "reward": 1.9990352392196655, "reward_std": 9.247055720607023e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990352988243103, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.204663212435233, "grad_norm": 0.20636597322039005, "kl": 0.607421875, "learning_rate": 7.795336787564766e-07, "loss": 0.002, "reward": 2.499996542930603, "reward_std": 2.5037608679667755e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2072538860103625, "grad_norm": 0.1872281247367873, "kl": 0.681640625, "learning_rate": 7.792746113989636e-07, "loss": 0.003, "reward": 2.4999921321868896, "reward_std": 3.04935701933573e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999919533729553, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.209844559585492, "grad_norm": 0.2309458000356775, "kl": 0.521484375, "learning_rate": 7.790155440414508e-07, "loss": 0.0009, "reward": 2.4999951124191284, "reward_std": 3.3746418921509758e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 2.2124352331606216, "grad_norm": 1.5018910052708576, "kl": 0.580078125, "learning_rate": 7.787564766839378e-07, "loss": 0.0015, "reward": 2.499983787536621, "reward_std": 1.568382776895305e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999838471412659, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.215025906735751, "grad_norm": 22.93435634424872, "kl": 0.59375, "learning_rate": 7.784974093264249e-07, "loss": 0.0024, "reward": 1.6636338233947754, "reward_std": 0.0006845678726676852, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1636338531970978, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.2176165803108807, "grad_norm": 0.7384477026229272, "kl": 0.56640625, "learning_rate": 7.78238341968912e-07, "loss": 0.0038, "reward": 2.4999945163726807, "reward_std": 4.684316735392713e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 2.2202072538860103, "grad_norm": 728.6075007819809, "kl": 0.474609375, "learning_rate": 7.779792746113989e-07, "loss": 0.0025, "reward": 1.9874972105026245, "reward_std": 0.004937961384257505, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.487497091293335, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 2.22279792746114, "grad_norm": 31.83023756848891, "kl": 0.58203125, "learning_rate": 7.77720207253886e-07, "loss": 0.0022, "reward": 2.122915744781494, "reward_std": 0.23274855818283413, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6229156851768494, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 36.8125, "epoch": 2.2253886010362693, "grad_norm": 42.159430091902564, "kl": 0.6015625, "learning_rate": 7.774611398963731e-07, "loss": 0.0023, "reward": 1.2821725606918335, "reward_std": 0.13140388762076327, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7821725606918335, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.227979274611399, "grad_norm": 28.196430570778197, "kl": 0.5595703125, "learning_rate": 7.772020725388601e-07, "loss": 0.0013, "reward": 1.8042874336242676, "reward_std": 0.003498773615774553, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3042875528335571, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2305699481865284, "grad_norm": 0.10416780126866194, "kl": 0.568359375, "learning_rate": 7.769430051813472e-07, "loss": 0.0023, "reward": 2.4999988079071045, "reward_std": 1.0820242550835246e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.233160621761658, "grad_norm": 4.379848323178771, "kl": 0.544921875, "learning_rate": 7.766839378238342e-07, "loss": 0.0016, "reward": 2.499983787536621, "reward_std": 5.914594225941983e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999837279319763, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.2357512953367875, "grad_norm": 19.242965864107614, "kl": 0.525390625, "learning_rate": 7.764248704663212e-07, "loss": 0.002, "reward": 2.4373579025268555, "reward_std": 0.17716996379670036, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373578429222107, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.238341968911917, "grad_norm": 18.317036847781278, "kl": 0.59765625, "learning_rate": 7.761658031088083e-07, "loss": 0.0031, "reward": 2.4997286796569824, "reward_std": 0.0002016998769249767, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997285604476929, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.2409326424870466, "grad_norm": 31.156704399628357, "kl": 0.578125, "learning_rate": 7.759067357512953e-07, "loss": 0.0029, "reward": 2.0623890161514282, "reward_std": 0.17680641112457351, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5623889565467834, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 2.243523316062176, "grad_norm": 23.520924683960818, "kl": 0.587890625, "learning_rate": 7.756476683937824e-07, "loss": 0.0021, "reward": 2.409049153327942, "reward_std": 0.2572140264855989, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9090492129325867, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.2461139896373057, "grad_norm": 14.069277704761703, "kl": 0.646484375, "learning_rate": 7.753886010362694e-07, "loss": 0.0019, "reward": 2.499582052230835, "reward_std": 0.00010315176314179553, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995821714401245, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.2487046632124352, "grad_norm": 11.786421915047713, "kl": 0.642578125, "learning_rate": 7.751295336787565e-07, "loss": 0.0032, "reward": 2.374848961830139, "reward_std": 0.23171925331536158, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748489022254944, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.2512953367875648, "grad_norm": 11.45727162626517, "kl": 0.576171875, "learning_rate": 7.748704663212435e-07, "loss": 0.0024, "reward": 2.499905824661255, "reward_std": 7.650870219322314e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99990576505661, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 37.3125, "epoch": 2.2538860103626943, "grad_norm": 41.41869201797733, "kl": 0.6328125, "learning_rate": 7.746113989637305e-07, "loss": 0.0034, "reward": 1.9218160510063171, "reward_std": 0.02917963555779579, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.42181596159935, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.256476683937824, "grad_norm": 0.06441563295788447, "kl": 0.5322265625, "learning_rate": 7.743523316062176e-07, "loss": 0.0033, "reward": 2.4999938011169434, "reward_std": 8.936144979543315e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.2590673575129534, "grad_norm": 6.1531421608474774, "kl": 0.5390625, "learning_rate": 7.740932642487046e-07, "loss": 0.0031, "reward": 1.998644471168518, "reward_std": 7.335765121752047e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986442923545837, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.261658031088083, "grad_norm": 0.2574827743822252, "kl": 0.58203125, "learning_rate": 7.738341968911917e-07, "loss": 0.0025, "reward": 2.4999964237213135, "reward_std": 2.1259435243337066e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.2642487046632125, "grad_norm": 4.337005511179867, "kl": 0.646484375, "learning_rate": 7.735751295336788e-07, "loss": 0.0017, "reward": 1.759987473487854, "reward_std": 0.00041509166999276204, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2599875330924988, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.266839378238342, "grad_norm": 38.88478471325961, "kl": 0.60546875, "learning_rate": 7.733160621761657e-07, "loss": 0.0022, "reward": 2.062354803085327, "reward_std": 0.17687109646067256, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562354862689972, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.2694300518134716, "grad_norm": 27.04486765398002, "kl": 0.72265625, "learning_rate": 7.730569948186528e-07, "loss": 0.003, "reward": 2.3747036457061768, "reward_std": 0.23197072408765962, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747036457061768, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.272020725388601, "grad_norm": 0.696144595249741, "kl": 0.56640625, "learning_rate": 7.727979274611398e-07, "loss": 0.0019, "reward": 2.499985694885254, "reward_std": 5.35832231207678e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985694885254, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.2746113989637307, "grad_norm": 1.7022955510801498, "kl": 0.603515625, "learning_rate": 7.725388601036269e-07, "loss": 0.0021, "reward": 2.4999890327453613, "reward_std": 5.8061281720256375e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999891519546509, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.27720207253886, "grad_norm": 1.0523120929436485, "kl": 0.609375, "learning_rate": 7.72279792746114e-07, "loss": 0.0039, "reward": 2.499991774559021, "reward_std": 5.008470793654851e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.2797927461139897, "grad_norm": 2.127144779077141, "kl": 0.587890625, "learning_rate": 7.72020725388601e-07, "loss": 0.0025, "reward": 2.499993324279785, "reward_std": 4.804511377187737e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.2823834196891193, "grad_norm": 3.090851801577995, "kl": 0.583984375, "learning_rate": 7.71761658031088e-07, "loss": 0.0033, "reward": 1.9986871480941772, "reward_std": 6.608832458709912e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498686969280243, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.284974093264249, "grad_norm": 9.782700837591536, "kl": 0.64453125, "learning_rate": 7.715025906735751e-07, "loss": 0.0032, "reward": 2.4999254941940308, "reward_std": 7.780850052085952e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999253153800964, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.2875647668393784, "grad_norm": 0.944903471178733, "kl": 0.5419921875, "learning_rate": 7.712435233160621e-07, "loss": 0.0033, "reward": 2.4999953508377075, "reward_std": 3.03858104189203e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.290155440414508, "grad_norm": 0.7034139578181235, "kl": 0.939453125, "learning_rate": 7.709844559585492e-07, "loss": 0.0028, "reward": 2.4999895095825195, "reward_std": 1.0695679861782992e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989628791809, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.2927461139896375, "grad_norm": 36.91310169103488, "kl": 0.6328125, "learning_rate": 7.707253886010362e-07, "loss": 0.0029, "reward": 1.9993050694465637, "reward_std": 0.00023564890943816863, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993049502372742, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.295336787564767, "grad_norm": 21.085130297208593, "kl": 0.619140625, "learning_rate": 7.704663212435233e-07, "loss": 0.0024, "reward": 1.686486005783081, "reward_std": 0.2591594222576532, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1864860653877258, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.2979274611398965, "grad_norm": 20.675237115387805, "kl": 0.6015625, "learning_rate": 7.702072538860103e-07, "loss": 0.0031, "reward": 1.9998692870140076, "reward_std": 1.6719795553399308e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998690485954285, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.300518134715026, "grad_norm": 35.6044369652973, "kl": 0.5546875, "learning_rate": 7.699481865284973e-07, "loss": 0.0019, "reward": 2.061944603919983, "reward_std": 0.17699992401233544, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5619447231292725, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.3031088082901556, "grad_norm": 14.35497131575874, "kl": 0.609375, "learning_rate": 7.696891191709844e-07, "loss": 0.0026, "reward": 2.499463438987732, "reward_std": 0.0004388477923384926, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999463438987732, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.305699481865285, "grad_norm": 1.1970916651258918, "kl": 0.5625, "learning_rate": 7.694300518134714e-07, "loss": 0.002, "reward": 2.4999818801879883, "reward_std": 9.831666801574102e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999818205833435, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.3082901554404147, "grad_norm": 3.538019635834841, "kl": 0.609375, "learning_rate": 7.691709844559585e-07, "loss": 0.003, "reward": 1.9989755153656006, "reward_std": 2.179729517592932e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989754259586334, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.3108808290155443, "grad_norm": 0.2536298236899121, "kl": 0.544921875, "learning_rate": 7.689119170984456e-07, "loss": 0.0025, "reward": 2.4999934434890747, "reward_std": 2.1087797108521045e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.313471502590674, "grad_norm": 12.263274199699282, "kl": 2.646484375, "learning_rate": 7.686528497409325e-07, "loss": 0.0106, "reward": 1.9707841873168945, "reward_std": 0.0007504088125642738, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4707842469215393, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.3160621761658033, "grad_norm": 1.0730935516375741, "kl": 0.619140625, "learning_rate": 7.683937823834196e-07, "loss": 0.0026, "reward": 2.4999959468841553, "reward_std": 3.0141371780700865e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.318652849740933, "grad_norm": 1.5507809207674266, "kl": 0.57421875, "learning_rate": 7.681347150259066e-07, "loss": 0.0013, "reward": 2.499983549118042, "reward_std": 5.905121554405923e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999838471412659, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.321243523316062, "grad_norm": 0.10316397342374276, "kl": 0.5595703125, "learning_rate": 7.678756476683938e-07, "loss": 0.0019, "reward": 2.4999955892562866, "reward_std": 1.370376935483364e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.3238341968911915, "grad_norm": 2.494156379847518, "kl": 0.634765625, "learning_rate": 7.676165803108809e-07, "loss": 0.0028, "reward": 1.9963735342025757, "reward_std": 2.6715618105299654e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4963735342025757, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.326424870466321, "grad_norm": 38.74158179342059, "kl": 0.611328125, "learning_rate": 7.673575129533679e-07, "loss": 0.0027, "reward": 2.374591112136841, "reward_std": 0.2322060577110392, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8745912313461304, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.3290155440414506, "grad_norm": 1.1892774050300812, "kl": 0.615234375, "learning_rate": 7.670984455958549e-07, "loss": 0.0015, "reward": 1.9976301193237305, "reward_std": 2.2865874029776023e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497630089521408, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.33160621761658, "grad_norm": 0.43156780266958966, "kl": 0.658203125, "learning_rate": 7.668393782383419e-07, "loss": 0.0025, "reward": 2.499993085861206, "reward_std": 8.399609896514448e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3341968911917097, "grad_norm": 7.383958852291368, "kl": 0.5859375, "learning_rate": 7.66580310880829e-07, "loss": 0.0024, "reward": 1.1900035440921783, "reward_std": 0.000649515917757526, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.6900034993886948, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.3367875647668392, "grad_norm": 20.531115847153167, "kl": 0.5625, "learning_rate": 7.663212435233161e-07, "loss": 0.0022, "reward": 2.4990603923797607, "reward_std": 0.0004444948153832229, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9990603923797607, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.339378238341969, "grad_norm": 3.679880530507776, "kl": 0.689453125, "learning_rate": 7.660621761658031e-07, "loss": 0.0024, "reward": 1.9992091059684753, "reward_std": 1.6536886505491566e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992091655731201, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.3419689119170983, "grad_norm": 0.39720845718263026, "kl": 0.5625, "learning_rate": 7.658031088082902e-07, "loss": 0.0014, "reward": 2.4999938011169434, "reward_std": 4.172498847765382e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.344559585492228, "grad_norm": 4.076056667878319, "kl": 1.2265625, "learning_rate": 7.655440414507772e-07, "loss": 0.0032, "reward": 2.4999964237213135, "reward_std": 6.328612727202199e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.3471502590673574, "grad_norm": 1.4244343506112813, "kl": 0.611328125, "learning_rate": 7.652849740932642e-07, "loss": 0.0034, "reward": 2.4999876022338867, "reward_std": 5.460921670419339e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874234199524, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.349740932642487, "grad_norm": 11.782272882279319, "kl": 0.5263671875, "learning_rate": 7.650259067357513e-07, "loss": 0.0021, "reward": 1.99983412027359, "reward_std": 1.7881741769087967e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49983412027359, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3523316062176165, "grad_norm": 4.522252794465685, "kl": 0.587890625, "learning_rate": 7.647668393782383e-07, "loss": 0.002, "reward": 2.4999942779541016, "reward_std": 1.1338036188135447e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.354922279792746, "grad_norm": 0.24861707733073535, "kl": 0.580078125, "learning_rate": 7.645077720207254e-07, "loss": 0.0019, "reward": 2.4999969005584717, "reward_std": 2.4174229906748224e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 2.3575129533678756, "grad_norm": 119.78999485554999, "kl": 0.541015625, "learning_rate": 7.642487046632125e-07, "loss": 0.0028, "reward": 2.0919803977012634, "reward_std": 0.2519297336739328, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5919803380966187, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.360103626943005, "grad_norm": 0.13027284660116642, "kl": 0.59375, "learning_rate": 7.639896373056994e-07, "loss": 0.0027, "reward": 2.499997854232788, "reward_std": 1.611621712527267e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 2.3626943005181347, "grad_norm": 23.353706019463832, "kl": 0.609375, "learning_rate": 7.637305699481865e-07, "loss": 0.0034, "reward": 2.3283960819244385, "reward_std": 0.3177375477896476, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8283960819244385, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.365284974093264, "grad_norm": 0.8786045880138211, "kl": 0.58203125, "learning_rate": 7.634715025906735e-07, "loss": 0.0025, "reward": 2.4999958276748657, "reward_std": 3.0884592376878572e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 2.3678756476683938, "grad_norm": 32.32075858158416, "kl": 0.580078125, "learning_rate": 7.632124352331606e-07, "loss": 0.002, "reward": 2.1023336052894592, "reward_std": 0.24543633735720505, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.602333664894104, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.3704663212435233, "grad_norm": 15.283046660858098, "kl": 0.763671875, "learning_rate": 7.629533678756477e-07, "loss": 0.003, "reward": 1.687330186367035, "reward_std": 0.40831319987773895, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1873301267623901, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.373056994818653, "grad_norm": 0.12140395415938718, "kl": 0.5458984375, "learning_rate": 7.626943005181347e-07, "loss": 0.0019, "reward": 2.499998927116394, "reward_std": 9.545253476517246e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.3756476683937824, "grad_norm": 0.6774660813121185, "kl": 0.62890625, "learning_rate": 7.624352331606217e-07, "loss": 0.0032, "reward": 2.499975085258484, "reward_std": 9.2962916369288e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999749064445496, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.378238341968912, "grad_norm": 21.2646412919919, "kl": 0.701171875, "learning_rate": 7.621761658031087e-07, "loss": 0.0022, "reward": 2.061279296875, "reward_std": 0.1772737633291399, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5612794160842896, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.3808290155440415, "grad_norm": 0.10653694909391444, "kl": 0.576171875, "learning_rate": 7.619170984455958e-07, "loss": 0.0017, "reward": 2.499995470046997, "reward_std": 3.461220330791548e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.383419689119171, "grad_norm": 0.07741459505478553, "kl": 0.58203125, "learning_rate": 7.616580310880829e-07, "loss": 0.0023, "reward": 2.49999737739563, "reward_std": 1.7808486632020504e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.3860103626943006, "grad_norm": 0.433270478873534, "kl": 0.65625, "learning_rate": 7.613989637305699e-07, "loss": 0.0012, "reward": 2.4999927282333374, "reward_std": 4.9335510539094685e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.38860103626943, "grad_norm": 4.379834730897725, "kl": 0.5234375, "learning_rate": 7.61139896373057e-07, "loss": 0.0016, "reward": 2.4999895095825195, "reward_std": 6.246718669444817e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999894499778748, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3911917098445596, "grad_norm": 0.09437059332378547, "kl": 0.587890625, "learning_rate": 7.608808290155439e-07, "loss": 0.0026, "reward": 2.4999959468841553, "reward_std": 9.89998724776342e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.393782383419689, "grad_norm": 0.40870981607200857, "kl": 0.64453125, "learning_rate": 7.60621761658031e-07, "loss": 0.0033, "reward": 2.4999958276748657, "reward_std": 3.206738000471887e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.3963730569948187, "grad_norm": 4.311275353641554, "kl": 0.607421875, "learning_rate": 7.603626943005181e-07, "loss": 0.0024, "reward": 2.499971628189087, "reward_std": 5.9424590290291235e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999971628189087, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.3989637305699483, "grad_norm": 1.884380116543594, "kl": 0.5859375, "learning_rate": 7.601036269430051e-07, "loss": 0.0028, "reward": 2.4999746084213257, "reward_std": 1.2694768429355463e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999743700027466, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.401554404145078, "grad_norm": 3.8851800762277082, "kl": 0.7109375, "learning_rate": 7.598445595854922e-07, "loss": 0.0033, "reward": 1.8698992133140564, "reward_std": 0.00029489015918215955, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.369899183511734, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.4041450777202074, "grad_norm": 36.325321722064764, "kl": 0.5263671875, "learning_rate": 7.595854922279792e-07, "loss": 0.002, "reward": 2.3746389150619507, "reward_std": 0.2321225698375997, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8746389150619507, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.406735751295337, "grad_norm": 13.479244201983148, "kl": 0.62890625, "learning_rate": 7.593264248704662e-07, "loss": 0.0022, "reward": 1.9991964101791382, "reward_std": 0.0001618796889601981, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991964101791382, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.4093264248704664, "grad_norm": 27.758663501968172, "kl": 0.66015625, "learning_rate": 7.590673575129533e-07, "loss": 0.0026, "reward": 1.9333068132400513, "reward_std": 0.18467982392030535, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4333068132400513, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.411917098445596, "grad_norm": 1.2800651758080959, "kl": 0.61328125, "learning_rate": 7.588082901554403e-07, "loss": 0.0018, "reward": 2.4999858140945435, "reward_std": 1.1215276572329458e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985933303833, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.4145077720207255, "grad_norm": 0.15640701724460748, "kl": 0.5322265625, "learning_rate": 7.585492227979274e-07, "loss": 0.0023, "reward": 2.4999929666519165, "reward_std": 3.206113831311086e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.417098445595855, "grad_norm": 31.366012263935975, "kl": 0.591796875, "learning_rate": 7.582901554404145e-07, "loss": 0.0029, "reward": 1.9989227056503296, "reward_std": 0.00017651030816523416, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989225268363953, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.4196891191709846, "grad_norm": 0.32504999565317566, "kl": 0.578125, "learning_rate": 7.580310880829015e-07, "loss": 0.0022, "reward": 2.4999947547912598, "reward_std": 2.8908783065162424e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.422279792746114, "grad_norm": 2.5713062997615346, "kl": 0.642578125, "learning_rate": 7.577720207253885e-07, "loss": 0.0029, "reward": 2.499922275543213, "reward_std": 5.819086345582036e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999922275543213, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.4248704663212437, "grad_norm": 7.5781767589938545, "kl": 0.61328125, "learning_rate": 7.575129533678755e-07, "loss": 0.0032, "reward": 1.999721646308899, "reward_std": 7.019639929239929e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997215569019318, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.4274611398963732, "grad_norm": 0.21247104305655287, "kl": 0.619140625, "learning_rate": 7.572538860103626e-07, "loss": 0.0025, "reward": 2.4999938011169434, "reward_std": 2.9362248596953577e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.4300518134715023, "grad_norm": 5.104978739277492, "kl": 0.5322265625, "learning_rate": 7.569948186528498e-07, "loss": 0.0021, "reward": 1.9997690916061401, "reward_std": 2.300553524037241e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997690320014954, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 2.432642487046632, "grad_norm": 10.605672433336606, "kl": 0.587890625, "learning_rate": 7.567357512953368e-07, "loss": 0.0031, "reward": 1.9931389093399048, "reward_std": 0.01749832135806173, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4931387901306152, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 2.4352331606217614, "grad_norm": 14.14141100659577, "kl": 0.572265625, "learning_rate": 7.564766839378239e-07, "loss": 0.0025, "reward": 2.499860167503357, "reward_std": 6.533151258736325e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998602271080017, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 2.437823834196891, "grad_norm": 7.182114033951788, "kl": 0.623046875, "learning_rate": 7.562176165803108e-07, "loss": 0.0027, "reward": 1.998765230178833, "reward_std": 6.207824844750576e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987651407718658, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.4404145077720205, "grad_norm": 1.832905148558534, "kl": 0.69921875, "learning_rate": 7.559585492227979e-07, "loss": 0.0039, "reward": 2.499989628791809, "reward_std": 1.2279784868951538e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989628791809, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.44300518134715, "grad_norm": 2.4421967927750483, "kl": 0.52734375, "learning_rate": 7.55699481865285e-07, "loss": 0.002, "reward": 1.9997816681861877, "reward_std": 3.41636231269149e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997816979885101, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.4455958549222796, "grad_norm": 66.36900038801208, "kl": 0.55859375, "learning_rate": 7.55440414507772e-07, "loss": 0.0022, "reward": 2.374892830848694, "reward_std": 0.23163343284431903, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748928308486938, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.448186528497409, "grad_norm": 6.1162789728696865, "kl": 0.609375, "learning_rate": 7.551813471502591e-07, "loss": 0.0024, "reward": 1.8022658228874207, "reward_std": 0.00031342578148496614, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3022658824920654, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 2.4507772020725387, "grad_norm": 0.17323638276895145, "kl": 0.587890625, "learning_rate": 7.549222797927461e-07, "loss": 0.0031, "reward": 2.4999977350234985, "reward_std": 2.4917172822824796e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.4533678756476682, "grad_norm": 0.4577854727736334, "kl": 0.65625, "learning_rate": 7.546632124352331e-07, "loss": 0.0034, "reward": 2.4994534254074097, "reward_std": 7.672408969483513e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999453365802765, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.4559585492227978, "grad_norm": 0.06235251533700957, "kl": 0.537109375, "learning_rate": 7.544041450777202e-07, "loss": 0.0027, "reward": 2.499998927116394, "reward_std": 8.750069184770837e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 2.4585492227979273, "grad_norm": 2.011999114417474, "kl": 0.611328125, "learning_rate": 7.541450777202072e-07, "loss": 0.0017, "reward": 2.498802900314331, "reward_std": 4.284227918560646e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9988030195236206, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.461139896373057, "grad_norm": 114.28351539648929, "kl": 0.619140625, "learning_rate": 7.538860103626943e-07, "loss": 0.0033, "reward": 1.98245108127594, "reward_std": 0.000968717007935993, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4824508428573608, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.4637305699481864, "grad_norm": 16.16995940606268, "kl": 0.736328125, "learning_rate": 7.536269430051813e-07, "loss": 0.0032, "reward": 1.9327902793884277, "reward_std": 0.0006903799221618101, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4327903389930725, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.466321243523316, "grad_norm": 1.088122536065148, "kl": 0.55078125, "learning_rate": 7.533678756476684e-07, "loss": 0.0022, "reward": 2.4999901056289673, "reward_std": 5.9213186887063785e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.4689119170984455, "grad_norm": 0.14219131429296464, "kl": 0.646484375, "learning_rate": 7.531088082901554e-07, "loss": 0.0021, "reward": 2.4999964237213135, "reward_std": 1.848027778805772e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.471502590673575, "grad_norm": 0.352883524421245, "kl": 0.603515625, "learning_rate": 7.528497409326424e-07, "loss": 0.0033, "reward": 2.499988079071045, "reward_std": 3.844083124704412e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879598617554, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.4740932642487046, "grad_norm": 6.906195955817216, "kl": 0.60546875, "learning_rate": 7.525906735751295e-07, "loss": 0.0023, "reward": 2.499780535697937, "reward_std": 6.184356846006267e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997804760932922, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.476683937823834, "grad_norm": 79.94981824576323, "kl": 0.580078125, "learning_rate": 7.523316062176166e-07, "loss": 0.0023, "reward": 1.5549529790878296, "reward_std": 0.17820383014623076, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0549529194831848, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 2.4792746113989637, "grad_norm": 59.406527797132725, "kl": 0.4794921875, "learning_rate": 7.520725388601036e-07, "loss": 0.0019, "reward": 1.7177022099494934, "reward_std": 0.30141002187701815, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2177022099494934, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.481865284974093, "grad_norm": 12.776015246741643, "kl": 0.68359375, "learning_rate": 7.518134715025907e-07, "loss": 0.0024, "reward": 2.248867154121399, "reward_std": 0.2677266724858782, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7488670945167542, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.4844559585492227, "grad_norm": 5.236448204857519, "kl": 0.607421875, "learning_rate": 7.515544041450776e-07, "loss": 0.0031, "reward": 2.4999892711639404, "reward_std": 3.724578050423588e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.4870466321243523, "grad_norm": 1.7744024433762033, "kl": 0.525390625, "learning_rate": 7.512953367875647e-07, "loss": 0.0015, "reward": 2.4999762773513794, "reward_std": 1.8287130671978957e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976396560669, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.489637305699482, "grad_norm": 21.752426294936416, "kl": 0.521484375, "learning_rate": 7.510362694300518e-07, "loss": 0.0029, "reward": 2.1245163679122925, "reward_std": 0.23171412267447522, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6245161890983582, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.4922279792746114, "grad_norm": 3.031598145964468, "kl": 0.640625, "learning_rate": 7.507772020725388e-07, "loss": 0.0033, "reward": 1.9987646341323853, "reward_std": 2.749076361396874e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987645149230957, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 2.494818652849741, "grad_norm": 27.286611939263214, "kl": 0.6171875, "learning_rate": 7.505181347150259e-07, "loss": 0.0026, "reward": 1.4918133616447449, "reward_std": 0.0036306986385170603, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9918132722377777, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.4974093264248705, "grad_norm": 0.2741287251267847, "kl": 0.625, "learning_rate": 7.502590673575129e-07, "loss": 0.0029, "reward": 2.4999959468841553, "reward_std": 3.7872012796924537e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 2.5, "grad_norm": 13.092375063476396, "kl": 0.697265625, "learning_rate": 7.5e-07, "loss": 0.0031, "reward": 2.4373263120651245, "reward_std": 0.17723388059971512, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373263120651245, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.5025906735751295, "grad_norm": 0.2618263726776021, "kl": 0.53515625, "learning_rate": 7.49740932642487e-07, "loss": 0.0019, "reward": 2.4999920129776, "reward_std": 3.62072921689105e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 2.505181347150259, "grad_norm": 8.593595913040115, "kl": 0.57421875, "learning_rate": 7.49481865284974e-07, "loss": 0.003, "reward": 2.499876379966736, "reward_std": 7.658883987460285e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998762607574463, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.5077720207253886, "grad_norm": 89.29152820345384, "kl": 0.609375, "learning_rate": 7.492227979274611e-07, "loss": 0.0029, "reward": 2.372649669647217, "reward_std": 0.23578068103074656, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8726496696472168, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.510362694300518, "grad_norm": 0.15398367346885286, "kl": 0.4931640625, "learning_rate": 7.489637305699481e-07, "loss": 0.0013, "reward": 2.499997615814209, "reward_std": 2.3334175267564206e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.5129533678756477, "grad_norm": 0.07459519524295279, "kl": 0.609375, "learning_rate": 7.487046632124352e-07, "loss": 0.001, "reward": 2.4999969005584717, "reward_std": 1.306709208392931e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.5155440414507773, "grad_norm": 15.859949166359224, "kl": 0.85546875, "learning_rate": 7.484455958549223e-07, "loss": 0.0036, "reward": 2.374956965446472, "reward_std": 0.2315176915453776, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749569058418274, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.518134715025907, "grad_norm": 6.422335868305393, "kl": 0.62109375, "learning_rate": 7.481865284974092e-07, "loss": 0.0026, "reward": 1.2836967706680298, "reward_std": 0.00046862858289387077, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7836967408657074, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.5207253886010363, "grad_norm": 0.23769327921045735, "kl": 0.60546875, "learning_rate": 7.479274611398963e-07, "loss": 0.0015, "reward": 2.499996542930603, "reward_std": 1.654968116326927e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.523316062176166, "grad_norm": 128.50981954676396, "kl": 0.658203125, "learning_rate": 7.476683937823833e-07, "loss": 0.0026, "reward": 1.2282724976539612, "reward_std": 0.008970540598966181, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.728272557258606, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.5259067357512954, "grad_norm": 2.876134118367682, "kl": 0.578125, "learning_rate": 7.474093264248704e-07, "loss": 0.0028, "reward": 1.9998290538787842, "reward_std": 1.6503866959283187e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998290240764618, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.528497409326425, "grad_norm": 67.40445787372204, "kl": 0.634765625, "learning_rate": 7.471502590673575e-07, "loss": 0.0023, "reward": 1.9995241165161133, "reward_std": 0.0006343295460169429, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995241463184357, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.5310880829015545, "grad_norm": 0.36206119408527215, "kl": 0.58203125, "learning_rate": 7.468911917098445e-07, "loss": 0.0029, "reward": 2.4999958276748657, "reward_std": 2.111678810479134e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.533678756476684, "grad_norm": 0.7063625777492147, "kl": 0.708984375, "learning_rate": 7.466321243523315e-07, "loss": 0.0034, "reward": 2.4999887943267822, "reward_std": 7.369596687567537e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988853931427, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.5362694300518136, "grad_norm": 0.17823958707874737, "kl": 0.623046875, "learning_rate": 7.463730569948187e-07, "loss": 0.0044, "reward": 2.4999969005584717, "reward_std": 2.0820922657094343e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.538860103626943, "grad_norm": 14.73421330424598, "kl": 0.6171875, "learning_rate": 7.461139896373057e-07, "loss": 0.0026, "reward": 1.9364207983016968, "reward_std": 0.1770634657877963, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4364207088947296, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 2.5414507772020727, "grad_norm": 17.74390690002725, "kl": 0.552734375, "learning_rate": 7.458549222797928e-07, "loss": 0.0023, "reward": 1.8512808084487915, "reward_std": 0.0018155387424485525, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3512806594371796, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.5440414507772022, "grad_norm": 0.49021061710155434, "kl": 0.58203125, "learning_rate": 7.455958549222798e-07, "loss": 0.0031, "reward": 2.499996304512024, "reward_std": 2.9877732004024438e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.5466321243523318, "grad_norm": 0.12649601556651824, "kl": 0.5859375, "learning_rate": 7.453367875647669e-07, "loss": 0.0026, "reward": 2.4999935626983643, "reward_std": 2.345544316995074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.5492227979274613, "grad_norm": 0.19552086007182168, "kl": 0.529296875, "learning_rate": 7.450777202072539e-07, "loss": 0.0025, "reward": 2.499995708465576, "reward_std": 2.836892576851824e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.551813471502591, "grad_norm": 20.292588967699295, "kl": 0.541015625, "learning_rate": 7.448186528497409e-07, "loss": 0.0026, "reward": 2.4998886585235596, "reward_std": 0.00011506282578466198, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998887181282043, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.5544041450777204, "grad_norm": 0.28934490221739917, "kl": 0.65234375, "learning_rate": 7.44559585492228e-07, "loss": 0.0033, "reward": 2.4999899864196777, "reward_std": 1.6398790165794708e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999899864196777, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.55699481865285, "grad_norm": 3.5735356157707185, "kl": 0.544921875, "learning_rate": 7.44300518134715e-07, "loss": 0.0026, "reward": 1.997496485710144, "reward_std": 3.460847528913291e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4974964559078217, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.5595854922279795, "grad_norm": 19.352268359346425, "kl": 0.61328125, "learning_rate": 7.440414507772021e-07, "loss": 0.0025, "reward": 1.3498385548591614, "reward_std": 0.0005047675367677584, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8498385846614838, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.562176165803109, "grad_norm": 1.981653722579117, "kl": 1.150390625, "learning_rate": 7.437823834196892e-07, "loss": 0.0041, "reward": 2.4999899864196777, "reward_std": 1.0264709544571815e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999899864196777, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.5647668393782386, "grad_norm": 1.5157352121693908, "kl": 0.560546875, "learning_rate": 7.435233160621761e-07, "loss": 0.0025, "reward": 1.9982438683509827, "reward_std": 5.083354699308984e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982438385486603, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.567357512953368, "grad_norm": 0.07687578218558838, "kl": 0.5087890625, "learning_rate": 7.432642487046632e-07, "loss": 0.0019, "reward": 2.499998688697815, "reward_std": 1.62989920227119e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.5699481865284977, "grad_norm": 30.17881398208321, "kl": 0.591796875, "learning_rate": 7.430051813471502e-07, "loss": 0.0026, "reward": 1.7900046110153198, "reward_std": 0.2592464115805342, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.290004551410675, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.572538860103627, "grad_norm": 14.518514669135504, "kl": 0.56640625, "learning_rate": 7.427461139896373e-07, "loss": 0.0024, "reward": 1.9992915987968445, "reward_std": 7.533883888299897e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992913603782654, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.5751295336787567, "grad_norm": 0.16752370828968927, "kl": 0.595703125, "learning_rate": 7.424870466321244e-07, "loss": 0.0033, "reward": 2.499996304512024, "reward_std": 1.7472955846642435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 2.5777202072538863, "grad_norm": 31.77498901477639, "kl": 0.62890625, "learning_rate": 7.422279792746114e-07, "loss": 0.0032, "reward": 1.924851655960083, "reward_std": 0.2679212669761455, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4248515665531158, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.5803108808290154, "grad_norm": 59.447315600071164, "kl": 0.638671875, "learning_rate": 7.419689119170984e-07, "loss": 0.0028, "reward": 1.9531428217887878, "reward_std": 0.018399068091866866, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4531428515911102, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.582901554404145, "grad_norm": 0.276857979792695, "kl": 0.599609375, "learning_rate": 7.417098445595854e-07, "loss": 0.0031, "reward": 2.4999935626983643, "reward_std": 5.544983082472754e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.5854922279792745, "grad_norm": 8.074741453110477, "kl": 0.52734375, "learning_rate": 7.414507772020725e-07, "loss": 0.0025, "reward": 2.4996321201324463, "reward_std": 0.0002697518038985436, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996322989463806, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.588082901554404, "grad_norm": 8.386613116390619, "kl": 0.580078125, "learning_rate": 7.411917098445596e-07, "loss": 0.0022, "reward": 2.499936580657959, "reward_std": 3.581345259817681e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999936580657959, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.5906735751295336, "grad_norm": 0.5987973944110988, "kl": 0.521484375, "learning_rate": 7.409326424870466e-07, "loss": 0.0018, "reward": 2.4999911785125732, "reward_std": 6.202222948559211e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991238117218, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.593264248704663, "grad_norm": 4.87622257734275, "kl": 0.548828125, "learning_rate": 7.406735751295337e-07, "loss": 0.0011, "reward": 2.4999489784240723, "reward_std": 3.9846519200636976e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999491572380066, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.5958549222797926, "grad_norm": 1.001787873848091, "kl": 0.646484375, "learning_rate": 7.404145077720207e-07, "loss": 0.0042, "reward": 2.4999886751174927, "reward_std": 6.360509019032179e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999884963035583, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 2.598445595854922, "grad_norm": 0.06664138081393953, "kl": 0.4990234375, "learning_rate": 7.401554404145077e-07, "loss": 0.0017, "reward": 2.4999982118606567, "reward_std": 1.3958422755422362e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.6010362694300517, "grad_norm": 11.151213063225967, "kl": 0.53515625, "learning_rate": 7.398963730569948e-07, "loss": 0.0022, "reward": 2.4991356134414673, "reward_std": 7.643758567610348e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999135673046112, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.6036269430051813, "grad_norm": 3.169729544833356, "kl": 0.568359375, "learning_rate": 7.396373056994818e-07, "loss": 0.0017, "reward": 1.9402807354927063, "reward_std": 0.00017156028567910653, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4402807652950287, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.606217616580311, "grad_norm": 13.90051239396124, "kl": 0.6328125, "learning_rate": 7.393782383419689e-07, "loss": 0.0024, "reward": 1.9994070529937744, "reward_std": 0.00014929118333384395, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994070529937744, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.6088082901554404, "grad_norm": 24.78992990747229, "kl": 0.62890625, "learning_rate": 7.39119170984456e-07, "loss": 0.003, "reward": 1.9996339082717896, "reward_std": 4.13798379668151e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996338486671448, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.61139896373057, "grad_norm": 20.6842858332421, "kl": 0.52734375, "learning_rate": 7.388601036269429e-07, "loss": 0.0024, "reward": 2.43747878074646, "reward_std": 0.17681364501987673, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374787211418152, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.6139896373056994, "grad_norm": 0.1554282829706568, "kl": 0.572265625, "learning_rate": 7.3860103626943e-07, "loss": 0.0029, "reward": 2.4999959468841553, "reward_std": 2.6558952299637895e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.616580310880829, "grad_norm": 0.8399412558079791, "kl": 0.5166015625, "learning_rate": 7.38341968911917e-07, "loss": 0.0023, "reward": 2.4999876022338867, "reward_std": 7.493700053373686e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874234199524, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.6191709844559585, "grad_norm": 1.22144494922258, "kl": 0.658203125, "learning_rate": 7.380829015544041e-07, "loss": 0.0017, "reward": 2.4999879598617554, "reward_std": 1.031075066748599e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.621761658031088, "grad_norm": 2.4284239829022876, "kl": 0.623046875, "learning_rate": 7.378238341968912e-07, "loss": 0.0029, "reward": 1.9178662300109863, "reward_std": 0.00011027958566955931, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.417866200208664, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.6243523316062176, "grad_norm": 19.07721079471984, "kl": 0.5205078125, "learning_rate": 7.375647668393782e-07, "loss": 0.003, "reward": 1.9953519701957703, "reward_std": 0.0002652873045008164, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4953519403934479, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.626943005181347, "grad_norm": 1.862947710205046, "kl": 0.82421875, "learning_rate": 7.373056994818652e-07, "loss": 0.0036, "reward": 2.499986171722412, "reward_std": 1.00274053238536e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999860525131226, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.6295336787564767, "grad_norm": 1.235329257845796, "kl": 0.62109375, "learning_rate": 7.370466321243522e-07, "loss": 0.0021, "reward": 2.4999889135360718, "reward_std": 5.0256810482096625e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988853931427, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.6321243523316062, "grad_norm": 0.9128958814818373, "kl": 0.61328125, "learning_rate": 7.367875647668393e-07, "loss": 0.0027, "reward": 2.4999836683273315, "reward_std": 1.0410579989184043e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999836683273315, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.634715025906736, "grad_norm": 342.16526321074656, "kl": 0.55859375, "learning_rate": 7.365284974093264e-07, "loss": 0.0016, "reward": 2.498082756996155, "reward_std": 0.002041650069259049, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9980829358100891, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.6373056994818653, "grad_norm": 18.06473506398839, "kl": 0.619140625, "learning_rate": 7.362694300518134e-07, "loss": 0.0034, "reward": 1.910888910293579, "reward_std": 0.0006456601977333776, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.410888671875, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 2.639896373056995, "grad_norm": 122.58722350207216, "kl": 0.75, "learning_rate": 7.360103626943005e-07, "loss": 0.0032, "reward": 1.5609620213508606, "reward_std": 0.17711189764304436, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0609620213508606, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 2.6424870466321244, "grad_norm": 243.0708631986865, "kl": 0.59375, "learning_rate": 7.357512953367874e-07, "loss": 0.0018, "reward": 1.9978336691856384, "reward_std": 0.0017346380832350405, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497833639383316, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.645077720207254, "grad_norm": 2.5156945184730826, "kl": 0.5546875, "learning_rate": 7.354922279792745e-07, "loss": 0.0015, "reward": 2.4999805688858032, "reward_std": 1.765838010214793e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999807476997375, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.6476683937823835, "grad_norm": 1.7128944260849968, "kl": 0.62890625, "learning_rate": 7.352331606217617e-07, "loss": 0.0014, "reward": 2.49998140335083, "reward_std": 8.653629265609197e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999818205833435, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.650259067357513, "grad_norm": 0.10296044867897858, "kl": 0.59375, "learning_rate": 7.349740932642487e-07, "loss": 0.0028, "reward": 2.4999982118606567, "reward_std": 1.1570347169254092e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.6528497409326426, "grad_norm": 2.889232457720384, "kl": 0.556640625, "learning_rate": 7.347150259067358e-07, "loss": 0.0028, "reward": 1.9982944130897522, "reward_std": 4.1515821976645384e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982944130897522, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.655440414507772, "grad_norm": 0.7019635496804637, "kl": 0.587890625, "learning_rate": 7.344559585492228e-07, "loss": 0.001, "reward": 2.4999825954437256, "reward_std": 1.0534914281379315e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999829530715942, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.6580310880829017, "grad_norm": 0.5114498362948312, "kl": 0.63671875, "learning_rate": 7.341968911917098e-07, "loss": 0.0017, "reward": 2.499989628791809, "reward_std": 3.957206104132638e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898076057434, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.660621761658031, "grad_norm": 0.36440611444565657, "kl": 0.5458984375, "learning_rate": 7.339378238341969e-07, "loss": 0.0029, "reward": 2.4999966621398926, "reward_std": 3.727925218299788e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.6632124352331608, "grad_norm": 3.2807156158572184, "kl": 0.583984375, "learning_rate": 7.336787564766839e-07, "loss": 0.0026, "reward": 2.49996817111969, "reward_std": 9.629137650790653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999679327011108, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.6658031088082903, "grad_norm": 0.5699560302530254, "kl": 0.603515625, "learning_rate": 7.33419689119171e-07, "loss": 0.0037, "reward": 2.499987483024597, "reward_std": 5.23601778468219e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874234199524, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.66839378238342, "grad_norm": 0.23183470337197568, "kl": 0.662109375, "learning_rate": 7.331606217616581e-07, "loss": 0.0032, "reward": 2.4999940395355225, "reward_std": 1.912548952986981e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.6709844559585494, "grad_norm": 0.9388499401118193, "kl": 0.607421875, "learning_rate": 7.329015544041451e-07, "loss": 0.0028, "reward": 2.499990224838257, "reward_std": 8.745879767957376e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990165233612, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 2.6735751295336785, "grad_norm": 65.0107502980631, "kl": 0.6640625, "learning_rate": 7.326424870466321e-07, "loss": 0.0025, "reward": 1.9227005243301392, "reward_std": 0.016036122865671132, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4227006137371063, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.676165803108808, "grad_norm": 1.6160050355280635, "kl": 0.552734375, "learning_rate": 7.323834196891191e-07, "loss": 0.0022, "reward": 1.9447761178016663, "reward_std": 0.00011315471095940666, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4447761178016663, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.6787564766839376, "grad_norm": 1.4402152223018394, "kl": 0.650390625, "learning_rate": 7.321243523316062e-07, "loss": 0.0013, "reward": 2.4999927282333374, "reward_std": 3.114154935701663e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.681347150259067, "grad_norm": 9.861295729776351, "kl": 0.52734375, "learning_rate": 7.318652849740933e-07, "loss": 0.0023, "reward": 1.993043065071106, "reward_std": 0.0011060473804604953, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4930429458618164, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.6839378238341967, "grad_norm": 33.836881546641784, "kl": 0.564453125, "learning_rate": 7.316062176165803e-07, "loss": 0.0022, "reward": 1.4937424659729004, "reward_std": 0.000841647750348784, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9937424659729004, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.686528497409326, "grad_norm": 16.255331476195174, "kl": 0.66015625, "learning_rate": 7.313471502590674e-07, "loss": 0.0026, "reward": 1.4789152145385742, "reward_std": 0.006790194016502937, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9789152443408966, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.6891191709844557, "grad_norm": 2.2820693262557614, "kl": 0.66015625, "learning_rate": 7.310880829015543e-07, "loss": 0.0021, "reward": 1.9990639686584473, "reward_std": 2.2915255556199554e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499064028263092, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.6917098445595853, "grad_norm": 24.325514175458768, "kl": 0.642578125, "learning_rate": 7.308290155440414e-07, "loss": 0.0026, "reward": 1.9958844184875488, "reward_std": 0.0008863257212397002, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4958844780921936, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.694300518134715, "grad_norm": 2.127272719237676, "kl": 0.525390625, "learning_rate": 7.305699481865285e-07, "loss": 0.0022, "reward": 1.9998043775558472, "reward_std": 2.1155123704375e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998044967651367, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.6968911917098444, "grad_norm": 36.09028955344603, "kl": 0.529296875, "learning_rate": 7.303108808290155e-07, "loss": 0.0025, "reward": 1.8975412249565125, "reward_std": 0.008347328531272069, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3975412249565125, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.699481865284974, "grad_norm": 5.694815568054407, "kl": 0.591796875, "learning_rate": 7.300518134715026e-07, "loss": 0.0033, "reward": 2.499982714653015, "reward_std": 1.706350940366974e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999825954437256, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.7020725388601035, "grad_norm": 0.06859526823366953, "kl": 0.5, "learning_rate": 7.297927461139896e-07, "loss": 0.003, "reward": 2.4999992847442627, "reward_std": 5.856752807176235e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.704663212435233, "grad_norm": 0.28995463439556984, "kl": 0.53515625, "learning_rate": 7.295336787564766e-07, "loss": 0.0026, "reward": 1.9993937611579895, "reward_std": 7.021259932571411e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499393880367279, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.7072538860103625, "grad_norm": 42.11245203143001, "kl": 0.5703125, "learning_rate": 7.292746113989637e-07, "loss": 0.0022, "reward": 1.9991716146469116, "reward_std": 0.00014460187816212056, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991715848445892, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.709844559585492, "grad_norm": 34.4307177586476, "kl": 0.6796875, "learning_rate": 7.290155440414507e-07, "loss": 0.0027, "reward": 1.5421943068504333, "reward_std": 0.2601618515327573, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0421943962574005, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.7124352331606216, "grad_norm": 17.19732502400883, "kl": 0.5625, "learning_rate": 7.287564766839378e-07, "loss": 0.0022, "reward": 1.5610501170158386, "reward_std": 0.17714093877884807, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.061050146818161, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.715025906735751, "grad_norm": 28.3177816315109, "kl": 0.59765625, "learning_rate": 7.284974093264248e-07, "loss": 0.0025, "reward": 2.186920642852783, "reward_std": 0.2592533446493235, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.686920702457428, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.7176165803108807, "grad_norm": 11.66661736733289, "kl": 0.609375, "learning_rate": 7.282383419689119e-07, "loss": 0.0029, "reward": 2.4370943307876587, "reward_std": 0.17790817391733071, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9370942115783691, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.7202072538860103, "grad_norm": 33.47781707942804, "kl": 0.634765625, "learning_rate": 7.279792746113989e-07, "loss": 0.0027, "reward": 2.2499648928642273, "reward_std": 0.26728503153299243, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499648332595825, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.72279792746114, "grad_norm": 0.41201428591907235, "kl": 0.603515625, "learning_rate": 7.277202072538859e-07, "loss": 0.0021, "reward": 2.499997854232788, "reward_std": 1.6471865933453955e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.7253886010362693, "grad_norm": 0.4430840694935509, "kl": 0.6015625, "learning_rate": 7.27461139896373e-07, "loss": 0.0022, "reward": 2.4999871253967285, "reward_std": 5.3097385261935415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999871253967285, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.727979274611399, "grad_norm": 0.421816575005633, "kl": 0.5703125, "learning_rate": 7.272020725388601e-07, "loss": 0.0023, "reward": 2.4999964237213135, "reward_std": 3.4549897236502147e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 2.7305699481865284, "grad_norm": 20.418743212374423, "kl": 0.646484375, "learning_rate": 7.269430051813471e-07, "loss": 0.0023, "reward": 1.920526921749115, "reward_std": 0.1349417385600873, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4205269813537598, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.733160621761658, "grad_norm": 0.24435875857433437, "kl": 0.5107421875, "learning_rate": 7.266839378238342e-07, "loss": 0.003, "reward": 2.499996066093445, "reward_std": 3.281681074440712e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.7357512953367875, "grad_norm": 0.5090494008040035, "kl": 0.69140625, "learning_rate": 7.264248704663211e-07, "loss": 0.0025, "reward": 2.499991297721863, "reward_std": 4.869184976996621e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.738341968911917, "grad_norm": 6.565405376595092, "kl": 0.677734375, "learning_rate": 7.261658031088082e-07, "loss": 0.0025, "reward": 1.4979848861694336, "reward_std": 8.402268940699287e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9979849457740784, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.7409326424870466, "grad_norm": 0.4331821825844759, "kl": 0.642578125, "learning_rate": 7.259067357512953e-07, "loss": 0.0036, "reward": 2.499997615814209, "reward_std": 2.8109654408581264e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 37.3125, "epoch": 2.743523316062176, "grad_norm": 17.094276678787622, "kl": 0.646484375, "learning_rate": 7.256476683937823e-07, "loss": 0.0024, "reward": 1.992052972316742, "reward_std": 0.005187961794320017, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.492052972316742, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.7461139896373057, "grad_norm": 2.1249145728484535, "kl": 0.58203125, "learning_rate": 7.253886010362694e-07, "loss": 0.0023, "reward": 1.9975743889808655, "reward_std": 6.202496231821897e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975743889808655, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.7487046632124352, "grad_norm": 0.15845364860556013, "kl": 0.541015625, "learning_rate": 7.251295336787564e-07, "loss": 0.0017, "reward": 2.4999964237213135, "reward_std": 1.7397197780155693e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.7512953367875648, "grad_norm": 1.2262301587265434, "kl": 0.53125, "learning_rate": 7.248704663212434e-07, "loss": 0.0017, "reward": 2.4999920129776, "reward_std": 5.990513045617263e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.7538860103626943, "grad_norm": 13.453230076474377, "kl": 0.548828125, "learning_rate": 7.246113989637305e-07, "loss": 0.0025, "reward": 2.499941110610962, "reward_std": 0.00011490866356211882, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999409317970276, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 2.756476683937824, "grad_norm": 3.8049863990573356, "kl": 0.583984375, "learning_rate": 7.243523316062175e-07, "loss": 0.0018, "reward": 2.4999901056289673, "reward_std": 5.761620570865489e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990165233612, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 37.9375, "epoch": 2.7590673575129534, "grad_norm": 69.50456100709852, "kl": 0.580078125, "learning_rate": 7.240932642487047e-07, "loss": 0.0027, "reward": 1.93032968044281, "reward_std": 0.0018972860359554033, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.43032968044281, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.761658031088083, "grad_norm": 0.17351997932032776, "kl": 0.541015625, "learning_rate": 7.238341968911917e-07, "loss": 0.0013, "reward": 2.499993085861206, "reward_std": 2.2882469465912436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 2.7642487046632125, "grad_norm": 8.292177270109168, "kl": 0.6015625, "learning_rate": 7.235751295336788e-07, "loss": 0.003, "reward": 2.498842477798462, "reward_std": 0.0003971268295117625, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9988422393798828, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 36.75, "epoch": 2.766839378238342, "grad_norm": 50.67379174299056, "kl": 0.5078125, "learning_rate": 7.233160621761658e-07, "loss": 0.0023, "reward": 2.3114283084869385, "reward_std": 0.2596255473399367, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8114283084869385, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.7694300518134716, "grad_norm": 21.857158920001257, "kl": 0.62890625, "learning_rate": 7.230569948186528e-07, "loss": 0.0026, "reward": 1.3887208700180054, "reward_std": 0.005944397373241372, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8887207806110382, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.772020725388601, "grad_norm": 26.41739854497726, "kl": 0.529296875, "learning_rate": 7.227979274611399e-07, "loss": 0.0027, "reward": 1.9984768629074097, "reward_std": 0.0010226810257165653, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984769225120544, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.7746113989637307, "grad_norm": 0.14629115915363883, "kl": 0.470703125, "learning_rate": 7.225388601036269e-07, "loss": 0.0024, "reward": 2.499998092651367, "reward_std": 2.5279367719122092e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.77720207253886, "grad_norm": 1.1225966710263608, "kl": 0.564453125, "learning_rate": 7.22279792746114e-07, "loss": 0.0023, "reward": 2.4999889135360718, "reward_std": 3.577769462026481e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999889135360718, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 2.7797927461139897, "grad_norm": 1.1477864153276018, "kl": 0.5673828125, "learning_rate": 7.220207253886011e-07, "loss": 0.0027, "reward": 2.499952554702759, "reward_std": 9.058815408025112e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999525547027588, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.7823834196891193, "grad_norm": 3.5342814665932933, "kl": 0.677734375, "learning_rate": 7.21761658031088e-07, "loss": 0.0019, "reward": 2.4999897480010986, "reward_std": 6.753007710358361e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999897480010986, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.784974093264249, "grad_norm": 9.542754220340736, "kl": 0.591796875, "learning_rate": 7.215025906735751e-07, "loss": 0.0023, "reward": 1.9991666674613953, "reward_std": 3.511277463985607e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991666376590729, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.7875647668393784, "grad_norm": 40.87506492993625, "kl": 0.52734375, "learning_rate": 7.212435233160622e-07, "loss": 0.0023, "reward": 1.8811602592468262, "reward_std": 0.0022370561302977876, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3811602592468262, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.790155440414508, "grad_norm": 0.21227975313328307, "kl": 0.619140625, "learning_rate": 7.209844559585492e-07, "loss": 0.0019, "reward": 2.4999985694885254, "reward_std": 9.73535676962456e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.7927461139896375, "grad_norm": 0.12312738367276538, "kl": 0.58984375, "learning_rate": 7.207253886010363e-07, "loss": 0.0021, "reward": 2.4999964237213135, "reward_std": 1.4120066111900087e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.795336787564767, "grad_norm": 47.69069413980415, "kl": 0.587890625, "learning_rate": 7.204663212435233e-07, "loss": 0.0016, "reward": 2.374545097351074, "reward_std": 0.23228111950413677, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8745452165603638, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.7979274611398965, "grad_norm": 1.0705614841678457, "kl": 0.615234375, "learning_rate": 7.202072538860103e-07, "loss": 0.002, "reward": 1.999253511428833, "reward_std": 2.7619794138900033e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992536902427673, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.800518134715026, "grad_norm": 0.6949921483089313, "kl": 0.646484375, "learning_rate": 7.199481865284974e-07, "loss": 0.002, "reward": 2.4999945163726807, "reward_std": 4.0932291085482575e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.8031088082901556, "grad_norm": 0.48406945794703804, "kl": 0.544921875, "learning_rate": 7.196891191709844e-07, "loss": 0.0028, "reward": 2.499992847442627, "reward_std": 3.466269504315278e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.805699481865285, "grad_norm": 3.5410564963143836, "kl": 0.615234375, "learning_rate": 7.194300518134715e-07, "loss": 0.0023, "reward": 2.4999252557754517, "reward_std": 2.5473680807408527e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999252557754517, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.8082901554404147, "grad_norm": 11.210948755866617, "kl": 0.75, "learning_rate": 7.191709844559585e-07, "loss": 0.0024, "reward": 1.9990366697311401, "reward_std": 5.727937605115585e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990367591381073, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.8108808290155443, "grad_norm": 5.780323950923431, "kl": 0.6640625, "learning_rate": 7.189119170984456e-07, "loss": 0.0025, "reward": 1.9989172220230103, "reward_std": 5.087455474495073e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989171624183655, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.813471502590674, "grad_norm": 0.11311761012996831, "kl": 0.5126953125, "learning_rate": 7.186528497409327e-07, "loss": 0.0019, "reward": 2.4999979734420776, "reward_std": 1.1268163007116527e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.8160621761658033, "grad_norm": 1.7059429833670858, "kl": 0.673828125, "learning_rate": 7.183937823834196e-07, "loss": 0.0028, "reward": 1.9987366199493408, "reward_std": 3.181468264301657e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987366497516632, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.818652849740933, "grad_norm": 6.555349657902863, "kl": 0.5625, "learning_rate": 7.181347150259067e-07, "loss": 0.0028, "reward": 2.4999724626541138, "reward_std": 2.1464148744598788e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999723434448242, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8212435233160624, "grad_norm": 0.29114125580947575, "kl": 0.482421875, "learning_rate": 7.178756476683937e-07, "loss": 0.0018, "reward": 2.4999935626983643, "reward_std": 2.8045782869412506e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.823834196891192, "grad_norm": 71.3422678593196, "kl": 0.666015625, "learning_rate": 7.176165803108808e-07, "loss": 0.0034, "reward": 2.3116315603256226, "reward_std": 0.2599714249608951, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8116315603256226, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.8264248704663215, "grad_norm": 0.4356362294485776, "kl": 0.58203125, "learning_rate": 7.173575129533679e-07, "loss": 0.0027, "reward": 2.4999929666519165, "reward_std": 5.683920903720718e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.8290155440414506, "grad_norm": 3.327769860931279, "kl": 0.5703125, "learning_rate": 7.170984455958548e-07, "loss": 0.0014, "reward": 1.9991004467010498, "reward_std": 2.881320961023448e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991004467010498, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.83160621761658, "grad_norm": 3.0546232118277934, "kl": 0.57421875, "learning_rate": 7.168393782383419e-07, "loss": 0.0021, "reward": 1.9986391067504883, "reward_std": 4.775613979290938e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986391067504883, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.8341968911917097, "grad_norm": 1.24724228162235, "kl": 0.623046875, "learning_rate": 7.165803108808289e-07, "loss": 0.0028, "reward": 2.499860167503357, "reward_std": 1.6689350559317973e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998599886894226, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 2.8367875647668392, "grad_norm": 3.290601670206448, "kl": 0.572265625, "learning_rate": 7.16321243523316e-07, "loss": 0.0029, "reward": 1.9992623329162598, "reward_std": 7.482550273607558e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499262273311615, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 2.839378238341969, "grad_norm": 37.20245074999802, "kl": 1.1796875, "learning_rate": 7.160621761658031e-07, "loss": 0.0042, "reward": 1.8171599507331848, "reward_std": 0.11273969305352693, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3171600103378296, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.8419689119170983, "grad_norm": 12.35688293656863, "kl": 0.615234375, "learning_rate": 7.158031088082901e-07, "loss": 0.0025, "reward": 1.7486568689346313, "reward_std": 0.26803254906735674, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2486568689346313, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.844559585492228, "grad_norm": 7.926385296087825, "kl": 0.623046875, "learning_rate": 7.155440414507772e-07, "loss": 0.0025, "reward": 1.3184268474578857, "reward_std": 0.00023067424990586005, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8184268474578857, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.8471502590673574, "grad_norm": 2.3327307428225215, "kl": 0.65234375, "learning_rate": 7.152849740932642e-07, "loss": 0.0029, "reward": 1.999182105064392, "reward_std": 4.654735494113993e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991820454597473, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 36.8125, "epoch": 2.849740932642487, "grad_norm": 29.28679733161428, "kl": 0.6171875, "learning_rate": 7.150259067357512e-07, "loss": 0.0025, "reward": 0.8198869228363037, "reward_std": 0.1057751402258873, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.3198869526386261, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 2.8523316062176165, "grad_norm": 17.667377368328168, "kl": 0.568359375, "learning_rate": 7.147668393782383e-07, "loss": 0.0023, "reward": 1.3958356380462646, "reward_std": 0.000722472024790477, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8958356082439423, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.854922279792746, "grad_norm": 445.37845006749365, "kl": 0.546875, "learning_rate": 7.145077720207253e-07, "loss": 0.0016, "reward": 2.4374139308929443, "reward_std": 0.1770079257757402, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374139904975891, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.8575129533678756, "grad_norm": 0.13049263090310476, "kl": 0.5859375, "learning_rate": 7.142487046632124e-07, "loss": 0.0022, "reward": 2.499998092651367, "reward_std": 1.8085851252180873e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.860103626943005, "grad_norm": 0.42187904732562714, "kl": 0.638671875, "learning_rate": 7.139896373056995e-07, "loss": 0.0028, "reward": 2.4999994039535522, "reward_std": 7.103578525402554e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999994039535522, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 2.8626943005181347, "grad_norm": 54.210851057196734, "kl": 0.759765625, "learning_rate": 7.137305699481864e-07, "loss": 0.0035, "reward": 2.0600619316101074, "reward_std": 0.1772356966503139, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5600618124008179, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.865284974093264, "grad_norm": 0.7063850369089946, "kl": 0.552734375, "learning_rate": 7.134715025906735e-07, "loss": 0.0007, "reward": 2.4999947547912598, "reward_std": 1.0447371209920675e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.8678756476683938, "grad_norm": 0.6825191811393605, "kl": 0.4951171875, "learning_rate": 7.132124352331605e-07, "loss": 0.0032, "reward": 2.4999866485595703, "reward_std": 7.169375635385222e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999865889549255, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.8704663212435233, "grad_norm": 3.4795628257844546, "kl": 0.599609375, "learning_rate": 7.129533678756477e-07, "loss": 0.0023, "reward": 2.187327027320862, "reward_std": 0.2587855036769042, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6873270273208618, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.873056994818653, "grad_norm": 23.088724767828143, "kl": 0.521484375, "learning_rate": 7.126943005181348e-07, "loss": 0.0021, "reward": 1.994544804096222, "reward_std": 0.0005626487545669079, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4945447742938995, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.8756476683937824, "grad_norm": 0.32594931475198513, "kl": 0.6484375, "learning_rate": 7.124352331606218e-07, "loss": 0.0026, "reward": 2.499991774559021, "reward_std": 3.999926548203803e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.878238341968912, "grad_norm": 0.19333830387497986, "kl": 0.603515625, "learning_rate": 7.121761658031088e-07, "loss": 0.0017, "reward": 2.4999966621398926, "reward_std": 3.0064921929806587e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.8808290155440415, "grad_norm": 0.19464124164742058, "kl": 0.619140625, "learning_rate": 7.119170984455958e-07, "loss": 0.0017, "reward": 2.499995708465576, "reward_std": 2.441301944600127e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.883419689119171, "grad_norm": 1.5642236613847356, "kl": 0.4970703125, "learning_rate": 7.116580310880829e-07, "loss": 0.0028, "reward": 2.499983310699463, "reward_std": 8.879003303263744e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999983012676239, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 2.8860103626943006, "grad_norm": 31.945583719688358, "kl": 0.560546875, "learning_rate": 7.1139896373057e-07, "loss": 0.0025, "reward": 1.99392831325531, "reward_std": 0.000869050582195996, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49392831325531, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.88860103626943, "grad_norm": 1.7862884561684693, "kl": 0.5546875, "learning_rate": 7.11139896373057e-07, "loss": 0.002, "reward": 2.499977946281433, "reward_std": 9.770412191301148e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999780058860779, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.8911917098445596, "grad_norm": 2.871813985842403, "kl": 0.484375, "learning_rate": 7.108808290155441e-07, "loss": 0.002, "reward": 2.4999945163726807, "reward_std": 5.2948214630532675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.893782383419689, "grad_norm": 0.46562077990491635, "kl": 0.62890625, "learning_rate": 7.10621761658031e-07, "loss": 0.0021, "reward": 2.4999972581863403, "reward_std": 2.6211142767351703e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.8963730569948187, "grad_norm": 0.1508664748861988, "kl": 0.5654296875, "learning_rate": 7.103626943005181e-07, "loss": 0.0024, "reward": 2.4999983310699463, "reward_std": 1.557139114538586e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.8989637305699483, "grad_norm": 145.6702776697723, "kl": 0.61328125, "learning_rate": 7.101036269430052e-07, "loss": 0.0025, "reward": 1.6220159530639648, "reward_std": 0.23326847677935802, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1220159530639648, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.901554404145078, "grad_norm": 0.17012491184673253, "kl": 0.65625, "learning_rate": 7.098445595854922e-07, "loss": 0.0036, "reward": 2.4999970197677612, "reward_std": 1.9179709056516003e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.9041450777202074, "grad_norm": 22.643373882894355, "kl": 0.611328125, "learning_rate": 7.095854922279793e-07, "loss": 0.0029, "reward": 1.4372776299715042, "reward_std": 0.3467721479457282, "rewards/format_reward_rec": 0.625, "rewards/point_reward": 1.1247776374220848, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.906735751295337, "grad_norm": 0.35236753831865986, "kl": 0.572265625, "learning_rate": 7.093264248704664e-07, "loss": 0.002, "reward": 2.49999463558197, "reward_std": 3.5201626360503724e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9093264248704664, "grad_norm": 14.020937409535295, "kl": 0.595703125, "learning_rate": 7.090673575129533e-07, "loss": 0.0018, "reward": 2.3749892711639404, "reward_std": 0.23145933077006475, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749894499778748, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.911917098445596, "grad_norm": 6.486548507128977, "kl": 0.56640625, "learning_rate": 7.088082901554404e-07, "loss": 0.0025, "reward": 1.9992740154266357, "reward_std": 3.095042802669923e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992740750312805, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.9145077720207255, "grad_norm": 4.8774392593885105, "kl": 0.677734375, "learning_rate": 7.085492227979274e-07, "loss": 0.003, "reward": 1.9998435974121094, "reward_std": 4.179638858659018e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998434782028198, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.917098445595855, "grad_norm": 0.7283042122776016, "kl": 0.58203125, "learning_rate": 7.082901554404145e-07, "loss": 0.0027, "reward": 2.4999821186065674, "reward_std": 5.795685183329624e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982237815857, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.9196891191709846, "grad_norm": 4.063026303587112, "kl": 0.517578125, "learning_rate": 7.080310880829016e-07, "loss": 0.002, "reward": 1.9979990124702454, "reward_std": 9.143089852159392e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4979988932609558, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.9222797927461137, "grad_norm": 13.277711047698382, "kl": 0.62890625, "learning_rate": 7.077720207253886e-07, "loss": 0.0026, "reward": 1.8121846914291382, "reward_std": 0.259098491034365, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3121846616268158, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.9248704663212433, "grad_norm": 161.7370046195627, "kl": 0.6171875, "learning_rate": 7.075129533678756e-07, "loss": 0.0019, "reward": 1.9998925924301147, "reward_std": 8.218448806474044e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499892771244049, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.927461139896373, "grad_norm": 0.28749823716970085, "kl": 0.64453125, "learning_rate": 7.072538860103626e-07, "loss": 0.0026, "reward": 2.4999964237213135, "reward_std": 2.225980715309106e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.9300518134715023, "grad_norm": 5.734926748946173, "kl": 0.6484375, "learning_rate": 7.069948186528497e-07, "loss": 0.0031, "reward": 1.9993936419487, "reward_std": 3.739055318874307e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993936419487, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 2.932642487046632, "grad_norm": 63.31569264833523, "kl": 0.71484375, "learning_rate": 7.067357512953368e-07, "loss": 0.0029, "reward": 1.9196277260780334, "reward_std": 0.10624503466533497, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4196277856826782, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.9352331606217614, "grad_norm": 0.04819538401852324, "kl": 0.599609375, "learning_rate": 7.064766839378238e-07, "loss": 0.0014, "reward": 2.4999988079071045, "reward_std": 6.925934883383889e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.937823834196891, "grad_norm": 0.8258854753247583, "kl": 0.57421875, "learning_rate": 7.062176165803109e-07, "loss": 0.0024, "reward": 2.4999735355377197, "reward_std": 1.2128446144288318e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999735355377197, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.9404145077720205, "grad_norm": 0.03557062108032573, "kl": 0.529296875, "learning_rate": 7.059585492227978e-07, "loss": 0.0011, "reward": 2.499998927116394, "reward_std": 7.604757570334186e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.94300518134715, "grad_norm": 7.871727098665316, "kl": 1.576171875, "learning_rate": 7.056994818652849e-07, "loss": 0.0065, "reward": 2.4999855756759644, "reward_std": 5.109554308546649e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999855160713196, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.9455958549222796, "grad_norm": 19.225635668508264, "kl": 0.71875, "learning_rate": 7.05440414507772e-07, "loss": 0.0029, "reward": 2.311902642250061, "reward_std": 0.2588590653056144, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8119027018547058, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.948186528497409, "grad_norm": 216.4289347389273, "kl": 0.630859375, "learning_rate": 7.05181347150259e-07, "loss": 0.0032, "reward": 2.3748350143432617, "reward_std": 0.23175471024592298, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874834954738617, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.9507772020725387, "grad_norm": 0.1161636844120874, "kl": 0.580078125, "learning_rate": 7.049222797927461e-07, "loss": 0.0038, "reward": 2.4999961853027344, "reward_std": 2.507402655282931e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.9533678756476682, "grad_norm": 0.15737727781729274, "kl": 0.623046875, "learning_rate": 7.046632124352331e-07, "loss": 0.0025, "reward": 2.499992251396179, "reward_std": 2.396859485997993e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.9559585492227978, "grad_norm": 1.5013853057311195, "kl": 0.578125, "learning_rate": 7.044041450777201e-07, "loss": 0.0029, "reward": 2.499981164932251, "reward_std": 5.9105794889546814e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999811053276062, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.9585492227979273, "grad_norm": 0.4006666281545433, "kl": 0.53125, "learning_rate": 7.041450777202072e-07, "loss": 0.0028, "reward": 1.9997097253799438, "reward_std": 7.790256859152578e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997096061706543, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.961139896373057, "grad_norm": 1.0063769406816927, "kl": 0.58203125, "learning_rate": 7.038860103626942e-07, "loss": 0.0016, "reward": 2.499924063682556, "reward_std": 1.0442210651717687e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999240636825562, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.9637305699481864, "grad_norm": 14.768669038101965, "kl": 0.59375, "learning_rate": 7.036269430051813e-07, "loss": 0.0029, "reward": 1.9911785125732422, "reward_std": 7.612803347001318e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4911784827709198, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.966321243523316, "grad_norm": 1.2067853185886472, "kl": 0.576171875, "learning_rate": 7.033678756476683e-07, "loss": 0.0027, "reward": 1.999124526977539, "reward_std": 3.299454368743682e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499124526977539, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.9689119170984455, "grad_norm": 14.152274482774525, "kl": 0.626953125, "learning_rate": 7.031088082901554e-07, "loss": 0.0019, "reward": 1.9979956150054932, "reward_std": 5.244946532911854e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4979957342147827, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.971502590673575, "grad_norm": 0.2887313187170961, "kl": 0.548828125, "learning_rate": 7.028497409326424e-07, "loss": 0.0028, "reward": 2.499997615814209, "reward_std": 2.757774026918014e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 2.9740932642487046, "grad_norm": 0.33834763946207913, "kl": 0.486328125, "learning_rate": 7.025906735751294e-07, "loss": 0.0007, "reward": 2.499995708465576, "reward_std": 2.994159189029233e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.976683937823834, "grad_norm": 0.9251658414826972, "kl": 0.6875, "learning_rate": 7.023316062176165e-07, "loss": 0.0036, "reward": 1.9999240636825562, "reward_std": 5.000698706680851e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999238848686218, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.9792746113989637, "grad_norm": 28.931520205093758, "kl": 0.66015625, "learning_rate": 7.020725388601037e-07, "loss": 0.0028, "reward": 2.43742299079895, "reward_std": 0.1769923639137403, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374229907989502, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.981865284974093, "grad_norm": 1.1445006631153711, "kl": 0.548828125, "learning_rate": 7.018134715025907e-07, "loss": 0.0018, "reward": 2.4999938011169434, "reward_std": 3.5369270108276396e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9844559585492227, "grad_norm": 7.901636257039627, "kl": 0.666015625, "learning_rate": 7.015544041450778e-07, "loss": 0.0035, "reward": 1.9987123012542725, "reward_std": 6.698680545014213e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498712182044983, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.9870466321243523, "grad_norm": 30.502615926591016, "kl": 0.59375, "learning_rate": 7.012953367875647e-07, "loss": 0.0018, "reward": 1.9975014328956604, "reward_std": 0.003218543048888023, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975013732910156, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.989637305699482, "grad_norm": 0.6832763690412468, "kl": 0.599609375, "learning_rate": 7.010362694300518e-07, "loss": 0.0022, "reward": 2.499993085861206, "reward_std": 6.168161235109437e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.9922279792746114, "grad_norm": 0.3490971747017951, "kl": 0.59765625, "learning_rate": 7.007772020725389e-07, "loss": 0.0017, "reward": 2.4999935626983643, "reward_std": 6.716019811392471e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 2.994818652849741, "grad_norm": 23.95024570550229, "kl": 0.623046875, "learning_rate": 7.005181347150259e-07, "loss": 0.0025, "reward": 1.0476582646369934, "reward_std": 0.13896668516099453, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.5476583391427994, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.9974093264248705, "grad_norm": 8.421156692905752, "kl": 0.59765625, "learning_rate": 7.00259067357513e-07, "loss": 0.0024, "reward": 2.49998140335083, "reward_std": 1.568330696954945e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998140335083, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.0, "grad_norm": 361.9140203035278, "kl": 0.5576171875, "learning_rate": 7e-07, "loss": 0.0023, "reward": 2.499997854232788, "reward_std": 0.3544336259365082, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.0025906735751295, "grad_norm": 10.142258564042228, "kl": 0.65234375, "learning_rate": 6.99740932642487e-07, "loss": 0.0031, "reward": 2.4359086751937866, "reward_std": 0.1774228313825006, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.935908555984497, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.005181347150259, "grad_norm": 13.861381791085808, "kl": 0.552734375, "learning_rate": 6.994818652849741e-07, "loss": 0.0027, "reward": 1.9982770681381226, "reward_std": 6.264615603868151e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982770085334778, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.0077720207253886, "grad_norm": 0.07224099894334043, "kl": 0.525390625, "learning_rate": 6.992227979274611e-07, "loss": 0.0015, "reward": 2.499998092651367, "reward_std": 9.067200608114945e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.010362694300518, "grad_norm": 107.9780863244853, "kl": 0.662109375, "learning_rate": 6.989637305699482e-07, "loss": 0.0018, "reward": 1.9990256428718567, "reward_std": 0.0006522652224703052, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990257620811462, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.0129533678756477, "grad_norm": 0.07543876112959429, "kl": 0.5205078125, "learning_rate": 6.987046632124352e-07, "loss": 0.0016, "reward": 2.4999979734420776, "reward_std": 8.673322184904464e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.0155440414507773, "grad_norm": 3.235963523896716, "kl": 0.583984375, "learning_rate": 6.984455958549223e-07, "loss": 0.0024, "reward": 1.9991928339004517, "reward_std": 3.935769049689952e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991928040981293, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.018134715025907, "grad_norm": 9.856724719931194, "kl": 0.537109375, "learning_rate": 6.981865284974093e-07, "loss": 0.0024, "reward": 2.4999356269836426, "reward_std": 0.00010193508319389366, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999356269836426, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.0207253886010363, "grad_norm": 0.18700704264639922, "kl": 0.60546875, "learning_rate": 6.979274611398963e-07, "loss": 0.001, "reward": 2.499998092651367, "reward_std": 1.598040114458854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.023316062176166, "grad_norm": 0.17368584184939767, "kl": 0.646484375, "learning_rate": 6.976683937823834e-07, "loss": 0.0036, "reward": 2.4999942779541016, "reward_std": 2.856908508874767e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.0259067357512954, "grad_norm": 0.5039167594162633, "kl": 0.697265625, "learning_rate": 6.974093264248704e-07, "loss": 0.0024, "reward": 2.4999948740005493, "reward_std": 2.2534055688083754e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.028497409326425, "grad_norm": 1.2939413320927868, "kl": 0.625, "learning_rate": 6.971502590673575e-07, "loss": 0.0023, "reward": 2.499984622001648, "reward_std": 1.2518473795353202e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984622001648, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.0310880829015545, "grad_norm": 0.1770817439297366, "kl": 0.5595703125, "learning_rate": 6.968911917098446e-07, "loss": 0.0028, "reward": 2.499994993209839, "reward_std": 3.4861328686019988e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 3.033678756476684, "grad_norm": 42.88377724672054, "kl": 0.576171875, "learning_rate": 6.966321243523315e-07, "loss": 0.0021, "reward": 1.906618356704712, "reward_std": 0.261603544546233, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4066182971000671, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.0362694300518136, "grad_norm": 0.8058516553278751, "kl": 0.59765625, "learning_rate": 6.963730569948186e-07, "loss": 0.0035, "reward": 2.499982237815857, "reward_std": 7.78536025336507e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999821782112122, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.038860103626943, "grad_norm": 0.4852239780691297, "kl": 0.560546875, "learning_rate": 6.961139896373057e-07, "loss": 0.0019, "reward": 2.4999945163726807, "reward_std": 4.30772161053028e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.0414507772020727, "grad_norm": 0.39502561011421355, "kl": 0.58984375, "learning_rate": 6.958549222797927e-07, "loss": 0.0026, "reward": 2.499994993209839, "reward_std": 5.2720604344358435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.0440414507772022, "grad_norm": 10.76485742493056, "kl": 0.5859375, "learning_rate": 6.955958549222798e-07, "loss": 0.0022, "reward": 1.9450501203536987, "reward_std": 0.00018131855904357508, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4450502395629883, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.0466321243523318, "grad_norm": 0.6012894115271494, "kl": 0.630859375, "learning_rate": 6.953367875647668e-07, "loss": 0.0019, "reward": 2.4999982118606567, "reward_std": 1.2631843446797575e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.0492227979274613, "grad_norm": 63.42109665853525, "kl": 0.703125, "learning_rate": 6.950777202072538e-07, "loss": 0.0028, "reward": 1.9978582859039307, "reward_std": 0.002249808320812008, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978583455085754, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.051813471502591, "grad_norm": 0.30658771094396764, "kl": 0.5546875, "learning_rate": 6.948186528497409e-07, "loss": 0.0027, "reward": 2.499994158744812, "reward_std": 3.4266339525856893e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.0544041450777204, "grad_norm": 0.22322552890092826, "kl": 0.615234375, "learning_rate": 6.945595854922279e-07, "loss": 0.0035, "reward": 2.4999916553497314, "reward_std": 3.2988255611599016e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999915957450867, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.05699481865285, "grad_norm": 0.5500608956331343, "kl": 0.603515625, "learning_rate": 6.94300518134715e-07, "loss": 0.0033, "reward": 2.499996066093445, "reward_std": 4.7899749233693e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.0595854922279795, "grad_norm": 0.36369903688453425, "kl": 0.576171875, "learning_rate": 6.94041450777202e-07, "loss": 0.0032, "reward": 2.4999979734420776, "reward_std": 8.218087543809816e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.062176165803109, "grad_norm": 0.15432096729435443, "kl": 0.62109375, "learning_rate": 6.937823834196891e-07, "loss": 0.0031, "reward": 2.499996066093445, "reward_std": 1.8686368150611088e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.064766839378238, "grad_norm": 0.9279547149725323, "kl": 0.630859375, "learning_rate": 6.935233160621761e-07, "loss": 0.0031, "reward": 2.4999945163726807, "reward_std": 4.293992560633342e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.0673575129533677, "grad_norm": 0.689670399726225, "kl": 0.5009765625, "learning_rate": 6.932642487046631e-07, "loss": 0.0011, "reward": 2.4999929666519165, "reward_std": 3.1569003340337076e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.069948186528497, "grad_norm": 4.857918011559059, "kl": 0.556640625, "learning_rate": 6.930051813471502e-07, "loss": 0.0029, "reward": 1.9177930355072021, "reward_std": 0.00027660853038469213, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4177928864955902, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.0725388601036268, "grad_norm": 61.357190523342126, "kl": 0.802734375, "learning_rate": 6.927461139896372e-07, "loss": 0.0036, "reward": 2.1219688653945923, "reward_std": 0.23332506452743473, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6219688653945923, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.0751295336787563, "grad_norm": 31.80804631134281, "kl": 0.53515625, "learning_rate": 6.924870466321243e-07, "loss": 0.001, "reward": 2.499891519546509, "reward_std": 4.9671036720155826e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998916983604431, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.077720207253886, "grad_norm": 2.125382301598741, "kl": 0.607421875, "learning_rate": 6.922279792746114e-07, "loss": 0.0032, "reward": 1.999302625656128, "reward_std": 2.125860419255332e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993024468421936, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.0803108808290154, "grad_norm": 0.3260783823913368, "kl": 0.6142578125, "learning_rate": 6.919689119170983e-07, "loss": 0.0023, "reward": 2.499995708465576, "reward_std": 2.489405687811086e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 38.125, "epoch": 3.082901554404145, "grad_norm": 23.41103905656573, "kl": 0.544921875, "learning_rate": 6.917098445595854e-07, "loss": 0.0014, "reward": 1.926444172859192, "reward_std": 0.010403272172879952, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.426444172859192, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 3.0854922279792745, "grad_norm": 26.05660488702092, "kl": 0.71875, "learning_rate": 6.914507772020724e-07, "loss": 0.003, "reward": 2.066689133644104, "reward_std": 0.266509745795247, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5666891932487488, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.088082901554404, "grad_norm": 25.661316830674195, "kl": 0.5703125, "learning_rate": 6.911917098445595e-07, "loss": 0.0027, "reward": 2.374777674674988, "reward_std": 0.23186307624325764, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747776746749878, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.0906735751295336, "grad_norm": 0.0899655771027801, "kl": 0.6171875, "learning_rate": 6.909326424870467e-07, "loss": 0.0022, "reward": 2.4999988079071045, "reward_std": 8.884215958460118e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.093264248704663, "grad_norm": 0.1521795560839365, "kl": 0.640625, "learning_rate": 6.906735751295337e-07, "loss": 0.0034, "reward": 2.4999959468841553, "reward_std": 2.616226424834167e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.0958549222797926, "grad_norm": 2.262289361569932, "kl": 0.580078125, "learning_rate": 6.904145077720207e-07, "loss": 0.001, "reward": 1.998343586921692, "reward_std": 2.320065595995402e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983438551425934, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.098445595854922, "grad_norm": 29.715010542969793, "kl": 0.630859375, "learning_rate": 6.901554404145078e-07, "loss": 0.0022, "reward": 2.3122215270996094, "reward_std": 0.2591497132227687, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122217059135437, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1010362694300517, "grad_norm": 0.23875857973999276, "kl": 0.5234375, "learning_rate": 6.898963730569948e-07, "loss": 0.003, "reward": 2.4999974966049194, "reward_std": 2.044284144631092e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1036269430051813, "grad_norm": 15.359181851404795, "kl": 0.58203125, "learning_rate": 6.896373056994819e-07, "loss": 0.0014, "reward": 1.9998488426208496, "reward_std": 2.7919772037421353e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499849021434784, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.106217616580311, "grad_norm": 10.639828616937162, "kl": 0.623046875, "learning_rate": 6.893782383419689e-07, "loss": 0.0034, "reward": 1.7281917929649353, "reward_std": 0.0003900645962175986, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2281917929649353, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.1088082901554404, "grad_norm": 8.07861110246678, "kl": 0.52734375, "learning_rate": 6.89119170984456e-07, "loss": 0.0012, "reward": 1.7958200573921204, "reward_std": 0.00022769381277498724, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2958202064037323, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.11139896373057, "grad_norm": 0.2051280137842463, "kl": 0.642578125, "learning_rate": 6.888601036269431e-07, "loss": 0.0037, "reward": 2.4999942779541016, "reward_std": 2.2564442474504176e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 1201 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.1139896373056994, "grad_norm": 232.88574269192972, "kl": 0.66015625, "learning_rate": 6.8860103626943e-07, "loss": 0.0026, "reward": 1.2441326379776, "reward_std": 0.0015089575754245743, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7441326677799225, "step": 1202 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.116580310880829, "grad_norm": 0.2537600694484, "kl": 0.576171875, "learning_rate": 6.883419689119171e-07, "loss": 0.0017, "reward": 2.4999958276748657, "reward_std": 2.3724439017769328e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 1203 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.1191709844559585, "grad_norm": 20.825711324318686, "kl": 0.494140625, "learning_rate": 6.880829015544041e-07, "loss": 0.0014, "reward": 2.1243672370910645, "reward_std": 0.23184529455602387, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6243672370910645, "step": 1204 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.121761658031088, "grad_norm": 0.07130347741964999, "kl": 0.4990234375, "learning_rate": 6.878238341968912e-07, "loss": 0.0013, "reward": 2.499996542930603, "reward_std": 1.297134133437794e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1243523316062176, "grad_norm": 0.23366163889152264, "kl": 0.615234375, "learning_rate": 6.875647668393783e-07, "loss": 0.0033, "reward": 2.499992609024048, "reward_std": 3.6609401377063477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 1206 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.126943005181347, "grad_norm": 2.574127164317427, "kl": 0.64453125, "learning_rate": 6.873056994818652e-07, "loss": 0.002, "reward": 1.9988672733306885, "reward_std": 2.0297386981837917e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988672733306885, "step": 1207 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.1295336787564767, "grad_norm": 0.44267603029304176, "kl": 0.642578125, "learning_rate": 6.870466321243523e-07, "loss": 0.002, "reward": 2.499996781349182, "reward_std": 3.5049030060463338e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 1208 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.1321243523316062, "grad_norm": 0.28567313096721453, "kl": 0.59375, "learning_rate": 6.867875647668393e-07, "loss": 0.0022, "reward": 2.499983787536621, "reward_std": 3.614259526329988e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999837279319763, "step": 1209 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.134715025906736, "grad_norm": 24.120526310517164, "kl": 0.52734375, "learning_rate": 6.865284974093264e-07, "loss": 0.0012, "reward": 2.311760187149048, "reward_std": 0.2597920245671048, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8117603659629822, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1373056994818653, "grad_norm": 0.15501033397231445, "kl": 0.546875, "learning_rate": 6.862694300518135e-07, "loss": 0.0028, "reward": 2.4999959468841553, "reward_std": 1.5506627164540987e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 1211 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.139896373056995, "grad_norm": 17.60551860770907, "kl": 0.6015625, "learning_rate": 6.860103626943005e-07, "loss": 0.0024, "reward": 1.999005675315857, "reward_std": 0.00012886681679447065, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990057051181793, "step": 1212 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1424870466321244, "grad_norm": 35.61256667296473, "kl": 0.630859375, "learning_rate": 6.857512953367876e-07, "loss": 0.0027, "reward": 2.249640643596649, "reward_std": 0.2676418017713331, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7496405839920044, "step": 1213 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.145077720207254, "grad_norm": 33.697593640277105, "kl": 0.560546875, "learning_rate": 6.854922279792745e-07, "loss": 0.002, "reward": 1.9980740547180176, "reward_std": 0.0018625739984372558, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4980740547180176, "step": 1214 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.1476683937823835, "grad_norm": 0.7067763125906138, "kl": 0.6015625, "learning_rate": 6.852331606217616e-07, "loss": 0.0024, "reward": 1.9988773465156555, "reward_std": 1.2711435658729897e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498877376317978, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.150259067357513, "grad_norm": 0.693856215540824, "kl": 0.693359375, "learning_rate": 6.849740932642487e-07, "loss": 0.0026, "reward": 2.4999947547912598, "reward_std": 5.036586628648365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 1216 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1528497409326426, "grad_norm": 0.762179077230189, "kl": 0.58203125, "learning_rate": 6.847150259067357e-07, "loss": 0.0022, "reward": 2.499987244606018, "reward_std": 4.329399416747037e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 1217 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.155440414507772, "grad_norm": 11.333705039320455, "kl": 0.529296875, "learning_rate": 6.844559585492228e-07, "loss": 0.0028, "reward": 2.4373196363449097, "reward_std": 0.1772464111145382, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937319576740265, "step": 1218 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.1580310880829017, "grad_norm": 11.456537175619228, "kl": 0.5546875, "learning_rate": 6.841968911917099e-07, "loss": 0.0027, "reward": 1.9039571285247803, "reward_std": 0.0008579762296108129, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4039570093154907, "step": 1219 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 3.160621761658031, "grad_norm": 0.15132448573901777, "kl": 0.533203125, "learning_rate": 6.839378238341968e-07, "loss": 0.003, "reward": 2.499996066093445, "reward_std": 2.68325129582081e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1632124352331608, "grad_norm": 0.1526668623641597, "kl": 0.541015625, "learning_rate": 6.836787564766839e-07, "loss": 0.0025, "reward": 2.4999969005584717, "reward_std": 1.9648958300422237e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 1221 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1658031088082903, "grad_norm": 0.6403077904030064, "kl": 0.48828125, "learning_rate": 6.834196891191709e-07, "loss": 0.002, "reward": 2.499996066093445, "reward_std": 4.495325470088574e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 1222 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.16839378238342, "grad_norm": 4.78410666163888, "kl": 0.5009765625, "learning_rate": 6.83160621761658e-07, "loss": 0.0022, "reward": 1.8839143514633179, "reward_std": 0.0003718053091574802, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3839143514633179, "step": 1223 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1709844559585494, "grad_norm": 12.755015866344719, "kl": 0.5625, "learning_rate": 6.829015544041451e-07, "loss": 0.0017, "reward": 2.4374196529388428, "reward_std": 0.1769987379614122, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374196529388428, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 3.173575129533679, "grad_norm": 82.83622223438441, "kl": 0.583984375, "learning_rate": 6.826424870466321e-07, "loss": 0.0015, "reward": 1.9657975435256958, "reward_std": 0.3409871027470217, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4657975435256958, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1761658031088085, "grad_norm": 6.6424992128217095, "kl": 1.88671875, "learning_rate": 6.823834196891191e-07, "loss": 0.0077, "reward": 2.499989867210388, "reward_std": 3.774889023588912e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898076057434, "step": 1226 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.178756476683938, "grad_norm": 17.903679267476182, "kl": 0.4931640625, "learning_rate": 6.821243523316061e-07, "loss": 0.002, "reward": 1.9995789527893066, "reward_std": 0.00046580433456711035, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995788931846619, "step": 1227 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.1813471502590676, "grad_norm": 31.062287826506786, "kl": 0.626953125, "learning_rate": 6.818652849740932e-07, "loss": 0.0026, "reward": 2.249399721622467, "reward_std": 0.2678692827537361, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7493996620178223, "step": 1228 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.1839378238341967, "grad_norm": 7.849250126939151, "kl": 0.783203125, "learning_rate": 6.816062176165803e-07, "loss": 0.0037, "reward": 2.4998879432678223, "reward_std": 8.511284727319435e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999887764453888, "step": 1229 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.186528497409326, "grad_norm": 3.491278189985378, "kl": 0.5546875, "learning_rate": 6.813471502590673e-07, "loss": 0.0026, "reward": 1.9991375207901, "reward_std": 3.345373761476367e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991375505924225, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.1891191709844557, "grad_norm": 2.6501471649301442, "kl": 0.630859375, "learning_rate": 6.810880829015544e-07, "loss": 0.0019, "reward": 1.9997496008872986, "reward_std": 2.4856550112417608e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997497498989105, "step": 1231 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1917098445595853, "grad_norm": 21.940818644452044, "kl": 0.572265625, "learning_rate": 6.808290155440413e-07, "loss": 0.0017, "reward": 2.4374648332595825, "reward_std": 0.17686794674841622, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937464952468872, "step": 1232 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.194300518134715, "grad_norm": 0.9967066730366205, "kl": 0.435546875, "learning_rate": 6.805699481865284e-07, "loss": 0.0011, "reward": 2.499982476234436, "reward_std": 4.643148386662688e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999825358390808, "step": 1233 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.1968911917098444, "grad_norm": 1.2397885328032432, "kl": 0.544921875, "learning_rate": 6.803108808290155e-07, "loss": 0.0019, "reward": 2.4999841451644897, "reward_std": 1.0223189747193828e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999840259552002, "step": 1234 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.199481865284974, "grad_norm": 0.26657167265973697, "kl": 0.63671875, "learning_rate": 6.800518134715025e-07, "loss": 0.0024, "reward": 2.4999974966049194, "reward_std": 1.6839056797834928e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.2020725388601035, "grad_norm": 13.949921512457857, "kl": 0.61328125, "learning_rate": 6.797927461139897e-07, "loss": 0.0023, "reward": 2.498992443084717, "reward_std": 0.00040484491341885587, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9989925026893616, "step": 1236 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.204663212435233, "grad_norm": 2.749202197200427, "kl": 0.537109375, "learning_rate": 6.795336787564767e-07, "loss": 0.0015, "reward": 1.9992135763168335, "reward_std": 4.1400803951319176e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992136657238007, "step": 1237 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.2072538860103625, "grad_norm": 1.963414826444741, "kl": 0.517578125, "learning_rate": 6.792746113989637e-07, "loss": 0.0023, "reward": 2.4999895095825195, "reward_std": 6.974496386646933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895095825195, "step": 1238 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.209844559585492, "grad_norm": 4.2185400881960895, "kl": 0.6484375, "learning_rate": 6.790155440414508e-07, "loss": 0.0035, "reward": 1.9778039455413818, "reward_std": 9.465482071391307e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4778037667274475, "step": 1239 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.2124352331606216, "grad_norm": 0.6609666778763692, "kl": 0.5859375, "learning_rate": 6.787564766839378e-07, "loss": 0.0026, "reward": 2.499995470046997, "reward_std": 2.9673734616153524e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.215025906735751, "grad_norm": 3.299717161302066, "kl": 0.46484375, "learning_rate": 6.784974093264249e-07, "loss": 0.001, "reward": 2.343748092651367, "reward_std": 0.44194145326463286, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.8749982118606567, "step": 1241 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.2176165803108807, "grad_norm": 35.67597102299727, "kl": 0.697265625, "learning_rate": 6.782383419689119e-07, "loss": 0.0039, "reward": 1.9762653708457947, "reward_std": 0.0022443477171236736, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4762652516365051, "step": 1242 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 3.2202072538860103, "grad_norm": 49.72426465145791, "kl": 0.57421875, "learning_rate": 6.77979274611399e-07, "loss": 0.0024, "reward": 1.748256504535675, "reward_std": 0.2690846874311319, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2482564747333527, "step": 1243 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.22279792746114, "grad_norm": 2.2851081415561474, "kl": 0.607421875, "learning_rate": 6.77720207253886e-07, "loss": 0.0027, "reward": 2.499992609024048, "reward_std": 5.356506449061271e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926090240479, "step": 1244 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.2253886010362693, "grad_norm": 0.3539713396421604, "kl": 0.654296875, "learning_rate": 6.77461139896373e-07, "loss": 0.0022, "reward": 2.499995708465576, "reward_std": 3.4540636306701344e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.227979274611399, "grad_norm": 11.364041687212762, "kl": 0.537109375, "learning_rate": 6.772020725388601e-07, "loss": 0.0013, "reward": 1.9997043013572693, "reward_std": 6.905659631684102e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997043907642365, "step": 1246 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.2305699481865284, "grad_norm": 0.44820348516001757, "kl": 0.5390625, "learning_rate": 6.769430051813472e-07, "loss": 0.0016, "reward": 2.4999923706054688, "reward_std": 4.884116805214944e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 1247 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.233160621761658, "grad_norm": 0.19984977718636715, "kl": 0.6953125, "learning_rate": 6.766839378238342e-07, "loss": 0.0031, "reward": 2.4999961853027344, "reward_std": 3.134044277430803e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 1248 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.2357512953367875, "grad_norm": 4.7226679683461805, "kl": 0.609375, "learning_rate": 6.764248704663213e-07, "loss": 0.003, "reward": 1.9983015060424805, "reward_std": 4.3447900509363535e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983014166355133, "step": 1249 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.238341968911917, "grad_norm": 2.063397590858087, "kl": 0.7109375, "learning_rate": 6.761658031088082e-07, "loss": 0.0031, "reward": 1.9449492692947388, "reward_std": 0.0002097751205383247, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.444949209690094, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.2409326424870466, "grad_norm": 25.844979360118842, "kl": 0.6015625, "learning_rate": 6.759067357512953e-07, "loss": 0.0021, "reward": 1.9146761894226074, "reward_std": 0.0006666687534959692, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4146761298179626, "step": 1251 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.243523316062176, "grad_norm": 21.012399847372212, "kl": 0.578125, "learning_rate": 6.756476683937824e-07, "loss": 0.0022, "reward": 1.9926637411117554, "reward_std": 0.0012526108571364603, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4926637411117554, "step": 1252 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.2461139896373057, "grad_norm": 8.328701393605263, "kl": 0.599609375, "learning_rate": 6.753886010362694e-07, "loss": 0.0026, "reward": 1.9989255666732788, "reward_std": 0.00020352353521957411, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989255368709564, "step": 1253 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 3.2487046632124352, "grad_norm": 3.7631723824068106, "kl": 0.50390625, "learning_rate": 6.751295336787565e-07, "loss": 0.0027, "reward": 2.4999696016311646, "reward_std": 4.139473533371074e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999696612358093, "step": 1254 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 3.2512953367875648, "grad_norm": 3.10511223958666, "kl": 0.6796875, "learning_rate": 6.748704663212435e-07, "loss": 0.0028, "reward": 1.999169409275055, "reward_std": 3.676583946798928e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499169409275055, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.2538860103626943, "grad_norm": 0.8207838565783472, "kl": 0.5537109375, "learning_rate": 6.746113989637305e-07, "loss": 0.0021, "reward": 2.499984383583069, "reward_std": 7.479409191546438e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999844431877136, "step": 1256 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.256476683937824, "grad_norm": 14.064718210709367, "kl": 0.587890625, "learning_rate": 6.743523316062176e-07, "loss": 0.0023, "reward": 2.2032699584960938, "reward_std": 0.31720540158477206, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7032699584960938, "step": 1257 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.2590673575129534, "grad_norm": 0.5235425176292933, "kl": 0.5478515625, "learning_rate": 6.740932642487046e-07, "loss": 0.0016, "reward": 2.4999955892562866, "reward_std": 3.377854909558664e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 1258 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.261658031088083, "grad_norm": 52.069690434808436, "kl": 0.619140625, "learning_rate": 6.738341968911917e-07, "loss": 0.0028, "reward": 1.80352783203125, "reward_std": 0.015608674555807056, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.30352783203125, "step": 1259 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.2642487046632125, "grad_norm": 0.4069387859911761, "kl": 0.568359375, "learning_rate": 6.735751295336787e-07, "loss": 0.0017, "reward": 2.4999964237213135, "reward_std": 2.394607292899309e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.266839378238342, "grad_norm": 19.12620651320064, "kl": 0.552734375, "learning_rate": 6.733160621761658e-07, "loss": 0.0014, "reward": 1.9901773929595947, "reward_std": 0.0007743120740997256, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4901775121688843, "step": 1261 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.2694300518134716, "grad_norm": 0.5721149261898104, "kl": 0.625, "learning_rate": 6.730569948186528e-07, "loss": 0.0022, "reward": 2.4999964237213135, "reward_std": 2.7965506887994707e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 1262 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.272020725388601, "grad_norm": 2.5939810082530586, "kl": 0.5390625, "learning_rate": 6.727979274611398e-07, "loss": 0.0025, "reward": 1.842836856842041, "reward_std": 0.0001461605093027174, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.342836856842041, "step": 1263 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.2746113989637307, "grad_norm": 2.664052255407038, "kl": 0.712890625, "learning_rate": 6.725388601036269e-07, "loss": 0.003, "reward": 1.9987717270851135, "reward_std": 2.722293402257492e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987717866897583, "step": 1264 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.27720207253886, "grad_norm": 8.332538715554886, "kl": 0.701171875, "learning_rate": 6.722797927461139e-07, "loss": 0.0027, "reward": 1.5623114109039307, "reward_std": 0.17679402306021075, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.062311440706253, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.2797927461139897, "grad_norm": 0.9130066045316185, "kl": 0.5625, "learning_rate": 6.72020725388601e-07, "loss": 0.0019, "reward": 2.499992847442627, "reward_std": 6.9139917968641385e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 1266 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.2823834196891193, "grad_norm": 28.946683639835648, "kl": 0.5888671875, "learning_rate": 6.717616580310881e-07, "loss": 0.0018, "reward": 2.3746328353881836, "reward_std": 0.23213262859047745, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8746328949928284, "step": 1267 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.284974093264249, "grad_norm": 9.588783540046947, "kl": 0.57421875, "learning_rate": 6.71502590673575e-07, "loss": 0.0016, "reward": 2.06243896484375, "reward_std": 0.17680060100042283, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624390244483948, "step": 1268 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.2875647668393784, "grad_norm": 6.4005475990053045, "kl": 0.65234375, "learning_rate": 6.712435233160621e-07, "loss": 0.0026, "reward": 1.9991182684898376, "reward_std": 3.8323869375744835e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991183578968048, "step": 1269 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.290155440414508, "grad_norm": 23.28102796818639, "kl": 0.52734375, "learning_rate": 6.709844559585492e-07, "loss": 0.0022, "reward": 1.854838490486145, "reward_std": 0.0006416706644358783, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3548383712768555, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.2927461139896375, "grad_norm": 0.44693959292843255, "kl": 0.64453125, "learning_rate": 6.707253886010362e-07, "loss": 0.0024, "reward": 2.4999916553497314, "reward_std": 5.328251063474454e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918937683105, "step": 1271 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 3.295336787564767, "grad_norm": 58.322968911296364, "kl": 0.587890625, "learning_rate": 6.704663212435233e-07, "loss": 0.0022, "reward": 2.43708336353302, "reward_std": 0.17793464741214393, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.93708336353302, "step": 1272 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.2979274611398965, "grad_norm": 2.061672598959774, "kl": 0.583984375, "learning_rate": 6.702072538860103e-07, "loss": 0.0033, "reward": 2.499974489212036, "reward_std": 1.4251622815208975e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999744296073914, "step": 1273 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 3.300518134715026, "grad_norm": 44.27921918998384, "kl": 0.5625, "learning_rate": 6.699481865284973e-07, "loss": 0.0028, "reward": 1.9934158325195312, "reward_std": 0.0012337156223338752, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4934158325195312, "step": 1274 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.3031088082901556, "grad_norm": 0.13194104140604934, "kl": 0.564453125, "learning_rate": 6.696891191709844e-07, "loss": 0.0028, "reward": 2.499998092651367, "reward_std": 1.9327447944306186e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 3.305699481865285, "grad_norm": 0.20803097508269264, "kl": 0.615234375, "learning_rate": 6.694300518134714e-07, "loss": 0.0022, "reward": 2.4999970197677612, "reward_std": 2.6047623009617382e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1276 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.3082901554404147, "grad_norm": 0.21031413444091823, "kl": 0.572265625, "learning_rate": 6.691709844559585e-07, "loss": 0.0014, "reward": 2.499994397163391, "reward_std": 3.157093260597321e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 1277 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.3108808290155443, "grad_norm": 1.314125628124503, "kl": 0.646484375, "learning_rate": 6.689119170984455e-07, "loss": 0.0026, "reward": 1.9987974166870117, "reward_std": 2.542577067288221e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987974166870117, "step": 1278 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.313471502590674, "grad_norm": 16.526187316461776, "kl": 0.5625, "learning_rate": 6.686528497409327e-07, "loss": 0.0024, "reward": 1.988803744316101, "reward_std": 0.0127425687755931, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4888035655021667, "step": 1279 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.3160621761658033, "grad_norm": 0.33023635021554026, "kl": 0.6015625, "learning_rate": 6.683937823834197e-07, "loss": 0.0022, "reward": 2.499987006187439, "reward_std": 4.590928483594325e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.318652849740933, "grad_norm": 0.43343625677447717, "kl": 0.599609375, "learning_rate": 6.681347150259067e-07, "loss": 0.0018, "reward": 2.4999921321868896, "reward_std": 4.295967528378242e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 1281 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.321243523316062, "grad_norm": 1.427976315096707, "kl": 0.58203125, "learning_rate": 6.678756476683938e-07, "loss": 0.0017, "reward": 2.4999947547912598, "reward_std": 4.875049853581004e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 1282 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.3238341968911915, "grad_norm": 6.237531847361904, "kl": 0.650390625, "learning_rate": 6.676165803108808e-07, "loss": 0.0032, "reward": 1.997864305973053, "reward_std": 8.016543495159567e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497864305973053, "step": 1283 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 3.326424870466321, "grad_norm": 50.429040632137095, "kl": 0.93359375, "learning_rate": 6.673575129533679e-07, "loss": 0.0031, "reward": 1.9147372841835022, "reward_std": 0.007134917445057454, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4147374033927917, "step": 1284 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 3.3290155440414506, "grad_norm": 9.609057929750628, "kl": 0.609375, "learning_rate": 6.67098445595855e-07, "loss": 0.003, "reward": 1.935227870941162, "reward_std": 0.025859398407931167, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4352277517318726, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.33160621761658, "grad_norm": 23.7553791770436, "kl": 0.546875, "learning_rate": 6.668393782383419e-07, "loss": 0.0024, "reward": 1.9936089515686035, "reward_std": 0.0005637061362619988, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4936090409755707, "step": 1286 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 3.3341968911917097, "grad_norm": 2.447388149332556, "kl": 1.0234375, "learning_rate": 6.66580310880829e-07, "loss": 0.0043, "reward": 1.4977600574493408, "reward_std": 5.9922360378550366e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9977600276470184, "step": 1287 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.3367875647668392, "grad_norm": 3.6363536395849843, "kl": 0.61328125, "learning_rate": 6.66321243523316e-07, "loss": 0.0021, "reward": 1.999430537223816, "reward_std": 1.9968958440585993e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994306862354279, "step": 1288 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.339378238341969, "grad_norm": 0.4305252619494974, "kl": 0.58203125, "learning_rate": 6.660621761658031e-07, "loss": 0.0016, "reward": 2.4999929666519165, "reward_std": 3.4184625974376104e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 1289 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 3.3419689119170983, "grad_norm": 48.69556837171324, "kl": 0.59765625, "learning_rate": 6.658031088082902e-07, "loss": 0.0022, "reward": 2.332605719566345, "reward_std": 0.30988614668513037, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8326056003570557, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 3.344559585492228, "grad_norm": 25.505958740534773, "kl": 0.591796875, "learning_rate": 6.655440414507772e-07, "loss": 0.0026, "reward": 1.9113622903823853, "reward_std": 0.00039821324239142086, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4113622903823853, "step": 1291 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.3471502590673574, "grad_norm": 15.380684830558323, "kl": 0.61328125, "learning_rate": 6.652849740932642e-07, "loss": 0.0025, "reward": 2.0616670846939087, "reward_std": 0.17711289126873453, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5616670846939087, "step": 1292 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.349740932642487, "grad_norm": 40.94484678956914, "kl": 0.767578125, "learning_rate": 6.650259067357513e-07, "loss": 0.0029, "reward": 2.061910092830658, "reward_std": 0.17701491107646916, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5619100332260132, "step": 1293 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.3523316062176165, "grad_norm": 0.6798122116004357, "kl": 0.638671875, "learning_rate": 6.647668393782383e-07, "loss": 0.003, "reward": 2.499991536140442, "reward_std": 4.55746612715302e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 1294 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.354922279792746, "grad_norm": 0.24377957477256185, "kl": 0.498046875, "learning_rate": 6.645077720207254e-07, "loss": 0.0008, "reward": 2.4999953508377075, "reward_std": 2.1859274283997365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.3575129533678756, "grad_norm": 0.13774953351360994, "kl": 0.5439453125, "learning_rate": 6.642487046632124e-07, "loss": 0.0018, "reward": 2.4999966621398926, "reward_std": 2.887539892526547e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 1296 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.360103626943005, "grad_norm": 0.08095003299486364, "kl": 0.56640625, "learning_rate": 6.639896373056995e-07, "loss": 0.0014, "reward": 2.499997138977051, "reward_std": 1.0379717991781945e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 1297 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.3626943005181347, "grad_norm": 9.92305289159648, "kl": 0.517578125, "learning_rate": 6.637305699481865e-07, "loss": 0.0021, "reward": 1.940892517566681, "reward_std": 0.00016849876129754193, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4408924281597137, "step": 1298 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.365284974093264, "grad_norm": 48.89114699832967, "kl": 0.7578125, "learning_rate": 6.634715025906735e-07, "loss": 0.0026, "reward": 1.9753493070602417, "reward_std": 0.00237958707270991, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.475349336862564, "step": 1299 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.3678756476683938, "grad_norm": 0.3326994786995794, "kl": 0.58203125, "learning_rate": 6.632124352331606e-07, "loss": 0.0015, "reward": 2.499993324279785, "reward_std": 2.4907478177738085e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.3704663212435233, "grad_norm": 0.6776887400069385, "kl": 0.642578125, "learning_rate": 6.629533678756476e-07, "loss": 0.0024, "reward": 2.499996781349182, "reward_std": 2.3861059048613242e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 1301 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.373056994818653, "grad_norm": 4.969419874006646, "kl": 0.615234375, "learning_rate": 6.626943005181347e-07, "loss": 0.0025, "reward": 1.4903873801231384, "reward_std": 0.00013243843204691075, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9903874695301056, "step": 1302 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 3.3756476683937824, "grad_norm": 44.61187065772588, "kl": 0.541015625, "learning_rate": 6.624352331606218e-07, "loss": 0.0022, "reward": 2.0992191433906555, "reward_std": 0.25696537855390034, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5992191433906555, "step": 1303 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.378238341968912, "grad_norm": 0.4876787681677305, "kl": 0.66796875, "learning_rate": 6.621761658031087e-07, "loss": 0.0011, "reward": 2.4999916553497314, "reward_std": 4.352652695160941e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918937683105, "step": 1304 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.3808290155440415, "grad_norm": 1.352866880348361, "kl": 0.6171875, "learning_rate": 6.619170984455958e-07, "loss": 0.002, "reward": 2.4994497299194336, "reward_std": 2.0898258981105755e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9994498491287231, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.383419689119171, "grad_norm": 1.034257691175753, "kl": 0.51953125, "learning_rate": 6.616580310880828e-07, "loss": 0.0021, "reward": 2.499988317489624, "reward_std": 4.178866106485657e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 1306 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.3860103626943006, "grad_norm": 3.0327680757439723, "kl": 0.61328125, "learning_rate": 6.613989637305699e-07, "loss": 0.0017, "reward": 2.499940514564514, "reward_std": 1.8101168279827107e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999405145645142, "step": 1307 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.38860103626943, "grad_norm": 0.10446885684503122, "kl": 0.5390625, "learning_rate": 6.61139896373057e-07, "loss": 0.0013, "reward": 2.499996066093445, "reward_std": 1.6903552477742778e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1308 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.3911917098445596, "grad_norm": 2.849771540458285, "kl": 0.60546875, "learning_rate": 6.60880829015544e-07, "loss": 0.0024, "reward": 1.4991880059242249, "reward_std": 4.190728668618249e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9991880655288696, "step": 1309 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.393782383419689, "grad_norm": 46.08396487873682, "kl": 0.548828125, "learning_rate": 6.60621761658031e-07, "loss": 0.0022, "reward": 2.37454891204834, "reward_std": 0.35480011999607086, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8745489120483398, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.3963730569948187, "grad_norm": 22.14046100599451, "kl": 3.361328125, "learning_rate": 6.60362694300518e-07, "loss": 0.0132, "reward": 2.117078959941864, "reward_std": 0.2367358882456756, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6170790195465088, "step": 1311 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.3989637305699483, "grad_norm": 1.1144166760377836, "kl": 0.62109375, "learning_rate": 6.601036269430051e-07, "loss": 0.0025, "reward": 2.499993681907654, "reward_std": 6.1795741430614726e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 1312 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.401554404145078, "grad_norm": 0.543876729802295, "kl": 0.673828125, "learning_rate": 6.598445595854922e-07, "loss": 0.0033, "reward": 2.499992251396179, "reward_std": 3.7440818800860143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 1313 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4041450777202074, "grad_norm": 0.1630714687286347, "kl": 0.5283203125, "learning_rate": 6.595854922279792e-07, "loss": 0.0027, "reward": 2.499989151954651, "reward_std": 1.7377718108946283e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999892711639404, "step": 1314 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 3.406735751295337, "grad_norm": 15.433011309379996, "kl": 2.75390625, "learning_rate": 6.593264248704663e-07, "loss": 0.011, "reward": 1.7780500650405884, "reward_std": 0.18995709344744682, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2780500650405884, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 3.4093264248704664, "grad_norm": 45.810701786835324, "kl": 0.677734375, "learning_rate": 6.590673575129534e-07, "loss": 0.0026, "reward": 1.433575689792633, "reward_std": 0.10967624104887364, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9335756599903107, "step": 1316 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.411917098445596, "grad_norm": 0.6044988540570385, "kl": 0.513671875, "learning_rate": 6.588082901554403e-07, "loss": 0.0013, "reward": 2.4999929666519165, "reward_std": 2.9935817451587354e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 1317 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.4145077720207255, "grad_norm": 3.272805106843922, "kl": 0.60546875, "learning_rate": 6.585492227979274e-07, "loss": 0.0024, "reward": 2.499877452850342, "reward_std": 2.9783182526443852e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998773336410522, "step": 1318 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.417098445595855, "grad_norm": 0.21056321288748855, "kl": 0.5859375, "learning_rate": 6.582901554404144e-07, "loss": 0.0031, "reward": 2.499997615814209, "reward_std": 1.6504251902915712e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 1319 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 3.4196891191709846, "grad_norm": 3.416344522299136, "kl": 0.58203125, "learning_rate": 6.580310880829015e-07, "loss": 0.0024, "reward": 1.9628286361694336, "reward_std": 0.0002690400860956288, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4628286361694336, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.422279792746114, "grad_norm": 77.98868031817126, "kl": 0.6328125, "learning_rate": 6.577720207253887e-07, "loss": 0.0027, "reward": 1.9886281490325928, "reward_std": 0.0030805376511580107, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4886280298233032, "step": 1321 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.4248704663212437, "grad_norm": 22.012893899295083, "kl": 0.595703125, "learning_rate": 6.575129533678755e-07, "loss": 0.0024, "reward": 2.499565362930298, "reward_std": 0.0002936051459982991, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995653629302979, "step": 1322 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.4274611398963732, "grad_norm": 33.93968677435462, "kl": 0.576171875, "learning_rate": 6.572538860103627e-07, "loss": 0.002, "reward": 2.2491897344589233, "reward_std": 0.26801190733408475, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7491897344589233, "step": 1323 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 3.4300518134715023, "grad_norm": 0.6766821528261714, "kl": 0.615234375, "learning_rate": 6.569948186528497e-07, "loss": 0.0029, "reward": 2.499996781349182, "reward_std": 3.0038733598303224e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 1324 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.432642487046632, "grad_norm": 0.060520441749043453, "kl": 0.568359375, "learning_rate": 6.567357512953368e-07, "loss": 0.0027, "reward": 2.4999992847442627, "reward_std": 9.174949582302361e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 3.4352331606217614, "grad_norm": 0.4901514403945324, "kl": 0.634765625, "learning_rate": 6.564766839378239e-07, "loss": 0.0025, "reward": 2.4999966621398926, "reward_std": 2.013122241351084e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 1326 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.437823834196891, "grad_norm": 0.5113404847074935, "kl": 0.57421875, "learning_rate": 6.562176165803109e-07, "loss": 0.0026, "reward": 2.499981641769409, "reward_std": 4.877410930248516e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999815821647644, "step": 1327 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.4404145077720205, "grad_norm": 34.92167937746562, "kl": 0.60546875, "learning_rate": 6.55958549222798e-07, "loss": 0.0026, "reward": 2.12365186214447, "reward_std": 0.23228603625099709, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6236519813537598, "step": 1328 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 3.44300518134715, "grad_norm": 309.9333994230266, "kl": 0.615234375, "learning_rate": 6.556994818652849e-07, "loss": 0.0013, "reward": 2.337171792984009, "reward_std": 0.3014951067725633, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8371720910072327, "step": 1329 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.4455958549222796, "grad_norm": 3.4590845043400202, "kl": 0.55859375, "learning_rate": 6.55440414507772e-07, "loss": 0.0013, "reward": 2.4998191595077515, "reward_std": 3.279679935985769e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998193383216858, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.448186528497409, "grad_norm": 0.24868725616615536, "kl": 0.580078125, "learning_rate": 6.551813471502591e-07, "loss": 0.0017, "reward": 2.4999969005584717, "reward_std": 2.1491821939889633e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 1331 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4507772020725387, "grad_norm": 9.616804363378497, "kl": 0.65625, "learning_rate": 6.549222797927461e-07, "loss": 0.0028, "reward": 1.9983316659927368, "reward_std": 5.0418053433531895e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498331606388092, "step": 1332 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.4533678756476682, "grad_norm": 4.660749154850547, "kl": 0.50390625, "learning_rate": 6.546632124352332e-07, "loss": 0.0024, "reward": 1.9993440508842468, "reward_std": 2.917234019150783e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993439316749573, "step": 1333 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 3.4559585492227978, "grad_norm": 55.02707707887303, "kl": 0.640625, "learning_rate": 6.544041450777201e-07, "loss": 0.0018, "reward": 1.9180868864059448, "reward_std": 0.006414251880755728, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4180868864059448, "step": 1334 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.4585492227979273, "grad_norm": 0.5934864693795994, "kl": 0.61328125, "learning_rate": 6.541450777202072e-07, "loss": 0.0013, "reward": 2.4999970197677612, "reward_std": 2.4000851226446684e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.461139896373057, "grad_norm": 11.915129134977448, "kl": 0.533203125, "learning_rate": 6.538860103626943e-07, "loss": 0.0021, "reward": 1.7647348046302795, "reward_std": 0.0003912689571734518, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2647348642349243, "step": 1336 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 3.4637305699481864, "grad_norm": 89.89835642005924, "kl": 0.62109375, "learning_rate": 6.536269430051813e-07, "loss": 0.0025, "reward": 1.9661896228790283, "reward_std": 0.07228621256365386, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.466189682483673, "step": 1337 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.466321243523316, "grad_norm": 4.277932354442914, "kl": 0.634765625, "learning_rate": 6.533678756476684e-07, "loss": 0.0029, "reward": 1.8600261211395264, "reward_std": 0.0004616106907633366, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3600261211395264, "step": 1338 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.4689119170984455, "grad_norm": 44.85970620982723, "kl": 0.48046875, "learning_rate": 6.531088082901555e-07, "loss": 0.0019, "reward": 2.4992449283599854, "reward_std": 7.356897526733519e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9992451071739197, "step": 1339 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.471502590673575, "grad_norm": 3.6829224773078657, "kl": 0.63671875, "learning_rate": 6.528497409326425e-07, "loss": 0.0027, "reward": 1.9996904134750366, "reward_std": 4.184322739320123e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996903836727142, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.4740932642487046, "grad_norm": 6.610224387879026, "kl": 0.62109375, "learning_rate": 6.525906735751295e-07, "loss": 0.0022, "reward": 2.3681209087371826, "reward_std": 0.0003917662323260629, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.868120789527893, "step": 1341 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 3.476683937823834, "grad_norm": 161.68143311022007, "kl": 0.63671875, "learning_rate": 6.523316062176165e-07, "loss": 0.0024, "reward": 2.2394572496414185, "reward_std": 0.27749250621764077, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.739457368850708, "step": 1342 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.4792746113989637, "grad_norm": 13.910368312117805, "kl": 0.52734375, "learning_rate": 6.520725388601036e-07, "loss": 0.0024, "reward": 2.374870181083679, "reward_std": 0.23154308285711522, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874870240688324, "step": 1343 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.481865284974093, "grad_norm": 1.4672888159276076, "kl": 0.546875, "learning_rate": 6.518134715025907e-07, "loss": 0.0019, "reward": 2.499992251396179, "reward_std": 4.95434198910516e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 1344 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.4844559585492227, "grad_norm": 0.2901416800996486, "kl": 0.591796875, "learning_rate": 6.515544041450777e-07, "loss": 0.0029, "reward": 2.499996304512024, "reward_std": 2.5608004534660722e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4870466321243523, "grad_norm": 0.7881863243430588, "kl": 0.6484375, "learning_rate": 6.512953367875648e-07, "loss": 0.0026, "reward": 2.499983787536621, "reward_std": 7.387967059457878e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999983787536621, "step": 1346 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.489637305699482, "grad_norm": 1.8035918273727127, "kl": 0.7109375, "learning_rate": 6.510362694300517e-07, "loss": 0.0028, "reward": 2.4999821186065674, "reward_std": 6.999416314101836e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999981939792633, "step": 1347 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.4922279792746114, "grad_norm": 0.8981555937588747, "kl": 0.703125, "learning_rate": 6.507772020725388e-07, "loss": 0.0046, "reward": 2.499990224838257, "reward_std": 1.1453644134462593e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 1348 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.494818652849741, "grad_norm": 1.19319932308453, "kl": 0.8359375, "learning_rate": 6.505181347150259e-07, "loss": 0.0028, "reward": 2.4999958276748657, "reward_std": 3.8362918530765455e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 1349 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.4974093264248705, "grad_norm": 32.86535747314109, "kl": 0.63671875, "learning_rate": 6.502590673575129e-07, "loss": 0.0026, "reward": 1.9989941120147705, "reward_std": 2.746053036162266e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498994141817093, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.5, "grad_norm": 23.392161420812826, "kl": 0.787109375, "learning_rate": 6.5e-07, "loss": 0.0026, "reward": 2.1242854595184326, "reward_std": 0.2318929609634779, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6242855191230774, "step": 1351 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5025906735751295, "grad_norm": 0.4483967653707917, "kl": 0.6015625, "learning_rate": 6.49740932642487e-07, "loss": 0.0037, "reward": 2.4999958276748657, "reward_std": 2.9107957288943e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 1352 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.505181347150259, "grad_norm": 3.8219242847848864, "kl": 0.5400390625, "learning_rate": 6.49481865284974e-07, "loss": 0.003, "reward": 1.9186437129974365, "reward_std": 0.00014858873362300073, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4186437726020813, "step": 1353 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 3.5077720207253886, "grad_norm": 83.18389525134592, "kl": 0.583984375, "learning_rate": 6.492227979274611e-07, "loss": 0.0018, "reward": 2.312074661254883, "reward_std": 0.2593497541197394, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8120747804641724, "step": 1354 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.510362694300518, "grad_norm": 0.07672235430694989, "kl": 0.60546875, "learning_rate": 6.489637305699481e-07, "loss": 0.0036, "reward": 2.499999165534973, "reward_std": 9.997762333568971e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5129533678756477, "grad_norm": 33.49190413142005, "kl": 0.662109375, "learning_rate": 6.487046632124352e-07, "loss": 0.0019, "reward": 1.9850329160690308, "reward_std": 0.00027922621620746213, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4850330352783203, "step": 1356 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.5155440414507773, "grad_norm": 2.9457973336441703, "kl": 0.6015625, "learning_rate": 6.484455958549222e-07, "loss": 0.0027, "reward": 2.4998908042907715, "reward_std": 3.385433774383273e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998908638954163, "step": 1357 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 3.518134715025907, "grad_norm": 1.9438434684285417, "kl": 0.609375, "learning_rate": 6.481865284974093e-07, "loss": 0.0013, "reward": 2.499980926513672, "reward_std": 5.2076372583087505e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999809861183167, "step": 1358 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5207253886010363, "grad_norm": 0.13513719854053263, "kl": 0.572265625, "learning_rate": 6.479274611398963e-07, "loss": 0.0026, "reward": 2.499996304512024, "reward_std": 1.9910360151698114e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 1359 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.523316062176166, "grad_norm": 1.9173573150046994, "kl": 0.61328125, "learning_rate": 6.476683937823833e-07, "loss": 0.0029, "reward": 2.4999914169311523, "reward_std": 7.9552585248166e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5259067357512954, "grad_norm": 0.9494189252932447, "kl": 0.55078125, "learning_rate": 6.474093264248704e-07, "loss": 0.0026, "reward": 2.4999889135360718, "reward_std": 8.804554454400204e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887943267822, "step": 1361 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.528497409326425, "grad_norm": 0.2937745769637296, "kl": 0.560546875, "learning_rate": 6.471502590673574e-07, "loss": 0.0033, "reward": 2.499994993209839, "reward_std": 4.2277121394818096e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 1362 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.5310880829015545, "grad_norm": 3.7375773860487684, "kl": 0.58984375, "learning_rate": 6.468911917098445e-07, "loss": 0.0016, "reward": 1.8092767000198364, "reward_std": 0.00041171352359015145, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.309276819229126, "step": 1363 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.533678756476684, "grad_norm": 4.101291092949582, "kl": 0.583984375, "learning_rate": 6.466321243523317e-07, "loss": 0.0023, "reward": 2.4999715089797974, "reward_std": 1.8902679130405886e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999715089797974, "step": 1364 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5362694300518136, "grad_norm": 25.919696852179786, "kl": 0.5712890625, "learning_rate": 6.463730569948185e-07, "loss": 0.0021, "reward": 2.1249243021011353, "reward_std": 0.23149173047755767, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6249242424964905, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.538860103626943, "grad_norm": 0.19653038614145996, "kl": 0.607421875, "learning_rate": 6.461139896373057e-07, "loss": 0.0016, "reward": 2.4999938011169434, "reward_std": 3.105779796896968e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 1366 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.5414507772020727, "grad_norm": 15.902056128566585, "kl": 0.61328125, "learning_rate": 6.458549222797928e-07, "loss": 0.0023, "reward": 2.4373351335525513, "reward_std": 0.17722894037514436, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373352527618408, "step": 1367 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5440414507772022, "grad_norm": 2.036186679400923, "kl": 0.56640625, "learning_rate": 6.455958549222798e-07, "loss": 0.0019, "reward": 1.9984742403030396, "reward_std": 6.802505140512949e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984742999076843, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5466321243523318, "grad_norm": 1.2749649095886086, "kl": 0.60546875, "learning_rate": 6.453367875647669e-07, "loss": 0.0027, "reward": 2.4999955892562866, "reward_std": 4.646158117793675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 1369 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.5492227979274613, "grad_norm": 0.20278281409644677, "kl": 0.603515625, "learning_rate": 6.450777202072539e-07, "loss": 0.0014, "reward": 2.499996304512024, "reward_std": 3.077019187003316e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 3.551813471502591, "grad_norm": 0.25101351860198323, "kl": 0.58203125, "learning_rate": 6.448186528497409e-07, "loss": 0.0012, "reward": 2.4999966621398926, "reward_std": 3.3206756029358075e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 1371 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 3.5544041450777204, "grad_norm": 73.54884158988142, "kl": 0.58203125, "learning_rate": 6.44559585492228e-07, "loss": 0.0029, "reward": 1.7386729717254639, "reward_std": 0.012680327624480014, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2386729717254639, "step": 1372 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.55699481865285, "grad_norm": 4.577786261349774, "kl": 0.591796875, "learning_rate": 6.44300518134715e-07, "loss": 0.0019, "reward": 2.499937415122986, "reward_std": 2.2648751325959893e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999375939369202, "step": 1373 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5595854922279795, "grad_norm": 0.08958532232600606, "kl": 0.611328125, "learning_rate": 6.440414507772021e-07, "loss": 0.0023, "reward": 2.4999983310699463, "reward_std": 2.108843744963451e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 1374 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.562176165803109, "grad_norm": 1.7711545727003508, "kl": 0.537109375, "learning_rate": 6.437823834196891e-07, "loss": 0.0028, "reward": 2.499987483024597, "reward_std": 5.0998549454561726e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999873042106628, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.5647668393782386, "grad_norm": 0.25505995142446997, "kl": 0.517578125, "learning_rate": 6.435233160621762e-07, "loss": 0.0024, "reward": 2.4999961853027344, "reward_std": 3.639949625267036e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1376 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.567357512953368, "grad_norm": 15.334962711014645, "kl": 0.873046875, "learning_rate": 6.432642487046632e-07, "loss": 0.0038, "reward": 1.9358563423156738, "reward_std": 0.1773801573617675, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.435856193304062, "step": 1377 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.5699481865284977, "grad_norm": 0.22446468822995153, "kl": 0.53515625, "learning_rate": 6.430051813471502e-07, "loss": 0.0031, "reward": 2.499994993209839, "reward_std": 3.072932429404318e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 1378 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.572538860103627, "grad_norm": 0.48106555448136323, "kl": 0.474609375, "learning_rate": 6.427461139896373e-07, "loss": 0.0023, "reward": 2.4999817609786987, "reward_std": 4.186527974070486e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999817609786987, "step": 1379 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.5751295336787567, "grad_norm": 0.6172111077179134, "kl": 0.548828125, "learning_rate": 6.424870466321243e-07, "loss": 0.0021, "reward": 2.499990463256836, "reward_std": 5.505520334736502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999903440475464, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5777202072538863, "grad_norm": 15.006388518804854, "kl": 0.59765625, "learning_rate": 6.422279792746114e-07, "loss": 0.0026, "reward": 1.9987740516662598, "reward_std": 7.264852865773719e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987740516662598, "step": 1381 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5803108808290154, "grad_norm": 111.1395884924389, "kl": 0.634765625, "learning_rate": 6.419689119170985e-07, "loss": 0.0029, "reward": 2.4246867895126343, "reward_std": 0.21300501697396612, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9246866106987, "step": 1382 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.582901554404145, "grad_norm": 1.8097291073246553, "kl": 0.677734375, "learning_rate": 6.417098445595854e-07, "loss": 0.003, "reward": 2.499986410140991, "reward_std": 1.1861967379900307e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986469745636, "step": 1383 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5854922279792745, "grad_norm": 8.967944321645934, "kl": 0.541015625, "learning_rate": 6.414507772020725e-07, "loss": 0.0017, "reward": 2.4997754096984863, "reward_std": 0.00016394679323639139, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997755885124207, "step": 1384 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.588082901554404, "grad_norm": 0.29143141279003054, "kl": 0.6640625, "learning_rate": 6.411917098445595e-07, "loss": 0.0027, "reward": 2.4999961853027344, "reward_std": 2.7449522121969494e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5906735751295336, "grad_norm": 0.2461857992991977, "kl": 0.65234375, "learning_rate": 6.409326424870466e-07, "loss": 0.0028, "reward": 2.499993324279785, "reward_std": 2.632466987506632e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 1386 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.593264248704663, "grad_norm": 0.8514467476215797, "kl": 0.61328125, "learning_rate": 6.406735751295337e-07, "loss": 0.0024, "reward": 2.4999914169311523, "reward_std": 4.661637944991526e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 1387 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.5958549222797926, "grad_norm": 0.8506191873731314, "kl": 0.64453125, "learning_rate": 6.404145077720207e-07, "loss": 0.0035, "reward": 1.9999052286148071, "reward_std": 9.354345024803479e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999050498008728, "step": 1388 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.598445595854922, "grad_norm": 85.83621995107387, "kl": 0.580078125, "learning_rate": 6.401554404145077e-07, "loss": 0.0017, "reward": 1.8556014895439148, "reward_std": 0.021315199197033508, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3556016683578491, "step": 1389 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.6010362694300517, "grad_norm": 0.22826619125642952, "kl": 0.634765625, "learning_rate": 6.398963730569948e-07, "loss": 0.0031, "reward": 2.4999948740005493, "reward_std": 2.872682216548128e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.6036269430051813, "grad_norm": 0.4119104720529271, "kl": 0.568359375, "learning_rate": 6.396373056994818e-07, "loss": 0.0014, "reward": 2.4999887943267822, "reward_std": 5.705402031708218e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988853931427, "step": 1391 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.606217616580311, "grad_norm": 3.047523014080777, "kl": 0.59765625, "learning_rate": 6.393782383419689e-07, "loss": 0.0016, "reward": 2.4999802112579346, "reward_std": 1.4907342119840905e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999980390071869, "step": 1392 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.6088082901554404, "grad_norm": 0.16612135295353414, "kl": 0.685546875, "learning_rate": 6.391191709844559e-07, "loss": 0.0026, "reward": 2.499997854232788, "reward_std": 2.8933482951742917e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 1393 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.61139896373057, "grad_norm": 0.3421573937963226, "kl": 0.513671875, "learning_rate": 6.38860103626943e-07, "loss": 0.0002, "reward": 2.4999899864196777, "reward_std": 4.0657830027157615e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 1394 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.6139896373056994, "grad_norm": 19.639795765895233, "kl": 0.677734375, "learning_rate": 6.3860103626943e-07, "loss": 0.0029, "reward": 1.996898889541626, "reward_std": 0.000693051654707233, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4968989193439484, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.616580310880829, "grad_norm": 0.9665139657611992, "kl": 0.580078125, "learning_rate": 6.38341968911917e-07, "loss": 0.0018, "reward": 1.9987121224403381, "reward_std": 1.828945823945105e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987122118473053, "step": 1396 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.6191709844559585, "grad_norm": 4.855973393421105, "kl": 0.560546875, "learning_rate": 6.380829015544041e-07, "loss": 0.0024, "reward": 1.9994420409202576, "reward_std": 2.3519903152191546e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994420409202576, "step": 1397 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.621761658031088, "grad_norm": 0.4164945775462847, "kl": 0.6171875, "learning_rate": 6.378238341968911e-07, "loss": 0.0029, "reward": 2.4999873638153076, "reward_std": 4.468172619453981e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874234199524, "step": 1398 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.6243523316062176, "grad_norm": 3.583932307046513, "kl": 0.5400390625, "learning_rate": 6.375647668393782e-07, "loss": 0.0025, "reward": 2.4999815225601196, "reward_std": 1.524648666872963e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998140335083, "step": 1399 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.626943005181347, "grad_norm": 0.1990346381482961, "kl": 0.5546875, "learning_rate": 6.373056994818653e-07, "loss": 0.0025, "reward": 2.4999974966049194, "reward_std": 2.18950992803002e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.6295336787564767, "grad_norm": 0.07427843209655273, "kl": 0.576171875, "learning_rate": 6.370466321243522e-07, "loss": 0.0029, "reward": 2.4999992847442627, "reward_std": 9.547549666422128e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999225139618, "step": 1401 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.6321243523316062, "grad_norm": 0.3532174476662188, "kl": 0.4501953125, "learning_rate": 6.367875647668393e-07, "loss": 0.0011, "reward": 2.4999966621398926, "reward_std": 2.2083634689806786e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 1402 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 3.634715025906736, "grad_norm": 18.049704836255344, "kl": 0.552734375, "learning_rate": 6.365284974093263e-07, "loss": 0.0021, "reward": 1.997619390487671, "reward_std": 0.000526324788012289, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4976195693016052, "step": 1403 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.6373056994818653, "grad_norm": 0.398835991809493, "kl": 0.6015625, "learning_rate": 6.362694300518134e-07, "loss": 0.0028, "reward": 2.499993681907654, "reward_std": 3.1743469435241423e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 1404 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.639896373056995, "grad_norm": 8.277939942315871, "kl": 0.607421875, "learning_rate": 6.360103626943006e-07, "loss": 0.0023, "reward": 1.9982212781906128, "reward_std": 3.924361681129085e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982213079929352, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.6424870466321244, "grad_norm": 6.151072357133969, "kl": 0.53125, "learning_rate": 6.357512953367876e-07, "loss": 0.0026, "reward": 1.999785304069519, "reward_std": 6.958506469345593e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499785304069519, "step": 1406 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.645077720207254, "grad_norm": 23.313590819265517, "kl": 0.6875, "learning_rate": 6.354922279792746e-07, "loss": 0.0028, "reward": 2.3121477365493774, "reward_std": 0.25924892812872713, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8121477961540222, "step": 1407 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.6476683937823835, "grad_norm": 0.24166415591067872, "kl": 0.572265625, "learning_rate": 6.352331606217615e-07, "loss": 0.0027, "reward": 2.4999934434890747, "reward_std": 2.9446323424053844e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 1408 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.650259067357513, "grad_norm": 4.356503544628265, "kl": 0.615234375, "learning_rate": 6.349740932642487e-07, "loss": 0.0024, "reward": 1.8026756048202515, "reward_std": 0.000563210720031293, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3026756346225739, "step": 1409 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.6528497409326426, "grad_norm": 0.8322377519716881, "kl": 0.59375, "learning_rate": 6.347150259067358e-07, "loss": 0.0027, "reward": 2.499995470046997, "reward_std": 1.7464221286900283e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.655440414507772, "grad_norm": 0.1326704588527684, "kl": 0.494140625, "learning_rate": 6.344559585492228e-07, "loss": 0.0021, "reward": 2.4999924898147583, "reward_std": 3.137580961265485e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 1411 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.6580310880829017, "grad_norm": 378.7585702573075, "kl": 0.583984375, "learning_rate": 6.341968911917099e-07, "loss": 0.0028, "reward": 1.991147220134735, "reward_std": 0.0010531625374596842, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.491147220134735, "step": 1412 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.660621761658031, "grad_norm": 0.1765227542963442, "kl": 0.517578125, "learning_rate": 6.339378238341969e-07, "loss": 0.0028, "reward": 2.499995470046997, "reward_std": 3.4504637369536795e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 1413 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.6632124352331608, "grad_norm": 1.2164314326805818, "kl": 0.642578125, "learning_rate": 6.336787564766839e-07, "loss": 0.0026, "reward": 2.4999924898147583, "reward_std": 5.736036882808548e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 1414 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.6658031088082903, "grad_norm": 2.2261058305033625, "kl": 0.55078125, "learning_rate": 6.33419689119171e-07, "loss": 0.0018, "reward": 2.4999732971191406, "reward_std": 1.664263118072995e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999732971191406, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.66839378238342, "grad_norm": 2.659128934931962, "kl": 0.51171875, "learning_rate": 6.33160621761658e-07, "loss": 0.001, "reward": 2.499793291091919, "reward_std": 0.00010856628074407126, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997934103012085, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.6709844559585494, "grad_norm": 2.394589437016423, "kl": 0.5703125, "learning_rate": 6.329015544041451e-07, "loss": 0.0019, "reward": 1.9926186800003052, "reward_std": 5.42663296982937e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4926186800003052, "step": 1417 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.6735751295336785, "grad_norm": 1.146030655264176, "kl": 0.72265625, "learning_rate": 6.326424870466322e-07, "loss": 0.0043, "reward": 2.499969244003296, "reward_std": 1.7389803815603955e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999690055847168, "step": 1418 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.676165803108808, "grad_norm": 1.2564819150816755, "kl": 0.533203125, "learning_rate": 6.323834196891191e-07, "loss": 0.0011, "reward": 2.4999935626983643, "reward_std": 4.535087214208033e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 1419 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.6787564766839376, "grad_norm": 314.82103911517834, "kl": 0.615234375, "learning_rate": 6.321243523316062e-07, "loss": 0.0026, "reward": 2.374079465866089, "reward_std": 0.23314310551268136, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8740793466567993, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.681347150259067, "grad_norm": 128.52793016178393, "kl": 0.572265625, "learning_rate": 6.318652849740932e-07, "loss": 0.0023, "reward": 2.4367809295654297, "reward_std": 0.1777317428495735, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9367809295654297, "step": 1421 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.6839378238341967, "grad_norm": 7.530150717814514, "kl": 0.60546875, "learning_rate": 6.316062176165803e-07, "loss": 0.0025, "reward": 2.4999412298202515, "reward_std": 2.566174237017549e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999412298202515, "step": 1422 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.686528497409326, "grad_norm": 68.18310478760145, "kl": 0.564453125, "learning_rate": 6.313471502590674e-07, "loss": 0.0022, "reward": 2.310776472091675, "reward_std": 0.2592012918466935, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8107765316963196, "step": 1423 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 3.6891191709844557, "grad_norm": 1.0667331051663402, "kl": 0.552734375, "learning_rate": 6.310880829015544e-07, "loss": 0.0016, "reward": 2.499985933303833, "reward_std": 7.474113260741433e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999861121177673, "step": 1424 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.6917098445595853, "grad_norm": 12.119337477158354, "kl": 0.708984375, "learning_rate": 6.308290155440414e-07, "loss": 0.0025, "reward": 2.4374581575393677, "reward_std": 0.17678954522850177, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937458097934723, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.694300518134715, "grad_norm": 11.87840484441472, "kl": 0.59375, "learning_rate": 6.305699481865284e-07, "loss": 0.0018, "reward": 2.436813235282898, "reward_std": 0.17871527082212424, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9368132948875427, "step": 1426 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.6968911917098444, "grad_norm": 0.5272713887367758, "kl": 0.623046875, "learning_rate": 6.303108808290155e-07, "loss": 0.0028, "reward": 2.499990224838257, "reward_std": 5.700041583622806e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 1427 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.699481865284974, "grad_norm": 4.550795103949523, "kl": 0.595703125, "learning_rate": 6.300518134715026e-07, "loss": 0.0018, "reward": 2.499730348587036, "reward_std": 9.626925998418301e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997304677963257, "step": 1428 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.7020725388601035, "grad_norm": 16.248839491881732, "kl": 0.587890625, "learning_rate": 6.297927461139896e-07, "loss": 0.0027, "reward": 1.9992719888687134, "reward_std": 0.00019324450443036767, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992719292640686, "step": 1429 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 3.704663212435233, "grad_norm": 94.30037764234513, "kl": 0.638671875, "learning_rate": 6.295336787564767e-07, "loss": 0.0026, "reward": 2.1245818734169006, "reward_std": 0.4362754672765732, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6245819330215454, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 3.7072538860103625, "grad_norm": 60.83898760990833, "kl": 0.615234375, "learning_rate": 6.292746113989636e-07, "loss": 0.0016, "reward": 2.186749815940857, "reward_std": 0.2593882263518026, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6867499947547913, "step": 1431 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.709844559585492, "grad_norm": 0.1111976328774365, "kl": 0.564453125, "learning_rate": 6.290155440414507e-07, "loss": 0.0009, "reward": 2.499997138977051, "reward_std": 1.1983730701103923e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 1432 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.7124352331606216, "grad_norm": 2.751399103494885, "kl": 0.5009765625, "learning_rate": 6.287564766839378e-07, "loss": 0.002, "reward": 1.9865328669548035, "reward_std": 9.168900692202442e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.486532747745514, "step": 1433 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.715025906735751, "grad_norm": 0.9291724296710563, "kl": 0.60546875, "learning_rate": 6.284974093264248e-07, "loss": 0.0034, "reward": 2.499985098838806, "reward_std": 9.219770618074108e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999849796295166, "step": 1434 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 3.7176165803108807, "grad_norm": 58.83902055911605, "kl": 0.525390625, "learning_rate": 6.282383419689119e-07, "loss": 0.0021, "reward": 1.4966418743133545, "reward_std": 0.0012065474766131956, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.996641993522644, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.7202072538860103, "grad_norm": 0.25585265212993175, "kl": 0.591796875, "learning_rate": 6.27979274611399e-07, "loss": 0.0028, "reward": 2.499993324279785, "reward_std": 3.0700907700520474e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 1436 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.72279792746114, "grad_norm": 0.11919373631459733, "kl": 0.587890625, "learning_rate": 6.277202072538859e-07, "loss": 0.0027, "reward": 2.4999974966049194, "reward_std": 2.3411982397192332e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1437 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.7253886010362693, "grad_norm": 17.16807778187466, "kl": 0.6328125, "learning_rate": 6.27461139896373e-07, "loss": 0.0024, "reward": 1.9937161207199097, "reward_std": 0.00014862913872093486, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.493716150522232, "step": 1438 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.727979274611399, "grad_norm": 2.379936472129351, "kl": 0.62890625, "learning_rate": 6.2720207253886e-07, "loss": 0.0016, "reward": 2.499932050704956, "reward_std": 1.843562210979144e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999321699142456, "step": 1439 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.7305699481865284, "grad_norm": 0.2673639562068954, "kl": 0.60546875, "learning_rate": 6.269430051813471e-07, "loss": 0.0024, "reward": 2.49999737739563, "reward_std": 2.499199126759777e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.733160621761658, "grad_norm": 28.723795843772688, "kl": 0.611328125, "learning_rate": 6.266839378238342e-07, "loss": 0.002, "reward": 2.437302827835083, "reward_std": 0.17723144319711537, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373029470443726, "step": 1441 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.7357512953367875, "grad_norm": 28.370482488983953, "kl": 0.6171875, "learning_rate": 6.264248704663212e-07, "loss": 0.0025, "reward": 1.5763513445854187, "reward_std": 0.25968330100295134, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0763513147830963, "step": 1442 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.738341968911917, "grad_norm": 0.0448370278572583, "kl": 0.59765625, "learning_rate": 6.261658031088083e-07, "loss": 0.0028, "reward": 2.4999988079071045, "reward_std": 9.934623221852235e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 1443 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 3.7409326424870466, "grad_norm": 27.60810266041904, "kl": 0.5615234375, "learning_rate": 6.259067357512952e-07, "loss": 0.0021, "reward": 2.0145556330680847, "reward_std": 0.2143037964829091, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5145557522773743, "step": 1444 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.743523316062176, "grad_norm": 0.13593036903210876, "kl": 0.587890625, "learning_rate": 6.256476683937823e-07, "loss": 0.0034, "reward": 2.4999958276748657, "reward_std": 1.98960850639196e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.7461139896373057, "grad_norm": 21.845462527637203, "kl": 0.669921875, "learning_rate": 6.253886010362694e-07, "loss": 0.0033, "reward": 2.0617647767066956, "reward_std": 0.17707414126812182, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5617647171020508, "step": 1446 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.7487046632124352, "grad_norm": 14.949852235543336, "kl": 0.810546875, "learning_rate": 6.251295336787564e-07, "loss": 0.0032, "reward": 1.4975175261497498, "reward_std": 0.00022265888401307166, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9975175261497498, "step": 1447 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.7512953367875648, "grad_norm": 66.55556682207326, "kl": 0.619140625, "learning_rate": 6.248704663212436e-07, "loss": 0.0028, "reward": 1.9995760917663574, "reward_std": 0.00010135167121916311, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995761513710022, "step": 1448 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 3.7538860103626943, "grad_norm": 0.8295133351459518, "kl": 0.623046875, "learning_rate": 6.246113989637304e-07, "loss": 0.003, "reward": 2.499969005584717, "reward_std": 1.609471647157079e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999688863754272, "step": 1449 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.756476683937824, "grad_norm": 5.013783558378453, "kl": 0.501953125, "learning_rate": 6.243523316062176e-07, "loss": 0.0025, "reward": 1.9955863952636719, "reward_std": 4.493643393743696e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4955865740776062, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.7590673575129534, "grad_norm": 7.242895984386232, "kl": 0.658203125, "learning_rate": 6.240932642487047e-07, "loss": 0.0026, "reward": 1.9982446432113647, "reward_std": 5.501295640897297e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982446432113647, "step": 1451 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.761658031088083, "grad_norm": 38.560646883909826, "kl": 0.5546875, "learning_rate": 6.238341968911917e-07, "loss": 0.0021, "reward": 1.8102023601531982, "reward_std": 0.2593342049731291, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3102022409439087, "step": 1452 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 3.7642487046632125, "grad_norm": 20.478466716089894, "kl": 0.490234375, "learning_rate": 6.235751295336788e-07, "loss": 0.0027, "reward": 2.407601833343506, "reward_std": 0.26130730905720156, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9076017141342163, "step": 1453 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.766839378238342, "grad_norm": 35.07529510770513, "kl": 0.671875, "learning_rate": 6.233160621761658e-07, "loss": 0.0023, "reward": 1.9970771074295044, "reward_std": 0.0006158917010452569, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970770478248596, "step": 1454 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.7694300518134716, "grad_norm": 4.663117822260633, "kl": 0.62890625, "learning_rate": 6.230569948186529e-07, "loss": 0.0028, "reward": 1.9330829977989197, "reward_std": 0.0005906326937292761, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.433083027601242, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.772020725388601, "grad_norm": 2.0283972784321374, "kl": 0.607421875, "learning_rate": 6.227979274611399e-07, "loss": 0.0026, "reward": 2.4999887943267822, "reward_std": 1.0341825600335142e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887943267822, "step": 1456 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.7746113989637307, "grad_norm": 0.1504616364168647, "kl": 0.6015625, "learning_rate": 6.225388601036269e-07, "loss": 0.0027, "reward": 2.4999972581863403, "reward_std": 2.4962834572761494e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1457 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.77720207253886, "grad_norm": 0.47096582500793127, "kl": 0.638671875, "learning_rate": 6.22279792746114e-07, "loss": 0.0034, "reward": 2.4999685287475586, "reward_std": 6.1717643120573484e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999685287475586, "step": 1458 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.7797927461139897, "grad_norm": 36.358603448296776, "kl": 1.060546875, "learning_rate": 6.22020725388601e-07, "loss": 0.0034, "reward": 2.49984872341156, "reward_std": 3.841379680125101e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998487830162048, "step": 1459 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 3.7823834196891193, "grad_norm": 50.63860518811838, "kl": 0.666015625, "learning_rate": 6.217616580310881e-07, "loss": 0.0027, "reward": 1.3764392137527466, "reward_std": 0.20013821218162775, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8764392137527466, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.784974093264249, "grad_norm": 7.430005079844503, "kl": 0.541015625, "learning_rate": 6.215025906735752e-07, "loss": 0.0016, "reward": 1.896726667881012, "reward_std": 0.0011452280630663836, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3967268466949463, "step": 1461 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.7875647668393784, "grad_norm": 2.149411149705545, "kl": 0.5380859375, "learning_rate": 6.212435233160621e-07, "loss": 0.0025, "reward": 1.9980133771896362, "reward_std": 6.253521485177771e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4980134963989258, "step": 1462 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.790155440414508, "grad_norm": 0.206790905176495, "kl": 0.529296875, "learning_rate": 6.209844559585492e-07, "loss": 0.0024, "reward": 2.4999955892562866, "reward_std": 2.6991525601260946e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 1463 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.7927461139896375, "grad_norm": 2.0750817777190287, "kl": 1.0009765625, "learning_rate": 6.207253886010363e-07, "loss": 0.0037, "reward": 2.4999901056289673, "reward_std": 4.346306582192483e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 1464 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 3.795336787564767, "grad_norm": 29.26743348044127, "kl": 0.62109375, "learning_rate": 6.204663212435233e-07, "loss": 0.0024, "reward": 1.4922122955322266, "reward_std": 0.013964169160317397, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9922122657299042, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.7979274611398965, "grad_norm": 1.856874113966458, "kl": 0.4638671875, "learning_rate": 6.202072538860104e-07, "loss": 0.0023, "reward": 2.4999927282333374, "reward_std": 6.871298865007702e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 1466 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.800518134715026, "grad_norm": 16.537032532970002, "kl": 0.546875, "learning_rate": 6.199481865284974e-07, "loss": 0.0022, "reward": 1.2499552965164185, "reward_std": 0.26730382442474365, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7499552369117737, "step": 1467 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.8031088082901556, "grad_norm": 0.48529939502258856, "kl": 0.662109375, "learning_rate": 6.196891191709844e-07, "loss": 0.0025, "reward": 2.4999966621398926, "reward_std": 3.607507835567958e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 1468 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.805699481865285, "grad_norm": 1.3495293475403753, "kl": 0.615234375, "learning_rate": 6.194300518134715e-07, "loss": 0.002, "reward": 2.499977707862854, "reward_std": 8.25857705422095e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999778270721436, "step": 1469 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.8082901554404147, "grad_norm": 0.9559715695138364, "kl": 0.47265625, "learning_rate": 6.191709844559585e-07, "loss": 0.0019, "reward": 2.4999840259552, "reward_std": 1.6824803196868743e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999839067459106, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.8108808290155443, "grad_norm": 17.945451644201825, "kl": 0.75, "learning_rate": 6.189119170984456e-07, "loss": 0.003, "reward": 1.6865063309669495, "reward_std": 0.40862300992012024, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1865063309669495, "step": 1471 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.813471502590674, "grad_norm": 0.6991805537296698, "kl": 0.580078125, "learning_rate": 6.186528497409326e-07, "loss": 0.0015, "reward": 2.4999927282333374, "reward_std": 3.403640306487432e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 1472 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.8160621761658033, "grad_norm": 3.710431782220456, "kl": 0.65625, "learning_rate": 6.183937823834197e-07, "loss": 0.0027, "reward": 2.499968409538269, "reward_std": 1.077055890164047e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999682903289795, "step": 1473 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.818652849740933, "grad_norm": 30.650184983867998, "kl": 0.572265625, "learning_rate": 6.181347150259067e-07, "loss": 0.0017, "reward": 1.9998257756233215, "reward_std": 6.517808481021348e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998259246349335, "step": 1474 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.8212435233160624, "grad_norm": 23.09164535362522, "kl": 0.638671875, "learning_rate": 6.178756476683937e-07, "loss": 0.0026, "reward": 1.9993667006492615, "reward_std": 0.35484497249126434, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993667006492615, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.823834196891192, "grad_norm": 0.09461364580546136, "kl": 0.5068359375, "learning_rate": 6.176165803108808e-07, "loss": 0.0023, "reward": 2.49999463558197, "reward_std": 1.5171386849033297e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 1476 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.8264248704663215, "grad_norm": 14.662333697462829, "kl": 0.681640625, "learning_rate": 6.173575129533678e-07, "loss": 0.0036, "reward": 1.992592692375183, "reward_std": 0.0006492468755823211, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4925926625728607, "step": 1477 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.8290155440414506, "grad_norm": 52.8437585673581, "kl": 0.65625, "learning_rate": 6.170984455958549e-07, "loss": 0.0023, "reward": 2.499782919883728, "reward_std": 0.0001342327806241883, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997828006744385, "step": 1478 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.83160621761658, "grad_norm": 1.862434796984712, "kl": 0.591796875, "learning_rate": 6.16839378238342e-07, "loss": 0.0031, "reward": 1.999351978302002, "reward_std": 3.4814781486147695e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993519484996796, "step": 1479 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.8341968911917097, "grad_norm": 4.869340342196718, "kl": 0.599609375, "learning_rate": 6.165803108808289e-07, "loss": 0.0017, "reward": 1.9367328882217407, "reward_std": 0.00022172840340317634, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4367329478263855, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.8367875647668392, "grad_norm": 0.32801027861791593, "kl": 0.482421875, "learning_rate": 6.16321243523316e-07, "loss": 0.0021, "reward": 2.4999969005584717, "reward_std": 2.2788428850617493e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 1481 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.839378238341969, "grad_norm": 19.731375611897466, "kl": 0.603515625, "learning_rate": 6.16062176165803e-07, "loss": 0.0024, "reward": 1.9977545738220215, "reward_std": 0.003868335104499465, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4977546334266663, "step": 1482 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.8419689119170983, "grad_norm": 19.916108162165454, "kl": 0.65625, "learning_rate": 6.158031088082901e-07, "loss": 0.0027, "reward": 2.4999566078186035, "reward_std": 8.312876661875634e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999956488609314, "step": 1483 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.844559585492228, "grad_norm": 0.3796084560654814, "kl": 0.53515625, "learning_rate": 6.155440414507772e-07, "loss": 0.0017, "reward": 2.4999842643737793, "reward_std": 4.317462980907294e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999845027923584, "step": 1484 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.8471502590673574, "grad_norm": 98.92400194787034, "kl": 0.541015625, "learning_rate": 6.152849740932642e-07, "loss": 0.0014, "reward": 2.1242799758911133, "reward_std": 0.23188841385490377, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6242798566818237, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.849740932642487, "grad_norm": 27.66225313211229, "kl": 0.6171875, "learning_rate": 6.150259067357512e-07, "loss": 0.0031, "reward": 1.9990788698196411, "reward_std": 0.00011606310545175802, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990788698196411, "step": 1486 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.8523316062176165, "grad_norm": 0.5434193832390222, "kl": 0.630859375, "learning_rate": 6.147668393782383e-07, "loss": 0.0021, "reward": 2.4999969005584717, "reward_std": 2.6628398757111427e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 1487 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.854922279792746, "grad_norm": 23.634387073096292, "kl": 0.607421875, "learning_rate": 6.145077720207253e-07, "loss": 0.0023, "reward": 2.437478542327881, "reward_std": 0.1768248305181146, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374786019325256, "step": 1488 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 3.8575129533678756, "grad_norm": 0.3727695923085608, "kl": 0.6015625, "learning_rate": 6.142487046632124e-07, "loss": 0.0025, "reward": 2.4999951124191284, "reward_std": 3.586611683203955e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 1489 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.860103626943005, "grad_norm": 13.825265428589349, "kl": 0.5185546875, "learning_rate": 6.139896373056994e-07, "loss": 0.0023, "reward": 1.975597620010376, "reward_std": 0.0010874239096665406, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4755975902080536, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.8626943005181347, "grad_norm": 0.5958234161389483, "kl": 0.775390625, "learning_rate": 6.137305699481866e-07, "loss": 0.0042, "reward": 2.4999855756759644, "reward_std": 8.769905889494112e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999855756759644, "step": 1491 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.865284974093264, "grad_norm": 26.96784622725216, "kl": 0.59765625, "learning_rate": 6.134715025906736e-07, "loss": 0.002, "reward": 2.437392473220825, "reward_std": 0.17704026768478798, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373925924301147, "step": 1492 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.8678756476683938, "grad_norm": 0.3231881648449048, "kl": 0.556640625, "learning_rate": 6.132124352331606e-07, "loss": 0.0018, "reward": 2.4999982118606567, "reward_std": 1.2508337761119037e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1493 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.8704663212435233, "grad_norm": 0.4085542554512198, "kl": 0.4912109375, "learning_rate": 6.129533678756477e-07, "loss": 0.0034, "reward": 2.499995470046997, "reward_std": 3.4563825010991422e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 1494 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.873056994818653, "grad_norm": 11.882990877057507, "kl": 0.619140625, "learning_rate": 6.126943005181347e-07, "loss": 0.0025, "reward": 2.312441349029541, "reward_std": 0.2587866015022655, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812441349029541, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.8756476683937824, "grad_norm": 4.75791805072614, "kl": 1.55078125, "learning_rate": 6.124352331606218e-07, "loss": 0.0062, "reward": 2.4999773502349854, "reward_std": 1.1655332741611346e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999773502349854, "step": 1496 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.878238341968912, "grad_norm": 35.28626007309647, "kl": 0.712890625, "learning_rate": 6.121761658031089e-07, "loss": 0.0033, "reward": 1.9990154504776, "reward_std": 0.0005325514871969972, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990155100822449, "step": 1497 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.8808290155440415, "grad_norm": 4.83397137323644, "kl": 0.677734375, "learning_rate": 6.119170984455958e-07, "loss": 0.0027, "reward": 1.9991931915283203, "reward_std": 2.6872319722315297e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991931915283203, "step": 1498 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.883419689119171, "grad_norm": 0.33512265411369824, "kl": 0.5185546875, "learning_rate": 6.116580310880829e-07, "loss": 0.0018, "reward": 2.499998450279236, "reward_std": 1.567982849337568e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 1499 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.8860103626943006, "grad_norm": 5.275773688277142, "kl": 0.66015625, "learning_rate": 6.113989637305699e-07, "loss": 0.0026, "reward": 1.4958020448684692, "reward_std": 0.00015225730021484196, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9958020448684692, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.88860103626943, "grad_norm": 0.12313454813757603, "kl": 0.599609375, "learning_rate": 6.11139896373057e-07, "loss": 0.0021, "reward": 2.499998450279236, "reward_std": 9.644168414979504e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 1501 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.8911917098445596, "grad_norm": 1.261488250006695, "kl": 0.666015625, "learning_rate": 6.108808290155441e-07, "loss": 0.0028, "reward": 2.499992609024048, "reward_std": 4.563590209727408e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926090240479, "step": 1502 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.893782383419689, "grad_norm": 0.11972113536992018, "kl": 0.58984375, "learning_rate": 6.106217616580311e-07, "loss": 0.0021, "reward": 2.4999988079071045, "reward_std": 1.087535423494046e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 1503 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.8963730569948187, "grad_norm": 9.13303783618558, "kl": 0.65625, "learning_rate": 6.103626943005181e-07, "loss": 0.0026, "reward": 1.8510714769363403, "reward_std": 0.00033356632047798485, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3510713875293732, "step": 1504 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.8989637305699483, "grad_norm": 43.747553639533976, "kl": 0.59765625, "learning_rate": 6.101036269430051e-07, "loss": 0.003, "reward": 1.9808810949325562, "reward_std": 0.013991457536860707, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4808810651302338, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.901554404145078, "grad_norm": 0.18508659512849798, "kl": 0.64453125, "learning_rate": 6.098445595854922e-07, "loss": 0.0027, "reward": 2.499995470046997, "reward_std": 3.1586680790951505e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 1506 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 3.9041450777202074, "grad_norm": 58.90928740917424, "kl": 0.642578125, "learning_rate": 6.095854922279793e-07, "loss": 0.0026, "reward": 1.936310350894928, "reward_std": 0.17835398690658621, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4363104104995728, "step": 1507 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.906735751295337, "grad_norm": 6.654735180908968, "kl": 0.546875, "learning_rate": 6.093264248704663e-07, "loss": 0.0017, "reward": 1.9985125064849854, "reward_std": 4.028769288311196e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985126852989197, "step": 1508 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.9093264248704664, "grad_norm": 3.061275028084077, "kl": 0.625, "learning_rate": 6.090673575129534e-07, "loss": 0.0028, "reward": 2.499997138977051, "reward_std": 3.0962369805820344e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 1509 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.911917098445596, "grad_norm": 0.9387561382051531, "kl": 0.640625, "learning_rate": 6.088082901554404e-07, "loss": 0.0032, "reward": 1.999189019203186, "reward_std": 1.3442469310120941e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991888105869293, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.9145077720207255, "grad_norm": 9.832360551252883, "kl": 0.541015625, "learning_rate": 6.085492227979274e-07, "loss": 0.0019, "reward": 2.499940514564514, "reward_std": 2.8413145599870404e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999406337738037, "step": 1511 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 3.917098445595855, "grad_norm": 50.42522772877371, "kl": 0.65234375, "learning_rate": 6.082901554404145e-07, "loss": 0.0026, "reward": 1.8113928437232971, "reward_std": 0.4451411962509155, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3113929629325867, "step": 1512 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.9196891191709846, "grad_norm": 0.31733256709475055, "kl": 0.607421875, "learning_rate": 6.080310880829015e-07, "loss": 0.0024, "reward": 2.499977231025696, "reward_std": 3.905087453404121e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999774098396301, "step": 1513 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.9222797927461137, "grad_norm": 2.9996666726178036, "kl": 0.646484375, "learning_rate": 6.077720207253886e-07, "loss": 0.0025, "reward": 1.9948174357414246, "reward_std": 0.00012923176223011978, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4948175251483917, "step": 1514 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9248704663212433, "grad_norm": 0.2977041928798933, "kl": 0.50390625, "learning_rate": 6.075129533678757e-07, "loss": 0.0026, "reward": 2.499997854232788, "reward_std": 1.666917739839846e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.927461139896373, "grad_norm": 0.17472020500868035, "kl": 0.6015625, "learning_rate": 6.072538860103626e-07, "loss": 0.0031, "reward": 2.4999923706054688, "reward_std": 4.275961941857531e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 1516 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9300518134715023, "grad_norm": 0.5753968857556688, "kl": 0.599609375, "learning_rate": 6.069948186528497e-07, "loss": 0.0017, "reward": 2.499996781349182, "reward_std": 3.913148304945935e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 1517 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.932642487046632, "grad_norm": 16.39228146853071, "kl": 0.5546875, "learning_rate": 6.067357512953367e-07, "loss": 0.0019, "reward": 1.9955702424049377, "reward_std": 0.00021679321434930898, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.495570421218872, "step": 1518 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.9352331606217614, "grad_norm": 11.036190529977668, "kl": 0.5546875, "learning_rate": 6.064766839378238e-07, "loss": 0.0018, "reward": 2.307799220085144, "reward_std": 0.2654867749386085, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8077992796897888, "step": 1519 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.937823834196891, "grad_norm": 28.73634252009123, "kl": 0.423828125, "learning_rate": 6.062176165803109e-07, "loss": 0.0017, "reward": 2.3745980262756348, "reward_std": 0.23218988729331613, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8745980262756348, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9404145077720205, "grad_norm": 0.11597445509316395, "kl": 0.541015625, "learning_rate": 6.059585492227979e-07, "loss": 0.0017, "reward": 2.4999892711639404, "reward_std": 2.8522507022898935e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 1521 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.94300518134715, "grad_norm": 2.5630110673711695, "kl": 0.587890625, "learning_rate": 6.056994818652849e-07, "loss": 0.0023, "reward": 1.9992998838424683, "reward_std": 3.800059573677572e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993000328540802, "step": 1522 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.9455958549222796, "grad_norm": 0.5704837702234167, "kl": 0.57421875, "learning_rate": 6.054404145077719e-07, "loss": 0.0037, "reward": 2.499988317489624, "reward_std": 5.455459131553653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 1523 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.948186528497409, "grad_norm": 0.6765926049595977, "kl": 0.515625, "learning_rate": 6.05181347150259e-07, "loss": 0.0028, "reward": 1.9994646906852722, "reward_std": 1.1285911682534788e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994644820690155, "step": 1524 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.9507772020725387, "grad_norm": 11.551484937590779, "kl": 0.55859375, "learning_rate": 6.049222797927461e-07, "loss": 0.0029, "reward": 1.9861287474632263, "reward_std": 0.00036651078926297487, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4861286580562592, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9533678756476682, "grad_norm": 2.16296644033383, "kl": 0.546875, "learning_rate": 6.046632124352331e-07, "loss": 0.0021, "reward": 2.499973773956299, "reward_std": 9.214726730988332e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999737739562988, "step": 1526 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 3.9559585492227978, "grad_norm": 0.047060006576747686, "kl": 0.5888671875, "learning_rate": 6.044041450777202e-07, "loss": 0.0016, "reward": 2.4999988079071045, "reward_std": 7.893295901340025e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 1527 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9585492227979273, "grad_norm": 0.3075106872800092, "kl": 0.66796875, "learning_rate": 6.041450777202071e-07, "loss": 0.0034, "reward": 2.4999938011169434, "reward_std": 2.7913884537156264e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 1528 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.961139896373057, "grad_norm": 0.2503619551032435, "kl": 0.53515625, "learning_rate": 6.038860103626942e-07, "loss": 0.0011, "reward": 2.49999737739563, "reward_std": 1.858786788488942e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 1529 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.9637305699481864, "grad_norm": 0.6795501390511235, "kl": 0.560546875, "learning_rate": 6.036269430051813e-07, "loss": 0.0031, "reward": 2.4999951124191284, "reward_std": 3.1414063528245606e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.966321243523316, "grad_norm": 9.549513060696503, "kl": 0.6796875, "learning_rate": 6.033678756476683e-07, "loss": 0.0029, "reward": 1.8549686074256897, "reward_std": 0.008584378608304633, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.354968637228012, "step": 1531 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9689119170984455, "grad_norm": 2.3571452730612883, "kl": 0.587890625, "learning_rate": 6.031088082901554e-07, "loss": 0.0028, "reward": 2.4999945163726807, "reward_std": 3.944333570871095e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 1532 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.971502590673575, "grad_norm": 10.881516152495793, "kl": 0.58203125, "learning_rate": 6.028497409326426e-07, "loss": 0.0024, "reward": 1.9998502135276794, "reward_std": 2.5266230323950367e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998502433300018, "step": 1533 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.9740932642487046, "grad_norm": 0.3301341463620227, "kl": 0.5048828125, "learning_rate": 6.025906735751294e-07, "loss": 0.0023, "reward": 2.499992847442627, "reward_std": 3.7215783095234656e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 1534 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.976683937823834, "grad_norm": 3.660146244636932, "kl": 0.560546875, "learning_rate": 6.023316062176166e-07, "loss": 0.0015, "reward": 1.9975864887237549, "reward_std": 6.334725958367926e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975865483283997, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 3.9792746113989637, "grad_norm": 66.53750318562707, "kl": 0.935546875, "learning_rate": 6.020725388601036e-07, "loss": 0.0037, "reward": 1.4973682761192322, "reward_std": 0.0005486692680278793, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9973683059215546, "step": 1536 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.981865284974093, "grad_norm": 111.1765270618807, "kl": 0.650390625, "learning_rate": 6.018134715025907e-07, "loss": 0.0035, "reward": 1.9948248863220215, "reward_std": 0.001766938172806931, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4948249161243439, "step": 1537 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.9844559585492227, "grad_norm": 2.152028572649257, "kl": 0.603515625, "learning_rate": 6.015544041450778e-07, "loss": 0.0024, "reward": 1.9969515800476074, "reward_std": 4.523793973021384e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4969516396522522, "step": 1538 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.9870466321243523, "grad_norm": 18.939390472060765, "kl": 0.673828125, "learning_rate": 6.012953367875648e-07, "loss": 0.0028, "reward": 1.4981385469436646, "reward_std": 0.0003246398082410451, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9981385171413422, "step": 1539 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 3.989637305699482, "grad_norm": 3.183918552909533, "kl": 0.583984375, "learning_rate": 6.010362694300518e-07, "loss": 0.0013, "reward": 2.4985495805740356, "reward_std": 4.846578826800396e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9985496401786804, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9922279792746114, "grad_norm": 0.26937349312061426, "kl": 0.462890625, "learning_rate": 6.007772020725388e-07, "loss": 0.0029, "reward": 2.499994993209839, "reward_std": 2.0291383862058865e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 1541 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.994818652849741, "grad_norm": 3.931783633040277, "kl": 0.732421875, "learning_rate": 6.005181347150259e-07, "loss": 0.0031, "reward": 1.9987534284591675, "reward_std": 4.5306347601581365e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987533688545227, "step": 1542 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.9974093264248705, "grad_norm": 36.73864575901774, "kl": 0.638671875, "learning_rate": 6.00259067357513e-07, "loss": 0.0024, "reward": 2.436386823654175, "reward_std": 0.17723352701614203, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9363868832588196, "step": 1543 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.0, "grad_norm": 3.2938422379538523, "kl": 0.5576171875, "learning_rate": 6e-07, "loss": 0.0021, "reward": 1.4975780248641968, "reward_std": 4.986737258150242e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9975780248641968, "step": 1544 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.0025906735751295, "grad_norm": 2.3050822398799142, "kl": 0.556640625, "learning_rate": 5.997409326424871e-07, "loss": 0.0025, "reward": 1.9986236691474915, "reward_std": 2.500391579474126e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498623788356781, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.005181347150259, "grad_norm": 14.437996767669107, "kl": 0.572265625, "learning_rate": 5.99481865284974e-07, "loss": 0.002, "reward": 1.9922929406166077, "reward_std": 0.0004030813342978945, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.492293119430542, "step": 1546 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.007772020725389, "grad_norm": 9.924067417397616, "kl": 0.65234375, "learning_rate": 5.992227979274611e-07, "loss": 0.0027, "reward": 1.389428198337555, "reward_std": 0.00038284005859168246, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8894282877445221, "step": 1547 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.010362694300518, "grad_norm": 203.10528574214712, "kl": 0.607421875, "learning_rate": 5.989637305699482e-07, "loss": 0.0024, "reward": 2.3747347593307495, "reward_std": 0.23193417203276567, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747347593307495, "step": 1548 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 4.012953367875648, "grad_norm": 0.6631845260621654, "kl": 0.578125, "learning_rate": 5.987046632124352e-07, "loss": 0.0037, "reward": 2.499964714050293, "reward_std": 5.144143585766869e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999644756317139, "step": 1549 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.015544041450777, "grad_norm": 0.22262201431311068, "kl": 0.560546875, "learning_rate": 5.984455958549223e-07, "loss": 0.0027, "reward": 2.4999977350234985, "reward_std": 1.2069747299392475e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.018134715025907, "grad_norm": 0.29151188985054793, "kl": 0.56640625, "learning_rate": 5.981865284974093e-07, "loss": 0.0018, "reward": 2.499996304512024, "reward_std": 2.1979877828925964e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1551 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.020725388601036, "grad_norm": 10.40138961235979, "kl": 0.6171875, "learning_rate": 5.979274611398963e-07, "loss": 0.0021, "reward": 1.796818733215332, "reward_std": 0.0004501073863139027, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.296818882226944, "step": 1552 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.023316062176166, "grad_norm": 0.3103180614015572, "kl": 0.607421875, "learning_rate": 5.976683937823834e-07, "loss": 0.0042, "reward": 2.49999737739563, "reward_std": 1.8482202222003252e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 1553 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.025906735751295, "grad_norm": 1.8889405905973875, "kl": 0.732421875, "learning_rate": 5.974093264248704e-07, "loss": 0.0023, "reward": 1.9991815090179443, "reward_std": 4.385955401176034e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991816878318787, "step": 1554 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.028497409326425, "grad_norm": 0.07115426909031733, "kl": 0.5576171875, "learning_rate": 5.971502590673575e-07, "loss": 0.0032, "reward": 2.4999970197677612, "reward_std": 1.6513946548002423e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.0310880829015545, "grad_norm": 13.512858077769302, "kl": 0.720703125, "learning_rate": 5.968911917098445e-07, "loss": 0.0024, "reward": 2.436524510383606, "reward_std": 0.17806337454135246, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9365244507789612, "step": 1556 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.033678756476684, "grad_norm": 83.77520133168801, "kl": 0.681640625, "learning_rate": 5.966321243523316e-07, "loss": 0.0027, "reward": 2.061656951904297, "reward_std": 0.17711779342243972, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5616569519042969, "step": 1557 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.036269430051814, "grad_norm": 23.64170264909656, "kl": 0.498046875, "learning_rate": 5.963730569948186e-07, "loss": 0.0016, "reward": 2.4996161460876465, "reward_std": 0.0005271399697903689, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996162056922913, "step": 1558 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.038860103626943, "grad_norm": 0.6642756087935933, "kl": 0.703125, "learning_rate": 5.961139896373056e-07, "loss": 0.0032, "reward": 2.499993324279785, "reward_std": 4.491967615649628e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 1559 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.041450777202073, "grad_norm": 8.844619004960176, "kl": 0.62890625, "learning_rate": 5.958549222797927e-07, "loss": 0.0025, "reward": 1.5778818726539612, "reward_std": 0.25914280978031456, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0778818726539612, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.044041450777202, "grad_norm": 65.19586943759566, "kl": 0.59765625, "learning_rate": 5.955958549222798e-07, "loss": 0.0007, "reward": 2.4999959468841553, "reward_std": 3.1090561378732673e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1561 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.046632124352332, "grad_norm": 6.890072062375393, "kl": 0.578125, "learning_rate": 5.953367875647668e-07, "loss": 0.0024, "reward": 2.4999860525131226, "reward_std": 1.0378549177403329e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999860525131226, "step": 1562 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.049222797927461, "grad_norm": 8.272773535312725, "kl": 0.578125, "learning_rate": 5.950777202072539e-07, "loss": 0.0023, "reward": 2.249973237514496, "reward_std": 0.2672760846527922, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749973177909851, "step": 1563 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.051813471502591, "grad_norm": 0.20661008677951956, "kl": 0.5166015625, "learning_rate": 5.948186528497408e-07, "loss": 0.0024, "reward": 2.499996542930603, "reward_std": 2.237595595033781e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 1564 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.05440414507772, "grad_norm": 0.12555056229308964, "kl": 0.587890625, "learning_rate": 5.945595854922279e-07, "loss": 0.0026, "reward": 2.4999966621398926, "reward_std": 2.0597740331140812e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.05699481865285, "grad_norm": 0.26823846297235, "kl": 0.53515625, "learning_rate": 5.94300518134715e-07, "loss": 0.0019, "reward": 2.4999924898147583, "reward_std": 4.826226074783335e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 1566 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.0595854922279795, "grad_norm": 0.3115984426127892, "kl": 0.72265625, "learning_rate": 5.94041450777202e-07, "loss": 0.0038, "reward": 2.499992847442627, "reward_std": 3.457722982602718e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 1567 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.062176165803109, "grad_norm": 0.07646026050877813, "kl": 0.576171875, "learning_rate": 5.937823834196891e-07, "loss": 0.0026, "reward": 2.499997854232788, "reward_std": 1.599062329660228e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 1568 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.064766839378239, "grad_norm": 6.034802884388035, "kl": 0.59765625, "learning_rate": 5.935233160621761e-07, "loss": 0.0014, "reward": 1.9996399879455566, "reward_std": 4.871979575682417e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499640166759491, "step": 1569 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.067357512953368, "grad_norm": 1.5931278902439057, "kl": 0.634765625, "learning_rate": 5.932642487046632e-07, "loss": 0.0031, "reward": 2.499993324279785, "reward_std": 8.062339020398213e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 1570 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.069948186528498, "grad_norm": 0.15831156772417054, "kl": 0.546875, "learning_rate": 5.930051813471502e-07, "loss": 0.002, "reward": 2.49999737739563, "reward_std": 2.2148330458549026e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1571 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.072538860103627, "grad_norm": 0.16255168965053454, "kl": 0.609375, "learning_rate": 5.927461139896372e-07, "loss": 0.0022, "reward": 2.4999945163726807, "reward_std": 2.3712901793260244e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.075129533678757, "grad_norm": 14.667512754260796, "kl": 0.56640625, "learning_rate": 5.924870466321243e-07, "loss": 0.0017, "reward": 1.807647466659546, "reward_std": 0.0005689227422749354, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3076475262641907, "step": 1573 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.077720207253886, "grad_norm": 0.17254008601348433, "kl": 0.6015625, "learning_rate": 5.922279792746113e-07, "loss": 0.0026, "reward": 2.499996781349182, "reward_std": 2.082802836866904e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 1574 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.080310880829016, "grad_norm": 0.13524035666057582, "kl": 0.51953125, "learning_rate": 5.919689119170984e-07, "loss": 0.0014, "reward": 2.4999959468841553, "reward_std": 2.2045591094865813e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 1575 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.082901554404145, "grad_norm": 46.19624340052393, "kl": 0.568359375, "learning_rate": 5.917098445595856e-07, "loss": 0.0022, "reward": 1.7250802516937256, "reward_std": 0.18218115545460023, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.225080281496048, "step": 1576 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.085492227979275, "grad_norm": 0.12442770679462524, "kl": 0.5205078125, "learning_rate": 5.914507772020724e-07, "loss": 0.0025, "reward": 2.499996304512024, "reward_std": 2.0851798439025515e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1577 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 4.0880829015544045, "grad_norm": 176.1283103633914, "kl": 0.65625, "learning_rate": 5.911917098445596e-07, "loss": 0.0026, "reward": 1.5338504314422607, "reward_std": 0.19222562294453382, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0338504016399384, "step": 1578 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.090673575129534, "grad_norm": 4.036404758337464, "kl": 0.740234375, "learning_rate": 5.909326424870466e-07, "loss": 0.0031, "reward": 2.24997615814209, "reward_std": 0.26726677203492955, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499761581420898, "step": 1579 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.0932642487046635, "grad_norm": 446.87500892294065, "kl": 0.677734375, "learning_rate": 5.906735751295337e-07, "loss": 0.0026, "reward": 1.985282301902771, "reward_std": 0.004063126727089639, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4852822721004486, "step": 1580 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.095854922279793, "grad_norm": 0.9072821213148944, "kl": 0.787109375, "learning_rate": 5.904145077720208e-07, "loss": 0.0036, "reward": 2.4999982118606567, "reward_std": 2.4064745502982987e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 1581 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.098445595854923, "grad_norm": 7.4968836709029905, "kl": 0.623046875, "learning_rate": 5.901554404145078e-07, "loss": 0.0029, "reward": 2.437405586242676, "reward_std": 0.1770346817867221, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374055862426758, "step": 1582 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.101036269430052, "grad_norm": 0.06995418438390179, "kl": 0.529296875, "learning_rate": 5.898963730569948e-07, "loss": 0.0026, "reward": 2.4999970197677612, "reward_std": 1.51476302789888e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 1583 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.103626943005182, "grad_norm": 0.9328173143380031, "kl": 0.703125, "learning_rate": 5.896373056994819e-07, "loss": 0.0024, "reward": 2.4999959468841553, "reward_std": 3.555061823590222e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1584 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.106217616580311, "grad_norm": 0.051051994192752165, "kl": 0.60546875, "learning_rate": 5.893782383419689e-07, "loss": 0.0029, "reward": 2.4999985694885254, "reward_std": 1.449777499828997e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 1585 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.108808290155441, "grad_norm": 7.686547556509007, "kl": 0.533203125, "learning_rate": 5.89119170984456e-07, "loss": 0.0025, "reward": 1.8556958436965942, "reward_std": 0.0002849394913937431, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3556957244873047, "step": 1586 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.11139896373057, "grad_norm": 0.5049042948705571, "kl": 0.640625, "learning_rate": 5.88860103626943e-07, "loss": 0.0024, "reward": 2.49999737739563, "reward_std": 2.4049422790994868e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 1587 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.1139896373057, "grad_norm": 0.10960716660015431, "kl": 0.5859375, "learning_rate": 5.886010362694301e-07, "loss": 0.0029, "reward": 2.499998688697815, "reward_std": 1.0121020181941276e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 1588 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.116580310880829, "grad_norm": 4.647572627259293, "kl": 0.587890625, "learning_rate": 5.883419689119171e-07, "loss": 0.0016, "reward": 2.499975800514221, "reward_std": 2.3001595081950654e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999975860118866, "step": 1589 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.119170984455959, "grad_norm": 0.1390424404647077, "kl": 0.619140625, "learning_rate": 5.880829015544041e-07, "loss": 0.0015, "reward": 2.499997615814209, "reward_std": 1.999132791752345e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 1590 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.1217616580310885, "grad_norm": 70.80166265297127, "kl": 0.73046875, "learning_rate": 5.878238341968912e-07, "loss": 0.0031, "reward": 2.3747905492782593, "reward_std": 0.23183369268429033, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747904896736145, "step": 1591 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.124352331606218, "grad_norm": 0.19872167233053264, "kl": 0.6015625, "learning_rate": 5.875647668393782e-07, "loss": 0.0026, "reward": 2.499994158744812, "reward_std": 3.389240191609133e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 1592 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.126943005181348, "grad_norm": 0.5578899038189888, "kl": 0.615234375, "learning_rate": 5.873056994818653e-07, "loss": 0.0031, "reward": 2.499990940093994, "reward_std": 5.1923485102634e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 1593 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.129533678756476, "grad_norm": 3.3537460392941045, "kl": 0.708984375, "learning_rate": 5.870466321243524e-07, "loss": 0.0029, "reward": 1.999929666519165, "reward_std": 9.79922316446391e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999294877052307, "step": 1594 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.132124352331606, "grad_norm": 8.64241512520019, "kl": 0.544921875, "learning_rate": 5.867875647668393e-07, "loss": 0.0029, "reward": 2.4999057054519653, "reward_std": 4.391585616758675e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999055862426758, "step": 1595 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.134715025906735, "grad_norm": 11.43487138417314, "kl": 0.607421875, "learning_rate": 5.865284974093264e-07, "loss": 0.0021, "reward": 2.4374845027923584, "reward_std": 0.17679221460957706, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937484622001648, "step": 1596 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.137305699481865, "grad_norm": 16.91865443281206, "kl": 0.55078125, "learning_rate": 5.862694300518134e-07, "loss": 0.0021, "reward": 2.437298059463501, "reward_std": 0.17733484648465492, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937298059463501, "step": 1597 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.139896373056994, "grad_norm": 31.92323407638644, "kl": 0.587890625, "learning_rate": 5.860103626943005e-07, "loss": 0.0026, "reward": 2.3747737407684326, "reward_std": 0.2317599626089759, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747737407684326, "step": 1598 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.142487046632124, "grad_norm": 0.132352497143174, "kl": 0.5302734375, "learning_rate": 5.857512953367876e-07, "loss": 0.0007, "reward": 2.4999985694885254, "reward_std": 1.1445536074461415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 1599 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.1450777202072535, "grad_norm": 2.3978455352323946, "kl": 0.580078125, "learning_rate": 5.854922279792746e-07, "loss": 0.0022, "reward": 2.4999834299087524, "reward_std": 3.327181090639897e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999832510948181, "step": 1600 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.147668393782383, "grad_norm": 0.7308245229172905, "kl": 0.66796875, "learning_rate": 5.852331606217616e-07, "loss": 0.0024, "reward": 2.499981641769409, "reward_std": 4.5979085143699194e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999981701374054, "step": 1601 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.150259067357513, "grad_norm": 0.617517123830945, "kl": 0.6015625, "learning_rate": 5.849740932642486e-07, "loss": 0.0023, "reward": 2.4999918937683105, "reward_std": 5.057185376244888e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999919533729553, "step": 1602 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.152849740932642, "grad_norm": 25.280836859465396, "kl": 0.625, "learning_rate": 5.847150259067357e-07, "loss": 0.0017, "reward": 2.4997836351394653, "reward_std": 7.843161279197375e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997836351394653, "step": 1603 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.155440414507772, "grad_norm": 42.04318795020656, "kl": 0.4921875, "learning_rate": 5.844559585492228e-07, "loss": 0.0016, "reward": 2.4996185302734375, "reward_std": 0.0005161373189821461, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999618411064148, "step": 1604 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.158031088082901, "grad_norm": 1.6128775095841865, "kl": 0.60546875, "learning_rate": 5.841968911917098e-07, "loss": 0.0029, "reward": 2.4999868869781494, "reward_std": 1.0934297051790054e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 1605 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.160621761658031, "grad_norm": 119.53691716617585, "kl": 0.564453125, "learning_rate": 5.839378238341969e-07, "loss": 0.0023, "reward": 1.9276056289672852, "reward_std": 0.18022769573144615, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.42760568857193, "step": 1606 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.16321243523316, "grad_norm": 3.8207966143976635, "kl": 0.728515625, "learning_rate": 5.836787564766839e-07, "loss": 0.0034, "reward": 1.7968632578849792, "reward_std": 0.0003293002137070289, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2968633472919464, "step": 1607 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.16580310880829, "grad_norm": 0.3482213063995691, "kl": 0.59375, "learning_rate": 5.834196891191709e-07, "loss": 0.0027, "reward": 2.4999982118606567, "reward_std": 1.586449357660058e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1608 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 4.168393782383419, "grad_norm": 45.903895596624764, "kl": 0.62109375, "learning_rate": 5.83160621761658e-07, "loss": 0.0024, "reward": 1.935767412185669, "reward_std": 0.17805244988267077, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4357673823833466, "step": 1609 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.170984455958549, "grad_norm": 39.34252821822125, "kl": 0.66796875, "learning_rate": 5.82901554404145e-07, "loss": 0.0019, "reward": 1.9453968405723572, "reward_std": 0.00033813338768595713, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4453968703746796, "step": 1610 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.1735751295336785, "grad_norm": 9.78042916189634, "kl": 0.55078125, "learning_rate": 5.826424870466321e-07, "loss": 0.0024, "reward": 1.8032630681991577, "reward_std": 0.000661811033523918, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3032631278038025, "step": 1611 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.176165803108808, "grad_norm": 0.8432639015061719, "kl": 0.611328125, "learning_rate": 5.823834196891192e-07, "loss": 0.003, "reward": 2.49998676776886, "reward_std": 8.345733817805012e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986708164215, "step": 1612 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.178756476683938, "grad_norm": 2.2326926123548203, "kl": 0.63671875, "learning_rate": 5.821243523316061e-07, "loss": 0.0034, "reward": 2.4986536502838135, "reward_std": 3.489657956379233e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.998653769493103, "step": 1613 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.181347150259067, "grad_norm": 0.1618053172575831, "kl": 0.546875, "learning_rate": 5.818652849740932e-07, "loss": 0.0021, "reward": 2.499998092651367, "reward_std": 1.4075787220235725e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 1614 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.183937823834197, "grad_norm": 2.9089152928194464, "kl": 0.5703125, "learning_rate": 5.816062176165802e-07, "loss": 0.0023, "reward": 2.4999752044677734, "reward_std": 1.560015789436875e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999749660491943, "step": 1615 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.186528497409326, "grad_norm": 0.10729832916207153, "kl": 0.630859375, "learning_rate": 5.813471502590673e-07, "loss": 0.0025, "reward": 2.499995708465576, "reward_std": 1.9621809883574315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 1616 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.189119170984456, "grad_norm": 6.709198778037411, "kl": 0.5205078125, "learning_rate": 5.810880829015544e-07, "loss": 0.0016, "reward": 1.9636247158050537, "reward_std": 0.0002158153147320263, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4636247754096985, "step": 1617 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.191709844559585, "grad_norm": 0.3260529296936622, "kl": 0.6484375, "learning_rate": 5.808290155440414e-07, "loss": 0.0028, "reward": 2.4999961853027344, "reward_std": 3.953803400236211e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1618 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.194300518134715, "grad_norm": 0.36365895945651155, "kl": 0.5859375, "learning_rate": 5.805699481865284e-07, "loss": 0.0022, "reward": 2.4999911785125732, "reward_std": 4.7082845640034066e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 1619 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.196891191709844, "grad_norm": 11.21680113147946, "kl": 0.607421875, "learning_rate": 5.803108808290154e-07, "loss": 0.0022, "reward": 1.9765933752059937, "reward_std": 0.00017748495929481578, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4765934348106384, "step": 1620 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 4.199481865284974, "grad_norm": 8.007698177712976, "kl": 0.5703125, "learning_rate": 5.800518134715026e-07, "loss": 0.002, "reward": 1.9320799112319946, "reward_std": 0.09347817705230455, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4320799112319946, "step": 1621 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.2020725388601035, "grad_norm": 9.257870332077497, "kl": 0.6171875, "learning_rate": 5.797927461139897e-07, "loss": 0.0025, "reward": 1.3844432830810547, "reward_std": 0.0003805444575846195, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8844432830810547, "step": 1622 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.204663212435233, "grad_norm": 3.548646755965484, "kl": 0.591796875, "learning_rate": 5.795336787564767e-07, "loss": 0.0026, "reward": 1.912053108215332, "reward_std": 0.00016140227603500534, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4120529890060425, "step": 1623 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.2072538860103625, "grad_norm": 15.17980436782388, "kl": 0.59765625, "learning_rate": 5.792746113989638e-07, "loss": 0.0032, "reward": 1.999588429927826, "reward_std": 0.0002578923082410256, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995883703231812, "step": 1624 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.209844559585492, "grad_norm": 0.13114918247188348, "kl": 0.501953125, "learning_rate": 5.790155440414507e-07, "loss": 0.0017, "reward": 2.499997615814209, "reward_std": 2.739557316999708e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 1625 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.212435233160622, "grad_norm": 0.46453081904447524, "kl": 0.59765625, "learning_rate": 5.787564766839378e-07, "loss": 0.0014, "reward": 2.4999845027923584, "reward_std": 4.257096406945493e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999844431877136, "step": 1626 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.215025906735751, "grad_norm": 5.202095193134, "kl": 0.615234375, "learning_rate": 5.784974093264249e-07, "loss": 0.0032, "reward": 2.368348479270935, "reward_std": 0.2441658568332059, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.868348479270935, "step": 1627 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.217616580310881, "grad_norm": 4.476154344917383, "kl": 0.564453125, "learning_rate": 5.782383419689119e-07, "loss": 0.0023, "reward": 1.9997096061706543, "reward_std": 3.3951238492591074e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997096061706543, "step": 1628 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.22020725388601, "grad_norm": 0.1477563787146229, "kl": 0.4365234375, "learning_rate": 5.77979274611399e-07, "loss": 0.0012, "reward": 2.499993681907654, "reward_std": 2.7002355125205213e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 1629 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.22279792746114, "grad_norm": 0.15175240887176852, "kl": 0.583984375, "learning_rate": 5.777202072538861e-07, "loss": 0.0025, "reward": 2.4999961853027344, "reward_std": 2.443358539494511e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1630 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.225388601036269, "grad_norm": 0.2692670830516244, "kl": 0.603515625, "learning_rate": 5.77461139896373e-07, "loss": 0.0024, "reward": 2.4999969005584717, "reward_std": 2.649532063969673e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1631 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.227979274611399, "grad_norm": 5.414924910425309, "kl": 0.609375, "learning_rate": 5.772020725388601e-07, "loss": 0.0017, "reward": 2.4985954761505127, "reward_std": 5.5125294011304504e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.998595654964447, "step": 1632 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.230569948186528, "grad_norm": 13.720236557417483, "kl": 0.59375, "learning_rate": 5.769430051813471e-07, "loss": 0.0018, "reward": 1.9987622499465942, "reward_std": 0.00010141330039914465, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498762309551239, "step": 1633 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.233160621761658, "grad_norm": 1.1627309784041804, "kl": 0.646484375, "learning_rate": 5.766839378238342e-07, "loss": 0.0016, "reward": 2.499995708465576, "reward_std": 3.1657982617616653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 1634 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.2357512953367875, "grad_norm": 0.18817429489113038, "kl": 0.5380859375, "learning_rate": 5.764248704663213e-07, "loss": 0.0017, "reward": 2.4999958276748657, "reward_std": 3.296565353139158e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1635 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.238341968911917, "grad_norm": 1.1908659211970154, "kl": 0.59765625, "learning_rate": 5.761658031088083e-07, "loss": 0.002, "reward": 2.4999576807022095, "reward_std": 1.3231047660156037e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999577403068542, "step": 1636 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.240932642487047, "grad_norm": 0.4278363164284021, "kl": 0.59375, "learning_rate": 5.759067357512953e-07, "loss": 0.0024, "reward": 2.4999940395355225, "reward_std": 3.60518049546954e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 1637 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.243523316062176, "grad_norm": 137.28632729901813, "kl": 0.560546875, "learning_rate": 5.756476683937823e-07, "loss": 0.0022, "reward": 2.249587059020996, "reward_std": 0.4362209737300873, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749587059020996, "step": 1638 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.246113989637306, "grad_norm": 0.8084574041117093, "kl": 0.466796875, "learning_rate": 5.753886010362694e-07, "loss": 0.0016, "reward": 1.9994917511940002, "reward_std": 2.0866170530098316e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994916915893555, "step": 1639 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.248704663212435, "grad_norm": 17.010369189997384, "kl": 0.5, "learning_rate": 5.751295336787565e-07, "loss": 0.0021, "reward": 2.2498812675476074, "reward_std": 0.26738557077308656, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7498813271522522, "step": 1640 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.251295336787565, "grad_norm": 0.2955273179262185, "kl": 0.5771484375, "learning_rate": 5.748704663212435e-07, "loss": 0.0029, "reward": 2.499990463256836, "reward_std": 4.623496579370112e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999904036521912, "step": 1641 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.253886010362694, "grad_norm": 2.2239185179897967, "kl": 0.5595703125, "learning_rate": 5.746113989637306e-07, "loss": 0.0011, "reward": 2.499986171722412, "reward_std": 1.693834474281175e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999862909317017, "step": 1642 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.256476683937824, "grad_norm": 0.27420898882976713, "kl": 0.49609375, "learning_rate": 5.743523316062175e-07, "loss": 0.0024, "reward": 2.4999947547912598, "reward_std": 3.902070375261246e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 1643 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 4.259067357512953, "grad_norm": 36.03492525884097, "kl": 0.5859375, "learning_rate": 5.740932642487046e-07, "loss": 0.0024, "reward": 1.4319944381713867, "reward_std": 0.3108383885701187, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9319943487644196, "step": 1644 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.261658031088083, "grad_norm": 3.4659050334171386, "kl": 0.5380859375, "learning_rate": 5.738341968911917e-07, "loss": 0.0016, "reward": 2.499990701675415, "reward_std": 4.7403360099451675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 1645 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.2642487046632125, "grad_norm": 0.42345049200856427, "kl": 0.595703125, "learning_rate": 5.735751295336787e-07, "loss": 0.0015, "reward": 2.4999959468841553, "reward_std": 3.1556160706713854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 1646 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.266839378238342, "grad_norm": 21.543836015808335, "kl": 0.568359375, "learning_rate": 5.733160621761658e-07, "loss": 0.0016, "reward": 2.4997191429138184, "reward_std": 0.0002408270258911216, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997192025184631, "step": 1647 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.269430051813472, "grad_norm": 0.7903153031082575, "kl": 0.599609375, "learning_rate": 5.730569948186528e-07, "loss": 0.0015, "reward": 2.499993085861206, "reward_std": 4.919893626720295e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 1648 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.272020725388601, "grad_norm": 18.701499190485887, "kl": 0.625, "learning_rate": 5.727979274611398e-07, "loss": 0.0026, "reward": 2.3747841119766235, "reward_std": 0.2318441204608348, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747839331626892, "step": 1649 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.274611398963731, "grad_norm": 5.942674959053139, "kl": 0.65625, "learning_rate": 5.725388601036269e-07, "loss": 0.0019, "reward": 1.9898574948310852, "reward_std": 8.745621454409047e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4898576140403748, "step": 1650 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.27720207253886, "grad_norm": 36.703283324141054, "kl": 0.689453125, "learning_rate": 5.722797927461139e-07, "loss": 0.0018, "reward": 1.997505784034729, "reward_std": 0.00013076025470581953, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975058436393738, "step": 1651 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.27979274611399, "grad_norm": 31.7745005273966, "kl": 0.673828125, "learning_rate": 5.72020725388601e-07, "loss": 0.0026, "reward": 2.3119869232177734, "reward_std": 0.2594574447416562, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8119871020317078, "step": 1652 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 4.282383419689119, "grad_norm": 25.063873953050138, "kl": 0.5419921875, "learning_rate": 5.717616580310881e-07, "loss": 0.0017, "reward": 2.414185047149658, "reward_std": 0.24271571280524995, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9141851663589478, "step": 1653 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.284974093264249, "grad_norm": 10.400713917434995, "kl": 0.638671875, "learning_rate": 5.715025906735751e-07, "loss": 0.0031, "reward": 1.9988110661506653, "reward_std": 0.00030792295433457184, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988110661506653, "step": 1654 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.287564766839378, "grad_norm": 2.461217767085106, "kl": 0.466796875, "learning_rate": 5.712435233160621e-07, "loss": 0.0022, "reward": 2.499971628189087, "reward_std": 1.7660778553363343e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999718070030212, "step": 1655 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.290155440414508, "grad_norm": 4.145031513342229, "kl": 0.591796875, "learning_rate": 5.709844559585491e-07, "loss": 0.0026, "reward": 1.9989597797393799, "reward_std": 3.4292488408027566e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989598393440247, "step": 1656 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.2927461139896375, "grad_norm": 24.77837514939434, "kl": 0.490234375, "learning_rate": 5.707253886010362e-07, "loss": 0.0019, "reward": 2.374297618865967, "reward_std": 0.35366079350205837, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8742976784706116, "step": 1657 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.295336787564767, "grad_norm": 0.7623374728522584, "kl": 0.861328125, "learning_rate": 5.704663212435233e-07, "loss": 0.0032, "reward": 2.4999938011169434, "reward_std": 4.728751719085267e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 1658 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.2979274611398965, "grad_norm": 2.5077381941477017, "kl": 0.5380859375, "learning_rate": 5.702072538860103e-07, "loss": 0.0016, "reward": 2.4999916553497314, "reward_std": 1.200761550990137e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 1659 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.300518134715026, "grad_norm": 5.31320853941034, "kl": 0.580078125, "learning_rate": 5.699481865284974e-07, "loss": 0.0023, "reward": 1.993963360786438, "reward_std": 0.0001938266562433455, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4939634203910828, "step": 1660 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.303108808290156, "grad_norm": 11.949779424136578, "kl": 0.548828125, "learning_rate": 5.696891191709843e-07, "loss": 0.0017, "reward": 2.499791741371155, "reward_std": 0.000314659254883054, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997919201850891, "step": 1661 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.305699481865285, "grad_norm": 0.30834671176015715, "kl": 0.6015625, "learning_rate": 5.694300518134714e-07, "loss": 0.0024, "reward": 2.499994158744812, "reward_std": 3.0561388939531753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 1662 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.308290155440415, "grad_norm": 4.616149115036305, "kl": 0.599609375, "learning_rate": 5.691709844559586e-07, "loss": 0.0021, "reward": 1.9982799291610718, "reward_std": 4.13513469368354e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982800781726837, "step": 1663 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.310880829015544, "grad_norm": 12.55090800342937, "kl": 0.6484375, "learning_rate": 5.689119170984456e-07, "loss": 0.0033, "reward": 1.9996336102485657, "reward_std": 4.750211064674659e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499633550643921, "step": 1664 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.313471502590674, "grad_norm": 0.2884094402116578, "kl": 0.595703125, "learning_rate": 5.686528497409327e-07, "loss": 0.0025, "reward": 2.499993324279785, "reward_std": 3.5078776363661746e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 1665 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.316062176165803, "grad_norm": 0.5033082290586768, "kl": 0.640625, "learning_rate": 5.683937823834197e-07, "loss": 0.0021, "reward": 2.4999923706054688, "reward_std": 3.2857733458513394e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 1666 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.318652849740933, "grad_norm": 1.6584479437826825, "kl": 0.67578125, "learning_rate": 5.681347150259067e-07, "loss": 0.0017, "reward": 2.499986171722412, "reward_std": 1.2074332744305138e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999862313270569, "step": 1667 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.321243523316062, "grad_norm": 0.6413122994354817, "kl": 0.5234375, "learning_rate": 5.678756476683938e-07, "loss": 0.0012, "reward": 2.4999935626983643, "reward_std": 5.578616082857479e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993622303009, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.323834196891192, "grad_norm": 0.5250146719646576, "kl": 0.591796875, "learning_rate": 5.676165803108808e-07, "loss": 0.0022, "reward": 2.499895930290222, "reward_std": 8.456581554128206e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998960494995117, "step": 1669 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.3264248704663215, "grad_norm": 19.677519175704255, "kl": 0.599609375, "learning_rate": 5.673575129533679e-07, "loss": 0.0022, "reward": 1.997769296169281, "reward_std": 6.042470937472899e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4977693259716034, "step": 1670 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.329015544041451, "grad_norm": 0.6545045101955983, "kl": 0.53125, "learning_rate": 5.670984455958549e-07, "loss": 0.0014, "reward": 2.4999938011169434, "reward_std": 2.0695933926617727e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 1671 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.331606217616581, "grad_norm": 0.17350085020625586, "kl": 0.58984375, "learning_rate": 5.66839378238342e-07, "loss": 0.0028, "reward": 2.4999974966049194, "reward_std": 2.7491719265526626e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1672 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.33419689119171, "grad_norm": 1.6732636589231502, "kl": 0.78515625, "learning_rate": 5.66580310880829e-07, "loss": 0.0025, "reward": 2.4999890327453613, "reward_std": 9.537919595459243e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999892115592957, "step": 1673 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.33678756476684, "grad_norm": 143.74576354126248, "kl": 0.884765625, "learning_rate": 5.66321243523316e-07, "loss": 0.0036, "reward": 1.4693045616149902, "reward_std": 0.02534233225742355, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9693045020103455, "step": 1674 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.339378238341969, "grad_norm": 0.3592438152054522, "kl": 0.513671875, "learning_rate": 5.660621761658031e-07, "loss": 0.004, "reward": 2.4999945163726807, "reward_std": 1.987079429000005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 1675 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.341968911917099, "grad_norm": 40.94326485893488, "kl": 0.646484375, "learning_rate": 5.658031088082901e-07, "loss": 0.0018, "reward": 1.996316909790039, "reward_std": 0.0038121540081874628, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4963169693946838, "step": 1676 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.344559585492228, "grad_norm": 0.2714649585590271, "kl": 0.515625, "learning_rate": 5.655440414507772e-07, "loss": 0.0015, "reward": 2.4999961853027344, "reward_std": 2.5908908014571352e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1677 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.347150259067358, "grad_norm": 0.09820565116898158, "kl": 0.556640625, "learning_rate": 5.652849740932643e-07, "loss": 0.0024, "reward": 2.499998092651367, "reward_std": 1.1254048786213389e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 1678 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.349740932642487, "grad_norm": 6.530897770191923, "kl": 0.5625, "learning_rate": 5.650259067357512e-07, "loss": 0.003, "reward": 2.499991536140442, "reward_std": 6.300147219917562e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 1679 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.352331606217617, "grad_norm": 0.5295991897643237, "kl": 0.521484375, "learning_rate": 5.647668393782383e-07, "loss": 0.0025, "reward": 2.4999972581863403, "reward_std": 4.091645024573154e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1680 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.3549222797927465, "grad_norm": 1.4322167881998784, "kl": 0.55859375, "learning_rate": 5.645077720207254e-07, "loss": 0.0018, "reward": 2.499990224838257, "reward_std": 1.1676072972477414e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990463256836, "step": 1681 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.357512953367876, "grad_norm": 7.749092245200813, "kl": 0.564453125, "learning_rate": 5.642487046632124e-07, "loss": 0.0019, "reward": 1.8439560532569885, "reward_std": 0.00033071260651240664, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3439561128616333, "step": 1682 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 4.360103626943006, "grad_norm": 49.14468990809364, "kl": 0.65234375, "learning_rate": 5.639896373056995e-07, "loss": 0.0029, "reward": 2.186295509338379, "reward_std": 0.25977031654730354, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6862954497337341, "step": 1683 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.362694300518135, "grad_norm": 2.0193399604518705, "kl": 0.65625, "learning_rate": 5.637305699481865e-07, "loss": 0.0009, "reward": 2.4999890327453613, "reward_std": 4.434450602275319e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999892115592957, "step": 1684 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.365284974093265, "grad_norm": 1.299432015783731, "kl": 0.56640625, "learning_rate": 5.634715025906735e-07, "loss": 0.001, "reward": 2.4999977350234985, "reward_std": 1.795968131546033e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 1685 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.367875647668393, "grad_norm": 46.55266305585827, "kl": 0.66015625, "learning_rate": 5.632124352331606e-07, "loss": 0.0028, "reward": 2.249366521835327, "reward_std": 0.26793685653643706, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7493664622306824, "step": 1686 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.370466321243523, "grad_norm": 18.124875970656927, "kl": 0.64453125, "learning_rate": 5.629533678756476e-07, "loss": 0.0025, "reward": 1.4985905885696411, "reward_std": 5.105234777147416e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9985905289649963, "step": 1687 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.373056994818652, "grad_norm": 1.6213300703040006, "kl": 0.625, "learning_rate": 5.626943005181347e-07, "loss": 0.0028, "reward": 2.4999959468841553, "reward_std": 2.1180981093493756e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 1688 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.375647668393782, "grad_norm": 65.56369435561938, "kl": 0.58984375, "learning_rate": 5.624352331606217e-07, "loss": 0.0015, "reward": 2.0606095790863037, "reward_std": 0.17720103754976435, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5606095790863037, "step": 1689 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.3782383419689115, "grad_norm": 5.393051172743816, "kl": 0.591796875, "learning_rate": 5.621761658031088e-07, "loss": 0.0029, "reward": 2.4998871088027954, "reward_std": 2.7216264243179467e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998871684074402, "step": 1690 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 4.380829015544041, "grad_norm": 8.243253032552285, "kl": 0.595703125, "learning_rate": 5.619170984455959e-07, "loss": 0.002, "reward": 2.4180067777633667, "reward_std": 0.2319025509225412, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9180067777633667, "step": 1691 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.383419689119171, "grad_norm": 5.0569368304393905, "kl": 0.625, "learning_rate": 5.616580310880828e-07, "loss": 0.003, "reward": 1.996970534324646, "reward_std": 5.268523182166973e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4969704747200012, "step": 1692 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.3860103626943, "grad_norm": 0.8050594850960735, "kl": 0.59765625, "learning_rate": 5.613989637305699e-07, "loss": 0.0024, "reward": 2.4999959468841553, "reward_std": 5.5609139053558465e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 1693 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.38860103626943, "grad_norm": 7.240580422554393, "kl": 0.611328125, "learning_rate": 5.611398963730569e-07, "loss": 0.003, "reward": 1.7269903421401978, "reward_std": 0.0004363310087001082, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.226990282535553, "step": 1694 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.391191709844559, "grad_norm": 0.08911179793859764, "kl": 0.5390625, "learning_rate": 5.60880829015544e-07, "loss": 0.0019, "reward": 2.4999969005584717, "reward_std": 1.256033129948264e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1695 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.393782383419689, "grad_norm": 0.6779476701965703, "kl": 0.86328125, "learning_rate": 5.606217616580311e-07, "loss": 0.0033, "reward": 2.499997615814209, "reward_std": 2.0959851099178195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 1696 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.396373056994818, "grad_norm": 0.8537262695488647, "kl": 0.51171875, "learning_rate": 5.60362694300518e-07, "loss": 0.0019, "reward": 2.499995470046997, "reward_std": 2.3593906917085405e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 1697 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.398963730569948, "grad_norm": 8.758586583301696, "kl": 0.72265625, "learning_rate": 5.601036269430051e-07, "loss": 0.0032, "reward": 2.4999879598617554, "reward_std": 1.07846685750701e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881386756897, "step": 1698 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.401554404145077, "grad_norm": 0.763303384589887, "kl": 0.646484375, "learning_rate": 5.598445595854921e-07, "loss": 0.002, "reward": 2.499988555908203, "reward_std": 6.400082384061534e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999884366989136, "step": 1699 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.404145077720207, "grad_norm": 1.0320295893872449, "kl": 0.56640625, "learning_rate": 5.595854922279792e-07, "loss": 0.0023, "reward": 2.49999463558197, "reward_std": 5.971898190182401e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 1700 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.4067357512953365, "grad_norm": 0.17166854282970634, "kl": 0.66015625, "learning_rate": 5.593264248704663e-07, "loss": 0.0029, "reward": 2.49999737739563, "reward_std": 2.20162591801909e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1701 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.409326424870466, "grad_norm": 2.413487864792115, "kl": 0.5556640625, "learning_rate": 5.590673575129533e-07, "loss": 0.0026, "reward": 1.9987526535987854, "reward_std": 2.8457197913667187e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987527132034302, "step": 1702 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.4119170984455955, "grad_norm": 5.2644224633165235, "kl": 0.62109375, "learning_rate": 5.588082901554404e-07, "loss": 0.0029, "reward": 1.9333974719047546, "reward_std": 0.0003168512972706594, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4333974123001099, "step": 1703 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.414507772020725, "grad_norm": 0.44166578440456256, "kl": 0.6640625, "learning_rate": 5.585492227979274e-07, "loss": 0.0034, "reward": 2.4999914169311523, "reward_std": 4.591734523273772e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 1704 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.417098445595855, "grad_norm": 64.20703102071914, "kl": 0.5283203125, "learning_rate": 5.582901554404144e-07, "loss": 0.0025, "reward": 2.1868544816970825, "reward_std": 0.2593055053558828, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.686854362487793, "step": 1705 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.419689119170984, "grad_norm": 0.41342035790400306, "kl": 0.64453125, "learning_rate": 5.580310880829016e-07, "loss": 0.003, "reward": 2.49999737739563, "reward_std": 3.4835896371987474e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 1706 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.422279792746114, "grad_norm": 0.6877105763279274, "kl": 0.578125, "learning_rate": 5.577720207253886e-07, "loss": 0.0026, "reward": 2.4999961853027344, "reward_std": 3.737585814178601e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 1707 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.424870466321243, "grad_norm": 0.25396066176368276, "kl": 0.5458984375, "learning_rate": 5.575129533678757e-07, "loss": 0.0022, "reward": 2.4999942779541016, "reward_std": 3.2043559485828155e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 1708 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.427461139896373, "grad_norm": 0.27954146396704527, "kl": 0.55078125, "learning_rate": 5.572538860103628e-07, "loss": 0.0021, "reward": 2.4999969005584717, "reward_std": 2.5200953359671985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1709 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.430051813471502, "grad_norm": 32.39677504891524, "kl": 0.517578125, "learning_rate": 5.569948186528497e-07, "loss": 0.003, "reward": 2.374135136604309, "reward_std": 0.23305650555340662, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874135136604309, "step": 1710 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.432642487046632, "grad_norm": 0.19391296921169693, "kl": 0.609375, "learning_rate": 5.567357512953368e-07, "loss": 0.0026, "reward": 2.499995708465576, "reward_std": 2.7248544256508467e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 1711 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.435233160621761, "grad_norm": 0.1856013323574657, "kl": 0.58203125, "learning_rate": 5.564766839378238e-07, "loss": 0.0019, "reward": 2.4999969005584717, "reward_std": 2.02130672732892e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1712 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.437823834196891, "grad_norm": 0.07467217358583411, "kl": 0.4873046875, "learning_rate": 5.562176165803109e-07, "loss": 0.0022, "reward": 2.499997854232788, "reward_std": 1.0300754524905642e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 1713 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.4404145077720205, "grad_norm": 0.4894385554645558, "kl": 0.62109375, "learning_rate": 5.55958549222798e-07, "loss": 0.002, "reward": 2.4999709129333496, "reward_std": 5.529495240352844e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999708533287048, "step": 1714 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.44300518134715, "grad_norm": 0.19292670884726382, "kl": 0.666015625, "learning_rate": 5.55699481865285e-07, "loss": 0.0024, "reward": 2.4999908208847046, "reward_std": 2.8175333000035607e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 1715 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.44559585492228, "grad_norm": 8.513509418046525, "kl": 0.611328125, "learning_rate": 5.55440414507772e-07, "loss": 0.0022, "reward": 1.9991374015808105, "reward_std": 8.173301284841727e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991374611854553, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.448186528497409, "grad_norm": 0.15226180975762416, "kl": 0.5048828125, "learning_rate": 5.55181347150259e-07, "loss": 0.0016, "reward": 2.499997138977051, "reward_std": 1.2029148592773709e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 1717 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.450777202072539, "grad_norm": 0.49382811073782545, "kl": 0.64453125, "learning_rate": 5.549222797927461e-07, "loss": 0.0029, "reward": 1.999888300895691, "reward_std": 8.542884415874141e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499888300895691, "step": 1718 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.453367875647668, "grad_norm": 0.6436644602527903, "kl": 0.69921875, "learning_rate": 5.546632124352332e-07, "loss": 0.0039, "reward": 2.4999945163726807, "reward_std": 5.938922072346031e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 1719 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 4.455958549222798, "grad_norm": 1.4365184731975895, "kl": 0.60546875, "learning_rate": 5.544041450777202e-07, "loss": 0.002, "reward": 2.499987840652466, "reward_std": 5.248312390904175e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999878406524658, "step": 1720 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.458549222797927, "grad_norm": 0.8306369465720925, "kl": 0.619140625, "learning_rate": 5.541450777202073e-07, "loss": 0.0028, "reward": 2.499985933303833, "reward_std": 5.733912075811531e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999860525131226, "step": 1721 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.461139896373057, "grad_norm": 0.8246551009247174, "kl": 0.583984375, "learning_rate": 5.538860103626942e-07, "loss": 0.003, "reward": 2.4999903440475464, "reward_std": 5.1410165724519175e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999903440475464, "step": 1722 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 4.463730569948186, "grad_norm": 1.884398117807914, "kl": 0.619140625, "learning_rate": 5.536269430051813e-07, "loss": 0.0018, "reward": 2.4999903440475464, "reward_std": 8.55171788316511e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902844429016, "step": 1723 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.466321243523316, "grad_norm": 0.12069848168545903, "kl": 0.609375, "learning_rate": 5.533678756476684e-07, "loss": 0.0032, "reward": 2.4999979734420776, "reward_std": 1.718201474432135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 1724 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.4689119170984455, "grad_norm": 45.017870981838826, "kl": 0.650390625, "learning_rate": 5.531088082901554e-07, "loss": 0.0031, "reward": 1.9975946545600891, "reward_std": 0.00019702018096268148, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975946247577667, "step": 1725 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 4.471502590673575, "grad_norm": 45.93537412817835, "kl": 0.53125, "learning_rate": 5.528497409326425e-07, "loss": 0.0018, "reward": 2.2376590967178345, "reward_std": 0.2820798219547669, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7376590371131897, "step": 1726 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.474093264248705, "grad_norm": 33.21720103278531, "kl": 0.56640625, "learning_rate": 5.525906735751296e-07, "loss": 0.0028, "reward": 1.9836583137512207, "reward_std": 0.00023278411543969924, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4836581945419312, "step": 1727 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.476683937823834, "grad_norm": 2.6765200981071344, "kl": 0.53125, "learning_rate": 5.523316062176165e-07, "loss": 0.0018, "reward": 2.4999945163726807, "reward_std": 3.078778036069707e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 1728 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.479274611398964, "grad_norm": 0.5180916361276481, "kl": 0.52734375, "learning_rate": 5.520725388601036e-07, "loss": 0.0026, "reward": 2.4999927282333374, "reward_std": 6.626250069530215e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 1729 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.481865284974093, "grad_norm": 14.76436225420233, "kl": 0.6015625, "learning_rate": 5.518134715025906e-07, "loss": 0.0023, "reward": 1.9968973398208618, "reward_std": 0.000256593877566047, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4968973994255066, "step": 1730 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.484455958549223, "grad_norm": 0.27529813095907957, "kl": 0.5263671875, "learning_rate": 5.515544041450777e-07, "loss": 0.0017, "reward": 2.49999463558197, "reward_std": 1.960438368087125e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 1731 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.487046632124352, "grad_norm": 62.93472175394802, "kl": 0.654296875, "learning_rate": 5.512953367875648e-07, "loss": 0.0027, "reward": 1.4828532934188843, "reward_std": 0.007225952664157376, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9828532636165619, "step": 1732 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.489637305699482, "grad_norm": 0.28130082006137463, "kl": 0.5546875, "learning_rate": 5.510362694300518e-07, "loss": 0.0024, "reward": 2.499990701675415, "reward_std": 4.425520273798611e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999906420707703, "step": 1733 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.492227979274611, "grad_norm": 1.0026373488526263, "kl": 0.671875, "learning_rate": 5.507772020725388e-07, "loss": 0.003, "reward": 1.999397337436676, "reward_std": 1.1443803060728897e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499397337436676, "step": 1734 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.494818652849741, "grad_norm": 86.54388428057185, "kl": 0.533203125, "learning_rate": 5.505181347150258e-07, "loss": 0.0027, "reward": 1.9938467741012573, "reward_std": 0.0006588325263692241, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.493846595287323, "step": 1735 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.4974093264248705, "grad_norm": 2.1991043099599388, "kl": 0.5859375, "learning_rate": 5.502590673575129e-07, "loss": 0.0035, "reward": 2.4999252557754517, "reward_std": 1.4761043530597817e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999925136566162, "step": 1736 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.5, "grad_norm": 1.3175576332838865, "kl": 0.638671875, "learning_rate": 5.5e-07, "loss": 0.0033, "reward": 2.4998961687088013, "reward_std": 1.7315312106802594e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998961687088013, "step": 1737 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.5025906735751295, "grad_norm": 9.976737680756596, "kl": 0.611328125, "learning_rate": 5.49740932642487e-07, "loss": 0.0017, "reward": 1.993876576423645, "reward_std": 0.00012815303620072882, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4938766956329346, "step": 1738 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.505181347150259, "grad_norm": 0.3735991687966832, "kl": 0.5263671875, "learning_rate": 5.494818652849741e-07, "loss": 0.002, "reward": 2.4999938011169434, "reward_std": 2.4989986400214548e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 1739 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.507772020725389, "grad_norm": 0.5662354901086509, "kl": 0.560546875, "learning_rate": 5.49222797927461e-07, "loss": 0.0021, "reward": 2.499995708465576, "reward_std": 2.6010900455730734e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1740 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.510362694300518, "grad_norm": 0.18412599754413583, "kl": 0.580078125, "learning_rate": 5.489637305699481e-07, "loss": 0.0007, "reward": 2.499994993209839, "reward_std": 2.231975429367594e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 1741 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.512953367875648, "grad_norm": 4.240510262971168, "kl": 0.5859375, "learning_rate": 5.487046632124352e-07, "loss": 0.0024, "reward": 1.9993302822113037, "reward_std": 3.821921188773558e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993302524089813, "step": 1742 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.515544041450777, "grad_norm": 3.680981954038069, "kl": 0.7421875, "learning_rate": 5.484455958549222e-07, "loss": 0.0034, "reward": 2.499789237976074, "reward_std": 4.25005146098556e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997891187667847, "step": 1743 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.518134715025907, "grad_norm": 0.10243696365282443, "kl": 0.615234375, "learning_rate": 5.481865284974093e-07, "loss": 0.0042, "reward": 2.4999988079071045, "reward_std": 8.118849450511334e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 1744 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.520725388601036, "grad_norm": 0.4681931290980738, "kl": 0.525390625, "learning_rate": 5.479274611398963e-07, "loss": 0.002, "reward": 2.499995231628418, "reward_std": 3.00880640224932e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 1745 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.523316062176166, "grad_norm": 0.45157542846594884, "kl": 0.580078125, "learning_rate": 5.476683937823833e-07, "loss": 0.0023, "reward": 2.4999945163726807, "reward_std": 2.951791088889877e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 1746 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.525906735751295, "grad_norm": 0.32203836376902867, "kl": 0.693359375, "learning_rate": 5.474093264248704e-07, "loss": 0.0022, "reward": 2.499995231628418, "reward_std": 3.3348059673699026e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 1747 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.528497409326425, "grad_norm": 3.472293660880611, "kl": 0.5703125, "learning_rate": 5.471502590673574e-07, "loss": 0.0022, "reward": 1.4029377102851868, "reward_std": 0.00028584838582901284, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9029377102851868, "step": 1748 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.5310880829015545, "grad_norm": 0.32004810531494765, "kl": 0.5400390625, "learning_rate": 5.468911917098446e-07, "loss": 0.0023, "reward": 2.4999920129776, "reward_std": 3.146490143990377e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 1749 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.533678756476684, "grad_norm": 2.6767978672456536, "kl": 0.58984375, "learning_rate": 5.466321243523317e-07, "loss": 0.0025, "reward": 1.9186439514160156, "reward_std": 0.00011860368687166556, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4186439216136932, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.536269430051814, "grad_norm": 0.8013119169757229, "kl": 0.62890625, "learning_rate": 5.463730569948187e-07, "loss": 0.0021, "reward": 1.9990673661231995, "reward_std": 1.597636696715199e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499067485332489, "step": 1751 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.538860103626943, "grad_norm": 0.2899704822789808, "kl": 0.5166015625, "learning_rate": 5.461139896373057e-07, "loss": 0.0019, "reward": 2.499998092651367, "reward_std": 1.7281186472928312e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.541450777202073, "grad_norm": 0.43606831311901106, "kl": 0.61328125, "learning_rate": 5.458549222797927e-07, "loss": 0.0029, "reward": 2.4999969005584717, "reward_std": 3.5951325116911903e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1753 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.544041450777202, "grad_norm": 0.2249595164823007, "kl": 0.55078125, "learning_rate": 5.455958549222798e-07, "loss": 0.0023, "reward": 2.499989628791809, "reward_std": 3.529892524056777e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989628791809, "step": 1754 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.546632124352332, "grad_norm": 0.8550747968065899, "kl": 0.623046875, "learning_rate": 5.453367875647669e-07, "loss": 0.0025, "reward": 1.997801423072815, "reward_std": 2.5715355832289788e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978015422821045, "step": 1755 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 4.549222797927461, "grad_norm": 24.61123945565303, "kl": 0.5517578125, "learning_rate": 5.450777202072539e-07, "loss": 0.0023, "reward": 1.7269089221954346, "reward_std": 0.23277018254157156, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2269088625907898, "step": 1756 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.551813471502591, "grad_norm": 0.32302217913866643, "kl": 0.576171875, "learning_rate": 5.44818652849741e-07, "loss": 0.0019, "reward": 2.499995231628418, "reward_std": 3.7167684467931394e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 1757 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.55440414507772, "grad_norm": 1.6605268892005571, "kl": 0.56640625, "learning_rate": 5.445595854922279e-07, "loss": 0.0023, "reward": 2.499987006187439, "reward_std": 1.2756046771755791e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999868869781494, "step": 1758 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.55699481865285, "grad_norm": 0.2575292243714762, "kl": 0.6171875, "learning_rate": 5.44300518134715e-07, "loss": 0.0007, "reward": 2.4999959468841553, "reward_std": 2.1915997763244377e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1759 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.5595854922279795, "grad_norm": 0.15290801796079534, "kl": 0.5673828125, "learning_rate": 5.440414507772021e-07, "loss": 0.0018, "reward": 2.499996304512024, "reward_std": 2.3289724140340695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 1760 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.562176165803109, "grad_norm": 0.14637406481393658, "kl": 0.591796875, "learning_rate": 5.437823834196891e-07, "loss": 0.0035, "reward": 2.4999972581863403, "reward_std": 1.7768437032827933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 1761 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.564766839378239, "grad_norm": 0.13265636769476613, "kl": 0.53515625, "learning_rate": 5.435233160621762e-07, "loss": 0.003, "reward": 2.4999974966049194, "reward_std": 1.460764337934961e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 1762 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.567357512953368, "grad_norm": 0.18047171317622523, "kl": 0.646484375, "learning_rate": 5.432642487046632e-07, "loss": 0.003, "reward": 2.4999979734420776, "reward_std": 1.1993039379376569e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 1763 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.569948186528498, "grad_norm": 23.235751212395655, "kl": 0.654296875, "learning_rate": 5.430051813471502e-07, "loss": 0.003, "reward": 2.43737530708313, "reward_std": 0.17711934052886136, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373751878738403, "step": 1764 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.572538860103627, "grad_norm": 13.439235982113347, "kl": 0.64453125, "learning_rate": 5.427461139896373e-07, "loss": 0.0026, "reward": 1.999017357826233, "reward_std": 6.0088537793490104e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499017357826233, "step": 1765 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.575129533678757, "grad_norm": 20.221216970627538, "kl": 0.59765625, "learning_rate": 5.424870466321243e-07, "loss": 0.0023, "reward": 1.7476866245269775, "reward_std": 0.26774158588887076, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2476867735385895, "step": 1766 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.577720207253886, "grad_norm": 1.469819789500706, "kl": 0.6015625, "learning_rate": 5.422279792746114e-07, "loss": 0.0028, "reward": 1.9952036142349243, "reward_std": 8.8256499225281e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4952035248279572, "step": 1767 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 4.580310880829016, "grad_norm": 6.426922880899402, "kl": 0.646484375, "learning_rate": 5.419689119170984e-07, "loss": 0.0026, "reward": 2.4374765157699585, "reward_std": 0.17682311554131047, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374765157699585, "step": 1768 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.582901554404145, "grad_norm": 0.08208408064410265, "kl": 0.59765625, "learning_rate": 5.417098445595855e-07, "loss": 0.0031, "reward": 2.4999972581863403, "reward_std": 1.0890010457842436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1769 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.585492227979275, "grad_norm": 6.111625446389545, "kl": 0.603515625, "learning_rate": 5.414507772020725e-07, "loss": 0.0025, "reward": 1.8576511144638062, "reward_std": 0.00042241121252573066, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.357651174068451, "step": 1770 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.5880829015544045, "grad_norm": 0.659742608437132, "kl": 0.69140625, "learning_rate": 5.411917098445595e-07, "loss": 0.0024, "reward": 2.4999818801879883, "reward_std": 1.368320681649493e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999817609786987, "step": 1771 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.590673575129534, "grad_norm": 0.09670849331466158, "kl": 0.625, "learning_rate": 5.409326424870466e-07, "loss": 0.0019, "reward": 2.4999955892562866, "reward_std": 1.415864460341254e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 1772 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.5932642487046635, "grad_norm": 2.293199858507696, "kl": 0.583984375, "learning_rate": 5.406735751295336e-07, "loss": 0.0026, "reward": 2.4987624883651733, "reward_std": 4.3540338538150536e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9987622499465942, "step": 1773 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.595854922279793, "grad_norm": 4.734413552364635, "kl": 0.572265625, "learning_rate": 5.404145077720207e-07, "loss": 0.0023, "reward": 0.9519478678703308, "reward_std": 0.000744265504181385, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.4519478678703308, "step": 1774 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.598445595854923, "grad_norm": 4.898229189013849, "kl": 0.57421875, "learning_rate": 5.401554404145078e-07, "loss": 0.0024, "reward": 2.4999840259552, "reward_std": 8.808054758446815e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999841451644897, "step": 1775 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.601036269430052, "grad_norm": 0.31904377577011545, "kl": 0.578125, "learning_rate": 5.398963730569947e-07, "loss": 0.0016, "reward": 2.499996066093445, "reward_std": 2.154585615699034e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 1776 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.603626943005182, "grad_norm": 12.972947098203718, "kl": 0.5380859375, "learning_rate": 5.396373056994818e-07, "loss": 0.002, "reward": 1.9772072434425354, "reward_std": 0.0001696238823569729, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4772072434425354, "step": 1777 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.606217616580311, "grad_norm": 5.713668432664901, "kl": 0.560546875, "learning_rate": 5.393782383419689e-07, "loss": 0.0034, "reward": 2.499952554702759, "reward_std": 1.8922054096037755e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999524354934692, "step": 1778 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.608808290155441, "grad_norm": 0.19102232529841898, "kl": 0.6171875, "learning_rate": 5.391191709844559e-07, "loss": 0.0023, "reward": 2.4999969005584717, "reward_std": 1.7154123952423106e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 1779 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.61139896373057, "grad_norm": 0.17361483369685285, "kl": 0.62890625, "learning_rate": 5.38860103626943e-07, "loss": 0.0027, "reward": 2.4999892711639404, "reward_std": 2.589119276308338e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999893307685852, "step": 1780 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 4.6139896373057, "grad_norm": 5.041562742125009, "kl": 0.685546875, "learning_rate": 5.3860103626943e-07, "loss": 0.0035, "reward": 2.312476396560669, "reward_std": 0.258787999965989, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124763369560242, "step": 1781 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 4.616580310880829, "grad_norm": 13.885707720303506, "kl": 0.572265625, "learning_rate": 5.38341968911917e-07, "loss": 0.0016, "reward": 1.941656768321991, "reward_std": 0.015803210081685393, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4416569471359253, "step": 1782 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.619170984455959, "grad_norm": 0.5063484007696724, "kl": 0.6103515625, "learning_rate": 5.380829015544041e-07, "loss": 0.003, "reward": 2.499993681907654, "reward_std": 2.362134409850114e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993622303009, "step": 1783 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.6217616580310885, "grad_norm": 0.9063531589994713, "kl": 0.4765625, "learning_rate": 5.378238341968911e-07, "loss": 0.0025, "reward": 2.4999959468841553, "reward_std": 2.7898926191483042e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 1784 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.624352331606218, "grad_norm": 0.19185865981661437, "kl": 0.65234375, "learning_rate": 5.375647668393782e-07, "loss": 0.003, "reward": 2.499997615814209, "reward_std": 2.14255840091937e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 1785 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.626943005181348, "grad_norm": 0.28493208626398264, "kl": 0.69921875, "learning_rate": 5.373056994818652e-07, "loss": 0.0024, "reward": 2.499990701675415, "reward_std": 2.4782627860986395e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 1786 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.629533678756477, "grad_norm": 40.14299161314204, "kl": 0.716796875, "learning_rate": 5.370466321243523e-07, "loss": 0.0022, "reward": 2.1209282875061035, "reward_std": 0.23396665196037247, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.620928406715393, "step": 1787 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.632124352331607, "grad_norm": 0.6816147668943155, "kl": 0.595703125, "learning_rate": 5.367875647668393e-07, "loss": 0.0019, "reward": 2.4999969005584717, "reward_std": 2.3878390038589714e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 1788 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.634715025906736, "grad_norm": 71.16287295003491, "kl": 0.607421875, "learning_rate": 5.365284974093263e-07, "loss": 0.0028, "reward": 1.999851107597351, "reward_std": 4.35088986705523e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998509883880615, "step": 1789 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.637305699481866, "grad_norm": 1.1674097279972813, "kl": 0.6171875, "learning_rate": 5.362694300518134e-07, "loss": 0.0026, "reward": 2.499990940093994, "reward_std": 7.167728654167149e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990999698639, "step": 1790 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.639896373056995, "grad_norm": 7.234569568838252, "kl": 0.525390625, "learning_rate": 5.360103626943004e-07, "loss": 0.0022, "reward": 2.4373950958251953, "reward_std": 0.17680896514775668, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937394917011261, "step": 1791 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.642487046632124, "grad_norm": 1.42189471837408, "kl": 0.578125, "learning_rate": 5.357512953367876e-07, "loss": 0.0025, "reward": 1.9986125230789185, "reward_std": 3.6035962580172054e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498612403869629, "step": 1792 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.6450777202072535, "grad_norm": 0.14902310670556682, "kl": 0.5908203125, "learning_rate": 5.354922279792747e-07, "loss": 0.0021, "reward": 2.4999985694885254, "reward_std": 1.2857792626164155e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 1793 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.647668393782383, "grad_norm": 29.080914860016154, "kl": 0.564453125, "learning_rate": 5.352331606217616e-07, "loss": 0.0025, "reward": 1.9930524230003357, "reward_std": 0.0013153717563909595, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4930524230003357, "step": 1794 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.650259067357513, "grad_norm": 2.7990237666184035, "kl": 0.57421875, "learning_rate": 5.349740932642487e-07, "loss": 0.003, "reward": 1.809572458267212, "reward_std": 0.00025621324618896324, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.309572458267212, "step": 1795 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.652849740932642, "grad_norm": 122.82746336520438, "kl": 0.56640625, "learning_rate": 5.347150259067357e-07, "loss": 0.0026, "reward": 1.995053231716156, "reward_std": 0.0033886614224343248, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.495053231716156, "step": 1796 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.655440414507772, "grad_norm": 41.204879081156086, "kl": 0.533203125, "learning_rate": 5.344559585492228e-07, "loss": 0.0026, "reward": 2.374815344810486, "reward_std": 0.2317947340648061, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748153448104858, "step": 1797 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 4.658031088082901, "grad_norm": 0.10137500753411668, "kl": 0.607421875, "learning_rate": 5.341968911917099e-07, "loss": 0.0024, "reward": 2.4999983310699463, "reward_std": 2.1150336237951706e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 1798 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.660621761658031, "grad_norm": 0.5836930060628485, "kl": 0.65625, "learning_rate": 5.339378238341969e-07, "loss": 0.0021, "reward": 2.4999892711639404, "reward_std": 4.6326625806614175e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999892115592957, "step": 1799 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.66321243523316, "grad_norm": 0.3168844208530169, "kl": 0.5302734375, "learning_rate": 5.336787564766839e-07, "loss": 0.0027, "reward": 2.4999955892562866, "reward_std": 3.396912006792263e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 1800 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.66580310880829, "grad_norm": 0.1619978429277008, "kl": 0.619140625, "learning_rate": 5.33419689119171e-07, "loss": 0.0015, "reward": 2.4999935626983643, "reward_std": 2.7811253175968886e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 1801 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.668393782383419, "grad_norm": 0.3024160461629981, "kl": 0.544921875, "learning_rate": 5.33160621761658e-07, "loss": 0.0029, "reward": 2.4999828338623047, "reward_std": 4.953754228154139e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999827146530151, "step": 1802 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.670984455958549, "grad_norm": 1.8355855769777398, "kl": 0.58203125, "learning_rate": 5.329015544041451e-07, "loss": 0.0033, "reward": 2.4999759197235107, "reward_std": 8.956049441621872e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999757409095764, "step": 1803 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.6735751295336785, "grad_norm": 0.07874843146296348, "kl": 0.599609375, "learning_rate": 5.326424870466321e-07, "loss": 0.0018, "reward": 2.499996304512024, "reward_std": 1.3413531405603862e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 1804 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.676165803108808, "grad_norm": 39.52064164572289, "kl": 0.578125, "learning_rate": 5.323834196891192e-07, "loss": 0.0014, "reward": 2.3747888803482056, "reward_std": 0.2317998306912159, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747888803482056, "step": 1805 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.678756476683938, "grad_norm": 0.10513449090776979, "kl": 0.6171875, "learning_rate": 5.321243523316063e-07, "loss": 0.0014, "reward": 2.4999982118606567, "reward_std": 1.3175453545954952e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 1806 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 4.681347150259067, "grad_norm": 40.26409593766495, "kl": 0.708984375, "learning_rate": 5.318652849740932e-07, "loss": 0.0021, "reward": 1.952300250530243, "reward_std": 0.02883008645881091, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4523003101348877, "step": 1807 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.683937823834197, "grad_norm": 0.21198224888531259, "kl": 0.65625, "learning_rate": 5.316062176165803e-07, "loss": 0.0025, "reward": 2.499997615814209, "reward_std": 2.132813506250386e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 1808 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.686528497409326, "grad_norm": 0.23658447132647606, "kl": 0.607421875, "learning_rate": 5.313471502590673e-07, "loss": 0.0022, "reward": 2.49999737739563, "reward_std": 2.0154017192908213e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 1809 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.689119170984456, "grad_norm": 2.6801782628180675, "kl": 0.609375, "learning_rate": 5.310880829015544e-07, "loss": 0.003, "reward": 1.9993236660957336, "reward_std": 2.9892015845689457e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993236660957336, "step": 1810 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.691709844559585, "grad_norm": 11.879481930763447, "kl": 0.82421875, "learning_rate": 5.308290155440415e-07, "loss": 0.0033, "reward": 1.5602394342422485, "reward_std": 0.17707705926295603, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0602394342422485, "step": 1811 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 4.694300518134715, "grad_norm": 1.221421949052627, "kl": 0.603515625, "learning_rate": 5.305699481865284e-07, "loss": 0.0035, "reward": 2.4999812841415405, "reward_std": 1.4232611192710465e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999811053276062, "step": 1812 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.696891191709844, "grad_norm": 10.436904686588218, "kl": 0.5703125, "learning_rate": 5.303108808290155e-07, "loss": 0.002, "reward": 2.062255859375, "reward_std": 0.17687103836391316, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5622559785842896, "step": 1813 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.699481865284974, "grad_norm": 0.500523556258103, "kl": 0.63671875, "learning_rate": 5.300518134715025e-07, "loss": 0.0025, "reward": 2.499993085861206, "reward_std": 4.178241624686052e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 1814 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.7020725388601035, "grad_norm": 8.68863882854078, "kl": 0.57421875, "learning_rate": 5.297927461139896e-07, "loss": 0.0029, "reward": 1.9997514486312866, "reward_std": 2.9319265195226762e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499751329421997, "step": 1815 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.704663212435233, "grad_norm": 0.39444445109213466, "kl": 0.5458984375, "learning_rate": 5.295336787564767e-07, "loss": 0.0009, "reward": 2.499992609024048, "reward_std": 2.835566419889801e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 1816 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.7072538860103625, "grad_norm": 0.06711282412929337, "kl": 0.56640625, "learning_rate": 5.292746113989637e-07, "loss": 0.0029, "reward": 2.49999737739563, "reward_std": 8.894726022390387e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 1817 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.709844559585492, "grad_norm": 0.5728652110912531, "kl": 0.66015625, "learning_rate": 5.290155440414508e-07, "loss": 0.0015, "reward": 2.4999961853027344, "reward_std": 2.490233043772605e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 1818 }, { "clip_ratio": 0.0, "completion_length": 37.25, "epoch": 4.712435233160622, "grad_norm": 0.4214467082761593, "kl": 0.4892578125, "learning_rate": 5.287564766839377e-07, "loss": 0.0015, "reward": 2.4999865293502808, "reward_std": 3.6112055568082724e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999866485595703, "step": 1819 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.715025906735751, "grad_norm": 0.1351926034466714, "kl": 0.607421875, "learning_rate": 5.284974093264248e-07, "loss": 0.0029, "reward": 2.4999988079071045, "reward_std": 1.2381971998820518e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 1820 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.717616580310881, "grad_norm": 0.2247191822078499, "kl": 0.62890625, "learning_rate": 5.282383419689119e-07, "loss": 0.0024, "reward": 2.499997138977051, "reward_std": 1.8238506527268328e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1821 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.72020725388601, "grad_norm": 0.31670925014666357, "kl": 0.779296875, "learning_rate": 5.279792746113989e-07, "loss": 0.0028, "reward": 2.499997854232788, "reward_std": 3.0763594054405985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 1822 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.72279792746114, "grad_norm": 0.15792110933294143, "kl": 0.546875, "learning_rate": 5.27720207253886e-07, "loss": 0.0018, "reward": 2.4999977350234985, "reward_std": 1.6389829795571131e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 1823 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.725388601036269, "grad_norm": 1.446449858591953, "kl": 0.61328125, "learning_rate": 5.274611398963731e-07, "loss": 0.0029, "reward": 2.4999923706054688, "reward_std": 2.586664294312868e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 1824 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.727979274611399, "grad_norm": 32.69221708553658, "kl": 0.59375, "learning_rate": 5.2720207253886e-07, "loss": 0.0021, "reward": 1.9964823722839355, "reward_std": 0.0005116369513871177, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4964823424816132, "step": 1825 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.730569948186528, "grad_norm": 34.79554486324595, "kl": 0.615234375, "learning_rate": 5.269430051813471e-07, "loss": 0.0023, "reward": 1.4494886994361877, "reward_std": 0.005550071677134838, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9494887888431549, "step": 1826 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.733160621761658, "grad_norm": 2.1071989631027943, "kl": 0.53515625, "learning_rate": 5.266839378238341e-07, "loss": 0.0023, "reward": 2.4999938011169434, "reward_std": 5.288306851980451e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 1827 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 4.7357512953367875, "grad_norm": 0.6819554463214496, "kl": 0.576171875, "learning_rate": 5.264248704663212e-07, "loss": 0.0022, "reward": 1.9988545179367065, "reward_std": 2.0810858018194267e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498854637145996, "step": 1828 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.738341968911917, "grad_norm": 0.6306507981874484, "kl": 0.599609375, "learning_rate": 5.261658031088083e-07, "loss": 0.0021, "reward": 2.4999862909317017, "reward_std": 5.9273586430208525e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986171722412, "step": 1829 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.740932642487047, "grad_norm": 3.5659756996789826, "kl": 0.642578125, "learning_rate": 5.259067357512953e-07, "loss": 0.0017, "reward": 1.9988386631011963, "reward_std": 9.502394976834694e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988389015197754, "step": 1830 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.743523316062176, "grad_norm": 5.719273206203651, "kl": 0.69921875, "learning_rate": 5.256476683937823e-07, "loss": 0.0038, "reward": 1.98576819896698, "reward_std": 0.00013047942798039003, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4857680797576904, "step": 1831 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.746113989637306, "grad_norm": 16.460683124779575, "kl": 0.587890625, "learning_rate": 5.253886010362693e-07, "loss": 0.0028, "reward": 2.3122146129608154, "reward_std": 0.25916074834276515, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122146129608154, "step": 1832 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.748704663212435, "grad_norm": 4.220797905488209, "kl": 0.466796875, "learning_rate": 5.251295336787564e-07, "loss": 0.0023, "reward": 1.9997503757476807, "reward_std": 3.366740429555648e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997504353523254, "step": 1833 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.751295336787565, "grad_norm": 3.306308439360848, "kl": 0.5556640625, "learning_rate": 5.248704663212436e-07, "loss": 0.0024, "reward": 2.4973855018615723, "reward_std": 4.588419039919245e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9973855018615723, "step": 1834 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.753886010362694, "grad_norm": 6.4093566583690595, "kl": 0.55078125, "learning_rate": 5.246113989637306e-07, "loss": 0.003, "reward": 2.498948574066162, "reward_std": 0.00031569721960522656, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.998948335647583, "step": 1835 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.756476683937824, "grad_norm": 5.609396504327555, "kl": 0.79296875, "learning_rate": 5.243523316062177e-07, "loss": 0.0029, "reward": 1.8671010732650757, "reward_std": 0.0005569205313804559, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3671010732650757, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.759067357512953, "grad_norm": 9.496866865626663, "kl": 0.498046875, "learning_rate": 5.240932642487046e-07, "loss": 0.0013, "reward": 2.18746417760849, "reward_std": 0.2587937697298912, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6874643564224243, "step": 1837 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.761658031088083, "grad_norm": 0.13479710234145797, "kl": 0.5419921875, "learning_rate": 5.238341968911917e-07, "loss": 0.0023, "reward": 2.49999737739563, "reward_std": 1.0421831007079163e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 1838 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 4.7642487046632125, "grad_norm": 21.04156964339258, "kl": 0.54296875, "learning_rate": 5.235751295336788e-07, "loss": 0.0017, "reward": 2.499312400817871, "reward_std": 0.0012653632968522288, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9993124604225159, "step": 1839 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.766839378238342, "grad_norm": 27.699704329621383, "kl": 0.5205078125, "learning_rate": 5.233160621761658e-07, "loss": 0.0021, "reward": 2.370957136154175, "reward_std": 0.23395942035131156, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.87095707654953, "step": 1840 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.769430051813472, "grad_norm": 0.09801918173827034, "kl": 0.43359375, "learning_rate": 5.230569948186529e-07, "loss": 0.0017, "reward": 2.499998092651367, "reward_std": 1.7403809522420488e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1841 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.772020725388601, "grad_norm": 2.0823922900069682, "kl": 0.65234375, "learning_rate": 5.227979274611399e-07, "loss": 0.0028, "reward": 2.499983072280884, "reward_std": 2.2454955797002185e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999830722808838, "step": 1842 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.774611398963731, "grad_norm": 0.17903604870432266, "kl": 0.552734375, "learning_rate": 5.225388601036269e-07, "loss": 0.0019, "reward": 2.4999934434890747, "reward_std": 2.8300887606746983e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 1843 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 4.77720207253886, "grad_norm": 0.1548338151914464, "kl": 0.5546875, "learning_rate": 5.22279792746114e-07, "loss": 0.0019, "reward": 2.4999982118606567, "reward_std": 1.7898020132633974e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 1844 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.77979274611399, "grad_norm": 0.8399770465531209, "kl": 0.568359375, "learning_rate": 5.22020725388601e-07, "loss": 0.0016, "reward": 2.4999920129776, "reward_std": 6.600384494959144e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 1845 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.782383419689119, "grad_norm": 37.72101770771968, "kl": 0.650390625, "learning_rate": 5.217616580310881e-07, "loss": 0.0027, "reward": 2.4365172386169434, "reward_std": 0.17687223727989476, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9365171790122986, "step": 1846 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.784974093264249, "grad_norm": 0.7325019819764619, "kl": 0.5166015625, "learning_rate": 5.215025906735752e-07, "loss": 0.0017, "reward": 1.99922513961792, "reward_std": 1.0850484500224411e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992252588272095, "step": 1847 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.787564766839378, "grad_norm": 13.273361753008693, "kl": 0.591796875, "learning_rate": 5.212435233160622e-07, "loss": 0.0023, "reward": 1.936770498752594, "reward_std": 0.17754539232737443, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4367704689502716, "step": 1848 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.790155440414508, "grad_norm": 0.5444764359351981, "kl": 0.5419921875, "learning_rate": 5.209844559585492e-07, "loss": 0.0016, "reward": 2.4999918937683105, "reward_std": 6.114944085311436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 1849 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.7927461139896375, "grad_norm": 0.39927784979539843, "kl": 0.580078125, "learning_rate": 5.207253886010362e-07, "loss": 0.0026, "reward": 2.4999958276748657, "reward_std": 3.3531737244629767e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 1850 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.795336787564767, "grad_norm": 23.070906338264653, "kl": 0.69140625, "learning_rate": 5.204663212435233e-07, "loss": 0.0027, "reward": 1.6114187836647034, "reward_std": 0.2682804054347798, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1114187836647034, "step": 1851 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.7979274611398965, "grad_norm": 0.24368211032764536, "kl": 0.615234375, "learning_rate": 5.202072538860104e-07, "loss": 0.0016, "reward": 2.4999964237213135, "reward_std": 2.84736586309009e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 1852 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.800518134715026, "grad_norm": 3.2851445892094295, "kl": 0.54296875, "learning_rate": 5.199481865284974e-07, "loss": 0.0025, "reward": 1.999822974205017, "reward_std": 3.1823417430132395e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998229146003723, "step": 1853 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.803108808290156, "grad_norm": 3.90088689951452, "kl": 0.640625, "learning_rate": 5.196891191709845e-07, "loss": 0.0024, "reward": 1.3197072744369507, "reward_std": 0.0003404725393920671, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8197072744369507, "step": 1854 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.805699481865285, "grad_norm": 4.000846766332576, "kl": 1.220703125, "learning_rate": 5.194300518134714e-07, "loss": 0.0049, "reward": 1.4132771492004395, "reward_std": 0.0003027062994078733, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9132770299911499, "step": 1855 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 4.808290155440415, "grad_norm": 30.42574760471824, "kl": 0.56640625, "learning_rate": 5.191709844559585e-07, "loss": 0.0014, "reward": 2.499292016029358, "reward_std": 0.00035478571550129345, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9992921948432922, "step": 1856 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.810880829015544, "grad_norm": 1.0754807786764269, "kl": 0.498046875, "learning_rate": 5.189119170984456e-07, "loss": 0.0014, "reward": 1.9992250204086304, "reward_std": 2.737228271598724e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992251098155975, "step": 1857 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.813471502590674, "grad_norm": 0.48918117877757056, "kl": 0.609375, "learning_rate": 5.186528497409326e-07, "loss": 0.002, "reward": 2.499996542930603, "reward_std": 4.088528726242657e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 1858 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 4.816062176165803, "grad_norm": 36.05341820257133, "kl": 0.5615234375, "learning_rate": 5.183937823834197e-07, "loss": 0.0021, "reward": 1.988953948020935, "reward_std": 0.006258147246171575, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4889540672302246, "step": 1859 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.818652849740933, "grad_norm": 0.12578894992361253, "kl": 0.6171875, "learning_rate": 5.181347150259067e-07, "loss": 0.0013, "reward": 2.499997138977051, "reward_std": 1.895381046779221e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 1860 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.821243523316062, "grad_norm": 1.9686028885503286, "kl": 0.55078125, "learning_rate": 5.178756476683937e-07, "loss": 0.002, "reward": 2.49997615814209, "reward_std": 1.5635020872650784e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999761581420898, "step": 1861 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.823834196891192, "grad_norm": 1.3817386153810833, "kl": 0.5546875, "learning_rate": 5.176165803108808e-07, "loss": 0.0032, "reward": 2.499987244606018, "reward_std": 8.275574373328709e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999871253967285, "step": 1862 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.8264248704663215, "grad_norm": 0.37744956167034316, "kl": 0.630859375, "learning_rate": 5.173575129533678e-07, "loss": 0.0017, "reward": 2.499996542930603, "reward_std": 5.450698608910898e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 1863 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.829015544041451, "grad_norm": 1.3306233889201733, "kl": 0.5625, "learning_rate": 5.170984455958549e-07, "loss": 0.0023, "reward": 2.4999706745147705, "reward_std": 1.2392022426865879e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999706745147705, "step": 1864 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.831606217616581, "grad_norm": 0.09260298812971482, "kl": 0.51953125, "learning_rate": 5.168393782383419e-07, "loss": 0.0022, "reward": 2.4999961853027344, "reward_std": 1.3591929928225e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1865 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.83419689119171, "grad_norm": 5.325110580377796, "kl": 0.599609375, "learning_rate": 5.16580310880829e-07, "loss": 0.0023, "reward": 1.9868172407150269, "reward_std": 0.00012696221347141545, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4868172407150269, "step": 1866 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.83678756476684, "grad_norm": 27.96903964259055, "kl": 0.63671875, "learning_rate": 5.16321243523316e-07, "loss": 0.0031, "reward": 2.374980330467224, "reward_std": 0.2314816612837376, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749802708625793, "step": 1867 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.839378238341969, "grad_norm": 0.37152461260286196, "kl": 0.58984375, "learning_rate": 5.16062176165803e-07, "loss": 0.0025, "reward": 2.4999958276748657, "reward_std": 3.604128096412751e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 1868 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.841968911917099, "grad_norm": 0.32849969673539337, "kl": 0.548828125, "learning_rate": 5.158031088082901e-07, "loss": 0.0032, "reward": 2.499993681907654, "reward_std": 4.349315929630393e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 1869 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.844559585492228, "grad_norm": 0.4346535496521582, "kl": 0.576171875, "learning_rate": 5.155440414507772e-07, "loss": 0.0022, "reward": 2.4999942779541016, "reward_std": 3.629805519267393e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 1870 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.847150259067358, "grad_norm": 2.4854993359927238, "kl": 0.52734375, "learning_rate": 5.152849740932642e-07, "loss": 0.0022, "reward": 2.4999887943267822, "reward_std": 1.3400277566688601e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988853931427, "step": 1871 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.849740932642487, "grad_norm": 62.88596695988622, "kl": 0.505859375, "learning_rate": 5.150259067357513e-07, "loss": 0.0025, "reward": 1.9924925565719604, "reward_std": 0.004712800299785158, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4924925863742828, "step": 1872 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.852331606217617, "grad_norm": 0.3342380791697756, "kl": 0.58984375, "learning_rate": 5.147668393782382e-07, "loss": 0.0016, "reward": 2.499995470046997, "reward_std": 4.471806164474401e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 1873 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.8549222797927465, "grad_norm": 0.7810219957064567, "kl": 0.619140625, "learning_rate": 5.145077720207253e-07, "loss": 0.0028, "reward": 2.4999948740005493, "reward_std": 5.871624580322532e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 1874 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.857512953367876, "grad_norm": 4.338570015527022, "kl": 0.544921875, "learning_rate": 5.142487046632125e-07, "loss": 0.0027, "reward": 2.4999876022338867, "reward_std": 1.3871165265300078e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999876022338867, "step": 1875 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.860103626943005, "grad_norm": 1.3868282411550084, "kl": 0.4892578125, "learning_rate": 5.139896373056995e-07, "loss": 0.0012, "reward": 2.4999916553497314, "reward_std": 1.0751708714451524e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 1876 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.862694300518134, "grad_norm": 2.121035198879685, "kl": 0.55078125, "learning_rate": 5.137305699481866e-07, "loss": 0.003, "reward": 2.4999866485595703, "reward_std": 1.6998868886730634e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999866485595703, "step": 1877 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.865284974093264, "grad_norm": 0.09069363898449548, "kl": 0.580078125, "learning_rate": 5.134715025906736e-07, "loss": 0.0026, "reward": 2.4999983310699463, "reward_std": 1.3145447042006708e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1878 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 4.867875647668393, "grad_norm": 0.619959305352506, "kl": 0.638671875, "learning_rate": 5.132124352331606e-07, "loss": 0.0033, "reward": 2.499996781349182, "reward_std": 2.656600656791852e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 1879 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.870466321243523, "grad_norm": 0.1835811582809222, "kl": 0.609375, "learning_rate": 5.129533678756477e-07, "loss": 0.0036, "reward": 2.499993681907654, "reward_std": 3.191871030594484e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 1880 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.873056994818652, "grad_norm": 0.1088146752102806, "kl": 0.666015625, "learning_rate": 5.126943005181347e-07, "loss": 0.0018, "reward": 2.4999966621398926, "reward_std": 2.0380452951940242e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 1881 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.875647668393782, "grad_norm": 4.711072189557616, "kl": 0.650390625, "learning_rate": 5.124352331606218e-07, "loss": 0.0026, "reward": 1.8522638082504272, "reward_std": 0.00047310423519775213, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3522638082504272, "step": 1882 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.8782383419689115, "grad_norm": 0.0933628987342195, "kl": 0.611328125, "learning_rate": 5.121761658031088e-07, "loss": 0.0027, "reward": 2.4999985694885254, "reward_std": 7.226404221682969e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 1883 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.880829015544041, "grad_norm": 1.2953533708529024, "kl": 0.509765625, "learning_rate": 5.119170984455959e-07, "loss": 0.0033, "reward": 1.9984333515167236, "reward_std": 2.3370661097033008e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984330534934998, "step": 1884 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.883419689119171, "grad_norm": 2.012985106039309, "kl": 0.623046875, "learning_rate": 5.116580310880829e-07, "loss": 0.0025, "reward": 2.4999804496765137, "reward_std": 1.0961868781578232e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999804496765137, "step": 1885 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.8860103626943, "grad_norm": 0.12552354488530978, "kl": 0.62890625, "learning_rate": 5.113989637305699e-07, "loss": 0.0035, "reward": 2.499997138977051, "reward_std": 2.487526728600642e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1886 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.88860103626943, "grad_norm": 0.1966104272144724, "kl": 0.59375, "learning_rate": 5.11139896373057e-07, "loss": 0.0028, "reward": 2.499998092651367, "reward_std": 1.1534895065778983e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1887 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.891191709844559, "grad_norm": 0.4182292049665084, "kl": 0.56640625, "learning_rate": 5.10880829015544e-07, "loss": 0.0021, "reward": 2.499932050704956, "reward_std": 6.727790378135978e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999321103096008, "step": 1888 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.893782383419689, "grad_norm": 0.9557110777734575, "kl": 0.5810546875, "learning_rate": 5.106217616580311e-07, "loss": 0.0018, "reward": 2.4999932050704956, "reward_std": 1.1273125892330427e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 1889 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.896373056994818, "grad_norm": 34.32260799992067, "kl": 0.5703125, "learning_rate": 5.103626943005182e-07, "loss": 0.0025, "reward": 1.9785298109054565, "reward_std": 0.00010693512012949213, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4785297513008118, "step": 1890 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.898963730569948, "grad_norm": 0.19115974599265903, "kl": 0.533203125, "learning_rate": 5.101036269430051e-07, "loss": 0.0017, "reward": 2.4999977350234985, "reward_std": 2.7497833912093483e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 1891 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.901554404145077, "grad_norm": 0.1493691447415382, "kl": 0.529296875, "learning_rate": 5.098445595854922e-07, "loss": 0.0018, "reward": 2.4999982118606567, "reward_std": 1.586098676398251e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 1892 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 4.904145077720207, "grad_norm": 37.60297068966974, "kl": 0.564453125, "learning_rate": 5.095854922279792e-07, "loss": 0.0025, "reward": 2.123812437057495, "reward_std": 0.23218894355977682, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6238122582435608, "step": 1893 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.9067357512953365, "grad_norm": 0.9720027093338267, "kl": 0.744140625, "learning_rate": 5.093264248704663e-07, "loss": 0.003, "reward": 2.499995470046997, "reward_std": 1.8237520862385281e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 1894 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.909326424870466, "grad_norm": 13.509256229363535, "kl": 0.634765625, "learning_rate": 5.090673575129534e-07, "loss": 0.0025, "reward": 2.0624135732650757, "reward_std": 0.17679639894640786, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624135732650757, "step": 1895 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.9119170984455955, "grad_norm": 8.476338709822992, "kl": 0.580078125, "learning_rate": 5.088082901554404e-07, "loss": 0.0025, "reward": 1.9990899562835693, "reward_std": 0.00012022216844798095, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990899562835693, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.914507772020725, "grad_norm": 37.17342193523048, "kl": 0.529296875, "learning_rate": 5.085492227979274e-07, "loss": 0.0023, "reward": 1.9959046840667725, "reward_std": 0.0002611415633282377, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4959046840667725, "step": 1897 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.917098445595855, "grad_norm": 0.07089617937434776, "kl": 0.5068359375, "learning_rate": 5.082901554404145e-07, "loss": 0.0029, "reward": 2.499997138977051, "reward_std": 1.8200578324467642e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1898 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.919689119170984, "grad_norm": 0.12467995188107271, "kl": 0.609375, "learning_rate": 5.080310880829015e-07, "loss": 0.003, "reward": 2.4999985694885254, "reward_std": 1.2409701639626292e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 1899 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.922279792746114, "grad_norm": 0.40501229350632956, "kl": 0.54296875, "learning_rate": 5.077720207253886e-07, "loss": 0.0022, "reward": 2.4999972581863403, "reward_std": 2.399094228167087e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 1900 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.924870466321243, "grad_norm": 78.69617892684188, "kl": 0.4375, "learning_rate": 5.075129533678756e-07, "loss": 0.0023, "reward": 2.1850425004959106, "reward_std": 0.26084945077843713, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.685042381286621, "step": 1901 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 4.927461139896373, "grad_norm": 8.209624593851792, "kl": 0.54296875, "learning_rate": 5.072538860103627e-07, "loss": 0.0024, "reward": 1.5614228248596191, "reward_std": 0.17717482581247168, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0614228546619415, "step": 1902 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.930051813471502, "grad_norm": 6.242925424236701, "kl": 0.51953125, "learning_rate": 5.069948186528497e-07, "loss": 0.0018, "reward": 2.499987840652466, "reward_std": 2.0161417864983378e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879598617554, "step": 1903 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.932642487046632, "grad_norm": 10.037474744738857, "kl": 0.64453125, "learning_rate": 5.067357512953367e-07, "loss": 0.0024, "reward": 1.9413501024246216, "reward_std": 0.00030878394090905203, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4413501024246216, "step": 1904 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.935233160621761, "grad_norm": 39.740460408716295, "kl": 0.712890625, "learning_rate": 5.064766839378238e-07, "loss": 0.0029, "reward": 1.437481164932251, "reward_std": 0.17682889103889465, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9374810457229614, "step": 1905 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.937823834196891, "grad_norm": 0.27067331455158716, "kl": 0.58203125, "learning_rate": 5.062176165803108e-07, "loss": 0.0025, "reward": 2.4999958276748657, "reward_std": 2.1770819671473873e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1906 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.9404145077720205, "grad_norm": 1.0983912475978301, "kl": 0.611328125, "learning_rate": 5.059585492227979e-07, "loss": 0.0029, "reward": 2.4999873638153076, "reward_std": 4.244365186423238e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987244606018, "step": 1907 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.94300518134715, "grad_norm": 0.34252435407675597, "kl": 0.4599609375, "learning_rate": 5.05699481865285e-07, "loss": 0.0016, "reward": 2.499997854232788, "reward_std": 2.0696473939096904e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 1908 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.94559585492228, "grad_norm": 1.8364421430709192, "kl": 0.591796875, "learning_rate": 5.054404145077719e-07, "loss": 0.0021, "reward": 1.998897910118103, "reward_std": 2.42316208414195e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988979399204254, "step": 1909 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.948186528497409, "grad_norm": 0.40836544352242105, "kl": 0.71875, "learning_rate": 5.05181347150259e-07, "loss": 0.0035, "reward": 2.4999945163726807, "reward_std": 3.1591259812557837e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 1910 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.950777202072539, "grad_norm": 34.72739074914931, "kl": 0.60546875, "learning_rate": 5.04922279792746e-07, "loss": 0.0024, "reward": 1.9313993453979492, "reward_std": 0.17831011599628255, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4313994944095612, "step": 1911 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.953367875647668, "grad_norm": 4.234481860185737, "kl": 0.583984375, "learning_rate": 5.046632124352331e-07, "loss": 0.0028, "reward": 1.999099850654602, "reward_std": 2.4047577653618646e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499099850654602, "step": 1912 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.955958549222798, "grad_norm": 0.7855698696440275, "kl": 0.623046875, "learning_rate": 5.044041450777202e-07, "loss": 0.0031, "reward": 1.9999033212661743, "reward_std": 1.2031283517899283e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999032616615295, "step": 1913 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 4.958549222797927, "grad_norm": 0.1257987665649695, "kl": 0.51171875, "learning_rate": 5.041450777202072e-07, "loss": 0.0029, "reward": 2.4999970197677612, "reward_std": 1.4720813226176688e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 1914 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 4.961139896373057, "grad_norm": 0.3861809608417685, "kl": 0.619140625, "learning_rate": 5.038860103626942e-07, "loss": 0.003, "reward": 2.4999964237213135, "reward_std": 3.1752243785376777e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 1915 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.963730569948186, "grad_norm": 14.916148311402084, "kl": 0.716796875, "learning_rate": 5.036269430051812e-07, "loss": 0.0029, "reward": 1.49666166305542, "reward_std": 0.00019398711083340459, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9966616630554199, "step": 1916 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.966321243523316, "grad_norm": 0.4871583839495978, "kl": 0.625, "learning_rate": 5.033678756476683e-07, "loss": 0.0015, "reward": 2.49998676776886, "reward_std": 5.0275706371394335e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999870657920837, "step": 1917 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.9689119170984455, "grad_norm": 2.782806475054549, "kl": 0.599609375, "learning_rate": 5.031088082901555e-07, "loss": 0.0028, "reward": 2.4999539852142334, "reward_std": 3.555295393198321e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999539852142334, "step": 1918 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 4.971502590673575, "grad_norm": 161.79644217196318, "kl": 0.50390625, "learning_rate": 5.028497409326425e-07, "loss": 0.0014, "reward": 2.183952510356903, "reward_std": 0.26171009877748475, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6839526295661926, "step": 1919 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.974093264248705, "grad_norm": 5.255266260381215, "kl": 0.671875, "learning_rate": 5.025906735751296e-07, "loss": 0.0034, "reward": 1.864271640777588, "reward_std": 0.0006260482128936928, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.364271640777588, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.976683937823834, "grad_norm": 15.175339097952422, "kl": 0.89453125, "learning_rate": 5.023316062176167e-07, "loss": 0.0038, "reward": 1.4031597971916199, "reward_std": 0.005603702460575732, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9031597673892975, "step": 1921 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.979274611398964, "grad_norm": 0.13320643919122854, "kl": 0.583984375, "learning_rate": 5.020725388601036e-07, "loss": 0.0028, "reward": 2.4999935626983643, "reward_std": 2.462687803017616e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 1922 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.981865284974093, "grad_norm": 0.18689677407971264, "kl": 0.576171875, "learning_rate": 5.018134715025907e-07, "loss": 0.0014, "reward": 2.499998092651367, "reward_std": 1.7303628396803106e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 1923 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.984455958549223, "grad_norm": 0.07041009629042816, "kl": 0.703125, "learning_rate": 5.015544041450777e-07, "loss": 0.0034, "reward": 2.4999985694885254, "reward_std": 1.5891518785338121e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 1924 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 4.987046632124352, "grad_norm": 0.04704741250852375, "kl": 0.564453125, "learning_rate": 5.012953367875648e-07, "loss": 0.0017, "reward": 2.4999983310699463, "reward_std": 1.0078448724470945e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 1925 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 4.989637305699482, "grad_norm": 22.97245061206414, "kl": 0.640625, "learning_rate": 5.010362694300519e-07, "loss": 0.0017, "reward": 1.7927783131599426, "reward_std": 0.14884022242904393, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2927784323692322, "step": 1926 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.992227979274611, "grad_norm": 2.120640119412175, "kl": 0.6484375, "learning_rate": 5.007772020725388e-07, "loss": 0.0024, "reward": 2.4999871253967285, "reward_std": 1.3021540951285715e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987244606018, "step": 1927 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.994818652849741, "grad_norm": 0.29357991494434654, "kl": 0.75390625, "learning_rate": 5.005181347150259e-07, "loss": 0.0027, "reward": 2.499995231628418, "reward_std": 3.278447877619328e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 1928 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.9974093264248705, "grad_norm": 7.870270723908353, "kl": 0.53515625, "learning_rate": 5.002590673575129e-07, "loss": 0.0025, "reward": 2.499956488609314, "reward_std": 1.9368054154256242e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999563694000244, "step": 1929 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.0, "grad_norm": 5.349111276688158, "kl": 0.576171875, "learning_rate": 5e-07, "loss": 0.0024, "reward": 1.9919170141220093, "reward_std": 0.00014929518056305824, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4919170141220093, "step": 1930 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.0025906735751295, "grad_norm": 0.3309959131220274, "kl": 0.51171875, "learning_rate": 4.99740932642487e-07, "loss": 0.0027, "reward": 2.499990224838257, "reward_std": 7.0875109940971015e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990165233612, "step": 1931 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.005181347150259, "grad_norm": 0.0895107570455574, "kl": 0.505859375, "learning_rate": 4.994818652849741e-07, "loss": 0.0017, "reward": 2.4999964237213135, "reward_std": 2.0527048150142946e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 1932 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.007772020725389, "grad_norm": 0.06372659943935378, "kl": 0.55078125, "learning_rate": 4.992227979274612e-07, "loss": 0.0015, "reward": 2.4999988079071045, "reward_std": 1.5183268544660677e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 1933 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.010362694300518, "grad_norm": 3.607212375657742, "kl": 0.974609375, "learning_rate": 4.989637305699482e-07, "loss": 0.0055, "reward": 2.49999463558197, "reward_std": 3.0373288382179453e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 1934 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.012953367875648, "grad_norm": 4.789911046186114, "kl": 0.5546875, "learning_rate": 4.987046632124352e-07, "loss": 0.0012, "reward": 1.8772040605545044, "reward_std": 0.0005245279011774073, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.377204179763794, "step": 1935 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.015544041450777, "grad_norm": 0.11069658855782345, "kl": 0.537109375, "learning_rate": 4.984455958549223e-07, "loss": 0.0023, "reward": 2.499998092651367, "reward_std": 2.244379686544562e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1936 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.018134715025907, "grad_norm": 0.11519553118550525, "kl": 0.5068359375, "learning_rate": 4.981865284974093e-07, "loss": 0.0022, "reward": 2.4999970197677612, "reward_std": 1.8629937699188304e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 1937 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.020725388601036, "grad_norm": 1.6433093314816967, "kl": 0.7109375, "learning_rate": 4.979274611398964e-07, "loss": 0.003, "reward": 1.997984766960144, "reward_std": 5.172375290385389e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497984766960144, "step": 1938 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.023316062176166, "grad_norm": 0.22932638413217685, "kl": 0.595703125, "learning_rate": 4.976683937823834e-07, "loss": 0.0018, "reward": 2.4999961853027344, "reward_std": 1.9508374293764064e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 1939 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.025906735751295, "grad_norm": 0.2073261591617269, "kl": 0.55859375, "learning_rate": 4.974093264248704e-07, "loss": 0.0024, "reward": 2.4999951124191284, "reward_std": 2.6252536144966143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 1940 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.028497409326425, "grad_norm": 15.420401368111131, "kl": 0.5859375, "learning_rate": 4.971502590673575e-07, "loss": 0.0018, "reward": 2.437483072280884, "reward_std": 0.17679226416976235, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374832510948181, "step": 1941 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.0310880829015545, "grad_norm": 0.2599973900423212, "kl": 0.58984375, "learning_rate": 4.968911917098446e-07, "loss": 0.0016, "reward": 2.499997854232788, "reward_std": 2.1153026636966388e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 1942 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.033678756476684, "grad_norm": 0.20308242292603312, "kl": 0.568359375, "learning_rate": 4.966321243523316e-07, "loss": 0.0028, "reward": 2.4999921321868896, "reward_std": 4.924142558593303e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 1943 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.036269430051814, "grad_norm": 0.1398723055166168, "kl": 0.6796875, "learning_rate": 4.963730569948186e-07, "loss": 0.0026, "reward": 2.499996542930603, "reward_std": 3.2267389542539604e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 1944 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.038860103626943, "grad_norm": 1.7314011611697686, "kl": 0.4599609375, "learning_rate": 4.961139896373057e-07, "loss": 0.0015, "reward": 2.4999927282333374, "reward_std": 5.2158480059461e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 1945 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.041450777202073, "grad_norm": 5.813137718791195, "kl": 0.732421875, "learning_rate": 4.958549222797927e-07, "loss": 0.0029, "reward": 2.374962091445923, "reward_std": 0.23146274032569636, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749620914459229, "step": 1946 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.044041450777202, "grad_norm": 0.3098200786264177, "kl": 0.6171875, "learning_rate": 4.955958549222798e-07, "loss": 0.0039, "reward": 2.499992609024048, "reward_std": 3.5715901276489603e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 1947 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.046632124352332, "grad_norm": 4.163011826987363, "kl": 0.626953125, "learning_rate": 4.953367875647668e-07, "loss": 0.0032, "reward": 1.9343201518058777, "reward_std": 0.00029608538977754506, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.434320092201233, "step": 1948 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.049222797927461, "grad_norm": 3.546204984343011, "kl": 0.4677734375, "learning_rate": 4.950777202072538e-07, "loss": 0.0019, "reward": 2.343746304512024, "reward_std": 0.44194426460035174, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.8749961853027344, "step": 1949 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.051813471502591, "grad_norm": 2.5460293978025663, "kl": 0.564453125, "learning_rate": 4.948186528497409e-07, "loss": 0.0019, "reward": 2.4999502897262573, "reward_std": 4.510511269018025e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999950349330902, "step": 1950 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.05440414507772, "grad_norm": 0.1468743571162333, "kl": 0.599609375, "learning_rate": 4.94559585492228e-07, "loss": 0.0017, "reward": 2.4999951124191284, "reward_std": 1.9104280113424466e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 1951 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.05699481865285, "grad_norm": 0.22061277285370884, "kl": 0.4765625, "learning_rate": 4.94300518134715e-07, "loss": 0.0019, "reward": 2.4999821186065674, "reward_std": 3.806088102464855e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999821782112122, "step": 1952 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.0595854922279795, "grad_norm": 1.412474516137496, "kl": 0.583984375, "learning_rate": 4.94041450777202e-07, "loss": 0.0032, "reward": 1.9984676837921143, "reward_std": 4.002610205589008e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984675645828247, "step": 1953 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.062176165803109, "grad_norm": 0.43711238697361626, "kl": 0.572265625, "learning_rate": 4.937823834196891e-07, "loss": 0.0035, "reward": 2.499988555908203, "reward_std": 4.101265858480474e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999884366989136, "step": 1954 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.064766839378239, "grad_norm": 0.13030983201099272, "kl": 0.640625, "learning_rate": 4.935233160621761e-07, "loss": 0.0036, "reward": 2.499997854232788, "reward_std": 2.0924464934068965e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 1955 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.067357512953368, "grad_norm": 0.912972565434476, "kl": 0.623046875, "learning_rate": 4.932642487046632e-07, "loss": 0.0032, "reward": 2.499985933303833, "reward_std": 7.893547035564552e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999859929084778, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.069948186528498, "grad_norm": 0.12901261107904027, "kl": 0.59375, "learning_rate": 4.930051813471502e-07, "loss": 0.0017, "reward": 2.4999918937683105, "reward_std": 3.229561002626724e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 1957 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.072538860103627, "grad_norm": 1.4531948578549148, "kl": 0.609375, "learning_rate": 4.927461139896372e-07, "loss": 0.0023, "reward": 2.499990701675415, "reward_std": 7.470276386811747e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990701675415, "step": 1958 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.075129533678757, "grad_norm": 1.9906515771019526, "kl": 0.591796875, "learning_rate": 4.924870466321243e-07, "loss": 0.0026, "reward": 2.4999914169311523, "reward_std": 8.113255262287566e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 1959 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.077720207253886, "grad_norm": 0.08201688060089059, "kl": 0.603515625, "learning_rate": 4.922279792746113e-07, "loss": 0.0017, "reward": 2.4999942779541016, "reward_std": 1.6144485641689243e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 1960 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.080310880829016, "grad_norm": 2.2800675721921735, "kl": 1.603515625, "learning_rate": 4.919689119170985e-07, "loss": 0.007, "reward": 2.4999958276748657, "reward_std": 5.10711322476709e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1961 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.082901554404145, "grad_norm": 0.5162436049002317, "kl": 0.6796875, "learning_rate": 4.917098445595855e-07, "loss": 0.0026, "reward": 2.4999955892562866, "reward_std": 3.7039921494397277e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 1962 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.085492227979275, "grad_norm": 7.5611900960492235, "kl": 0.669921875, "learning_rate": 4.914507772020726e-07, "loss": 0.0019, "reward": 1.9988080263137817, "reward_std": 6.082757806780137e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498808115720749, "step": 1963 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.0880829015544045, "grad_norm": 0.5605757036519623, "kl": 0.568359375, "learning_rate": 4.911917098445596e-07, "loss": 0.0039, "reward": 2.4999938011169434, "reward_std": 2.346303062950028e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 1964 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.090673575129534, "grad_norm": 0.16172987090169716, "kl": 0.65234375, "learning_rate": 4.909326424870467e-07, "loss": 0.0023, "reward": 2.4999961853027344, "reward_std": 2.6895397695625434e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1965 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 5.0932642487046635, "grad_norm": 17.808453968450415, "kl": 0.595703125, "learning_rate": 4.906735751295337e-07, "loss": 0.0015, "reward": 1.9650477766990662, "reward_std": 0.216158190760666, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4650478959083557, "step": 1966 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.095854922279793, "grad_norm": 0.3344921853552473, "kl": 0.59375, "learning_rate": 4.904145077720207e-07, "loss": 0.0024, "reward": 2.499992847442627, "reward_std": 3.952516294702946e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 1967 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.098445595854923, "grad_norm": 32.82021658378623, "kl": 0.701171875, "learning_rate": 4.901554404145078e-07, "loss": 0.0027, "reward": 1.9118532538414001, "reward_std": 0.0004819220757781295, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.411853313446045, "step": 1968 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.101036269430052, "grad_norm": 0.12977792925151108, "kl": 0.5859375, "learning_rate": 4.898963730569948e-07, "loss": 0.0023, "reward": 2.499994993209839, "reward_std": 2.128830146830296e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 1969 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.103626943005182, "grad_norm": 0.7251121790162453, "kl": 0.654296875, "learning_rate": 4.896373056994819e-07, "loss": 0.0027, "reward": 2.4999899864196777, "reward_std": 5.948142529632605e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990165233612, "step": 1970 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.106217616580311, "grad_norm": 3.3338447659626187, "kl": 0.642578125, "learning_rate": 4.893782383419689e-07, "loss": 0.0023, "reward": 2.4999886751174927, "reward_std": 7.336216640396742e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999886751174927, "step": 1971 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.108808290155441, "grad_norm": 0.8462794245290844, "kl": 0.685546875, "learning_rate": 4.89119170984456e-07, "loss": 0.0023, "reward": 2.499980926513672, "reward_std": 7.883079661041847e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999809861183167, "step": 1972 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.11139896373057, "grad_norm": 0.5339471092291391, "kl": 0.650390625, "learning_rate": 4.88860103626943e-07, "loss": 0.003, "reward": 2.499992251396179, "reward_std": 4.613223040905723e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 1973 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.1139896373057, "grad_norm": 0.9236657093758577, "kl": 0.5634765625, "learning_rate": 4.886010362694301e-07, "loss": 0.0017, "reward": 2.4999948740005493, "reward_std": 3.7237979313431424e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 1974 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.116580310880829, "grad_norm": 18.32617619808085, "kl": 0.57421875, "learning_rate": 4.883419689119171e-07, "loss": 0.0023, "reward": 2.437379479408264, "reward_std": 0.17709868192014255, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373795986175537, "step": 1975 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.119170984455959, "grad_norm": 33.541572837512014, "kl": 0.640625, "learning_rate": 4.880829015544041e-07, "loss": 0.002, "reward": 2.230610966682434, "reward_std": 0.3717895794353012, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.730610966682434, "step": 1976 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.1217616580310885, "grad_norm": 23.42080730305845, "kl": 0.54296875, "learning_rate": 4.878238341968912e-07, "loss": 0.002, "reward": 1.9960647821426392, "reward_std": 0.0005230764843418001, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4960647821426392, "step": 1977 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.124352331606218, "grad_norm": 0.14841523859026987, "kl": 0.57421875, "learning_rate": 4.875647668393782e-07, "loss": 0.0023, "reward": 2.49999737739563, "reward_std": 2.1625492081511766e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 1978 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 5.126943005181348, "grad_norm": 45.47959081995356, "kl": 0.662109375, "learning_rate": 4.873056994818653e-07, "loss": 0.003, "reward": 1.9934128522872925, "reward_std": 0.00023934943919812213, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4934128820896149, "step": 1979 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.129533678756476, "grad_norm": 0.5212509137546545, "kl": 0.537109375, "learning_rate": 4.870466321243523e-07, "loss": 0.0013, "reward": 2.4999940395355225, "reward_std": 2.7745242618948396e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.132124352331606, "grad_norm": 3.3409981011177874, "kl": 0.705078125, "learning_rate": 4.867875647668394e-07, "loss": 0.002, "reward": 2.4999953508377075, "reward_std": 3.2814974701977917e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 1981 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.134715025906735, "grad_norm": 0.8111611733612011, "kl": 0.662109375, "learning_rate": 4.865284974093264e-07, "loss": 0.0033, "reward": 2.499985933303833, "reward_std": 7.403614176837436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999857544898987, "step": 1982 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.137305699481865, "grad_norm": 63.32369155412085, "kl": 0.60546875, "learning_rate": 4.862694300518134e-07, "loss": 0.0026, "reward": 1.9974348545074463, "reward_std": 0.001513685521899788, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497434824705124, "step": 1983 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.139896373056994, "grad_norm": 0.6660372587918175, "kl": 0.564453125, "learning_rate": 4.860103626943005e-07, "loss": 0.0021, "reward": 2.499987840652466, "reward_std": 7.202611413958948e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879002571106, "step": 1984 }, { "clip_ratio": 0.0, "completion_length": 38.0, "epoch": 5.142487046632124, "grad_norm": 0.23981407834248672, "kl": 0.552734375, "learning_rate": 4.857512953367875e-07, "loss": 0.0025, "reward": 2.499995470046997, "reward_std": 2.96356836315681e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 1985 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.1450777202072535, "grad_norm": 0.4786382809260013, "kl": 0.583984375, "learning_rate": 4.854922279792746e-07, "loss": 0.0038, "reward": 2.499997138977051, "reward_std": 3.1016410275697126e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1986 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.147668393782383, "grad_norm": 17.66155389828702, "kl": 0.609375, "learning_rate": 4.852331606217616e-07, "loss": 0.0023, "reward": 1.9371508359909058, "reward_std": 0.17730587602272863, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.437150925397873, "step": 1987 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.150259067357513, "grad_norm": 3.718084186403918, "kl": 0.599609375, "learning_rate": 4.849740932642487e-07, "loss": 0.0021, "reward": 2.499992609024048, "reward_std": 4.490123131972723e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 1988 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.152849740932642, "grad_norm": 1.1210356869688292, "kl": 0.666015625, "learning_rate": 4.847150259067357e-07, "loss": 0.0035, "reward": 2.499992609024048, "reward_std": 6.587655946077575e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 1989 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.155440414507772, "grad_norm": 11.2979490480687, "kl": 0.591796875, "learning_rate": 4.844559585492228e-07, "loss": 0.002, "reward": 2.4997825622558594, "reward_std": 0.0004875912171087293, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997824430465698, "step": 1990 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.158031088082901, "grad_norm": 8.648241837142642, "kl": 0.630859375, "learning_rate": 4.841968911917098e-07, "loss": 0.0023, "reward": 1.4138094186782837, "reward_std": 0.0010126679517270532, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9138094782829285, "step": 1991 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.160621761658031, "grad_norm": 0.0873530934835051, "kl": 0.552734375, "learning_rate": 4.839378238341968e-07, "loss": 0.0029, "reward": 2.4999970197677612, "reward_std": 2.0420914665919554e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1992 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.16321243523316, "grad_norm": 12.88752714843587, "kl": 0.55078125, "learning_rate": 4.836787564766839e-07, "loss": 0.0021, "reward": 2.4999070167541504, "reward_std": 7.95510255784393e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999070167541504, "step": 1993 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.16580310880829, "grad_norm": 0.12974026877942899, "kl": 0.591796875, "learning_rate": 4.834196891191709e-07, "loss": 0.0027, "reward": 2.49999463558197, "reward_std": 2.578767634986434e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 1994 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.168393782383419, "grad_norm": 0.7898253116267124, "kl": 0.517578125, "learning_rate": 4.83160621761658e-07, "loss": 0.0019, "reward": 2.4999868869781494, "reward_std": 6.127030246716458e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999868869781494, "step": 1995 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.170984455958549, "grad_norm": 0.3095080281134417, "kl": 0.4833984375, "learning_rate": 4.82901554404145e-07, "loss": 0.0028, "reward": 2.4999948740005493, "reward_std": 3.607050871323736e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 1996 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.1735751295336785, "grad_norm": 0.16930763858464687, "kl": 0.5029296875, "learning_rate": 4.826424870466321e-07, "loss": 0.0032, "reward": 2.4999969005584717, "reward_std": 2.9597926527458185e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 1997 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.176165803108808, "grad_norm": 0.9421099227169808, "kl": 0.568359375, "learning_rate": 4.823834196891191e-07, "loss": 0.0025, "reward": 2.4999881982803345, "reward_std": 8.228210845118156e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 1998 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.178756476683938, "grad_norm": 0.19240014435384, "kl": 0.51953125, "learning_rate": 4.821243523316062e-07, "loss": 0.0016, "reward": 2.4999972581863403, "reward_std": 2.003538384087733e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 1999 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.181347150259067, "grad_norm": 0.4073198759209443, "kl": 0.673828125, "learning_rate": 4.818652849740932e-07, "loss": 0.0025, "reward": 2.4999908208847046, "reward_std": 4.080458779753826e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999907612800598, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.183937823834197, "grad_norm": 0.14707410223170173, "kl": 0.62109375, "learning_rate": 4.816062176165802e-07, "loss": 0.0019, "reward": 2.499996304512024, "reward_std": 2.558437302013772e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 2001 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.186528497409326, "grad_norm": 2.7757069784345916, "kl": 0.587890625, "learning_rate": 4.813471502590673e-07, "loss": 0.0027, "reward": 1.9997884035110474, "reward_std": 2.371479763496609e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997883439064026, "step": 2002 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.189119170984456, "grad_norm": 8.743527936401994, "kl": 0.58984375, "learning_rate": 4.810880829015543e-07, "loss": 0.0027, "reward": 1.8668336868286133, "reward_std": 0.0011855592308620544, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3668335676193237, "step": 2003 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.191709844559585, "grad_norm": 5.509541511382642, "kl": 0.619140625, "learning_rate": 4.808290155440415e-07, "loss": 0.0023, "reward": 1.7442642748355865, "reward_std": 0.0007097785158975967, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2442642003297806, "step": 2004 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.194300518134715, "grad_norm": 2.6798573868059536, "kl": 0.5859375, "learning_rate": 4.805699481865285e-07, "loss": 0.002, "reward": 2.499974489212036, "reward_std": 1.2569078194246686e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999744892120361, "step": 2005 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.196891191709844, "grad_norm": 11.528979777951038, "kl": 0.634765625, "learning_rate": 4.803108808290155e-07, "loss": 0.0025, "reward": 1.8211603164672852, "reward_std": 0.00030836322957839, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3211603164672852, "step": 2006 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.199481865284974, "grad_norm": 0.1680317474338068, "kl": 0.63671875, "learning_rate": 4.800518134715026e-07, "loss": 0.0028, "reward": 2.499996066093445, "reward_std": 2.094256160489749e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2007 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.2020725388601035, "grad_norm": 0.1935721991558207, "kl": 0.544921875, "learning_rate": 4.797927461139897e-07, "loss": 0.0027, "reward": 2.499997138977051, "reward_std": 3.406874895972578e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 2008 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.204663212435233, "grad_norm": 3.3897425789222724, "kl": 0.541015625, "learning_rate": 4.795336787564767e-07, "loss": 0.002, "reward": 1.986894130706787, "reward_std": 0.00023161470016930252, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.486894130706787, "step": 2009 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.2072538860103625, "grad_norm": 9.393578729394891, "kl": 0.537109375, "learning_rate": 4.792746113989637e-07, "loss": 0.0021, "reward": 1.6467503607273102, "reward_std": 0.0005640337512886617, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1467503607273102, "step": 2010 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.209844559585492, "grad_norm": 0.13384638624569697, "kl": 0.63671875, "learning_rate": 4.790155440414508e-07, "loss": 0.0027, "reward": 2.499997138977051, "reward_std": 1.9401205122449028e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2011 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.212435233160622, "grad_norm": 0.5229251623926119, "kl": 0.64453125, "learning_rate": 4.787564766839378e-07, "loss": 0.002, "reward": 2.499983310699463, "reward_std": 5.8567760561345494e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999831914901733, "step": 2012 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.215025906735751, "grad_norm": 9.778215539679904, "kl": 0.677734375, "learning_rate": 4.784974093264249e-07, "loss": 0.0031, "reward": 1.7977296113967896, "reward_std": 0.00019399417601562163, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2977294921875, "step": 2013 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 5.217616580310881, "grad_norm": 18.005151191399282, "kl": 0.548828125, "learning_rate": 4.782383419689119e-07, "loss": 0.002, "reward": 1.9995366930961609, "reward_std": 7.068459746051303e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995368123054504, "step": 2014 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.22020725388601, "grad_norm": 1.09755504572536, "kl": 0.56640625, "learning_rate": 4.779792746113989e-07, "loss": 0.002, "reward": 2.499993085861206, "reward_std": 3.998499551016721e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 2015 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.22279792746114, "grad_norm": 0.16067046088000247, "kl": 0.66015625, "learning_rate": 4.77720207253886e-07, "loss": 0.0032, "reward": 2.4999942779541016, "reward_std": 2.4087808014883194e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 2016 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.225388601036269, "grad_norm": 1.0479803946626134, "kl": 0.49609375, "learning_rate": 4.774611398963731e-07, "loss": 0.0033, "reward": 2.499995470046997, "reward_std": 6.696399395877961e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 2017 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.227979274611399, "grad_norm": 34.79884825653209, "kl": 0.65234375, "learning_rate": 4.772020725388601e-07, "loss": 0.0026, "reward": 2.310889482498169, "reward_std": 0.2609879431676063, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8108895421028137, "step": 2018 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.230569948186528, "grad_norm": 36.52252802245716, "kl": 0.681640625, "learning_rate": 4.769430051813471e-07, "loss": 0.0035, "reward": 1.99883633852005, "reward_std": 6.287482119660126e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988361299037933, "step": 2019 }, { "clip_ratio": 0.0, "completion_length": 36.8125, "epoch": 5.233160621761658, "grad_norm": 20.21383504455905, "kl": 0.6796875, "learning_rate": 4.7668393782383414e-07, "loss": 0.0033, "reward": 2.187022864818573, "reward_std": 0.25916639960360044, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6870228052139282, "step": 2020 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 5.2357512953367875, "grad_norm": 36.38491203771738, "kl": 0.6015625, "learning_rate": 4.7642487046632124e-07, "loss": 0.0024, "reward": 2.3118808269500732, "reward_std": 0.25956653331491, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8118808269500732, "step": 2021 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.238341968911917, "grad_norm": 0.6713131936462704, "kl": 0.4853515625, "learning_rate": 4.761658031088083e-07, "loss": 0.0017, "reward": 2.499993681907654, "reward_std": 4.551273036668135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 2022 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.240932642487047, "grad_norm": 0.16357305204694614, "kl": 0.607421875, "learning_rate": 4.759067357512953e-07, "loss": 0.0033, "reward": 2.4999982118606567, "reward_std": 1.1216209827580315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2023 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 5.243523316062176, "grad_norm": 18.653606918278758, "kl": 0.556640625, "learning_rate": 4.7564766839378235e-07, "loss": 0.0022, "reward": 1.5619222521781921, "reward_std": 0.17692009590973612, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0619222521781921, "step": 2024 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.246113989637306, "grad_norm": 0.26558072730962295, "kl": 0.521484375, "learning_rate": 4.7538860103626945e-07, "loss": 0.0019, "reward": 2.4999955892562866, "reward_std": 4.173862748757529e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 2025 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.248704663212435, "grad_norm": 3.427210689256347, "kl": 0.560546875, "learning_rate": 4.7512953367875645e-07, "loss": 0.001, "reward": 2.4998998641967773, "reward_std": 3.8777103554821224e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998998641967773, "step": 2026 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.251295336787565, "grad_norm": 0.4545352073886684, "kl": 0.666015625, "learning_rate": 4.748704663212435e-07, "loss": 0.0024, "reward": 2.499992251396179, "reward_std": 4.144454123888863e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 2027 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.253886010362694, "grad_norm": 0.5687053407004764, "kl": 0.5009765625, "learning_rate": 4.7461139896373056e-07, "loss": 0.0019, "reward": 2.499995708465576, "reward_std": 2.19676161350435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2028 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.256476683937824, "grad_norm": 31.663033810910193, "kl": 0.662109375, "learning_rate": 4.7435233160621756e-07, "loss": 0.0029, "reward": 2.4373068809509277, "reward_std": 0.17728418890317243, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373069405555725, "step": 2029 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.259067357512953, "grad_norm": 0.21005565386511563, "kl": 0.615234375, "learning_rate": 4.7409326424870466e-07, "loss": 0.0023, "reward": 2.4999955892562866, "reward_std": 2.315393004437283e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2030 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.261658031088083, "grad_norm": 32.008046756202965, "kl": 0.474609375, "learning_rate": 4.738341968911917e-07, "loss": 0.0019, "reward": 2.498546600341797, "reward_std": 0.0006745506852894323, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9985467791557312, "step": 2031 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.2642487046632125, "grad_norm": 6.165008863044795, "kl": 0.576171875, "learning_rate": 4.735751295336787e-07, "loss": 0.0022, "reward": 1.999077558517456, "reward_std": 5.615064799258107e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990775883197784, "step": 2032 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.266839378238342, "grad_norm": 1.4787077845878371, "kl": 0.591796875, "learning_rate": 4.7331606217616577e-07, "loss": 0.0023, "reward": 2.4999886751174927, "reward_std": 3.938819475024502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999886751174927, "step": 2033 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.269430051813472, "grad_norm": 2.242359027941659, "kl": 0.5234375, "learning_rate": 4.730569948186529e-07, "loss": 0.003, "reward": 1.9998422265052795, "reward_std": 3.223239326644034e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499842256307602, "step": 2034 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.272020725388601, "grad_norm": 0.4289348893249591, "kl": 0.580078125, "learning_rate": 4.7279792746113987e-07, "loss": 0.0025, "reward": 2.4999899864196777, "reward_std": 4.042483283228648e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898672103882, "step": 2035 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.274611398963731, "grad_norm": 1.630850965819461, "kl": 0.6015625, "learning_rate": 4.725388601036269e-07, "loss": 0.0023, "reward": 1.9994494915008545, "reward_std": 2.7383456085772195e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994495511054993, "step": 2036 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.27720207253886, "grad_norm": 24.63130218824497, "kl": 0.681640625, "learning_rate": 4.72279792746114e-07, "loss": 0.0026, "reward": 1.993932843208313, "reward_std": 0.0012224699647163106, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4939329028129578, "step": 2037 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.27979274611399, "grad_norm": 0.8264898836459835, "kl": 0.607421875, "learning_rate": 4.72020725388601e-07, "loss": 0.0028, "reward": 1.9999346733093262, "reward_std": 6.702154792037618e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999346733093262, "step": 2038 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.282383419689119, "grad_norm": 90.94590539395203, "kl": 2.591796875, "learning_rate": 4.717616580310881e-07, "loss": 0.0106, "reward": 1.9989628791809082, "reward_std": 0.00012178460656286916, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989627599716187, "step": 2039 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.284974093264249, "grad_norm": 0.14237641953238792, "kl": 0.625, "learning_rate": 4.7150259067357514e-07, "loss": 0.0038, "reward": 2.499995708465576, "reward_std": 3.3828183063633332e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 2040 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.287564766839378, "grad_norm": 1.3473306841169368, "kl": 0.625, "learning_rate": 4.7124352331606214e-07, "loss": 0.0021, "reward": 1.9998908042907715, "reward_std": 1.780945876816986e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998908936977386, "step": 2041 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.290155440414508, "grad_norm": 1.4450609281035576, "kl": 0.6015625, "learning_rate": 4.709844559585492e-07, "loss": 0.003, "reward": 2.4999920129776, "reward_std": 1.3787463672088052e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 2042 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.2927461139896375, "grad_norm": 51.100744990718255, "kl": 0.615234375, "learning_rate": 4.7072538860103624e-07, "loss": 0.0035, "reward": 1.9629690647125244, "reward_std": 0.0012731026139363166, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4629689157009125, "step": 2043 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.295336787564767, "grad_norm": 93.37210339886597, "kl": 0.60546875, "learning_rate": 4.704663212435233e-07, "loss": 0.0023, "reward": 1.9949418902397156, "reward_std": 0.00052468145054263, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4949418902397156, "step": 2044 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 5.2979274611398965, "grad_norm": 206.00466497803515, "kl": 0.5859375, "learning_rate": 4.7020725388601035e-07, "loss": 0.0023, "reward": 2.116739273071289, "reward_std": 0.23678716827089374, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6167393326759338, "step": 2045 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.300518134715026, "grad_norm": 4.82304910676857, "kl": 0.60546875, "learning_rate": 4.699481865284974e-07, "loss": 0.0026, "reward": 1.9988826513290405, "reward_std": 4.609770076058339e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988827109336853, "step": 2046 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.303108808290156, "grad_norm": 0.3885288148933926, "kl": 0.59375, "learning_rate": 4.696891191709844e-07, "loss": 0.0028, "reward": 2.4999921321868896, "reward_std": 4.208285190543393e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 2047 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.305699481865285, "grad_norm": 24.74015217272004, "kl": 0.572265625, "learning_rate": 4.694300518134715e-07, "loss": 0.0013, "reward": 2.3119778633117676, "reward_std": 0.25948782686475624, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8119779825210571, "step": 2048 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.308290155440415, "grad_norm": 22.323069421441392, "kl": 0.412109375, "learning_rate": 4.6917098445595856e-07, "loss": 0.0014, "reward": 2.4996964931488037, "reward_std": 0.00041906879391717666, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996965527534485, "step": 2049 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.310880829015544, "grad_norm": 0.1991899085096994, "kl": 0.541015625, "learning_rate": 4.6891191709844556e-07, "loss": 0.0017, "reward": 2.499998688697815, "reward_std": 8.67448250119196e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 2050 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.313471502590674, "grad_norm": 0.7850293434791397, "kl": 0.568359375, "learning_rate": 4.686528497409326e-07, "loss": 0.0035, "reward": 2.4999914169311523, "reward_std": 6.811870889578131e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 2051 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.316062176165803, "grad_norm": 8.861017233000535, "kl": 0.501953125, "learning_rate": 4.6839378238341966e-07, "loss": 0.0017, "reward": 2.2499526739120483, "reward_std": 0.2672702396263844, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499527335166931, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.318652849740933, "grad_norm": 1.9088548173148083, "kl": 0.5654296875, "learning_rate": 4.681347150259067e-07, "loss": 0.0014, "reward": 2.499991774559021, "reward_std": 4.481182799054295e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 2053 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.321243523316062, "grad_norm": 8.41369008545033, "kl": 0.658203125, "learning_rate": 4.6787564766839377e-07, "loss": 0.0022, "reward": 1.9456390738487244, "reward_std": 0.00030833890562576016, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4456391036510468, "step": 2054 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.323834196891192, "grad_norm": 0.1596845582966857, "kl": 0.54296875, "learning_rate": 4.676165803108808e-07, "loss": 0.0007, "reward": 2.4999961853027344, "reward_std": 2.471329139552836e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 2055 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.3264248704663215, "grad_norm": 20.397159342585407, "kl": 0.591796875, "learning_rate": 4.673575129533678e-07, "loss": 0.0024, "reward": 1.2947803139686584, "reward_std": 0.000791814258263912, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7947803139686584, "step": 2056 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.329015544041451, "grad_norm": 1.7382712390139916, "kl": 0.62890625, "learning_rate": 4.670984455958549e-07, "loss": 0.0025, "reward": 2.4999518394470215, "reward_std": 1.1160813500055156e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999518990516663, "step": 2057 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.331606217616581, "grad_norm": 2.2666815325894643, "kl": 0.62890625, "learning_rate": 4.66839378238342e-07, "loss": 0.0016, "reward": 1.9997564554214478, "reward_std": 1.9388552118471125e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997565746307373, "step": 2058 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.33419689119171, "grad_norm": 25.865093961752674, "kl": 0.5625, "learning_rate": 4.66580310880829e-07, "loss": 0.0026, "reward": 1.995258867740631, "reward_std": 0.0009471460199392823, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4952588081359863, "step": 2059 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.33678756476684, "grad_norm": 0.04375164719754628, "kl": 0.640625, "learning_rate": 4.6632124352331603e-07, "loss": 0.0021, "reward": 2.4999983310699463, "reward_std": 1.0892449608945753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 2060 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 5.339378238341969, "grad_norm": 18.49128414858765, "kl": 0.66015625, "learning_rate": 4.660621761658031e-07, "loss": 0.0024, "reward": 1.3398758172988892, "reward_std": 0.058837971331740846, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8398758172988892, "step": 2061 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.341968911917099, "grad_norm": 0.08585935152768848, "kl": 0.59765625, "learning_rate": 4.6580310880829014e-07, "loss": 0.003, "reward": 2.499998450279236, "reward_std": 1.5644384347979212e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 2062 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.344559585492228, "grad_norm": 0.11030597040818481, "kl": 0.5390625, "learning_rate": 4.655440414507772e-07, "loss": 0.0016, "reward": 2.499997854232788, "reward_std": 1.931144197442336e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2063 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.347150259067358, "grad_norm": 7.627775670579292, "kl": 0.626953125, "learning_rate": 4.6528497409326424e-07, "loss": 0.003, "reward": 1.9119471311569214, "reward_std": 0.0002573311467131134, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4119471311569214, "step": 2064 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.349740932642487, "grad_norm": 2.7236579824120453, "kl": 0.583984375, "learning_rate": 4.6502590673575124e-07, "loss": 0.002, "reward": 1.9901267290115356, "reward_std": 4.748160137069135e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.490126758813858, "step": 2065 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.352331606217617, "grad_norm": 0.36138375829162106, "kl": 0.537109375, "learning_rate": 4.647668393782383e-07, "loss": 0.0017, "reward": 2.49999737739563, "reward_std": 1.7440567035009735e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2066 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.3549222797927465, "grad_norm": 5.081910127774666, "kl": 0.5703125, "learning_rate": 4.645077720207254e-07, "loss": 0.0024, "reward": 2.368894577026367, "reward_std": 0.0003391225297946221, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8688945174217224, "step": 2067 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.357512953367876, "grad_norm": 1.6819953796329077, "kl": 0.595703125, "learning_rate": 4.642487046632124e-07, "loss": 0.0031, "reward": 2.499993920326233, "reward_std": 4.3583797264545865e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 2068 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.360103626943006, "grad_norm": 5.128419794263951, "kl": 0.55078125, "learning_rate": 4.6398963730569945e-07, "loss": 0.0016, "reward": 1.9994136691093445, "reward_std": 0.00010774077691166895, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499413639307022, "step": 2069 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.362694300518135, "grad_norm": 20.505858418530753, "kl": 1.08984375, "learning_rate": 4.637305699481865e-07, "loss": 0.0046, "reward": 2.437397837638855, "reward_std": 0.17703994469002282, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373979568481445, "step": 2070 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.365284974093265, "grad_norm": 0.7001159261195578, "kl": 0.580078125, "learning_rate": 4.6347150259067356e-07, "loss": 0.002, "reward": 2.4999724626541138, "reward_std": 5.342638587535475e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999725222587585, "step": 2071 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.367875647668393, "grad_norm": 0.0762820564953761, "kl": 0.576171875, "learning_rate": 4.632124352331606e-07, "loss": 0.0023, "reward": 2.499997615814209, "reward_std": 1.4083316273172386e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2072 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.370466321243523, "grad_norm": 16.62159896085187, "kl": 0.6640625, "learning_rate": 4.6295336787564766e-07, "loss": 0.0032, "reward": 1.994649589061737, "reward_std": 0.00011473281574581051, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4946495294570923, "step": 2073 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.373056994818652, "grad_norm": 0.11178991260231043, "kl": 0.677734375, "learning_rate": 4.6269430051813466e-07, "loss": 0.0023, "reward": 2.4999988079071045, "reward_std": 1.0892100164028307e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 2074 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.375647668393782, "grad_norm": 162.88173319833368, "kl": 0.5380859375, "learning_rate": 4.624352331606217e-07, "loss": 0.0022, "reward": 1.9353981614112854, "reward_std": 0.1773676525335759, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4353981018066406, "step": 2075 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.3782383419689115, "grad_norm": 0.08961375351655594, "kl": 0.5703125, "learning_rate": 4.621761658031088e-07, "loss": 0.0024, "reward": 2.4999974966049194, "reward_std": 1.2754248643886967e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2076 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.380829015544041, "grad_norm": 0.0775557185253988, "kl": 0.63671875, "learning_rate": 4.619170984455958e-07, "loss": 0.0025, "reward": 2.499998927116394, "reward_std": 1.0456015644422223e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 2077 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.383419689119171, "grad_norm": 0.09683472030426697, "kl": 0.619140625, "learning_rate": 4.616580310880829e-07, "loss": 0.0022, "reward": 2.499997138977051, "reward_std": 1.3148884363545221e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2078 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.3860103626943, "grad_norm": 2.324281154473838, "kl": 0.51171875, "learning_rate": 4.6139896373056993e-07, "loss": 0.0013, "reward": 1.999828040599823, "reward_std": 4.258999530293295e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998281002044678, "step": 2079 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.38860103626943, "grad_norm": 3.494076199446902, "kl": 0.59375, "learning_rate": 4.611398963730569e-07, "loss": 0.0021, "reward": 2.4998531341552734, "reward_std": 2.3317046725424007e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998531937599182, "step": 2080 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.391191709844559, "grad_norm": 34.00913178055726, "kl": 0.5703125, "learning_rate": 4.6088082901554403e-07, "loss": 0.0023, "reward": 1.998158574104309, "reward_std": 0.35475657880306244, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981584548950195, "step": 2081 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.393782383419689, "grad_norm": 0.5288778671320479, "kl": 0.625, "learning_rate": 4.606217616580311e-07, "loss": 0.0019, "reward": 2.4999972581863403, "reward_std": 3.0179442376265797e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2082 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.396373056994818, "grad_norm": 38.11323418392687, "kl": 0.60546875, "learning_rate": 4.603626943005181e-07, "loss": 0.0021, "reward": 2.249288558959961, "reward_std": 0.2680063200186851, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7492886781692505, "step": 2083 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.398963730569948, "grad_norm": 2.2988047611786353, "kl": 0.66015625, "learning_rate": 4.6010362694300514e-07, "loss": 0.0027, "reward": 1.9989478588104248, "reward_std": 2.7495047802972294e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989478886127472, "step": 2084 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.401554404145077, "grad_norm": 0.8077196064902931, "kl": 0.626953125, "learning_rate": 4.5984455958549224e-07, "loss": 0.0022, "reward": 2.4999901056289673, "reward_std": 7.5538323471846525e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 2085 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.404145077720207, "grad_norm": 1.378447861894592, "kl": 0.74609375, "learning_rate": 4.5958549222797924e-07, "loss": 0.0025, "reward": 1.9999540448188782, "reward_std": 9.382358257425949e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999540150165558, "step": 2086 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.4067357512953365, "grad_norm": 4.015163170245785, "kl": 0.580078125, "learning_rate": 4.593264248704663e-07, "loss": 0.0014, "reward": 2.4999797344207764, "reward_std": 1.4419170724977448e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999979853630066, "step": 2087 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.409326424870466, "grad_norm": 0.5089678495021858, "kl": 0.4833984375, "learning_rate": 4.5906735751295335e-07, "loss": 0.0014, "reward": 2.4999938011169434, "reward_std": 4.191305720269156e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 2088 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.4119170984455955, "grad_norm": 2.8632615610919285, "kl": 0.4716796875, "learning_rate": 4.5880829015544035e-07, "loss": 0.0022, "reward": 2.499985694885254, "reward_std": 2.409307637663005e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999857544898987, "step": 2089 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.414507772020725, "grad_norm": 0.14362424491263173, "kl": 0.4453125, "learning_rate": 4.5854922279792745e-07, "loss": 0.0011, "reward": 2.499995708465576, "reward_std": 2.873550783988321e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2090 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.417098445595855, "grad_norm": 55.133793653693466, "kl": 0.556640625, "learning_rate": 4.582901554404145e-07, "loss": 0.0019, "reward": 2.374717950820923, "reward_std": 0.23197395640227114, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747180700302124, "step": 2091 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.419689119170984, "grad_norm": 0.27913790743624894, "kl": 0.591796875, "learning_rate": 4.580310880829015e-07, "loss": 0.0015, "reward": 2.4999990463256836, "reward_std": 1.194757928146828e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999993443489075, "step": 2092 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.422279792746114, "grad_norm": 0.3406690019287161, "kl": 0.513671875, "learning_rate": 4.5777202072538856e-07, "loss": 0.0026, "reward": 2.4999951124191284, "reward_std": 3.1853841164775076e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 2093 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.424870466321243, "grad_norm": 15.785216947605809, "kl": 0.658203125, "learning_rate": 4.5751295336787566e-07, "loss": 0.002, "reward": 1.9924931526184082, "reward_std": 0.00011809786633421027, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4924933016300201, "step": 2094 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.427461139896373, "grad_norm": 17.715010879685455, "kl": 0.642578125, "learning_rate": 4.5725388601036266e-07, "loss": 0.0023, "reward": 1.8631353378295898, "reward_std": 0.007072416904520651, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3631353378295898, "step": 2095 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.430051813471502, "grad_norm": 1.2717046160985757, "kl": 0.552734375, "learning_rate": 4.569948186528497e-07, "loss": 0.0029, "reward": 2.499984383583069, "reward_std": 7.996441922841768e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999844431877136, "step": 2096 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.432642487046632, "grad_norm": 1.605455293440393, "kl": 0.703125, "learning_rate": 4.5673575129533677e-07, "loss": 0.003, "reward": 2.4999953508377075, "reward_std": 3.491977054181916e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2097 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.435233160621761, "grad_norm": 1.850284512043259, "kl": 0.578125, "learning_rate": 4.5647668393782377e-07, "loss": 0.0026, "reward": 2.499994397163391, "reward_std": 5.606585091300076e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 2098 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 5.437823834196891, "grad_norm": 52.79668927172298, "kl": 0.60546875, "learning_rate": 4.562176165803109e-07, "loss": 0.0024, "reward": 1.5219902396202087, "reward_std": 0.29126326138793956, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0219903886318207, "step": 2099 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.4404145077720205, "grad_norm": 3.340759304463147, "kl": 0.603515625, "learning_rate": 4.5595854922279793e-07, "loss": 0.0025, "reward": 1.9986661672592163, "reward_std": 3.571314618966426e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498666226863861, "step": 2100 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 5.44300518134715, "grad_norm": 18.809712724286577, "kl": 0.611328125, "learning_rate": 4.5569948186528493e-07, "loss": 0.0024, "reward": 1.997714877128601, "reward_std": 0.0013183222047814525, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497714877128601, "step": 2101 }, { "clip_ratio": 0.0, "completion_length": 40.4375, "epoch": 5.44559585492228, "grad_norm": 8.546424562337487, "kl": 0.578125, "learning_rate": 4.55440414507772e-07, "loss": 0.002, "reward": 2.3437331914901733, "reward_std": 0.44193710665456365, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.8749832510948181, "step": 2102 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.448186528497409, "grad_norm": 3.093420388957262, "kl": 0.8671875, "learning_rate": 4.5518134715025903e-07, "loss": 0.0039, "reward": 2.499948263168335, "reward_std": 1.8631685634318274e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999948263168335, "step": 2103 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.450777202072539, "grad_norm": 7.960340133936694, "kl": 0.63671875, "learning_rate": 4.549222797927461e-07, "loss": 0.0024, "reward": 1.8650342226028442, "reward_std": 0.24860556428211567, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3650343120098114, "step": 2104 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.453367875647668, "grad_norm": 2.5116592944646006, "kl": 0.64453125, "learning_rate": 4.5466321243523314e-07, "loss": 0.0028, "reward": 1.9993535280227661, "reward_std": 3.119333882750652e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993534982204437, "step": 2105 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.455958549222798, "grad_norm": 0.07835816693150476, "kl": 0.533203125, "learning_rate": 4.544041450777202e-07, "loss": 0.0029, "reward": 2.499998927116394, "reward_std": 8.099165142994025e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 2106 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.458549222797927, "grad_norm": 0.923285844264546, "kl": 0.60546875, "learning_rate": 4.541450777202072e-07, "loss": 0.0031, "reward": 2.499990940093994, "reward_std": 8.668633654451696e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908804893494, "step": 2107 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 5.461139896373057, "grad_norm": 24.101800861541953, "kl": 0.5458984375, "learning_rate": 4.538860103626943e-07, "loss": 0.0029, "reward": 2.437446355819702, "reward_std": 0.1769175256949893, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374462366104126, "step": 2108 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.463730569948186, "grad_norm": 16.13979640996191, "kl": 0.576171875, "learning_rate": 4.5362694300518135e-07, "loss": 0.0029, "reward": 2.4990875720977783, "reward_std": 0.0009680720117444253, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9990875124931335, "step": 2109 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.466321243523316, "grad_norm": 0.24031115360858563, "kl": 0.697265625, "learning_rate": 4.5336787564766835e-07, "loss": 0.0034, "reward": 2.4999935626983643, "reward_std": 2.5406120016668865e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 2110 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.4689119170984455, "grad_norm": 12.030676190043021, "kl": 0.677734375, "learning_rate": 4.531088082901554e-07, "loss": 0.0025, "reward": 1.999133288860321, "reward_std": 3.672354534955957e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991333484649658, "step": 2111 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 5.471502590673575, "grad_norm": 51.19160187356243, "kl": 1.2265625, "learning_rate": 4.5284974093264245e-07, "loss": 0.0064, "reward": 2.4999953508377075, "reward_std": 6.952963303774595e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 2112 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.474093264248705, "grad_norm": 3.842299416034242, "kl": 0.599609375, "learning_rate": 4.5259067357512956e-07, "loss": 0.0024, "reward": 2.499972343444824, "reward_std": 1.998252491830499e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999972403049469, "step": 2113 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.476683937823834, "grad_norm": 4.426029162149632, "kl": 0.625, "learning_rate": 4.5233160621761656e-07, "loss": 0.0025, "reward": 2.4988327026367188, "reward_std": 3.808872816080111e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.998832643032074, "step": 2114 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.479274611398964, "grad_norm": 17.575883663990453, "kl": 0.533203125, "learning_rate": 4.520725388601036e-07, "loss": 0.002, "reward": 1.9952016472816467, "reward_std": 0.0005358290783306074, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4952015280723572, "step": 2115 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.481865284974093, "grad_norm": 0.4048142747427771, "kl": 0.5810546875, "learning_rate": 4.5181347150259066e-07, "loss": 0.0031, "reward": 2.4999942779541016, "reward_std": 3.7471388623089297e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 2116 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.484455958549223, "grad_norm": 0.14316342765385484, "kl": 0.580078125, "learning_rate": 4.515544041450777e-07, "loss": 0.0029, "reward": 2.4999990463256836, "reward_std": 6.802391681048903e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 2117 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.487046632124352, "grad_norm": 12.954317312591515, "kl": 0.56640625, "learning_rate": 4.5129533678756477e-07, "loss": 0.0022, "reward": 1.9762251377105713, "reward_std": 0.00027579056973081606, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4762250781059265, "step": 2118 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.489637305699482, "grad_norm": 0.6315118460909692, "kl": 0.599609375, "learning_rate": 4.510362694300518e-07, "loss": 0.0031, "reward": 2.499990940093994, "reward_std": 3.6023498068971094e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999907612800598, "step": 2119 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.492227979274611, "grad_norm": 16.46808890010898, "kl": 0.55859375, "learning_rate": 4.507772020725388e-07, "loss": 0.002, "reward": 1.9998152256011963, "reward_std": 3.675452848028726e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499815285205841, "step": 2120 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.494818652849741, "grad_norm": 53.880169103801826, "kl": 0.60546875, "learning_rate": 4.505181347150259e-07, "loss": 0.0029, "reward": 2.435734748840332, "reward_std": 0.18149448120908573, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9357348084449768, "step": 2121 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.4974093264248705, "grad_norm": 0.1898586578531636, "kl": 0.603515625, "learning_rate": 4.50259067357513e-07, "loss": 0.0021, "reward": 2.4999983310699463, "reward_std": 1.6392335169257422e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2122 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.5, "grad_norm": 1.081750328473616, "kl": 0.58203125, "learning_rate": 4.5e-07, "loss": 0.0026, "reward": 2.499993324279785, "reward_std": 3.354904663410707e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 2123 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 5.5025906735751295, "grad_norm": 15.513208445898432, "kl": 0.7890625, "learning_rate": 4.4974093264248703e-07, "loss": 0.0027, "reward": 2.3436906337738037, "reward_std": 0.2894253993229654, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8436909317970276, "step": 2124 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.505181347150259, "grad_norm": 0.9220185657585894, "kl": 0.55859375, "learning_rate": 4.494818652849741e-07, "loss": 0.0026, "reward": 2.499982237815857, "reward_std": 6.199550284691213e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982237815857, "step": 2125 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.507772020725389, "grad_norm": 0.24139717188607474, "kl": 0.61328125, "learning_rate": 4.492227979274611e-07, "loss": 0.0033, "reward": 2.499994158744812, "reward_std": 2.8356545271890354e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 2126 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.510362694300518, "grad_norm": 0.05078577594543837, "kl": 0.609375, "learning_rate": 4.489637305699482e-07, "loss": 0.0018, "reward": 2.499998927116394, "reward_std": 6.669413892268494e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 2127 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.512953367875648, "grad_norm": 1.951115089824087, "kl": 0.62109375, "learning_rate": 4.4870466321243524e-07, "loss": 0.0024, "reward": 1.9919214248657227, "reward_std": 3.884211102445079e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.491921454668045, "step": 2128 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.515544041450777, "grad_norm": 0.49627224393649144, "kl": 0.626953125, "learning_rate": 4.4844559585492224e-07, "loss": 0.0031, "reward": 2.499989628791809, "reward_std": 3.981983411449619e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895095825195, "step": 2129 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 5.518134715025907, "grad_norm": 15.290964655740366, "kl": 0.70703125, "learning_rate": 4.481865284974093e-07, "loss": 0.003, "reward": 2.4999688863754272, "reward_std": 6.461364523602242e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999686479568481, "step": 2130 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.520725388601036, "grad_norm": 0.5043602366496162, "kl": 0.556640625, "learning_rate": 4.479274611398964e-07, "loss": 0.0014, "reward": 2.499994993209839, "reward_std": 6.354811148412409e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 2131 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.523316062176166, "grad_norm": 0.3021764452094086, "kl": 0.576171875, "learning_rate": 4.476683937823834e-07, "loss": 0.0036, "reward": 2.499994158744812, "reward_std": 4.0871573219192214e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 2132 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.525906735751295, "grad_norm": 10.281237845463494, "kl": 0.572265625, "learning_rate": 4.4740932642487045e-07, "loss": 0.0026, "reward": 2.0617260336875916, "reward_std": 0.17708256005295198, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5617260336875916, "step": 2133 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.528497409326425, "grad_norm": 0.17408388973001396, "kl": 0.5078125, "learning_rate": 4.471502590673575e-07, "loss": 0.0017, "reward": 2.49999737739563, "reward_std": 2.3988833390831132e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2134 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.5310880829015545, "grad_norm": 0.41786312004045073, "kl": 0.63671875, "learning_rate": 4.468911917098445e-07, "loss": 0.0037, "reward": 2.4999966621398926, "reward_std": 1.7078439213946695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2135 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.533678756476684, "grad_norm": 0.12728986750924165, "kl": 0.564453125, "learning_rate": 4.466321243523316e-07, "loss": 0.0021, "reward": 2.499994993209839, "reward_std": 2.0178265458525857e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 2136 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 5.536269430051814, "grad_norm": 101.91084905176297, "kl": 0.560546875, "learning_rate": 4.4637305699481866e-07, "loss": 0.0018, "reward": 2.4370334148406982, "reward_std": 0.17808434657843009, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9370335340499878, "step": 2137 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.538860103626943, "grad_norm": 6.832191046218927, "kl": 0.576171875, "learning_rate": 4.4611398963730566e-07, "loss": 0.0012, "reward": 1.931475043296814, "reward_std": 0.00021008539738431864, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4314753115177155, "step": 2138 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.541450777202073, "grad_norm": 0.1303785512664684, "kl": 0.572265625, "learning_rate": 4.458549222797927e-07, "loss": 0.0036, "reward": 2.499998450279236, "reward_std": 1.4220949822174589e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 2139 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.544041450777202, "grad_norm": 2.798450457414117, "kl": 0.576171875, "learning_rate": 4.4559585492227977e-07, "loss": 0.0024, "reward": 1.9987863302230835, "reward_std": 2.4533016130590113e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987864792346954, "step": 2140 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.546632124352332, "grad_norm": 4.613275598537315, "kl": 0.66796875, "learning_rate": 4.453367875647668e-07, "loss": 0.0033, "reward": 2.4999505281448364, "reward_std": 4.116662103115232e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999505281448364, "step": 2141 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.549222797927461, "grad_norm": 3.910269599518884, "kl": 0.654296875, "learning_rate": 4.450777202072539e-07, "loss": 0.0021, "reward": 1.9191782474517822, "reward_std": 0.00017609039048238628, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.419178307056427, "step": 2142 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.551813471502591, "grad_norm": 0.563248452097465, "kl": 0.662109375, "learning_rate": 4.4481865284974093e-07, "loss": 0.0027, "reward": 2.4999866485595703, "reward_std": 6.185223639931792e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999867677688599, "step": 2143 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.55440414507772, "grad_norm": 0.236988035870309, "kl": 0.626953125, "learning_rate": 4.4455958549222793e-07, "loss": 0.0031, "reward": 2.499995231628418, "reward_std": 2.3343900465988554e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 2144 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.55699481865285, "grad_norm": 1.6595071120536327, "kl": 0.669921875, "learning_rate": 4.4430051813471503e-07, "loss": 0.0031, "reward": 2.4999934434890747, "reward_std": 4.7362771056214115e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 2145 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.5595854922279795, "grad_norm": 0.29887078974071185, "kl": 0.53515625, "learning_rate": 4.440414507772021e-07, "loss": 0.002, "reward": 2.499996542930603, "reward_std": 2.30011636404015e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2146 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.562176165803109, "grad_norm": 0.34048998929354135, "kl": 0.568359375, "learning_rate": 4.437823834196891e-07, "loss": 0.0024, "reward": 2.4999935626983643, "reward_std": 3.625597514655965e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 2147 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.564766839378239, "grad_norm": 0.391680935815252, "kl": 0.5341796875, "learning_rate": 4.4352331606217614e-07, "loss": 0.0032, "reward": 2.4999947547912598, "reward_std": 3.7932162513243384e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 2148 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.567357512953368, "grad_norm": 0.9410732729197439, "kl": 0.619140625, "learning_rate": 4.432642487046632e-07, "loss": 0.0019, "reward": 2.4999927282333374, "reward_std": 3.534173970365373e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 2149 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 5.569948186528498, "grad_norm": 47.46912722126846, "kl": 0.64453125, "learning_rate": 4.4300518134715024e-07, "loss": 0.0035, "reward": 1.9643234610557556, "reward_std": 0.021152627428790538, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4643233716487885, "step": 2150 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.572538860103627, "grad_norm": 10.070877331140585, "kl": 0.576171875, "learning_rate": 4.427461139896373e-07, "loss": 0.0013, "reward": 2.499990701675415, "reward_std": 6.061561066417198e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 2151 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.575129533678757, "grad_norm": 8.679077148044042, "kl": 0.669921875, "learning_rate": 4.4248704663212435e-07, "loss": 0.0021, "reward": 2.437474012374878, "reward_std": 0.17679464732282213, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937474012374878, "step": 2152 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.577720207253886, "grad_norm": 6.715685130529924, "kl": 0.58203125, "learning_rate": 4.4222797927461135e-07, "loss": 0.002, "reward": 1.9975281953811646, "reward_std": 0.00016252085694645757, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975283443927765, "step": 2153 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.580310880829016, "grad_norm": 0.20935759876984136, "kl": 0.5234375, "learning_rate": 4.4196891191709845e-07, "loss": 0.0034, "reward": 2.49999737739563, "reward_std": 1.958033237769996e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2154 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.582901554404145, "grad_norm": 13.20143306003385, "kl": 0.4853515625, "learning_rate": 4.417098445595855e-07, "loss": 0.0021, "reward": 1.9996974468231201, "reward_std": 0.0001438166032130539, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996974766254425, "step": 2155 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.585492227979275, "grad_norm": 10.139894068808987, "kl": 0.599609375, "learning_rate": 4.414507772020725e-07, "loss": 0.0021, "reward": 2.4988938570022583, "reward_std": 0.0014072588678573084, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9988938570022583, "step": 2156 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.5880829015544045, "grad_norm": 68.70833382902809, "kl": 0.6015625, "learning_rate": 4.4119170984455956e-07, "loss": 0.0024, "reward": 1.9983690977096558, "reward_std": 0.3544410914182663, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498369038105011, "step": 2157 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.590673575129534, "grad_norm": 0.3706771844027531, "kl": 0.578125, "learning_rate": 4.409326424870466e-07, "loss": 0.0012, "reward": 2.499990463256836, "reward_std": 2.8434129148990905e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999907612800598, "step": 2158 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.5932642487046635, "grad_norm": 0.735284547213945, "kl": 0.6484375, "learning_rate": 4.4067357512953366e-07, "loss": 0.0013, "reward": 2.4999799728393555, "reward_std": 6.005983664181258e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999802112579346, "step": 2159 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.595854922279793, "grad_norm": 0.48807826455513514, "kl": 0.57421875, "learning_rate": 4.404145077720207e-07, "loss": 0.0035, "reward": 2.4999979734420776, "reward_std": 1.5453558575018178e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2160 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.598445595854923, "grad_norm": 0.12223566450142694, "kl": 0.634765625, "learning_rate": 4.4015544041450777e-07, "loss": 0.002, "reward": 2.4999955892562866, "reward_std": 1.9507425577103277e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 2161 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.601036269430052, "grad_norm": 0.18031256873683196, "kl": 0.66796875, "learning_rate": 4.3989637305699477e-07, "loss": 0.0019, "reward": 2.49999737739563, "reward_std": 1.7048872109626245e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 2162 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.603626943005182, "grad_norm": 17.89022813435865, "kl": 0.517578125, "learning_rate": 4.396373056994818e-07, "loss": 0.0015, "reward": 1.997665286064148, "reward_std": 0.0003555154661398774, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4976653754711151, "step": 2163 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.606217616580311, "grad_norm": 0.15864624505138553, "kl": 0.6015625, "learning_rate": 4.3937823834196893e-07, "loss": 0.0015, "reward": 2.499998688697815, "reward_std": 9.450778577502206e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 2164 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.608808290155441, "grad_norm": 0.32388581267757965, "kl": 0.537109375, "learning_rate": 4.3911917098445593e-07, "loss": 0.0019, "reward": 2.4999688863754272, "reward_std": 5.681996867679118e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999690055847168, "step": 2165 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.61139896373057, "grad_norm": 0.1207503744340713, "kl": 0.568359375, "learning_rate": 4.38860103626943e-07, "loss": 0.0008, "reward": 2.499997615814209, "reward_std": 1.7324174450550345e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2166 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.6139896373057, "grad_norm": 0.1085866912561129, "kl": 0.603515625, "learning_rate": 4.3860103626943003e-07, "loss": 0.0026, "reward": 2.4999966621398926, "reward_std": 1.2707228052022401e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 2167 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.616580310880829, "grad_norm": 4.630161957316154, "kl": 0.6484375, "learning_rate": 4.383419689119171e-07, "loss": 0.0022, "reward": 1.9988139867782593, "reward_std": 3.455091291471035e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988139867782593, "step": 2168 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.619170984455959, "grad_norm": 4.163114909011051, "kl": 0.58203125, "learning_rate": 4.3808290155440414e-07, "loss": 0.0031, "reward": 2.4999775886535645, "reward_std": 2.13679157354818e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999775886535645, "step": 2169 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.6217616580310885, "grad_norm": 7.29823493877786, "kl": 0.4580078125, "learning_rate": 4.378238341968912e-07, "loss": 0.0016, "reward": 2.499937653541565, "reward_std": 4.983820250004101e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999375939369202, "step": 2170 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.624352331606218, "grad_norm": 15.152487096265524, "kl": 0.58984375, "learning_rate": 4.375647668393782e-07, "loss": 0.0025, "reward": 2.4357879161834717, "reward_std": 0.1815943693529789, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9357876181602478, "step": 2171 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 5.626943005181348, "grad_norm": 0.12784266707379652, "kl": 0.59375, "learning_rate": 4.3730569948186524e-07, "loss": 0.0019, "reward": 2.4999961853027344, "reward_std": 2.2044829961487267e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2172 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.629533678756477, "grad_norm": 1.737365819724787, "kl": 0.58203125, "learning_rate": 4.3704663212435235e-07, "loss": 0.0021, "reward": 2.4999741315841675, "reward_std": 1.1470683517700309e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999974250793457, "step": 2173 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.632124352331607, "grad_norm": 146.9006266794109, "kl": 0.57421875, "learning_rate": 4.3678756476683935e-07, "loss": 0.0024, "reward": 1.9014395475387573, "reward_std": 0.007950311124204745, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4014394283294678, "step": 2174 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.634715025906736, "grad_norm": 6.765509672921789, "kl": 0.654296875, "learning_rate": 4.365284974093264e-07, "loss": 0.0029, "reward": 1.9989939332008362, "reward_std": 0.00012186400999780744, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989938735961914, "step": 2175 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.637305699481866, "grad_norm": 0.2187443247538006, "kl": 0.5048828125, "learning_rate": 4.3626943005181345e-07, "loss": 0.001, "reward": 2.4999942779541016, "reward_std": 3.5747434878885542e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 2176 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.639896373056995, "grad_norm": 2.401083211747061, "kl": 0.55078125, "learning_rate": 4.3601036269430045e-07, "loss": 0.0005, "reward": 2.4999818801879883, "reward_std": 1.4644616243231212e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999821186065674, "step": 2177 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.642487046632124, "grad_norm": 0.19834292625319866, "kl": 0.677734375, "learning_rate": 4.3575129533678756e-07, "loss": 0.0028, "reward": 2.499997138977051, "reward_std": 3.2076026172944694e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2178 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.6450777202072535, "grad_norm": 0.3521104426567708, "kl": 0.677734375, "learning_rate": 4.354922279792746e-07, "loss": 0.0029, "reward": 2.499996781349182, "reward_std": 4.390677418086852e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 2179 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.647668393782383, "grad_norm": 3.048021414513414, "kl": 0.525390625, "learning_rate": 4.352331606217616e-07, "loss": 0.0027, "reward": 2.4999783039093018, "reward_std": 2.1690430230592028e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999978244304657, "step": 2180 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.650259067357513, "grad_norm": 1.5839717545620293, "kl": 0.626953125, "learning_rate": 4.3497409326424866e-07, "loss": 0.0027, "reward": 1.999333381652832, "reward_std": 3.958182185215264e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993332028388977, "step": 2181 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.652849740932642, "grad_norm": 79.12772461546489, "kl": 0.63671875, "learning_rate": 4.3471502590673577e-07, "loss": 0.0025, "reward": 1.7733708024024963, "reward_std": 0.1783041607704945, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2733709216117859, "step": 2182 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.655440414507772, "grad_norm": 8.016416951171216, "kl": 0.62109375, "learning_rate": 4.3445595854922277e-07, "loss": 0.0024, "reward": 1.9994497299194336, "reward_std": 7.633250413618953e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994497895240784, "step": 2183 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.658031088082901, "grad_norm": 20.744348692603673, "kl": 0.587890625, "learning_rate": 4.341968911917098e-07, "loss": 0.0029, "reward": 1.9754235744476318, "reward_std": 0.0025765437520703927, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4754235744476318, "step": 2184 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.660621761658031, "grad_norm": 0.3331253444567791, "kl": 0.556640625, "learning_rate": 4.339378238341969e-07, "loss": 0.0022, "reward": 2.499997854232788, "reward_std": 2.253433933674387e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2185 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 5.66321243523316, "grad_norm": 0.37455056370598017, "kl": 0.63671875, "learning_rate": 4.336787564766839e-07, "loss": 0.0026, "reward": 2.4999945163726807, "reward_std": 3.571464560536697e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 2186 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.66580310880829, "grad_norm": 2.240968328821705, "kl": 0.708984375, "learning_rate": 4.33419689119171e-07, "loss": 0.0035, "reward": 1.9995262622833252, "reward_std": 3.4765404393510835e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995260834693909, "step": 2187 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.668393782383419, "grad_norm": 1.0338743823800902, "kl": 0.607421875, "learning_rate": 4.3316062176165803e-07, "loss": 0.0026, "reward": 2.499996066093445, "reward_std": 3.389849553059321e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2188 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.670984455958549, "grad_norm": 0.28816857631460624, "kl": 0.55859375, "learning_rate": 4.3290155440414503e-07, "loss": 0.0016, "reward": 2.49999737739563, "reward_std": 2.2634075094174477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2189 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.6735751295336785, "grad_norm": 3.638972151594444, "kl": 0.650390625, "learning_rate": 4.326424870466321e-07, "loss": 0.0031, "reward": 1.9722049236297607, "reward_std": 0.00013099318229592427, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4722049236297607, "step": 2190 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.676165803108808, "grad_norm": 3.664788568180686, "kl": 0.56640625, "learning_rate": 4.323834196891192e-07, "loss": 0.0016, "reward": 1.9987136125564575, "reward_std": 3.8688379390805494e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987136721611023, "step": 2191 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.678756476683938, "grad_norm": 7.713215938383101, "kl": 0.5146484375, "learning_rate": 4.321243523316062e-07, "loss": 0.0031, "reward": 2.3124687671661377, "reward_std": 0.25881342299737753, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812468707561493, "step": 2192 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.681347150259067, "grad_norm": 31.66227663465781, "kl": 0.728515625, "learning_rate": 4.3186528497409324e-07, "loss": 0.0028, "reward": 1.4985071420669556, "reward_std": 6.849597593827639e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9985070824623108, "step": 2193 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.683937823834197, "grad_norm": 15.72548294738737, "kl": 0.619140625, "learning_rate": 4.316062176165803e-07, "loss": 0.0022, "reward": 1.9355891942977905, "reward_std": 0.00031059995546911523, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.43558931350708, "step": 2194 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.686528497409326, "grad_norm": 0.32702084182897545, "kl": 0.515625, "learning_rate": 4.313471502590673e-07, "loss": 0.0013, "reward": 2.49999737739563, "reward_std": 1.724762512367306e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2195 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.689119170984456, "grad_norm": 0.042416052722994216, "kl": 0.63671875, "learning_rate": 4.310880829015544e-07, "loss": 0.0021, "reward": 2.499998688697815, "reward_std": 1.1979638827597228e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 2196 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.691709844559585, "grad_norm": 15.509808232753123, "kl": 0.5703125, "learning_rate": 4.3082901554404145e-07, "loss": 0.0021, "reward": 2.374963164329529, "reward_std": 0.23151260914829663, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749631643295288, "step": 2197 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.694300518134715, "grad_norm": 5.157459691709245, "kl": 0.779296875, "learning_rate": 4.3056994818652845e-07, "loss": 0.0027, "reward": 1.9981765747070312, "reward_std": 0.0001294731284815498, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498176634311676, "step": 2198 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.696891191709844, "grad_norm": 3.2911541441690932, "kl": 0.63671875, "learning_rate": 4.303108808290155e-07, "loss": 0.0032, "reward": 2.4999773502349854, "reward_std": 1.3626903864860651e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999977171421051, "step": 2199 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.699481865284974, "grad_norm": 15.327834005995316, "kl": 0.587890625, "learning_rate": 4.3005181347150256e-07, "loss": 0.0022, "reward": 2.062413215637207, "reward_std": 0.17680949806629087, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562413215637207, "step": 2200 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.7020725388601035, "grad_norm": 0.26666382199387006, "kl": 0.630859375, "learning_rate": 4.297927461139896e-07, "loss": 0.0015, "reward": 2.499994993209839, "reward_std": 3.465418103587581e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 2201 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.704663212435233, "grad_norm": 0.6182088839346882, "kl": 0.419921875, "learning_rate": 4.2953367875647666e-07, "loss": 0.0022, "reward": 1.999931275844574, "reward_std": 9.895292521377996e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999311566352844, "step": 2202 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.7072538860103625, "grad_norm": 0.07259140641402627, "kl": 0.498046875, "learning_rate": 4.292746113989637e-07, "loss": 0.002, "reward": 2.4999955892562866, "reward_std": 1.5649401348127867e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 2203 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.709844559585492, "grad_norm": 0.2932653010377114, "kl": 0.564453125, "learning_rate": 4.290155440414507e-07, "loss": 0.0023, "reward": 2.4999945163726807, "reward_std": 2.450577994750347e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 2204 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.712435233160622, "grad_norm": 0.3040234564157372, "kl": 0.638671875, "learning_rate": 4.287564766839378e-07, "loss": 0.0039, "reward": 2.4999982118606567, "reward_std": 1.6556171544834797e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2205 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.715025906735751, "grad_norm": 0.8276271504692786, "kl": 0.86328125, "learning_rate": 4.284974093264249e-07, "loss": 0.0046, "reward": 2.49999737739563, "reward_std": 2.5373722678523336e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2206 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.717616580310881, "grad_norm": 0.5324611854974537, "kl": 0.6796875, "learning_rate": 4.282383419689119e-07, "loss": 0.0021, "reward": 2.4999964237213135, "reward_std": 2.235411557194311e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 2207 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.72020725388601, "grad_norm": 38.99454355067759, "kl": 0.513671875, "learning_rate": 4.2797927461139893e-07, "loss": 0.0026, "reward": 2.499583601951599, "reward_std": 0.0005640287740789063, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995834231376648, "step": 2208 }, { "clip_ratio": 0.0, "completion_length": 38.3125, "epoch": 5.72279792746114, "grad_norm": 6.068855485462288, "kl": 0.908203125, "learning_rate": 4.27720207253886e-07, "loss": 0.0047, "reward": 2.4999812841415405, "reward_std": 2.889602683353587e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999981164932251, "step": 2209 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.725388601036269, "grad_norm": 0.43261198214304497, "kl": 0.556640625, "learning_rate": 4.2746113989637303e-07, "loss": 0.0019, "reward": 2.4999938011169434, "reward_std": 3.8641328501398675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 2210 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 5.727979274611399, "grad_norm": 24.460818305681546, "kl": 0.61328125, "learning_rate": 4.272020725388601e-07, "loss": 0.0033, "reward": 2.0568240880966187, "reward_std": 0.36697188029887684, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5568240880966187, "step": 2211 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.730569948186528, "grad_norm": 0.2246039551841444, "kl": 0.517578125, "learning_rate": 4.2694300518134714e-07, "loss": 0.0028, "reward": 2.499996304512024, "reward_std": 2.712076934585639e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2212 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.733160621761658, "grad_norm": 0.10513611245061655, "kl": 0.6015625, "learning_rate": 4.2668393782383414e-07, "loss": 0.0027, "reward": 2.4999918937683105, "reward_std": 3.3397703873561113e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 2213 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 5.7357512953367875, "grad_norm": 15.538345886009557, "kl": 0.591796875, "learning_rate": 4.2642487046632124e-07, "loss": 0.0027, "reward": 2.2719736099243164, "reward_std": 0.31470449902309383, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.771973729133606, "step": 2214 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.738341968911917, "grad_norm": 0.7378060191228873, "kl": 0.560546875, "learning_rate": 4.261658031088083e-07, "loss": 0.0024, "reward": 2.499995231628418, "reward_std": 3.801412162829365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 2215 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.740932642487047, "grad_norm": 0.5160644792626434, "kl": 0.533203125, "learning_rate": 4.259067357512953e-07, "loss": 0.002, "reward": 2.499990701675415, "reward_std": 8.735602250453667e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999907612800598, "step": 2216 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.743523316062176, "grad_norm": 0.40197497105519764, "kl": 0.572265625, "learning_rate": 4.2564766839378235e-07, "loss": 0.0031, "reward": 2.4999945163726807, "reward_std": 4.899395435131737e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 2217 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.746113989637306, "grad_norm": 80.90347262445566, "kl": 0.607421875, "learning_rate": 4.253886010362694e-07, "loss": 0.0026, "reward": 1.8977924585342407, "reward_std": 0.24332564580299731, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3977922797203064, "step": 2218 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.748704663212435, "grad_norm": 2.1220979822979436, "kl": 0.623046875, "learning_rate": 4.2512953367875645e-07, "loss": 0.0015, "reward": 1.954761266708374, "reward_std": 6.067976755730342e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4547613859176636, "step": 2219 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 5.751295336787565, "grad_norm": 1.456874397486117, "kl": 0.7578125, "learning_rate": 4.248704663212435e-07, "loss": 0.0029, "reward": 2.4999899864196777, "reward_std": 7.284868388524046e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898076057434, "step": 2220 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.753886010362694, "grad_norm": 0.20984683015127917, "kl": 0.669921875, "learning_rate": 4.2461139896373056e-07, "loss": 0.0018, "reward": 2.4999982118606567, "reward_std": 2.1430693664115097e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2221 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.756476683937824, "grad_norm": 0.12144123355692182, "kl": 0.55078125, "learning_rate": 4.2435233160621756e-07, "loss": 0.0029, "reward": 2.4999947547912598, "reward_std": 2.2540745590049482e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2222 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.759067357512953, "grad_norm": 0.42641571148282414, "kl": 0.66015625, "learning_rate": 4.240932642487046e-07, "loss": 0.0034, "reward": 2.4999945163726807, "reward_std": 2.4692184297236963e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 2223 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.761658031088083, "grad_norm": 334.4274808567088, "kl": 0.654296875, "learning_rate": 4.238341968911917e-07, "loss": 0.0026, "reward": 1.9811193943023682, "reward_std": 0.011505160172191609, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.481119453907013, "step": 2224 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.7642487046632125, "grad_norm": 0.5377188016413919, "kl": 0.533203125, "learning_rate": 4.235751295336787e-07, "loss": 0.0023, "reward": 2.499994397163391, "reward_std": 2.9941925845378137e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 2225 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.766839378238342, "grad_norm": 4.347513178815866, "kl": 0.712890625, "learning_rate": 4.2331606217616577e-07, "loss": 0.003, "reward": 1.7288439869880676, "reward_std": 0.00025027684864653565, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2288438230752945, "step": 2226 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.769430051813472, "grad_norm": 0.1956882502888999, "kl": 0.615234375, "learning_rate": 4.230569948186528e-07, "loss": 0.0032, "reward": 2.499996304512024, "reward_std": 3.4979291285708314e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2227 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.772020725388601, "grad_norm": 6.76886151649005, "kl": 0.58203125, "learning_rate": 4.2279792746113993e-07, "loss": 0.0025, "reward": 1.8828508853912354, "reward_std": 0.00032669794745743275, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3828508853912354, "step": 2228 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.774611398963731, "grad_norm": 0.09151317459200689, "kl": 0.60546875, "learning_rate": 4.2253886010362693e-07, "loss": 0.0026, "reward": 2.4999970197677612, "reward_std": 1.7982841882258072e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 2229 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.77720207253886, "grad_norm": 11.423018751357985, "kl": 0.6484375, "learning_rate": 4.22279792746114e-07, "loss": 0.0022, "reward": 2.499912977218628, "reward_std": 2.7084088060291833e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999912977218628, "step": 2230 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.77979274611399, "grad_norm": 43.744054701425306, "kl": 0.578125, "learning_rate": 4.22020725388601e-07, "loss": 0.0023, "reward": 2.3122609853744507, "reward_std": 0.2591033969434591, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122610449790955, "step": 2231 }, { "clip_ratio": 0.0, "completion_length": 49.9375, "epoch": 5.782383419689119, "grad_norm": 6.601086022135714, "kl": 0.396484375, "learning_rate": 4.2176165803108803e-07, "loss": 0.0013, "reward": 1.987928867340088, "reward_std": 0.0003471795289442525, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.487928867340088, "step": 2232 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.784974093264249, "grad_norm": 50.64751301265519, "kl": 0.6103515625, "learning_rate": 4.2150259067357514e-07, "loss": 0.0024, "reward": 1.9967319965362549, "reward_std": 0.001146038774209046, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4967319965362549, "step": 2233 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.787564766839378, "grad_norm": 0.785570443359667, "kl": 0.57421875, "learning_rate": 4.212435233160622e-07, "loss": 0.0026, "reward": 2.4999865293502808, "reward_std": 5.934768182669359e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986469745636, "step": 2234 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.790155440414508, "grad_norm": 2.014045671912073, "kl": 0.5859375, "learning_rate": 4.209844559585492e-07, "loss": 0.0014, "reward": 2.4999921321868896, "reward_std": 4.970487282207614e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 2235 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 5.7927461139896375, "grad_norm": 0.4356614327635853, "kl": 0.4931640625, "learning_rate": 4.2072538860103624e-07, "loss": 0.0021, "reward": 2.4999945163726807, "reward_std": 3.0882305850354896e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 2236 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.795336787564767, "grad_norm": 27.121344293360334, "kl": 0.556640625, "learning_rate": 4.2046632124352324e-07, "loss": 0.0025, "reward": 1.7953886985778809, "reward_std": 0.01272082473468572, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2953885793685913, "step": 2237 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.7979274611398965, "grad_norm": 0.8777437645095614, "kl": 0.53125, "learning_rate": 4.2020725388601035e-07, "loss": 0.0023, "reward": 2.499992847442627, "reward_std": 1.0630372344166972e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 2238 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.800518134715026, "grad_norm": 0.37940254270873414, "kl": 0.63671875, "learning_rate": 4.199481865284974e-07, "loss": 0.0017, "reward": 2.499994397163391, "reward_std": 3.7740051084256265e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 2239 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.803108808290156, "grad_norm": 0.09604475572693989, "kl": 0.615234375, "learning_rate": 4.1968911917098445e-07, "loss": 0.0025, "reward": 2.4999964237213135, "reward_std": 2.10905835729136e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2240 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.805699481865285, "grad_norm": 2.390699779486656, "kl": 0.65625, "learning_rate": 4.1943005181347145e-07, "loss": 0.0032, "reward": 2.4999704360961914, "reward_std": 1.1854830376023529e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999704360961914, "step": 2241 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.808290155440415, "grad_norm": 0.19254112049485217, "kl": 0.626953125, "learning_rate": 4.1917098445595856e-07, "loss": 0.0038, "reward": 2.499997615814209, "reward_std": 1.2437956229405245e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2242 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.810880829015544, "grad_norm": 4.549101061445773, "kl": 0.9501953125, "learning_rate": 4.189119170984456e-07, "loss": 0.0038, "reward": 2.4999953508377075, "reward_std": 2.7874389729731774e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 2243 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.813471502590674, "grad_norm": 0.28215174622795874, "kl": 0.611328125, "learning_rate": 4.186528497409326e-07, "loss": 0.0026, "reward": 2.499992847442627, "reward_std": 5.852658773619623e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 2244 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 5.816062176165803, "grad_norm": 14.300768989145384, "kl": 0.71875, "learning_rate": 4.1839378238341967e-07, "loss": 0.0029, "reward": 2.432662010192871, "reward_std": 0.1904019933157315, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9326621294021606, "step": 2245 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.818652849740933, "grad_norm": 0.09216235971961871, "kl": 0.580078125, "learning_rate": 4.181347150259067e-07, "loss": 0.0025, "reward": 2.499998092651367, "reward_std": 2.042708615590527e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2246 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.821243523316062, "grad_norm": 7.028169923616177, "kl": 0.501953125, "learning_rate": 4.1787564766839377e-07, "loss": 0.0022, "reward": 1.9985456466674805, "reward_std": 0.00013017125013448094, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985455870628357, "step": 2247 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.823834196891192, "grad_norm": 2.2118435536763252, "kl": 0.54296875, "learning_rate": 4.176165803108808e-07, "loss": 0.0021, "reward": 2.499984860420227, "reward_std": 1.3132379990565823e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999848008155823, "step": 2248 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.8264248704663215, "grad_norm": 0.8315096257896766, "kl": 0.642578125, "learning_rate": 4.173575129533679e-07, "loss": 0.0031, "reward": 2.499984622001648, "reward_std": 6.888349730616028e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999845623970032, "step": 2249 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.829015544041451, "grad_norm": 0.06327407309185087, "kl": 0.53125, "learning_rate": 4.170984455958549e-07, "loss": 0.002, "reward": 2.4999966621398926, "reward_std": 1.0966651018406992e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.831606217616581, "grad_norm": 33.41035753056715, "kl": 0.6015625, "learning_rate": 4.16839378238342e-07, "loss": 0.0034, "reward": 2.3746973276138306, "reward_std": 0.23201148312699615, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8746974468231201, "step": 2251 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.83419689119171, "grad_norm": 0.07531245376281354, "kl": 0.61328125, "learning_rate": 4.1658031088082903e-07, "loss": 0.0017, "reward": 2.4999985694885254, "reward_std": 1.4130282011137751e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 2252 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.83678756476684, "grad_norm": 5.02337600255268, "kl": 1.267578125, "learning_rate": 4.1632124352331603e-07, "loss": 0.0044, "reward": 2.4999920129776, "reward_std": 8.119215351598541e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 2253 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.839378238341969, "grad_norm": 13.444060658323668, "kl": 0.515625, "learning_rate": 4.160621761658031e-07, "loss": 0.0018, "reward": 2.4998981952667236, "reward_std": 4.665579081120086e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998983144760132, "step": 2254 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.841968911917099, "grad_norm": 0.10666262882666215, "kl": 0.55859375, "learning_rate": 4.1580310880829014e-07, "loss": 0.0015, "reward": 2.4999979734420776, "reward_std": 1.5641750223949202e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2255 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.844559585492228, "grad_norm": 0.13761238078515353, "kl": 0.58203125, "learning_rate": 4.155440414507772e-07, "loss": 0.0026, "reward": 2.49999737739563, "reward_std": 1.7045592244357977e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2256 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.847150259067358, "grad_norm": 0.08499075553685341, "kl": 0.615234375, "learning_rate": 4.1528497409326424e-07, "loss": 0.0022, "reward": 2.4999979734420776, "reward_std": 2.3178063770501467e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2257 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 5.849740932642487, "grad_norm": 25.641568476901426, "kl": 0.65625, "learning_rate": 4.150259067357513e-07, "loss": 0.0028, "reward": 2.4997302293777466, "reward_std": 0.0004808433845937543, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997302293777466, "step": 2258 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.852331606217617, "grad_norm": 0.10190939545237486, "kl": 0.59375, "learning_rate": 4.147668393782383e-07, "loss": 0.0018, "reward": 2.4999988079071045, "reward_std": 1.351523962966894e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 2259 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.8549222797927465, "grad_norm": 7.2929936255186565, "kl": 0.611328125, "learning_rate": 4.1450777202072535e-07, "loss": 0.0024, "reward": 1.8531391024589539, "reward_std": 0.0006309927214260824, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3531391024589539, "step": 2260 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.857512953367876, "grad_norm": 0.22624671930826315, "kl": 0.541015625, "learning_rate": 4.1424870466321246e-07, "loss": 0.001, "reward": 2.4999942779541016, "reward_std": 2.9733999440395564e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 2261 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.860103626943005, "grad_norm": 24.74097024509963, "kl": 0.62109375, "learning_rate": 4.1398963730569945e-07, "loss": 0.0032, "reward": 2.3121002912521362, "reward_std": 0.25932228997930906, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8121001720428467, "step": 2262 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.862694300518134, "grad_norm": 0.42736506321586437, "kl": 0.484375, "learning_rate": 4.137305699481865e-07, "loss": 0.0024, "reward": 2.499994158744812, "reward_std": 6.378920716088032e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 2263 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.865284974093264, "grad_norm": 0.06027068912055578, "kl": 0.60546875, "learning_rate": 4.1347150259067356e-07, "loss": 0.002, "reward": 2.4999982118606567, "reward_std": 9.583088456111e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2264 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.867875647668393, "grad_norm": 0.0891132521971237, "kl": 0.4853515625, "learning_rate": 4.132124352331606e-07, "loss": 0.0022, "reward": 2.4999961853027344, "reward_std": 1.8436936102261825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 2265 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.870466321243523, "grad_norm": 23.05465906988665, "kl": 0.4501953125, "learning_rate": 4.1295336787564767e-07, "loss": 0.0008, "reward": 2.249707579612732, "reward_std": 0.26757004655212313, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7497077584266663, "step": 2266 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.873056994818652, "grad_norm": 36.03432907664968, "kl": 0.611328125, "learning_rate": 4.126943005181347e-07, "loss": 0.0035, "reward": 2.249513268470764, "reward_std": 0.2677719316267826, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7495130896568298, "step": 2267 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.875647668393782, "grad_norm": 15.820384131497987, "kl": 0.712890625, "learning_rate": 4.124352331606217e-07, "loss": 0.0038, "reward": 1.9983000755310059, "reward_std": 0.00017554736166403018, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982999563217163, "step": 2268 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.8782383419689115, "grad_norm": 44.944497117271816, "kl": 0.59765625, "learning_rate": 4.1217616580310877e-07, "loss": 0.0028, "reward": 1.7933465242385864, "reward_std": 0.01074795096235448, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2933465540409088, "step": 2269 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.880829015544041, "grad_norm": 0.6404109990975678, "kl": 0.65234375, "learning_rate": 4.119170984455959e-07, "loss": 0.0028, "reward": 2.4999942779541016, "reward_std": 5.864912282049772e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 2270 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.883419689119171, "grad_norm": 0.49766274243190567, "kl": 0.640625, "learning_rate": 4.116580310880829e-07, "loss": 0.0034, "reward": 2.4999945163726807, "reward_std": 4.723808046946942e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 2271 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.8860103626943, "grad_norm": 0.16784711824386825, "kl": 0.541015625, "learning_rate": 4.1139896373056993e-07, "loss": 0.0019, "reward": 2.499992847442627, "reward_std": 2.8794649438168562e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 2272 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.88860103626943, "grad_norm": 0.23453800621641208, "kl": 0.59765625, "learning_rate": 4.11139896373057e-07, "loss": 0.0028, "reward": 2.4999924898147583, "reward_std": 3.1888802141111228e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 2273 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.891191709844559, "grad_norm": 4.823272109847591, "kl": 0.615234375, "learning_rate": 4.1088082901554403e-07, "loss": 0.0022, "reward": 2.437484860420227, "reward_std": 0.1767847551614068, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374847412109375, "step": 2274 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.893782383419689, "grad_norm": 5.9498875790992996, "kl": 0.544921875, "learning_rate": 4.106217616580311e-07, "loss": 0.0026, "reward": 1.94185870885849, "reward_std": 0.00047599092204109184, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4418585896492004, "step": 2275 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.896373056994818, "grad_norm": 0.6467674730253818, "kl": 0.59765625, "learning_rate": 4.1036269430051814e-07, "loss": 0.0011, "reward": 2.4999947547912598, "reward_std": 3.816979301518586e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2276 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.898963730569948, "grad_norm": 0.168066988304121, "kl": 0.568359375, "learning_rate": 4.1010362694300514e-07, "loss": 0.0039, "reward": 2.499997138977051, "reward_std": 2.1245919015200343e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 2277 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.901554404145077, "grad_norm": 2.36461160606414, "kl": 1.310546875, "learning_rate": 4.098445595854922e-07, "loss": 0.0047, "reward": 2.49999737739563, "reward_std": 1.2619296398952429e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2278 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.904145077720207, "grad_norm": 0.2679312313433942, "kl": 0.5546875, "learning_rate": 4.095854922279793e-07, "loss": 0.0024, "reward": 2.4999959468841553, "reward_std": 3.6371325222717132e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2279 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.9067357512953365, "grad_norm": 0.15486687311127945, "kl": 0.4873046875, "learning_rate": 4.093264248704663e-07, "loss": 0.0021, "reward": 2.4999921321868896, "reward_std": 3.998341867372801e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 2280 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.909326424870466, "grad_norm": 14.620985207261448, "kl": 0.5830078125, "learning_rate": 4.0906735751295335e-07, "loss": 0.0017, "reward": 2.4999630451202393, "reward_std": 1.721831563372689e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999631643295288, "step": 2281 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.9119170984455955, "grad_norm": 0.2566520707223171, "kl": 0.5546875, "learning_rate": 4.088082901554404e-07, "loss": 0.0009, "reward": 2.4999959468841553, "reward_std": 2.5605590963095892e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2282 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.914507772020725, "grad_norm": 0.35606870657894707, "kl": 0.662109375, "learning_rate": 4.085492227979274e-07, "loss": 0.0022, "reward": 2.49999737739563, "reward_std": 1.8928267877527105e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2283 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.917098445595855, "grad_norm": 0.705858615982928, "kl": 0.625, "learning_rate": 4.082901554404145e-07, "loss": 0.0021, "reward": 2.499995470046997, "reward_std": 4.297198302083416e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 2284 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.919689119170984, "grad_norm": 0.23951742727467817, "kl": 0.533203125, "learning_rate": 4.0803108808290156e-07, "loss": 0.0014, "reward": 2.499997138977051, "reward_std": 1.980426816317049e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2285 }, { "clip_ratio": 0.0, "completion_length": 50.8125, "epoch": 5.922279792746114, "grad_norm": 1.2982916201383508, "kl": 0.4423828125, "learning_rate": 4.0777202072538856e-07, "loss": 0.0014, "reward": 1.9988945722579956, "reward_std": 2.782274054879963e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988945722579956, "step": 2286 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.924870466321243, "grad_norm": 9.00766348343376, "kl": 0.55078125, "learning_rate": 4.075129533678756e-07, "loss": 0.0028, "reward": 1.9996126890182495, "reward_std": 0.00013223820582197732, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996126294136047, "step": 2287 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.927461139896373, "grad_norm": 43.660419576044944, "kl": 0.607421875, "learning_rate": 4.072538860103627e-07, "loss": 0.0022, "reward": 2.312422037124634, "reward_std": 0.25879259490398, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124219179153442, "step": 2288 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.930051813471502, "grad_norm": 0.1566896537277455, "kl": 0.572265625, "learning_rate": 4.069948186528497e-07, "loss": 0.0014, "reward": 2.4999974966049194, "reward_std": 1.5549341014775564e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2289 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.932642487046632, "grad_norm": 13.257608950214463, "kl": 0.6640625, "learning_rate": 4.0673575129533677e-07, "loss": 0.0027, "reward": 2.4999806880950928, "reward_std": 4.723568943632017e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999808073043823, "step": 2290 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.935233160621761, "grad_norm": 4.279056237807013, "kl": 0.5458984375, "learning_rate": 4.064766839378238e-07, "loss": 0.0025, "reward": 2.4290796518325806, "reward_std": 0.2005742703023543, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9290797710418701, "step": 2291 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.937823834196891, "grad_norm": 0.10560979735911015, "kl": 0.60546875, "learning_rate": 4.062176165803108e-07, "loss": 0.0007, "reward": 2.4999974966049194, "reward_std": 2.873113146506512e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2292 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.9404145077720205, "grad_norm": 23.17204940838671, "kl": 0.58203125, "learning_rate": 4.0595854922279793e-07, "loss": 0.0022, "reward": 2.2492200136184692, "reward_std": 0.2680865251822979, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7492201328277588, "step": 2293 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.94300518134715, "grad_norm": 0.1803360777883701, "kl": 0.6171875, "learning_rate": 4.05699481865285e-07, "loss": 0.0016, "reward": 2.499948024749756, "reward_std": 2.7457962801236135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999481439590454, "step": 2294 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.94559585492228, "grad_norm": 24.036650583065594, "kl": 0.6171875, "learning_rate": 4.05440414507772e-07, "loss": 0.0016, "reward": 1.981432557106018, "reward_std": 0.0008268835969147403, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4814326167106628, "step": 2295 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.948186528497409, "grad_norm": 0.3014704125122659, "kl": 0.59765625, "learning_rate": 4.0518134715025903e-07, "loss": 0.0033, "reward": 2.4999940395355225, "reward_std": 3.649049972409557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 2296 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.950777202072539, "grad_norm": 84.43615562607651, "kl": 0.599609375, "learning_rate": 4.049222797927461e-07, "loss": 0.0024, "reward": 2.437362313270569, "reward_std": 0.1770869020522241, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373623132705688, "step": 2297 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.953367875647668, "grad_norm": 10.861941040014107, "kl": 0.5107421875, "learning_rate": 4.0466321243523314e-07, "loss": 0.0028, "reward": 2.4998624324798584, "reward_std": 0.00036611199783465054, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998623728752136, "step": 2298 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.955958549222798, "grad_norm": 0.21111243775745028, "kl": 0.53125, "learning_rate": 4.044041450777202e-07, "loss": 0.0032, "reward": 2.4999983310699463, "reward_std": 1.268674850507523e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 2299 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.958549222797927, "grad_norm": 5.179401960884001, "kl": 0.5703125, "learning_rate": 4.0414507772020724e-07, "loss": 0.0021, "reward": 1.978162169456482, "reward_std": 8.482964209122201e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4781621992588043, "step": 2300 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.961139896373057, "grad_norm": 19.276108091051206, "kl": 0.5576171875, "learning_rate": 4.0388601036269424e-07, "loss": 0.0031, "reward": 1.9092462062835693, "reward_std": 0.002070564578843914, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4092459976673126, "step": 2301 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.963730569948186, "grad_norm": 0.47671234780575056, "kl": 0.84765625, "learning_rate": 4.0362694300518135e-07, "loss": 0.0031, "reward": 2.4999972581863403, "reward_std": 2.059810412902152e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2302 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.966321243523316, "grad_norm": 0.12361005474544362, "kl": 0.6796875, "learning_rate": 4.033678756476684e-07, "loss": 0.0024, "reward": 2.499998688697815, "reward_std": 8.101619641820434e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2303 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.9689119170984455, "grad_norm": 0.3553640640723943, "kl": 0.5703125, "learning_rate": 4.031088082901554e-07, "loss": 0.003, "reward": 2.499995470046997, "reward_std": 2.136344505743182e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 51.75, "epoch": 5.971502590673575, "grad_norm": 0.16967078461323062, "kl": 0.3701171875, "learning_rate": 4.0284974093264246e-07, "loss": 0.001, "reward": 2.4999990463256836, "reward_std": 7.510056718729174e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 2305 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.974093264248705, "grad_norm": 0.285001697583028, "kl": 0.572265625, "learning_rate": 4.025906735751295e-07, "loss": 0.002, "reward": 2.4999961853027344, "reward_std": 2.3594932372361654e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2306 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 5.976683937823834, "grad_norm": 14.829975355362082, "kl": 0.6875, "learning_rate": 4.0233160621761656e-07, "loss": 0.0023, "reward": 1.5624260306358337, "reward_std": 0.17657509538901195, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0624261572957039, "step": 2307 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.979274611398964, "grad_norm": 13.668435357251393, "kl": 0.61328125, "learning_rate": 4.020725388601036e-07, "loss": 0.0031, "reward": 1.992610216140747, "reward_std": 7.50823842849968e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.492610216140747, "step": 2308 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 5.981865284974093, "grad_norm": 10.05326159633134, "kl": 0.564453125, "learning_rate": 4.0181347150259067e-07, "loss": 0.002, "reward": 1.8028271794319153, "reward_std": 0.0012539791109702492, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3028272092342377, "step": 2309 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.984455958549223, "grad_norm": 3.2396527432739064, "kl": 0.623046875, "learning_rate": 4.0155440414507767e-07, "loss": 0.0012, "reward": 1.9989696741104126, "reward_std": 3.640538284344075e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989697933197021, "step": 2310 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.987046632124352, "grad_norm": 0.7357434883054466, "kl": 0.69140625, "learning_rate": 4.0129533678756477e-07, "loss": 0.0022, "reward": 2.49998140335083, "reward_std": 6.539165383401269e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999814629554749, "step": 2311 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 5.989637305699482, "grad_norm": 1.2110216984944153, "kl": 0.615234375, "learning_rate": 4.010362694300518e-07, "loss": 0.0022, "reward": 1.997019112110138, "reward_std": 2.1741641148764757e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497019112110138, "step": 2312 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.992227979274611, "grad_norm": 2.441262297171115, "kl": 1.01953125, "learning_rate": 4.007772020725388e-07, "loss": 0.0041, "reward": 2.4999934434890747, "reward_std": 1.8821774006028136e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 2313 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.994818652849741, "grad_norm": 0.8018775513342542, "kl": 0.609375, "learning_rate": 4.005181347150259e-07, "loss": 0.0021, "reward": 1.9992656707763672, "reward_std": 2.3982807988431887e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992658197879791, "step": 2314 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 5.9974093264248705, "grad_norm": 2.0344984559817156, "kl": 0.607421875, "learning_rate": 4.0025906735751293e-07, "loss": 0.0037, "reward": 2.4999868869781494, "reward_std": 6.542083639260454e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999867677688599, "step": 2315 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 6.0, "grad_norm": 7.20193903048705, "kl": 0.58984375, "learning_rate": 4e-07, "loss": 0.0023, "reward": 1.9307444095611572, "reward_std": 0.022792309694750656, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4307443797588348, "step": 2316 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.0025906735751295, "grad_norm": 0.10200920396766484, "kl": 0.572265625, "learning_rate": 3.9974093264248703e-07, "loss": 0.0035, "reward": 2.499998450279236, "reward_std": 1.3772501574749185e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 2317 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.005181347150259, "grad_norm": 28.60869233130163, "kl": 0.705078125, "learning_rate": 3.994818652849741e-07, "loss": 0.0029, "reward": 1.9997926950454712, "reward_std": 0.0001696982412795478, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997925758361816, "step": 2318 }, { "clip_ratio": 0.0, "completion_length": 38.0, "epoch": 6.007772020725389, "grad_norm": 1.0718874342894962, "kl": 0.658203125, "learning_rate": 3.992227979274611e-07, "loss": 0.0016, "reward": 1.9970639944076538, "reward_std": 3.4244836342622875e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970641136169434, "step": 2319 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.010362694300518, "grad_norm": 2.702308296758614, "kl": 0.580078125, "learning_rate": 3.9896373056994814e-07, "loss": 0.0021, "reward": 1.9986712336540222, "reward_std": 6.307634714630694e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986713528633118, "step": 2320 }, { "clip_ratio": 0.0, "completion_length": 50.875, "epoch": 6.012953367875648, "grad_norm": 3.1184428933168054, "kl": 0.44140625, "learning_rate": 3.9870466321243525e-07, "loss": 0.0019, "reward": 2.4999959468841553, "reward_std": 3.986063575212029e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2321 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.015544041450777, "grad_norm": 0.08816755529314213, "kl": 0.529296875, "learning_rate": 3.9844559585492225e-07, "loss": 0.0026, "reward": 2.4999972581863403, "reward_std": 1.7527185605104023e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2322 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.018134715025907, "grad_norm": 0.2968954883414716, "kl": 0.59375, "learning_rate": 3.981865284974093e-07, "loss": 0.0034, "reward": 2.499994993209839, "reward_std": 3.844753450721328e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2323 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.020725388601036, "grad_norm": 49.148754650419086, "kl": 0.66015625, "learning_rate": 3.9792746113989635e-07, "loss": 0.0024, "reward": 1.8586134910583496, "reward_std": 0.029080375013791127, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3586134314537048, "step": 2324 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.023316062176166, "grad_norm": 0.7838120544695829, "kl": 0.619140625, "learning_rate": 3.976683937823834e-07, "loss": 0.0026, "reward": 2.4999927282333374, "reward_std": 3.879266159856343e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 2325 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.025906735751295, "grad_norm": 10.333303866175683, "kl": 0.603515625, "learning_rate": 3.9740932642487046e-07, "loss": 0.0021, "reward": 1.996050477027893, "reward_std": 0.0005086387997152997, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.496050626039505, "step": 2326 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.028497409326425, "grad_norm": 22.304793723522163, "kl": 0.56640625, "learning_rate": 3.971502590673575e-07, "loss": 0.0018, "reward": 2.4989014863967896, "reward_std": 0.0004935504937293445, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9989015460014343, "step": 2327 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 6.0310880829015545, "grad_norm": 0.49506819700307575, "kl": 0.6015625, "learning_rate": 3.968911917098445e-07, "loss": 0.0018, "reward": 2.4999923706054688, "reward_std": 5.335845003173745e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 53.375, "epoch": 6.033678756476684, "grad_norm": 0.2237976767587706, "kl": 0.3671875, "learning_rate": 3.9663212435233156e-07, "loss": 0.0019, "reward": 2.4999932050704956, "reward_std": 2.2260313130573195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 2329 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 6.036269430051814, "grad_norm": 0.30388036616193087, "kl": 0.578125, "learning_rate": 3.9637305699481867e-07, "loss": 0.0029, "reward": 2.49999737739563, "reward_std": 2.3874461021478055e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 2330 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.038860103626943, "grad_norm": 0.6354021678471385, "kl": 0.587890625, "learning_rate": 3.9611398963730567e-07, "loss": 0.0029, "reward": 2.49999463558197, "reward_std": 9.720666639623232e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2331 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.041450777202073, "grad_norm": 16.048013968815038, "kl": 0.5, "learning_rate": 3.958549222797927e-07, "loss": 0.0026, "reward": 2.437489867210388, "reward_std": 0.17679648072635246, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374898672103882, "step": 2332 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 6.044041450777202, "grad_norm": 10.451422796518953, "kl": 0.5302734375, "learning_rate": 3.9559585492227977e-07, "loss": 0.0013, "reward": 2.2824963331222534, "reward_std": 0.3001809886030742, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.782496452331543, "step": 2333 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.046632124352332, "grad_norm": 10.259132519292756, "kl": 0.78125, "learning_rate": 3.9533678756476677e-07, "loss": 0.0028, "reward": 1.9989067316055298, "reward_std": 4.252856888342649e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989067316055298, "step": 2334 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.049222797927461, "grad_norm": 4.180384230920244, "kl": 0.75, "learning_rate": 3.950777202072539e-07, "loss": 0.0031, "reward": 2.4998767375946045, "reward_std": 5.885975065211824e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998767971992493, "step": 2335 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.051813471502591, "grad_norm": 0.15497015241914663, "kl": 0.4921875, "learning_rate": 3.9481865284974093e-07, "loss": 0.0021, "reward": 2.499996304512024, "reward_std": 2.20119105165395e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 2336 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.05440414507772, "grad_norm": 0.4517014916963544, "kl": 0.546875, "learning_rate": 3.9455958549222793e-07, "loss": 0.003, "reward": 2.499996542930603, "reward_std": 4.45335942345082e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2337 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.05699481865285, "grad_norm": 20.371475465848413, "kl": 0.4921875, "learning_rate": 3.94300518134715e-07, "loss": 0.0031, "reward": 2.2496540546417236, "reward_std": 0.26762579200760683, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7496539950370789, "step": 2338 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.0595854922279795, "grad_norm": 19.905304253499427, "kl": 0.583984375, "learning_rate": 3.940414507772021e-07, "loss": 0.0023, "reward": 2.1867308020591736, "reward_std": 0.259397614568627, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6867307424545288, "step": 2339 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.062176165803109, "grad_norm": 0.11707988981584115, "kl": 0.5859375, "learning_rate": 3.937823834196891e-07, "loss": 0.0022, "reward": 2.4999961853027344, "reward_std": 1.9282774701423477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.064766839378239, "grad_norm": 0.0840786431095149, "kl": 0.595703125, "learning_rate": 3.9352331606217614e-07, "loss": 0.0013, "reward": 2.4999979734420776, "reward_std": 1.561680278427957e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2341 }, { "clip_ratio": 0.0, "completion_length": 56.5, "epoch": 6.067357512953368, "grad_norm": 0.10513087360287453, "kl": 0.40625, "learning_rate": 3.932642487046632e-07, "loss": 0.0024, "reward": 2.4999990463256836, "reward_std": 9.205030835346406e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 2342 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.069948186528498, "grad_norm": 0.8048417224886928, "kl": 0.568359375, "learning_rate": 3.930051813471502e-07, "loss": 0.0037, "reward": 2.4999884366989136, "reward_std": 6.759727284588735e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 2343 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.072538860103627, "grad_norm": 7.500285795261913, "kl": 0.525390625, "learning_rate": 3.927461139896373e-07, "loss": 0.0022, "reward": 1.890099287033081, "reward_std": 0.004882001126588875, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3900991678237915, "step": 2344 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.075129533678757, "grad_norm": 0.27006923154328294, "kl": 0.5693359375, "learning_rate": 3.9248704663212435e-07, "loss": 0.0021, "reward": 2.499996304512024, "reward_std": 4.201673277748341e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 2345 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.077720207253886, "grad_norm": 7.00136613194721, "kl": 0.623046875, "learning_rate": 3.9222797927461135e-07, "loss": 0.0031, "reward": 1.998529076576233, "reward_std": 3.819717767328257e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498529076576233, "step": 2346 }, { "clip_ratio": 0.0, "completion_length": 58.9375, "epoch": 6.080310880829016, "grad_norm": 0.8625346700815173, "kl": 0.3681640625, "learning_rate": 3.919689119170984e-07, "loss": 0.0021, "reward": 2.4999961853027344, "reward_std": 4.603844615758135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2347 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.082901554404145, "grad_norm": 0.7794224925620294, "kl": 0.6015625, "learning_rate": 3.917098445595855e-07, "loss": 0.0016, "reward": 2.4998769760131836, "reward_std": 1.451380569506e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998770952224731, "step": 2348 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.085492227979275, "grad_norm": 1.0170987339465722, "kl": 0.552734375, "learning_rate": 3.9145077720207256e-07, "loss": 0.0024, "reward": 2.499998450279236, "reward_std": 1.4314639429358067e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2349 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.0880829015544045, "grad_norm": 0.2812571276130881, "kl": 0.455078125, "learning_rate": 3.9119170984455956e-07, "loss": 0.0024, "reward": 2.4999932050704956, "reward_std": 3.192690371633944e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 2350 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.090673575129534, "grad_norm": 0.24055637078136483, "kl": 0.5126953125, "learning_rate": 3.909326424870466e-07, "loss": 0.0019, "reward": 2.4999958276748657, "reward_std": 4.585271710766392e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2351 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.0932642487046635, "grad_norm": 0.7305480160219144, "kl": 0.5419921875, "learning_rate": 3.906735751295336e-07, "loss": 0.0027, "reward": 2.4999911785125732, "reward_std": 6.512546406156616e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911189079285, "step": 2352 }, { "clip_ratio": 0.0, "completion_length": 40.375, "epoch": 6.095854922279793, "grad_norm": 0.6308500306624054, "kl": 0.6171875, "learning_rate": 3.904145077720207e-07, "loss": 0.0027, "reward": 2.499988555908203, "reward_std": 5.281779351662408e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988615512848, "step": 2353 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.098445595854923, "grad_norm": 4.868832117008129, "kl": 0.7109375, "learning_rate": 3.9015544041450777e-07, "loss": 0.0027, "reward": 1.9921448230743408, "reward_std": 9.046618197317002e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4921448230743408, "step": 2354 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 6.101036269430052, "grad_norm": 36.35618821259656, "kl": 0.546875, "learning_rate": 3.898963730569948e-07, "loss": 0.0022, "reward": 2.374225378036499, "reward_std": 0.35421258211135864, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8742254376411438, "step": 2355 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.103626943005182, "grad_norm": 2.5800876035176765, "kl": 0.646484375, "learning_rate": 3.896373056994818e-07, "loss": 0.0023, "reward": 2.499996542930603, "reward_std": 4.017754463347956e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 2356 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.106217616580311, "grad_norm": 7.815941454249537, "kl": 0.76953125, "learning_rate": 3.893782383419689e-07, "loss": 0.0025, "reward": 1.9947028160095215, "reward_std": 0.0001354497321699455, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4947029948234558, "step": 2357 }, { "clip_ratio": 0.0, "completion_length": 50.5625, "epoch": 6.108808290155441, "grad_norm": 6.932896996600901, "kl": 0.423828125, "learning_rate": 3.89119170984456e-07, "loss": 0.0016, "reward": 2.4987293481826782, "reward_std": 0.0005370166975353641, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.998729169368744, "step": 2358 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.11139896373057, "grad_norm": 0.24120425290817052, "kl": 0.603515625, "learning_rate": 3.88860103626943e-07, "loss": 0.0015, "reward": 2.499998092651367, "reward_std": 1.3253751376396394e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 2359 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.1139896373057, "grad_norm": 0.2002020138645121, "kl": 0.599609375, "learning_rate": 3.8860103626943004e-07, "loss": 0.0024, "reward": 2.499996304512024, "reward_std": 2.7333761636327836e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2360 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.116580310880829, "grad_norm": 0.09108589123055691, "kl": 0.560546875, "learning_rate": 3.883419689119171e-07, "loss": 0.0019, "reward": 2.499998092651367, "reward_std": 1.5015187386779871e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2361 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.119170984455959, "grad_norm": 0.34925266137900335, "kl": 0.634765625, "learning_rate": 3.8808290155440414e-07, "loss": 0.0028, "reward": 1.9998998641967773, "reward_std": 6.614999932708088e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998998045921326, "step": 2362 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 6.1217616580310885, "grad_norm": 53.643970085981245, "kl": 0.5048828125, "learning_rate": 3.878238341968912e-07, "loss": 0.0013, "reward": 2.3115508556365967, "reward_std": 0.2600827500905609, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.811551034450531, "step": 2363 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.124352331606218, "grad_norm": 11.179585156973944, "kl": 0.62890625, "learning_rate": 3.8756476683937825e-07, "loss": 0.0024, "reward": 1.9392640590667725, "reward_std": 0.001326752508248319, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4392640590667725, "step": 2364 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.126943005181348, "grad_norm": 0.3176844944526175, "kl": 0.59375, "learning_rate": 3.8730569948186525e-07, "loss": 0.0019, "reward": 2.4999951124191284, "reward_std": 3.3254190157094854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 2365 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.129533678756476, "grad_norm": 0.11800189087368446, "kl": 0.591796875, "learning_rate": 3.870466321243523e-07, "loss": 0.0024, "reward": 2.4999959468841553, "reward_std": 2.0004431462439243e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2366 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.132124352331606, "grad_norm": 0.4664906557940549, "kl": 0.59765625, "learning_rate": 3.867875647668394e-07, "loss": 0.0018, "reward": 2.4999966621398926, "reward_std": 2.77632057077426e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 2367 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.134715025906735, "grad_norm": 0.8144972325445732, "kl": 0.55859375, "learning_rate": 3.865284974093264e-07, "loss": 0.0027, "reward": 2.4999955892562866, "reward_std": 3.2687846669432474e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2368 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.137305699481865, "grad_norm": 5.695798667599587, "kl": 0.603515625, "learning_rate": 3.8626943005181346e-07, "loss": 0.002, "reward": 1.6468836069107056, "reward_std": 0.0005491398354706689, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1468836069107056, "step": 2369 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.139896373056994, "grad_norm": 0.29679972700833174, "kl": 0.609375, "learning_rate": 3.860103626943005e-07, "loss": 0.0024, "reward": 2.49999463558197, "reward_std": 2.018517761825933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 2370 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.142487046632124, "grad_norm": 0.2692642879365334, "kl": 0.4755859375, "learning_rate": 3.8575129533678756e-07, "loss": 0.0021, "reward": 2.4999972581863403, "reward_std": 2.295217939263239e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2371 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.1450777202072535, "grad_norm": 0.7237045544825196, "kl": 0.5390625, "learning_rate": 3.854922279792746e-07, "loss": 0.002, "reward": 2.4999938011169434, "reward_std": 3.562689286695786e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 2372 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.147668393782383, "grad_norm": 6.520767082726417, "kl": 0.59375, "learning_rate": 3.8523316062176167e-07, "loss": 0.0016, "reward": 1.9967049956321716, "reward_std": 0.00016671416057079114, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4967051446437836, "step": 2373 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.150259067357513, "grad_norm": 0.18841628647281025, "kl": 0.47265625, "learning_rate": 3.8497409326424867e-07, "loss": 0.0015, "reward": 2.4999945163726807, "reward_std": 2.815159433566805e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 2374 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.152849740932642, "grad_norm": 0.22460538063500185, "kl": 0.595703125, "learning_rate": 3.847150259067357e-07, "loss": 0.0035, "reward": 2.4999990463256836, "reward_std": 1.5554087724467536e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 2375 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.155440414507772, "grad_norm": 0.19666977469844849, "kl": 0.607421875, "learning_rate": 3.844559585492228e-07, "loss": 0.0017, "reward": 2.4999979734420776, "reward_std": 2.784382047593681e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2376 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.158031088082901, "grad_norm": 8.033234524874963, "kl": 0.689453125, "learning_rate": 3.841968911917098e-07, "loss": 0.0026, "reward": 1.998391330242157, "reward_std": 9.630453064346511e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983912706375122, "step": 2377 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.160621761658031, "grad_norm": 32.15700031627614, "kl": 0.513671875, "learning_rate": 3.839378238341969e-07, "loss": 0.0018, "reward": 2.437095522880554, "reward_std": 0.17752145866609226, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9370955228805542, "step": 2378 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.16321243523316, "grad_norm": 0.3287877422959666, "kl": 0.58203125, "learning_rate": 3.8367875647668393e-07, "loss": 0.0032, "reward": 2.499979615211487, "reward_std": 5.533145667868666e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999796152114868, "step": 2379 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.16580310880829, "grad_norm": 0.7778036254470807, "kl": 0.6875, "learning_rate": 3.8341968911917093e-07, "loss": 0.0029, "reward": 2.4999953508377075, "reward_std": 3.83149495064572e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 2380 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.168393782383419, "grad_norm": 36.6988154196832, "kl": 0.58984375, "learning_rate": 3.8316062176165804e-07, "loss": 0.0024, "reward": 2.0622928142547607, "reward_std": 0.4083412438631058, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562292754650116, "step": 2381 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.170984455958549, "grad_norm": 0.16614464230031584, "kl": 0.4921875, "learning_rate": 3.829015544041451e-07, "loss": 0.0024, "reward": 2.4999920129776, "reward_std": 2.5489100323738967e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 2382 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.1735751295336785, "grad_norm": 0.8639522377135211, "kl": 0.65625, "learning_rate": 3.826424870466321e-07, "loss": 0.0027, "reward": 2.4999947547912598, "reward_std": 5.701016448256269e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 2383 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.176165803108808, "grad_norm": 0.14293179870297315, "kl": 0.611328125, "learning_rate": 3.8238341968911914e-07, "loss": 0.0036, "reward": 2.499998092651367, "reward_std": 1.8300538044968562e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2384 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 6.178756476683938, "grad_norm": 7.172150949102787, "kl": 1.322265625, "learning_rate": 3.8212435233160625e-07, "loss": 0.006, "reward": 2.4578185081481934, "reward_std": 0.058365467602470744, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9578187465667725, "step": 2385 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.181347150259067, "grad_norm": 1.683825715368485, "kl": 1.154296875, "learning_rate": 3.8186528497409325e-07, "loss": 0.0053, "reward": 2.499997854232788, "reward_std": 2.1640898069108516e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2386 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.183937823834197, "grad_norm": 14.812621981360694, "kl": 0.525390625, "learning_rate": 3.816062176165803e-07, "loss": 0.0019, "reward": 2.3747498989105225, "reward_std": 0.23189023342274595, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874750018119812, "step": 2387 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.186528497409326, "grad_norm": 4.782306445983391, "kl": 0.583984375, "learning_rate": 3.8134715025906735e-07, "loss": 0.002, "reward": 1.9642409682273865, "reward_std": 0.00046277981323328277, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.464240938425064, "step": 2388 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.189119170984456, "grad_norm": 1.4819357908203445, "kl": 0.541015625, "learning_rate": 3.8108808290155435e-07, "loss": 0.0022, "reward": 1.9998798370361328, "reward_std": 1.5956331651523215e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998798966407776, "step": 2389 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.191709844559585, "grad_norm": 0.7065774488966414, "kl": 0.638671875, "learning_rate": 3.8082901554404146e-07, "loss": 0.0017, "reward": 1.9999372959136963, "reward_std": 9.41381040320266e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999374449253082, "step": 2390 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.194300518134715, "grad_norm": 0.1805261900912792, "kl": 0.611328125, "learning_rate": 3.805699481865285e-07, "loss": 0.0027, "reward": 2.4999961853027344, "reward_std": 1.774761926753854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2391 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.196891191709844, "grad_norm": 2.652592677548065, "kl": 0.546875, "learning_rate": 3.803108808290155e-07, "loss": 0.0029, "reward": 1.9956845045089722, "reward_std": 9.287631473853253e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4956845343112946, "step": 2392 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.199481865284974, "grad_norm": 4.621083470087449, "kl": 0.58984375, "learning_rate": 3.8005181347150256e-07, "loss": 0.0031, "reward": 1.9938815236091614, "reward_std": 0.00016322338092322752, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4938814640045166, "step": 2393 }, { "clip_ratio": 0.0, "completion_length": 36.75, "epoch": 6.2020725388601035, "grad_norm": 10.917493490223334, "kl": 0.5859375, "learning_rate": 3.797927461139896e-07, "loss": 0.0032, "reward": 1.9507100582122803, "reward_std": 0.03316134746523858, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4507099390029907, "step": 2394 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 6.204663212435233, "grad_norm": 29.818262157581053, "kl": 0.609375, "learning_rate": 3.7953367875647667e-07, "loss": 0.0024, "reward": 2.2446109652519226, "reward_std": 0.27305933810202987, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7446110248565674, "step": 2395 }, { "clip_ratio": 0.0, "completion_length": 50.0, "epoch": 6.2072538860103625, "grad_norm": 7.415186693460644, "kl": 0.4013671875, "learning_rate": 3.792746113989637e-07, "loss": 0.0023, "reward": 2.49934720993042, "reward_std": 0.0005428095820434464, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9993471503257751, "step": 2396 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.209844559585492, "grad_norm": 0.17763229184767385, "kl": 0.63671875, "learning_rate": 3.7901554404145077e-07, "loss": 0.0028, "reward": 2.499997138977051, "reward_std": 2.3226127154885035e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 2397 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.212435233160622, "grad_norm": 0.8181346668053645, "kl": 0.56640625, "learning_rate": 3.7875647668393777e-07, "loss": 0.0021, "reward": 2.4999951124191284, "reward_std": 6.649237150213594e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 2398 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.215025906735751, "grad_norm": 1.9696985331915227, "kl": 0.5390625, "learning_rate": 3.784974093264249e-07, "loss": 0.0019, "reward": 2.499979019165039, "reward_std": 1.0097213362314506e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999979019165039, "step": 2399 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.217616580310881, "grad_norm": 2.1714962208344897, "kl": 0.564453125, "learning_rate": 3.7823834196891193e-07, "loss": 0.002, "reward": 1.9196088314056396, "reward_std": 0.00013774065359939414, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4196088314056396, "step": 2400 }, { "clip_ratio": 0.0, "completion_length": 38.0, "epoch": 6.22020725388601, "grad_norm": 0.6401617093781331, "kl": 0.560546875, "learning_rate": 3.7797927461139893e-07, "loss": 0.0025, "reward": 2.499993324279785, "reward_std": 7.727683907887695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 2401 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.22279792746114, "grad_norm": 0.2443240622661962, "kl": 0.5009765625, "learning_rate": 3.77720207253886e-07, "loss": 0.0035, "reward": 2.4999959468841553, "reward_std": 2.1286985543156334e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2402 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.225388601036269, "grad_norm": 107.0880198193637, "kl": 0.662109375, "learning_rate": 3.7746113989637304e-07, "loss": 0.0026, "reward": 1.9988529682159424, "reward_std": 8.453370878669375e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988529682159424, "step": 2403 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.227979274611399, "grad_norm": 0.12802071439786641, "kl": 0.5859375, "learning_rate": 3.772020725388601e-07, "loss": 0.0029, "reward": 2.499998092651367, "reward_std": 1.5262795614034985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2404 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.230569948186528, "grad_norm": 0.35003499699586094, "kl": 0.509765625, "learning_rate": 3.7694300518134714e-07, "loss": 0.0029, "reward": 2.499996542930603, "reward_std": 2.058845552710409e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2405 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.233160621761658, "grad_norm": 6.074482624147, "kl": 0.646484375, "learning_rate": 3.766839378238342e-07, "loss": 0.0017, "reward": 1.9987375140190125, "reward_std": 7.176611302384117e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987376928329468, "step": 2406 }, { "clip_ratio": 0.0, "completion_length": 60.25, "epoch": 6.2357512953367875, "grad_norm": 3.568133138782448, "kl": 0.384765625, "learning_rate": 3.764248704663212e-07, "loss": 0.0017, "reward": 2.1239195466041565, "reward_std": 0.2314964799252266, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6239194869995117, "step": 2407 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.238341968911917, "grad_norm": 17.21621430729764, "kl": 0.720703125, "learning_rate": 3.761658031088083e-07, "loss": 0.002, "reward": 1.7997077703475952, "reward_std": 0.001265492549919145, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2997078597545624, "step": 2408 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.240932642487047, "grad_norm": 0.20431824669053975, "kl": 0.615234375, "learning_rate": 3.7590673575129535e-07, "loss": 0.0017, "reward": 2.499995231628418, "reward_std": 4.0054887904261705e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2409 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.243523316062176, "grad_norm": 0.6512658351304268, "kl": 0.576171875, "learning_rate": 3.7564766839378235e-07, "loss": 0.002, "reward": 2.499955415725708, "reward_std": 9.441144470656582e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999955654144287, "step": 2410 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.246113989637306, "grad_norm": 0.745975038125287, "kl": 0.658203125, "learning_rate": 3.753886010362694e-07, "loss": 0.0027, "reward": 1.9992695450782776, "reward_std": 1.7439156295040448e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992695450782776, "step": 2411 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.248704663212435, "grad_norm": 29.403135375160854, "kl": 0.65234375, "learning_rate": 3.7512953367875646e-07, "loss": 0.0021, "reward": 1.9996057152748108, "reward_std": 0.0002917600315868185, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996058344841003, "step": 2412 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.251295336787565, "grad_norm": 0.19573519289849928, "kl": 0.68359375, "learning_rate": 3.748704663212435e-07, "loss": 0.0033, "reward": 2.4999934434890747, "reward_std": 3.947573645746161e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 2413 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.253886010362694, "grad_norm": 0.15743304450313444, "kl": 0.5654296875, "learning_rate": 3.7461139896373056e-07, "loss": 0.0015, "reward": 2.499995708465576, "reward_std": 1.984752827866032e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2414 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.256476683937824, "grad_norm": 4.49458036679588, "kl": 0.6015625, "learning_rate": 3.743523316062176e-07, "loss": 0.0041, "reward": 2.4999866485595703, "reward_std": 6.326114828425489e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986469745636, "step": 2415 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.259067357512953, "grad_norm": 1.5165261573269917, "kl": 0.580078125, "learning_rate": 3.740932642487046e-07, "loss": 0.0026, "reward": 2.4999945163726807, "reward_std": 4.025585894851247e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 2416 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.261658031088083, "grad_norm": 0.1729856871119715, "kl": 0.591796875, "learning_rate": 3.7383419689119167e-07, "loss": 0.0018, "reward": 2.4999974966049194, "reward_std": 2.2689604293191223e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2417 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.2642487046632125, "grad_norm": 7.033629906155083, "kl": 0.685546875, "learning_rate": 3.7357512953367877e-07, "loss": 0.0036, "reward": 1.9991326928138733, "reward_std": 6.678093609480129e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991325736045837, "step": 2418 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.266839378238342, "grad_norm": 5.699229612136398, "kl": 0.67578125, "learning_rate": 3.7331606217616577e-07, "loss": 0.0027, "reward": 1.999781847000122, "reward_std": 1.733098133627209e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997817873954773, "step": 2419 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.269430051813472, "grad_norm": 2.4596054778072074, "kl": 0.5205078125, "learning_rate": 3.730569948186528e-07, "loss": 0.0023, "reward": 1.9998244643211365, "reward_std": 1.976607279630116e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499824434518814, "step": 2420 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 6.272020725388601, "grad_norm": 6.3239306725582, "kl": 0.6015625, "learning_rate": 3.727979274611399e-07, "loss": 0.002, "reward": 2.499990463256836, "reward_std": 9.180197849900651e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999906420707703, "step": 2421 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 6.274611398963731, "grad_norm": 45.55264285322464, "kl": 0.6328125, "learning_rate": 3.7253886010362693e-07, "loss": 0.0023, "reward": 1.9854003190994263, "reward_std": 0.0010355417575738102, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4854004383087158, "step": 2422 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.27720207253886, "grad_norm": 27.363365147031217, "kl": 0.63671875, "learning_rate": 3.72279792746114e-07, "loss": 0.0031, "reward": 1.9826002717018127, "reward_std": 0.0008668652354231199, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4826001226902008, "step": 2423 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.27979274611399, "grad_norm": 1.0565169980489546, "kl": 0.6240234375, "learning_rate": 3.7202072538860104e-07, "loss": 0.0019, "reward": 2.499994993209839, "reward_std": 2.4262038778033457e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 2424 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.282383419689119, "grad_norm": 0.33142273291907914, "kl": 0.6171875, "learning_rate": 3.7176165803108804e-07, "loss": 0.0022, "reward": 2.4999958276748657, "reward_std": 2.386830885825475e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2425 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.284974093264249, "grad_norm": 4.272788951839864, "kl": 0.625, "learning_rate": 3.715025906735751e-07, "loss": 0.0018, "reward": 1.994506597518921, "reward_std": 0.00012360996527149837, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4945066571235657, "step": 2426 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.287564766839378, "grad_norm": 6.4327591491307174, "kl": 0.55078125, "learning_rate": 3.712435233160622e-07, "loss": 0.0019, "reward": 1.9970933198928833, "reward_std": 8.09226690421383e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970933496952057, "step": 2427 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.290155440414508, "grad_norm": 0.8293801305709964, "kl": 0.546875, "learning_rate": 3.709844559585492e-07, "loss": 0.0016, "reward": 2.499996066093445, "reward_std": 2.7491274749991135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2428 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.2927461139896375, "grad_norm": 0.20683414298495925, "kl": 0.58984375, "learning_rate": 3.7072538860103625e-07, "loss": 0.0038, "reward": 2.4999942779541016, "reward_std": 3.496883891784819e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 2429 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.295336787564767, "grad_norm": 0.38349978928746425, "kl": 0.587890625, "learning_rate": 3.704663212435233e-07, "loss": 0.0029, "reward": 2.4999982118606567, "reward_std": 2.5799910190471564e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 2430 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 6.2979274611398965, "grad_norm": 36.00899332850582, "kl": 0.62890625, "learning_rate": 3.7020725388601035e-07, "loss": 0.0021, "reward": 2.3139092922210693, "reward_std": 0.34455967634289664, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8139091730117798, "step": 2431 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.300518134715026, "grad_norm": 0.16048156503365948, "kl": 0.59765625, "learning_rate": 3.699481865284974e-07, "loss": 0.0027, "reward": 2.4999983310699463, "reward_std": 1.677137049682642e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2432 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.303108808290156, "grad_norm": 3.4773721149395724, "kl": 0.623046875, "learning_rate": 3.6968911917098446e-07, "loss": 0.0029, "reward": 2.499987006187439, "reward_std": 1.629791529467184e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999869465827942, "step": 2433 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.305699481865285, "grad_norm": 0.177943027887078, "kl": 0.56640625, "learning_rate": 3.6943005181347146e-07, "loss": 0.001, "reward": 2.499996066093445, "reward_std": 3.226262151656556e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2434 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.308290155440415, "grad_norm": 15.123388304692446, "kl": 0.69140625, "learning_rate": 3.691709844559585e-07, "loss": 0.0023, "reward": 1.6876229047775269, "reward_std": 0.1553719400060345, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1876230239868164, "step": 2435 }, { "clip_ratio": 0.0, "completion_length": 55.6875, "epoch": 6.310880829015544, "grad_norm": 0.14010144791185583, "kl": 0.39453125, "learning_rate": 3.689119170984456e-07, "loss": 0.0016, "reward": 2.499997138977051, "reward_std": 2.12668533094984e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2436 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.313471502590674, "grad_norm": 5.814775021679669, "kl": 0.57421875, "learning_rate": 3.686528497409326e-07, "loss": 0.0026, "reward": 1.7463675737380981, "reward_std": 0.0009286357785640575, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2463675439357758, "step": 2437 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.316062176165803, "grad_norm": 11.621090294602334, "kl": 0.591796875, "learning_rate": 3.6839378238341967e-07, "loss": 0.003, "reward": 2.3713343143463135, "reward_std": 0.00045188154126662994, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8713342547416687, "step": 2438 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.318652849740933, "grad_norm": 0.32163879921504945, "kl": 0.740234375, "learning_rate": 3.681347150259067e-07, "loss": 0.003, "reward": 2.4999961853027344, "reward_std": 3.1484574947171495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2439 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.321243523316062, "grad_norm": 5.5142367545192705, "kl": 0.62109375, "learning_rate": 3.678756476683937e-07, "loss": 0.0034, "reward": 1.9995797872543335, "reward_std": 0.00019720889144991816, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995797276496887, "step": 2440 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.323834196891192, "grad_norm": 0.3042293507590256, "kl": 0.7109375, "learning_rate": 3.676165803108808e-07, "loss": 0.0018, "reward": 2.499996304512024, "reward_std": 3.2797728977129736e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 2441 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.3264248704663215, "grad_norm": 6.225768486230187, "kl": 0.52734375, "learning_rate": 3.673575129533679e-07, "loss": 0.0026, "reward": 1.9321548342704773, "reward_std": 0.00031323718280873436, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4321547746658325, "step": 2442 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.329015544041451, "grad_norm": 0.09255132319333872, "kl": 0.595703125, "learning_rate": 3.670984455958549e-07, "loss": 0.003, "reward": 2.4999940395355225, "reward_std": 2.4168521122192033e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 2443 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.331606217616581, "grad_norm": 0.10389491901240697, "kl": 0.576171875, "learning_rate": 3.6683937823834193e-07, "loss": 0.0023, "reward": 2.4999974966049194, "reward_std": 2.1534840470849304e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 2444 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.33419689119171, "grad_norm": 0.41900751036282224, "kl": 0.53125, "learning_rate": 3.6658031088082904e-07, "loss": 0.0014, "reward": 2.4999966621398926, "reward_std": 4.43160737972903e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2445 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.33678756476684, "grad_norm": 0.13281866422797972, "kl": 0.58984375, "learning_rate": 3.6632124352331604e-07, "loss": 0.0022, "reward": 2.499995708465576, "reward_std": 1.6909523310459917e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2446 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.339378238341969, "grad_norm": 10.758898363699, "kl": 0.56640625, "learning_rate": 3.660621761658031e-07, "loss": 0.0026, "reward": 1.9270581007003784, "reward_std": 0.00304933155314302, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.427058219909668, "step": 2447 }, { "clip_ratio": 0.0, "completion_length": 36.8125, "epoch": 6.341968911917099, "grad_norm": 37.70681442978815, "kl": 0.55078125, "learning_rate": 3.6580310880829014e-07, "loss": 0.0018, "reward": 2.3116129636764526, "reward_std": 0.2599803860944121, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8116129040718079, "step": 2448 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.344559585492228, "grad_norm": 5.630534616535154, "kl": 0.53515625, "learning_rate": 3.6554404145077714e-07, "loss": 0.003, "reward": 1.9995397329330444, "reward_std": 0.00016405612723247032, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995397329330444, "step": 2449 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.347150259067358, "grad_norm": 3.8910361452207414, "kl": 0.564453125, "learning_rate": 3.6528497409326425e-07, "loss": 0.002, "reward": 2.4999393224716187, "reward_std": 5.345341696738615e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999393224716187, "step": 2450 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 6.349740932642487, "grad_norm": 1.1636360849948086, "kl": 0.763671875, "learning_rate": 3.650259067357513e-07, "loss": 0.004, "reward": 2.499992609024048, "reward_std": 5.680910817318363e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926090240479, "step": 2451 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.352331606217617, "grad_norm": 0.26832765027655786, "kl": 0.697265625, "learning_rate": 3.647668393782383e-07, "loss": 0.0034, "reward": 2.499997854232788, "reward_std": 1.8003885315920343e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2452 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.3549222797927465, "grad_norm": 0.13443939140398106, "kl": 0.5703125, "learning_rate": 3.6450777202072535e-07, "loss": 0.0018, "reward": 2.4999970197677612, "reward_std": 2.5179799649777124e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 2453 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.357512953367876, "grad_norm": 0.05883055055202931, "kl": 0.5322265625, "learning_rate": 3.642487046632124e-07, "loss": 0.0032, "reward": 2.499997854232788, "reward_std": 1.6208207398449304e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2454 }, { "clip_ratio": 0.0, "completion_length": 51.0625, "epoch": 6.360103626943006, "grad_norm": 1.6940569609666691, "kl": 0.3984375, "learning_rate": 3.6398963730569946e-07, "loss": 0.0007, "reward": 2.4999947547912598, "reward_std": 4.057784508404438e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2455 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.362694300518135, "grad_norm": 12.313099367941254, "kl": 0.580078125, "learning_rate": 3.637305699481865e-07, "loss": 0.0025, "reward": 1.957121729850769, "reward_std": 0.0008134684999276942, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4571215212345123, "step": 2456 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.365284974093265, "grad_norm": 35.33291645738469, "kl": 0.58203125, "learning_rate": 3.6347150259067356e-07, "loss": 0.0028, "reward": 1.997328519821167, "reward_std": 0.0007430430339354643, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4973284602165222, "step": 2457 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.367875647668393, "grad_norm": 0.09546959418270544, "kl": 0.572265625, "learning_rate": 3.6321243523316056e-07, "loss": 0.0023, "reward": 2.4999969005584717, "reward_std": 1.826785592129454e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 2458 }, { "clip_ratio": 0.0, "completion_length": 58.4375, "epoch": 6.370466321243523, "grad_norm": 0.4800825009888611, "kl": 0.4404296875, "learning_rate": 3.6295336787564767e-07, "loss": 0.0019, "reward": 2.4999947547912598, "reward_std": 5.133964918968559e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2459 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.373056994818652, "grad_norm": 0.9589513874582359, "kl": 0.634765625, "learning_rate": 3.626943005181347e-07, "loss": 0.0024, "reward": 2.499997615814209, "reward_std": 2.183047001835803e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2460 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.375647668393782, "grad_norm": 24.642341475855922, "kl": 0.568359375, "learning_rate": 3.624352331606217e-07, "loss": 0.0021, "reward": 1.9054408073425293, "reward_std": 0.0013638980180985527, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.405440777540207, "step": 2461 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.3782383419689115, "grad_norm": 0.09155707494797194, "kl": 0.673828125, "learning_rate": 3.6217616580310877e-07, "loss": 0.0034, "reward": 2.4999983310699463, "reward_std": 1.4926608855603263e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 2462 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.380829015544041, "grad_norm": 0.20226792064611065, "kl": 0.57421875, "learning_rate": 3.619170984455958e-07, "loss": 0.0014, "reward": 2.49999737739563, "reward_std": 3.4160370887548197e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2463 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.383419689119171, "grad_norm": 56.389666698763044, "kl": 0.525390625, "learning_rate": 3.616580310880829e-07, "loss": 0.0019, "reward": 1.5407918095588684, "reward_std": 0.19407917675562203, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0407918095588684, "step": 2464 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.3860103626943, "grad_norm": 3.7776183663839933, "kl": 0.509765625, "learning_rate": 3.6139896373056993e-07, "loss": 0.0018, "reward": 1.9550496339797974, "reward_std": 0.00025495435193079174, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.455049753189087, "step": 2465 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.38860103626943, "grad_norm": 15.993185333438726, "kl": 0.634765625, "learning_rate": 3.61139896373057e-07, "loss": 0.0024, "reward": 2.4372482299804688, "reward_std": 0.1774293115522596, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372481107711792, "step": 2466 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.391191709844559, "grad_norm": 0.8852296624841952, "kl": 0.5908203125, "learning_rate": 3.60880829015544e-07, "loss": 0.0019, "reward": 2.499973177909851, "reward_std": 5.385791837397846e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999732375144958, "step": 2467 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.393782383419689, "grad_norm": 0.13367925601202274, "kl": 0.55859375, "learning_rate": 3.606217616580311e-07, "loss": 0.0032, "reward": 2.4999945163726807, "reward_std": 3.00293788768613e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 2468 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.396373056994818, "grad_norm": 43.11711472734937, "kl": 0.65234375, "learning_rate": 3.6036269430051814e-07, "loss": 0.0029, "reward": 1.560460090637207, "reward_std": 0.1770102435730223, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0604601204395294, "step": 2469 }, { "clip_ratio": 0.0, "completion_length": 54.0, "epoch": 6.398963730569948, "grad_norm": 10.517962609782737, "kl": 0.43359375, "learning_rate": 3.6010362694300514e-07, "loss": 0.0014, "reward": 2.4998828172683716, "reward_std": 8.300930016957864e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998830556869507, "step": 2470 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.401554404145077, "grad_norm": 0.09535148131971638, "kl": 0.49609375, "learning_rate": 3.598445595854922e-07, "loss": 0.0026, "reward": 2.4999961853027344, "reward_std": 2.5311478566436563e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 2471 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.404145077720207, "grad_norm": 3.912459993880358, "kl": 0.583984375, "learning_rate": 3.5958549222797925e-07, "loss": 0.002, "reward": 1.9888484477996826, "reward_std": 8.400992396673246e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4888485074043274, "step": 2472 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.4067357512953365, "grad_norm": 4.487000743308011, "kl": 0.595703125, "learning_rate": 3.5932642487046635e-07, "loss": 0.0027, "reward": 1.9987983107566833, "reward_std": 7.907915119176323e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987983405590057, "step": 2473 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.409326424870466, "grad_norm": 33.78938781067593, "kl": 0.599609375, "learning_rate": 3.5906735751295335e-07, "loss": 0.003, "reward": 1.9904080629348755, "reward_std": 0.0002417833263734792, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4904080629348755, "step": 2474 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.4119170984455955, "grad_norm": 0.6434060052424042, "kl": 0.6328125, "learning_rate": 3.588082901554404e-07, "loss": 0.0025, "reward": 2.4999932050704956, "reward_std": 6.897476112044387e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 2475 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.414507772020725, "grad_norm": 51.005450960010975, "kl": 0.712890625, "learning_rate": 3.585492227979274e-07, "loss": 0.0029, "reward": 1.8305613994598389, "reward_std": 0.0044385627656993165, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3305614590644836, "step": 2476 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 6.417098445595855, "grad_norm": 30.366180922739456, "kl": 0.595703125, "learning_rate": 3.5829015544041446e-07, "loss": 0.0025, "reward": 1.3965474367141724, "reward_std": 0.14170544006265118, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8965473771095276, "step": 2477 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.419689119170984, "grad_norm": 363.3360089422605, "kl": 76.283203125, "learning_rate": 3.5803108808290156e-07, "loss": 0.3056, "reward": 2.499990701675415, "reward_std": 2.3834739579342568e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 2478 }, { "clip_ratio": 0.0, "completion_length": 38.3125, "epoch": 6.422279792746114, "grad_norm": 17.551790810475627, "kl": 0.583984375, "learning_rate": 3.577720207253886e-07, "loss": 0.002, "reward": 1.9975073337554932, "reward_std": 0.0005042523490601525, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975075125694275, "step": 2479 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.424870466321243, "grad_norm": 0.5490606211184182, "kl": 0.525390625, "learning_rate": 3.575129533678756e-07, "loss": 0.0018, "reward": 2.4999953508377075, "reward_std": 5.939624259099219e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2480 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.427461139896373, "grad_norm": 26.1585638653321, "kl": 0.734375, "learning_rate": 3.5725388601036267e-07, "loss": 0.0026, "reward": 2.499939203262329, "reward_std": 4.7983474360080436e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999939203262329, "step": 2481 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.430051813471502, "grad_norm": 0.8273259832569573, "kl": 0.5224609375, "learning_rate": 3.569948186528498e-07, "loss": 0.0018, "reward": 2.4999927282333374, "reward_std": 3.886736180902517e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 2482 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.432642487046632, "grad_norm": 0.051250694997337996, "kl": 0.658203125, "learning_rate": 3.5673575129533677e-07, "loss": 0.0031, "reward": 2.499998688697815, "reward_std": 1.6565215332775551e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2483 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.435233160621761, "grad_norm": 1.818603958053311, "kl": 0.6171875, "learning_rate": 3.564766839378238e-07, "loss": 0.0023, "reward": 2.499969720840454, "reward_std": 6.256649044189544e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999696612358093, "step": 2484 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.437823834196891, "grad_norm": 4.743737908839945, "kl": 0.671875, "learning_rate": 3.562176165803109e-07, "loss": 0.0027, "reward": 2.4999709129333496, "reward_std": 3.730114497102477e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999971091747284, "step": 2485 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.4404145077720205, "grad_norm": 0.6987966528214589, "kl": 0.638671875, "learning_rate": 3.559585492227979e-07, "loss": 0.003, "reward": 2.4999940395355225, "reward_std": 5.394414870352193e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 2486 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.44300518134715, "grad_norm": 0.07685592183138175, "kl": 0.583984375, "learning_rate": 3.55699481865285e-07, "loss": 0.0019, "reward": 2.499998450279236, "reward_std": 1.3653622659148823e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 2487 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.44559585492228, "grad_norm": 0.7347973284792281, "kl": 0.857421875, "learning_rate": 3.5544041450777204e-07, "loss": 0.0049, "reward": 2.4999951124191284, "reward_std": 2.0051574551871454e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 2488 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.448186528497409, "grad_norm": 0.06860525255649329, "kl": 0.48828125, "learning_rate": 3.5518134715025904e-07, "loss": 0.0017, "reward": 2.4999985694885254, "reward_std": 1.251547303127154e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2489 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.450777202072539, "grad_norm": 0.40046824726495245, "kl": 0.66015625, "learning_rate": 3.549222797927461e-07, "loss": 0.002, "reward": 2.49999463558197, "reward_std": 3.5408719440965797e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 2490 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.453367875647668, "grad_norm": 65.54843783283775, "kl": 0.57421875, "learning_rate": 3.546632124352332e-07, "loss": 0.0022, "reward": 2.3749523162841797, "reward_std": 0.23150436287278353, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874952495098114, "step": 2491 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.455958549222798, "grad_norm": 4.37351035279753, "kl": 0.576171875, "learning_rate": 3.544041450777202e-07, "loss": 0.0018, "reward": 2.4999496936798096, "reward_std": 4.256652164258412e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999496936798096, "step": 2492 }, { "clip_ratio": 0.0, "completion_length": 52.8125, "epoch": 6.458549222797927, "grad_norm": 15.961892297351133, "kl": 0.4091796875, "learning_rate": 3.5414507772020725e-07, "loss": 0.0017, "reward": 2.4998892545700073, "reward_std": 0.00020560180524853422, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999889314174652, "step": 2493 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.461139896373057, "grad_norm": 0.2972078717876346, "kl": 0.57421875, "learning_rate": 3.538860103626943e-07, "loss": 0.0023, "reward": 2.499996304512024, "reward_std": 4.00802912281506e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2494 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 6.463730569948186, "grad_norm": 17.672163035102912, "kl": 0.61328125, "learning_rate": 3.536269430051813e-07, "loss": 0.0035, "reward": 2.4372905492782593, "reward_std": 0.1773590263172764, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372904300689697, "step": 2495 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.466321243523316, "grad_norm": 0.7148086225198583, "kl": 0.703125, "learning_rate": 3.533678756476684e-07, "loss": 0.0032, "reward": 2.4999940395355225, "reward_std": 4.734522008220665e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 2496 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.4689119170984455, "grad_norm": 0.14099546471212357, "kl": 0.595703125, "learning_rate": 3.5310880829015546e-07, "loss": 0.0027, "reward": 2.4999951124191284, "reward_std": 2.9207587886048714e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 2497 }, { "clip_ratio": 0.0, "completion_length": 65.625, "epoch": 6.471502590673575, "grad_norm": 2.703778896797878, "kl": 0.232421875, "learning_rate": 3.5284974093264246e-07, "loss": -0.0004, "reward": 2.499972105026245, "reward_std": 3.149874669361452e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99997216463089, "step": 2498 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.474093264248705, "grad_norm": 0.43172242710105463, "kl": 0.609375, "learning_rate": 3.525906735751295e-07, "loss": 0.0016, "reward": 2.4999964237213135, "reward_std": 3.2851244213816244e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2499 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.476683937823834, "grad_norm": 0.3908890422479376, "kl": 0.58984375, "learning_rate": 3.5233160621761656e-07, "loss": 0.0018, "reward": 2.499996304512024, "reward_std": 1.7118950950134604e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2500 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.479274611398964, "grad_norm": 8.045372284076922, "kl": 0.61328125, "learning_rate": 3.520725388601036e-07, "loss": 0.0027, "reward": 1.8208626508712769, "reward_std": 0.00038032287125133735, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3208626210689545, "step": 2501 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.481865284974093, "grad_norm": 0.09673648480259098, "kl": 0.669921875, "learning_rate": 3.5181347150259067e-07, "loss": 0.003, "reward": 2.499998092651367, "reward_std": 2.124902835021203e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2502 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.484455958549223, "grad_norm": 0.7606915284154347, "kl": 0.662109375, "learning_rate": 3.515544041450777e-07, "loss": 0.0035, "reward": 2.4999972581863403, "reward_std": 1.6213274989240745e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2503 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.487046632124352, "grad_norm": 3.1599351489568965, "kl": 0.615234375, "learning_rate": 3.512953367875647e-07, "loss": 0.002, "reward": 1.9998215436935425, "reward_std": 2.3732471618131967e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998216032981873, "step": 2504 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.489637305699482, "grad_norm": 2.8120782662224775, "kl": 0.5283203125, "learning_rate": 3.510362694300518e-07, "loss": 0.0025, "reward": 1.9926303625106812, "reward_std": 8.41439750729478e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4926303625106812, "step": 2505 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.492227979274611, "grad_norm": 0.40365887362547903, "kl": 0.5703125, "learning_rate": 3.507772020725389e-07, "loss": 0.0034, "reward": 2.4999964237213135, "reward_std": 3.197951571110025e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2506 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.494818652849741, "grad_norm": 9.315739744129436, "kl": 0.55078125, "learning_rate": 3.505181347150259e-07, "loss": 0.003, "reward": 1.9991828203201294, "reward_std": 3.659055266780342e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991826713085175, "step": 2507 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.4974093264248705, "grad_norm": 4.405754236247919, "kl": 0.7109375, "learning_rate": 3.5025906735751293e-07, "loss": 0.0033, "reward": 2.0624531507492065, "reward_std": 0.17678555225279524, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624529719352722, "step": 2508 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.5, "grad_norm": 3.252864188333913, "kl": 0.529296875, "learning_rate": 3.5e-07, "loss": 0.0023, "reward": 2.4999501705169678, "reward_std": 5.590870060245834e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999501705169678, "step": 2509 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.5025906735751295, "grad_norm": 0.48642912020352014, "kl": 0.595703125, "learning_rate": 3.4974093264248704e-07, "loss": 0.0034, "reward": 2.4999964237213135, "reward_std": 4.995093377146986e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2510 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.505181347150259, "grad_norm": 0.689929489695362, "kl": 0.5859375, "learning_rate": 3.494818652849741e-07, "loss": 0.0035, "reward": 2.499994158744812, "reward_std": 5.815761824123911e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 2511 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.507772020725389, "grad_norm": 0.1336257466960235, "kl": 0.4228515625, "learning_rate": 3.4922279792746114e-07, "loss": 0.0004, "reward": 2.4999932050704956, "reward_std": 2.6320868755647098e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 2512 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.510362694300518, "grad_norm": 7.570809595782372, "kl": 0.615234375, "learning_rate": 3.4896373056994814e-07, "loss": 0.0022, "reward": 1.9997441172599792, "reward_std": 8.401849345318624e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997441172599792, "step": 2513 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.512953367875648, "grad_norm": 3.1765073551956386, "kl": 0.59765625, "learning_rate": 3.487046632124352e-07, "loss": 0.0019, "reward": 1.9981704354286194, "reward_std": 9.881821443968875e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981703162193298, "step": 2514 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.515544041450777, "grad_norm": 1.8978315288215728, "kl": 0.62109375, "learning_rate": 3.484455958549223e-07, "loss": 0.0021, "reward": 1.9991703033447266, "reward_std": 3.8660600694129243e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991703033447266, "step": 2515 }, { "clip_ratio": 0.0, "completion_length": 62.9375, "epoch": 6.518134715025907, "grad_norm": 13.022693191499009, "kl": 0.533203125, "learning_rate": 3.481865284974093e-07, "loss": 0.0021, "reward": 1.8486659526824951, "reward_std": 0.2033550472697243, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.34866601228714, "step": 2516 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.520725388601036, "grad_norm": 0.22287928658228098, "kl": 0.5166015625, "learning_rate": 3.4792746113989635e-07, "loss": 0.0019, "reward": 2.4999935626983643, "reward_std": 3.1768583994562505e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 2517 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.523316062176166, "grad_norm": 0.12095067631981185, "kl": 0.609375, "learning_rate": 3.476683937823834e-07, "loss": 0.0021, "reward": 2.499997138977051, "reward_std": 3.4189604320999933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2518 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 6.525906735751295, "grad_norm": 0.5526369124535546, "kl": 0.6171875, "learning_rate": 3.4740932642487046e-07, "loss": 0.002, "reward": 2.4999953508377075, "reward_std": 4.723354550151271e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 2519 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 6.528497409326425, "grad_norm": 49.850693944802835, "kl": 7.5234375, "learning_rate": 3.471502590673575e-07, "loss": 0.0311, "reward": 2.4372897148132324, "reward_std": 0.1773627500982684, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372894763946533, "step": 2520 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.5310880829015545, "grad_norm": 4.452693061692807, "kl": 0.525390625, "learning_rate": 3.4689119170984456e-07, "loss": 0.0028, "reward": 1.999515414237976, "reward_std": 4.594479889874492e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995153546333313, "step": 2521 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.533678756476684, "grad_norm": 0.2280083265811246, "kl": 0.578125, "learning_rate": 3.4663212435233156e-07, "loss": 0.0023, "reward": 2.4999959468841553, "reward_std": 4.121835445403121e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2522 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.536269430051814, "grad_norm": 0.22069841949970828, "kl": 0.615234375, "learning_rate": 3.463730569948186e-07, "loss": 0.0029, "reward": 2.4999969005584717, "reward_std": 2.999440141593368e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2523 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.538860103626943, "grad_norm": 0.13241095552344828, "kl": 0.56640625, "learning_rate": 3.461139896373057e-07, "loss": 0.0026, "reward": 2.499996781349182, "reward_std": 2.6606455776345683e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2524 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.541450777202073, "grad_norm": 1.2042626781221535, "kl": 0.58203125, "learning_rate": 3.458549222797927e-07, "loss": 0.0025, "reward": 2.499994158744812, "reward_std": 6.350226840368123e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 2525 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.544041450777202, "grad_norm": 35.67345340800504, "kl": 0.6953125, "learning_rate": 3.455958549222798e-07, "loss": 0.0027, "reward": 1.8745745420455933, "reward_std": 0.23185789426861447, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3745745718479156, "step": 2526 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.546632124352332, "grad_norm": 0.3949234426445559, "kl": 0.55859375, "learning_rate": 3.4533678756476683e-07, "loss": 0.0009, "reward": 2.4999938011169434, "reward_std": 3.931534649836976e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 2527 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.549222797927461, "grad_norm": 0.1285848469687838, "kl": 0.4931640625, "learning_rate": 3.450777202072539e-07, "loss": 0.002, "reward": 2.4999974966049194, "reward_std": 2.397010575805325e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2528 }, { "clip_ratio": 0.0, "completion_length": 52.125, "epoch": 6.551813471502591, "grad_norm": 0.7299352318393123, "kl": 0.35205078125, "learning_rate": 3.4481865284974093e-07, "loss": 0.0017, "reward": 2.499986171722412, "reward_std": 6.950015404072474e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999861121177673, "step": 2529 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.55440414507772, "grad_norm": 0.16374631381838006, "kl": 0.63671875, "learning_rate": 3.44559585492228e-07, "loss": 0.002, "reward": 2.4999969005584717, "reward_std": 2.6270508897141553e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 2530 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.55699481865285, "grad_norm": 5.625726721601132, "kl": 0.630859375, "learning_rate": 3.44300518134715e-07, "loss": 0.0014, "reward": 2.4999189376831055, "reward_std": 3.317349273856962e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999191164970398, "step": 2531 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 6.5595854922279795, "grad_norm": 0.3501248157191486, "kl": 0.45703125, "learning_rate": 3.4404145077720204e-07, "loss": 0.0015, "reward": 2.4999966621398926, "reward_std": 2.729808187496019e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 2532 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.562176165803109, "grad_norm": 2.922415918224334, "kl": 0.55078125, "learning_rate": 3.4378238341968914e-07, "loss": 0.0024, "reward": 2.4999440908432007, "reward_std": 4.152141332269821e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999944031238556, "step": 2533 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 6.564766839378239, "grad_norm": 104.0160604523358, "kl": 0.587890625, "learning_rate": 3.4352331606217614e-07, "loss": 0.0024, "reward": 1.9684685468673706, "reward_std": 0.05646914435783401, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4684685468673706, "step": 2534 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.567357512953368, "grad_norm": 0.171253998272141, "kl": 0.62109375, "learning_rate": 3.432642487046632e-07, "loss": 0.0023, "reward": 2.4999974966049194, "reward_std": 1.762431224960892e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2535 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.569948186528498, "grad_norm": 3.4462769684661034, "kl": 0.611328125, "learning_rate": 3.4300518134715025e-07, "loss": 0.0034, "reward": 2.4999901056289673, "reward_std": 1.1918034999780502e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 2536 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.572538860103627, "grad_norm": 0.3827412492464736, "kl": 0.583984375, "learning_rate": 3.4274611398963725e-07, "loss": 0.0015, "reward": 2.499994158744812, "reward_std": 5.507912192115327e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 2537 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.575129533678757, "grad_norm": 24.6665125395705, "kl": 0.57421875, "learning_rate": 3.4248704663212435e-07, "loss": 0.0022, "reward": 2.4374195337295532, "reward_std": 0.17698130340340867, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374194145202637, "step": 2538 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 6.577720207253886, "grad_norm": 1.2327184318534954, "kl": 0.990234375, "learning_rate": 3.422279792746114e-07, "loss": 0.0056, "reward": 2.499995231628418, "reward_std": 7.23657990420179e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 2539 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.580310880829016, "grad_norm": 0.1837923629233923, "kl": 0.6015625, "learning_rate": 3.419689119170984e-07, "loss": 0.0024, "reward": 2.499989867210388, "reward_std": 4.329044259065995e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989926815033, "step": 2540 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.582901554404145, "grad_norm": 0.16050077595605963, "kl": 0.5263671875, "learning_rate": 3.4170984455958546e-07, "loss": 0.0006, "reward": 2.4999977350234985, "reward_std": 2.261822828586446e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2541 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 6.585492227979275, "grad_norm": 0.09082322887657092, "kl": 0.49609375, "learning_rate": 3.4145077720207256e-07, "loss": 0.0017, "reward": 2.499995708465576, "reward_std": 1.6444170114482404e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2542 }, { "clip_ratio": 0.0, "completion_length": 110.8125, "epoch": 6.5880829015544045, "grad_norm": 0.412360257316038, "kl": 0.333984375, "learning_rate": 3.4119170984455956e-07, "loss": 0.0015, "reward": 2.4999958276748657, "reward_std": 3.6013015574098972e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2543 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.590673575129534, "grad_norm": 12.05075134300481, "kl": 0.5546875, "learning_rate": 3.409326424870466e-07, "loss": 0.0024, "reward": 1.8625807762145996, "reward_std": 0.0013466848067764658, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3625807762145996, "step": 2544 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.5932642487046635, "grad_norm": 7.830103166581443, "kl": 0.5634765625, "learning_rate": 3.4067357512953367e-07, "loss": 0.0017, "reward": 2.4999786615371704, "reward_std": 1.6338393720616295e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999786615371704, "step": 2545 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.595854922279793, "grad_norm": 1.3539726403584982, "kl": 1.2021484375, "learning_rate": 3.4041450777202067e-07, "loss": 0.0058, "reward": 2.4999817609786987, "reward_std": 1.2897937267553061e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999815225601196, "step": 2546 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.598445595854923, "grad_norm": 9.789107008865555, "kl": 0.64453125, "learning_rate": 3.401554404145078e-07, "loss": 0.0025, "reward": 1.9993674159049988, "reward_std": 3.848960619734498e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993673861026764, "step": 2547 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.601036269430052, "grad_norm": 3.226182271196218, "kl": 0.580078125, "learning_rate": 3.3989637305699483e-07, "loss": 0.0022, "reward": 2.499895453453064, "reward_std": 1.750432846847616e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998955726623535, "step": 2548 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.603626943005182, "grad_norm": 0.30986145283839434, "kl": 0.587890625, "learning_rate": 3.3963730569948183e-07, "loss": 0.0029, "reward": 2.4999945163726807, "reward_std": 4.972214298959443e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 2549 }, { "clip_ratio": 0.0, "completion_length": 92.3125, "epoch": 6.606217616580311, "grad_norm": 1.5930014107631858, "kl": 0.556640625, "learning_rate": 3.393782383419689e-07, "loss": 0.0015, "reward": 2.499996066093445, "reward_std": 2.7539030043044477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 2550 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.608808290155441, "grad_norm": 27.594985218051004, "kl": 0.5927734375, "learning_rate": 3.3911917098445593e-07, "loss": 0.0022, "reward": 1.7485663294792175, "reward_std": 0.26787346821220126, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2485662698745728, "step": 2551 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.61139896373057, "grad_norm": 0.22500560910428152, "kl": 0.623046875, "learning_rate": 3.38860103626943e-07, "loss": 0.0028, "reward": 2.4999966621398926, "reward_std": 1.770209621554386e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2552 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.6139896373057, "grad_norm": 0.14418035733986334, "kl": 0.63671875, "learning_rate": 3.3860103626943004e-07, "loss": 0.0024, "reward": 2.499995708465576, "reward_std": 1.949951013102691e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 2553 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.616580310880829, "grad_norm": 0.2729733316733411, "kl": 0.640625, "learning_rate": 3.383419689119171e-07, "loss": 0.0026, "reward": 2.4999935626983643, "reward_std": 3.3477872420917265e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 2554 }, { "clip_ratio": 0.0, "completion_length": 50.8125, "epoch": 6.619170984455959, "grad_norm": 0.060731655784772846, "kl": 0.455078125, "learning_rate": 3.380829015544041e-07, "loss": 0.0023, "reward": 2.499998092651367, "reward_std": 1.2793244650310953e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2555 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 6.6217616580310885, "grad_norm": 0.035779779765292, "kl": 0.5732421875, "learning_rate": 3.378238341968912e-07, "loss": 0.0014, "reward": 1.4999992847442627, "reward_std": 3.849661140975513e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999994039535522, "step": 2556 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.624352331606218, "grad_norm": 0.22667884523369272, "kl": 0.541015625, "learning_rate": 3.3756476683937825e-07, "loss": 0.0007, "reward": 2.4999970197677612, "reward_std": 3.0450231633949443e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2557 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.626943005181348, "grad_norm": 0.09095984024189582, "kl": 0.5869140625, "learning_rate": 3.3730569948186525e-07, "loss": 0.0027, "reward": 2.4999977350234985, "reward_std": 1.5637142496416345e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2558 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.629533678756477, "grad_norm": 1.1088931566932956, "kl": 0.51953125, "learning_rate": 3.370466321243523e-07, "loss": 0.0031, "reward": 1.9999357461929321, "reward_std": 1.0583863740976085e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999355673789978, "step": 2559 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.632124352331607, "grad_norm": 0.17955739740636073, "kl": 0.59765625, "learning_rate": 3.3678756476683935e-07, "loss": 0.0022, "reward": 2.499997854232788, "reward_std": 1.3127217926012236e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2560 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.634715025906736, "grad_norm": 0.05708168593220096, "kl": 0.53125, "learning_rate": 3.365284974093264e-07, "loss": 0.0011, "reward": 2.499994993209839, "reward_std": 2.448397196985752e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 2561 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 6.637305699481866, "grad_norm": 19.89522830321068, "kl": 0.615234375, "learning_rate": 3.3626943005181346e-07, "loss": 0.0021, "reward": 2.4998018741607666, "reward_std": 0.00045278063248588296, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999802052974701, "step": 2562 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.639896373056995, "grad_norm": 0.29392224183282584, "kl": 0.587890625, "learning_rate": 3.360103626943005e-07, "loss": 0.0022, "reward": 2.4999889135360718, "reward_std": 5.9904020872636465e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999889135360718, "step": 2563 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.642487046632124, "grad_norm": 0.13106442372656738, "kl": 0.58984375, "learning_rate": 3.357512953367875e-07, "loss": 0.0034, "reward": 2.499996542930603, "reward_std": 3.7942767789900245e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2564 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.6450777202072535, "grad_norm": 0.9681326501376056, "kl": 0.603515625, "learning_rate": 3.354922279792746e-07, "loss": 0.0033, "reward": 1.9997896552085876, "reward_std": 1.5792831391081563e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997895658016205, "step": 2565 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.647668393782383, "grad_norm": 0.19708887480439427, "kl": 0.56640625, "learning_rate": 3.3523316062176167e-07, "loss": 0.0025, "reward": 2.4999951124191284, "reward_std": 4.040085059386911e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 2566 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.650259067357513, "grad_norm": 0.12004796002235518, "kl": 0.7421875, "learning_rate": 3.3497409326424867e-07, "loss": 0.0034, "reward": 2.499997854232788, "reward_std": 1.7860778598333127e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2567 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.652849740932642, "grad_norm": 35.48401861626182, "kl": 0.55859375, "learning_rate": 3.347150259067357e-07, "loss": 0.0027, "reward": 1.9868532419204712, "reward_std": 0.011069259455553038, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4868531823158264, "step": 2568 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.655440414507772, "grad_norm": 26.004122691299152, "kl": 0.54296875, "learning_rate": 3.344559585492228e-07, "loss": 0.0016, "reward": 1.9972790479660034, "reward_std": 0.00023357092936748813, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4972789883613586, "step": 2569 }, { "clip_ratio": 0.0, "completion_length": 67.6875, "epoch": 6.658031088082901, "grad_norm": 0.9884624649398649, "kl": 0.4736328125, "learning_rate": 3.3419689119170983e-07, "loss": 0.0029, "reward": 2.4999938011169434, "reward_std": 5.513084033736959e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 2570 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.660621761658031, "grad_norm": 0.05776646268777295, "kl": 0.525390625, "learning_rate": 3.339378238341969e-07, "loss": 0.0021, "reward": 2.4999983310699463, "reward_std": 7.984220928847208e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2571 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.66321243523316, "grad_norm": 0.39062856280611896, "kl": 0.615234375, "learning_rate": 3.3367875647668393e-07, "loss": 0.003, "reward": 2.499992609024048, "reward_std": 6.3657274722572765e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924898147583, "step": 2572 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.66580310880829, "grad_norm": 0.9422779564887496, "kl": 0.546875, "learning_rate": 3.3341968911917093e-07, "loss": 0.0025, "reward": 2.499992847442627, "reward_std": 5.139506129125948e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 2573 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.668393782383419, "grad_norm": 0.13761087473586736, "kl": 0.6015625, "learning_rate": 3.33160621761658e-07, "loss": 0.0021, "reward": 2.4999958276748657, "reward_std": 2.8767564117515576e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2574 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.670984455958549, "grad_norm": 0.06411698161826337, "kl": 0.5859375, "learning_rate": 3.329015544041451e-07, "loss": 0.0025, "reward": 2.4999983310699463, "reward_std": 1.510851717512196e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 2575 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 6.6735751295336785, "grad_norm": 0.9413156050855895, "kl": 0.84765625, "learning_rate": 3.326424870466321e-07, "loss": 0.0033, "reward": 2.4999899864196777, "reward_std": 9.727512406243477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898076057434, "step": 2576 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.676165803108808, "grad_norm": 0.2066312709743625, "kl": 0.591796875, "learning_rate": 3.3238341968911914e-07, "loss": 0.0016, "reward": 2.4999966621398926, "reward_std": 2.0913388141252653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 2577 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.678756476683938, "grad_norm": 0.6048593256992686, "kl": 0.6220703125, "learning_rate": 3.321243523316062e-07, "loss": 0.0026, "reward": 2.4999979734420776, "reward_std": 3.025106536824751e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2578 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.681347150259067, "grad_norm": 3.3667490485315, "kl": 0.634765625, "learning_rate": 3.3186528497409325e-07, "loss": 0.0025, "reward": 2.3124784231185913, "reward_std": 0.2587865337987978, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124784231185913, "step": 2579 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.683937823834197, "grad_norm": 1.5680757595547221, "kl": 0.576171875, "learning_rate": 3.316062176165803e-07, "loss": 0.0014, "reward": 2.49999463558197, "reward_std": 5.065743380328058e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2580 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.686528497409326, "grad_norm": 0.13492528000289114, "kl": 0.5068359375, "learning_rate": 3.3134715025906735e-07, "loss": 0.0025, "reward": 2.499997615814209, "reward_std": 2.5062678332687938e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2581 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.689119170984456, "grad_norm": 0.2544904811727551, "kl": 0.58203125, "learning_rate": 3.3108808290155435e-07, "loss": 0.001, "reward": 2.4999966621398926, "reward_std": 3.670354885798588e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 2582 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.691709844559585, "grad_norm": 0.2219847851909355, "kl": 0.603515625, "learning_rate": 3.308290155440414e-07, "loss": 0.0018, "reward": 2.499998927116394, "reward_std": 7.424881403039763e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 2583 }, { "clip_ratio": 0.0, "completion_length": 36.0625, "epoch": 6.694300518134715, "grad_norm": 0.8011708537331311, "kl": 0.587890625, "learning_rate": 3.305699481865285e-07, "loss": 0.0025, "reward": 2.499994397163391, "reward_std": 4.666415179599426e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 2584 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.696891191709844, "grad_norm": 51.67838377503675, "kl": 0.7265625, "learning_rate": 3.303108808290155e-07, "loss": 0.0033, "reward": 1.9978221654891968, "reward_std": 0.0027418287631917337, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978220462799072, "step": 2585 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.699481865284974, "grad_norm": 14.066892617289529, "kl": 0.5546875, "learning_rate": 3.3005181347150256e-07, "loss": 0.0016, "reward": 2.499913454055786, "reward_std": 0.00022202646022151384, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999135732650757, "step": 2586 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.7020725388601035, "grad_norm": 22.531739092825575, "kl": 0.646484375, "learning_rate": 3.297927461139896e-07, "loss": 0.0021, "reward": 1.9988712072372437, "reward_std": 0.00010663975075431154, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988711774349213, "step": 2587 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.704663212435233, "grad_norm": 0.4804202241756651, "kl": 0.5185546875, "learning_rate": 3.295336787564767e-07, "loss": 0.0024, "reward": 2.4999940395355225, "reward_std": 4.471949523576768e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 2588 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.7072538860103625, "grad_norm": 4.105229241648636, "kl": 0.662109375, "learning_rate": 3.292746113989637e-07, "loss": 0.0023, "reward": 2.4999916553497314, "reward_std": 6.343163818200992e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991536140442, "step": 2589 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.709844559585492, "grad_norm": 0.23604650813022537, "kl": 0.625, "learning_rate": 3.290155440414508e-07, "loss": 0.0038, "reward": 2.4999948740005493, "reward_std": 3.974345872848062e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 2590 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.712435233160622, "grad_norm": 0.10849167813624833, "kl": 0.494140625, "learning_rate": 3.287564766839378e-07, "loss": 0.0021, "reward": 2.499997138977051, "reward_std": 1.7942866747944208e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2591 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.715025906735751, "grad_norm": 0.2961769744611308, "kl": 0.60546875, "learning_rate": 3.2849740932642483e-07, "loss": 0.0026, "reward": 2.499997138977051, "reward_std": 2.3381018650070473e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 2592 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.717616580310881, "grad_norm": 0.04125957867599274, "kl": 0.56640625, "learning_rate": 3.2823834196891193e-07, "loss": 0.0017, "reward": 2.499997615814209, "reward_std": 1.311662686021009e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2593 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.72020725388601, "grad_norm": 10.672103151884405, "kl": 0.5625, "learning_rate": 3.27979274611399e-07, "loss": 0.0018, "reward": 1.9933863878250122, "reward_std": 0.0003054944096447798, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4933863878250122, "step": 2594 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.72279792746114, "grad_norm": 0.3139300752149427, "kl": 0.630859375, "learning_rate": 3.27720207253886e-07, "loss": 0.0027, "reward": 2.499995708465576, "reward_std": 2.0028541030114866e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 2595 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.725388601036269, "grad_norm": 0.15345488211398525, "kl": 0.640625, "learning_rate": 3.2746113989637304e-07, "loss": 0.0034, "reward": 2.499991774559021, "reward_std": 2.4906015596570796e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999915957450867, "step": 2596 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.727979274611399, "grad_norm": 3.94564611480237, "kl": 0.669921875, "learning_rate": 3.2720207253886004e-07, "loss": 0.003, "reward": 1.998685598373413, "reward_std": 6.344885741782491e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498685508966446, "step": 2597 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.730569948186528, "grad_norm": 0.1971369607066066, "kl": 0.53515625, "learning_rate": 3.2694300518134714e-07, "loss": 0.0039, "reward": 2.499995231628418, "reward_std": 3.546284915501019e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 2598 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.733160621761658, "grad_norm": 5.848081743173891, "kl": 0.583984375, "learning_rate": 3.266839378238342e-07, "loss": 0.0013, "reward": 2.4999241828918457, "reward_std": 2.9622204010593123e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999244213104248, "step": 2599 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.7357512953367875, "grad_norm": 0.2206290847471597, "kl": 0.578125, "learning_rate": 3.2642487046632125e-07, "loss": 0.0029, "reward": 2.499995231628418, "reward_std": 3.5381975749260164e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 2600 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.738341968911917, "grad_norm": 20.507241542060424, "kl": 0.599609375, "learning_rate": 3.2616580310880825e-07, "loss": 0.0021, "reward": 2.4373151063919067, "reward_std": 0.1772415928871851, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373151063919067, "step": 2601 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.740932642487047, "grad_norm": 15.442342525455052, "kl": 0.564453125, "learning_rate": 3.2590673575129535e-07, "loss": 0.0014, "reward": 2.0543004274368286, "reward_std": 0.18009086341970715, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5543005466461182, "step": 2602 }, { "clip_ratio": 0.0, "completion_length": 68.375, "epoch": 6.743523316062176, "grad_norm": 0.08880068563276554, "kl": 0.37646484375, "learning_rate": 3.256476683937824e-07, "loss": 0.0022, "reward": 2.499998092651367, "reward_std": 1.6253806620625255e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2603 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.746113989637306, "grad_norm": 0.1472111334689301, "kl": 0.51171875, "learning_rate": 3.253886010362694e-07, "loss": 0.0022, "reward": 2.4999983310699463, "reward_std": 1.5325628623941157e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2604 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.748704663212435, "grad_norm": 0.23526280112124026, "kl": 0.587890625, "learning_rate": 3.2512953367875646e-07, "loss": 0.0031, "reward": 2.499995231628418, "reward_std": 3.671416266115557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 2605 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.751295336787565, "grad_norm": 1.7022499958772488, "kl": 0.8984375, "learning_rate": 3.248704663212435e-07, "loss": 0.0035, "reward": 2.4999887943267822, "reward_std": 1.292620208914741e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988853931427, "step": 2606 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.753886010362694, "grad_norm": 0.2147851697852046, "kl": 0.591796875, "learning_rate": 3.2461139896373056e-07, "loss": 0.0026, "reward": 2.4999685287475586, "reward_std": 4.098958470422076e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999686479568481, "step": 2607 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.756476683937824, "grad_norm": 12.032540300140415, "kl": 0.63671875, "learning_rate": 3.243523316062176e-07, "loss": 0.0023, "reward": 1.7297423481941223, "reward_std": 0.0006678785734948178, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2297424226999283, "step": 2608 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 6.759067357512953, "grad_norm": 0.7565432002908565, "kl": 0.859375, "learning_rate": 3.2409326424870467e-07, "loss": 0.0032, "reward": 2.499988079071045, "reward_std": 1.2154487421867088e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 2609 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 6.761658031088083, "grad_norm": 0.13455559573114925, "kl": 0.587890625, "learning_rate": 3.2383419689119167e-07, "loss": 0.0024, "reward": 2.499975800514221, "reward_std": 3.612600266933441e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999975860118866, "step": 2610 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 6.7642487046632125, "grad_norm": 14.029612164987206, "kl": 0.572265625, "learning_rate": 3.235751295336787e-07, "loss": 0.0017, "reward": 1.945224404335022, "reward_std": 0.01475976220172015, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4452244639396667, "step": 2611 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.766839378238342, "grad_norm": 6.547993068103591, "kl": 0.623046875, "learning_rate": 3.2331606217616583e-07, "loss": 0.0021, "reward": 2.437437653541565, "reward_std": 0.17678547140189949, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937437653541565, "step": 2612 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.769430051813472, "grad_norm": 7.5103171920314615, "kl": 0.5849609375, "learning_rate": 3.2305699481865283e-07, "loss": 0.0029, "reward": 1.8539791107177734, "reward_std": 0.002041163443834648, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3539791703224182, "step": 2613 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.772020725388601, "grad_norm": 0.06540892114995131, "kl": 0.4853515625, "learning_rate": 3.227979274611399e-07, "loss": 0.0021, "reward": 2.499998092651367, "reward_std": 1.7457676904086838e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2614 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 6.774611398963731, "grad_norm": 72.63338007894475, "kl": 0.751953125, "learning_rate": 3.2253886010362693e-07, "loss": 0.003, "reward": 1.4922677874565125, "reward_std": 0.005267134052701294, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9922677576541901, "step": 2615 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.77720207253886, "grad_norm": 0.5801883205466328, "kl": 0.5146484375, "learning_rate": 3.22279792746114e-07, "loss": 0.002, "reward": 2.49999463558197, "reward_std": 4.4377568428899394e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2616 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.77979274611399, "grad_norm": 0.08495207498178828, "kl": 0.62109375, "learning_rate": 3.2202072538860104e-07, "loss": 0.0023, "reward": 2.499998092651367, "reward_std": 1.1814952927124978e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2617 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.782383419689119, "grad_norm": 1.2267516642142968, "kl": 0.5078125, "learning_rate": 3.217616580310881e-07, "loss": 0.0021, "reward": 2.499994158744812, "reward_std": 5.004491782756304e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 2618 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.784974093264249, "grad_norm": 0.33162845442675415, "kl": 0.587890625, "learning_rate": 3.215025906735751e-07, "loss": 0.0035, "reward": 2.499992609024048, "reward_std": 4.364744199847337e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 2619 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.787564766839378, "grad_norm": 0.1259788196852407, "kl": 0.62890625, "learning_rate": 3.2124352331606214e-07, "loss": 0.0037, "reward": 2.499996304512024, "reward_std": 3.39040695962467e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2620 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.790155440414508, "grad_norm": 0.11712548171819483, "kl": 0.609375, "learning_rate": 3.2098445595854925e-07, "loss": 0.0029, "reward": 2.499994993209839, "reward_std": 2.48993040941059e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 2621 }, { "clip_ratio": 0.0, "completion_length": 177.1875, "epoch": 6.7927461139896375, "grad_norm": 0.6758655910781053, "kl": 0.3359375, "learning_rate": 3.2072538860103625e-07, "loss": 0.0016, "reward": 2.4999797344207764, "reward_std": 7.111461172826239e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999797344207764, "step": 2622 }, { "clip_ratio": 0.0, "completion_length": 66.3125, "epoch": 6.795336787564767, "grad_norm": 0.15593447732981458, "kl": 0.4912109375, "learning_rate": 3.204663212435233e-07, "loss": 0.0024, "reward": 2.499990224838257, "reward_std": 3.4399275818941533e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 2623 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.7979274611398965, "grad_norm": 29.35026504112677, "kl": 0.513671875, "learning_rate": 3.2020725388601035e-07, "loss": 0.0025, "reward": 2.3749542236328125, "reward_std": 0.2315270150257902, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749540448188782, "step": 2624 }, { "clip_ratio": 0.0, "completion_length": 38.0, "epoch": 6.800518134715026, "grad_norm": 5.99470335967406, "kl": 0.56640625, "learning_rate": 3.199481865284974e-07, "loss": 0.0017, "reward": 1.9121917486190796, "reward_std": 0.00045189330194261856, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4121918082237244, "step": 2625 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 6.803108808290156, "grad_norm": 0.7407097022339368, "kl": 0.5498046875, "learning_rate": 3.1968911917098446e-07, "loss": 0.0018, "reward": 2.4999932050704956, "reward_std": 6.212685775608406e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 2626 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.805699481865285, "grad_norm": 11.567722570103708, "kl": 0.6015625, "learning_rate": 3.194300518134715e-07, "loss": 0.0028, "reward": 1.8845289945602417, "reward_std": 0.0016080748960121127, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3845289945602417, "step": 2627 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.808290155440415, "grad_norm": 0.09294851255621667, "kl": 0.62890625, "learning_rate": 3.191709844559585e-07, "loss": 0.0029, "reward": 2.499997854232788, "reward_std": 1.6124543833484495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2628 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.810880829015544, "grad_norm": 0.4524401510327518, "kl": 0.591796875, "learning_rate": 3.1891191709844556e-07, "loss": 0.0015, "reward": 2.4999927282333374, "reward_std": 4.098774184058129e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 2629 }, { "clip_ratio": 0.0, "completion_length": 37.6875, "epoch": 6.813471502590674, "grad_norm": 0.6653411082948673, "kl": 0.564453125, "learning_rate": 3.1865284974093267e-07, "loss": 0.002, "reward": 2.4999947547912598, "reward_std": 2.7873016961166286e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2630 }, { "clip_ratio": 0.0, "completion_length": 44.25, "epoch": 6.816062176165803, "grad_norm": 0.3702123623453324, "kl": 0.455078125, "learning_rate": 3.1839378238341967e-07, "loss": 0.0017, "reward": 2.499994993209839, "reward_std": 3.1852805477683432e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 2631 }, { "clip_ratio": 0.0, "completion_length": 45.25, "epoch": 6.818652849740933, "grad_norm": 2.387940726436642, "kl": 0.509765625, "learning_rate": 3.181347150259067e-07, "loss": 0.002, "reward": 2.4999938011169434, "reward_std": 4.994251298739982e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 2632 }, { "clip_ratio": 0.0, "completion_length": 39.625, "epoch": 6.821243523316062, "grad_norm": 0.5595640584468617, "kl": 0.4521484375, "learning_rate": 3.178756476683938e-07, "loss": 0.0018, "reward": 2.4999966621398926, "reward_std": 2.7485283453643206e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2633 }, { "clip_ratio": 0.0, "completion_length": 38.0, "epoch": 6.823834196891192, "grad_norm": 0.18742380162463845, "kl": 0.51953125, "learning_rate": 3.176165803108808e-07, "loss": 0.0027, "reward": 2.4999982118606567, "reward_std": 1.3594919892057078e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2634 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 6.8264248704663215, "grad_norm": 0.36010807760114727, "kl": 0.642578125, "learning_rate": 3.173575129533679e-07, "loss": 0.0023, "reward": 2.499996781349182, "reward_std": 2.887275741159101e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2635 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.829015544041451, "grad_norm": 0.30518428664676717, "kl": 0.58203125, "learning_rate": 3.1709844559585493e-07, "loss": 0.0023, "reward": 2.499996781349182, "reward_std": 1.6964377209660597e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 2636 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.831606217616581, "grad_norm": 2.8497916845888436, "kl": 0.69921875, "learning_rate": 3.1683937823834193e-07, "loss": 0.0025, "reward": 2.3749804496765137, "reward_std": 0.23146095145705203, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749805688858032, "step": 2637 }, { "clip_ratio": 0.0, "completion_length": 114.3125, "epoch": 6.83419689119171, "grad_norm": 0.7379507441259705, "kl": 0.4169921875, "learning_rate": 3.16580310880829e-07, "loss": 0.003, "reward": 2.499997854232788, "reward_std": 1.0855220580197056e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2638 }, { "clip_ratio": 0.0, "completion_length": 58.25, "epoch": 6.83678756476684, "grad_norm": 0.4214523751649702, "kl": 0.5673828125, "learning_rate": 3.163212435233161e-07, "loss": 0.0028, "reward": 2.4999942779541016, "reward_std": 3.159865059387812e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 2639 }, { "clip_ratio": 0.0, "completion_length": 43.75, "epoch": 6.839378238341969, "grad_norm": 22.319181319215073, "kl": 0.67578125, "learning_rate": 3.160621761658031e-07, "loss": 0.0031, "reward": 2.4993340969085693, "reward_std": 0.00041619939111114945, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9993340373039246, "step": 2640 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.841968911917099, "grad_norm": 0.25231808005941253, "kl": 0.61328125, "learning_rate": 3.1580310880829014e-07, "loss": 0.0027, "reward": 2.4999955892562866, "reward_std": 3.499925469441223e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2641 }, { "clip_ratio": 0.0, "completion_length": 102.375, "epoch": 6.844559585492228, "grad_norm": 5.684887504287627, "kl": 0.3857421875, "learning_rate": 3.155440414507772e-07, "loss": 0.0017, "reward": 1.9456443786621094, "reward_std": 0.0003965381826844805, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4456443190574646, "step": 2642 }, { "clip_ratio": 0.0, "completion_length": 43.5625, "epoch": 6.847150259067358, "grad_norm": 0.14999298385128676, "kl": 0.51953125, "learning_rate": 3.152849740932642e-07, "loss": 0.0011, "reward": 2.499998092651367, "reward_std": 1.5742372738714039e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2643 }, { "clip_ratio": 0.0, "completion_length": 72.25, "epoch": 6.849740932642487, "grad_norm": 0.7236686210798136, "kl": 0.41796875, "learning_rate": 3.150259067357513e-07, "loss": 0.0014, "reward": 2.499990701675415, "reward_std": 4.274629873179947e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905824661255, "step": 2644 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.852331606217617, "grad_norm": 13.450923247008108, "kl": 0.59765625, "learning_rate": 3.1476683937823835e-07, "loss": 0.0028, "reward": 2.4999959468841553, "reward_std": 3.915709271495871e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2645 }, { "clip_ratio": 0.0, "completion_length": 73.8125, "epoch": 6.8549222797927465, "grad_norm": 0.297115627829623, "kl": 0.443359375, "learning_rate": 3.1450777202072535e-07, "loss": 0.0025, "reward": 2.499996066093445, "reward_std": 4.092347580808564e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 2646 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.857512953367876, "grad_norm": 0.14846637429704046, "kl": 0.611328125, "learning_rate": 3.142487046632124e-07, "loss": 0.0017, "reward": 2.499995231628418, "reward_std": 3.2032246508606477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2647 }, { "clip_ratio": 0.0, "completion_length": 37.75, "epoch": 6.860103626943005, "grad_norm": 0.7988857431872334, "kl": 0.62109375, "learning_rate": 3.139896373056995e-07, "loss": 0.0028, "reward": 2.4999945163726807, "reward_std": 4.854964544165341e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 2648 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.862694300518134, "grad_norm": 0.884611222881597, "kl": 0.6171875, "learning_rate": 3.137305699481865e-07, "loss": 0.0023, "reward": 2.499987840652466, "reward_std": 9.37633149078465e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987781047821, "step": 2649 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.865284974093264, "grad_norm": 5.451251961773984, "kl": 0.51953125, "learning_rate": 3.1347150259067356e-07, "loss": 0.0023, "reward": 1.7658424377441406, "reward_std": 0.00018743465838610973, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2658424973487854, "step": 2650 }, { "clip_ratio": 0.0, "completion_length": 90.9375, "epoch": 6.867875647668393, "grad_norm": 0.4079116374960755, "kl": 0.3427734375, "learning_rate": 3.132124352331606e-07, "loss": 0.0024, "reward": 2.499989628791809, "reward_std": 3.2802618648020143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895095825195, "step": 2651 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.870466321243523, "grad_norm": 0.1426616336989692, "kl": 0.5576171875, "learning_rate": 3.129533678756476e-07, "loss": 0.0021, "reward": 2.499999165534973, "reward_std": 8.7676201587783e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 2652 }, { "clip_ratio": 0.0, "completion_length": 237.9375, "epoch": 6.873056994818652, "grad_norm": 0.18722696799170283, "kl": 0.1402587890625, "learning_rate": 3.126943005181347e-07, "loss": 0.0013, "reward": 2.499996066093445, "reward_std": 4.259555225871736e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2653 }, { "clip_ratio": 0.0, "completion_length": 194.75, "epoch": 6.875647668393782, "grad_norm": 0.06190333158995358, "kl": 0.29931640625, "learning_rate": 3.124352331606218e-07, "loss": 0.0002, "reward": 2.499995470046997, "reward_std": 2.6046949130886787e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 2654 }, { "clip_ratio": 0.0, "completion_length": 199.75, "epoch": 6.8782383419689115, "grad_norm": 0.4634670814057826, "kl": 0.19580078125, "learning_rate": 3.121761658031088e-07, "loss": 0.0008, "reward": 2.4999955892562866, "reward_std": 4.785621968039777e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2655 }, { "clip_ratio": 0.0, "completion_length": 247.0, "epoch": 6.880829015544041, "grad_norm": 0.10374405288171888, "kl": 0.1591796875, "learning_rate": 3.1191709844559583e-07, "loss": 0.0004, "reward": 2.4999934434890747, "reward_std": 3.689405389195599e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 2656 }, { "clip_ratio": 0.0, "completion_length": 89.4375, "epoch": 6.883419689119171, "grad_norm": 0.39630031247323033, "kl": 0.30078125, "learning_rate": 3.116580310880829e-07, "loss": 0.0032, "reward": 2.4999923706054688, "reward_std": 5.535524906008504e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 2657 }, { "clip_ratio": 0.0, "completion_length": 120.375, "epoch": 6.8860103626943, "grad_norm": 3.810986803277096, "kl": 0.37255859375, "learning_rate": 3.1139896373056993e-07, "loss": 0.0012, "reward": 1.4976598620414734, "reward_std": 8.930298281484284e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9976598620414734, "step": 2658 }, { "clip_ratio": 0.0, "completion_length": 99.625, "epoch": 6.88860103626943, "grad_norm": 2.799727158208184, "kl": 0.3447265625, "learning_rate": 3.11139896373057e-07, "loss": 0.0014, "reward": 1.9998128414154053, "reward_std": 1.6439483658814424e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998127818107605, "step": 2659 }, { "clip_ratio": 0.0, "completion_length": 72.125, "epoch": 6.891191709844559, "grad_norm": 3.9177114566813653, "kl": 0.458984375, "learning_rate": 3.1088082901554404e-07, "loss": 0.0016, "reward": 1.9345767498016357, "reward_std": 0.00034057736161230423, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4345768988132477, "step": 2660 }, { "clip_ratio": 0.0, "completion_length": 137.75, "epoch": 6.893782383419689, "grad_norm": 3.2134312767572024, "kl": 0.37548828125, "learning_rate": 3.1062176165803104e-07, "loss": 0.0018, "reward": 1.999786376953125, "reward_std": 2.332293297513388e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499786376953125, "step": 2661 }, { "clip_ratio": 0.0, "completion_length": 69.6875, "epoch": 6.896373056994818, "grad_norm": 0.6434932466973682, "kl": 0.5048828125, "learning_rate": 3.1036269430051814e-07, "loss": 0.0035, "reward": 2.499995231628418, "reward_std": 3.817415858975437e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 2662 }, { "clip_ratio": 0.0, "completion_length": 85.6875, "epoch": 6.898963730569948, "grad_norm": 0.338302456192817, "kl": 0.404296875, "learning_rate": 3.101036269430052e-07, "loss": 0.0013, "reward": 2.499997854232788, "reward_std": 1.2933705590967293e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2663 }, { "clip_ratio": 0.0, "completion_length": 256.5625, "epoch": 6.901554404145077, "grad_norm": 2.1258884831923366, "kl": 0.157958984375, "learning_rate": 3.098445595854922e-07, "loss": 0.0015, "reward": 1.9965699911117554, "reward_std": 0.0004569274798313927, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4965699315071106, "step": 2664 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.904145077720207, "grad_norm": 0.3180871805362857, "kl": 0.62109375, "learning_rate": 3.0958549222797925e-07, "loss": 0.0018, "reward": 2.4999972581863403, "reward_std": 2.35221239108796e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2665 }, { "clip_ratio": 0.0, "completion_length": 37.625, "epoch": 6.9067357512953365, "grad_norm": 3.168915012760514, "kl": 0.5693359375, "learning_rate": 3.093264248704663e-07, "loss": 0.002, "reward": 1.9866647720336914, "reward_std": 0.0002730870309335387, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4866649210453033, "step": 2666 }, { "clip_ratio": 0.0, "completion_length": 47.25, "epoch": 6.909326424870466, "grad_norm": 1.2859448664848676, "kl": 0.634765625, "learning_rate": 3.0906735751295335e-07, "loss": 0.0035, "reward": 1.999306559562683, "reward_std": 3.8479502222799056e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993063509464264, "step": 2667 }, { "clip_ratio": 0.0, "completion_length": 82.25, "epoch": 6.9119170984455955, "grad_norm": 0.45395596173603614, "kl": 0.4208984375, "learning_rate": 3.088082901554404e-07, "loss": 0.0012, "reward": 2.4999966621398926, "reward_std": 2.4942166874097893e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2668 }, { "clip_ratio": 0.0, "completion_length": 159.5625, "epoch": 6.914507772020725, "grad_norm": 6.068043457104671, "kl": 1.6474609375, "learning_rate": 3.0854922279792746e-07, "loss": 0.007, "reward": 2.499996542930603, "reward_std": 3.507345354591962e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2669 }, { "clip_ratio": 0.0, "completion_length": 125.8125, "epoch": 6.917098445595855, "grad_norm": 45.085484989627794, "kl": 0.259765625, "learning_rate": 3.0829015544041446e-07, "loss": 0.0009, "reward": 2.4374114274978638, "reward_std": 0.17702006061472275, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374111890792847, "step": 2670 }, { "clip_ratio": 0.0, "completion_length": 158.5, "epoch": 6.919689119170984, "grad_norm": 4.485929222820897, "kl": 0.25146484375, "learning_rate": 3.080310880829015e-07, "loss": 0.001, "reward": 2.4318203926086426, "reward_std": 0.19275066877025893, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9318203330039978, "step": 2671 }, { "clip_ratio": 0.0, "completion_length": 236.1875, "epoch": 6.922279792746114, "grad_norm": 1.1481948638955366, "kl": 0.42041015625, "learning_rate": 3.077720207253886e-07, "loss": 0.001, "reward": 2.499994158744812, "reward_std": 7.312045681828749e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 2672 }, { "clip_ratio": 0.0, "completion_length": 60.5, "epoch": 6.924870466321243, "grad_norm": 0.346505288618612, "kl": 0.53125, "learning_rate": 3.075129533678756e-07, "loss": 0.002, "reward": 2.4999982118606567, "reward_std": 1.4052205301595677e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 2673 }, { "clip_ratio": 0.0, "completion_length": 354.375, "epoch": 6.927461139896373, "grad_norm": 0.2307839320412542, "kl": 0.154541015625, "learning_rate": 3.0725388601036267e-07, "loss": 0.0007, "reward": 2.499995231628418, "reward_std": 4.740027122807078e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2674 }, { "clip_ratio": 0.0, "completion_length": 278.1875, "epoch": 6.930051813471502, "grad_norm": 2.2611951578904033, "kl": 0.084716796875, "learning_rate": 3.069948186528497e-07, "loss": 0.0005, "reward": 2.437395215034485, "reward_std": 0.1770643861040071, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.93739515542984, "step": 2675 }, { "clip_ratio": 0.0, "completion_length": 412.5, "epoch": 6.932642487046632, "grad_norm": 0.4944028376522965, "kl": 0.1494140625, "learning_rate": 3.067357512953368e-07, "loss": 0.0016, "reward": 2.499995231628418, "reward_std": 1.761127293775644e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 2676 }, { "clip_ratio": 0.0, "completion_length": 264.125, "epoch": 6.935233160621761, "grad_norm": 0.061906912586494174, "kl": 0.115478515625, "learning_rate": 3.0647668393782383e-07, "loss": -0.0004, "reward": 2.499998092651367, "reward_std": 1.3794637254704867e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2677 }, { "clip_ratio": 0.0, "completion_length": 305.8125, "epoch": 6.937823834196891, "grad_norm": 0.21579516945714095, "kl": 0.21142578125, "learning_rate": 3.062176165803109e-07, "loss": 0.0019, "reward": 2.4999959468841553, "reward_std": 3.0739556677872315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2678 }, { "clip_ratio": 0.0, "completion_length": 150.4375, "epoch": 6.9404145077720205, "grad_norm": 0.564404259480908, "kl": 0.3076171875, "learning_rate": 3.059585492227979e-07, "loss": 0.002, "reward": 2.499980926513672, "reward_std": 7.057784387143329e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999807476997375, "step": 2679 }, { "clip_ratio": 0.0, "completion_length": 278.875, "epoch": 6.94300518134715, "grad_norm": 0.0583171153441049, "kl": 0.103271484375, "learning_rate": 3.0569948186528493e-07, "loss": -0.0002, "reward": 2.4999974966049194, "reward_std": 1.4537956474214297e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2680 }, { "clip_ratio": 0.0, "completion_length": 125.3125, "epoch": 6.94559585492228, "grad_norm": 0.21391051008090628, "kl": 0.3740234375, "learning_rate": 3.0544041450777204e-07, "loss": 0.0011, "reward": 2.4999948740005493, "reward_std": 3.6709993764816318e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2681 }, { "clip_ratio": 0.0, "completion_length": 206.8125, "epoch": 6.948186528497409, "grad_norm": 0.3359097245723334, "kl": 0.1396484375, "learning_rate": 3.0518134715025904e-07, "loss": 0.0021, "reward": 2.499948263168335, "reward_std": 8.535561050848628e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999480247497559, "step": 2682 }, { "clip_ratio": 0.0, "completion_length": 261.4375, "epoch": 6.950777202072539, "grad_norm": 2.7704281096540213, "kl": 0.197998046875, "learning_rate": 3.049222797927461e-07, "loss": -0.0001, "reward": 2.4999982118606567, "reward_std": 1.7992605307881604e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 2683 }, { "clip_ratio": 0.0, "completion_length": 176.5, "epoch": 6.953367875647668, "grad_norm": 0.12194307796119151, "kl": 0.23486328125, "learning_rate": 3.0466321243523314e-07, "loss": 0.002, "reward": 2.4999945163726807, "reward_std": 2.2499032183986856e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 2684 }, { "clip_ratio": 0.0, "completion_length": 262.875, "epoch": 6.955958549222798, "grad_norm": 5.717945520190545, "kl": 0.2080078125, "learning_rate": 3.044041450777202e-07, "loss": 0.0008, "reward": 1.3525401949882507, "reward_std": 0.0015482378366868943, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8525401949882507, "step": 2685 }, { "clip_ratio": 0.0, "completion_length": 188.375, "epoch": 6.958549222797927, "grad_norm": 0.09638576830112813, "kl": 0.32470703125, "learning_rate": 3.0414507772020725e-07, "loss": 0.0013, "reward": 2.4999977350234985, "reward_std": 2.0514265770543716e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2686 }, { "clip_ratio": 0.0, "completion_length": 203.8125, "epoch": 6.961139896373057, "grad_norm": 0.10154372564963778, "kl": 0.097412109375, "learning_rate": 3.038860103626943e-07, "loss": 0.001, "reward": 2.4999972581863403, "reward_std": 2.935250677182921e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 2687 }, { "clip_ratio": 0.0, "completion_length": 156.25, "epoch": 6.963730569948186, "grad_norm": 0.25404301310109767, "kl": 0.16552734375, "learning_rate": 3.036269430051813e-07, "loss": 0.0001, "reward": 2.499997854232788, "reward_std": 1.8216970261164533e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2688 }, { "clip_ratio": 0.0, "completion_length": 216.4375, "epoch": 6.966321243523316, "grad_norm": 21.044170576485044, "kl": 0.3994140625, "learning_rate": 3.0336787564766835e-07, "loss": 0.0017, "reward": 2.374773621559143, "reward_std": 0.23187007569549678, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874773621559143, "step": 2689 }, { "clip_ratio": 0.0, "completion_length": 201.5, "epoch": 6.9689119170984455, "grad_norm": 2.7988595771187175, "kl": 0.1875, "learning_rate": 3.0310880829015546e-07, "loss": 0.0009, "reward": 2.499629259109497, "reward_std": 0.0010295482910009923, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999629259109497, "step": 2690 }, { "clip_ratio": 0.0, "completion_length": 192.4375, "epoch": 6.971502590673575, "grad_norm": 1.1036819801765227, "kl": 0.21337890625, "learning_rate": 3.0284974093264246e-07, "loss": 0.0006, "reward": 2.499993681907654, "reward_std": 4.187603963146103e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 2691 }, { "clip_ratio": 0.0, "completion_length": 165.625, "epoch": 6.974093264248705, "grad_norm": 2.019736391699108, "kl": 0.258056640625, "learning_rate": 3.025906735751295e-07, "loss": 0.0011, "reward": 1.99904203414917, "reward_std": 4.501106900534069e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990421533584595, "step": 2692 }, { "clip_ratio": 0.0, "completion_length": 200.8125, "epoch": 6.976683937823834, "grad_norm": 0.3263817249423562, "kl": 0.21484375, "learning_rate": 3.0233160621761657e-07, "loss": 0.0002, "reward": 2.4999920129776, "reward_std": 7.236032843138673e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 2693 }, { "clip_ratio": 0.0, "completion_length": 192.0, "epoch": 6.979274611398964, "grad_norm": 1.6410592115266542, "kl": 0.15966796875, "learning_rate": 3.0207253886010356e-07, "loss": 0.0019, "reward": 2.499917984008789, "reward_std": 3.869059764838312e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99991774559021, "step": 2694 }, { "clip_ratio": 0.0, "completion_length": 142.6875, "epoch": 6.981865284974093, "grad_norm": 0.4484104566886619, "kl": 0.26953125, "learning_rate": 3.0181347150259067e-07, "loss": 0.0015, "reward": 2.4999895095825195, "reward_std": 5.679362402588595e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895691871643, "step": 2695 }, { "clip_ratio": 0.0, "completion_length": 166.75, "epoch": 6.984455958549223, "grad_norm": 0.6056662230029155, "kl": 0.16748046875, "learning_rate": 3.015544041450777e-07, "loss": 0.0004, "reward": 2.4999945163726807, "reward_std": 6.576697160198819e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 2696 }, { "clip_ratio": 0.0, "completion_length": 114.0625, "epoch": 6.987046632124352, "grad_norm": 0.31468162708538183, "kl": 0.29443359375, "learning_rate": 3.012953367875647e-07, "loss": 0.0006, "reward": 2.4999942779541016, "reward_std": 2.7945516194449738e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 2697 }, { "clip_ratio": 0.0, "completion_length": 141.3125, "epoch": 6.989637305699482, "grad_norm": 0.4128109408919594, "kl": 0.171875, "learning_rate": 3.010362694300518e-07, "loss": 0.001, "reward": 2.4999945163726807, "reward_std": 4.097817509318702e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 2698 }, { "clip_ratio": 0.0, "completion_length": 104.625, "epoch": 6.992227979274611, "grad_norm": 8.198402893360587, "kl": 0.43994140625, "learning_rate": 3.007772020725389e-07, "loss": 0.0025, "reward": 1.8042879104614258, "reward_std": 0.0013463066745771357, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3042878806591034, "step": 2699 }, { "clip_ratio": 0.0, "completion_length": 73.5625, "epoch": 6.994818652849741, "grad_norm": 9.774708838144486, "kl": 0.34716796875, "learning_rate": 3.005181347150259e-07, "loss": 0.0018, "reward": 2.373953700065613, "reward_std": 0.2318424973746005, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8739535808563232, "step": 2700 }, { "clip_ratio": 0.0, "completion_length": 98.75, "epoch": 6.9974093264248705, "grad_norm": 0.7445630315488271, "kl": 0.23681640625, "learning_rate": 3.0025906735751293e-07, "loss": 0.0007, "reward": 2.4999970197677612, "reward_std": 3.3564775776540046e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 2701 }, { "clip_ratio": 0.0, "completion_length": 71.625, "epoch": 7.0, "grad_norm": 1.823763217674456, "kl": 0.25927734375, "learning_rate": 3e-07, "loss": 0.0017, "reward": 1.9764949083328247, "reward_std": 0.00014669514507659187, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4764948785305023, "step": 2702 }, { "clip_ratio": 0.0, "completion_length": 85.625, "epoch": 7.0025906735751295, "grad_norm": 0.569121717365465, "kl": 0.361328125, "learning_rate": 2.99740932642487e-07, "loss": 0.0009, "reward": 2.4999974966049194, "reward_std": 2.2646967750006297e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2703 }, { "clip_ratio": 0.0, "completion_length": 67.1875, "epoch": 7.005181347150259, "grad_norm": 0.7447953366827806, "kl": 0.169921875, "learning_rate": 2.994818652849741e-07, "loss": 0.0008, "reward": 2.499997615814209, "reward_std": 2.929365962245356e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2704 }, { "clip_ratio": 0.0, "completion_length": 70.8125, "epoch": 7.007772020725389, "grad_norm": 5.02352723984411, "kl": 0.212890625, "learning_rate": 2.9922279792746114e-07, "loss": 0.001, "reward": 1.992610216140747, "reward_std": 0.00013362670904371043, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4926101863384247, "step": 2705 }, { "clip_ratio": 0.0, "completion_length": 59.0, "epoch": 7.010362694300518, "grad_norm": 22.11905056800504, "kl": 0.1845703125, "learning_rate": 2.9896373056994814e-07, "loss": 0.0002, "reward": 2.3530369997024536, "reward_std": 0.27211283638007444, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8530371189117432, "step": 2706 }, { "clip_ratio": 0.0, "completion_length": 62.0, "epoch": 7.012953367875648, "grad_norm": 0.22293305868937738, "kl": 0.134765625, "learning_rate": 2.987046632124352e-07, "loss": 0.0005, "reward": 2.4999942779541016, "reward_std": 5.27215888723731e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 2707 }, { "clip_ratio": 0.0, "completion_length": 132.0, "epoch": 7.015544041450777, "grad_norm": 5.020182474708354, "kl": 0.21533203125, "learning_rate": 2.9844559585492225e-07, "loss": 0.0009, "reward": 1.9949113130569458, "reward_std": 9.656198551510897e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.494911402463913, "step": 2708 }, { "clip_ratio": 0.0, "completion_length": 73.3125, "epoch": 7.018134715025907, "grad_norm": 3.6875234197361904, "kl": 0.2177734375, "learning_rate": 2.981865284974093e-07, "loss": 0.0007, "reward": 1.9989154934883118, "reward_std": 3.434862583162612e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989156126976013, "step": 2709 }, { "clip_ratio": 0.0, "completion_length": 67.375, "epoch": 7.020725388601036, "grad_norm": 0.15734304763670398, "kl": 0.1787109375, "learning_rate": 2.9792746113989635e-07, "loss": -0.0002, "reward": 2.499997615814209, "reward_std": 1.83287085064876e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2710 }, { "clip_ratio": 0.0, "completion_length": 46.625, "epoch": 7.023316062176166, "grad_norm": 0.20989761900422077, "kl": 0.105224609375, "learning_rate": 2.976683937823834e-07, "loss": -0.0006, "reward": 2.4999964237213135, "reward_std": 3.0140967055558576e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 2711 }, { "clip_ratio": 0.0, "completion_length": 56.625, "epoch": 7.025906735751295, "grad_norm": 0.4221832723094387, "kl": 0.16845703125, "learning_rate": 2.974093264248704e-07, "loss": 0.0017, "reward": 2.4999959468841553, "reward_std": 5.543153690723557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2712 }, { "clip_ratio": 0.0, "completion_length": 47.8125, "epoch": 7.028497409326425, "grad_norm": 21.01471608615529, "kl": 0.1259765625, "learning_rate": 2.971502590673575e-07, "loss": 0.0014, "reward": 2.4372763633728027, "reward_std": 0.1774002746763017, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372762441635132, "step": 2713 }, { "clip_ratio": 0.0, "completion_length": 64.75, "epoch": 7.0310880829015545, "grad_norm": 19.27631022751777, "kl": 0.14892578125, "learning_rate": 2.9689119170984457e-07, "loss": 0.0004, "reward": 2.49875545501709, "reward_std": 0.00039149958865891676, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9987554550170898, "step": 2714 }, { "clip_ratio": 0.0, "completion_length": 50.4375, "epoch": 7.033678756476684, "grad_norm": 4.345907998496661, "kl": 0.228515625, "learning_rate": 2.966321243523316e-07, "loss": 0.0008, "reward": 1.999367356300354, "reward_std": 8.119913644577537e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499367505311966, "step": 2715 }, { "clip_ratio": 0.0, "completion_length": 58.625, "epoch": 7.036269430051814, "grad_norm": 14.635533084469731, "kl": 0.14453125, "learning_rate": 2.963730569948186e-07, "loss": 0.0007, "reward": 1.9900410175323486, "reward_std": 0.00024803806672935025, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4900410771369934, "step": 2716 }, { "clip_ratio": 0.0, "completion_length": 62.375, "epoch": 7.038860103626943, "grad_norm": 0.8898577007577386, "kl": 0.09619140625, "learning_rate": 2.9611398963730567e-07, "loss": 0.002, "reward": 2.499992847442627, "reward_std": 4.905405603494728e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 2717 }, { "clip_ratio": 0.0, "completion_length": 60.9375, "epoch": 7.041450777202073, "grad_norm": 1.6612582440878152, "kl": 0.107177734375, "learning_rate": 2.958549222797928e-07, "loss": -0.0006, "reward": 2.4999969005584717, "reward_std": 4.851921119097824e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2718 }, { "clip_ratio": 0.0, "completion_length": 74.375, "epoch": 7.044041450777202, "grad_norm": 4.901581387582648, "kl": 0.20263671875, "learning_rate": 2.955958549222798e-07, "loss": 0.0008, "reward": 2.0620840787887573, "reward_std": 0.1768648830190216, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5620839595794678, "step": 2719 }, { "clip_ratio": 0.0, "completion_length": 49.6875, "epoch": 7.046632124352332, "grad_norm": 0.5210273679890672, "kl": 0.14501953125, "learning_rate": 2.9533678756476683e-07, "loss": 0.001, "reward": 2.4999948740005493, "reward_std": 4.199686941319669e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 2720 }, { "clip_ratio": 0.0, "completion_length": 68.0, "epoch": 7.049222797927461, "grad_norm": 6.620291450504624, "kl": 0.168701171875, "learning_rate": 2.950777202072539e-07, "loss": 0.0012, "reward": 1.7662118673324585, "reward_std": 0.0011479298079848377, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2662118971347809, "step": 2721 }, { "clip_ratio": 0.0, "completion_length": 47.3125, "epoch": 7.051813471502591, "grad_norm": 0.22932980760212365, "kl": 0.065673828125, "learning_rate": 2.9481865284974093e-07, "loss": 0.0007, "reward": 2.4999966621398926, "reward_std": 2.193232376157539e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2722 }, { "clip_ratio": 0.0, "completion_length": 61.6875, "epoch": 7.05440414507772, "grad_norm": 0.8190332820271327, "kl": 0.15869140625, "learning_rate": 2.94559585492228e-07, "loss": 0.0008, "reward": 2.49998676776886, "reward_std": 8.754524287724053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999867677688599, "step": 2723 }, { "clip_ratio": 0.0, "completion_length": 55.125, "epoch": 7.05699481865285, "grad_norm": 8.198532025284965, "kl": 0.067138671875, "learning_rate": 2.9430051813471504e-07, "loss": 0.0009, "reward": 2.499794602394104, "reward_std": 0.0001917930989634442, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997944235801697, "step": 2724 }, { "clip_ratio": 0.0, "completion_length": 72.25, "epoch": 7.0595854922279795, "grad_norm": 1.0986247466584278, "kl": 0.1162109375, "learning_rate": 2.9404145077720204e-07, "loss": 0.0002, "reward": 2.499996304512024, "reward_std": 2.7830762974190293e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 2725 }, { "clip_ratio": 0.0, "completion_length": 61.75, "epoch": 7.062176165803109, "grad_norm": 0.07829665781726379, "kl": 0.1572265625, "learning_rate": 2.937823834196891e-07, "loss": 0.0011, "reward": 2.4999988079071045, "reward_std": 1.141340902677257e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2726 }, { "clip_ratio": 0.0, "completion_length": 77.875, "epoch": 7.064766839378239, "grad_norm": 0.2516380611374419, "kl": 0.21826171875, "learning_rate": 2.935233160621762e-07, "loss": 0.0006, "reward": 2.4999972581863403, "reward_std": 2.8480247920015245e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2727 }, { "clip_ratio": 0.0, "completion_length": 145.5625, "epoch": 7.067357512953368, "grad_norm": 6.131663327496966, "kl": 0.1806640625, "learning_rate": 2.932642487046632e-07, "loss": 0.0009, "reward": 2.312238931655884, "reward_std": 0.2591267380967963, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122390508651733, "step": 2728 }, { "clip_ratio": 0.0, "completion_length": 62.5, "epoch": 7.069948186528498, "grad_norm": 23.92330530169106, "kl": 0.1640625, "learning_rate": 2.9300518134715025e-07, "loss": 0.0007, "reward": 2.4995886087417603, "reward_std": 0.0003340699414593473, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999588668346405, "step": 2729 }, { "clip_ratio": 0.0, "completion_length": 50.3125, "epoch": 7.072538860103627, "grad_norm": 0.2020154580262241, "kl": 0.1334228515625, "learning_rate": 2.927461139896373e-07, "loss": 0.0008, "reward": 2.499998092651367, "reward_std": 1.8601297142595286e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2730 }, { "clip_ratio": 0.0, "completion_length": 65.625, "epoch": 7.075129533678757, "grad_norm": 1.6792140566516012, "kl": 0.13720703125, "learning_rate": 2.924870466321243e-07, "loss": 0.0005, "reward": 2.499990224838257, "reward_std": 9.38606262934627e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902844429016, "step": 2731 }, { "clip_ratio": 0.0, "completion_length": 71.4375, "epoch": 7.077720207253886, "grad_norm": 0.2621990187066607, "kl": 0.22412109375, "learning_rate": 2.922279792746114e-07, "loss": -0.0003, "reward": 2.49999737739563, "reward_std": 3.027597699656326e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2732 }, { "clip_ratio": 0.0, "completion_length": 69.75, "epoch": 7.080310880829016, "grad_norm": 12.456148974241055, "kl": 0.115478515625, "learning_rate": 2.9196891191709846e-07, "loss": 0.0003, "reward": 1.9977619647979736, "reward_std": 0.00034299797835046775, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497761994600296, "step": 2733 }, { "clip_ratio": 0.0, "completion_length": 51.4375, "epoch": 7.082901554404145, "grad_norm": 0.14577059620634794, "kl": 0.1572265625, "learning_rate": 2.9170984455958546e-07, "loss": 0.0022, "reward": 2.499998092651367, "reward_std": 1.5416800351886195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2734 }, { "clip_ratio": 0.0, "completion_length": 56.0, "epoch": 7.085492227979275, "grad_norm": 0.17857576338091405, "kl": 0.15673828125, "learning_rate": 2.914507772020725e-07, "loss": 0.0002, "reward": 2.4999969005584717, "reward_std": 1.6470879131702532e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2735 }, { "clip_ratio": 0.0, "completion_length": 48.5, "epoch": 7.0880829015544045, "grad_norm": 0.4556923362730404, "kl": 0.3740234375, "learning_rate": 2.911917098445596e-07, "loss": 0.0014, "reward": 2.499996066093445, "reward_std": 3.2598966299701715e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2736 }, { "clip_ratio": 0.0, "completion_length": 83.3125, "epoch": 7.090673575129534, "grad_norm": 0.1890989992248151, "kl": 0.128173828125, "learning_rate": 2.909326424870466e-07, "loss": -0.0007, "reward": 2.4999966621398926, "reward_std": 2.4774795974735753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 2737 }, { "clip_ratio": 0.0, "completion_length": 55.75, "epoch": 7.0932642487046635, "grad_norm": 0.3721910769971386, "kl": 0.113037109375, "learning_rate": 2.9067357512953367e-07, "loss": 0.0016, "reward": 2.4999914169311523, "reward_std": 4.1055883457374875e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991536140442, "step": 2738 }, { "clip_ratio": 0.0, "completion_length": 51.8125, "epoch": 7.095854922279793, "grad_norm": 0.10831821610534416, "kl": 0.137939453125, "learning_rate": 2.904145077720207e-07, "loss": 0.0006, "reward": 2.4999961853027344, "reward_std": 1.6040703627595576e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2739 }, { "clip_ratio": 0.0, "completion_length": 72.3125, "epoch": 7.098445595854923, "grad_norm": 0.2580354246702583, "kl": 0.13427734375, "learning_rate": 2.901554404145077e-07, "loss": -0.0006, "reward": 2.499997615814209, "reward_std": 2.4808185230540403e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2740 }, { "clip_ratio": 0.0, "completion_length": 73.625, "epoch": 7.101036269430052, "grad_norm": 3.7921552513192722, "kl": 0.3056640625, "learning_rate": 2.8989637305699483e-07, "loss": 0.0018, "reward": 1.920997977256775, "reward_std": 0.001405717248644578, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4209979176521301, "step": 2741 }, { "clip_ratio": 0.0, "completion_length": 57.6875, "epoch": 7.103626943005182, "grad_norm": 0.5536072376643791, "kl": 0.24365234375, "learning_rate": 2.896373056994819e-07, "loss": -0.0001, "reward": 2.4999985694885254, "reward_std": 1.4904555882822024e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2742 }, { "clip_ratio": 0.0, "completion_length": 44.375, "epoch": 7.106217616580311, "grad_norm": 1.075725186045907, "kl": 0.07415771484375, "learning_rate": 2.893782383419689e-07, "loss": -0.0001, "reward": 1.999372124671936, "reward_std": 3.3179710271724616e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499372124671936, "step": 2743 }, { "clip_ratio": 0.0, "completion_length": 49.5, "epoch": 7.108808290155441, "grad_norm": 0.15931380832833264, "kl": 0.0933837890625, "learning_rate": 2.8911917098445593e-07, "loss": 0.001, "reward": 2.4999979734420776, "reward_std": 1.6614399669379054e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2744 }, { "clip_ratio": 0.0, "completion_length": 63.5, "epoch": 7.11139896373057, "grad_norm": 2.398892000840722, "kl": 0.185546875, "learning_rate": 2.8886010362694304e-07, "loss": 0.0018, "reward": 2.4999845027923584, "reward_std": 3.7284518725755333e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984323978424, "step": 2745 }, { "clip_ratio": 0.0, "completion_length": 43.6875, "epoch": 7.1139896373057, "grad_norm": 0.09324531725077016, "kl": 0.208984375, "learning_rate": 2.8860103626943004e-07, "loss": 0.0016, "reward": 1.4999990463256836, "reward_std": 5.397430413722759e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999990463256836, "step": 2746 }, { "clip_ratio": 0.0, "completion_length": 66.5, "epoch": 7.116580310880829, "grad_norm": 0.1273377639345571, "kl": 0.1435546875, "learning_rate": 2.883419689119171e-07, "loss": 0.0006, "reward": 2.4999988079071045, "reward_std": 1.28331407722726e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 2747 }, { "clip_ratio": 0.0, "completion_length": 83.625, "epoch": 7.119170984455959, "grad_norm": 0.8780727065862953, "kl": 0.117919921875, "learning_rate": 2.8808290155440414e-07, "loss": -0.0, "reward": 2.499990701675415, "reward_std": 7.63561030225901e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908804893494, "step": 2748 }, { "clip_ratio": 0.0, "completion_length": 64.125, "epoch": 7.1217616580310885, "grad_norm": 0.25647649127342764, "kl": 0.111083984375, "learning_rate": 2.8782383419689114e-07, "loss": 0.0013, "reward": 2.499995231628418, "reward_std": 4.3182546392017684e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2749 }, { "clip_ratio": 0.0, "completion_length": 53.25, "epoch": 7.124352331606218, "grad_norm": 0.1320309939495788, "kl": 0.0908203125, "learning_rate": 2.8756476683937825e-07, "loss": 0.0001, "reward": 2.499998450279236, "reward_std": 1.2511067950526922e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 2750 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 7.126943005181348, "grad_norm": 0.675371053328721, "kl": 0.13427734375, "learning_rate": 2.873056994818653e-07, "loss": 0.0014, "reward": 2.4999977350234985, "reward_std": 3.0955292231737985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2751 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 7.129533678756476, "grad_norm": 0.07917153127612298, "kl": 0.0733642578125, "learning_rate": 2.870466321243523e-07, "loss": 0.0015, "reward": 2.499997138977051, "reward_std": 1.2772664490512398e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 2752 }, { "clip_ratio": 0.0, "completion_length": 63.9375, "epoch": 7.132124352331606, "grad_norm": 22.234257979348794, "kl": 0.079833984375, "learning_rate": 2.8678756476683936e-07, "loss": -0.0001, "reward": 2.4372711181640625, "reward_std": 0.17741738820052433, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937271237373352, "step": 2753 }, { "clip_ratio": 0.0, "completion_length": 47.625, "epoch": 7.134715025906735, "grad_norm": 3.406828833418073, "kl": 0.0841064453125, "learning_rate": 2.865284974093264e-07, "loss": 0.0008, "reward": 1.999072015285492, "reward_std": 0.0001026932214927001, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990719854831696, "step": 2754 }, { "clip_ratio": 0.0, "completion_length": 102.25, "epoch": 7.137305699481865, "grad_norm": 0.09385018354594273, "kl": 0.11572265625, "learning_rate": 2.8626943005181346e-07, "loss": 0.0008, "reward": 2.4999977350234985, "reward_std": 1.7511292185190541e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2755 }, { "clip_ratio": 0.0, "completion_length": 45.9375, "epoch": 7.139896373056994, "grad_norm": 0.1323049360908655, "kl": 0.06396484375, "learning_rate": 2.860103626943005e-07, "loss": 0.0014, "reward": 2.4999966621398926, "reward_std": 2.571039544818632e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2756 }, { "clip_ratio": 0.0, "completion_length": 62.875, "epoch": 7.142487046632124, "grad_norm": 2.1970864112669894, "kl": 0.108642578125, "learning_rate": 2.8575129533678757e-07, "loss": 0.0003, "reward": 2.499996781349182, "reward_std": 2.3749721549393144e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 2757 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 7.1450777202072535, "grad_norm": 0.2159591783590185, "kl": 0.140625, "learning_rate": 2.8549222797927457e-07, "loss": -0.0007, "reward": 2.4999964237213135, "reward_std": 2.3543741463072365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 2758 }, { "clip_ratio": 0.0, "completion_length": 48.0, "epoch": 7.147668393782383, "grad_norm": 1.4415101977374312, "kl": 0.35107421875, "learning_rate": 2.8523316062176167e-07, "loss": 0.0029, "reward": 2.499972105026245, "reward_std": 9.848175068327691e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999971866607666, "step": 2759 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 7.150259067357513, "grad_norm": 0.08356018750469829, "kl": 0.08447265625, "learning_rate": 2.849740932642487e-07, "loss": -0.0004, "reward": 2.499997854232788, "reward_std": 2.5110973638220457e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2760 }, { "clip_ratio": 0.0, "completion_length": 38.8125, "epoch": 7.152849740932642, "grad_norm": 9.30838516712042, "kl": 0.097900390625, "learning_rate": 2.847150259067357e-07, "loss": 0.0007, "reward": 2.3713927268981934, "reward_std": 0.0005720324654134856, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8713927865028381, "step": 2761 }, { "clip_ratio": 0.0, "completion_length": 52.0625, "epoch": 7.155440414507772, "grad_norm": 0.41846218615455133, "kl": 0.224609375, "learning_rate": 2.844559585492228e-07, "loss": 0.0016, "reward": 2.499995708465576, "reward_std": 5.274730256132898e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2762 }, { "clip_ratio": 0.0, "completion_length": 45.125, "epoch": 7.158031088082901, "grad_norm": 0.2967800270709217, "kl": 0.13037109375, "learning_rate": 2.8419689119170983e-07, "loss": 0.0007, "reward": 2.499997615814209, "reward_std": 3.1184480349111254e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2763 }, { "clip_ratio": 0.0, "completion_length": 37.75, "epoch": 7.160621761658031, "grad_norm": 18.89378965602224, "kl": 0.08544921875, "learning_rate": 2.839378238341969e-07, "loss": 0.0, "reward": 1.9957534074783325, "reward_std": 0.0004774313038069522, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4957535564899445, "step": 2764 }, { "clip_ratio": 0.0, "completion_length": 49.1875, "epoch": 7.16321243523316, "grad_norm": 0.09548112564824318, "kl": 0.06689453125, "learning_rate": 2.8367875647668393e-07, "loss": -0.0008, "reward": 2.499997615814209, "reward_std": 1.5476737473818503e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2765 }, { "clip_ratio": 0.0, "completion_length": 48.5, "epoch": 7.16580310880829, "grad_norm": 25.235559196393837, "kl": 0.16796875, "learning_rate": 2.83419689119171e-07, "loss": 0.0005, "reward": 2.1245734095573425, "reward_std": 0.23171603115122252, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6245734095573425, "step": 2766 }, { "clip_ratio": 0.0, "completion_length": 47.4375, "epoch": 7.168393782383419, "grad_norm": 10.866487938191703, "kl": 0.2158203125, "learning_rate": 2.83160621761658e-07, "loss": 0.0005, "reward": 1.9972398281097412, "reward_std": 0.0001761026896929252, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4972397685050964, "step": 2767 }, { "clip_ratio": 0.0, "completion_length": 50.1875, "epoch": 7.170984455958549, "grad_norm": 0.3920817589828947, "kl": 0.0966796875, "learning_rate": 2.8290155440414504e-07, "loss": 0.0017, "reward": 2.4999966621398926, "reward_std": 3.1635075288249936e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2768 }, { "clip_ratio": 0.0, "completion_length": 53.3125, "epoch": 7.1735751295336785, "grad_norm": 0.14242162163706185, "kl": 0.05126953125, "learning_rate": 2.8264248704663215e-07, "loss": 0.0006, "reward": 2.4999970197677612, "reward_std": 2.364397175824706e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2769 }, { "clip_ratio": 0.0, "completion_length": 41.75, "epoch": 7.176165803108808, "grad_norm": 4.148238489823068, "kl": 0.0618896484375, "learning_rate": 2.8238341968911915e-07, "loss": 0.0004, "reward": 2.49986732006073, "reward_std": 0.0003616849868421923, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998672008514404, "step": 2770 }, { "clip_ratio": 0.0, "completion_length": 62.4375, "epoch": 7.178756476683938, "grad_norm": 0.882542080410152, "kl": 0.12158203125, "learning_rate": 2.821243523316062e-07, "loss": 0.0015, "reward": 2.4999948740005493, "reward_std": 6.755963340765447e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 2771 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 7.181347150259067, "grad_norm": 1.7106334024062297, "kl": 0.865478515625, "learning_rate": 2.8186528497409325e-07, "loss": 0.0041, "reward": 2.4999938011169434, "reward_std": 6.9032377041367e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 2772 }, { "clip_ratio": 0.0, "completion_length": 41.3125, "epoch": 7.183937823834197, "grad_norm": 0.9634720246093539, "kl": 0.13720703125, "learning_rate": 2.816062176165803e-07, "loss": 0.0011, "reward": 2.4999706745147705, "reward_std": 9.926606594490295e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999704957008362, "step": 2773 }, { "clip_ratio": 0.0, "completion_length": 47.875, "epoch": 7.186528497409326, "grad_norm": 0.9763700872088213, "kl": 0.0875244140625, "learning_rate": 2.8134715025906736e-07, "loss": 0.0004, "reward": 2.4999964237213135, "reward_std": 4.051745008837315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 2774 }, { "clip_ratio": 0.0, "completion_length": 50.125, "epoch": 7.189119170984456, "grad_norm": 0.1978961703856874, "kl": 0.090576171875, "learning_rate": 2.810880829015544e-07, "loss": 0.0005, "reward": 2.4999985694885254, "reward_std": 1.7637462974562368e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 2775 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 7.191709844559585, "grad_norm": 0.3005341124289149, "kl": 0.07940673828125, "learning_rate": 2.808290155440414e-07, "loss": 0.0007, "reward": 2.4999935626983643, "reward_std": 3.5874263630830683e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 2776 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 7.194300518134715, "grad_norm": 0.27142335974627924, "kl": 0.07421875, "learning_rate": 2.8056994818652846e-07, "loss": 0.0007, "reward": 2.499993920326233, "reward_std": 3.7584792380584986e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 2777 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.196891191709844, "grad_norm": 0.07805892110493465, "kl": 0.090728759765625, "learning_rate": 2.8031088082901557e-07, "loss": 0.0001, "reward": 2.4999982118606567, "reward_std": 2.337225168957957e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2778 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 7.199481865284974, "grad_norm": 0.4053153123917317, "kl": 0.0579833984375, "learning_rate": 2.8005181347150257e-07, "loss": 0.0004, "reward": 2.499995470046997, "reward_std": 3.7510284300878993e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 2779 }, { "clip_ratio": 0.0, "completion_length": 58.8125, "epoch": 7.2020725388601035, "grad_norm": 0.061297485377620774, "kl": 0.103515625, "learning_rate": 2.797927461139896e-07, "loss": 0.0001, "reward": 2.499998092651367, "reward_std": 1.5198098708424368e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2780 }, { "clip_ratio": 0.0, "completion_length": 40.0, "epoch": 7.204663212435233, "grad_norm": 0.18572430127690548, "kl": 0.077880859375, "learning_rate": 2.7953367875647667e-07, "loss": 0.0007, "reward": 2.499997138977051, "reward_std": 2.5101629717028118e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2781 }, { "clip_ratio": 0.0, "completion_length": 36.75, "epoch": 7.2072538860103625, "grad_norm": 0.5069754569199848, "kl": 0.173828125, "learning_rate": 2.792746113989637e-07, "loss": 0.0017, "reward": 2.499993920326233, "reward_std": 4.875779495705501e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 2782 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.209844559585492, "grad_norm": 0.3078352270232401, "kl": 0.078369140625, "learning_rate": 2.790155440414508e-07, "loss": -0.0003, "reward": 2.4999923706054688, "reward_std": 3.3644195127635612e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 2783 }, { "clip_ratio": 0.0, "completion_length": 38.75, "epoch": 7.212435233160622, "grad_norm": 0.716475875381491, "kl": 0.0849609375, "learning_rate": 2.7875647668393783e-07, "loss": 0.0002, "reward": 2.4999938011169434, "reward_std": 4.077418680026312e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 2784 }, { "clip_ratio": 0.0, "completion_length": 38.875, "epoch": 7.215025906735751, "grad_norm": 3.9987436427963305, "kl": 0.1214599609375, "learning_rate": 2.7849740932642483e-07, "loss": 0.0003, "reward": 1.8853871822357178, "reward_std": 0.0006756444174698117, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3853871822357178, "step": 2785 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.217616580310881, "grad_norm": 0.5746806960643912, "kl": 0.0390625, "learning_rate": 2.782383419689119e-07, "loss": -0.0006, "reward": 2.499997615814209, "reward_std": 2.626373685643557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2786 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.22020725388601, "grad_norm": 0.4342116418965283, "kl": 0.0628662109375, "learning_rate": 2.77979274611399e-07, "loss": 0.0005, "reward": 2.499996066093445, "reward_std": 2.582114802862634e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 2787 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.22279792746114, "grad_norm": 0.09360002280017325, "kl": 0.0567626953125, "learning_rate": 2.77720207253886e-07, "loss": -0.0002, "reward": 2.4999983310699463, "reward_std": 2.2234952439248445e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 2788 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.225388601036269, "grad_norm": 0.0836814087338117, "kl": 0.068115234375, "learning_rate": 2.7746113989637304e-07, "loss": -0.0006, "reward": 2.4999948740005493, "reward_std": 2.8410729555616854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 2789 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 7.227979274611399, "grad_norm": 0.22721957239436388, "kl": 0.06005859375, "learning_rate": 2.772020725388601e-07, "loss": -0.0006, "reward": 2.4999947547912598, "reward_std": 3.0421141445913236e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2790 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.230569948186528, "grad_norm": 0.14052786254811955, "kl": 0.08203125, "learning_rate": 2.769430051813471e-07, "loss": 0.0012, "reward": 2.499996304512024, "reward_std": 3.0107394195511006e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2791 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.233160621761658, "grad_norm": 1.0404226442242954, "kl": 0.0643310546875, "learning_rate": 2.766839378238342e-07, "loss": 0.0002, "reward": 2.499955654144287, "reward_std": 1.0739899380496354e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999955654144287, "step": 2792 }, { "clip_ratio": 0.0, "completion_length": 39.9375, "epoch": 7.2357512953367875, "grad_norm": 0.11842060023621352, "kl": 0.0867919921875, "learning_rate": 2.7642487046632125e-07, "loss": 0.0009, "reward": 2.4999979734420776, "reward_std": 1.9331413341205916e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2793 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 7.238341968911917, "grad_norm": 14.12022342797606, "kl": 0.0682373046875, "learning_rate": 2.7616580310880825e-07, "loss": 0.0003, "reward": 2.0623674392700195, "reward_std": 0.17683051066956068, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5623673796653748, "step": 2794 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 7.240932642487047, "grad_norm": 13.848465650995172, "kl": 0.0941162109375, "learning_rate": 2.759067357512953e-07, "loss": 0.0004, "reward": 1.792869746685028, "reward_std": 0.0025054991798469928, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.292869746685028, "step": 2795 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.243523316062176, "grad_norm": 0.6273764536852642, "kl": 0.056884765625, "learning_rate": 2.756476683937824e-07, "loss": -0.001, "reward": 2.4999897480010986, "reward_std": 6.2963765685708495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898076057434, "step": 2796 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 7.246113989637306, "grad_norm": 0.9222748480744986, "kl": 0.10693359375, "learning_rate": 2.753886010362694e-07, "loss": 0.0003, "reward": 1.9989904761314392, "reward_std": 2.6255145144205017e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989904165267944, "step": 2797 }, { "clip_ratio": 0.0, "completion_length": 37.75, "epoch": 7.248704663212435, "grad_norm": 0.24421433028836534, "kl": 0.077880859375, "learning_rate": 2.7512953367875646e-07, "loss": -0.0003, "reward": 2.4999964237213135, "reward_std": 3.3715338076945045e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 2798 }, { "clip_ratio": 0.0, "completion_length": 36.9375, "epoch": 7.251295336787565, "grad_norm": 19.151656041504836, "kl": 0.1123046875, "learning_rate": 2.748704663212435e-07, "loss": 0.0004, "reward": 1.9833894968032837, "reward_std": 0.0005476024621771103, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.483389675617218, "step": 2799 }, { "clip_ratio": 0.0, "completion_length": 48.6875, "epoch": 7.253886010362694, "grad_norm": 0.48780848404956095, "kl": 0.0703125, "learning_rate": 2.746113989637305e-07, "loss": 0.0003, "reward": 2.499991774559021, "reward_std": 6.476408316302695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999919533729553, "step": 2800 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.256476683937824, "grad_norm": 0.3328052256251321, "kl": 0.03863525390625, "learning_rate": 2.743523316062176e-07, "loss": 0.0004, "reward": 2.499974489212036, "reward_std": 5.242035740593565e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999746680259705, "step": 2801 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.259067357512953, "grad_norm": 118.77202944814307, "kl": 0.125, "learning_rate": 2.7409326424870467e-07, "loss": -0.0005, "reward": 2.3124117851257324, "reward_std": 0.2588757092889864, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124120235443115, "step": 2802 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 7.261658031088083, "grad_norm": 0.4213469180093501, "kl": 0.107421875, "learning_rate": 2.7383419689119167e-07, "loss": 0.0004, "reward": 2.4999920129776, "reward_std": 6.604160375900392e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 2803 }, { "clip_ratio": 0.0, "completion_length": 42.75, "epoch": 7.2642487046632125, "grad_norm": 0.16623592266200646, "kl": 0.068115234375, "learning_rate": 2.735751295336787e-07, "loss": -0.001, "reward": 2.4999982118606567, "reward_std": 1.4977586033637635e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2804 }, { "clip_ratio": 0.0, "completion_length": 42.8125, "epoch": 7.266839378238342, "grad_norm": 3.9898939029703095, "kl": 0.14306640625, "learning_rate": 2.7331606217616583e-07, "loss": 0.0004, "reward": 1.9994579553604126, "reward_std": 5.429817809954329e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994579255580902, "step": 2805 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.269430051813472, "grad_norm": 0.10647955478312049, "kl": 0.10888671875, "learning_rate": 2.7305699481865283e-07, "loss": 0.001, "reward": 2.499996304512024, "reward_std": 2.5021242038292257e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2806 }, { "clip_ratio": 0.0, "completion_length": 39.0625, "epoch": 7.272020725388601, "grad_norm": 5.745287050302857, "kl": 0.13134765625, "learning_rate": 2.727979274611399e-07, "loss": 0.0, "reward": 1.8540267944335938, "reward_std": 0.0011137344131384452, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3540268242359161, "step": 2807 }, { "clip_ratio": 0.0, "completion_length": 38.1875, "epoch": 7.274611398963731, "grad_norm": 0.17052295274322676, "kl": 0.134765625, "learning_rate": 2.7253886010362694e-07, "loss": 0.0001, "reward": 2.4999945163726807, "reward_std": 2.5619144707889063e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 2808 }, { "clip_ratio": 0.0, "completion_length": 36.75, "epoch": 7.27720207253886, "grad_norm": 8.806248399927453, "kl": 0.08544921875, "learning_rate": 2.7227979274611393e-07, "loss": -0.0004, "reward": 2.062047243118286, "reward_std": 0.17695858235327933, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562047302722931, "step": 2809 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.27979274611399, "grad_norm": 0.41832897431886584, "kl": 0.09228515625, "learning_rate": 2.7202072538860104e-07, "loss": 0.0009, "reward": 2.499993920326233, "reward_std": 3.931162495973695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 2810 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.282383419689119, "grad_norm": 4.284472718345191, "kl": 0.10791015625, "learning_rate": 2.717616580310881e-07, "loss": 0.0, "reward": 2.187442421913147, "reward_std": 0.25878642559598575, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6874424815177917, "step": 2811 }, { "clip_ratio": 0.0, "completion_length": 55.8125, "epoch": 7.284974093264249, "grad_norm": 0.3099189979951269, "kl": 0.1484375, "learning_rate": 2.715025906735751e-07, "loss": -0.001, "reward": 2.499993085861206, "reward_std": 3.157718737156756e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 2812 }, { "clip_ratio": 0.0, "completion_length": 45.9375, "epoch": 7.287564766839378, "grad_norm": 2.537734512483909, "kl": 0.1484375, "learning_rate": 2.7124352331606215e-07, "loss": -0.0005, "reward": 1.9816818237304688, "reward_std": 0.00012104420579817088, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4816819727420807, "step": 2813 }, { "clip_ratio": 0.0, "completion_length": 34.125, "epoch": 7.290155440414508, "grad_norm": 3.78742418763649, "kl": 0.14111328125, "learning_rate": 2.709844559585492e-07, "loss": 0.0001, "reward": 2.4999947547912598, "reward_std": 7.405869496324158e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2814 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.2927461139896375, "grad_norm": 0.2163917865945186, "kl": 0.092529296875, "learning_rate": 2.7072538860103625e-07, "loss": 0.0005, "reward": 2.499988555908203, "reward_std": 5.6749556733848294e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999885559082031, "step": 2815 }, { "clip_ratio": 0.0, "completion_length": 44.5, "epoch": 7.295336787564767, "grad_norm": 0.23052783363695492, "kl": 0.10693359375, "learning_rate": 2.704663212435233e-07, "loss": -0.0003, "reward": 2.499997138977051, "reward_std": 2.322498517060012e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2816 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 7.2979274611398965, "grad_norm": 0.13332172594018538, "kl": 0.166015625, "learning_rate": 2.7020725388601036e-07, "loss": 0.0005, "reward": 2.499996781349182, "reward_std": 3.1560688285026117e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 2817 }, { "clip_ratio": 0.0, "completion_length": 51.625, "epoch": 7.300518134715026, "grad_norm": 2.371545243504021, "kl": 0.13525390625, "learning_rate": 2.6994818652849736e-07, "loss": 0.0007, "reward": 2.499991536140442, "reward_std": 1.0952465686386859e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991536140442, "step": 2818 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.303108808290156, "grad_norm": 0.6149013369118381, "kl": 0.04779052734375, "learning_rate": 2.6968911917098446e-07, "loss": -0.0, "reward": 2.499993085861206, "reward_std": 6.765184252799372e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 2819 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 7.305699481865285, "grad_norm": 0.12331061029529303, "kl": 0.08660888671875, "learning_rate": 2.694300518134715e-07, "loss": 0.0011, "reward": 2.499989151954651, "reward_std": 3.8477694488392444e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999891519546509, "step": 2820 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.308290155440415, "grad_norm": 0.48422328572556544, "kl": 0.0396728515625, "learning_rate": 2.691709844559585e-07, "loss": 0.0006, "reward": 2.499987244606018, "reward_std": 6.462998044298729e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999873638153076, "step": 2821 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.310880829015544, "grad_norm": 114.81426787472601, "kl": 0.096435546875, "learning_rate": 2.6891191709844557e-07, "loss": 0.001, "reward": 1.9665817022323608, "reward_std": 0.0026201429191132775, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4665816724300385, "step": 2822 }, { "clip_ratio": 0.0, "completion_length": 58.125, "epoch": 7.313471502590674, "grad_norm": 0.24100922836389044, "kl": 0.0679931640625, "learning_rate": 2.686528497409326e-07, "loss": 0.0011, "reward": 2.4999905824661255, "reward_std": 5.153240749677934e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990463256836, "step": 2823 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.316062176165803, "grad_norm": 0.26572598086062565, "kl": 0.0426025390625, "learning_rate": 2.6839378238341967e-07, "loss": 0.0012, "reward": 2.4999979734420776, "reward_std": 2.272717210871633e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2824 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.318652849740933, "grad_norm": 0.8230820690873245, "kl": 0.070068359375, "learning_rate": 2.681347150259067e-07, "loss": 0.001, "reward": 2.499993085861206, "reward_std": 9.370142606712761e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 2825 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.321243523316062, "grad_norm": 15.853041258093283, "kl": 0.0596923828125, "learning_rate": 2.678756476683938e-07, "loss": 0.0004, "reward": 1.9358866214752197, "reward_std": 0.1768560843047453, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4358866214752197, "step": 2826 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.323834196891192, "grad_norm": 0.2267238374656739, "kl": 0.0682373046875, "learning_rate": 2.676165803108808e-07, "loss": 0.0013, "reward": 2.4999947547912598, "reward_std": 3.0634977292720578e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2827 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.3264248704663215, "grad_norm": 2.4040322739748397, "kl": 0.1748046875, "learning_rate": 2.6735751295336783e-07, "loss": 0.0015, "reward": 2.499988317489624, "reward_std": 1.0105330602527829e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988317489624, "step": 2828 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.329015544041451, "grad_norm": 0.18399211060077583, "kl": 0.0927734375, "learning_rate": 2.6709844559585494e-07, "loss": 0.002, "reward": 2.499997854232788, "reward_std": 3.0380467705981573e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2829 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 7.331606217616581, "grad_norm": 3.943492718155222, "kl": 0.1068115234375, "learning_rate": 2.6683937823834194e-07, "loss": 0.0, "reward": 1.9466025233268738, "reward_std": 0.0004151339262534748, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4466025829315186, "step": 2830 }, { "clip_ratio": 0.0, "completion_length": 54.1875, "epoch": 7.33419689119171, "grad_norm": 0.26076750719657105, "kl": 0.095947265625, "learning_rate": 2.66580310880829e-07, "loss": 0.0001, "reward": 2.4999959468841553, "reward_std": 3.5348149367564474e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2831 }, { "clip_ratio": 0.0, "completion_length": 37.5625, "epoch": 7.33678756476684, "grad_norm": 13.947704372621098, "kl": 0.43505859375, "learning_rate": 2.6632124352331604e-07, "loss": 0.0006, "reward": 1.966921091079712, "reward_std": 0.07501061263269548, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.466921180486679, "step": 2832 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.339378238341969, "grad_norm": 18.654807167505492, "kl": 0.0751953125, "learning_rate": 2.6606217616580315e-07, "loss": 0.0009, "reward": 2.49998676776886, "reward_std": 1.3137173937138868e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986708164215, "step": 2833 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.341968911917099, "grad_norm": 0.5199476493170736, "kl": 0.0361328125, "learning_rate": 2.6580310880829015e-07, "loss": -0.0007, "reward": 2.499991774559021, "reward_std": 5.598021743935533e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 2834 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.344559585492228, "grad_norm": 0.2247444577988363, "kl": 0.0670166015625, "learning_rate": 2.655440414507772e-07, "loss": -0.0003, "reward": 2.499994993209839, "reward_std": 2.639208162236173e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 2835 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.347150259067358, "grad_norm": 0.10705600166099377, "kl": 0.0616455078125, "learning_rate": 2.652849740932642e-07, "loss": 0.0004, "reward": 2.499996781349182, "reward_std": 2.464444548877509e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 2836 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.349740932642487, "grad_norm": 0.018561783592428167, "kl": 0.107421875, "learning_rate": 2.6502590673575125e-07, "loss": 0.0016, "reward": 2.4999985694885254, "reward_std": 1.1445536074461415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 2837 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.352331606217617, "grad_norm": 0.657021068049701, "kl": 0.111083984375, "learning_rate": 2.6476683937823836e-07, "loss": 0.0, "reward": 2.4999895095825195, "reward_std": 7.188062909335713e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895095825195, "step": 2838 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.3549222797927465, "grad_norm": 3.4598235324862605, "kl": 0.22412109375, "learning_rate": 2.645077720207254e-07, "loss": 0.0003, "reward": 2.12495756149292, "reward_std": 0.23146428774930428, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6249577403068542, "step": 2839 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.357512953367876, "grad_norm": 2.0884232968303085, "kl": 0.0433349609375, "learning_rate": 2.642487046632124e-07, "loss": 0.0004, "reward": 1.9992027282714844, "reward_std": 5.104341406081403e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499202698469162, "step": 2840 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 7.360103626943006, "grad_norm": 1.2139484654753323, "kl": 0.070068359375, "learning_rate": 2.6398963730569946e-07, "loss": 0.0003, "reward": 2.4999876022338867, "reward_std": 1.6814673131193558e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999876618385315, "step": 2841 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.362694300518135, "grad_norm": 0.7275086763221452, "kl": 0.073486328125, "learning_rate": 2.6373056994818657e-07, "loss": -0.001, "reward": 2.4999908208847046, "reward_std": 6.331959070848825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990999698639, "step": 2842 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.365284974093265, "grad_norm": 0.32922830260106595, "kl": 0.17236328125, "learning_rate": 2.6347150259067357e-07, "loss": 0.0001, "reward": 2.4999938011169434, "reward_std": 3.1762632772824873e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 2843 }, { "clip_ratio": 0.0, "completion_length": 44.4375, "epoch": 7.367875647668393, "grad_norm": 37.24631072776548, "kl": 0.09619140625, "learning_rate": 2.632124352331606e-07, "loss": 0.0004, "reward": 2.37445068359375, "reward_std": 0.2324360539346344, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8744506239891052, "step": 2844 }, { "clip_ratio": 0.0, "completion_length": 44.625, "epoch": 7.370466321243523, "grad_norm": 0.6719832871630285, "kl": 0.10009765625, "learning_rate": 2.6295336787564767e-07, "loss": 0.0002, "reward": 2.499996781349182, "reward_std": 3.159388484164083e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2845 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 7.373056994818652, "grad_norm": 0.8183118987691836, "kl": 0.059326171875, "learning_rate": 2.6269430051813467e-07, "loss": 0.0008, "reward": 2.4999959468841553, "reward_std": 4.520807863173104e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2846 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.375647668393782, "grad_norm": 0.27816854822772047, "kl": 0.06304931640625, "learning_rate": 2.624352331606218e-07, "loss": 0.001, "reward": 2.4999953508377075, "reward_std": 3.0618558071182633e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 2847 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 7.3782383419689115, "grad_norm": 0.655354699229152, "kl": 0.127197265625, "learning_rate": 2.6217616580310883e-07, "loss": -0.0005, "reward": 2.499995708465576, "reward_std": 3.099728161259918e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2848 }, { "clip_ratio": 0.0, "completion_length": 37.6875, "epoch": 7.380829015544041, "grad_norm": 0.12294457342582503, "kl": 0.106201171875, "learning_rate": 2.6191709844559583e-07, "loss": 0.0009, "reward": 2.499997854232788, "reward_std": 1.711046650143544e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2849 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.383419689119171, "grad_norm": 0.03627506860509828, "kl": 0.034423828125, "learning_rate": 2.616580310880829e-07, "loss": -0.0012, "reward": 2.4999985694885254, "reward_std": 9.359862076507852e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 2850 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.3860103626943, "grad_norm": 0.45144080246465756, "kl": 0.0638427734375, "learning_rate": 2.6139896373056994e-07, "loss": -0.0002, "reward": 2.499995231628418, "reward_std": 3.0951301823733957e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 2851 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.38860103626943, "grad_norm": 1.7905715316442805, "kl": 0.053466796875, "learning_rate": 2.61139896373057e-07, "loss": 0.0003, "reward": 2.499985933303833, "reward_std": 1.1968665830863756e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999858140945435, "step": 2852 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 7.391191709844559, "grad_norm": 0.28021979739016944, "kl": 0.08056640625, "learning_rate": 2.6088082901554404e-07, "loss": 0.0008, "reward": 2.4999961853027344, "reward_std": 3.057433218600636e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2853 }, { "clip_ratio": 0.0, "completion_length": 38.0, "epoch": 7.393782383419689, "grad_norm": 0.2870526830282816, "kl": 0.07275390625, "learning_rate": 2.606217616580311e-07, "loss": -0.0004, "reward": 2.4999961853027344, "reward_std": 2.8561249223457708e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2854 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 7.396373056994818, "grad_norm": 217.38870131547864, "kl": 0.15966796875, "learning_rate": 2.603626943005181e-07, "loss": 0.0004, "reward": 2.2079442739486694, "reward_std": 0.3135430511972572, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7079442739486694, "step": 2855 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.398963730569948, "grad_norm": 3.0330710300550443, "kl": 0.30712890625, "learning_rate": 2.601036269430052e-07, "loss": 0.0011, "reward": 2.4999889135360718, "reward_std": 1.550665797367401e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999889135360718, "step": 2856 }, { "clip_ratio": 0.0, "completion_length": 41.25, "epoch": 7.401554404145077, "grad_norm": 0.10099451882459752, "kl": 0.096923828125, "learning_rate": 2.5984455958549225e-07, "loss": -0.0009, "reward": 2.4999964237213135, "reward_std": 2.758949932513133e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2857 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.404145077720207, "grad_norm": 0.11226810591653304, "kl": 0.17333984375, "learning_rate": 2.5958549222797925e-07, "loss": 0.0017, "reward": 2.499997615814209, "reward_std": 2.244200345558056e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2858 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.4067357512953365, "grad_norm": 0.16640398761736344, "kl": 0.0670166015625, "learning_rate": 2.593264248704663e-07, "loss": -0.0006, "reward": 2.4999969005584717, "reward_std": 3.5123616726195905e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 2859 }, { "clip_ratio": 0.0, "completion_length": 39.375, "epoch": 7.409326424870466, "grad_norm": 11.040685224332671, "kl": 0.115966796875, "learning_rate": 2.5906735751295336e-07, "loss": 0.0002, "reward": 1.998278796672821, "reward_std": 0.004428709168223577, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982788562774658, "step": 2860 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 7.4119170984455955, "grad_norm": 21.266592319603436, "kl": 0.0445556640625, "learning_rate": 2.588082901554404e-07, "loss": 0.0002, "reward": 1.8421260118484497, "reward_std": 0.2818194814026356, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3421261608600616, "step": 2861 }, { "clip_ratio": 0.0, "completion_length": 41.5625, "epoch": 7.414507772020725, "grad_norm": 0.21038798116775445, "kl": 0.05096435546875, "learning_rate": 2.5854922279792746e-07, "loss": 0.0012, "reward": 2.4999966621398926, "reward_std": 4.3028921368204465e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2862 }, { "clip_ratio": 0.0, "completion_length": 44.0, "epoch": 7.417098445595855, "grad_norm": 26.12373539379843, "kl": 0.18115234375, "learning_rate": 2.582901554404145e-07, "loss": -0.0003, "reward": 1.8681765794754028, "reward_std": 0.0008964653118255228, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3681766986846924, "step": 2863 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 7.419689119170984, "grad_norm": 68.52289157552886, "kl": 0.3115234375, "learning_rate": 2.580310880829015e-07, "loss": 0.0013, "reward": 1.8686418533325195, "reward_std": 0.1795196559105534, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3686416745185852, "step": 2864 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.422279792746114, "grad_norm": 3.1555810192652523, "kl": 0.095458984375, "learning_rate": 2.577720207253886e-07, "loss": 0.0002, "reward": 2.499984860420227, "reward_std": 1.8985489987244364e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984860420227, "step": 2865 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.424870466321243, "grad_norm": 0.8113976099245706, "kl": 0.118896484375, "learning_rate": 2.5751295336787567e-07, "loss": 0.0011, "reward": 2.499988079071045, "reward_std": 5.2757983439732925e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879598617554, "step": 2866 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.427461139896373, "grad_norm": 0.08767566489745604, "kl": 0.0357666015625, "learning_rate": 2.5725388601036267e-07, "loss": -0.0009, "reward": 2.499998092651367, "reward_std": 1.8523371068113192e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 2867 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 7.430051813471502, "grad_norm": 9.710033279893338, "kl": 0.21875, "learning_rate": 2.569948186528497e-07, "loss": 0.0009, "reward": 1.647491216659546, "reward_std": 0.0006052905555407051, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1474913209676743, "step": 2868 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.432642487046632, "grad_norm": 13.998694220421678, "kl": 0.05780029296875, "learning_rate": 2.567357512953368e-07, "loss": 0.0005, "reward": 2.437385320663452, "reward_std": 0.17704777762901358, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373853206634521, "step": 2869 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.435233160621761, "grad_norm": 0.16465528510708474, "kl": 0.088623046875, "learning_rate": 2.5647668393782383e-07, "loss": 0.0005, "reward": 2.4999924898147583, "reward_std": 3.612196110225341e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 2870 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 7.437823834196891, "grad_norm": 0.05392266160926763, "kl": 0.0621337890625, "learning_rate": 2.562176165803109e-07, "loss": 0.0013, "reward": 2.4999988079071045, "reward_std": 1.0926277695944009e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 2871 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 7.4404145077720205, "grad_norm": 0.10154280062898931, "kl": 0.0482177734375, "learning_rate": 2.5595854922279794e-07, "loss": 0.0009, "reward": 2.499998688697815, "reward_std": 1.1788387723754568e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 2872 }, { "clip_ratio": 0.0, "completion_length": 59.625, "epoch": 7.44300518134715, "grad_norm": 1.031094566569159, "kl": 0.14990234375, "learning_rate": 2.5569948186528494e-07, "loss": 0.0013, "reward": 2.4999881982803345, "reward_std": 1.3664176663041872e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 2873 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 7.44559585492228, "grad_norm": 8.133714357943765, "kl": 0.152099609375, "learning_rate": 2.55440414507772e-07, "loss": 0.0006, "reward": 1.9903766512870789, "reward_std": 0.00016628051548650546, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.490376740694046, "step": 2874 }, { "clip_ratio": 0.0, "completion_length": 37.25, "epoch": 7.448186528497409, "grad_norm": 4.419363154990442, "kl": 0.1279296875, "learning_rate": 2.551813471502591e-07, "loss": 0.0007, "reward": 2.0624454021453857, "reward_std": 0.17678579260621063, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624454617500305, "step": 2875 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 7.450777202072539, "grad_norm": 0.1810351162985629, "kl": 0.03326416015625, "learning_rate": 2.549222797927461e-07, "loss": 0.0016, "reward": 2.4999969005584717, "reward_std": 3.317067807984131e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2876 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.453367875647668, "grad_norm": 2.7768120519941037, "kl": 0.0377197265625, "learning_rate": 2.5466321243523315e-07, "loss": 0.0007, "reward": 2.4999961853027344, "reward_std": 4.280695918623678e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2877 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.455958549222798, "grad_norm": 21.278880850573763, "kl": 0.091796875, "learning_rate": 2.544041450777202e-07, "loss": 0.0011, "reward": 2.3747819662094116, "reward_std": 0.23182096764728044, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747819662094116, "step": 2878 }, { "clip_ratio": 0.0, "completion_length": 37.625, "epoch": 7.458549222797927, "grad_norm": 0.21779261524150592, "kl": 0.0621337890625, "learning_rate": 2.5414507772020725e-07, "loss": -0.0003, "reward": 2.499998688697815, "reward_std": 1.2202301036268182e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 2879 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.461139896373057, "grad_norm": 0.19591464061489242, "kl": 0.08837890625, "learning_rate": 2.538860103626943e-07, "loss": 0.0008, "reward": 2.4999966621398926, "reward_std": 2.9944078505650396e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2880 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.463730569948186, "grad_norm": 30.340650096870498, "kl": 0.067138671875, "learning_rate": 2.5362694300518136e-07, "loss": -0.0003, "reward": 1.9994415044784546, "reward_std": 0.0004057447584386864, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994415640830994, "step": 2881 }, { "clip_ratio": 0.0, "completion_length": 37.25, "epoch": 7.466321243523316, "grad_norm": 0.3930573627018515, "kl": 0.16064453125, "learning_rate": 2.5336787564766836e-07, "loss": -0.0011, "reward": 2.4999935626983643, "reward_std": 4.641638952307403e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 2882 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.4689119170984455, "grad_norm": 0.3311912323666093, "kl": 0.0606689453125, "learning_rate": 2.531088082901554e-07, "loss": 0.0014, "reward": 2.4999966621398926, "reward_std": 2.9953508828839404e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2883 }, { "clip_ratio": 0.0, "completion_length": 37.0625, "epoch": 7.471502590673575, "grad_norm": 0.24211565289386922, "kl": 0.091064453125, "learning_rate": 2.528497409326425e-07, "loss": 0.0006, "reward": 2.4999947547912598, "reward_std": 2.86549729366925e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 2884 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 7.474093264248705, "grad_norm": 0.995471269906064, "kl": 0.05987548828125, "learning_rate": 2.525906735751295e-07, "loss": -0.0003, "reward": 2.4999938011169434, "reward_std": 4.358707769824832e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 2885 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.476683937823834, "grad_norm": 9.599562549456211, "kl": 0.047454833984375, "learning_rate": 2.5233160621761657e-07, "loss": 0.001, "reward": 1.9990994334220886, "reward_std": 0.0002510231058181489, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990993738174438, "step": 2886 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.479274611398964, "grad_norm": 1.7798898335912086, "kl": 0.10809326171875, "learning_rate": 2.520725388601036e-07, "loss": 0.0006, "reward": 1.9197728037834167, "reward_std": 0.00010342782672978501, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4197728335857391, "step": 2887 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.481865284974093, "grad_norm": 3.546187209073305, "kl": 0.0849609375, "learning_rate": 2.518134715025906e-07, "loss": 0.0012, "reward": 2.4999890327453613, "reward_std": 1.1297007119992486e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989092350006, "step": 2888 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.484455958549223, "grad_norm": 0.35811821135344973, "kl": 0.07672119140625, "learning_rate": 2.515544041450777e-07, "loss": -0.0007, "reward": 2.4999983310699463, "reward_std": 1.3261494018479425e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 2889 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 7.487046632124352, "grad_norm": 17.627446940476652, "kl": 0.08642578125, "learning_rate": 2.512953367875648e-07, "loss": 0.001, "reward": 1.9957618713378906, "reward_std": 0.0006085288099484387, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4957618117332458, "step": 2890 }, { "clip_ratio": 0.0, "completion_length": 51.1875, "epoch": 7.489637305699482, "grad_norm": 1.8591301849268267, "kl": 0.0753173828125, "learning_rate": 2.510362694300518e-07, "loss": 0.0004, "reward": 2.4999879598617554, "reward_std": 1.4277914715421502e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999880194664001, "step": 2891 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.492227979274611, "grad_norm": 0.09610447968584064, "kl": 0.017486572265625, "learning_rate": 2.5077720207253883e-07, "loss": 0.0011, "reward": 2.499996781349182, "reward_std": 2.0122166120017937e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2892 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.494818652849741, "grad_norm": 0.12789858851972835, "kl": 0.09210205078125, "learning_rate": 2.5051813471502594e-07, "loss": 0.0002, "reward": 2.499996781349182, "reward_std": 2.1849868403478467e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 2893 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.4974093264248705, "grad_norm": 38.86827167754077, "kl": 0.2685546875, "learning_rate": 2.5025906735751294e-07, "loss": 0.0012, "reward": 2.4372549057006836, "reward_std": 0.1774314764306837, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372549057006836, "step": 2894 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 7.5, "grad_norm": 2.176330224244786, "kl": 0.091552734375, "learning_rate": 2.5e-07, "loss": 0.0001, "reward": 1.9987345933914185, "reward_std": 5.893352886232606e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987345933914185, "step": 2895 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 7.5025906735751295, "grad_norm": 20.27320698074359, "kl": 0.23583984375, "learning_rate": 2.4974093264248704e-07, "loss": 0.0011, "reward": 1.98313307762146, "reward_std": 0.02756749426589522, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4831331372261047, "step": 2896 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.505181347150259, "grad_norm": 75.96024746110373, "kl": 0.03924560546875, "learning_rate": 2.494818652849741e-07, "loss": -0.0003, "reward": 2.124409794807434, "reward_std": 0.2318184550318847, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6244099140167236, "step": 2897 }, { "clip_ratio": 0.0, "completion_length": 57.25, "epoch": 7.507772020725389, "grad_norm": 0.10187546978186086, "kl": 0.112548828125, "learning_rate": 2.4922279792746115e-07, "loss": 0.0003, "reward": 2.499996781349182, "reward_std": 2.035566865288274e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2898 }, { "clip_ratio": 0.0, "completion_length": 38.8125, "epoch": 7.510362694300518, "grad_norm": 14.54003433987651, "kl": 0.111083984375, "learning_rate": 2.489637305699482e-07, "loss": 0.0011, "reward": 1.9998098611831665, "reward_std": 2.7257150122750318e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998098611831665, "step": 2899 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.512953367875648, "grad_norm": 0.08426660182378745, "kl": 0.0743408203125, "learning_rate": 2.487046632124352e-07, "loss": -0.0003, "reward": 2.499995708465576, "reward_std": 1.9534591046976857e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 2900 }, { "clip_ratio": 0.0, "completion_length": 51.3125, "epoch": 7.515544041450777, "grad_norm": 5.757352921130543, "kl": 0.1199951171875, "learning_rate": 2.484455958549223e-07, "loss": 0.0009, "reward": 1.9984426498413086, "reward_std": 7.466021497748443e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984425902366638, "step": 2901 }, { "clip_ratio": 0.0, "completion_length": 49.5, "epoch": 7.518134715025907, "grad_norm": 3.6028100560578045, "kl": 0.146728515625, "learning_rate": 2.481865284974093e-07, "loss": -0.0001, "reward": 2.4365127086639404, "reward_std": 0.17680768987384, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9365127086639404, "step": 2902 }, { "clip_ratio": 0.0, "completion_length": 34.1875, "epoch": 7.520725388601036, "grad_norm": 5.764095948400247, "kl": 0.11669921875, "learning_rate": 2.4792746113989636e-07, "loss": 0.0016, "reward": 2.499219298362732, "reward_std": 6.0598649724852294e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9992192387580872, "step": 2903 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.523316062176166, "grad_norm": 63.96575160607159, "kl": 0.106689453125, "learning_rate": 2.476683937823834e-07, "loss": 0.0009, "reward": 1.9438812732696533, "reward_std": 0.0008200876773116761, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4438812732696533, "step": 2904 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 7.525906735751295, "grad_norm": 0.13264398214774878, "kl": 0.0657958984375, "learning_rate": 2.4740932642487046e-07, "loss": 0.0006, "reward": 2.499997854232788, "reward_std": 1.7752205963006418e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2905 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.528497409326425, "grad_norm": 0.24307264681410803, "kl": 0.02520751953125, "learning_rate": 2.471502590673575e-07, "loss": -0.0005, "reward": 2.4999964237213135, "reward_std": 2.921048462667386e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2906 }, { "clip_ratio": 0.0, "completion_length": 39.875, "epoch": 7.5310880829015545, "grad_norm": 10.203197042976507, "kl": 0.107666015625, "learning_rate": 2.4689119170984457e-07, "loss": 0.0014, "reward": 1.8725891709327698, "reward_std": 0.0031504064833143275, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3725890219211578, "step": 2907 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.533678756476684, "grad_norm": 0.05959218645697164, "kl": 0.11572265625, "learning_rate": 2.466321243523316e-07, "loss": 0.0013, "reward": 2.499995708465576, "reward_std": 1.5602266216774296e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2908 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.536269430051814, "grad_norm": 4.273703319346291, "kl": 0.109130859375, "learning_rate": 2.463730569948186e-07, "loss": 0.0002, "reward": 1.9573701620101929, "reward_std": 0.000573316504869581, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4573699831962585, "step": 2909 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.538860103626943, "grad_norm": 0.62097393057156, "kl": 0.05908203125, "learning_rate": 2.4611398963730567e-07, "loss": 0.0008, "reward": 2.499994993209839, "reward_std": 5.075153467259952e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2910 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.541450777202073, "grad_norm": 1.508867784485775, "kl": 0.18017578125, "learning_rate": 2.458549222797927e-07, "loss": 0.0015, "reward": 2.499988555908203, "reward_std": 9.98476843960816e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999885559082031, "step": 2911 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.544041450777202, "grad_norm": 4.077878992744716, "kl": 0.148193359375, "learning_rate": 2.455958549222798e-07, "loss": 0.0011, "reward": 1.997275471687317, "reward_std": 5.50059173747286e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4972753822803497, "step": 2912 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.546632124352332, "grad_norm": 0.07317176209732278, "kl": 0.139892578125, "learning_rate": 2.4533678756476683e-07, "loss": 0.0001, "reward": 2.499997854232788, "reward_std": 2.5594776502657623e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2913 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.549222797927461, "grad_norm": 0.20197623319609426, "kl": 0.0596923828125, "learning_rate": 2.450777202072539e-07, "loss": -0.0008, "reward": 2.4999964237213135, "reward_std": 2.4064102035481483e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2914 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.551813471502591, "grad_norm": 0.05484888640795127, "kl": 0.06884765625, "learning_rate": 2.4481865284974094e-07, "loss": -0.0009, "reward": 2.499997615814209, "reward_std": 1.2601046819327166e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2915 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 7.55440414507772, "grad_norm": 6.24347464869651, "kl": 0.13623046875, "learning_rate": 2.44559585492228e-07, "loss": 0.0007, "reward": 1.9948316812515259, "reward_std": 0.00013380632390180836, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.494831770658493, "step": 2916 }, { "clip_ratio": 0.0, "completion_length": 62.5, "epoch": 7.55699481865285, "grad_norm": 7.412361237136367, "kl": 0.105712890625, "learning_rate": 2.4430051813471504e-07, "loss": -0.0001, "reward": 1.9998613595962524, "reward_std": 4.521860228123842e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998614490032196, "step": 2917 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.5595854922279795, "grad_norm": 0.05731684323616922, "kl": 0.117919921875, "learning_rate": 2.4404145077720204e-07, "loss": -0.0001, "reward": 2.4999988079071045, "reward_std": 1.715909462518539e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 2918 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.562176165803109, "grad_norm": 0.310881284101341, "kl": 0.05126953125, "learning_rate": 2.437823834196891e-07, "loss": 0.0001, "reward": 2.499997854232788, "reward_std": 2.7595490337262163e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2919 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.564766839378239, "grad_norm": 0.7084003157512521, "kl": 0.1009521484375, "learning_rate": 2.4352331606217615e-07, "loss": -0.0005, "reward": 2.4999921321868896, "reward_std": 4.748978881252697e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 2920 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.567357512953368, "grad_norm": 8.448975142854147, "kl": 0.097076416015625, "learning_rate": 2.432642487046632e-07, "loss": 0.0002, "reward": 1.9792988896369934, "reward_std": 0.0004574072270884244, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4792989492416382, "step": 2921 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.569948186528498, "grad_norm": 16.483086308526538, "kl": 0.1494140625, "learning_rate": 2.4300518134715025e-07, "loss": 0.0003, "reward": 1.9984809160232544, "reward_std": 0.000363802665788171, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498481035232544, "step": 2922 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.572538860103627, "grad_norm": 2.47508523225066, "kl": 0.0966796875, "learning_rate": 2.427461139896373e-07, "loss": 0.0006, "reward": 1.9987071752548218, "reward_std": 7.415192203552579e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987072348594666, "step": 2923 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 7.575129533678757, "grad_norm": 0.2814131275167612, "kl": 0.1622314453125, "learning_rate": 2.4248704663212436e-07, "loss": 0.0005, "reward": 2.4999945163726807, "reward_std": 3.6854370364380884e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2924 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.577720207253886, "grad_norm": 0.723221588350446, "kl": 0.2288818359375, "learning_rate": 2.422279792746114e-07, "loss": 0.0016, "reward": 2.499991536140442, "reward_std": 7.655579338461393e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 2925 }, { "clip_ratio": 0.0, "completion_length": 53.625, "epoch": 7.580310880829016, "grad_norm": 0.2038541436438678, "kl": 0.0938720703125, "learning_rate": 2.419689119170984e-07, "loss": 0.001, "reward": 2.4999988079071045, "reward_std": 9.039480772798925e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 2926 }, { "clip_ratio": 0.0, "completion_length": 37.6875, "epoch": 7.582901554404145, "grad_norm": 0.05632317236259152, "kl": 0.0703125, "learning_rate": 2.4170984455958546e-07, "loss": -0.0, "reward": 2.4999988079071045, "reward_std": 9.606218327462557e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 2927 }, { "clip_ratio": 0.0, "completion_length": 38.5, "epoch": 7.585492227979275, "grad_norm": 0.3233031651586751, "kl": 0.0712890625, "learning_rate": 2.414507772020725e-07, "loss": -0.001, "reward": 2.4999970197677612, "reward_std": 1.9208071080356603e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2928 }, { "clip_ratio": 0.0, "completion_length": 41.6875, "epoch": 7.5880829015544045, "grad_norm": 0.18377571253010155, "kl": 0.131591796875, "learning_rate": 2.4119170984455957e-07, "loss": 0.0002, "reward": 2.499995708465576, "reward_std": 3.3499027267680503e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 2929 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.590673575129534, "grad_norm": 0.23327025119254738, "kl": 0.0543212890625, "learning_rate": 2.409326424870466e-07, "loss": -0.0004, "reward": 2.499995231628418, "reward_std": 1.8809258790497552e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 2930 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.5932642487046635, "grad_norm": 0.8551416610084788, "kl": 0.1493988037109375, "learning_rate": 2.4067357512953367e-07, "loss": 0.0016, "reward": 2.499994993209839, "reward_std": 1.957174504241266e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 2931 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.595854922279793, "grad_norm": 0.2721438377240209, "kl": 0.021728515625, "learning_rate": 2.404145077720207e-07, "loss": 0.0001, "reward": 2.499994993209839, "reward_std": 3.92514351688078e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2932 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.598445595854923, "grad_norm": 0.23494322074090024, "kl": 0.2669677734375, "learning_rate": 2.401554404145077e-07, "loss": 0.0006, "reward": 2.4999961853027344, "reward_std": 6.877528676341171e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2933 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.601036269430052, "grad_norm": 0.36109817140471184, "kl": 0.0811767578125, "learning_rate": 2.3989637305699483e-07, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 2.3986141854948073e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 2934 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.603626943005182, "grad_norm": 15.156670239319249, "kl": 0.154296875, "learning_rate": 2.3963730569948183e-07, "loss": 0.0004, "reward": 1.9999365210533142, "reward_std": 2.780110867206531e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999365210533142, "step": 2935 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.606217616580311, "grad_norm": 0.1267193709900675, "kl": 0.0595703125, "learning_rate": 2.393782383419689e-07, "loss": 0.0001, "reward": 2.499998450279236, "reward_std": 1.393046602515824e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 2936 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.608808290155441, "grad_norm": 15.511280506324443, "kl": 0.18212890625, "learning_rate": 2.3911917098445594e-07, "loss": 0.0011, "reward": 1.9559872150421143, "reward_std": 0.00026224213070236146, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4559870958328247, "step": 2937 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.61139896373057, "grad_norm": 0.37426144497792, "kl": 0.0751953125, "learning_rate": 2.38860103626943e-07, "loss": -0.001, "reward": 2.4999903440475464, "reward_std": 4.634290405647334e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905228614807, "step": 2938 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.6139896373057, "grad_norm": 0.2281228421159099, "kl": 0.0938720703125, "learning_rate": 2.3860103626943004e-07, "loss": 0.0008, "reward": 2.4999959468841553, "reward_std": 2.4849618966982234e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2939 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.616580310880829, "grad_norm": 18.963372524842676, "kl": 0.095703125, "learning_rate": 2.3834196891191707e-07, "loss": -0.0002, "reward": 1.9943671822547913, "reward_std": 0.0002248529199277982, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.494367241859436, "step": 2940 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.619170984455959, "grad_norm": 0.2151756746177725, "kl": 0.056121826171875, "learning_rate": 2.3808290155440415e-07, "loss": 0.0002, "reward": 2.499996781349182, "reward_std": 1.993753585338709e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 2941 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.6217616580310885, "grad_norm": 0.09343218362117965, "kl": 0.04669189453125, "learning_rate": 2.3782383419689117e-07, "loss": -0.0016, "reward": 2.4999964237213135, "reward_std": 1.8959590875056165e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2942 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 7.624352331606218, "grad_norm": 0.20900352356491086, "kl": 0.06829833984375, "learning_rate": 2.3756476683937823e-07, "loss": -0.0, "reward": 2.4999958276748657, "reward_std": 2.2507326207232836e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2943 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.626943005181348, "grad_norm": 0.07172188712283907, "kl": 0.16162109375, "learning_rate": 2.3730569948186528e-07, "loss": 0.0004, "reward": 2.499997615814209, "reward_std": 1.4926845608442818e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2944 }, { "clip_ratio": 0.0, "completion_length": 36.75, "epoch": 7.629533678756477, "grad_norm": 0.09013347104142419, "kl": 0.1025390625, "learning_rate": 2.3704663212435233e-07, "loss": -0.0003, "reward": 2.4999982118606567, "reward_std": 1.399378902533499e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2945 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.632124352331607, "grad_norm": 0.2624817251248264, "kl": 0.07275390625, "learning_rate": 2.3678756476683936e-07, "loss": 0.0, "reward": 2.499995708465576, "reward_std": 2.676186682037951e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 2946 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.634715025906736, "grad_norm": 21.68251085613039, "kl": 0.14111328125, "learning_rate": 2.3652849740932644e-07, "loss": 0.0003, "reward": 1.4946138858795166, "reward_std": 0.0003615196037571877, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9946139752864838, "step": 2947 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.637305699481866, "grad_norm": 0.11362354058349974, "kl": 0.142333984375, "learning_rate": 2.3626943005181346e-07, "loss": 0.0001, "reward": 2.4999977350234985, "reward_std": 2.0696052160928957e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2948 }, { "clip_ratio": 0.0, "completion_length": 56.25, "epoch": 7.639896373056995, "grad_norm": 0.5081918990999894, "kl": 0.1644287109375, "learning_rate": 2.360103626943005e-07, "loss": 0.0007, "reward": 2.499997854232788, "reward_std": 2.155113293156319e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2949 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.642487046632124, "grad_norm": 26.434326333996307, "kl": 0.207763671875, "learning_rate": 2.3575129533678757e-07, "loss": 0.0004, "reward": 2.4374284744262695, "reward_std": 0.17694669044453804, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374284744262695, "step": 2950 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.6450777202072535, "grad_norm": 0.6008088957294256, "kl": 0.1146240234375, "learning_rate": 2.354922279792746e-07, "loss": 0.0012, "reward": 2.499993920326233, "reward_std": 3.606780069276283e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 2951 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.647668393782383, "grad_norm": 0.1051846941450007, "kl": 0.0528564453125, "learning_rate": 2.3523316062176165e-07, "loss": -0.0001, "reward": 2.4999979734420776, "reward_std": 1.942912490449089e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2952 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.650259067357513, "grad_norm": 1.0003136360293363, "kl": 0.1531982421875, "learning_rate": 2.349740932642487e-07, "loss": -0.0004, "reward": 2.4999920129776, "reward_std": 4.389560785966751e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 2953 }, { "clip_ratio": 0.0, "completion_length": 40.75, "epoch": 7.652849740932642, "grad_norm": 1.1177926038920152, "kl": 0.082275390625, "learning_rate": 2.3471502590673575e-07, "loss": 0.0004, "reward": 2.4999881982803345, "reward_std": 6.322336730590905e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 2954 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.655440414507772, "grad_norm": 0.0855577607850077, "kl": 0.03887939453125, "learning_rate": 2.3445595854922278e-07, "loss": -0.0007, "reward": 2.499997854232788, "reward_std": 9.333904529285064e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2955 }, { "clip_ratio": 0.0, "completion_length": 34.125, "epoch": 7.658031088082901, "grad_norm": 1.366519005717628, "kl": 0.1217041015625, "learning_rate": 2.3419689119170983e-07, "loss": -0.0006, "reward": 2.499981999397278, "reward_std": 1.0933534667856293e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982237815857, "step": 2956 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.660621761658031, "grad_norm": 0.4426961287845873, "kl": 0.12158203125, "learning_rate": 2.3393782383419688e-07, "loss": -0.0005, "reward": 2.4999959468841553, "reward_std": 6.3268050780607155e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2957 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.66321243523316, "grad_norm": 0.20242673814524642, "kl": 0.0428466796875, "learning_rate": 2.336787564766839e-07, "loss": -0.0013, "reward": 2.4999985694885254, "reward_std": 1.1756038418297976e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 2958 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 7.66580310880829, "grad_norm": 0.745336950931801, "kl": 0.0472412109375, "learning_rate": 2.33419689119171e-07, "loss": -0.0006, "reward": 2.499993085861206, "reward_std": 5.1215583880548365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 2959 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 7.668393782383419, "grad_norm": 0.10487343480430736, "kl": 0.0797119140625, "learning_rate": 2.3316062176165802e-07, "loss": -0.0008, "reward": 2.499997615814209, "reward_std": 1.7968870338336274e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2960 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.670984455958549, "grad_norm": 0.3082873756276534, "kl": 0.05078125, "learning_rate": 2.3290155440414507e-07, "loss": 0.0008, "reward": 2.4999964237213135, "reward_std": 5.062747277406743e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 2961 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 7.6735751295336785, "grad_norm": 0.7115699257966813, "kl": 0.0946044921875, "learning_rate": 2.3264248704663212e-07, "loss": 0.0003, "reward": 2.499991774559021, "reward_std": 4.858100737692439e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 2962 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.676165803108808, "grad_norm": 2.3988742006310577, "kl": 0.097412109375, "learning_rate": 2.3238341968911915e-07, "loss": 0.0009, "reward": 1.9991474747657776, "reward_std": 4.1811253595369635e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991474449634552, "step": 2963 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.678756476683938, "grad_norm": 9.972383523493665, "kl": 0.12158203125, "learning_rate": 2.321243523316062e-07, "loss": 0.0005, "reward": 1.749154418706894, "reward_std": 0.0010599661536616622, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.249154418706894, "step": 2964 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.681347150259067, "grad_norm": 0.21541104397373564, "kl": 0.087158203125, "learning_rate": 2.3186528497409325e-07, "loss": 0.0005, "reward": 2.4999977350234985, "reward_std": 2.268307923714019e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2965 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.683937823834197, "grad_norm": 6.585270471161858, "kl": 0.08331298828125, "learning_rate": 2.316062176165803e-07, "loss": 0.0007, "reward": 1.9986374378204346, "reward_std": 0.00010126053098247212, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986374378204346, "step": 2966 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.686528497409326, "grad_norm": 67.72901340280286, "kl": 0.07763671875, "learning_rate": 2.3134715025906733e-07, "loss": 0.0007, "reward": 2.4993667602539062, "reward_std": 0.0003897433296060626, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9993669986724854, "step": 2967 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.689119170984456, "grad_norm": 0.6859893852119695, "kl": 0.082275390625, "learning_rate": 2.310880829015544e-07, "loss": 0.0011, "reward": 2.4999659061431885, "reward_std": 9.498003691987833e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999659657478333, "step": 2968 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 7.691709844559585, "grad_norm": 1.1384016074539336, "kl": 0.1207275390625, "learning_rate": 2.3082901554404144e-07, "loss": 0.0002, "reward": 1.9998491406440735, "reward_std": 1.4505180160995224e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998493194580078, "step": 2969 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.694300518134715, "grad_norm": 0.09950597713307814, "kl": 0.0665283203125, "learning_rate": 2.3056994818652846e-07, "loss": 0.0006, "reward": 2.4999983310699463, "reward_std": 1.8713426470640115e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 2970 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.696891191709844, "grad_norm": 0.22124403328707112, "kl": 0.02740478515625, "learning_rate": 2.3031088082901554e-07, "loss": -0.0, "reward": 2.499997854232788, "reward_std": 1.7533334357722197e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2971 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.699481865284974, "grad_norm": 0.14643097844246533, "kl": 0.05419921875, "learning_rate": 2.3005181347150257e-07, "loss": 0.0005, "reward": 2.499997615814209, "reward_std": 2.1564043208854855e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2972 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.7020725388601035, "grad_norm": 0.16074284350459964, "kl": 0.0556640625, "learning_rate": 2.2979274611398962e-07, "loss": -0.0004, "reward": 2.49999737739563, "reward_std": 1.909292564050702e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2973 }, { "clip_ratio": 0.0, "completion_length": 50.4375, "epoch": 7.704663212435233, "grad_norm": 0.3597394914780877, "kl": 0.101806640625, "learning_rate": 2.2953367875647667e-07, "loss": 0.0007, "reward": 2.4999879598617554, "reward_std": 5.832936949445866e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999880194664001, "step": 2974 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.7072538860103625, "grad_norm": 0.17703672797385214, "kl": 0.203125, "learning_rate": 2.2927461139896373e-07, "loss": 0.0021, "reward": 2.499998092651367, "reward_std": 1.956017769089158e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2975 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.709844559585492, "grad_norm": 7.92368945668728, "kl": 0.09326171875, "learning_rate": 2.2901554404145075e-07, "loss": 0.0005, "reward": 2.499956250190735, "reward_std": 1.6112611831431423e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99995619058609, "step": 2976 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 7.712435233160622, "grad_norm": 0.10236311235231303, "kl": 0.14013671875, "learning_rate": 2.2875647668393783e-07, "loss": 0.0006, "reward": 2.4999982118606567, "reward_std": 1.6020362636481877e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 2977 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 7.715025906735751, "grad_norm": 24.94233716930462, "kl": 0.109130859375, "learning_rate": 2.2849740932642486e-07, "loss": 0.0009, "reward": 2.437318205833435, "reward_std": 0.17722765591750544, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373182654380798, "step": 2978 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.717616580310881, "grad_norm": 0.1166052795153088, "kl": 0.036895751953125, "learning_rate": 2.2823834196891188e-07, "loss": 0.0007, "reward": 2.4999988079071045, "reward_std": 1.0677501904865494e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 2979 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.72020725388601, "grad_norm": 0.10273634066576631, "kl": 0.094970703125, "learning_rate": 2.2797927461139896e-07, "loss": 0.001, "reward": 2.4999990463256836, "reward_std": 8.790932213287306e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 2980 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.72279792746114, "grad_norm": 0.541149799571857, "kl": 0.1041259765625, "learning_rate": 2.27720207253886e-07, "loss": 0.0011, "reward": 2.4999892711639404, "reward_std": 4.956806833433802e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999891519546509, "step": 2981 }, { "clip_ratio": 0.0, "completion_length": 37.75, "epoch": 7.725388601036269, "grad_norm": 1.078910526850145, "kl": 0.103759765625, "learning_rate": 2.2746113989637304e-07, "loss": -0.0, "reward": 2.499979615211487, "reward_std": 1.4569647419193643e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999796748161316, "step": 2982 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.727979274611399, "grad_norm": 0.1053013311782325, "kl": 0.02398681640625, "learning_rate": 2.272020725388601e-07, "loss": -0.0003, "reward": 2.4999979734420776, "reward_std": 1.7312717375261855e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2983 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 7.730569948186528, "grad_norm": 0.22012352056918047, "kl": 0.0712890625, "learning_rate": 2.2694300518134715e-07, "loss": 0.001, "reward": 2.499997615814209, "reward_std": 2.2719966068507347e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2984 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.733160621761658, "grad_norm": 6.840393493905241, "kl": 0.135986328125, "learning_rate": 2.2668393782383417e-07, "loss": 0.001, "reward": 1.8857450485229492, "reward_std": 0.0005485858931137955, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3857450187206268, "step": 2985 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.7357512953367875, "grad_norm": 1.195792753387545, "kl": 0.088623046875, "learning_rate": 2.2642487046632123e-07, "loss": -0.0002, "reward": 1.999900460243225, "reward_std": 1.10291978216992e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999005496501923, "step": 2986 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.738341968911917, "grad_norm": 0.5503608916768779, "kl": 0.23486328125, "learning_rate": 2.2616580310880828e-07, "loss": -0.0002, "reward": 2.499993085861206, "reward_std": 7.3362193688808475e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 2987 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.740932642487047, "grad_norm": 0.21467000819454535, "kl": 0.0697021484375, "learning_rate": 2.2590673575129533e-07, "loss": -0.0013, "reward": 2.499997138977051, "reward_std": 1.8295448853677954e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2988 }, { "clip_ratio": 0.0, "completion_length": 48.75, "epoch": 7.743523316062176, "grad_norm": 0.11209818085710388, "kl": 0.081787109375, "learning_rate": 2.2564766839378238e-07, "loss": 0.0005, "reward": 2.4999979734420776, "reward_std": 1.6308087538163818e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2989 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.746113989637306, "grad_norm": 5.576181027361054, "kl": 0.074951171875, "learning_rate": 2.253886010362694e-07, "loss": 0.0002, "reward": 1.9967710971832275, "reward_std": 0.00011377934310985438, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4967711567878723, "step": 2990 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.748704663212435, "grad_norm": 0.3863917480883561, "kl": 0.06591796875, "learning_rate": 2.251295336787565e-07, "loss": 0.0003, "reward": 2.499995470046997, "reward_std": 2.436622821733181e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 2991 }, { "clip_ratio": 0.0, "completion_length": 50.8125, "epoch": 7.751295336787565, "grad_norm": 0.30324574535945753, "kl": 0.1666259765625, "learning_rate": 2.2487046632124352e-07, "loss": 0.0025, "reward": 2.4999964237213135, "reward_std": 4.123945075207303e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2992 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 7.753886010362694, "grad_norm": 6.507414054359937, "kl": 0.181396484375, "learning_rate": 2.2461139896373054e-07, "loss": -0.0, "reward": 2.3124806880950928, "reward_std": 0.2587983173305872, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124808073043823, "step": 2993 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.756476683937824, "grad_norm": 0.7277007667711758, "kl": 0.10791015625, "learning_rate": 2.2435233160621762e-07, "loss": -0.0006, "reward": 2.4999972581863403, "reward_std": 2.1272334436162055e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2994 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 7.759067357512953, "grad_norm": 4.394600432599988, "kl": 0.114990234375, "learning_rate": 2.2409326424870465e-07, "loss": 0.0013, "reward": 1.9768844842910767, "reward_std": 0.00026186189910504254, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4768843352794647, "step": 2995 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.761658031088083, "grad_norm": 0.16844499449103809, "kl": 0.10137939453125, "learning_rate": 2.238341968911917e-07, "loss": 0.0005, "reward": 2.499995231628418, "reward_std": 2.7149463903697324e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 2996 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.7642487046632125, "grad_norm": 0.38754342560264987, "kl": 0.04638671875, "learning_rate": 2.2357512953367875e-07, "loss": -0.0002, "reward": 2.499997854232788, "reward_std": 2.3842546283958654e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2997 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.766839378238342, "grad_norm": 0.041752207302425244, "kl": 0.083740234375, "learning_rate": 2.233160621761658e-07, "loss": 0.0012, "reward": 2.499998688697815, "reward_std": 9.92981682657046e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 2998 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.769430051813472, "grad_norm": 6.996812347178321, "kl": 0.128662109375, "learning_rate": 2.2305699481865283e-07, "loss": 0.0006, "reward": 1.2891470193862915, "reward_std": 0.0005185952140891459, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.789146900177002, "step": 2999 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.772020725388601, "grad_norm": 0.3354768383178086, "kl": 0.1138916015625, "learning_rate": 2.2279792746113988e-07, "loss": 0.0009, "reward": 2.499992847442627, "reward_std": 3.500345201246091e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 3000 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.774611398963731, "grad_norm": 44.60991259135863, "kl": 0.24365234375, "learning_rate": 2.2253886010362694e-07, "loss": 0.0008, "reward": 1.9976395964622498, "reward_std": 0.0026098146537378852, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4976397156715393, "step": 3001 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.77720207253886, "grad_norm": 0.6179131623695943, "kl": 0.083740234375, "learning_rate": 2.2227979274611396e-07, "loss": -0.0003, "reward": 2.499996066093445, "reward_std": 4.647369678423274e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3002 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.77979274611399, "grad_norm": 11.476161014523134, "kl": 0.177734375, "learning_rate": 2.2202072538860104e-07, "loss": 0.001, "reward": 1.7996803522109985, "reward_std": 0.000799538857194193, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2996802628040314, "step": 3003 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.782383419689119, "grad_norm": 0.0859598458168097, "kl": 0.119384765625, "learning_rate": 2.2176165803108807e-07, "loss": 0.0011, "reward": 2.499998092651367, "reward_std": 1.1724314958883042e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3004 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.784974093264249, "grad_norm": 0.6372858301731534, "kl": 0.03839111328125, "learning_rate": 2.2150259067357512e-07, "loss": -0.0005, "reward": 2.4999921321868896, "reward_std": 6.650734576396644e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 3005 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.787564766839378, "grad_norm": 1.1916148983792458, "kl": 0.091796875, "learning_rate": 2.2124352331606217e-07, "loss": 0.0009, "reward": 2.4999659061431885, "reward_std": 1.2026725528357929e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999656677246094, "step": 3006 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 7.790155440414508, "grad_norm": 17.489054007325144, "kl": 0.079345703125, "learning_rate": 2.2098445595854923e-07, "loss": -0.0006, "reward": 1.983235239982605, "reward_std": 0.041816088081873204, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4832352995872498, "step": 3007 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.7927461139896375, "grad_norm": 0.9710180200488386, "kl": 0.137939453125, "learning_rate": 2.2072538860103625e-07, "loss": -0.0002, "reward": 2.499995470046997, "reward_std": 3.855029262922471e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 3008 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.795336787564767, "grad_norm": 0.1904780758251927, "kl": 0.09814453125, "learning_rate": 2.204663212435233e-07, "loss": 0.0005, "reward": 2.4999966621398926, "reward_std": 1.9031149918191659e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3009 }, { "clip_ratio": 0.0, "completion_length": 49.375, "epoch": 7.7979274611398965, "grad_norm": 0.3809945716885692, "kl": 0.177734375, "learning_rate": 2.2020725388601036e-07, "loss": -0.0, "reward": 2.4999953508377075, "reward_std": 3.3065974207602267e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 3010 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.800518134715026, "grad_norm": 4.13028624798618, "kl": 0.11865234375, "learning_rate": 2.1994818652849738e-07, "loss": -0.0002, "reward": 1.9996672868728638, "reward_std": 4.15631293435581e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996671676635742, "step": 3011 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.803108808290156, "grad_norm": 8.92503129835271, "kl": 0.048583984375, "learning_rate": 2.1968911917098446e-07, "loss": -0.0009, "reward": 1.9992570877075195, "reward_std": 4.9678758500704134e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992572367191315, "step": 3012 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.805699481865285, "grad_norm": 0.3884721662967295, "kl": 0.15771484375, "learning_rate": 2.194300518134715e-07, "loss": -0.0001, "reward": 2.499993085861206, "reward_std": 6.256911319724168e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 3013 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.808290155440415, "grad_norm": 0.20188397965633975, "kl": 0.14453125, "learning_rate": 2.1917098445595854e-07, "loss": 0.0013, "reward": 2.4999970197677612, "reward_std": 2.6286099910066696e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 3014 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.810880829015544, "grad_norm": 57.212860195592924, "kl": 0.078369140625, "learning_rate": 2.189119170984456e-07, "loss": -0.0002, "reward": 2.499917507171631, "reward_std": 0.0001447477802685171, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999176859855652, "step": 3015 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.813471502590674, "grad_norm": 0.11117378665291582, "kl": 0.08251953125, "learning_rate": 2.1865284974093262e-07, "loss": 0.0005, "reward": 2.499997854232788, "reward_std": 1.1092838860804477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3016 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.816062176165803, "grad_norm": 0.24395739259771884, "kl": 0.06195068359375, "learning_rate": 2.1839378238341967e-07, "loss": 0.0001, "reward": 2.4999951124191284, "reward_std": 3.1535430480289506e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 3017 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.818652849740933, "grad_norm": 0.24968575492166806, "kl": 0.13720703125, "learning_rate": 2.1813471502590673e-07, "loss": -0.0, "reward": 2.499994993209839, "reward_std": 3.369668547748006e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 3018 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.821243523316062, "grad_norm": 0.10856029532561345, "kl": 0.065673828125, "learning_rate": 2.1787564766839378e-07, "loss": -0.0014, "reward": 2.4999983310699463, "reward_std": 9.617032787900825e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3019 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.823834196891192, "grad_norm": 0.3990836424354962, "kl": 0.04833984375, "learning_rate": 2.176165803108808e-07, "loss": -0.0011, "reward": 2.499995470046997, "reward_std": 3.2752739400621067e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3020 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.8264248704663215, "grad_norm": 0.328589032840485, "kl": 0.10498046875, "learning_rate": 2.1735751295336789e-07, "loss": 0.0001, "reward": 2.499996066093445, "reward_std": 3.1339155270870833e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3021 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.829015544041451, "grad_norm": 2.610763759191371, "kl": 0.1357421875, "learning_rate": 2.170984455958549e-07, "loss": -0.0004, "reward": 1.9348300695419312, "reward_std": 0.00030914814942661906, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4348300993442535, "step": 3022 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.831606217616581, "grad_norm": 0.3323216843991672, "kl": 0.06097412109375, "learning_rate": 2.1683937823834194e-07, "loss": -0.0002, "reward": 2.499992251396179, "reward_std": 5.128441387114435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 3023 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.83419689119171, "grad_norm": 0.9763031635765986, "kl": 0.120849609375, "learning_rate": 2.1658031088082902e-07, "loss": 0.0007, "reward": 2.499966621398926, "reward_std": 1.0766732316369598e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999666213989258, "step": 3024 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.83678756476684, "grad_norm": 0.18471932344471542, "kl": 0.038330078125, "learning_rate": 2.1632124352331604e-07, "loss": 0.001, "reward": 2.49999737739563, "reward_std": 2.951713440779713e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3025 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.839378238341969, "grad_norm": 1.2044626729252217, "kl": 0.095458984375, "learning_rate": 2.160621761658031e-07, "loss": 0.001, "reward": 2.4999927282333374, "reward_std": 4.537896188594459e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 3026 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.841968911917099, "grad_norm": 37.684048461370026, "kl": 0.0628662109375, "learning_rate": 2.1580310880829015e-07, "loss": -0.0002, "reward": 2.124963641166687, "reward_std": 0.23147446412122008, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.624963641166687, "step": 3027 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.844559585492228, "grad_norm": 1.6546396166110229, "kl": 0.087646484375, "learning_rate": 2.155440414507772e-07, "loss": 0.0001, "reward": 2.4999799728393555, "reward_std": 1.3621554444398498e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999799728393555, "step": 3028 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.847150259067358, "grad_norm": 20.999148396473412, "kl": 0.47509765625, "learning_rate": 2.1528497409326423e-07, "loss": 0.002, "reward": 1.819443941116333, "reward_std": 0.005023880454473328, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.319443941116333, "step": 3029 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.849740932642487, "grad_norm": 1.9118497502815999, "kl": 0.068359375, "learning_rate": 2.1502590673575128e-07, "loss": 0.0013, "reward": 2.499988079071045, "reward_std": 1.3485883982866653e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879002571106, "step": 3030 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 7.852331606217617, "grad_norm": 0.3750201761475081, "kl": 0.058349609375, "learning_rate": 2.1476683937823833e-07, "loss": 0.0006, "reward": 2.499994993209839, "reward_std": 5.309543439580011e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 3031 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 7.8549222797927465, "grad_norm": 0.5098717734405034, "kl": 0.05828857421875, "learning_rate": 2.1450777202072536e-07, "loss": -0.0, "reward": 2.4999526739120483, "reward_std": 6.4006620732470765e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999526739120483, "step": 3032 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.857512953367876, "grad_norm": 12.897713036953583, "kl": 0.07965087890625, "learning_rate": 2.1424870466321244e-07, "loss": 0.0003, "reward": 1.7496318221092224, "reward_std": 0.0035267252992525755, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2496318221092224, "step": 3033 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 7.860103626943005, "grad_norm": 5.132877689872265, "kl": 0.08203125, "learning_rate": 2.1398963730569946e-07, "loss": -0.0007, "reward": 2.3110419511795044, "reward_std": 0.26078330373252356, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.811042070388794, "step": 3034 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.862694300518134, "grad_norm": 1.2133874725897307, "kl": 0.04132080078125, "learning_rate": 2.1373056994818652e-07, "loss": 0.0007, "reward": 2.499996304512024, "reward_std": 3.4870166700784466e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3035 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.865284974093264, "grad_norm": 0.17683049106604798, "kl": 0.05859375, "learning_rate": 2.1347150259067357e-07, "loss": -0.0005, "reward": 2.4999977350234985, "reward_std": 1.7219817323166353e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3036 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.867875647668393, "grad_norm": 3.3424894577950863, "kl": 0.106689453125, "learning_rate": 2.1321243523316062e-07, "loss": 0.0001, "reward": 2.374980926513672, "reward_std": 0.231461221099039, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749809265136719, "step": 3037 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.870466321243523, "grad_norm": 0.46156969070648574, "kl": 0.140380859375, "learning_rate": 2.1295336787564765e-07, "loss": -0.0, "reward": 2.499996066093445, "reward_std": 2.9493763520349603e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3038 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.873056994818652, "grad_norm": 11.95592196150453, "kl": 0.0926513671875, "learning_rate": 2.126943005181347e-07, "loss": 0.001, "reward": 1.9992703795433044, "reward_std": 7.580899205095193e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992702305316925, "step": 3039 }, { "clip_ratio": 0.0, "completion_length": 55.5, "epoch": 7.875647668393782, "grad_norm": 0.19229730993944627, "kl": 0.09466552734375, "learning_rate": 2.1243523316062175e-07, "loss": 0.0001, "reward": 2.4999972581863403, "reward_std": 1.926622587689053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3040 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.8782383419689115, "grad_norm": 4.881048709265627, "kl": 0.125732421875, "learning_rate": 2.1217616580310878e-07, "loss": 0.0004, "reward": 1.4965155124664307, "reward_std": 0.0003549377215676941, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.996515542268753, "step": 3041 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.880829015544041, "grad_norm": 0.25731256280271225, "kl": 0.05828857421875, "learning_rate": 2.1191709844559586e-07, "loss": -0.0004, "reward": 2.4999983310699463, "reward_std": 1.274747631896389e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3042 }, { "clip_ratio": 0.0, "completion_length": 48.1875, "epoch": 7.883419689119171, "grad_norm": 16.394941719779528, "kl": 0.2470703125, "learning_rate": 2.1165803108808289e-07, "loss": 0.001, "reward": 1.7339510321617126, "reward_std": 0.27736324863508344, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2339511513710022, "step": 3043 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.8860103626943, "grad_norm": 1.6724931734167274, "kl": 0.14453125, "learning_rate": 2.1139896373056996e-07, "loss": 0.0012, "reward": 2.4999659061431885, "reward_std": 5.967582524135651e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999966025352478, "step": 3044 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.88860103626943, "grad_norm": 0.08795211915187846, "kl": 0.093017578125, "learning_rate": 2.11139896373057e-07, "loss": 0.0002, "reward": 2.499998450279236, "reward_std": 1.1635381724772742e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3045 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.891191709844559, "grad_norm": 0.1824677821958543, "kl": 0.03912353515625, "learning_rate": 2.1088082901554402e-07, "loss": 0.0004, "reward": 2.499996781349182, "reward_std": 1.606284172339656e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3046 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.893782383419689, "grad_norm": 13.874196884228585, "kl": 0.0908203125, "learning_rate": 2.106217616580311e-07, "loss": 0.0008, "reward": 1.9126826524734497, "reward_std": 0.00032226844800220533, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4126827716827393, "step": 3047 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.896373056994818, "grad_norm": 0.5726213483526537, "kl": 0.12005615234375, "learning_rate": 2.1036269430051812e-07, "loss": 0.0012, "reward": 2.4999969005584717, "reward_std": 3.0798935313214315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3048 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.898963730569948, "grad_norm": 6.646517562114499, "kl": 0.3505859375, "learning_rate": 2.1010362694300517e-07, "loss": 0.0008, "reward": 1.9997873306274414, "reward_std": 4.4795121539209504e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997875094413757, "step": 3049 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.901554404145077, "grad_norm": 0.3732915579262725, "kl": 0.047607421875, "learning_rate": 2.0984455958549223e-07, "loss": -0.0006, "reward": 1.9999539852142334, "reward_std": 4.280818330926195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499954104423523, "step": 3050 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.904145077720207, "grad_norm": 12.515263790020636, "kl": 0.075439453125, "learning_rate": 2.0958549222797928e-07, "loss": 0.0004, "reward": 1.9991610050201416, "reward_std": 2.9185516041252413e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991610646247864, "step": 3051 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.9067357512953365, "grad_norm": 1.4057637608744609, "kl": 0.095947265625, "learning_rate": 2.093264248704663e-07, "loss": 0.0012, "reward": 2.499979257583618, "reward_std": 6.739793661836302e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999979317188263, "step": 3052 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.909326424870466, "grad_norm": 0.3506441761016127, "kl": 0.0478515625, "learning_rate": 2.0906735751295336e-07, "loss": 0.0003, "reward": 2.4999935626983643, "reward_std": 3.4979106544597016e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 3053 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.9119170984455955, "grad_norm": 1.8529570767044152, "kl": 0.093994140625, "learning_rate": 2.088082901554404e-07, "loss": 0.0011, "reward": 2.499988079071045, "reward_std": 7.242968422360718e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881386756897, "step": 3054 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.914507772020725, "grad_norm": 0.40637928378831467, "kl": 0.152099609375, "learning_rate": 2.0854922279792744e-07, "loss": 0.0018, "reward": 2.499988317489624, "reward_std": 2.956864705083717e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988317489624, "step": 3055 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.917098445595855, "grad_norm": 0.8957633237392931, "kl": 0.04937744140625, "learning_rate": 2.0829015544041452e-07, "loss": -0.0001, "reward": 2.4999964237213135, "reward_std": 2.1002718995077885e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3056 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.919689119170984, "grad_norm": 0.16231467689703546, "kl": 0.030792236328125, "learning_rate": 2.0803108808290154e-07, "loss": -0.0, "reward": 2.4999964237213135, "reward_std": 1.9314592236696626e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3057 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.922279792746114, "grad_norm": 2.0512923604444806, "kl": 0.03704833984375, "learning_rate": 2.077720207253886e-07, "loss": 0.0009, "reward": 2.4999752044677734, "reward_std": 8.382141913898522e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999752640724182, "step": 3058 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.924870466321243, "grad_norm": 2.075172712150298, "kl": 0.1083984375, "learning_rate": 2.0751295336787565e-07, "loss": 0.0008, "reward": 1.9986644983291626, "reward_std": 7.39293682272546e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986644387245178, "step": 3059 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.927461139896373, "grad_norm": 1.2339311428479074, "kl": 0.090087890625, "learning_rate": 2.0725388601036267e-07, "loss": 0.0013, "reward": 1.9993021488189697, "reward_std": 1.9976253042841563e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499302089214325, "step": 3060 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.930051813471502, "grad_norm": 10.291838412374966, "kl": 0.11181640625, "learning_rate": 2.0699481865284973e-07, "loss": 0.0001, "reward": 1.9063404202461243, "reward_std": 0.0007828716481981246, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4063405394554138, "step": 3061 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.932642487046632, "grad_norm": 0.1619941943263571, "kl": 0.0440673828125, "learning_rate": 2.0673575129533678e-07, "loss": 0.0006, "reward": 2.4999948740005493, "reward_std": 3.0549521454759088e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3062 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.935233160621761, "grad_norm": 2.284121356618222, "kl": 0.1436767578125, "learning_rate": 2.0647668393782383e-07, "loss": 0.0008, "reward": 2.187445044517517, "reward_std": 0.2587815216161289, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6874449849128723, "step": 3063 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.937823834196891, "grad_norm": 0.2178085220611267, "kl": 0.0386962890625, "learning_rate": 2.0621761658031086e-07, "loss": 0.0004, "reward": 2.499998450279236, "reward_std": 1.6970756462342251e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3064 }, { "clip_ratio": 0.0, "completion_length": 41.875, "epoch": 7.9404145077720205, "grad_norm": 2.775546446135042, "kl": 0.0660400390625, "learning_rate": 2.0595854922279794e-07, "loss": -0.0003, "reward": 2.4999825954437256, "reward_std": 1.8527036900195526e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999826550483704, "step": 3065 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.94300518134715, "grad_norm": 0.4471748851263829, "kl": 0.065185546875, "learning_rate": 2.0569948186528496e-07, "loss": -0.0013, "reward": 2.499996066093445, "reward_std": 3.1141762519837357e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3066 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.94559585492228, "grad_norm": 0.4581649262815591, "kl": 0.079345703125, "learning_rate": 2.0544041450777202e-07, "loss": 0.0003, "reward": 2.4999953508377075, "reward_std": 4.8299432364729e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3067 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.948186528497409, "grad_norm": 0.5907043414619522, "kl": 0.097930908203125, "learning_rate": 2.0518134715025907e-07, "loss": 0.0004, "reward": 2.499990224838257, "reward_std": 4.091599919320288e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999903440475464, "step": 3068 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 7.950777202072539, "grad_norm": 8.651065187843994, "kl": 0.06781005859375, "learning_rate": 2.049222797927461e-07, "loss": 0.0001, "reward": 2.1872127056121826, "reward_std": 0.2588574823487306, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6872127056121826, "step": 3069 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.953367875647668, "grad_norm": 0.10608624095063623, "kl": 0.076416015625, "learning_rate": 2.0466321243523315e-07, "loss": -0.001, "reward": 2.4999983310699463, "reward_std": 1.0430226780044904e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3070 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 7.955958549222798, "grad_norm": 0.0976104703398458, "kl": 0.038818359375, "learning_rate": 2.044041450777202e-07, "loss": -0.0009, "reward": 2.499998688697815, "reward_std": 1.5638408115137281e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3071 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.958549222797927, "grad_norm": 0.12832696211906286, "kl": 0.032196044921875, "learning_rate": 2.0414507772020725e-07, "loss": 0.0004, "reward": 2.499996781349182, "reward_std": 2.036700607277453e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3072 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.961139896373057, "grad_norm": 0.14687950339308198, "kl": 0.102294921875, "learning_rate": 2.0388601036269428e-07, "loss": 0.0017, "reward": 2.4999988079071045, "reward_std": 1.3790034927296801e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3073 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.963730569948186, "grad_norm": 4.814866629361969, "kl": 0.07525634765625, "learning_rate": 2.0362694300518136e-07, "loss": 0.0002, "reward": 1.8056188821792603, "reward_std": 0.0005981771820415815, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3056190013885498, "step": 3074 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.966321243523316, "grad_norm": 0.4391403908917793, "kl": 0.0467529296875, "learning_rate": 2.0336787564766839e-07, "loss": 0.0002, "reward": 2.499992609024048, "reward_std": 6.744595907548501e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 3075 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.9689119170984455, "grad_norm": 0.19545781194144596, "kl": 0.039794921875, "learning_rate": 2.031088082901554e-07, "loss": -0.0012, "reward": 2.49999737739563, "reward_std": 2.3030311808724946e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3076 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.971502590673575, "grad_norm": 29.425519535576356, "kl": 0.19189453125, "learning_rate": 2.028497409326425e-07, "loss": 0.0009, "reward": 1.499417245388031, "reward_std": 4.873694524576422e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9994172155857086, "step": 3077 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.974093264248705, "grad_norm": 8.61011910224891, "kl": 0.114013671875, "learning_rate": 2.0259067357512952e-07, "loss": 0.0016, "reward": 2.4999672174453735, "reward_std": 1.7612333863326057e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999967098236084, "step": 3078 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.976683937823834, "grad_norm": 0.38591567970148855, "kl": 0.085205078125, "learning_rate": 2.0233160621761657e-07, "loss": 0.0011, "reward": 2.499985933303833, "reward_std": 5.2141411117645475e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999856352806091, "step": 3079 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.979274611398964, "grad_norm": 0.07078876950508031, "kl": 0.06304931640625, "learning_rate": 2.0207253886010362e-07, "loss": 0.0011, "reward": 2.4999974966049194, "reward_std": 1.8690761294237745e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3080 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.981865284974093, "grad_norm": 0.09250307983687249, "kl": 0.0928955078125, "learning_rate": 2.0181347150259068e-07, "loss": 0.001, "reward": 2.499997615814209, "reward_std": 1.6673891991558776e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3081 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.984455958549223, "grad_norm": 7.788774079672367, "kl": 0.120361328125, "learning_rate": 2.015544041450777e-07, "loss": 0.0001, "reward": 1.9992444515228271, "reward_std": 9.306467990199963e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992445409297943, "step": 3082 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.987046632124352, "grad_norm": 3.6689258242200053, "kl": 0.0545654296875, "learning_rate": 2.0129533678756475e-07, "loss": 0.0003, "reward": 2.4999526739120483, "reward_std": 3.7206389379207394e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999952793121338, "step": 3083 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.989637305699482, "grad_norm": 1.5352541442070466, "kl": 0.4453125, "learning_rate": 2.010362694300518e-07, "loss": 0.0003, "reward": 2.49999737739563, "reward_std": 1.857154671824901e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3084 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.992227979274611, "grad_norm": 0.10106041878832855, "kl": 0.146484375, "learning_rate": 2.0077720207253883e-07, "loss": 0.0006, "reward": 2.499998450279236, "reward_std": 1.2345023776560993e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3085 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.994818652849741, "grad_norm": 0.27822815966538256, "kl": 0.077880859375, "learning_rate": 2.005181347150259e-07, "loss": 0.0006, "reward": 2.4999932050704956, "reward_std": 3.4578056329337414e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 3086 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.9974093264248705, "grad_norm": 0.2733977577243428, "kl": 0.090301513671875, "learning_rate": 2.0025906735751294e-07, "loss": 0.0012, "reward": 2.499995231628418, "reward_std": 2.9401608685475367e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 3087 }, { "clip_ratio": 0.0, "completion_length": 55.625, "epoch": 8.0, "grad_norm": 20.860859891943928, "kl": 0.203125, "learning_rate": 2e-07, "loss": 0.0016, "reward": 1.8321717977523804, "reward_std": 0.006747323014565154, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.332171767950058, "step": 3088 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 8.00259067357513, "grad_norm": 0.48937311848710896, "kl": 0.39404296875, "learning_rate": 1.9974093264248704e-07, "loss": 0.0014, "reward": 2.499994993209839, "reward_std": 7.083268542373844e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3089 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.005181347150259, "grad_norm": 6.537881634096945, "kl": 0.2763671875, "learning_rate": 1.9948186528497407e-07, "loss": 0.002, "reward": 1.9989084005355835, "reward_std": 9.225488201991539e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498908281326294, "step": 3090 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.007772020725389, "grad_norm": 2.682158201392398, "kl": 0.064208984375, "learning_rate": 1.9922279792746112e-07, "loss": 0.0003, "reward": 1.9992045164108276, "reward_std": 3.66093044021909e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992044866085052, "step": 3091 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.010362694300518, "grad_norm": 0.3128647447330821, "kl": 0.10498046875, "learning_rate": 1.9896373056994818e-07, "loss": 0.0016, "reward": 2.4999983310699463, "reward_std": 2.546264624925243e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3092 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.012953367875648, "grad_norm": 0.19923548432394256, "kl": 0.092041015625, "learning_rate": 1.9870466321243523e-07, "loss": 0.0007, "reward": 2.4999911785125732, "reward_std": 4.407925644045463e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 3093 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.015544041450777, "grad_norm": 12.112104627906088, "kl": 0.20068359375, "learning_rate": 1.9844559585492225e-07, "loss": 0.0007, "reward": 1.9948546886444092, "reward_std": 8.660938601678936e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.494854748249054, "step": 3094 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.018134715025907, "grad_norm": 0.12052488404009426, "kl": 0.1202392578125, "learning_rate": 1.9818652849740933e-07, "loss": 0.0006, "reward": 2.499998688697815, "reward_std": 1.3107247127663868e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3095 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.020725388601036, "grad_norm": 0.06571834542639839, "kl": 0.06396484375, "learning_rate": 1.9792746113989636e-07, "loss": 0.0004, "reward": 2.4999990463256836, "reward_std": 8.381195186757395e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 3096 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.023316062176166, "grad_norm": 2.3339690081943463, "kl": 0.131591796875, "learning_rate": 1.9766839378238339e-07, "loss": -0.0009, "reward": 2.4999892711639404, "reward_std": 4.988449632037373e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 3097 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.025906735751295, "grad_norm": 0.05359724833176992, "kl": 0.0626220703125, "learning_rate": 1.9740932642487046e-07, "loss": 0.0004, "reward": 2.4999990463256836, "reward_std": 7.621945883329317e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 3098 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 8.028497409326425, "grad_norm": 61.89513835469881, "kl": 0.11083984375, "learning_rate": 1.971502590673575e-07, "loss": 0.0004, "reward": 2.4259248971939087, "reward_std": 0.20949823458340688, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9259249567985535, "step": 3099 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.031088082901555, "grad_norm": 0.5949386111352258, "kl": 0.025726318359375, "learning_rate": 1.9689119170984454e-07, "loss": -0.0009, "reward": 2.4999948740005493, "reward_std": 3.988418797007398e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 3100 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.033678756476684, "grad_norm": 0.12443107948783938, "kl": 0.09521484375, "learning_rate": 1.966321243523316e-07, "loss": 0.0002, "reward": 2.4999983310699463, "reward_std": 1.4647603165940382e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3101 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.036269430051814, "grad_norm": 0.10706372780345777, "kl": 0.1357421875, "learning_rate": 1.9637305699481865e-07, "loss": 0.0002, "reward": 2.499997615814209, "reward_std": 2.667642547748983e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3102 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.038860103626943, "grad_norm": 1.7932882488125332, "kl": 0.1746826171875, "learning_rate": 1.9611398963730568e-07, "loss": 0.0011, "reward": 2.499991297721863, "reward_std": 1.84251015866721e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 3103 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.041450777202073, "grad_norm": 0.1333418020752336, "kl": 0.070556640625, "learning_rate": 1.9585492227979275e-07, "loss": -0.0005, "reward": 2.499997615814209, "reward_std": 1.4219914135082945e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3104 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.044041450777202, "grad_norm": 5.958990080495133, "kl": 0.06787109375, "learning_rate": 1.9559585492227978e-07, "loss": 0.0006, "reward": 2.4307610988616943, "reward_std": 0.19583399911883248, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9307610392570496, "step": 3105 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.046632124352332, "grad_norm": 0.3766993232712234, "kl": 0.0533447265625, "learning_rate": 1.953367875647668e-07, "loss": 0.0002, "reward": 2.499997138977051, "reward_std": 3.4180906141045853e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3106 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.049222797927461, "grad_norm": 0.2819536434781954, "kl": 0.07421875, "learning_rate": 1.9507772020725389e-07, "loss": 0.0005, "reward": 2.4999920129776, "reward_std": 3.1039201360272273e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999919533729553, "step": 3107 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.05181347150259, "grad_norm": 0.28024881213709213, "kl": 0.09033203125, "learning_rate": 1.948186528497409e-07, "loss": -0.0005, "reward": 2.4999920129776, "reward_std": 3.2281690209856606e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 3108 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.05440414507772, "grad_norm": 3.2135563776444083, "kl": 0.16796875, "learning_rate": 1.94559585492228e-07, "loss": 0.0004, "reward": 2.124959349632263, "reward_std": 0.23146210133654677, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6249593496322632, "step": 3109 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 8.05699481865285, "grad_norm": 0.3197758184387199, "kl": 0.15625, "learning_rate": 1.9430051813471502e-07, "loss": -0.0004, "reward": 2.4999982118606567, "reward_std": 1.3452951748149644e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3110 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.05958549222798, "grad_norm": 0.4872829551488567, "kl": 0.0616455078125, "learning_rate": 1.9404145077720207e-07, "loss": -0.0006, "reward": 2.49999737739563, "reward_std": 2.1217189214439713e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3111 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.062176165803109, "grad_norm": 0.9925036821961221, "kl": 0.0513916015625, "learning_rate": 1.9378238341968912e-07, "loss": -0.0001, "reward": 2.4999964237213135, "reward_std": 2.1728093884121336e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3112 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.064766839378239, "grad_norm": 12.29535666888288, "kl": 0.11767578125, "learning_rate": 1.9352331606217615e-07, "loss": -0.0001, "reward": 1.7904213070869446, "reward_std": 0.0005166101626059572, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2904213070869446, "step": 3113 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.067357512953368, "grad_norm": 0.14031474114992099, "kl": 0.16650390625, "learning_rate": 1.932642487046632e-07, "loss": 0.0005, "reward": 2.499997138977051, "reward_std": 4.874631940765539e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3114 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.069948186528498, "grad_norm": 0.17258386651337815, "kl": 0.099853515625, "learning_rate": 1.9300518134715025e-07, "loss": 0.0011, "reward": 2.4999948740005493, "reward_std": 4.240499549723609e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 3115 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.072538860103627, "grad_norm": 9.055534277417499, "kl": 0.065673828125, "learning_rate": 1.927461139896373e-07, "loss": 0.0001, "reward": 1.975152850151062, "reward_std": 0.0008445653210173987, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4751529693603516, "step": 3116 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.075129533678757, "grad_norm": 0.11312184733755459, "kl": 0.111328125, "learning_rate": 1.9248704663212433e-07, "loss": 0.0015, "reward": 2.499997615814209, "reward_std": 1.490138799908891e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3117 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.077720207253886, "grad_norm": 0.430179587137058, "kl": 0.079833984375, "learning_rate": 1.922279792746114e-07, "loss": 0.0002, "reward": 2.4999940395355225, "reward_std": 3.943327897104609e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 3118 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.080310880829016, "grad_norm": 1.5240938243400686, "kl": 0.058837890625, "learning_rate": 1.9196891191709844e-07, "loss": -0.0001, "reward": 2.4999749660491943, "reward_std": 1.006467311981396e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999749660491943, "step": 3119 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.082901554404145, "grad_norm": 14.737100614828844, "kl": 0.115234375, "learning_rate": 1.9170984455958546e-07, "loss": 0.0004, "reward": 1.2674922943115234, "reward_std": 0.0006954815562494332, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7674922347068787, "step": 3120 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.085492227979275, "grad_norm": 0.17203470037132984, "kl": 0.05517578125, "learning_rate": 1.9145077720207254e-07, "loss": 0.0007, "reward": 2.4999966621398926, "reward_std": 2.18295832610238e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3121 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.088082901554404, "grad_norm": 2.919856949008707, "kl": 0.0712890625, "learning_rate": 1.9119170984455957e-07, "loss": 0.0003, "reward": 1.9927042722702026, "reward_std": 7.211302977339074e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.492704451084137, "step": 3122 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.090673575129534, "grad_norm": 3.327305778999548, "kl": 0.0849609375, "learning_rate": 1.9093264248704662e-07, "loss": -0.0001, "reward": 2.499917507171631, "reward_std": 1.3699254623134038e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999176263809204, "step": 3123 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 8.093264248704664, "grad_norm": 0.5737395881305869, "kl": 0.15234375, "learning_rate": 1.9067357512953368e-07, "loss": 0.0007, "reward": 2.4999958276748657, "reward_std": 3.548377435436123e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3124 }, { "clip_ratio": 0.0, "completion_length": 46.75, "epoch": 8.095854922279793, "grad_norm": 5.066960159216271, "kl": 0.1026611328125, "learning_rate": 1.9041450777202073e-07, "loss": 0.0003, "reward": 2.4999983310699463, "reward_std": 1.4627908342390583e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3125 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.098445595854923, "grad_norm": 0.28601654946072086, "kl": 0.068359375, "learning_rate": 1.9015544041450775e-07, "loss": 0.0001, "reward": 2.499994993209839, "reward_std": 3.946287165490503e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 3126 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.101036269430052, "grad_norm": 0.2643593668523947, "kl": 0.06201171875, "learning_rate": 1.898963730569948e-07, "loss": 0.0001, "reward": 2.4999958276748657, "reward_std": 2.8460362955229357e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3127 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.103626943005182, "grad_norm": 2.176443124340117, "kl": 0.124969482421875, "learning_rate": 1.8963730569948186e-07, "loss": 0.0003, "reward": 1.9989086389541626, "reward_std": 4.350585436441179e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989086389541626, "step": 3128 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.106217616580311, "grad_norm": 7.325899142295712, "kl": 0.178466796875, "learning_rate": 1.8937823834196889e-07, "loss": 0.0008, "reward": 1.9365675449371338, "reward_std": 0.1768198993995611, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4365676045417786, "step": 3129 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.10880829015544, "grad_norm": 0.12023312572556101, "kl": 0.0831298828125, "learning_rate": 1.8911917098445597e-07, "loss": 0.0011, "reward": 2.499998092651367, "reward_std": 1.2465436043385125e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3130 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.11139896373057, "grad_norm": 0.5093373158507887, "kl": 0.128662109375, "learning_rate": 1.88860103626943e-07, "loss": 0.0002, "reward": 2.499987840652466, "reward_std": 3.691865174459963e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 3131 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.1139896373057, "grad_norm": 109.04331751006728, "kl": 0.153564453125, "learning_rate": 1.8860103626943004e-07, "loss": 0.0001, "reward": 1.996941328048706, "reward_std": 0.00011725809372364893, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4969413578510284, "step": 3132 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.11658031088083, "grad_norm": 1.2849867695979236, "kl": 0.0535888671875, "learning_rate": 1.883419689119171e-07, "loss": 0.0003, "reward": 2.499995470046997, "reward_std": 5.302288172970293e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3133 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.119170984455959, "grad_norm": 0.20039309266087418, "kl": 0.08544921875, "learning_rate": 1.8808290155440415e-07, "loss": -0.0004, "reward": 2.4999964237213135, "reward_std": 2.39875475926965e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3134 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.121761658031089, "grad_norm": 0.20098931576816054, "kl": 0.060791015625, "learning_rate": 1.8782383419689118e-07, "loss": -0.0002, "reward": 2.4999966621398926, "reward_std": 1.8490782736080291e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3135 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.124352331606218, "grad_norm": 0.13245546512424616, "kl": 0.069580078125, "learning_rate": 1.8756476683937823e-07, "loss": -0.0, "reward": 2.499997138977051, "reward_std": 9.346515241759334e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3136 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.126943005181348, "grad_norm": 2.1824329739214203, "kl": 0.103759765625, "learning_rate": 1.8730569948186528e-07, "loss": 0.0006, "reward": 2.499980092048645, "reward_std": 2.126500066879089e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999802112579346, "step": 3137 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.129533678756477, "grad_norm": 0.5315947579909399, "kl": 0.078857421875, "learning_rate": 1.870466321243523e-07, "loss": 0.001, "reward": 2.499994397163391, "reward_std": 2.7083195277555205e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 3138 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.132124352331607, "grad_norm": 0.05214144489135689, "kl": 0.0506591796875, "learning_rate": 1.8678756476683939e-07, "loss": 0.0002, "reward": 2.4999985694885254, "reward_std": 7.366109571194102e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3139 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.134715025906736, "grad_norm": 3.8393469283397477, "kl": 0.296875, "learning_rate": 1.865284974093264e-07, "loss": 0.0008, "reward": 1.9993937015533447, "reward_std": 2.3958444216987118e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993937611579895, "step": 3140 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.137305699481866, "grad_norm": 8.46869100393302, "kl": 0.173583984375, "learning_rate": 1.8626943005181347e-07, "loss": 0.0005, "reward": 2.2489233016967773, "reward_std": 0.2684091946814249, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7489231824874878, "step": 3141 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.139896373056995, "grad_norm": 0.19194520422235753, "kl": 0.033447265625, "learning_rate": 1.8601036269430052e-07, "loss": 0.0016, "reward": 2.499996781349182, "reward_std": 2.0847651285293978e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3142 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.142487046632125, "grad_norm": 0.10058122678832755, "kl": 0.1024169921875, "learning_rate": 1.8575129533678754e-07, "loss": 0.0009, "reward": 2.4999985694885254, "reward_std": 1.1845976644053735e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3143 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.145077720207254, "grad_norm": 0.7070407237080806, "kl": 0.046142578125, "learning_rate": 1.854922279792746e-07, "loss": -0.0004, "reward": 2.4999953508377075, "reward_std": 5.4875583828106755e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3144 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.147668393782384, "grad_norm": 0.06791811492618365, "kl": 0.0677490234375, "learning_rate": 1.8523316062176165e-07, "loss": 0.0013, "reward": 2.4999988079071045, "reward_std": 1.6877844757345883e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3145 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.150259067357513, "grad_norm": 98.24381862231135, "kl": 0.05865478515625, "learning_rate": 1.849740932642487e-07, "loss": 0.001, "reward": 2.124965190887451, "reward_std": 0.2314691021447004, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6249650716781616, "step": 3146 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.152849740932643, "grad_norm": 0.17113559089247596, "kl": 0.0400390625, "learning_rate": 1.8471502590673573e-07, "loss": -0.0003, "reward": 2.49999737739563, "reward_std": 2.1307694595407156e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3147 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.155440414507773, "grad_norm": 10.420129529138041, "kl": 0.2021484375, "learning_rate": 1.844559585492228e-07, "loss": 0.0008, "reward": 1.8954155445098877, "reward_std": 0.17728135793004185, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3954156041145325, "step": 3148 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.158031088082902, "grad_norm": 0.35943241768878276, "kl": 0.095947265625, "learning_rate": 1.8419689119170983e-07, "loss": 0.0006, "reward": 2.499995470046997, "reward_std": 4.268991290246049e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 3149 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.160621761658032, "grad_norm": 8.11623657150069, "kl": 0.099609375, "learning_rate": 1.8393782383419686e-07, "loss": 0.0012, "reward": 2.4999825954437256, "reward_std": 1.533940036324566e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982476234436, "step": 3150 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 8.163212435233161, "grad_norm": 0.1257755682029672, "kl": 0.130126953125, "learning_rate": 1.8367875647668394e-07, "loss": 0.0007, "reward": 2.4999988079071045, "reward_std": 1.0662363933988672e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3151 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.16580310880829, "grad_norm": 0.09368622845749539, "kl": 0.108642578125, "learning_rate": 1.8341968911917097e-07, "loss": -0.0015, "reward": 2.4999985694885254, "reward_std": 9.347214984245511e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3152 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.16839378238342, "grad_norm": 0.3288933867728653, "kl": 0.160400390625, "learning_rate": 1.8316062176165802e-07, "loss": 0.0021, "reward": 2.4999974966049194, "reward_std": 2.1250071995382314e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3153 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.17098445595855, "grad_norm": 0.23308490609427723, "kl": 0.0699462890625, "learning_rate": 1.8290155440414507e-07, "loss": 0.0013, "reward": 2.4999935626983643, "reward_std": 2.435132955724839e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 3154 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.17357512953368, "grad_norm": 3.2452882574131645, "kl": 0.18310546875, "learning_rate": 1.8264248704663212e-07, "loss": 0.0002, "reward": 1.9949986934661865, "reward_std": 0.00010334063580330621, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4949987828731537, "step": 3155 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.176165803108809, "grad_norm": 0.036850519034636625, "kl": 0.03631591796875, "learning_rate": 1.8238341968911915e-07, "loss": 0.0009, "reward": 2.4999985694885254, "reward_std": 1.5613321693308535e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3156 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.178756476683938, "grad_norm": 0.6596533076798986, "kl": 0.14453125, "learning_rate": 1.821243523316062e-07, "loss": 0.0002, "reward": 2.499956488609314, "reward_std": 6.305028421138559e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999564290046692, "step": 3157 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.181347150259068, "grad_norm": 2.0552805090812187, "kl": 0.10693359375, "learning_rate": 1.8186528497409325e-07, "loss": 0.0006, "reward": 2.4999899864196777, "reward_std": 1.0307559477951145e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 3158 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.183937823834198, "grad_norm": 0.078386768929999, "kl": 0.0584716796875, "learning_rate": 1.8160621761658028e-07, "loss": 0.0008, "reward": 2.499997615814209, "reward_std": 1.2820500927546163e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3159 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.186528497409327, "grad_norm": 0.6495918869200589, "kl": 0.0479736328125, "learning_rate": 1.8134715025906736e-07, "loss": 0.0002, "reward": 2.499993324279785, "reward_std": 5.276404976939375e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 3160 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.189119170984457, "grad_norm": 1.2681547277709064, "kl": 0.158203125, "learning_rate": 1.8108808290155439e-07, "loss": -0.0002, "reward": 2.499994158744812, "reward_std": 6.952769922463631e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 3161 }, { "clip_ratio": 0.0, "completion_length": 48.125, "epoch": 8.191709844559586, "grad_norm": 0.08866770092556833, "kl": 0.05816650390625, "learning_rate": 1.8082901554404144e-07, "loss": 0.0001, "reward": 2.4999983310699463, "reward_std": 1.2841297234444937e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3162 }, { "clip_ratio": 0.0, "completion_length": 49.25, "epoch": 8.194300518134716, "grad_norm": 26.584634283869182, "kl": 0.101318359375, "learning_rate": 1.805699481865285e-07, "loss": 0.0001, "reward": 2.37479305267334, "reward_std": 0.23182353156278168, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747931718826294, "step": 3163 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.196891191709845, "grad_norm": 2.606227990625728, "kl": 0.08740234375, "learning_rate": 1.8031088082901554e-07, "loss": -0.0005, "reward": 1.9998180866241455, "reward_std": 3.970749503423576e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998182654380798, "step": 3164 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.199481865284975, "grad_norm": 0.40356799267993154, "kl": 0.14990234375, "learning_rate": 1.8005181347150257e-07, "loss": 0.0005, "reward": 2.4999929666519165, "reward_std": 4.410746214489336e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 3165 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.202072538860104, "grad_norm": 0.11859449510465729, "kl": 0.0457763671875, "learning_rate": 1.7979274611398962e-07, "loss": 0.0005, "reward": 2.4999988079071045, "reward_std": 1.0082704022806865e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3166 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.204663212435234, "grad_norm": 0.1890545401128121, "kl": 0.1044921875, "learning_rate": 1.7953367875647668e-07, "loss": 0.0012, "reward": 2.4999990463256836, "reward_std": 8.436041980530717e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3167 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.207253886010363, "grad_norm": 5.0034671534021005, "kl": 0.1826171875, "learning_rate": 1.792746113989637e-07, "loss": 0.0007, "reward": 1.2260577082633972, "reward_std": 0.0005348070408217609, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7260576784610748, "step": 3168 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.209844559585493, "grad_norm": 12.31469684914096, "kl": 0.0303955078125, "learning_rate": 1.7901554404145078e-07, "loss": -0.0003, "reward": 2.499963402748108, "reward_std": 4.9888995818037074e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999635219573975, "step": 3169 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.212435233160623, "grad_norm": 0.20302257365561321, "kl": 0.0753173828125, "learning_rate": 1.787564766839378e-07, "loss": 0.0012, "reward": 2.4999979734420776, "reward_std": 1.6111993659251311e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3170 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.215025906735752, "grad_norm": 4.830101926350806, "kl": 0.15087890625, "learning_rate": 1.784974093264249e-07, "loss": 0.0005, "reward": 1.8750877380371094, "reward_std": 0.000541319149363062, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3750877380371094, "step": 3171 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.217616580310882, "grad_norm": 0.14367990341563783, "kl": 0.03057861328125, "learning_rate": 1.782383419689119e-07, "loss": -0.0003, "reward": 2.499996542930603, "reward_std": 2.765297949736123e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3172 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.220207253886011, "grad_norm": 0.1485577275254332, "kl": 0.06396484375, "learning_rate": 1.7797927461139894e-07, "loss": -0.0005, "reward": 2.4999977350234985, "reward_std": 1.9533202930688276e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3173 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.22279792746114, "grad_norm": 0.3135919301675353, "kl": 0.06494140625, "learning_rate": 1.7772020725388602e-07, "loss": 0.0007, "reward": 2.4999966621398926, "reward_std": 1.8612417989061214e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 3174 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.22538860103627, "grad_norm": 16.73314085913067, "kl": 0.192626953125, "learning_rate": 1.7746113989637304e-07, "loss": 0.001, "reward": 1.799765944480896, "reward_std": 0.0003673917522064585, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2997658848762512, "step": 3175 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.2279792746114, "grad_norm": 0.18698237390622247, "kl": 0.10150146484375, "learning_rate": 1.772020725388601e-07, "loss": -0.0003, "reward": 2.499996781349182, "reward_std": 2.16306148104195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 3176 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.23056994818653, "grad_norm": 0.04774654871373796, "kl": 0.04510498046875, "learning_rate": 1.7694300518134715e-07, "loss": -0.0003, "reward": 2.499998927116394, "reward_std": 1.0291312833032862e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3177 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.233160621761659, "grad_norm": 0.09055973194775302, "kl": 0.080322265625, "learning_rate": 1.766839378238342e-07, "loss": -0.0003, "reward": 2.499997854232788, "reward_std": 1.3692743721094303e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3178 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.235751295336788, "grad_norm": 0.15809836263552166, "kl": 0.070556640625, "learning_rate": 1.7642487046632123e-07, "loss": 0.0006, "reward": 2.4999982118606567, "reward_std": 1.1734939562302316e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3179 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.238341968911918, "grad_norm": 0.6501189929844517, "kl": 0.196533203125, "learning_rate": 1.7616580310880828e-07, "loss": 0.0011, "reward": 2.4999974966049194, "reward_std": 1.5462585452041822e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3180 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.240932642487047, "grad_norm": 1.103301101005251, "kl": 0.053466796875, "learning_rate": 1.7590673575129533e-07, "loss": 0.0006, "reward": 2.49998939037323, "reward_std": 5.279565357341198e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 3181 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.243523316062177, "grad_norm": 0.7119014375622824, "kl": 0.10400390625, "learning_rate": 1.7564766839378236e-07, "loss": -0.0, "reward": 2.4999972581863403, "reward_std": 3.3613985124247847e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3182 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.246113989637305, "grad_norm": 15.000538124043533, "kl": 0.2822265625, "learning_rate": 1.7538860103626944e-07, "loss": 0.0014, "reward": 1.8345621824264526, "reward_std": 0.0020594476670794393, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3345622420310974, "step": 3183 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.248704663212436, "grad_norm": 15.276766633385261, "kl": 0.113525390625, "learning_rate": 1.7512953367875647e-07, "loss": 0.0007, "reward": 1.8449835777282715, "reward_std": 0.00036699172540011205, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3449835181236267, "step": 3184 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.251295336787564, "grad_norm": 0.048442803797523144, "kl": 0.08209228515625, "learning_rate": 1.7487046632124352e-07, "loss": -0.0001, "reward": 2.499998092651367, "reward_std": 1.2998042961953615e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3185 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.253886010362695, "grad_norm": 0.36234443899299207, "kl": 0.089599609375, "learning_rate": 1.7461139896373057e-07, "loss": 0.0011, "reward": 2.4999947547912598, "reward_std": 3.988130401921808e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 3186 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.256476683937823, "grad_norm": 0.36415855585047296, "kl": 0.34326171875, "learning_rate": 1.743523316062176e-07, "loss": 0.0021, "reward": 2.499996066093445, "reward_std": 4.263243681634776e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3187 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.259067357512953, "grad_norm": 1.0250415228393426, "kl": 0.08026123046875, "learning_rate": 1.7409326424870465e-07, "loss": 0.0008, "reward": 1.999955177307129, "reward_std": 7.0505275857613015e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499955028295517, "step": 3188 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.261658031088082, "grad_norm": 23.513328745315256, "kl": 0.0406494140625, "learning_rate": 1.738341968911917e-07, "loss": 0.0005, "reward": 2.187057614326477, "reward_std": 0.25913941718141587, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6870575547218323, "step": 3189 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.264248704663212, "grad_norm": 0.14594198304590528, "kl": 0.0423583984375, "learning_rate": 1.7357512953367876e-07, "loss": 0.0006, "reward": 2.499998688697815, "reward_std": 1.3615378406939271e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3190 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.266839378238341, "grad_norm": 0.11317612807392859, "kl": 0.0428466796875, "learning_rate": 1.7331606217616578e-07, "loss": 0.0015, "reward": 2.4999977350234985, "reward_std": 1.7175225366372615e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3191 }, { "clip_ratio": 0.0, "completion_length": 36.875, "epoch": 8.26943005181347, "grad_norm": 38.3947720091386, "kl": 0.119140625, "learning_rate": 1.7305699481865286e-07, "loss": 0.0004, "reward": 2.4997693300247192, "reward_std": 0.00014748069952474907, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997693300247192, "step": 3192 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.2720207253886, "grad_norm": 0.13426411189197163, "kl": 0.019775390625, "learning_rate": 1.727979274611399e-07, "loss": 0.0005, "reward": 2.499996781349182, "reward_std": 1.9390807892705197e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3193 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 8.27461139896373, "grad_norm": 13.018206887232967, "kl": 0.10296630859375, "learning_rate": 1.7253886010362694e-07, "loss": 0.0012, "reward": 2.4999808073043823, "reward_std": 2.9488910399777524e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999980628490448, "step": 3194 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.27720207253886, "grad_norm": 1.3074110273092252, "kl": 0.11328125, "learning_rate": 1.72279792746114e-07, "loss": 0.0012, "reward": 2.4999920129776, "reward_std": 4.7270076493077795e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 3195 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.279792746113989, "grad_norm": 0.12941712126818172, "kl": 0.07958984375, "learning_rate": 1.7202072538860102e-07, "loss": 0.0, "reward": 2.4999985694885254, "reward_std": 1.6768317721016501e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3196 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.282383419689118, "grad_norm": 0.29518467981720453, "kl": 0.1875, "learning_rate": 1.7176165803108807e-07, "loss": -0.0001, "reward": 2.4999974966049194, "reward_std": 1.9441690142230073e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3197 }, { "clip_ratio": 0.0, "completion_length": 58.375, "epoch": 8.284974093264248, "grad_norm": 0.45334757884690946, "kl": 0.2001953125, "learning_rate": 1.7150259067357512e-07, "loss": -0.0001, "reward": 2.4999961853027344, "reward_std": 4.981795427738689e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3198 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.287564766839377, "grad_norm": 18.70783428985443, "kl": 0.1123046875, "learning_rate": 1.7124352331606218e-07, "loss": 0.0004, "reward": 1.8716132044792175, "reward_std": 0.0004720661308965646, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3716132044792175, "step": 3199 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.290155440414507, "grad_norm": 36.7259519367443, "kl": 0.20654296875, "learning_rate": 1.709844559585492e-07, "loss": 0.0008, "reward": 1.8955614566802979, "reward_std": 0.008073076531047718, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.395561397075653, "step": 3200 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.292746113989637, "grad_norm": 1.6188201887756937, "kl": 0.05767822265625, "learning_rate": 1.7072538860103628e-07, "loss": 0.0002, "reward": 2.4999914169311523, "reward_std": 6.930559493412147e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 3201 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.295336787564766, "grad_norm": 0.1288917903115651, "kl": 0.031494140625, "learning_rate": 1.704663212435233e-07, "loss": 0.0009, "reward": 2.499997854232788, "reward_std": 1.97812647684259e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3202 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.297927461139896, "grad_norm": 18.684600006396494, "kl": 0.07562255859375, "learning_rate": 1.7020725388601033e-07, "loss": 0.0002, "reward": 1.9994863271713257, "reward_std": 5.684444437292768e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994864165782928, "step": 3203 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.300518134715025, "grad_norm": 1.4364290019150132, "kl": 0.066162109375, "learning_rate": 1.6994818652849741e-07, "loss": 0.0007, "reward": 2.4999911785125732, "reward_std": 1.0436000650315691e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 3204 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.303108808290155, "grad_norm": 0.10650705980622001, "kl": 0.066650390625, "learning_rate": 1.6968911917098444e-07, "loss": 0.0003, "reward": 2.499996542930603, "reward_std": 1.384457050335186e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3205 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.305699481865284, "grad_norm": 0.5387496833525821, "kl": 0.064697265625, "learning_rate": 1.694300518134715e-07, "loss": 0.0003, "reward": 2.499992847442627, "reward_std": 3.4324597208978957e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 3206 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.308290155440414, "grad_norm": 7.6905935262046325, "kl": 0.08062744140625, "learning_rate": 1.6917098445595854e-07, "loss": -0.0003, "reward": 1.999830186367035, "reward_std": 2.2740757003703038e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998303353786469, "step": 3207 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.310880829015543, "grad_norm": 0.09775192652215665, "kl": 0.0565185546875, "learning_rate": 1.689119170984456e-07, "loss": 0.0005, "reward": 2.4999983310699463, "reward_std": 9.463893491101771e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3208 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.313471502590673, "grad_norm": 0.1403142079783186, "kl": 0.06134033203125, "learning_rate": 1.6865284974093262e-07, "loss": -0.0006, "reward": 2.4999985694885254, "reward_std": 1.8470925056135457e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3209 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.316062176165802, "grad_norm": 0.3323226828597026, "kl": 0.1239013671875, "learning_rate": 1.6839378238341968e-07, "loss": -0.0002, "reward": 2.4999948740005493, "reward_std": 4.336178108133026e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3210 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.318652849740932, "grad_norm": 0.3623764784679946, "kl": 0.08349609375, "learning_rate": 1.6813471502590673e-07, "loss": 0.0006, "reward": 2.499997854232788, "reward_std": 1.5728519429103471e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3211 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.321243523316062, "grad_norm": 0.2017666315902772, "kl": 0.05029296875, "learning_rate": 1.6787564766839376e-07, "loss": 0.0008, "reward": 2.4999970197677612, "reward_std": 2.892981683544349e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3212 }, { "clip_ratio": 0.0, "completion_length": 49.0, "epoch": 8.323834196891191, "grad_norm": 0.032892990314913155, "kl": 0.126953125, "learning_rate": 1.6761658031088083e-07, "loss": 0.0007, "reward": 2.4999985694885254, "reward_std": 1.7188298215842224e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3213 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.32642487046632, "grad_norm": 0.34251603729447894, "kl": 0.1568603515625, "learning_rate": 1.6735751295336786e-07, "loss": 0.0003, "reward": 2.4999955892562866, "reward_std": 7.9040910918593e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3214 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.32901554404145, "grad_norm": 6.768723956103338, "kl": 0.096435546875, "learning_rate": 1.6709844559585491e-07, "loss": -0.0003, "reward": 2.4999773502349854, "reward_std": 2.9510303647839464e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999977469444275, "step": 3215 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.33160621761658, "grad_norm": 0.21013872083486865, "kl": 0.15771484375, "learning_rate": 1.6683937823834197e-07, "loss": 0.0011, "reward": 2.499995470046997, "reward_std": 5.5054981658031465e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 3216 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.33419689119171, "grad_norm": 70.51326847020441, "kl": 0.10809326171875, "learning_rate": 1.66580310880829e-07, "loss": -0.0002, "reward": 2.4331393241882324, "reward_std": 0.18909154747598222, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9331393241882324, "step": 3217 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.336787564766839, "grad_norm": 0.3244645859039052, "kl": 0.0537109375, "learning_rate": 1.6632124352331605e-07, "loss": 0.0006, "reward": 2.4999905824661255, "reward_std": 3.841301236207073e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999906420707703, "step": 3218 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.339378238341968, "grad_norm": 0.47991049816342235, "kl": 0.218505859375, "learning_rate": 1.660621761658031e-07, "loss": -0.0007, "reward": 2.499997615814209, "reward_std": 2.4921848762460286e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3219 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.341968911917098, "grad_norm": 0.5515420227885787, "kl": 0.0986328125, "learning_rate": 1.6580310880829015e-07, "loss": 0.001, "reward": 2.499994397163391, "reward_std": 2.7854583777298103e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 3220 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.344559585492227, "grad_norm": 0.1465327189391269, "kl": 0.091796875, "learning_rate": 1.6554404145077718e-07, "loss": -0.0001, "reward": 2.4999966621398926, "reward_std": 2.72318118277326e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3221 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.347150259067357, "grad_norm": 3.376183889760354, "kl": 0.1009521484375, "learning_rate": 1.6528497409326426e-07, "loss": 0.0006, "reward": 1.9990437030792236, "reward_std": 0.00011454803888000242, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990436732769012, "step": 3222 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.349740932642487, "grad_norm": 0.18734024585281264, "kl": 0.05682373046875, "learning_rate": 1.6502590673575128e-07, "loss": 0.0004, "reward": 2.499997615814209, "reward_std": 1.918373243370297e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3223 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.352331606217616, "grad_norm": 0.1463500924618652, "kl": 0.13916015625, "learning_rate": 1.6476683937823836e-07, "loss": 0.001, "reward": 2.4999985694885254, "reward_std": 1.2025637943224865e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3224 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.354922279792746, "grad_norm": 2.3752356088622584, "kl": 0.0611572265625, "learning_rate": 1.645077720207254e-07, "loss": 0.0004, "reward": 1.999264895915985, "reward_std": 3.1858964746334095e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992647171020508, "step": 3225 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.357512953367875, "grad_norm": 6.111000884646436, "kl": 0.0599365234375, "learning_rate": 1.6424870466321241e-07, "loss": -0.0002, "reward": 1.9995545148849487, "reward_std": 4.7109810083156844e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995545446872711, "step": 3226 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.360103626943005, "grad_norm": 0.13179717905268684, "kl": 0.04449462890625, "learning_rate": 1.639896373056995e-07, "loss": -0.0007, "reward": 2.4999938011169434, "reward_std": 3.4057075026794337e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 3227 }, { "clip_ratio": 0.0, "completion_length": 48.4375, "epoch": 8.362694300518134, "grad_norm": 51.488248395089784, "kl": 0.105712890625, "learning_rate": 1.6373056994818652e-07, "loss": 0.0007, "reward": 2.1867703795433044, "reward_std": 0.2589584738320241, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.686770260334015, "step": 3228 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.365284974093264, "grad_norm": 1.2564602827286182, "kl": 0.05816650390625, "learning_rate": 1.6347150259067357e-07, "loss": -0.0003, "reward": 2.4999959468841553, "reward_std": 2.6034725806312053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3229 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.367875647668393, "grad_norm": 0.1135698989061551, "kl": 0.140625, "learning_rate": 1.6321243523316062e-07, "loss": 0.0003, "reward": 2.499997138977051, "reward_std": 2.109903590508111e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 3230 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.370466321243523, "grad_norm": 0.45263499620583997, "kl": 0.074462890625, "learning_rate": 1.6295336787564768e-07, "loss": 0.0002, "reward": 2.4999916553497314, "reward_std": 5.291461206979875e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 3231 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.373056994818652, "grad_norm": 0.711885955001311, "kl": 0.0491943359375, "learning_rate": 1.626943005181347e-07, "loss": 0.0009, "reward": 2.4999961853027344, "reward_std": 2.934345843641495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3232 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.375647668393782, "grad_norm": 3.628513627565293, "kl": 0.08642578125, "learning_rate": 1.6243523316062176e-07, "loss": 0.0007, "reward": 1.9983813762664795, "reward_std": 3.763721936422826e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983814358711243, "step": 3233 }, { "clip_ratio": 0.0, "completion_length": 34.0625, "epoch": 8.378238341968911, "grad_norm": 145.3009365729936, "kl": 0.0633544921875, "learning_rate": 1.621761658031088e-07, "loss": 0.0005, "reward": 2.124465584754944, "reward_std": 0.23175123968331945, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.624465525150299, "step": 3234 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.380829015544041, "grad_norm": 0.42414139367183473, "kl": 0.03253173828125, "learning_rate": 1.6191709844559583e-07, "loss": 0.0007, "reward": 2.49999737739563, "reward_std": 2.1501990516981095e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3235 }, { "clip_ratio": 0.0, "completion_length": 61.0, "epoch": 8.38341968911917, "grad_norm": 5.253203836762631, "kl": 0.1455078125, "learning_rate": 1.6165803108808291e-07, "loss": 0.0009, "reward": 2.4999923706054688, "reward_std": 2.8473913289417396e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 3236 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.3860103626943, "grad_norm": 0.14625341415206158, "kl": 0.120361328125, "learning_rate": 1.6139896373056994e-07, "loss": 0.0002, "reward": 2.499997615814209, "reward_std": 1.8736003539743251e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3237 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.38860103626943, "grad_norm": 2.495572202793581, "kl": 0.0943603515625, "learning_rate": 1.61139896373057e-07, "loss": -0.0002, "reward": 2.4999608993530273, "reward_std": 1.2104341664098683e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999608993530273, "step": 3238 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.39119170984456, "grad_norm": 10.615665321766565, "kl": 0.05419921875, "learning_rate": 1.6088082901554405e-07, "loss": 0.0001, "reward": 2.4997620582580566, "reward_std": 0.00011853254915195066, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997620582580566, "step": 3239 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.393782383419689, "grad_norm": 6.092532778992031, "kl": 0.0643310546875, "learning_rate": 1.6062176165803107e-07, "loss": -0.0003, "reward": 1.982175588607788, "reward_std": 0.00017538013844387024, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4821757078170776, "step": 3240 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.396373056994818, "grad_norm": 13.640409892014574, "kl": 0.0706787109375, "learning_rate": 1.6036269430051812e-07, "loss": 0.001, "reward": 2.0624550580978394, "reward_std": 0.17679284224482217, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624550580978394, "step": 3241 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.398963730569948, "grad_norm": 0.23921550073511422, "kl": 0.17724609375, "learning_rate": 1.6010362694300518e-07, "loss": 0.0023, "reward": 2.499997615814209, "reward_std": 2.5406155828022747e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3242 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 8.401554404145077, "grad_norm": 0.15949898835331877, "kl": 0.0797119140625, "learning_rate": 1.5984455958549223e-07, "loss": 0.0006, "reward": 2.4999972581863403, "reward_std": 1.366168334016038e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3243 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.404145077720207, "grad_norm": 0.4433378724016629, "kl": 0.102294921875, "learning_rate": 1.5958549222797926e-07, "loss": 0.0007, "reward": 2.4999983310699463, "reward_std": 1.2141738068294217e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3244 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.406735751295336, "grad_norm": 0.0921673402433888, "kl": 0.1427001953125, "learning_rate": 1.5932642487046634e-07, "loss": 0.0016, "reward": 2.4999982118606567, "reward_std": 1.1553321712653997e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3245 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.409326424870466, "grad_norm": 0.1967057769886357, "kl": 0.0616455078125, "learning_rate": 1.5906735751295336e-07, "loss": -0.0002, "reward": 2.499997138977051, "reward_std": 2.516193291057789e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3246 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.411917098445596, "grad_norm": 0.09122617624906873, "kl": 0.0775146484375, "learning_rate": 1.588082901554404e-07, "loss": 0.0008, "reward": 2.4999974966049194, "reward_std": 2.3035391905068536e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 3247 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.414507772020725, "grad_norm": 10.535270515609028, "kl": 0.10009765625, "learning_rate": 1.5854922279792747e-07, "loss": -0.0003, "reward": 1.9837512373924255, "reward_std": 0.0001056549073723545, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4837514162063599, "step": 3248 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.417098445595855, "grad_norm": 0.06388404756002164, "kl": 0.0849609375, "learning_rate": 1.582901554404145e-07, "loss": 0.002, "reward": 2.4999990463256836, "reward_std": 1.1225987464058562e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 3249 }, { "clip_ratio": 0.0, "completion_length": 37.3125, "epoch": 8.419689119170984, "grad_norm": 0.13873384701330174, "kl": 0.0615234375, "learning_rate": 1.5803108808290155e-07, "loss": 0.001, "reward": 2.4999977350234985, "reward_std": 1.112781973233723e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3250 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 8.422279792746114, "grad_norm": 0.3438802348927548, "kl": 0.1065673828125, "learning_rate": 1.577720207253886e-07, "loss": 0.0003, "reward": 2.499998092651367, "reward_std": 1.62018505989181e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3251 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.424870466321243, "grad_norm": 0.6836649542955919, "kl": 0.371337890625, "learning_rate": 1.5751295336787565e-07, "loss": 0.0021, "reward": 2.4999959468841553, "reward_std": 1.9391145826830325e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3252 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 8.427461139896373, "grad_norm": 0.16270775407010266, "kl": 0.0556640625, "learning_rate": 1.5725388601036268e-07, "loss": 0.0002, "reward": 2.4999982118606567, "reward_std": 2.023762306180288e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 3253 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.430051813471502, "grad_norm": 0.552028887369757, "kl": 0.0640869140625, "learning_rate": 1.5699481865284976e-07, "loss": 0.0005, "reward": 2.49999463558197, "reward_std": 5.997048049266596e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 3254 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.432642487046632, "grad_norm": 0.1609069702298302, "kl": 0.03692626953125, "learning_rate": 1.5673575129533678e-07, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 2.165543662613345e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3255 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.435233160621761, "grad_norm": 1.3823851626987906, "kl": 0.1572265625, "learning_rate": 1.564766839378238e-07, "loss": 0.0009, "reward": 2.499994993209839, "reward_std": 4.955724307365017e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3256 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.437823834196891, "grad_norm": 0.12855882772429997, "kl": 0.0625, "learning_rate": 1.562176165803109e-07, "loss": 0.0007, "reward": 2.4999977350234985, "reward_std": 1.5347770840890007e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3257 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.44041450777202, "grad_norm": 51.46427914494171, "kl": 0.06488037109375, "learning_rate": 1.5595854922279791e-07, "loss": 0.0003, "reward": 2.0621350407600403, "reward_std": 0.17692414358134556, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5621349215507507, "step": 3258 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.44300518134715, "grad_norm": 2.397390969962657, "kl": 0.116943359375, "learning_rate": 1.5569948186528497e-07, "loss": -0.0004, "reward": 1.9772499799728394, "reward_std": 6.793709175667573e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4772500395774841, "step": 3259 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.44559585492228, "grad_norm": 0.14542542442835713, "kl": 0.04840087890625, "learning_rate": 1.5544041450777202e-07, "loss": 0.0003, "reward": 2.499996304512024, "reward_std": 2.6329436764171987e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3260 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 8.44818652849741, "grad_norm": 0.07184934569737657, "kl": 0.1143798828125, "learning_rate": 1.5518134715025907e-07, "loss": -0.0001, "reward": 2.499998450279236, "reward_std": 1.5681051763749565e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3261 }, { "clip_ratio": 0.0, "completion_length": 49.3125, "epoch": 8.450777202072539, "grad_norm": 6.808116214725747, "kl": 0.095703125, "learning_rate": 1.549222797927461e-07, "loss": 0.0006, "reward": 1.9879077672958374, "reward_std": 0.00022908513787456286, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4879076778888702, "step": 3262 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.453367875647668, "grad_norm": 7.772891597087809, "kl": 0.0721435546875, "learning_rate": 1.5466321243523315e-07, "loss": 0.0007, "reward": 2.43719744682312, "reward_std": 0.17745604828553496, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937197208404541, "step": 3263 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.455958549222798, "grad_norm": 5.087651677854303, "kl": 0.107666015625, "learning_rate": 1.544041450777202e-07, "loss": -0.0, "reward": 2.4999735355377197, "reward_std": 7.056898851942606e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999737739562988, "step": 3264 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.458549222797927, "grad_norm": 0.054542395861534365, "kl": 0.0462646484375, "learning_rate": 1.5414507772020723e-07, "loss": -0.0006, "reward": 2.4999985694885254, "reward_std": 7.130839776436915e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3265 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.461139896373057, "grad_norm": 2.1380184276256635, "kl": 0.0963134765625, "learning_rate": 1.538860103626943e-07, "loss": 0.0009, "reward": 1.935023546218872, "reward_std": 0.00015133647866605315, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4350233972072601, "step": 3266 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.463730569948186, "grad_norm": 0.10293703058717056, "kl": 0.0369873046875, "learning_rate": 1.5362694300518134e-07, "loss": -0.0, "reward": 2.4999979734420776, "reward_std": 1.7728141301631695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3267 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.466321243523316, "grad_norm": 10.180751438446471, "kl": 0.143310546875, "learning_rate": 1.533678756476684e-07, "loss": 0.0007, "reward": 1.3551163077354431, "reward_std": 0.0013606666725536343, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8551163375377655, "step": 3268 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.468911917098445, "grad_norm": 0.11509265621239359, "kl": 0.08984375, "learning_rate": 1.5310880829015544e-07, "loss": 0.0014, "reward": 2.4999983310699463, "reward_std": 1.5731226881143812e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3269 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.471502590673575, "grad_norm": 0.9932307552913896, "kl": 0.1116943359375, "learning_rate": 1.5284974093264247e-07, "loss": 0.0003, "reward": 2.4999760389328003, "reward_std": 1.1579363672353793e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999759793281555, "step": 3270 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.474093264248705, "grad_norm": 0.2721035766607206, "kl": 0.07958984375, "learning_rate": 1.5259067357512952e-07, "loss": 0.0, "reward": 2.4999982118606567, "reward_std": 1.3245239642856177e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3271 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.476683937823834, "grad_norm": 0.8687197435331077, "kl": 0.09454345703125, "learning_rate": 1.5233160621761657e-07, "loss": 0.0004, "reward": 2.4999953508377075, "reward_std": 2.6398053023513057e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 3272 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.479274611398964, "grad_norm": 0.5301246583414705, "kl": 0.15087890625, "learning_rate": 1.5207253886010362e-07, "loss": 0.0015, "reward": 2.499996066093445, "reward_std": 2.6622250857144536e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3273 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.481865284974093, "grad_norm": 0.5948652714497739, "kl": 0.0438232421875, "learning_rate": 1.5181347150259065e-07, "loss": -0.0001, "reward": 2.4999945163726807, "reward_std": 4.837383301037335e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 3274 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.484455958549223, "grad_norm": 0.274313312394979, "kl": 0.050048828125, "learning_rate": 1.5155440414507773e-07, "loss": 0.0005, "reward": 2.499995708465576, "reward_std": 1.9275074123470404e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3275 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.487046632124352, "grad_norm": 0.17311506835698331, "kl": 0.0745849609375, "learning_rate": 1.5129533678756476e-07, "loss": 0.0003, "reward": 2.499997854232788, "reward_std": 1.71290804473756e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3276 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.489637305699482, "grad_norm": 0.20070724782434496, "kl": 0.07672119140625, "learning_rate": 1.5103626943005178e-07, "loss": 0.0009, "reward": 2.4999983310699463, "reward_std": 1.4916782902218984e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 3277 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.492227979274611, "grad_norm": 0.21979445378670176, "kl": 0.1546630859375, "learning_rate": 1.5077720207253886e-07, "loss": -0.0004, "reward": 2.4999979734420776, "reward_std": 2.182826392527204e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3278 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 8.494818652849741, "grad_norm": 238.2204835646883, "kl": 0.14990234375, "learning_rate": 1.505181347150259e-07, "loss": 0.0009, "reward": 1.903814673423767, "reward_std": 0.02668781653341057, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.403814822435379, "step": 3279 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.49740932642487, "grad_norm": 26.731851187313037, "kl": 0.100830078125, "learning_rate": 1.5025906735751294e-07, "loss": -0.0, "reward": 1.9923403859138489, "reward_std": 0.004486036114030867, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.492340475320816, "step": 3280 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.5, "grad_norm": 0.3031374041462288, "kl": 0.0941162109375, "learning_rate": 1.5e-07, "loss": 0.0004, "reward": 2.4999895095825195, "reward_std": 3.1613501505489694e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 3281 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.50259067357513, "grad_norm": 0.2185290058221287, "kl": 0.05419921875, "learning_rate": 1.4974093264248705e-07, "loss": -0.0002, "reward": 2.4999983310699463, "reward_std": 1.8197787738927218e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3282 }, { "clip_ratio": 0.0, "completion_length": 39.875, "epoch": 8.505181347150259, "grad_norm": 0.20245842904547912, "kl": 0.06329345703125, "learning_rate": 1.4948186528497407e-07, "loss": -0.0006, "reward": 2.4999974966049194, "reward_std": 1.7338460338578443e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3283 }, { "clip_ratio": 0.0, "completion_length": 53.0, "epoch": 8.507772020725389, "grad_norm": 0.06072766737466977, "kl": 0.134765625, "learning_rate": 1.4922279792746112e-07, "loss": 0.0026, "reward": 2.499997138977051, "reward_std": 1.2979166967852507e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3284 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.510362694300518, "grad_norm": 0.47034190249188784, "kl": 0.12255859375, "learning_rate": 1.4896373056994818e-07, "loss": 0.0005, "reward": 2.49999737739563, "reward_std": 2.01358000140317e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3285 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.512953367875648, "grad_norm": 0.1293201465534031, "kl": 0.029083251953125, "learning_rate": 1.487046632124352e-07, "loss": 0.0009, "reward": 2.499999165534973, "reward_std": 6.847895690498262e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 3286 }, { "clip_ratio": 0.0, "completion_length": 36.8125, "epoch": 8.515544041450777, "grad_norm": 1.0266762924845336, "kl": 0.03436279296875, "learning_rate": 1.4844559585492228e-07, "loss": 0.0006, "reward": 2.4999935626983643, "reward_std": 1.021530329126108e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993622303009, "step": 3287 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.518134715025907, "grad_norm": 2.419694381043532, "kl": 0.06640625, "learning_rate": 1.481865284974093e-07, "loss": 0.0014, "reward": 1.9989416003227234, "reward_std": 3.633499011357344e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989413917064667, "step": 3288 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.520725388601036, "grad_norm": 7.323675674440208, "kl": 0.087646484375, "learning_rate": 1.479274611398964e-07, "loss": -0.0003, "reward": 1.9991912841796875, "reward_std": 3.1830990451453545e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991913139820099, "step": 3289 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.523316062176166, "grad_norm": 27.554898655492387, "kl": 0.142333984375, "learning_rate": 1.4766839378238341e-07, "loss": 0.0001, "reward": 1.9223848581314087, "reward_std": 0.008517106322642576, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4223849475383759, "step": 3290 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.525906735751295, "grad_norm": 0.13564633142473642, "kl": 0.035430908203125, "learning_rate": 1.4740932642487047e-07, "loss": -0.0009, "reward": 2.499995708465576, "reward_std": 1.5725119624221406e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3291 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.528497409326425, "grad_norm": 0.05352376665976322, "kl": 0.03826904296875, "learning_rate": 1.4715025906735752e-07, "loss": 0.001, "reward": 2.4999990463256836, "reward_std": 9.836083734171552e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999225139618, "step": 3292 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.531088082901555, "grad_norm": 0.4574062067855682, "kl": 0.07080078125, "learning_rate": 1.4689119170984455e-07, "loss": -0.0002, "reward": 2.4999642372131348, "reward_std": 4.8269457408878225e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999644756317139, "step": 3293 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.533678756476684, "grad_norm": 0.3235814923783658, "kl": 0.0875244140625, "learning_rate": 1.466321243523316e-07, "loss": 0.0006, "reward": 2.499995708465576, "reward_std": 2.54376038810733e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 3294 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.536269430051814, "grad_norm": 0.13471625278881041, "kl": 0.06646728515625, "learning_rate": 1.4637305699481865e-07, "loss": 0.001, "reward": 2.49999737739563, "reward_std": 2.364278827826638e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3295 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.538860103626943, "grad_norm": 28.661828225711496, "kl": 0.06134033203125, "learning_rate": 1.461139896373057e-07, "loss": -0.0002, "reward": 2.499748110771179, "reward_std": 0.0003722159625567656, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997482299804688, "step": 3296 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.541450777202073, "grad_norm": 0.39713039325938365, "kl": 0.10595703125, "learning_rate": 1.4585492227979273e-07, "loss": 0.0006, "reward": 2.4999914169311523, "reward_std": 2.586898517620284e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 3297 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.544041450777202, "grad_norm": 0.5603666617154922, "kl": 0.05810546875, "learning_rate": 1.455958549222798e-07, "loss": 0.0004, "reward": 2.499997615814209, "reward_std": 2.3801551947144617e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3298 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.546632124352332, "grad_norm": 0.13270758275000594, "kl": 0.054534912109375, "learning_rate": 1.4533678756476684e-07, "loss": -0.0005, "reward": 2.4999977350234985, "reward_std": 1.646448481551488e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3299 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.549222797927461, "grad_norm": 0.18020369112368834, "kl": 0.08056640625, "learning_rate": 1.4507772020725386e-07, "loss": 0.0006, "reward": 2.4999961853027344, "reward_std": 2.9499061611204525e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3300 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.55181347150259, "grad_norm": 0.1694608883831519, "kl": 0.113525390625, "learning_rate": 1.4481865284974094e-07, "loss": -0.0003, "reward": 2.499997138977051, "reward_std": 1.9870407754751795e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3301 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.55440414507772, "grad_norm": 16.98651499060268, "kl": 0.177734375, "learning_rate": 1.4455958549222797e-07, "loss": 0.0007, "reward": 1.4979868531227112, "reward_std": 0.00014506246952805668, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9979867935180664, "step": 3302 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.55699481865285, "grad_norm": 5.119435456347419, "kl": 0.162841796875, "learning_rate": 1.4430051813471502e-07, "loss": 0.0004, "reward": 1.8213363885879517, "reward_std": 0.0004737289301601777, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.321336418390274, "step": 3303 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.55958549222798, "grad_norm": 0.16458292812135147, "kl": 0.071533203125, "learning_rate": 1.4404145077720207e-07, "loss": -0.0003, "reward": 2.499997854232788, "reward_std": 1.8587726344776456e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3304 }, { "clip_ratio": 0.0, "completion_length": 45.25, "epoch": 8.562176165803109, "grad_norm": 0.6869801943868943, "kl": 0.0987548828125, "learning_rate": 1.4378238341968913e-07, "loss": -0.0002, "reward": 2.4999964237213135, "reward_std": 4.1431040358475e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3305 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 8.564766839378239, "grad_norm": 39.57233589606633, "kl": 0.177734375, "learning_rate": 1.4352331606217615e-07, "loss": 0.001, "reward": 1.9977784156799316, "reward_std": 8.788574723439524e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4977784156799316, "step": 3306 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.567357512953368, "grad_norm": 0.264070134400206, "kl": 0.067138671875, "learning_rate": 1.432642487046632e-07, "loss": -0.0, "reward": 2.4999977350234985, "reward_std": 1.991577875060102e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3307 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.569948186528498, "grad_norm": 82.58706936402254, "kl": 0.14306640625, "learning_rate": 1.4300518134715026e-07, "loss": 0.001, "reward": 1.9631909132003784, "reward_std": 0.01243763975969614, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4631909132003784, "step": 3308 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.572538860103627, "grad_norm": 0.2983121610507963, "kl": 0.113037109375, "learning_rate": 1.4274611398963728e-07, "loss": 0.0006, "reward": 2.4999951124191284, "reward_std": 3.7372394672274822e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 3309 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.575129533678757, "grad_norm": 5.381177773574463, "kl": 0.0670166015625, "learning_rate": 1.4248704663212436e-07, "loss": 0.0004, "reward": 2.4993174076080322, "reward_std": 7.144213407173083e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9993174076080322, "step": 3310 }, { "clip_ratio": 0.0, "completion_length": 38.8125, "epoch": 8.577720207253886, "grad_norm": 1.6763897981650977, "kl": 0.0684814453125, "learning_rate": 1.422279792746114e-07, "loss": 0.0008, "reward": 2.499985098838806, "reward_std": 4.679659014072968e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985158443451, "step": 3311 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.580310880829016, "grad_norm": 0.0785406174453944, "kl": 0.0582275390625, "learning_rate": 1.4196891191709844e-07, "loss": 0.0016, "reward": 2.499996542930603, "reward_std": 1.4997315815890033e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3312 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.582901554404145, "grad_norm": 6.365635446658416, "kl": 0.1177978515625, "learning_rate": 1.417098445595855e-07, "loss": 0.0008, "reward": 1.9508140683174133, "reward_std": 0.008431362736189385, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4508141577243805, "step": 3313 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 8.585492227979275, "grad_norm": 12.662971493501352, "kl": 0.09716796875, "learning_rate": 1.4145077720207252e-07, "loss": 0.0004, "reward": 1.6775941848754883, "reward_std": 0.262736116303131, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1775941252708435, "step": 3314 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.588082901554404, "grad_norm": 0.49507997998738834, "kl": 0.18316650390625, "learning_rate": 1.4119170984455957e-07, "loss": 0.0013, "reward": 2.499996542930603, "reward_std": 3.401595847662975e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3315 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.590673575129534, "grad_norm": 11.177029621138917, "kl": 0.1204833984375, "learning_rate": 1.4093264248704663e-07, "loss": 0.0011, "reward": 2.4999808073043823, "reward_std": 1.4798311894992366e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999808073043823, "step": 3316 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.593264248704664, "grad_norm": 0.2764594258504331, "kl": 0.1114501953125, "learning_rate": 1.4067357512953368e-07, "loss": 0.0, "reward": 2.499995231628418, "reward_std": 2.3859020075178705e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 3317 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.595854922279793, "grad_norm": 0.11438985023525164, "kl": 0.06884765625, "learning_rate": 1.404145077720207e-07, "loss": 0.0012, "reward": 2.4999969005584717, "reward_std": 1.778905755145388e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 3318 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.598445595854923, "grad_norm": 0.208035527705028, "kl": 0.0792236328125, "learning_rate": 1.4015544041450778e-07, "loss": 0.001, "reward": 2.4999961853027344, "reward_std": 1.4428881343064859e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3319 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.601036269430052, "grad_norm": 0.06635636559720648, "kl": 0.09765625, "learning_rate": 1.398963730569948e-07, "loss": 0.0014, "reward": 2.499993920326233, "reward_std": 1.390188344885246e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 3320 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.603626943005182, "grad_norm": 0.2805949301165034, "kl": 0.10546875, "learning_rate": 1.3963730569948186e-07, "loss": 0.0004, "reward": 2.4999964237213135, "reward_std": 3.6220211541149183e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 3321 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.606217616580311, "grad_norm": 0.058405153242124336, "kl": 0.076416015625, "learning_rate": 1.3937823834196891e-07, "loss": 0.0022, "reward": 2.4999985694885254, "reward_std": 7.484358093279297e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3322 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.60880829015544, "grad_norm": 8.093760197008166, "kl": 0.090576171875, "learning_rate": 1.3911917098445594e-07, "loss": 0.0008, "reward": 1.956282138824463, "reward_std": 0.00032121398686513203, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.456282138824463, "step": 3323 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.61139896373057, "grad_norm": 1.8179044152443349, "kl": 0.0997314453125, "learning_rate": 1.38860103626943e-07, "loss": -0.0005, "reward": 2.4999927282333374, "reward_std": 4.202926334073709e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 3324 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.6139896373057, "grad_norm": 8.906746122642433, "kl": 0.08984375, "learning_rate": 1.3860103626943005e-07, "loss": 0.0, "reward": 2.4999760389328003, "reward_std": 2.513654771973961e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999761581420898, "step": 3325 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.61658031088083, "grad_norm": 0.07781780455736222, "kl": 0.0596923828125, "learning_rate": 1.383419689119171e-07, "loss": -0.0008, "reward": 2.49999737739563, "reward_std": 1.4165262314236315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3326 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.619170984455959, "grad_norm": 1.9789873980189034, "kl": 0.2451171875, "learning_rate": 1.3808290155440413e-07, "loss": 0.0014, "reward": 2.4999845027923584, "reward_std": 1.2820892607123824e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999843835830688, "step": 3327 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.621761658031089, "grad_norm": 0.23809006240546643, "kl": 0.046875, "learning_rate": 1.378238341968912e-07, "loss": 0.0006, "reward": 2.499993920326233, "reward_std": 2.6558788022157387e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 3328 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.624352331606218, "grad_norm": 0.04695361164990487, "kl": 0.05206298828125, "learning_rate": 1.3756476683937823e-07, "loss": 0.0002, "reward": 2.499998688697815, "reward_std": 7.430970327959585e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3329 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 8.626943005181348, "grad_norm": 26.89544123299959, "kl": 0.130615234375, "learning_rate": 1.3730569948186526e-07, "loss": 0.0014, "reward": 2.3437492847442627, "reward_std": 0.44194174508898243, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.8749992847442627, "step": 3330 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.629533678756477, "grad_norm": 0.8571876379507455, "kl": 0.053955078125, "learning_rate": 1.3704663212435234e-07, "loss": 0.0005, "reward": 2.499995231628418, "reward_std": 4.948442921204332e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 3331 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.632124352331607, "grad_norm": 4.752589418879249, "kl": 0.1282958984375, "learning_rate": 1.3678756476683936e-07, "loss": 0.0002, "reward": 1.9973363876342773, "reward_std": 7.790958551368021e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497336357831955, "step": 3332 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.634715025906736, "grad_norm": 1.1493226409041875, "kl": 0.1318359375, "learning_rate": 1.3652849740932641e-07, "loss": 0.0013, "reward": 2.4999921321868896, "reward_std": 5.30079489635682e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 3333 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.637305699481866, "grad_norm": 2.951115216265048, "kl": 0.1474609375, "learning_rate": 1.3626943005181347e-07, "loss": 0.0006, "reward": 1.9998269081115723, "reward_std": 2.3463563479708682e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998270869255066, "step": 3334 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.639896373056995, "grad_norm": 4.055866341934763, "kl": 0.1865234375, "learning_rate": 1.3601036269430052e-07, "loss": 0.0005, "reward": 2.2499709129333496, "reward_std": 0.26726628505889494, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499709129333496, "step": 3335 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.642487046632125, "grad_norm": 0.3943785986231596, "kl": 0.113525390625, "learning_rate": 1.3575129533678755e-07, "loss": 0.0017, "reward": 2.4999923706054688, "reward_std": 4.414145791997726e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 3336 }, { "clip_ratio": 0.0, "completion_length": 47.1875, "epoch": 8.645077720207254, "grad_norm": 1.5140139044635492, "kl": 0.1494140625, "learning_rate": 1.354922279792746e-07, "loss": 0.0003, "reward": 2.499987840652466, "reward_std": 1.2437555369615438e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999878406524658, "step": 3337 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.647668393782384, "grad_norm": 49.90038510795194, "kl": 0.056396484375, "learning_rate": 1.3523316062176165e-07, "loss": 0.0001, "reward": 2.3122596740722656, "reward_std": 0.25909857736655795, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122597336769104, "step": 3338 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.650259067357513, "grad_norm": 0.3638661244612433, "kl": 0.0792236328125, "learning_rate": 1.3497409326424868e-07, "loss": 0.0002, "reward": 2.499995708465576, "reward_std": 3.278967085407203e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 3339 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.652849740932643, "grad_norm": 0.13547404430373156, "kl": 0.0882568359375, "learning_rate": 1.3471502590673576e-07, "loss": -0.0008, "reward": 2.4999985694885254, "reward_std": 1.0313038103504368e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3340 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.655440414507773, "grad_norm": 6.8380461228515355, "kl": 0.04736328125, "learning_rate": 1.3445595854922278e-07, "loss": -0.001, "reward": 2.4999945163726807, "reward_std": 3.840118608877674e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 3341 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.658031088082902, "grad_norm": 18.708948224400306, "kl": 0.05584716796875, "learning_rate": 1.3419689119170984e-07, "loss": -0.0005, "reward": 1.995237410068512, "reward_std": 0.0010545150990992624, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4952374696731567, "step": 3342 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.660621761658032, "grad_norm": 8.962305417722943, "kl": 0.115478515625, "learning_rate": 1.339378238341969e-07, "loss": 0.0004, "reward": 1.9955896735191345, "reward_std": 8.276755579572637e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4955896735191345, "step": 3343 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 8.663212435233161, "grad_norm": 11.443941806543098, "kl": 0.088623046875, "learning_rate": 1.3367875647668391e-07, "loss": 0.0009, "reward": 1.9968776106834412, "reward_std": 0.000391440822511413, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4968776106834412, "step": 3344 }, { "clip_ratio": 0.0, "completion_length": 48.0, "epoch": 8.66580310880829, "grad_norm": 0.4473669545721388, "kl": 0.15283203125, "learning_rate": 1.3341968911917097e-07, "loss": -0.0003, "reward": 2.499997138977051, "reward_std": 2.367170964134857e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3345 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 8.66839378238342, "grad_norm": 4.419341946928744, "kl": 0.113525390625, "learning_rate": 1.3316062176165802e-07, "loss": -0.0001, "reward": 1.9127683639526367, "reward_std": 0.0002881284619888902, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.412768304347992, "step": 3346 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.67098445595855, "grad_norm": 0.04857628895427263, "kl": 0.0572509765625, "learning_rate": 1.3290155440414507e-07, "loss": -0.0008, "reward": 2.4999982118606567, "reward_std": 1.1235063368530973e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3347 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.67357512953368, "grad_norm": 56.313949904341804, "kl": 0.105224609375, "learning_rate": 1.326424870466321e-07, "loss": -0.0005, "reward": 2.437477707862854, "reward_std": 0.1768250900424846, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937477946281433, "step": 3348 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.676165803108809, "grad_norm": 0.036294736540385494, "kl": 0.072021484375, "learning_rate": 1.3238341968911918e-07, "loss": 0.0003, "reward": 2.4999992847442627, "reward_std": 5.495205215311216e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999995231628418, "step": 3349 }, { "clip_ratio": 0.0, "completion_length": 34.1875, "epoch": 8.678756476683938, "grad_norm": 0.11103514405422099, "kl": 0.13623046875, "learning_rate": 1.321243523316062e-07, "loss": 0.0004, "reward": 2.4999977350234985, "reward_std": 2.181939862566651e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3350 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 8.681347150259068, "grad_norm": 3.420022456929766, "kl": 0.0389404296875, "learning_rate": 1.3186528497409328e-07, "loss": 0.0007, "reward": 2.499994993209839, "reward_std": 3.2474370073032333e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 3351 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.683937823834198, "grad_norm": 0.2667511662180898, "kl": 0.0284423828125, "learning_rate": 1.316062176165803e-07, "loss": -0.0005, "reward": 2.4999953508377075, "reward_std": 3.268518526056141e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 3352 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.686528497409327, "grad_norm": 0.31117281092758015, "kl": 0.0955810546875, "learning_rate": 1.3134715025906734e-07, "loss": -0.0002, "reward": 2.499996304512024, "reward_std": 1.919219272394912e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3353 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.689119170984457, "grad_norm": 0.184572407281124, "kl": 0.0657958984375, "learning_rate": 1.3108808290155442e-07, "loss": 0.0006, "reward": 2.499998092651367, "reward_std": 1.8382514213044487e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3354 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.691709844559586, "grad_norm": 20.63697861668671, "kl": 0.198486328125, "learning_rate": 1.3082901554404144e-07, "loss": 0.0006, "reward": 1.9931102991104126, "reward_std": 0.0003838316008568654, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4931102991104126, "step": 3355 }, { "clip_ratio": 0.0, "completion_length": 45.75, "epoch": 8.694300518134716, "grad_norm": 0.4368001134485287, "kl": 0.121337890625, "learning_rate": 1.305699481865285e-07, "loss": 0.0003, "reward": 2.499992847442627, "reward_std": 3.2451662264065817e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 3356 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.696891191709845, "grad_norm": 19.930907945982003, "kl": 0.089599609375, "learning_rate": 1.3031088082901555e-07, "loss": 0.0005, "reward": 2.123736619949341, "reward_std": 0.23189840534178074, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.623736560344696, "step": 3357 }, { "clip_ratio": 0.0, "completion_length": 40.0, "epoch": 8.699481865284975, "grad_norm": 0.059735013778231094, "kl": 0.113037109375, "learning_rate": 1.300518134715026e-07, "loss": 0.0003, "reward": 2.49999737739563, "reward_std": 1.707848980458948e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3358 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.702072538860104, "grad_norm": 0.12847861963250928, "kl": 0.1171875, "learning_rate": 1.2979274611398963e-07, "loss": 0.0003, "reward": 2.499993681907654, "reward_std": 1.934683766080525e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 3359 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 8.704663212435234, "grad_norm": 40.717174581490504, "kl": 0.08154296875, "learning_rate": 1.2953367875647668e-07, "loss": 0.001, "reward": 2.185337245464325, "reward_std": 0.2605653462869668, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6853373646736145, "step": 3360 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.707253886010363, "grad_norm": 0.04449974108030537, "kl": 0.060302734375, "learning_rate": 1.2927461139896373e-07, "loss": -0.0005, "reward": 2.499998450279236, "reward_std": 7.502982555251947e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3361 }, { "clip_ratio": 0.0, "completion_length": 47.8125, "epoch": 8.709844559585493, "grad_norm": 3.831710417835987, "kl": 0.176513671875, "learning_rate": 1.2901554404145076e-07, "loss": 0.0007, "reward": 2.499067783355713, "reward_std": 7.370854575583508e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9990676641464233, "step": 3362 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 8.712435233160623, "grad_norm": 54.451816218693736, "kl": 0.14697265625, "learning_rate": 1.2875647668393784e-07, "loss": 0.0006, "reward": 1.3282039761543274, "reward_std": 0.013426024932414293, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8282040357589722, "step": 3363 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.715025906735752, "grad_norm": 16.134442220217526, "kl": 0.1011962890625, "learning_rate": 1.2849740932642486e-07, "loss": 0.0004, "reward": 1.9970440864562988, "reward_std": 0.00011469001435671089, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970441460609436, "step": 3364 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.717616580310882, "grad_norm": 0.16339440942813072, "kl": 0.13037109375, "learning_rate": 1.2823834196891192e-07, "loss": 0.001, "reward": 2.499998927116394, "reward_std": 1.5587531265737198e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3365 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.720207253886011, "grad_norm": 0.8988508269346204, "kl": 0.349609375, "learning_rate": 1.2797927461139897e-07, "loss": 0.0017, "reward": 2.4999947547912598, "reward_std": 4.9453173005531426e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3366 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.72279792746114, "grad_norm": 0.09526017793629643, "kl": 0.07720947265625, "learning_rate": 1.27720207253886e-07, "loss": 0.001, "reward": 2.499997615814209, "reward_std": 1.8836039430425444e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3367 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.72538860103627, "grad_norm": 0.2678480897622461, "kl": 0.0572509765625, "learning_rate": 1.2746113989637305e-07, "loss": 0.0004, "reward": 2.499995708465576, "reward_std": 2.8946026304765837e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3368 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.7279792746114, "grad_norm": 0.4185900802236819, "kl": 0.08251953125, "learning_rate": 1.272020725388601e-07, "loss": 0.0006, "reward": 2.499998688697815, "reward_std": 1.3226051009951334e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3369 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 8.73056994818653, "grad_norm": 0.7025746019544262, "kl": 0.14453125, "learning_rate": 1.2694300518134715e-07, "loss": 0.0004, "reward": 2.4999938011169434, "reward_std": 5.127304916641151e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 3370 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.733160621761659, "grad_norm": 0.9309730714805926, "kl": 0.0557861328125, "learning_rate": 1.2668393782383418e-07, "loss": -0.0002, "reward": 2.499995470046997, "reward_std": 4.348956167632423e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3371 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.735751295336787, "grad_norm": 3.0856005310878243, "kl": 0.04119873046875, "learning_rate": 1.2642487046632126e-07, "loss": 0.0002, "reward": 1.9998868703842163, "reward_std": 5.602595081199979e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499886691570282, "step": 3372 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.738341968911918, "grad_norm": 0.22363835251756556, "kl": 0.1064453125, "learning_rate": 1.2616580310880828e-07, "loss": -0.0003, "reward": 2.4999969005584717, "reward_std": 2.354826108330599e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3373 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.740932642487046, "grad_norm": 0.11785549190942043, "kl": 0.04010009765625, "learning_rate": 1.259067357512953e-07, "loss": 0.0002, "reward": 2.4999961853027344, "reward_std": 2.102847361129534e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3374 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.743523316062177, "grad_norm": 11.256786602551614, "kl": 0.13818359375, "learning_rate": 1.256476683937824e-07, "loss": 0.0006, "reward": 1.4374878406524658, "reward_std": 0.17680266499519348, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9374878406524658, "step": 3375 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.746113989637305, "grad_norm": 0.44289856924383586, "kl": 0.117919921875, "learning_rate": 1.2538860103626942e-07, "loss": 0.0002, "reward": 2.499998688697815, "reward_std": 7.707761113806555e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 3376 }, { "clip_ratio": 0.0, "completion_length": 41.8125, "epoch": 8.748704663212436, "grad_norm": 4.608323444994204, "kl": 0.064208984375, "learning_rate": 1.2512953367875647e-07, "loss": 0.0007, "reward": 2.49997079372406, "reward_std": 1.1472743949525466e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999707341194153, "step": 3377 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.751295336787564, "grad_norm": 1.265491080339265, "kl": 0.28125, "learning_rate": 1.2487046632124352e-07, "loss": 0.0016, "reward": 2.499990701675415, "reward_std": 1.1868063154452102e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905228614807, "step": 3378 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.753886010362695, "grad_norm": 6.596425927913146, "kl": 0.121337890625, "learning_rate": 1.2461139896373057e-07, "loss": -0.0, "reward": 2.4999836683273315, "reward_std": 1.2598140983755002e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999837279319763, "step": 3379 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.756476683937823, "grad_norm": 0.19542602854658486, "kl": 0.1376953125, "learning_rate": 1.243523316062176e-07, "loss": 0.0005, "reward": 2.49999737739563, "reward_std": 2.0361762551601714e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3380 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.759067357512954, "grad_norm": 42.79757819601643, "kl": 0.101318359375, "learning_rate": 1.2409326424870465e-07, "loss": 0.0006, "reward": 2.2495840191841125, "reward_std": 0.2676972923127323, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7495840787887573, "step": 3381 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.761658031088082, "grad_norm": 0.12033499108595715, "kl": 0.0654296875, "learning_rate": 1.238341968911917e-07, "loss": -0.0002, "reward": 2.4999923706054688, "reward_std": 3.16703500402582e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 3382 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.764248704663213, "grad_norm": 0.9462702746943731, "kl": 0.13330078125, "learning_rate": 1.2357512953367876e-07, "loss": 0.0008, "reward": 1.9998219013214111, "reward_std": 8.171455533556582e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998217225074768, "step": 3383 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.766839378238341, "grad_norm": 0.36757188071895547, "kl": 0.05975341796875, "learning_rate": 1.233160621761658e-07, "loss": 0.0012, "reward": 2.499996066093445, "reward_std": 2.5022278578035184e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3384 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.76943005181347, "grad_norm": 0.05987451481340372, "kl": 0.057373046875, "learning_rate": 1.2305699481865284e-07, "loss": 0.001, "reward": 2.4999985694885254, "reward_std": 1.4483213988114585e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3385 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.7720207253886, "grad_norm": 0.3285659531112559, "kl": 0.0595703125, "learning_rate": 1.227979274611399e-07, "loss": 0.0008, "reward": 2.499996542930603, "reward_std": 1.925674524727583e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3386 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.77461139896373, "grad_norm": 0.15652080217531852, "kl": 0.074462890625, "learning_rate": 1.2253886010362694e-07, "loss": 0.001, "reward": 2.4999983310699463, "reward_std": 1.5194416960184753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3387 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.77720207253886, "grad_norm": 21.36046033183248, "kl": 0.1455078125, "learning_rate": 1.22279792746114e-07, "loss": 0.0012, "reward": 1.999458909034729, "reward_std": 0.00010207260254446737, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994587898254395, "step": 3388 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.779792746113989, "grad_norm": 0.05677383597127939, "kl": 0.0341796875, "learning_rate": 1.2202072538860102e-07, "loss": 0.0001, "reward": 2.499998092651367, "reward_std": 7.984220928847208e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3389 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.782383419689118, "grad_norm": 0.1728357569494381, "kl": 0.02960205078125, "learning_rate": 1.2176165803108807e-07, "loss": -0.0011, "reward": 2.499997854232788, "reward_std": 1.5084495998962666e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3390 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.784974093264248, "grad_norm": 0.8322912589918195, "kl": 0.1436767578125, "learning_rate": 1.2150259067357513e-07, "loss": 0.0005, "reward": 2.4999940395355225, "reward_std": 1.027613348014711e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 3391 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.787564766839377, "grad_norm": 0.2924989392441589, "kl": 0.0438232421875, "learning_rate": 1.2124352331606218e-07, "loss": -0.0003, "reward": 2.4999982118606567, "reward_std": 1.2748843118970399e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3392 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.790155440414507, "grad_norm": 0.08526060872944677, "kl": 0.0777587890625, "learning_rate": 1.209844559585492e-07, "loss": -0.0003, "reward": 2.4999983310699463, "reward_std": 1.6211890851991484e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3393 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.792746113989637, "grad_norm": 0.4081431801515191, "kl": 0.05792236328125, "learning_rate": 1.2072538860103626e-07, "loss": 0.0008, "reward": 2.499997615814209, "reward_std": 2.8291184435147443e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3394 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.795336787564766, "grad_norm": 0.5048462409199818, "kl": 0.091796875, "learning_rate": 1.204663212435233e-07, "loss": 0.0011, "reward": 2.4999951124191284, "reward_std": 1.919993337651249e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 3395 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.797927461139896, "grad_norm": 165.60813132889285, "kl": 0.17724609375, "learning_rate": 1.2020725388601036e-07, "loss": 0.0012, "reward": 2.4373884201049805, "reward_std": 0.17709052570899075, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373885989189148, "step": 3396 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.800518134715025, "grad_norm": 13.38675614812453, "kl": 0.1004638671875, "learning_rate": 1.1994818652849742e-07, "loss": 0.0001, "reward": 1.9794475436210632, "reward_std": 0.0005111402032866863, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4794476330280304, "step": 3397 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.803108808290155, "grad_norm": 0.12830292859971593, "kl": 0.10546875, "learning_rate": 1.1968911917098444e-07, "loss": -0.0003, "reward": 2.499998450279236, "reward_std": 1.2960493336322543e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3398 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.805699481865284, "grad_norm": 0.2632599901061041, "kl": 0.06494140625, "learning_rate": 1.194300518134715e-07, "loss": 0.0001, "reward": 2.499996304512024, "reward_std": 3.7108378592165536e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3399 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.808290155440414, "grad_norm": 0.32182294398252104, "kl": 0.150390625, "learning_rate": 1.1917098445595853e-07, "loss": 0.0013, "reward": 2.499996781349182, "reward_std": 2.3660023771299166e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3400 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.810880829015543, "grad_norm": 80.67172462890012, "kl": 0.10986328125, "learning_rate": 1.1891191709844559e-07, "loss": 0.0004, "reward": 2.31146240234375, "reward_std": 0.26020324678347606, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.81146240234375, "step": 3401 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.813471502590673, "grad_norm": 60.03732416394882, "kl": 0.087646484375, "learning_rate": 1.1865284974093264e-07, "loss": 0.0007, "reward": 1.997279167175293, "reward_std": 0.0013432932736350267, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4972789287567139, "step": 3402 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.816062176165802, "grad_norm": 0.6145585750110337, "kl": 0.0965576171875, "learning_rate": 1.1839378238341968e-07, "loss": -0.0006, "reward": 2.499995708465576, "reward_std": 3.1214149771585653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3403 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.818652849740932, "grad_norm": 0.5407580642620602, "kl": 0.04510498046875, "learning_rate": 1.1813471502590673e-07, "loss": 0.0006, "reward": 2.4999951124191284, "reward_std": 3.586274488043273e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 3404 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.821243523316062, "grad_norm": 5.188672480969817, "kl": 0.0654296875, "learning_rate": 1.1787564766839378e-07, "loss": -0.0004, "reward": 1.9987338185310364, "reward_std": 6.631235225995624e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987339973449707, "step": 3405 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.823834196891191, "grad_norm": 10.660763395294747, "kl": 0.161865234375, "learning_rate": 1.1761658031088082e-07, "loss": 0.0009, "reward": 1.9370936155319214, "reward_std": 0.17760935842488834, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4370936751365662, "step": 3406 }, { "clip_ratio": 0.0, "completion_length": 48.8125, "epoch": 8.82642487046632, "grad_norm": 0.45014410588966747, "kl": 0.21337890625, "learning_rate": 1.1735751295336788e-07, "loss": 0.0014, "reward": 2.4999918937683105, "reward_std": 7.867948397688451e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 3407 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.82901554404145, "grad_norm": 0.09469335705849251, "kl": 0.14208984375, "learning_rate": 1.1709844559585492e-07, "loss": 0.0001, "reward": 2.499998092651367, "reward_std": 1.371992425447388e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3408 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.83160621761658, "grad_norm": 0.15280176317237354, "kl": 0.063323974609375, "learning_rate": 1.1683937823834196e-07, "loss": -0.0, "reward": 2.499998688697815, "reward_std": 1.1230069105749863e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3409 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.83419689119171, "grad_norm": 2.05087528462175, "kl": 0.18603515625, "learning_rate": 1.1658031088082901e-07, "loss": 0.001, "reward": 2.4999959468841553, "reward_std": 3.36249252086418e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3410 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.836787564766839, "grad_norm": 0.5628117142108546, "kl": 0.083740234375, "learning_rate": 1.1632124352331606e-07, "loss": -0.0005, "reward": 2.4999901056289673, "reward_std": 4.7004726866362034e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990165233612, "step": 3411 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.839378238341968, "grad_norm": 0.768220825866974, "kl": 0.122314453125, "learning_rate": 1.160621761658031e-07, "loss": 0.0001, "reward": 2.499987483024597, "reward_std": 4.2993326587748015e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999876022338867, "step": 3412 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.841968911917098, "grad_norm": 0.26071272546140967, "kl": 0.066162109375, "learning_rate": 1.1580310880829015e-07, "loss": -0.0002, "reward": 2.4999951124191284, "reward_std": 2.955199306597933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 3413 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.844559585492227, "grad_norm": 0.07892520924077684, "kl": 0.018707275390625, "learning_rate": 1.155440414507772e-07, "loss": -0.0011, "reward": 2.499998688697815, "reward_std": 6.743495504224484e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 3414 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.847150259067357, "grad_norm": 0.5028935263467134, "kl": 0.0673828125, "learning_rate": 1.1528497409326423e-07, "loss": 0.0005, "reward": 2.499994158744812, "reward_std": 5.5921011608006665e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 3415 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.849740932642487, "grad_norm": 33.31898531546319, "kl": 0.10546875, "learning_rate": 1.1502590673575128e-07, "loss": 0.0007, "reward": 2.2497164011001587, "reward_std": 0.26756066989912597, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749716341495514, "step": 3416 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.852331606217616, "grad_norm": 0.892766548310473, "kl": 0.085693359375, "learning_rate": 1.1476683937823834e-07, "loss": -0.0014, "reward": 2.4999935626983643, "reward_std": 3.214002163076657e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 3417 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.854922279792746, "grad_norm": 0.4090976160476408, "kl": 0.11083984375, "learning_rate": 1.1450777202072538e-07, "loss": 0.0002, "reward": 2.49999463558197, "reward_std": 6.088661194780798e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3418 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 8.857512953367875, "grad_norm": 5.631578722860013, "kl": 0.095703125, "learning_rate": 1.1424870466321243e-07, "loss": -0.0009, "reward": 2.499959945678711, "reward_std": 1.97009568410067e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999600648880005, "step": 3419 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.860103626943005, "grad_norm": 0.09383342802060476, "kl": 0.0318603515625, "learning_rate": 1.1398963730569948e-07, "loss": -0.0003, "reward": 2.4999979734420776, "reward_std": 1.4170338431540586e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3420 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.862694300518134, "grad_norm": 57.831823187800005, "kl": 0.060791015625, "learning_rate": 1.1373056994818652e-07, "loss": 0.0001, "reward": 2.249638259410858, "reward_std": 0.2676438256274878, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7496383786201477, "step": 3421 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.865284974093264, "grad_norm": 0.25165882701740644, "kl": 0.149658203125, "learning_rate": 1.1347150259067357e-07, "loss": 0.0005, "reward": 2.4999964237213135, "reward_std": 2.1534369238906947e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3422 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.867875647668393, "grad_norm": 0.3796630362909095, "kl": 0.113037109375, "learning_rate": 1.1321243523316061e-07, "loss": 0.0018, "reward": 2.4999964237213135, "reward_std": 3.4260993402313034e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3423 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.870466321243523, "grad_norm": 0.12615538698269674, "kl": 0.073974609375, "learning_rate": 1.1295336787564767e-07, "loss": -0.0001, "reward": 2.4999942779541016, "reward_std": 1.7638520830587368e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 3424 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 8.873056994818652, "grad_norm": 50.21156532251137, "kl": 0.3167724609375, "learning_rate": 1.126943005181347e-07, "loss": 0.0016, "reward": 2.403740406036377, "reward_std": 0.27225524526721756, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9037404656410217, "step": 3425 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.875647668393782, "grad_norm": 0.38140227885333083, "kl": 0.09033203125, "learning_rate": 1.1243523316062176e-07, "loss": -0.0005, "reward": 2.4999961853027344, "reward_std": 3.3243033499275043e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3426 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.878238341968911, "grad_norm": 0.07415055538579222, "kl": 0.0780029296875, "learning_rate": 1.1217616580310881e-07, "loss": 0.0006, "reward": 2.499999165534973, "reward_std": 9.630867054966075e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999994039535522, "step": 3427 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.880829015544041, "grad_norm": 2.3937230990189824, "kl": 0.150390625, "learning_rate": 1.1191709844559585e-07, "loss": -0.0005, "reward": 1.9993305802345276, "reward_std": 2.9648327881659498e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993306994438171, "step": 3428 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.88341968911917, "grad_norm": 0.07476661386735531, "kl": 0.0372772216796875, "learning_rate": 1.116580310880829e-07, "loss": 0.001, "reward": 2.4999982118606567, "reward_std": 1.4147680928999762e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3429 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.8860103626943, "grad_norm": 14.538641415441944, "kl": 0.0775146484375, "learning_rate": 1.1139896373056994e-07, "loss": 0.0005, "reward": 2.0550423860549927, "reward_std": 0.17987008192631038, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5550423860549927, "step": 3430 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.88860103626943, "grad_norm": 0.13118629995293477, "kl": 0.106689453125, "learning_rate": 1.1113989637305698e-07, "loss": 0.0007, "reward": 2.499998092651367, "reward_std": 1.1748778376841074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3431 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.89119170984456, "grad_norm": 39.83248026857953, "kl": 0.13720703125, "learning_rate": 1.1088082901554403e-07, "loss": 0.0003, "reward": 2.1854028701782227, "reward_std": 0.2605106985861312, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.685403048992157, "step": 3432 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.893782383419689, "grad_norm": 23.279695502371585, "kl": 0.0908203125, "learning_rate": 1.1062176165803109e-07, "loss": 0.0001, "reward": 1.751012921333313, "reward_std": 0.0035747609254030976, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2510128915309906, "step": 3433 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.896373056994818, "grad_norm": 305.47275617127116, "kl": 0.1151123046875, "learning_rate": 1.1036269430051813e-07, "loss": 0.0003, "reward": 1.9565163850784302, "reward_std": 0.004551374027016664, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.456516444683075, "step": 3434 }, { "clip_ratio": 0.0, "completion_length": 43.4375, "epoch": 8.898963730569948, "grad_norm": 0.2841693813663103, "kl": 0.099365234375, "learning_rate": 1.1010362694300518e-07, "loss": -0.0, "reward": 2.4999982118606567, "reward_std": 1.935722991674993e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3435 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.901554404145077, "grad_norm": 0.1306413722252409, "kl": 0.10400390625, "learning_rate": 1.0984455958549223e-07, "loss": 0.0001, "reward": 2.499997138977051, "reward_std": 1.7340373688057298e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3436 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.904145077720207, "grad_norm": 1.840009699811808, "kl": 0.062744140625, "learning_rate": 1.0958549222797927e-07, "loss": -0.0006, "reward": 2.499997854232788, "reward_std": 1.5078662158884981e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3437 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.906735751295336, "grad_norm": 6.592015396276655, "kl": 0.073486328125, "learning_rate": 1.0932642487046631e-07, "loss": 0.0003, "reward": 1.9908595085144043, "reward_std": 5.952446451829019e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4908595085144043, "step": 3438 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.909326424870466, "grad_norm": 0.4098365770800571, "kl": 0.113037109375, "learning_rate": 1.0906735751295336e-07, "loss": 0.0002, "reward": 2.499993920326233, "reward_std": 6.812910100961744e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 3439 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.911917098445596, "grad_norm": 1.8168590398290307, "kl": 0.0726318359375, "learning_rate": 1.088082901554404e-07, "loss": 0.0012, "reward": 2.4999972581863403, "reward_std": 2.019234045746998e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3440 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.914507772020725, "grad_norm": 0.3264402407270602, "kl": 0.181640625, "learning_rate": 1.0854922279792746e-07, "loss": -0.001, "reward": 2.4999966621398926, "reward_std": 1.7722117036100826e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3441 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.917098445595855, "grad_norm": 49.714620718268115, "kl": 0.154296875, "learning_rate": 1.0829015544041451e-07, "loss": -0.0002, "reward": 1.997012436389923, "reward_std": 0.000704359028759427, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970125555992126, "step": 3442 }, { "clip_ratio": 0.0, "completion_length": 41.0625, "epoch": 8.919689119170984, "grad_norm": 0.23795399292179215, "kl": 0.0362548828125, "learning_rate": 1.0803108808290155e-07, "loss": -0.0009, "reward": 2.4999982118606567, "reward_std": 9.619128036320035e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3443 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.922279792746114, "grad_norm": 0.13597618892909877, "kl": 0.0701904296875, "learning_rate": 1.077720207253886e-07, "loss": 0.001, "reward": 2.499997854232788, "reward_std": 1.822109794602511e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3444 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.924870466321243, "grad_norm": 27.131290233905382, "kl": 0.104248046875, "learning_rate": 1.0751295336787564e-07, "loss": -0.0, "reward": 1.751234531402588, "reward_std": 0.0011124392432861896, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2512346804141998, "step": 3445 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.927461139896373, "grad_norm": 0.6480970974123776, "kl": 0.10546875, "learning_rate": 1.0725388601036268e-07, "loss": 0.0007, "reward": 2.4999947547912598, "reward_std": 4.239919121573621e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 3446 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 8.930051813471502, "grad_norm": 0.31832333588833966, "kl": 0.084716796875, "learning_rate": 1.0699481865284973e-07, "loss": 0.0017, "reward": 2.499987483024597, "reward_std": 4.957127089255664e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874830245972, "step": 3447 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.932642487046632, "grad_norm": 0.415100278840218, "kl": 0.0491943359375, "learning_rate": 1.0673575129533678e-07, "loss": 0.0002, "reward": 1.99990975856781, "reward_std": 6.170688550355408e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999096989631653, "step": 3448 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.935233160621761, "grad_norm": 0.20699318694489552, "kl": 0.0596923828125, "learning_rate": 1.0647668393782382e-07, "loss": -0.0, "reward": 2.4999951124191284, "reward_std": 2.5025476020346105e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 3449 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.937823834196891, "grad_norm": 0.04913026804515559, "kl": 0.132080078125, "learning_rate": 1.0621761658031088e-07, "loss": 0.002, "reward": 2.4999990463256836, "reward_std": 1.5792405747561133e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3450 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.94041450777202, "grad_norm": 0.3674593768269718, "kl": 0.11767578125, "learning_rate": 1.0595854922279793e-07, "loss": 0.001, "reward": 2.4999938011169434, "reward_std": 3.11776619810189e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 3451 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.94300518134715, "grad_norm": 7.8264951162352, "kl": 0.14013671875, "learning_rate": 1.0569948186528498e-07, "loss": 0.0008, "reward": 1.9998313188552856, "reward_std": 2.6790382378294453e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998313784599304, "step": 3452 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.94559585492228, "grad_norm": 0.33998372393882526, "kl": 0.0526123046875, "learning_rate": 1.0544041450777201e-07, "loss": 0.0001, "reward": 2.499995470046997, "reward_std": 3.307970700916485e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3453 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.94818652849741, "grad_norm": 0.3741917166895687, "kl": 0.03631591796875, "learning_rate": 1.0518134715025906e-07, "loss": -0.0004, "reward": 2.4999979734420776, "reward_std": 9.602138391073822e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3454 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.950777202072539, "grad_norm": 0.08974220876390841, "kl": 0.0753173828125, "learning_rate": 1.0492227979274611e-07, "loss": 0.0009, "reward": 2.4999974966049194, "reward_std": 1.2093374834876158e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3455 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.953367875647668, "grad_norm": 1.8986997007782562, "kl": 0.2109375, "learning_rate": 1.0466321243523315e-07, "loss": 0.0002, "reward": 2.49998676776886, "reward_std": 5.239804465873021e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999868273735046, "step": 3456 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.955958549222798, "grad_norm": 0.08584888481750046, "kl": 0.0408935546875, "learning_rate": 1.044041450777202e-07, "loss": 0.0006, "reward": 2.4999969005584717, "reward_std": 1.162014342526163e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3457 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.958549222797927, "grad_norm": 0.6395217936837021, "kl": 0.0770263671875, "learning_rate": 1.0414507772020726e-07, "loss": 0.0009, "reward": 2.499996542930603, "reward_std": 2.7739946517613134e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3458 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.961139896373057, "grad_norm": 2.203796670827671, "kl": 0.091064453125, "learning_rate": 1.038860103626943e-07, "loss": -0.0, "reward": 2.4999269247055054, "reward_std": 5.2807251677222666e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999269843101501, "step": 3459 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.963730569948186, "grad_norm": 37.58097015080119, "kl": 0.134490966796875, "learning_rate": 1.0362694300518134e-07, "loss": -0.0, "reward": 1.8061460256576538, "reward_std": 0.0032029268331825733, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3061461448669434, "step": 3460 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.966321243523316, "grad_norm": 0.7620339922795119, "kl": 0.08721923828125, "learning_rate": 1.0336787564766839e-07, "loss": 0.0006, "reward": 2.499994397163391, "reward_std": 5.155139888302074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 3461 }, { "clip_ratio": 0.0, "completion_length": 45.875, "epoch": 8.968911917098445, "grad_norm": 0.5165651999021758, "kl": 0.110107421875, "learning_rate": 1.0310880829015543e-07, "loss": 0.0012, "reward": 2.4999966621398926, "reward_std": 3.1055558338266565e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3462 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.971502590673575, "grad_norm": 0.1851567766775072, "kl": 0.07421875, "learning_rate": 1.0284974093264248e-07, "loss": -0.0, "reward": 2.4999988079071045, "reward_std": 1.8109472250671388e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3463 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.974093264248705, "grad_norm": 8.599445050619556, "kl": 0.126220703125, "learning_rate": 1.0259067357512953e-07, "loss": 0.0005, "reward": 1.055208683013916, "reward_std": 0.0010587091092020273, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.5552087128162384, "step": 3464 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.976683937823834, "grad_norm": 0.29040727539042227, "kl": 0.088134765625, "learning_rate": 1.0233160621761657e-07, "loss": 0.0, "reward": 2.4999969005584717, "reward_std": 3.972999877532857e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 3465 }, { "clip_ratio": 0.0, "completion_length": 37.25, "epoch": 8.979274611398964, "grad_norm": 0.11796377359458138, "kl": 0.0869140625, "learning_rate": 1.0207253886010363e-07, "loss": 0.0015, "reward": 2.4999985694885254, "reward_std": 1.0261053375870688e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3466 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.981865284974093, "grad_norm": 0.4193813967313966, "kl": 0.101806640625, "learning_rate": 1.0181347150259068e-07, "loss": 0.0013, "reward": 2.4999959468841553, "reward_std": 6.422787009796593e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3467 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.984455958549223, "grad_norm": 0.3438863961813382, "kl": 0.06005859375, "learning_rate": 1.015544041450777e-07, "loss": 0.0007, "reward": 2.49999737739563, "reward_std": 1.4675625266136194e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3468 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.987046632124352, "grad_norm": 1.4686867994494883, "kl": 0.11376953125, "learning_rate": 1.0129533678756476e-07, "loss": 0.0009, "reward": 1.9993655681610107, "reward_std": 2.673666926966689e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499365508556366, "step": 3469 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.989637305699482, "grad_norm": 0.10049110148050595, "kl": 0.10546875, "learning_rate": 1.0103626943005181e-07, "loss": 0.0001, "reward": 2.4999979734420776, "reward_std": 2.0879641624560463e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3470 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.992227979274611, "grad_norm": 23.388200454285705, "kl": 0.0751953125, "learning_rate": 1.0077720207253885e-07, "loss": 0.0001, "reward": 2.499901294708252, "reward_std": 7.481423324406933e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999011754989624, "step": 3471 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.994818652849741, "grad_norm": 0.09759439802902986, "kl": 0.067138671875, "learning_rate": 1.005181347150259e-07, "loss": -0.0002, "reward": 2.499998450279236, "reward_std": 1.6753554064052878e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3472 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.99740932642487, "grad_norm": 0.5377190158466522, "kl": 0.05615234375, "learning_rate": 1.0025906735751296e-07, "loss": 0.0001, "reward": 2.4999964237213135, "reward_std": 2.7389844490244286e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3473 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.0, "grad_norm": 0.3111671622093799, "kl": 0.07177734375, "learning_rate": 1e-07, "loss": -0.0008, "reward": 2.499998688697815, "reward_std": 1.519774428970777e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3474 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.00259067357513, "grad_norm": 0.06883466555507047, "kl": 0.068603515625, "learning_rate": 9.974093264248703e-08, "loss": -0.0003, "reward": 2.499998450279236, "reward_std": 8.96821916285262e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3475 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.005181347150259, "grad_norm": 0.09940027718336261, "kl": 0.0625, "learning_rate": 9.948186528497409e-08, "loss": -0.0, "reward": 2.499998688697815, "reward_std": 1.8173992373249348e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3476 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 9.007772020725389, "grad_norm": 0.7744298343394065, "kl": 0.2666015625, "learning_rate": 9.922279792746113e-08, "loss": 0.0018, "reward": 2.499996066093445, "reward_std": 2.7115992793369514e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 3477 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.010362694300518, "grad_norm": 0.07299487359906305, "kl": 0.061279296875, "learning_rate": 9.896373056994818e-08, "loss": 0.001, "reward": 2.499998092651367, "reward_std": 1.0244057762065495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3478 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.012953367875648, "grad_norm": 0.5728707937313825, "kl": 0.077392578125, "learning_rate": 9.870466321243523e-08, "loss": -0.0003, "reward": 2.4999940395355225, "reward_std": 5.230063607086777e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 3479 }, { "clip_ratio": 0.0, "completion_length": 44.875, "epoch": 9.015544041450777, "grad_norm": 0.577585519870004, "kl": 0.1171875, "learning_rate": 9.844559585492227e-08, "loss": 0.0007, "reward": 2.4999920129776, "reward_std": 4.07921413625445e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 3480 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.018134715025907, "grad_norm": 1.9030463922331389, "kl": 0.2021484375, "learning_rate": 9.818652849740932e-08, "loss": -0.0, "reward": 2.499991774559021, "reward_std": 1.0892174657328724e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 3481 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.020725388601036, "grad_norm": 2.384031158476782, "kl": 0.129638671875, "learning_rate": 9.792746113989638e-08, "loss": 0.0004, "reward": 1.9991512894630432, "reward_std": 3.719640750432518e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499151349067688, "step": 3482 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.023316062176166, "grad_norm": 0.21001529153364712, "kl": 0.0721435546875, "learning_rate": 9.76683937823834e-08, "loss": -0.0006, "reward": 2.4999951124191284, "reward_std": 2.012261745676369e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 3483 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 9.025906735751295, "grad_norm": 0.3503592490814614, "kl": 0.100341796875, "learning_rate": 9.740932642487046e-08, "loss": 0.0008, "reward": 2.499996066093445, "reward_std": 3.6000927252644033e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3484 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.028497409326425, "grad_norm": 5.593491809496125, "kl": 0.1435546875, "learning_rate": 9.715025906735751e-08, "loss": 0.0004, "reward": 1.9201748371124268, "reward_std": 0.00016545185326322098, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4201748669147491, "step": 3485 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.031088082901555, "grad_norm": 1.181859071192541, "kl": 0.070068359375, "learning_rate": 9.689119170984456e-08, "loss": -0.0003, "reward": 1.9995995163917542, "reward_std": 1.722225243838693e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499599575996399, "step": 3486 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.033678756476684, "grad_norm": 0.43455535486754426, "kl": 0.05517578125, "learning_rate": 9.66321243523316e-08, "loss": 0.0008, "reward": 2.499998450279236, "reward_std": 1.5358967004885926e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3487 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.036269430051814, "grad_norm": 30.00416130435242, "kl": 0.1025390625, "learning_rate": 9.637305699481865e-08, "loss": 0.0009, "reward": 2.124295711517334, "reward_std": 0.2318367360621778, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6242956519126892, "step": 3488 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.038860103626943, "grad_norm": 0.06605068171929653, "kl": 0.17333984375, "learning_rate": 9.61139896373057e-08, "loss": -0.0, "reward": 2.499998092651367, "reward_std": 1.1339962497913803e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3489 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.041450777202073, "grad_norm": 2.302596252759514, "kl": 0.19091796875, "learning_rate": 9.585492227979273e-08, "loss": 0.0016, "reward": 2.4999587535858154, "reward_std": 1.1759171457015327e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999586939811707, "step": 3490 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 9.044041450777202, "grad_norm": 0.20312601562721577, "kl": 0.075927734375, "learning_rate": 9.559585492227979e-08, "loss": 0.0006, "reward": 2.499998688697815, "reward_std": 9.774062732503808e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3491 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.046632124352332, "grad_norm": 0.1752706129118087, "kl": 0.06201171875, "learning_rate": 9.533678756476684e-08, "loss": -0.0005, "reward": 2.4999972581863403, "reward_std": 2.122408886862104e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3492 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.049222797927461, "grad_norm": 2.2950099507032067, "kl": 0.0916748046875, "learning_rate": 9.507772020725388e-08, "loss": -0.0001, "reward": 1.9995425939559937, "reward_std": 3.370058288965083e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995426833629608, "step": 3493 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.05181347150259, "grad_norm": 0.1768077503948095, "kl": 0.0650634765625, "learning_rate": 9.481865284974093e-08, "loss": -0.0012, "reward": 2.4999983310699463, "reward_std": 1.340243272807129e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3494 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.05440414507772, "grad_norm": 8.514389611283839, "kl": 0.08355712890625, "learning_rate": 9.455958549222798e-08, "loss": -0.0003, "reward": 1.9995171427726746, "reward_std": 3.3134051705019374e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995173215866089, "step": 3495 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.05699481865285, "grad_norm": 0.38161999386292644, "kl": 0.03533935546875, "learning_rate": 9.430051813471502e-08, "loss": 0.0006, "reward": 2.499996542930603, "reward_std": 3.5599414331954904e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3496 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.05958549222798, "grad_norm": 0.27368006829488134, "kl": 0.05267333984375, "learning_rate": 9.404145077720207e-08, "loss": 0.0009, "reward": 2.4999972581863403, "reward_std": 1.952396360138664e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3497 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.062176165803109, "grad_norm": 0.9966530841274793, "kl": 0.096435546875, "learning_rate": 9.378238341968911e-08, "loss": 0.0011, "reward": 2.499992609024048, "reward_std": 4.534846539172577e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 3498 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.064766839378239, "grad_norm": 0.19468480991417259, "kl": 0.0494384765625, "learning_rate": 9.352331606217615e-08, "loss": -0.0005, "reward": 2.499998092651367, "reward_std": 1.2090595760128053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3499 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.067357512953368, "grad_norm": 0.16137813760992667, "kl": 0.14501953125, "learning_rate": 9.32642487046632e-08, "loss": 0.0015, "reward": 2.4999988079071045, "reward_std": 9.8964679295932e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3500 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.069948186528498, "grad_norm": 0.14839161239039747, "kl": 0.09423828125, "learning_rate": 9.300518134715026e-08, "loss": 0.0016, "reward": 2.499998092651367, "reward_std": 1.108784687176012e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3501 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.072538860103627, "grad_norm": 0.3462746189441103, "kl": 0.106201171875, "learning_rate": 9.27461139896373e-08, "loss": -0.0007, "reward": 2.4999969005584717, "reward_std": 1.878378498076927e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3502 }, { "clip_ratio": 0.0, "completion_length": 43.625, "epoch": 9.075129533678757, "grad_norm": 0.22861252993907075, "kl": 0.054901123046875, "learning_rate": 9.248704663212435e-08, "loss": 0.0014, "reward": 2.499996066093445, "reward_std": 1.820136731112143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3503 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.077720207253886, "grad_norm": 0.21675874704655848, "kl": 0.0966796875, "learning_rate": 9.22279792746114e-08, "loss": 0.0005, "reward": 2.499997615814209, "reward_std": 2.4345615656784503e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3504 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.080310880829016, "grad_norm": 0.08140297878478561, "kl": 0.040771484375, "learning_rate": 9.196891191709843e-08, "loss": 0.0005, "reward": 2.499997854232788, "reward_std": 1.8541093140811427e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3505 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.082901554404145, "grad_norm": 1.2844626501991823, "kl": 0.0491943359375, "learning_rate": 9.170984455958548e-08, "loss": -0.0002, "reward": 2.4999988079071045, "reward_std": 1.3527026112569729e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3506 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.085492227979275, "grad_norm": 0.062022670648105266, "kl": 0.05194091796875, "learning_rate": 9.145077720207254e-08, "loss": 0.0011, "reward": 2.4999985694885254, "reward_std": 1.2521946359811409e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3507 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.088082901554404, "grad_norm": 5.501094447792641, "kl": 0.121337890625, "learning_rate": 9.119170984455957e-08, "loss": 0.0, "reward": 2.4999425411224365, "reward_std": 3.329120210082692e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999942660331726, "step": 3508 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.090673575129534, "grad_norm": 0.3130824482114424, "kl": 0.0498046875, "learning_rate": 9.093264248704663e-08, "loss": 0.0002, "reward": 2.499998092651367, "reward_std": 1.0441919755521667e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3509 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.093264248704664, "grad_norm": 0.20710941450485104, "kl": 0.0830078125, "learning_rate": 9.067357512953368e-08, "loss": 0.0003, "reward": 2.4999985694885254, "reward_std": 1.2762948529143614e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3510 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.095854922279793, "grad_norm": 0.19359838417961514, "kl": 0.127685546875, "learning_rate": 9.041450777202072e-08, "loss": 0.0004, "reward": 2.499996781349182, "reward_std": 2.8852584819105687e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3511 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.098445595854923, "grad_norm": 8.019565058114269, "kl": 0.068359375, "learning_rate": 9.015544041450777e-08, "loss": 0.0, "reward": 1.9976164102554321, "reward_std": 0.00018490143702365458, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4976165294647217, "step": 3512 }, { "clip_ratio": 0.0, "completion_length": 45.75, "epoch": 9.101036269430052, "grad_norm": 0.3281591165422965, "kl": 0.113525390625, "learning_rate": 8.989637305699481e-08, "loss": 0.0002, "reward": 2.499993324279785, "reward_std": 2.8552201456477633e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 3513 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.103626943005182, "grad_norm": 102.70886847918447, "kl": 0.16455078125, "learning_rate": 8.963730569948185e-08, "loss": -0.0005, "reward": 1.9615327715873718, "reward_std": 0.022489637172469656, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.461533010005951, "step": 3514 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.106217616580311, "grad_norm": 0.09578655550331544, "kl": 0.02880859375, "learning_rate": 8.93782383419689e-08, "loss": -0.0005, "reward": 2.499998092651367, "reward_std": 1.3494406516656454e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3515 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.10880829015544, "grad_norm": 0.04135590893937164, "kl": 0.012908935546875, "learning_rate": 8.911917098445596e-08, "loss": -0.0014, "reward": 2.499998092651367, "reward_std": 8.916928777580324e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3516 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.11139896373057, "grad_norm": 0.1431390179499437, "kl": 0.0814208984375, "learning_rate": 8.886010362694301e-08, "loss": 0.0011, "reward": 2.4999982118606567, "reward_std": 1.3699809500167248e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3517 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.1139896373057, "grad_norm": 0.4647042303970793, "kl": 0.12353515625, "learning_rate": 8.860103626943005e-08, "loss": -0.0002, "reward": 2.4999892711639404, "reward_std": 4.479625772546569e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 3518 }, { "clip_ratio": 0.0, "completion_length": 47.0625, "epoch": 9.11658031088083, "grad_norm": 5.216882533515147, "kl": 0.170654296875, "learning_rate": 8.83419689119171e-08, "loss": 0.0006, "reward": 1.9882795810699463, "reward_std": 0.00022689815276066838, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.488279640674591, "step": 3519 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.119170984455959, "grad_norm": 0.45116476385169113, "kl": 0.20263671875, "learning_rate": 8.808290155440414e-08, "loss": 0.0001, "reward": 2.499996542930603, "reward_std": 1.492497332833409e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3520 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.121761658031089, "grad_norm": 0.1373053713851007, "kl": 0.116943359375, "learning_rate": 8.782383419689118e-08, "loss": 0.0, "reward": 2.499997854232788, "reward_std": 1.615831308754423e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3521 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.124352331606218, "grad_norm": 0.036450905720440414, "kl": 0.0911865234375, "learning_rate": 8.756476683937823e-08, "loss": 0.0004, "reward": 2.499999165534973, "reward_std": 8.50380587280597e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 3522 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 9.126943005181348, "grad_norm": 17.96352858558866, "kl": 0.12353515625, "learning_rate": 8.730569948186529e-08, "loss": 0.0009, "reward": 2.437360644340515, "reward_std": 0.17714584673041145, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373605251312256, "step": 3523 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.129533678756477, "grad_norm": 2.2269179359785993, "kl": 0.2021484375, "learning_rate": 8.704663212435232e-08, "loss": -0.0004, "reward": 2.062465190887451, "reward_std": 0.17677982098950906, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624654293060303, "step": 3524 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.132124352331607, "grad_norm": 1.5357791012883506, "kl": 0.07373046875, "learning_rate": 8.678756476683938e-08, "loss": -0.0006, "reward": 2.499993920326233, "reward_std": 4.034666972074774e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 3525 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.134715025906736, "grad_norm": 0.12894755209062408, "kl": 0.11767578125, "learning_rate": 8.652849740932643e-08, "loss": 0.0002, "reward": 2.49999737739563, "reward_std": 2.2803430965723237e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3526 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.137305699481866, "grad_norm": 0.3078311634503186, "kl": 0.057861328125, "learning_rate": 8.626943005181347e-08, "loss": 0.0011, "reward": 2.4999985694885254, "reward_std": 1.4808964863277652e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3527 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.139896373056995, "grad_norm": 0.10702747992885327, "kl": 0.087158203125, "learning_rate": 8.601036269430051e-08, "loss": 0.0008, "reward": 2.499997854232788, "reward_std": 1.438599994685319e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3528 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.142487046632125, "grad_norm": 0.5186115042826989, "kl": 0.17236328125, "learning_rate": 8.575129533678756e-08, "loss": 0.0001, "reward": 2.4999983310699463, "reward_std": 1.172221232081938e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3529 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.145077720207254, "grad_norm": 0.07281245368818204, "kl": 0.02520751953125, "learning_rate": 8.54922279792746e-08, "loss": 0.0001, "reward": 2.4999988079071045, "reward_std": 9.947069941063091e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3530 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.147668393782384, "grad_norm": 2.981762180781131, "kl": 0.159912109375, "learning_rate": 8.523316062176165e-08, "loss": 0.0007, "reward": 1.4916887283325195, "reward_std": 7.037031082290923e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9916886985301971, "step": 3531 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.150259067357513, "grad_norm": 3.624716437082935, "kl": 0.1337890625, "learning_rate": 8.497409326424871e-08, "loss": 0.0001, "reward": 1.9951987266540527, "reward_std": 0.00010449500950926449, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4951987862586975, "step": 3532 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.152849740932643, "grad_norm": 6.123575199983153, "kl": 0.149658203125, "learning_rate": 8.471502590673575e-08, "loss": 0.001, "reward": 1.9326473474502563, "reward_std": 0.00022901836115352125, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4326472580432892, "step": 3533 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.155440414507773, "grad_norm": 0.12240146446398546, "kl": 0.1904296875, "learning_rate": 8.44559585492228e-08, "loss": 0.0008, "reward": 2.499997615814209, "reward_std": 2.749745789287772e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3534 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.158031088082902, "grad_norm": 0.1876233439953152, "kl": 0.096923828125, "learning_rate": 8.419689119170984e-08, "loss": 0.0015, "reward": 2.499997138977051, "reward_std": 3.14285790636859e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3535 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.160621761658032, "grad_norm": 1.8807053409847254, "kl": 0.17236328125, "learning_rate": 8.393782383419688e-08, "loss": 0.0005, "reward": 1.999733328819275, "reward_std": 1.9581380911404267e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499733328819275, "step": 3536 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.163212435233161, "grad_norm": 0.2417162186782245, "kl": 0.02313232421875, "learning_rate": 8.367875647668393e-08, "loss": -0.0002, "reward": 2.4999948740005493, "reward_std": 4.206884796076338e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 3537 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.16580310880829, "grad_norm": 0.12779225083112375, "kl": 0.1611328125, "learning_rate": 8.341968911917098e-08, "loss": 0.0012, "reward": 2.4999983310699463, "reward_std": 1.378701369958435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3538 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.16839378238342, "grad_norm": 0.6504550571405474, "kl": 0.07275390625, "learning_rate": 8.316062176165802e-08, "loss": 0.0006, "reward": 2.499994397163391, "reward_std": 5.032922217651503e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 3539 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.17098445595855, "grad_norm": 0.2676793614074702, "kl": 0.06024169921875, "learning_rate": 8.290155440414508e-08, "loss": 0.0005, "reward": 2.4999884366989136, "reward_std": 2.7224646714785194e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999883770942688, "step": 3540 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.17357512953368, "grad_norm": 0.23117868430508665, "kl": 0.035400390625, "learning_rate": 8.264248704663213e-08, "loss": -0.0012, "reward": 2.4999988079071045, "reward_std": 8.91323651330822e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 3541 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.176165803108809, "grad_norm": 5.5617589170464905, "kl": 0.1365966796875, "learning_rate": 8.238341968911918e-08, "loss": 0.0003, "reward": 1.9990262389183044, "reward_std": 4.061861039872383e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990261793136597, "step": 3542 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.178756476683938, "grad_norm": 0.23547582384360838, "kl": 0.1116943359375, "learning_rate": 8.212435233160621e-08, "loss": 0.0002, "reward": 2.4999918937683105, "reward_std": 2.4798429194561322e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918937683105, "step": 3543 }, { "clip_ratio": 0.0, "completion_length": 42.875, "epoch": 9.181347150259068, "grad_norm": 48.956637997189596, "kl": 0.07025146484375, "learning_rate": 8.186528497409326e-08, "loss": 0.001, "reward": 2.499592661857605, "reward_std": 0.0002758118438919155, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999592661857605, "step": 3544 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.183937823834198, "grad_norm": 0.12399059422283794, "kl": 0.03363037109375, "learning_rate": 8.160621761658031e-08, "loss": -0.0006, "reward": 2.4999977350234985, "reward_std": 9.690955096175458e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3545 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.186528497409327, "grad_norm": 0.5371496291125203, "kl": 0.103759765625, "learning_rate": 8.134715025906735e-08, "loss": 0.0008, "reward": 2.4999847412109375, "reward_std": 5.229160706221592e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984860420227, "step": 3546 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.189119170984457, "grad_norm": 0.4629858395559129, "kl": 0.11865234375, "learning_rate": 8.10880829015544e-08, "loss": 0.0003, "reward": 2.4999969005584717, "reward_std": 2.8437453920560074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3547 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.191709844559586, "grad_norm": 0.42290329623392264, "kl": 0.08447265625, "learning_rate": 8.082901554404146e-08, "loss": -0.0005, "reward": 2.4999947547912598, "reward_std": 2.9416671623039292e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 3548 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 9.194300518134716, "grad_norm": 6.069831030660708, "kl": 0.087890625, "learning_rate": 8.05699481865285e-08, "loss": 0.0014, "reward": 2.173689544200897, "reward_std": 0.2706016204623438, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6736894845962524, "step": 3549 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.196891191709845, "grad_norm": 5.292662890278738, "kl": 0.07421875, "learning_rate": 8.031088082901554e-08, "loss": 0.0009, "reward": 1.858374834060669, "reward_std": 0.0011458308891860725, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.358374923467636, "step": 3550 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.199481865284975, "grad_norm": 0.2288019746314646, "kl": 0.07861328125, "learning_rate": 8.005181347150259e-08, "loss": -0.0003, "reward": 2.499998688697815, "reward_std": 1.565572972594964e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3551 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.202072538860104, "grad_norm": 0.13774182404201013, "kl": 0.167724609375, "learning_rate": 7.979274611398963e-08, "loss": 0.0008, "reward": 2.4999964237213135, "reward_std": 1.7090208785930372e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3552 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.204663212435234, "grad_norm": 0.053385112229226116, "kl": 0.08935546875, "learning_rate": 7.953367875647668e-08, "loss": 0.0002, "reward": 2.4999942779541016, "reward_std": 2.021474131197465e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 3553 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.207253886010363, "grad_norm": 0.12476350211717226, "kl": 0.088623046875, "learning_rate": 7.927461139896373e-08, "loss": 0.0001, "reward": 2.4999969005584717, "reward_std": 1.3479302651830949e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3554 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.209844559585493, "grad_norm": 0.18325820418596148, "kl": 0.026947021484375, "learning_rate": 7.901554404145077e-08, "loss": -0.0, "reward": 2.4999951124191284, "reward_std": 1.5565823616725538e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 3555 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.212435233160623, "grad_norm": 3.590142682033384, "kl": 0.135009765625, "learning_rate": 7.875647668393783e-08, "loss": 0.0007, "reward": 1.935254454612732, "reward_std": 0.0002560021806630175, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4352545142173767, "step": 3556 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.215025906735752, "grad_norm": 0.07211928171498277, "kl": 0.079833984375, "learning_rate": 7.849740932642488e-08, "loss": 0.0005, "reward": 2.4999992847442627, "reward_std": 6.647934327475014e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999463558197, "step": 3557 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.217616580310882, "grad_norm": 0.13586985318554165, "kl": 0.084228515625, "learning_rate": 7.82383419689119e-08, "loss": 0.0002, "reward": 2.4999983310699463, "reward_std": 1.0739508553569976e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3558 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.220207253886011, "grad_norm": 0.03198840686205217, "kl": 0.034637451171875, "learning_rate": 7.797927461139896e-08, "loss": 0.0001, "reward": 2.499998927116394, "reward_std": 7.59293158125729e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3559 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.22279792746114, "grad_norm": 1.792747059602611, "kl": 0.16455078125, "learning_rate": 7.772020725388601e-08, "loss": 0.001, "reward": 2.4999918937683105, "reward_std": 4.75702177027415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 3560 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.22538860103627, "grad_norm": 9.164046473963442, "kl": 0.3345947265625, "learning_rate": 7.746113989637305e-08, "loss": 0.0006, "reward": 2.499996781349182, "reward_std": 3.2441436133012758e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3561 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.2279792746114, "grad_norm": 0.21517974687063507, "kl": 0.0452880859375, "learning_rate": 7.72020725388601e-08, "loss": 0.0006, "reward": 2.499997138977051, "reward_std": 1.787331029845518e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3562 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.23056994818653, "grad_norm": 0.5857321236215196, "kl": 0.0540771484375, "learning_rate": 7.694300518134715e-08, "loss": 0.0007, "reward": 2.499996304512024, "reward_std": 4.657578642763838e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3563 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.233160621761659, "grad_norm": 0.10638815444298862, "kl": 0.02825927734375, "learning_rate": 7.66839378238342e-08, "loss": -0.0003, "reward": 2.4999983310699463, "reward_std": 1.1807128998952976e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3564 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.235751295336788, "grad_norm": 1.1255840130850014, "kl": 0.25537109375, "learning_rate": 7.642487046632123e-08, "loss": 0.0029, "reward": 2.4999892711639404, "reward_std": 4.751030246552546e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999891519546509, "step": 3565 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.238341968911918, "grad_norm": 0.6922662257384038, "kl": 0.0506591796875, "learning_rate": 7.616580310880829e-08, "loss": 0.0004, "reward": 2.4999940395355225, "reward_std": 5.8942874829881475e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 3566 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.240932642487047, "grad_norm": 0.21461580510857653, "kl": 0.09912109375, "learning_rate": 7.590673575129533e-08, "loss": 0.0016, "reward": 2.4999969005584717, "reward_std": 1.6334789734173683e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3567 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.243523316062177, "grad_norm": 0.13859777397956255, "kl": 0.0845947265625, "learning_rate": 7.564766839378238e-08, "loss": 0.0008, "reward": 2.4999938011169434, "reward_std": 1.8792263745126547e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 3568 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.246113989637305, "grad_norm": 0.218545708377313, "kl": 0.11669921875, "learning_rate": 7.538860103626943e-08, "loss": -0.0003, "reward": 2.4999951124191284, "reward_std": 2.064516706923314e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3569 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.248704663212436, "grad_norm": 2.343974335086104, "kl": 0.119873046875, "learning_rate": 7.512953367875647e-08, "loss": -0.0001, "reward": 2.4999852180480957, "reward_std": 1.0635156741045648e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999852180480957, "step": 3570 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.251295336787564, "grad_norm": 0.19039485516030183, "kl": 0.06268310546875, "learning_rate": 7.487046632124352e-08, "loss": 0.0011, "reward": 2.499998092651367, "reward_std": 1.8207654761681624e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3571 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.253886010362695, "grad_norm": 0.07878572688445462, "kl": 0.064453125, "learning_rate": 7.461139896373056e-08, "loss": -0.0, "reward": 2.4999977350234985, "reward_std": 1.0788540976136574e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3572 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.256476683937823, "grad_norm": 0.22940013995169029, "kl": 0.0673828125, "learning_rate": 7.43523316062176e-08, "loss": 0.0001, "reward": 2.4999966621398926, "reward_std": 3.3710600746417185e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3573 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.259067357512953, "grad_norm": 3.4907532340667977, "kl": 0.095947265625, "learning_rate": 7.409326424870465e-08, "loss": 0.0006, "reward": 1.9989585280418396, "reward_std": 4.486948813564595e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989583790302277, "step": 3574 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.261658031088082, "grad_norm": 2.031033755411811, "kl": 0.152099609375, "learning_rate": 7.383419689119171e-08, "loss": -0.0005, "reward": 1.997396469116211, "reward_std": 3.076488050623993e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4973964989185333, "step": 3575 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.264248704663212, "grad_norm": 6.855079247093661, "kl": 0.07098388671875, "learning_rate": 7.357512953367876e-08, "loss": -0.0001, "reward": 1.995786726474762, "reward_std": 4.7343335722871416e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4957867860794067, "step": 3576 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.266839378238341, "grad_norm": 2.4724472864007705, "kl": 0.045654296875, "learning_rate": 7.33160621761658e-08, "loss": -0.0004, "reward": 2.499998450279236, "reward_std": 1.4566876416211016e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3577 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.26943005181347, "grad_norm": 28.931125236765222, "kl": 0.1806640625, "learning_rate": 7.305699481865285e-08, "loss": 0.0006, "reward": 2.4371258020401, "reward_std": 0.1778223753999555, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9371256828308105, "step": 3578 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.2720207253886, "grad_norm": 0.24526185431947714, "kl": 0.0589599609375, "learning_rate": 7.27979274611399e-08, "loss": 0.0007, "reward": 2.49999463558197, "reward_std": 2.57412233395371e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 3579 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.27461139896373, "grad_norm": 0.1300556897532382, "kl": 0.03826904296875, "learning_rate": 7.253886010362693e-08, "loss": 0.0, "reward": 2.499995470046997, "reward_std": 1.7970642147702165e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 3580 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.27720207253886, "grad_norm": 0.20036300916705224, "kl": 0.09130859375, "learning_rate": 7.227979274611398e-08, "loss": 0.0002, "reward": 2.499998092651367, "reward_std": 1.7599689385860984e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3581 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.279792746113989, "grad_norm": 14.246794364651008, "kl": 0.106201171875, "learning_rate": 7.202072538860104e-08, "loss": 0.0007, "reward": 1.7703025937080383, "reward_std": 0.0007937379506870457, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2703025341033936, "step": 3582 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.282383419689118, "grad_norm": 0.21473365634020414, "kl": 0.12646484375, "learning_rate": 7.176165803108808e-08, "loss": -0.0001, "reward": 2.4999977350234985, "reward_std": 2.7360627825601114e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3583 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.284974093264248, "grad_norm": 0.11166258929460875, "kl": 0.1123046875, "learning_rate": 7.150259067357513e-08, "loss": 0.0, "reward": 2.4999985694885254, "reward_std": 1.0808810912976696e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3584 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.287564766839377, "grad_norm": 0.09061915647903572, "kl": 0.06689453125, "learning_rate": 7.124352331606218e-08, "loss": 0.0013, "reward": 2.4999985694885254, "reward_std": 6.082972419108046e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3585 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.290155440414507, "grad_norm": 0.44086257563100284, "kl": 0.12109375, "learning_rate": 7.098445595854922e-08, "loss": 0.0002, "reward": 2.499998092651367, "reward_std": 2.0758148480126692e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3586 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.292746113989637, "grad_norm": 0.23876465152212895, "kl": 0.067138671875, "learning_rate": 7.072538860103626e-08, "loss": -0.0006, "reward": 2.4999979734420776, "reward_std": 1.428156338079134e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3587 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.295336787564766, "grad_norm": 0.2964631111810492, "kl": 0.04278564453125, "learning_rate": 7.046632124352331e-08, "loss": 0.0005, "reward": 2.4999966621398926, "reward_std": 3.0598816920246463e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3588 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.297927461139896, "grad_norm": 9.83039512213677, "kl": 0.1962890625, "learning_rate": 7.020725388601035e-08, "loss": 0.0008, "reward": 1.745278239250183, "reward_std": 0.17766309925355017, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.245278239250183, "step": 3589 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.300518134715025, "grad_norm": 4.930907097220764, "kl": 0.097900390625, "learning_rate": 6.99481865284974e-08, "loss": 0.0005, "reward": 1.9988873600959778, "reward_std": 0.00013701884569172762, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988874197006226, "step": 3590 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.303108808290155, "grad_norm": 10.72542965973583, "kl": 0.08837890625, "learning_rate": 6.968911917098446e-08, "loss": 0.0011, "reward": 1.9994693994522095, "reward_std": 0.00013732333138705144, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994693994522095, "step": 3591 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.305699481865284, "grad_norm": 0.2491174761594985, "kl": 0.108642578125, "learning_rate": 6.94300518134715e-08, "loss": 0.0009, "reward": 2.499997615814209, "reward_std": 1.6089990140244481e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3592 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.308290155440414, "grad_norm": 0.05461802748555054, "kl": 0.03045654296875, "learning_rate": 6.917098445595855e-08, "loss": 0.0004, "reward": 2.4999964237213135, "reward_std": 1.3704738393016669e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3593 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 9.310880829015543, "grad_norm": 13.840690983315469, "kl": 0.0986328125, "learning_rate": 6.89119170984456e-08, "loss": 0.0011, "reward": 1.9083737134933472, "reward_std": 0.0017017516706800961, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4083735346794128, "step": 3594 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.313471502590673, "grad_norm": 0.7153139922592794, "kl": 0.369384765625, "learning_rate": 6.865284974093263e-08, "loss": 0.0016, "reward": 2.4999977350234985, "reward_std": 2.1461975734382577e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3595 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.316062176165802, "grad_norm": 0.1157243837337217, "kl": 0.063232421875, "learning_rate": 6.839378238341968e-08, "loss": -0.0002, "reward": 2.499996304512024, "reward_std": 1.4447798832861736e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3596 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.318652849740932, "grad_norm": 0.04379743141214473, "kl": 0.0283203125, "learning_rate": 6.813471502590673e-08, "loss": -0.0002, "reward": 2.4999988079071045, "reward_std": 9.529030364774371e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 3597 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.321243523316062, "grad_norm": 0.08733642854032107, "kl": 0.04296875, "learning_rate": 6.787564766839377e-08, "loss": -0.0003, "reward": 2.499998688697815, "reward_std": 9.922276547058573e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3598 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.323834196891191, "grad_norm": 0.5022351887155698, "kl": 0.09326171875, "learning_rate": 6.761658031088083e-08, "loss": 0.0002, "reward": 2.499993920326233, "reward_std": 5.653181801790197e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 3599 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.32642487046632, "grad_norm": 14.671241201880353, "kl": 0.073974609375, "learning_rate": 6.735751295336788e-08, "loss": 0.0001, "reward": 1.977281391620636, "reward_std": 0.0013573553815717787, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4772814512252808, "step": 3600 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.32901554404145, "grad_norm": 0.07738168589940743, "kl": 0.123046875, "learning_rate": 6.709844559585492e-08, "loss": 0.0002, "reward": 2.4999988079071045, "reward_std": 9.173938906315016e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 3601 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 9.33160621761658, "grad_norm": 9.106378625770812, "kl": 0.0924072265625, "learning_rate": 6.683937823834196e-08, "loss": 0.0004, "reward": 1.956620991230011, "reward_std": 0.00030889850421544907, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4566209316253662, "step": 3602 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.33419689119171, "grad_norm": 15.840555859132808, "kl": 0.12646484375, "learning_rate": 6.658031088082901e-08, "loss": 0.0007, "reward": 1.790733516216278, "reward_std": 0.0003378352494110004, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2907334566116333, "step": 3603 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.336787564766839, "grad_norm": 0.119126874540989, "kl": 0.09228515625, "learning_rate": 6.632124352331605e-08, "loss": 0.0005, "reward": 2.499988079071045, "reward_std": 2.0201899815219804e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 3604 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.339378238341968, "grad_norm": 0.3542677658120117, "kl": 0.125244140625, "learning_rate": 6.60621761658031e-08, "loss": 0.0014, "reward": 2.4999983310699463, "reward_std": 1.4996069808148604e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 3605 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.341968911917098, "grad_norm": 0.17708831578915754, "kl": 0.0452880859375, "learning_rate": 6.580310880829015e-08, "loss": 0.0005, "reward": 2.4999970197677612, "reward_std": 2.392069973211619e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3606 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.344559585492227, "grad_norm": 1.9744099217956352, "kl": 0.2197265625, "learning_rate": 6.554404145077721e-08, "loss": 0.0007, "reward": 1.999289870262146, "reward_std": 2.2781258394388715e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992897510528564, "step": 3607 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.347150259067357, "grad_norm": 0.0818958147745962, "kl": 0.028228759765625, "learning_rate": 6.528497409326425e-08, "loss": -0.0002, "reward": 2.4999988079071045, "reward_std": 6.43532956701165e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3608 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.349740932642487, "grad_norm": 0.26630246348485737, "kl": 0.0596923828125, "learning_rate": 6.50259067357513e-08, "loss": 0.0007, "reward": 2.4999959468841553, "reward_std": 2.8611224820451753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3609 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.352331606217616, "grad_norm": 0.1530153042837234, "kl": 0.1328125, "learning_rate": 6.476683937823834e-08, "loss": -0.0005, "reward": 2.499995708465576, "reward_std": 1.8680327684705844e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 3610 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.354922279792746, "grad_norm": 0.15876651125523367, "kl": 0.082763671875, "learning_rate": 6.450777202072538e-08, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 2.3785829910139e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3611 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.357512953367875, "grad_norm": 0.30093736829280054, "kl": 0.05517578125, "learning_rate": 6.424870466321243e-08, "loss": 0.0015, "reward": 2.4999983310699463, "reward_std": 2.0103132669646584e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3612 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 9.360103626943005, "grad_norm": 0.1780023891583548, "kl": 0.0894775390625, "learning_rate": 6.398963730569948e-08, "loss": 0.0009, "reward": 2.4999985694885254, "reward_std": 1.365143475595687e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3613 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.362694300518134, "grad_norm": 7.232719025042965, "kl": 0.120849609375, "learning_rate": 6.373056994818652e-08, "loss": 0.0005, "reward": 2.062354266643524, "reward_std": 0.1768264901600105, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5623540878295898, "step": 3614 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.365284974093264, "grad_norm": 0.10510732415081468, "kl": 0.100341796875, "learning_rate": 6.347150259067358e-08, "loss": 0.0011, "reward": 2.4999979734420776, "reward_std": 1.0180434202311517e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3615 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.367875647668393, "grad_norm": 0.12808546838626128, "kl": 0.02783203125, "learning_rate": 6.321243523316063e-08, "loss": 0.0004, "reward": 2.4999979734420776, "reward_std": 1.5674729070269677e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3616 }, { "clip_ratio": 0.0, "completion_length": 39.5625, "epoch": 9.370466321243523, "grad_norm": 0.16091630827070866, "kl": 0.06744384765625, "learning_rate": 6.295336787564765e-08, "loss": 0.0003, "reward": 2.4999982118606567, "reward_std": 1.662584651285215e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3617 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.373056994818652, "grad_norm": 0.41438193531615836, "kl": 0.1292724609375, "learning_rate": 6.269430051813471e-08, "loss": 0.001, "reward": 2.4999955892562866, "reward_std": 2.8048168587702094e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3618 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.375647668393782, "grad_norm": 0.21748256277780403, "kl": 0.0479736328125, "learning_rate": 6.243523316062176e-08, "loss": 0.0, "reward": 2.4999985694885254, "reward_std": 1.730632419594258e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3619 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.378238341968911, "grad_norm": 1.4953165542387776, "kl": 0.17529296875, "learning_rate": 6.21761658031088e-08, "loss": 0.001, "reward": 2.499985694885254, "reward_std": 8.957299655776296e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999855756759644, "step": 3620 }, { "clip_ratio": 0.0, "completion_length": 34.375, "epoch": 9.380829015544041, "grad_norm": 8.853671031110492, "kl": 0.154296875, "learning_rate": 6.191709844559585e-08, "loss": 0.0008, "reward": 1.4916110038757324, "reward_std": 0.005096537868666928, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9916110634803772, "step": 3621 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.38341968911917, "grad_norm": 0.059401767535050706, "kl": 0.09027099609375, "learning_rate": 6.16580310880829e-08, "loss": -0.0002, "reward": 2.4999972581863403, "reward_std": 1.4663588672192418e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3622 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.3860103626943, "grad_norm": 0.05065571247368234, "kl": 0.075439453125, "learning_rate": 6.139896373056994e-08, "loss": -0.0003, "reward": 2.4999977350234985, "reward_std": 1.3659297053436603e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3623 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 9.38860103626943, "grad_norm": 3.164647596222652, "kl": 0.14990234375, "learning_rate": 6.1139896373057e-08, "loss": 0.0005, "reward": 2.4999561309814453, "reward_std": 8.024075714274659e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999560117721558, "step": 3624 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.39119170984456, "grad_norm": 0.1591743714590809, "kl": 0.10595703125, "learning_rate": 6.088082901554404e-08, "loss": -0.0006, "reward": 2.499996781349182, "reward_std": 2.0420766304596327e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 3625 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.393782383419689, "grad_norm": 2.3454906017791073, "kl": 0.21337890625, "learning_rate": 6.062176165803109e-08, "loss": -0.0, "reward": 2.4999934434890747, "reward_std": 2.4459145606670063e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 3626 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.396373056994818, "grad_norm": 2.1235537327491816, "kl": 0.073486328125, "learning_rate": 6.036269430051813e-08, "loss": 0.0007, "reward": 2.49997341632843, "reward_std": 1.987850492923826e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999734163284302, "step": 3627 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.398963730569948, "grad_norm": 18.276489714167496, "kl": 0.073974609375, "learning_rate": 6.010362694300518e-08, "loss": 0.0004, "reward": 1.7538734078407288, "reward_std": 0.0029693585449308557, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2538734376430511, "step": 3628 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 9.401554404145077, "grad_norm": 0.12523160213091783, "kl": 0.02337646484375, "learning_rate": 5.984455958549222e-08, "loss": 0.0002, "reward": 2.499997854232788, "reward_std": 1.8227909777124296e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3629 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.404145077720207, "grad_norm": 0.1973625894333639, "kl": 0.069580078125, "learning_rate": 5.958549222797927e-08, "loss": -0.0, "reward": 2.499998927116394, "reward_std": 1.2107453244425415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3630 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.406735751295336, "grad_norm": 0.087874696875086, "kl": 0.0986328125, "learning_rate": 5.932642487046632e-08, "loss": 0.001, "reward": 2.499993920326233, "reward_std": 2.4315805831065518e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 3631 }, { "clip_ratio": 0.0, "completion_length": 53.8125, "epoch": 9.409326424870466, "grad_norm": 0.3880894254461277, "kl": 0.0855712890625, "learning_rate": 5.9067357512953366e-08, "loss": 0.0, "reward": 2.4999940395355225, "reward_std": 2.5382645389981917e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 3632 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 9.411917098445596, "grad_norm": 3.485353463348756, "kl": 0.0513916015625, "learning_rate": 5.880829015544041e-08, "loss": 0.0012, "reward": 2.499954104423523, "reward_std": 3.103803521753434e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999540448188782, "step": 3633 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.414507772020725, "grad_norm": 3.590095619129856, "kl": 0.089599609375, "learning_rate": 5.854922279792746e-08, "loss": 0.0002, "reward": 2.3437455892562866, "reward_std": 0.4419437044459755, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.8749955892562866, "step": 3634 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.417098445595855, "grad_norm": 0.08881621379895523, "kl": 0.16455078125, "learning_rate": 5.8290155440414504e-08, "loss": 0.0005, "reward": 2.499997615814209, "reward_std": 8.783458724792581e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3635 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 9.419689119170984, "grad_norm": 1.0252453764097909, "kl": 0.1123046875, "learning_rate": 5.803108808290155e-08, "loss": 0.0002, "reward": 2.4999945163726807, "reward_std": 3.6389884598975186e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 3636 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.422279792746114, "grad_norm": 15.954273734079132, "kl": 0.06396484375, "learning_rate": 5.77720207253886e-08, "loss": 0.0003, "reward": 2.0624386072158813, "reward_std": 0.17679407411310422, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562438726425171, "step": 3637 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.424870466321243, "grad_norm": 0.08721993678915019, "kl": 0.0418701171875, "learning_rate": 5.751295336787564e-08, "loss": -0.0003, "reward": 2.4999985694885254, "reward_std": 1.2154248167917103e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3638 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.427461139896373, "grad_norm": 0.7463703792736648, "kl": 0.1756591796875, "learning_rate": 5.725388601036269e-08, "loss": 0.0017, "reward": 2.499998450279236, "reward_std": 2.1560086338467954e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3639 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.430051813471502, "grad_norm": 0.38678518089096137, "kl": 0.0557861328125, "learning_rate": 5.699481865284974e-08, "loss": 0.0002, "reward": 2.4999964237213135, "reward_std": 3.3472968539172143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3640 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 9.432642487046632, "grad_norm": 0.24260689427242088, "kl": 0.054443359375, "learning_rate": 5.673575129533679e-08, "loss": 0.0004, "reward": 2.4999982118606567, "reward_std": 1.6370253206332563e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3641 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.435233160621761, "grad_norm": 0.09254647668453571, "kl": 0.07470703125, "learning_rate": 5.647668393782383e-08, "loss": 0.001, "reward": 2.4999985694885254, "reward_std": 1.3572292232311156e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3642 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.437823834196891, "grad_norm": 1.090033154683512, "kl": 0.084716796875, "learning_rate": 5.621761658031088e-08, "loss": 0.0014, "reward": 2.49999737739563, "reward_std": 2.1360930304581416e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3643 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.44041450777202, "grad_norm": 1.802814972809135, "kl": 0.05279541015625, "learning_rate": 5.5958549222797925e-08, "loss": -0.0008, "reward": 2.4999985694885254, "reward_std": 1.3046385731740884e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3644 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.44300518134715, "grad_norm": 0.5236218493691641, "kl": 0.08837890625, "learning_rate": 5.569948186528497e-08, "loss": 0.0002, "reward": 2.4999947547912598, "reward_std": 3.456106128396641e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 3645 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.44559585492228, "grad_norm": 0.12540096844146575, "kl": 0.05078125, "learning_rate": 5.544041450777202e-08, "loss": -0.0008, "reward": 2.4999972581863403, "reward_std": 2.073917642064771e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3646 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.44818652849741, "grad_norm": 0.1728487615953028, "kl": 0.10498046875, "learning_rate": 5.518134715025906e-08, "loss": 0.0006, "reward": 2.4999972581863403, "reward_std": 1.8301269051335112e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3647 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.450777202072539, "grad_norm": 0.3107424706274559, "kl": 0.0277099609375, "learning_rate": 5.4922279792746116e-08, "loss": 0.0008, "reward": 2.4999852180480957, "reward_std": 4.473358728773746e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985158443451, "step": 3648 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.453367875647668, "grad_norm": 12.671737205790738, "kl": 0.117919921875, "learning_rate": 5.4663212435233155e-08, "loss": 0.0, "reward": 1.9970173835754395, "reward_std": 0.00015121495582093303, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497017353773117, "step": 3649 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 9.455958549222798, "grad_norm": 0.3467681544896217, "kl": 0.16650390625, "learning_rate": 5.44041450777202e-08, "loss": 0.0005, "reward": 2.4999948740005493, "reward_std": 3.6091330457566073e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3650 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 9.458549222797927, "grad_norm": 30.192325578767427, "kl": 0.1455078125, "learning_rate": 5.4145077720207254e-08, "loss": 0.0007, "reward": 1.9717373847961426, "reward_std": 0.0006968245324969757, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4717373251914978, "step": 3651 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.461139896373057, "grad_norm": 0.34607095722026116, "kl": 0.0457763671875, "learning_rate": 5.38860103626943e-08, "loss": -0.0001, "reward": 2.49999737739563, "reward_std": 1.8930753924450983e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3652 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 9.463730569948186, "grad_norm": 0.5448851186343754, "kl": 0.130615234375, "learning_rate": 5.362694300518134e-08, "loss": 0.001, "reward": 2.499991774559021, "reward_std": 4.4095952489442425e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991536140442, "step": 3653 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.466321243523316, "grad_norm": 9.116865567393958, "kl": 0.1240234375, "learning_rate": 5.336787564766839e-08, "loss": 0.001, "reward": 1.999870777130127, "reward_std": 1.5297391371404956e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998708367347717, "step": 3654 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.468911917098445, "grad_norm": 0.1421368319804727, "kl": 0.03143310546875, "learning_rate": 5.310880829015544e-08, "loss": 0.0016, "reward": 2.4999983310699463, "reward_std": 1.7341364468848042e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3655 }, { "clip_ratio": 0.0, "completion_length": 40.1875, "epoch": 9.471502590673575, "grad_norm": 0.15643644161546194, "kl": 0.0653076171875, "learning_rate": 5.284974093264249e-08, "loss": -0.0009, "reward": 2.4999985694885254, "reward_std": 1.4584315408683324e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3656 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.474093264248705, "grad_norm": 1.652308539695442, "kl": 0.18994140625, "learning_rate": 5.259067357512953e-08, "loss": 0.0001, "reward": 1.9995487928390503, "reward_std": 3.070371053581766e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995489418506622, "step": 3657 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.476683937823834, "grad_norm": 12.142611916219485, "kl": 0.0911865234375, "learning_rate": 5.2331606217616577e-08, "loss": 0.0007, "reward": 2.437486410140991, "reward_std": 0.17679407532318692, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374862909317017, "step": 3658 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.479274611398964, "grad_norm": 0.12880422558425067, "kl": 0.09619140625, "learning_rate": 5.207253886010363e-08, "loss": 0.0004, "reward": 2.499998927116394, "reward_std": 1.1238937531743431e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 3659 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.481865284974093, "grad_norm": 0.09784381507078879, "kl": 0.104736328125, "learning_rate": 5.181347150259067e-08, "loss": 0.0013, "reward": 2.4999988079071045, "reward_std": 9.665852189755242e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3660 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.484455958549223, "grad_norm": 1.3912513590984912, "kl": 0.10693359375, "learning_rate": 5.1554404145077715e-08, "loss": 0.0008, "reward": 1.9992981553077698, "reward_std": 2.7051125258026332e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499298334121704, "step": 3661 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.487046632124352, "grad_norm": 0.4468488940216501, "kl": 0.116607666015625, "learning_rate": 5.129533678756477e-08, "loss": 0.0009, "reward": 2.4999970197677612, "reward_std": 3.371821634345906e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3662 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.489637305699482, "grad_norm": 0.10543651552558901, "kl": 0.1640625, "learning_rate": 5.1036269430051813e-08, "loss": -0.0001, "reward": 2.4999990463256836, "reward_std": 1.1950355656153988e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 3663 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.492227979274611, "grad_norm": 0.2195069802344762, "kl": 0.12890625, "learning_rate": 5.077720207253885e-08, "loss": 0.0018, "reward": 2.499996781349182, "reward_std": 2.6130073820240796e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 3664 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 9.494818652849741, "grad_norm": 0.3465144817334189, "kl": 0.043212890625, "learning_rate": 5.0518134715025906e-08, "loss": -0.0019, "reward": 2.499997615814209, "reward_std": 1.6977792824945936e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3665 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.49740932642487, "grad_norm": 0.18329388562241242, "kl": 0.0941162109375, "learning_rate": 5.025906735751295e-08, "loss": 0.0004, "reward": 2.4999961853027344, "reward_std": 2.430586903301446e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 3666 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.5, "grad_norm": 0.5347540847711754, "kl": 0.1044921875, "learning_rate": 5e-08, "loss": 0.0013, "reward": 2.499995231628418, "reward_std": 5.053573204349959e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 3667 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.50259067357513, "grad_norm": 0.10517911008225438, "kl": 0.04150390625, "learning_rate": 4.9740932642487044e-08, "loss": 0.0, "reward": 2.499998688697815, "reward_std": 1.0219980595138622e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3668 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.505181347150259, "grad_norm": 0.7737530396572683, "kl": 0.07177734375, "learning_rate": 4.948186528497409e-08, "loss": 0.0007, "reward": 2.4999947547912598, "reward_std": 6.835841304564383e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 3669 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.507772020725389, "grad_norm": 4.1305244300989585, "kl": 0.12646484375, "learning_rate": 4.9222797927461136e-08, "loss": 0.0006, "reward": 1.9823808670043945, "reward_std": 0.00010332564534110134, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4823808670043945, "step": 3670 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.510362694300518, "grad_norm": 77.34924560889802, "kl": 0.05078125, "learning_rate": 4.896373056994819e-08, "loss": -0.0004, "reward": 2.4998831748962402, "reward_std": 0.000321648839531008, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998831748962402, "step": 3671 }, { "clip_ratio": 0.0, "completion_length": 53.625, "epoch": 9.512953367875648, "grad_norm": 14.583210915556055, "kl": 0.1455078125, "learning_rate": 4.870466321243523e-08, "loss": 0.0015, "reward": 2.4992854595184326, "reward_std": 0.0003975931179525105, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999285340309143, "step": 3672 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.515544041450777, "grad_norm": 13.627062732358134, "kl": 0.107666015625, "learning_rate": 4.844559585492228e-08, "loss": 0.0003, "reward": 1.751018226146698, "reward_std": 0.001111598602705044, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2510183453559875, "step": 3673 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.518134715025907, "grad_norm": 0.10333259494606195, "kl": 0.071533203125, "learning_rate": 4.818652849740933e-08, "loss": -0.0003, "reward": 2.4999988079071045, "reward_std": 1.4403462387235777e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 3674 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.520725388601036, "grad_norm": 0.13529857415747307, "kl": 0.0751953125, "learning_rate": 4.7927461139896366e-08, "loss": 0.0012, "reward": 2.499998450279236, "reward_std": 1.24440410331772e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3675 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.523316062176166, "grad_norm": 2.4081346597901536, "kl": 0.115234375, "learning_rate": 4.766839378238342e-08, "loss": 0.001, "reward": 1.9998263120651245, "reward_std": 1.2382233308017021e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998261630535126, "step": 3676 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.525906735751295, "grad_norm": 0.12667962166546987, "kl": 0.08367919921875, "learning_rate": 4.7409326424870465e-08, "loss": 0.0009, "reward": 2.4999979734420776, "reward_std": 2.546394057389989e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3677 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.528497409326425, "grad_norm": 0.34558656908226904, "kl": 0.09228515625, "learning_rate": 4.715025906735751e-08, "loss": 0.0004, "reward": 2.499998450279236, "reward_std": 1.651010109071649e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3678 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.531088082901555, "grad_norm": 0.2592686537201516, "kl": 0.063507080078125, "learning_rate": 4.689119170984456e-08, "loss": 0.0011, "reward": 2.499997854232788, "reward_std": 1.5198573919406044e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3679 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.533678756476684, "grad_norm": 2.1889690467311507, "kl": 0.09326171875, "learning_rate": 4.66321243523316e-08, "loss": 0.0005, "reward": 1.4992275834083557, "reward_std": 5.3271155593392905e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9992275834083557, "step": 3680 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.536269430051814, "grad_norm": 0.3116022906225517, "kl": 0.1376953125, "learning_rate": 4.637305699481865e-08, "loss": 0.0002, "reward": 2.499997615814209, "reward_std": 2.4968342700049106e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3681 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.538860103626943, "grad_norm": 26.62797664541317, "kl": 0.13720703125, "learning_rate": 4.61139896373057e-08, "loss": 0.0008, "reward": 1.999290108680725, "reward_std": 0.00022963262870234757, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499290108680725, "step": 3682 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.541450777202073, "grad_norm": 0.30767572575641516, "kl": 0.03106689453125, "learning_rate": 4.585492227979274e-08, "loss": 0.0004, "reward": 2.4999977350234985, "reward_std": 2.0715415303129703e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3683 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.544041450777202, "grad_norm": 9.719101649676722, "kl": 0.130859375, "learning_rate": 4.559585492227979e-08, "loss": 0.0013, "reward": 1.8857874274253845, "reward_std": 0.0002749708625628955, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.385787308216095, "step": 3684 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.546632124352332, "grad_norm": 0.4993807619670999, "kl": 0.1728515625, "learning_rate": 4.533678756476684e-08, "loss": 0.0002, "reward": 2.499994993209839, "reward_std": 3.71010753497103e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 3685 }, { "clip_ratio": 0.0, "completion_length": 37.75, "epoch": 9.549222797927461, "grad_norm": 0.8326585502004994, "kl": 0.063720703125, "learning_rate": 4.5077720207253886e-08, "loss": -0.0009, "reward": 2.4999918937683105, "reward_std": 5.3546923481917474e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 3686 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.55181347150259, "grad_norm": 0.509501505532027, "kl": 0.05712890625, "learning_rate": 4.4818652849740926e-08, "loss": 0.0005, "reward": 2.49999463558197, "reward_std": 3.3053224797185976e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 3687 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.55440414507772, "grad_norm": 9.020869040585579, "kl": 0.1630859375, "learning_rate": 4.455958549222798e-08, "loss": 0.0006, "reward": 1.9800784587860107, "reward_std": 0.0003340487900231892, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4800785183906555, "step": 3688 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.55699481865285, "grad_norm": 0.0568652183939788, "kl": 0.0726318359375, "learning_rate": 4.4300518134715024e-08, "loss": 0.0001, "reward": 2.4999992847442627, "reward_std": 4.613990967072823e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999994039535522, "step": 3689 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.55958549222798, "grad_norm": 0.04542087659021282, "kl": 0.0445556640625, "learning_rate": 4.404145077720207e-08, "loss": 0.0, "reward": 2.499997854232788, "reward_std": 1.2325853901984374e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3690 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.562176165803109, "grad_norm": 0.21692717891019167, "kl": 0.109375, "learning_rate": 4.3782383419689116e-08, "loss": -0.0, "reward": 2.499996781349182, "reward_std": 2.4746789222263033e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3691 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.564766839378239, "grad_norm": 0.24096227393413994, "kl": 0.029296875, "learning_rate": 4.352331606217616e-08, "loss": -0.0, "reward": 2.4999970197677612, "reward_std": 1.7339058899779047e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 3692 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.567357512953368, "grad_norm": 0.18360958044414094, "kl": 0.10400390625, "learning_rate": 4.3264248704663215e-08, "loss": 0.001, "reward": 2.4999988079071045, "reward_std": 1.3713579534169185e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3693 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.569948186528498, "grad_norm": 1.0604078119509348, "kl": 0.123291015625, "learning_rate": 4.3005181347150255e-08, "loss": -0.0001, "reward": 2.4999918937683105, "reward_std": 3.848412575280236e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999919533729553, "step": 3694 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.572538860103627, "grad_norm": 0.8079035307014608, "kl": 0.03619384765625, "learning_rate": 4.27461139896373e-08, "loss": -0.0003, "reward": 2.4999994039535522, "reward_std": 8.176358221589908e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999994039535522, "step": 3695 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.575129533678757, "grad_norm": 15.394335053256096, "kl": 0.0885009765625, "learning_rate": 4.2487046632124353e-08, "loss": 0.0007, "reward": 1.991190254688263, "reward_std": 0.00022968271440504395, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4911902248859406, "step": 3696 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.577720207253886, "grad_norm": 13.433044589669013, "kl": 0.09619140625, "learning_rate": 4.22279792746114e-08, "loss": 0.0014, "reward": 1.9992719888687134, "reward_std": 8.63503835546453e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499271810054779, "step": 3697 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.580310880829016, "grad_norm": 0.5194140757160619, "kl": 0.03619384765625, "learning_rate": 4.196891191709844e-08, "loss": -0.0006, "reward": 2.499992609024048, "reward_std": 3.423036673666502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 3698 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.582901554404145, "grad_norm": 0.12925582253930867, "kl": 0.1103515625, "learning_rate": 4.170984455958549e-08, "loss": 0.0, "reward": 2.4999985694885254, "reward_std": 1.4650702837570861e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3699 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.585492227979275, "grad_norm": 0.1375614947714548, "kl": 0.0494384765625, "learning_rate": 4.145077720207254e-08, "loss": 0.0008, "reward": 2.499998092651367, "reward_std": 1.0608646903165209e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3700 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.588082901554404, "grad_norm": 66.24287545813206, "kl": 0.0457763671875, "learning_rate": 4.119170984455959e-08, "loss": 0.0002, "reward": 1.9869626760482788, "reward_std": 0.011754938512808621, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4869627356529236, "step": 3701 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.590673575129534, "grad_norm": 0.03879593112948971, "kl": 0.0556640625, "learning_rate": 4.093264248704663e-08, "loss": 0.001, "reward": 2.499998927116394, "reward_std": 8.599563443567604e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3702 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.593264248704664, "grad_norm": 0.14155844808840987, "kl": 0.0697021484375, "learning_rate": 4.0673575129533676e-08, "loss": 0.0013, "reward": 2.4999970197677612, "reward_std": 1.4722118066856638e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3703 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.595854922279793, "grad_norm": 0.11882718902454008, "kl": 0.088134765625, "learning_rate": 4.041450777202073e-08, "loss": 0.0012, "reward": 2.4999966621398926, "reward_std": 1.7807067820285738e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3704 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.598445595854923, "grad_norm": 0.4631060556892568, "kl": 0.1009521484375, "learning_rate": 4.015544041450777e-08, "loss": 0.001, "reward": 2.4999947547912598, "reward_std": 2.8680570380856807e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 3705 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.601036269430052, "grad_norm": 0.17421047491898684, "kl": 0.0396728515625, "learning_rate": 3.9896373056994814e-08, "loss": 0.0003, "reward": 2.499998927116394, "reward_std": 1.169623544683418e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 3706 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.603626943005182, "grad_norm": 0.13484050191045674, "kl": 0.045654296875, "learning_rate": 3.9637305699481867e-08, "loss": 0.0008, "reward": 2.4999982118606567, "reward_std": 1.4790787759011437e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3707 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.606217616580311, "grad_norm": 0.48928691431736043, "kl": 0.08526611328125, "learning_rate": 3.937823834196891e-08, "loss": 0.0009, "reward": 2.4999955892562866, "reward_std": 2.780924262424378e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3708 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.60880829015544, "grad_norm": 7.342837025336624, "kl": 0.12255859375, "learning_rate": 3.911917098445595e-08, "loss": 0.0003, "reward": 1.9992173910140991, "reward_std": 3.623191878432408e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992173612117767, "step": 3709 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 9.61139896373057, "grad_norm": 0.442566756458229, "kl": 0.043609619140625, "learning_rate": 3.8860103626943005e-08, "loss": 0.0018, "reward": 2.499992847442627, "reward_std": 3.2591485705779633e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 3710 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.6139896373057, "grad_norm": 0.17775670199508625, "kl": 0.0499267578125, "learning_rate": 3.860103626943005e-08, "loss": 0.0005, "reward": 2.4999988079071045, "reward_std": 1.273239064403242e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3711 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.61658031088083, "grad_norm": 0.35178206866215284, "kl": 0.0667724609375, "learning_rate": 3.83419689119171e-08, "loss": -0.0003, "reward": 2.4999935626983643, "reward_std": 4.7239742002602725e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 3712 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.619170984455959, "grad_norm": 21.79241565235879, "kl": 0.08984375, "learning_rate": 3.808290155440414e-08, "loss": 0.0007, "reward": 1.647644817829132, "reward_std": 0.00068787443774454, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1476447880268097, "step": 3713 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.621761658031089, "grad_norm": 0.05267146138996816, "kl": 0.1259765625, "learning_rate": 3.782383419689119e-08, "loss": 0.0005, "reward": 2.4999988079071045, "reward_std": 7.299678372874041e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3714 }, { "clip_ratio": 0.0, "completion_length": 53.0, "epoch": 9.624352331606218, "grad_norm": 0.32899595276480537, "kl": 0.123779296875, "learning_rate": 3.7564766839378235e-08, "loss": 0.0011, "reward": 2.499995470046997, "reward_std": 4.171713271716726e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 3715 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.626943005181348, "grad_norm": 0.4557655072281117, "kl": 0.037109375, "learning_rate": 3.730569948186528e-08, "loss": 0.0004, "reward": 2.499999165534973, "reward_std": 7.338413468005456e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 3716 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.629533678756477, "grad_norm": 1.415480099034529, "kl": 0.16650390625, "learning_rate": 3.704663212435233e-08, "loss": 0.0003, "reward": 2.4999934434890747, "reward_std": 4.182017960374651e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 3717 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.632124352331607, "grad_norm": 4.257629435100904, "kl": 0.09130859375, "learning_rate": 3.678756476683938e-08, "loss": 0.0004, "reward": 1.8217196464538574, "reward_std": 0.00018003635625518655, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3217195272445679, "step": 3718 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.634715025906736, "grad_norm": 0.4811050811074065, "kl": 0.089599609375, "learning_rate": 3.6528497409326426e-08, "loss": -0.0001, "reward": 2.499996066093445, "reward_std": 3.4365048122708686e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3719 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.637305699481866, "grad_norm": 1.6791102867311303, "kl": 0.127685546875, "learning_rate": 3.6269430051813465e-08, "loss": 0.0001, "reward": 1.9993860721588135, "reward_std": 3.157115372687258e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499386191368103, "step": 3720 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.639896373056995, "grad_norm": 6.922485268226246, "kl": 0.106201171875, "learning_rate": 3.601036269430052e-08, "loss": -0.0006, "reward": 1.912861406803131, "reward_std": 0.000218655359958575, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4128615856170654, "step": 3721 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.642487046632125, "grad_norm": 0.6835900906373578, "kl": 0.102783203125, "learning_rate": 3.5751295336787564e-08, "loss": 0.0004, "reward": 2.4999964237213135, "reward_std": 3.3725131061146385e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3722 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.645077720207254, "grad_norm": 0.29436211907026244, "kl": 0.104736328125, "learning_rate": 3.549222797927461e-08, "loss": 0.0016, "reward": 2.4999955892562866, "reward_std": 2.8871030224308925e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 3723 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.647668393782384, "grad_norm": 1.6278798100076772, "kl": 0.0855712890625, "learning_rate": 3.5233160621761656e-08, "loss": -0.0, "reward": 1.9995920062065125, "reward_std": 2.8343314852463664e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995920658111572, "step": 3724 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.650259067357513, "grad_norm": 0.8782914603361645, "kl": 0.130126953125, "learning_rate": 3.49740932642487e-08, "loss": -0.0004, "reward": 2.4999953508377075, "reward_std": 4.706882123173273e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3725 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.652849740932643, "grad_norm": 30.93369671447057, "kl": 0.22802734375, "learning_rate": 3.471502590673575e-08, "loss": 0.0012, "reward": 1.801357388496399, "reward_std": 0.0002682284914499178, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3013575077056885, "step": 3726 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.655440414507773, "grad_norm": 0.12542218535302743, "kl": 0.031158447265625, "learning_rate": 3.44559585492228e-08, "loss": 0.0006, "reward": 2.49999737739563, "reward_std": 1.7285602780248155e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3727 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.658031088082902, "grad_norm": 0.7999461662209999, "kl": 0.1407470703125, "learning_rate": 3.419689119170984e-08, "loss": 0.0003, "reward": 2.4999635219573975, "reward_std": 7.478000043192878e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999635815620422, "step": 3728 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.660621761658032, "grad_norm": 0.6080672136378632, "kl": 0.2384033203125, "learning_rate": 3.3937823834196887e-08, "loss": 0.0016, "reward": 2.499998092651367, "reward_std": 1.7988782872180309e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3729 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.663212435233161, "grad_norm": 1.0479642175463073, "kl": 0.14501953125, "learning_rate": 3.367875647668394e-08, "loss": -0.0, "reward": 2.499998092651367, "reward_std": 1.9622912645900215e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3730 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.66580310880829, "grad_norm": 0.15193415904085064, "kl": 0.075439453125, "learning_rate": 3.341968911917098e-08, "loss": -0.0006, "reward": 2.4999979734420776, "reward_std": 1.2542765830403368e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3731 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.66839378238342, "grad_norm": 0.7401240457182856, "kl": 0.105224609375, "learning_rate": 3.3160621761658025e-08, "loss": 0.0011, "reward": 2.4999314546585083, "reward_std": 6.8274504769760824e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999931514263153, "step": 3732 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.67098445595855, "grad_norm": 0.0956811365417143, "kl": 0.047607421875, "learning_rate": 3.290155440414508e-08, "loss": 0.0, "reward": 2.499997138977051, "reward_std": 1.7996554220189864e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3733 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.67357512953368, "grad_norm": 0.9303297338451666, "kl": 0.1328125, "learning_rate": 3.2642487046632124e-08, "loss": 0.0004, "reward": 2.4999959468841553, "reward_std": 2.855011928204476e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3734 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.676165803108809, "grad_norm": 0.15181846794962373, "kl": 0.050537109375, "learning_rate": 3.238341968911917e-08, "loss": 0.0002, "reward": 2.49999737739563, "reward_std": 1.6631678363410174e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3735 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.678756476683938, "grad_norm": 5.594364024470143, "kl": 0.1416015625, "learning_rate": 3.2124352331606216e-08, "loss": 0.0003, "reward": 1.95859956741333, "reward_std": 0.0010400653375199909, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4585996270179749, "step": 3736 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.681347150259068, "grad_norm": 2.7514194911259735, "kl": 0.1943359375, "learning_rate": 3.186528497409326e-08, "loss": 0.0007, "reward": 2.499985098838806, "reward_std": 6.969257611899593e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985158443451, "step": 3737 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.683937823834198, "grad_norm": 0.31076353365199944, "kl": 0.15869140625, "learning_rate": 3.1606217616580314e-08, "loss": 0.0015, "reward": 2.4999990463256836, "reward_std": 1.0930351663773763e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 3738 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.686528497409327, "grad_norm": 0.3824623331207761, "kl": 0.116943359375, "learning_rate": 3.1347150259067354e-08, "loss": 0.0005, "reward": 2.4999972581863403, "reward_std": 1.9540998437150847e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3739 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.689119170984457, "grad_norm": 0.5311692942343662, "kl": 0.1522216796875, "learning_rate": 3.10880829015544e-08, "loss": 0.0005, "reward": 2.499997138977051, "reward_std": 2.685926176582143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3740 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.691709844559586, "grad_norm": 0.11222205124606958, "kl": 0.056396484375, "learning_rate": 3.082901554404145e-08, "loss": 0.0005, "reward": 2.4999985694885254, "reward_std": 1.3024056215726887e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3741 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.694300518134716, "grad_norm": 0.287794021910093, "kl": 0.083984375, "learning_rate": 3.05699481865285e-08, "loss": 0.0002, "reward": 2.4999953508377075, "reward_std": 1.9028493625228293e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 3742 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.696891191709845, "grad_norm": 0.29169397602960695, "kl": 0.062255859375, "learning_rate": 3.0310880829015545e-08, "loss": -0.0, "reward": 2.4999982118606567, "reward_std": 1.561217629841849e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3743 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 9.699481865284975, "grad_norm": 0.4442040749253872, "kl": 0.1015625, "learning_rate": 3.005181347150259e-08, "loss": 0.0011, "reward": 2.499983787536621, "reward_std": 3.024503911319698e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999983787536621, "step": 3744 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.702072538860104, "grad_norm": 6.07015611233496, "kl": 0.094970703125, "learning_rate": 2.9792746113989634e-08, "loss": -0.0004, "reward": 1.901365041732788, "reward_std": 0.00038356931207772504, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4013651013374329, "step": 3745 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.704663212435234, "grad_norm": 1.5118837525168405, "kl": 0.12353515625, "learning_rate": 2.9533678756476683e-08, "loss": 0.0011, "reward": 1.9999242424964905, "reward_std": 2.2149558503770095e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999242424964905, "step": 3746 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.707253886010363, "grad_norm": 0.1388428170813898, "kl": 0.1043701171875, "learning_rate": 2.927461139896373e-08, "loss": -0.0003, "reward": 2.499995708465576, "reward_std": 1.4188635759637691e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3747 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.709844559585493, "grad_norm": 0.42583960805186893, "kl": 0.08740234375, "learning_rate": 2.9015544041450775e-08, "loss": 0.0014, "reward": 2.4999961853027344, "reward_std": 1.2106620488339104e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3748 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.712435233160623, "grad_norm": 0.5532428952726887, "kl": 0.154541015625, "learning_rate": 2.875647668393782e-08, "loss": 0.0021, "reward": 2.499993324279785, "reward_std": 6.417700859628894e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 3749 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 9.715025906735752, "grad_norm": 0.11391138009378936, "kl": 0.0450439453125, "learning_rate": 2.849740932642487e-08, "loss": 0.0005, "reward": 2.4999974966049194, "reward_std": 1.2666403677030758e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 3750 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.717616580310882, "grad_norm": 0.11065021245431056, "kl": 0.026123046875, "learning_rate": 2.8238341968911916e-08, "loss": 0.0001, "reward": 2.4999979734420776, "reward_std": 1.1482831041575992e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3751 }, { "clip_ratio": 0.0, "completion_length": 43.375, "epoch": 9.720207253886011, "grad_norm": 1.5924472506336977, "kl": 0.13525390625, "learning_rate": 2.7979274611398963e-08, "loss": 0.001, "reward": 2.4991310834884644, "reward_std": 3.743893216778815e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9991310834884644, "step": 3752 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.72279792746114, "grad_norm": 1.2232165373045096, "kl": 0.11962890625, "learning_rate": 2.772020725388601e-08, "loss": 0.0015, "reward": 2.4999979734420776, "reward_std": 2.0161964471299143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3753 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.72538860103627, "grad_norm": 0.8507279612928389, "kl": 0.1171875, "learning_rate": 2.7461139896373058e-08, "loss": -0.0002, "reward": 2.4999929666519165, "reward_std": 5.6426812307108776e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 3754 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.7279792746114, "grad_norm": 0.09535548030735087, "kl": 0.11920166015625, "learning_rate": 2.72020725388601e-08, "loss": 0.0014, "reward": 2.4999983310699463, "reward_std": 1.0069522033973044e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3755 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 9.73056994818653, "grad_norm": 0.730824011086081, "kl": 0.09716796875, "learning_rate": 2.694300518134715e-08, "loss": -0.0002, "reward": 2.4999955892562866, "reward_std": 1.9667504602693953e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3756 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.733160621761659, "grad_norm": 0.8076350988440982, "kl": 0.2841796875, "learning_rate": 2.6683937823834196e-08, "loss": -0.0003, "reward": 2.499995470046997, "reward_std": 5.1131971758877626e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3757 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.735751295336787, "grad_norm": 0.08843338906050775, "kl": 0.020599365234375, "learning_rate": 2.6424870466321246e-08, "loss": 0.0001, "reward": 2.4999985694885254, "reward_std": 1.2556012052300503e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3758 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.738341968911918, "grad_norm": 0.34430600955672397, "kl": 0.06402587890625, "learning_rate": 2.6165803108808288e-08, "loss": 0.0006, "reward": 2.4999780654907227, "reward_std": 5.834435455653875e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999780058860779, "step": 3759 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.740932642487046, "grad_norm": 0.21867047475382204, "kl": 0.1318359375, "learning_rate": 2.5906735751295334e-08, "loss": 0.001, "reward": 2.4999972581863403, "reward_std": 1.9290233694846393e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3760 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.743523316062177, "grad_norm": 12.424697339444409, "kl": 0.0635986328125, "learning_rate": 2.5647668393782384e-08, "loss": 0.0008, "reward": 2.499504566192627, "reward_std": 0.00010762697934296739, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995044469833374, "step": 3761 }, { "clip_ratio": 0.0, "completion_length": 40.5, "epoch": 9.746113989637305, "grad_norm": 0.09732615347676209, "kl": 0.076171875, "learning_rate": 2.5388601036269426e-08, "loss": 0.0006, "reward": 2.499998688697815, "reward_std": 7.077160688595541e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3762 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.748704663212436, "grad_norm": 0.20874124418875353, "kl": 0.089111328125, "learning_rate": 2.5129533678756476e-08, "loss": 0.0002, "reward": 2.4999966621398926, "reward_std": 2.5189233383571263e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 3763 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.751295336787564, "grad_norm": 0.21354488257449036, "kl": 0.0618896484375, "learning_rate": 2.4870466321243522e-08, "loss": 0.0001, "reward": 2.499998688697815, "reward_std": 7.022393333500077e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3764 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.753886010362695, "grad_norm": 0.2755086202745854, "kl": 0.0572509765625, "learning_rate": 2.4611398963730568e-08, "loss": -0.0009, "reward": 2.499998450279236, "reward_std": 1.4400384884538653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3765 }, { "clip_ratio": 0.0, "completion_length": 46.75, "epoch": 9.756476683937823, "grad_norm": 0.35048022202851337, "kl": 0.083984375, "learning_rate": 2.4352331606217614e-08, "loss": 0.0003, "reward": 2.4999985694885254, "reward_std": 1.302407753200896e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3766 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.759067357512954, "grad_norm": 0.483947397107209, "kl": 0.07928466796875, "learning_rate": 2.4093264248704663e-08, "loss": 0.0003, "reward": 2.4999979734420776, "reward_std": 1.40744661791814e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3767 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.761658031088082, "grad_norm": 0.6366106560493222, "kl": 0.169677734375, "learning_rate": 2.383419689119171e-08, "loss": 0.0011, "reward": 2.4999947547912598, "reward_std": 6.251258469092136e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 3768 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 9.764248704663213, "grad_norm": 0.08281127509771855, "kl": 0.035491943359375, "learning_rate": 2.3575129533678756e-08, "loss": 0.0018, "reward": 2.4999992847442627, "reward_std": 9.008508925489878e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999225139618, "step": 3769 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.766839378238341, "grad_norm": 0.6832836921676207, "kl": 0.0638427734375, "learning_rate": 2.33160621761658e-08, "loss": 0.0008, "reward": 2.4999916553497314, "reward_std": 6.0702271866830415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 3770 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.76943005181347, "grad_norm": 3.6861995113390686, "kl": 0.159423828125, "learning_rate": 2.305699481865285e-08, "loss": 0.0014, "reward": 1.9999116659164429, "reward_std": 1.4138345534320251e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499911606311798, "step": 3771 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.7720207253886, "grad_norm": 0.10846325725666668, "kl": 0.0660400390625, "learning_rate": 2.2797927461139894e-08, "loss": 0.0011, "reward": 2.499998927116394, "reward_std": 1.1338075864841812e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 3772 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.77461139896373, "grad_norm": 3.0532155788106303, "kl": 0.0712890625, "learning_rate": 2.2538860103626943e-08, "loss": 0.0011, "reward": 2.4999903440475464, "reward_std": 1.2738880968754529e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 3773 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.77720207253886, "grad_norm": 0.16012862940305056, "kl": 0.105224609375, "learning_rate": 2.227979274611399e-08, "loss": 0.0003, "reward": 2.499997138977051, "reward_std": 1.5832946473892662e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3774 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.779792746113989, "grad_norm": 0.38012114556157567, "kl": 0.061767578125, "learning_rate": 2.2020725388601035e-08, "loss": -0.0005, "reward": 2.49999737739563, "reward_std": 2.7560788282698923e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3775 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.782383419689118, "grad_norm": 0.13686214008524414, "kl": 0.14990234375, "learning_rate": 2.176165803108808e-08, "loss": -0.0006, "reward": 2.499997138977051, "reward_std": 1.2834217670842918e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3776 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.784974093264248, "grad_norm": 0.3184854916433416, "kl": 0.03997802734375, "learning_rate": 2.1502590673575127e-08, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 2.1056638388472493e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3777 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.787564766839377, "grad_norm": 8.066807309495843, "kl": 0.070556640625, "learning_rate": 2.1243523316062177e-08, "loss": -0.0006, "reward": 1.8779073357582092, "reward_std": 0.0009095087913237876, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3779075145721436, "step": 3778 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.790155440414507, "grad_norm": 0.6789921476984827, "kl": 0.034271240234375, "learning_rate": 2.098445595854922e-08, "loss": 0.0002, "reward": 2.4999947547912598, "reward_std": 4.314764424862005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 3779 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.792746113989637, "grad_norm": 0.2352691558841816, "kl": 0.09619140625, "learning_rate": 2.072538860103627e-08, "loss": 0.0004, "reward": 2.4999959468841553, "reward_std": 2.3755790152790723e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3780 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.795336787564766, "grad_norm": 0.8135600311008213, "kl": 0.06927490234375, "learning_rate": 2.0466321243523315e-08, "loss": 0.0007, "reward": 2.49999463558197, "reward_std": 4.681737550527032e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 3781 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.797927461139896, "grad_norm": 0.23711186430697373, "kl": 0.1201171875, "learning_rate": 2.0207253886010364e-08, "loss": 0.0005, "reward": 2.499998688697815, "reward_std": 1.4003129535922199e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 3782 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.800518134715025, "grad_norm": 0.09701392223168806, "kl": 0.044921875, "learning_rate": 1.9948186528497407e-08, "loss": 0.0, "reward": 2.499997615814209, "reward_std": 1.5335364764723636e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3783 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.803108808290155, "grad_norm": 3.6650597202543524, "kl": 0.1097412109375, "learning_rate": 1.9689119170984456e-08, "loss": -0.0007, "reward": 1.9576569199562073, "reward_std": 0.00036408414968036595, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4576569199562073, "step": 3784 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.805699481865284, "grad_norm": 0.1549309194630578, "kl": 0.120361328125, "learning_rate": 1.9430051813471502e-08, "loss": 0.0006, "reward": 2.499997138977051, "reward_std": 1.5427621633534727e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3785 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.808290155440414, "grad_norm": 1.903389716145147, "kl": 0.0662841796875, "learning_rate": 1.917098445595855e-08, "loss": -0.0004, "reward": 2.4999911785125732, "reward_std": 6.992621365498053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 3786 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.810880829015543, "grad_norm": 1.4107135367160948, "kl": 0.03857421875, "learning_rate": 1.8911917098445595e-08, "loss": 0.0005, "reward": 1.9999117851257324, "reward_std": 8.431825790466974e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49991175532341, "step": 3787 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.813471502590673, "grad_norm": 10.410461627687278, "kl": 0.17431640625, "learning_rate": 1.865284974093264e-08, "loss": 0.0007, "reward": 2.4999330043792725, "reward_std": 3.300693776964181e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999932885169983, "step": 3788 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.816062176165802, "grad_norm": 0.13850158191515322, "kl": 0.06622314453125, "learning_rate": 1.839378238341969e-08, "loss": -0.0002, "reward": 2.499997854232788, "reward_std": 1.2750875839628861e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3789 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.818652849740932, "grad_norm": 10.883083135222899, "kl": 0.2137451171875, "learning_rate": 1.8134715025906733e-08, "loss": 0.0007, "reward": 2.499997138977051, "reward_std": 2.111229832735262e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3790 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 9.821243523316062, "grad_norm": 18.886210713268035, "kl": 0.14404296875, "learning_rate": 1.7875647668393782e-08, "loss": 0.0008, "reward": 1.997437298297882, "reward_std": 0.0010331470175515278, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4974372386932373, "step": 3791 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 9.823834196891191, "grad_norm": 4.971805995939417, "kl": 0.114990234375, "learning_rate": 1.7616580310880828e-08, "loss": 0.0001, "reward": 2.499287962913513, "reward_std": 8.454972521576565e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999288260936737, "step": 3792 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.82642487046632, "grad_norm": 0.027683992269848197, "kl": 0.02801513671875, "learning_rate": 1.7357512953367874e-08, "loss": 0.0006, "reward": 2.499999761581421, "reward_std": 3.2138270000814373e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999998807907104, "step": 3793 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.82901554404145, "grad_norm": 0.19131455714986165, "kl": 0.03594970703125, "learning_rate": 1.709844559585492e-08, "loss": -0.0, "reward": 2.499993085861206, "reward_std": 3.7247832551656757e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 3794 }, { "clip_ratio": 0.0, "completion_length": 37.25, "epoch": 9.83160621761658, "grad_norm": 0.268676017848872, "kl": 0.076416015625, "learning_rate": 1.683937823834197e-08, "loss": 0.0007, "reward": 2.4999982118606567, "reward_std": 1.912419520522235e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3795 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.83419689119171, "grad_norm": 11.16899731001852, "kl": 0.1209716796875, "learning_rate": 1.6580310880829012e-08, "loss": 0.0007, "reward": 1.9954026937484741, "reward_std": 4.655531211028574e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4954026639461517, "step": 3796 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.836787564766839, "grad_norm": 0.12228814922604171, "kl": 0.074951171875, "learning_rate": 1.6321243523316062e-08, "loss": 0.0003, "reward": 2.499998450279236, "reward_std": 9.517974888240133e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3797 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.839378238341968, "grad_norm": 0.7387112632691737, "kl": 0.043701171875, "learning_rate": 1.6062176165803108e-08, "loss": -0.0006, "reward": 2.4999940395355225, "reward_std": 5.556024859743047e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 3798 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.841968911917098, "grad_norm": 0.45322319591205873, "kl": 0.197021484375, "learning_rate": 1.5803108808290157e-08, "loss": 0.0009, "reward": 2.499998688697815, "reward_std": 1.6974895231669507e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3799 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.844559585492227, "grad_norm": 5.754712170370226, "kl": 0.171875, "learning_rate": 1.55440414507772e-08, "loss": 0.001, "reward": 1.9969114661216736, "reward_std": 9.631042428281944e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4969114065170288, "step": 3800 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.847150259067357, "grad_norm": 0.34259504188562123, "kl": 0.0648193359375, "learning_rate": 1.528497409326425e-08, "loss": 0.0011, "reward": 2.4999871253967285, "reward_std": 5.228084660302557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 3801 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.849740932642487, "grad_norm": 0.4125725875153488, "kl": 0.078369140625, "learning_rate": 1.5025906735751295e-08, "loss": -0.0001, "reward": 2.499997138977051, "reward_std": 2.2249514586292207e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3802 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.852331606217616, "grad_norm": 0.08710045620115275, "kl": 0.0673828125, "learning_rate": 1.4766839378238341e-08, "loss": 0.0001, "reward": 2.4999977350234985, "reward_std": 1.6110904539345938e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3803 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.854922279792746, "grad_norm": 0.33984201757901383, "kl": 0.09906005859375, "learning_rate": 1.4507772020725387e-08, "loss": -0.0004, "reward": 2.499993681907654, "reward_std": 4.170523908442192e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 3804 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 9.857512953367875, "grad_norm": 31.47611454692869, "kl": 0.117431640625, "learning_rate": 1.4248704663212435e-08, "loss": 0.0004, "reward": 1.3739087581634521, "reward_std": 0.0029855262000637595, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8739087581634521, "step": 3805 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.860103626943005, "grad_norm": 5.4101952767568235, "kl": 0.240234375, "learning_rate": 1.3989637305699481e-08, "loss": 0.0017, "reward": 1.9838261604309082, "reward_std": 0.00014016915372394578, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4838260114192963, "step": 3806 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.862694300518134, "grad_norm": 0.13807693827247153, "kl": 0.129638671875, "learning_rate": 1.3730569948186529e-08, "loss": 0.0007, "reward": 2.4999992847442627, "reward_std": 1.2886196998351807e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 3807 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.865284974093264, "grad_norm": 0.1653047828971348, "kl": 0.083984375, "learning_rate": 1.3471502590673575e-08, "loss": -0.0002, "reward": 2.499993324279785, "reward_std": 2.6324541977373883e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 3808 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.867875647668393, "grad_norm": 2.2073017229864718, "kl": 0.144287109375, "learning_rate": 1.3212435233160623e-08, "loss": 0.0012, "reward": 2.4999918937683105, "reward_std": 5.663033562086639e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 3809 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.870466321243523, "grad_norm": 2.7256545740276388, "kl": 0.11083984375, "learning_rate": 1.2953367875647667e-08, "loss": 0.0012, "reward": 2.4999773502349854, "reward_std": 1.5705370060459245e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999773502349854, "step": 3810 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.873056994818652, "grad_norm": 0.043831968747362234, "kl": 0.07861328125, "learning_rate": 1.2694300518134713e-08, "loss": 0.0006, "reward": 2.4999985694885254, "reward_std": 1.053522794336459e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3811 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.875647668393782, "grad_norm": 4.052878215256439, "kl": 0.17236328125, "learning_rate": 1.2435233160621761e-08, "loss": 0.0019, "reward": 1.9773967862129211, "reward_std": 9.506414698989829e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4773966372013092, "step": 3812 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.878238341968911, "grad_norm": 0.07883734544555218, "kl": 0.08740234375, "learning_rate": 1.2176165803108807e-08, "loss": -0.0001, "reward": 2.4999990463256836, "reward_std": 1.0058341501917312e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3813 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.880829015544041, "grad_norm": 0.8387202095200216, "kl": 0.072265625, "learning_rate": 1.1917098445595855e-08, "loss": -0.0001, "reward": 2.4999899864196777, "reward_std": 6.068301331652037e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989926815033, "step": 3814 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.88341968911917, "grad_norm": 0.09195400589406262, "kl": 0.04876708984375, "learning_rate": 1.16580310880829e-08, "loss": 0.0018, "reward": 2.4999992847442627, "reward_std": 5.637899107568956e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999225139618, "step": 3815 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.8860103626943, "grad_norm": 0.3130338860200259, "kl": 0.063720703125, "learning_rate": 1.1398963730569947e-08, "loss": 0.0008, "reward": 2.4999966621398926, "reward_std": 2.389611012176829e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3816 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.88860103626943, "grad_norm": 13.969737682621586, "kl": 0.114013671875, "learning_rate": 1.1139896373056995e-08, "loss": 0.0005, "reward": 1.8711974620819092, "reward_std": 0.0005802772393508349, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.371197521686554, "step": 3817 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.89119170984456, "grad_norm": 0.2562096828663348, "kl": 0.1265869140625, "learning_rate": 1.088082901554404e-08, "loss": 0.0011, "reward": 2.499996304512024, "reward_std": 2.902668029491906e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 3818 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.893782383419689, "grad_norm": 0.34878278273336216, "kl": 0.1689453125, "learning_rate": 1.0621761658031088e-08, "loss": 0.0001, "reward": 2.4999897480010986, "reward_std": 4.163092853559647e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898672103882, "step": 3819 }, { "clip_ratio": 0.0, "completion_length": 43.5625, "epoch": 9.896373056994818, "grad_norm": 0.2340846663444792, "kl": 0.079132080078125, "learning_rate": 1.0362694300518134e-08, "loss": -0.0006, "reward": 2.499996304512024, "reward_std": 1.4858701575803934e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3820 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.898963730569948, "grad_norm": 0.15328728437451827, "kl": 0.034423828125, "learning_rate": 1.0103626943005182e-08, "loss": 0.0004, "reward": 2.4999972581863403, "reward_std": 1.6103452935567475e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3821 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.901554404145077, "grad_norm": 2.0662344878503167, "kl": 0.116455078125, "learning_rate": 9.844559585492228e-09, "loss": -0.0013, "reward": 2.4999799728393555, "reward_std": 1.2334268490121758e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999802708625793, "step": 3822 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.904145077720207, "grad_norm": 0.13061760856817434, "kl": 0.070556640625, "learning_rate": 9.585492227979274e-09, "loss": 0.0013, "reward": 2.499997615814209, "reward_std": 1.5229693843821224e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3823 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.906735751295336, "grad_norm": 0.12471926449151656, "kl": 0.09814453125, "learning_rate": 9.32642487046632e-09, "loss": -0.0, "reward": 2.4999982118606567, "reward_std": 1.3124187603352766e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3824 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.909326424870466, "grad_norm": 7.222829128264534, "kl": 0.104736328125, "learning_rate": 9.067357512953366e-09, "loss": 0.0002, "reward": 1.8456906080245972, "reward_std": 0.0002547140768456302, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3456905782222748, "step": 3825 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.911917098445596, "grad_norm": 0.5588595224099412, "kl": 0.073974609375, "learning_rate": 8.808290155440414e-09, "loss": 0.0003, "reward": 2.4999935626983643, "reward_std": 4.125848022340506e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 3826 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.914507772020725, "grad_norm": 0.2325979242361352, "kl": 0.189208984375, "learning_rate": 8.54922279792746e-09, "loss": 0.0018, "reward": 2.499996781349182, "reward_std": 2.4484774030497647e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3827 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.917098445595855, "grad_norm": 0.15675002373169003, "kl": 0.110748291015625, "learning_rate": 8.290155440414506e-09, "loss": 0.0012, "reward": 2.4999982118606567, "reward_std": 2.0395184776589304e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 3828 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.919689119170984, "grad_norm": 0.2268745885535705, "kl": 0.22705078125, "learning_rate": 8.031088082901554e-09, "loss": 0.0008, "reward": 2.4999966621398926, "reward_std": 2.531847286491029e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3829 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.922279792746114, "grad_norm": 0.4492763865366213, "kl": 0.092041015625, "learning_rate": 7.7720207253886e-09, "loss": -0.0001, "reward": 2.499997615814209, "reward_std": 1.4182851941768604e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3830 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.924870466321243, "grad_norm": 69.35659079488879, "kl": 0.0579833984375, "learning_rate": 7.512953367875648e-09, "loss": 0.0002, "reward": 2.311954617500305, "reward_std": 0.4093087166547775, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8119547367095947, "step": 3831 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.927461139896373, "grad_norm": 0.26442715787853494, "kl": 0.098876953125, "learning_rate": 7.253886010362694e-09, "loss": 0.0001, "reward": 2.499997854232788, "reward_std": 4.102021307517134e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3832 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.930051813471502, "grad_norm": 8.258552383003853, "kl": 0.172119140625, "learning_rate": 6.994818652849741e-09, "loss": -0.0001, "reward": 1.9993054866790771, "reward_std": 0.0001129508577832894, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499305635690689, "step": 3833 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.932642487046632, "grad_norm": 0.14806338613805528, "kl": 0.054443359375, "learning_rate": 6.7357512953367875e-09, "loss": 0.0002, "reward": 2.499997615814209, "reward_std": 2.095347980457518e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3834 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.935233160621761, "grad_norm": 7.72035290418325, "kl": 0.213623046875, "learning_rate": 6.476683937823834e-09, "loss": 0.0013, "reward": 1.9992157220840454, "reward_std": 3.539934141372214e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992157220840454, "step": 3835 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.937823834196891, "grad_norm": 0.05579149319398992, "kl": 0.059814453125, "learning_rate": 6.2176165803108805e-09, "loss": -0.0003, "reward": 2.499995231628418, "reward_std": 1.7385888213539147e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3836 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 9.94041450777202, "grad_norm": 30.295096999876026, "kl": 0.115966796875, "learning_rate": 5.958549222797927e-09, "loss": 0.0005, "reward": 1.8402210474014282, "reward_std": 0.44291423028334975, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.3714709281921387, "step": 3837 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.94300518134715, "grad_norm": 4.001748549761226, "kl": 0.091552734375, "learning_rate": 5.6994818652849734e-09, "loss": -0.0003, "reward": 1.9984712600708008, "reward_std": 3.817616459400597e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984712898731232, "step": 3838 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.94559585492228, "grad_norm": 0.4713359497333933, "kl": 0.098388671875, "learning_rate": 5.44041450777202e-09, "loss": 0.0001, "reward": 2.49996018409729, "reward_std": 4.469998316380952e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999603629112244, "step": 3839 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.94818652849741, "grad_norm": 0.2140247823671603, "kl": 0.0902099609375, "learning_rate": 5.181347150259067e-09, "loss": 0.0014, "reward": 2.4999983310699463, "reward_std": 1.5245344116010529e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3840 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 9.950777202072539, "grad_norm": 0.2858650930958002, "kl": 0.10693359375, "learning_rate": 4.922279792746114e-09, "loss": 0.0004, "reward": 1.4999994039535522, "reward_std": 4.4831026002611907e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999994039535522, "step": 3841 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 9.953367875647668, "grad_norm": 13.190728178945553, "kl": 0.140625, "learning_rate": 4.66321243523316e-09, "loss": 0.0005, "reward": 1.9365031719207764, "reward_std": 0.1770339813665487, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4365032315254211, "step": 3842 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.955958549222798, "grad_norm": 53.699696379960415, "kl": 0.19580078125, "learning_rate": 4.404145077720207e-09, "loss": 0.0017, "reward": 1.9937176704406738, "reward_std": 0.00021178188774229056, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.493717610836029, "step": 3843 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.958549222797927, "grad_norm": 7.049552785125113, "kl": 0.181884765625, "learning_rate": 4.145077720207253e-09, "loss": 0.0012, "reward": 1.8341238498687744, "reward_std": 0.0006964670924389793, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3341239094734192, "step": 3844 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.961139896373057, "grad_norm": 5.614093492022736, "kl": 0.09979248046875, "learning_rate": 3.8860103626943e-09, "loss": -0.0002, "reward": 1.9971725940704346, "reward_std": 9.772756385473258e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4971727430820465, "step": 3845 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 9.963730569948186, "grad_norm": 0.3445647701931748, "kl": 0.02587890625, "learning_rate": 3.626943005181347e-09, "loss": 0.0007, "reward": 2.4999972581863403, "reward_std": 1.8021534629042435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3846 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.966321243523316, "grad_norm": 0.4810547212173195, "kl": 0.0546875, "learning_rate": 3.3678756476683938e-09, "loss": 0.0011, "reward": 2.4999938011169434, "reward_std": 3.3109392916230718e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993622303009, "step": 3847 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.968911917098445, "grad_norm": 0.13376463289846693, "kl": 0.0560302734375, "learning_rate": 3.1088082901554402e-09, "loss": 0.0011, "reward": 2.499996781349182, "reward_std": 1.5553832781733945e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 3848 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.971502590673575, "grad_norm": 1.2757466854015824, "kl": 0.28125, "learning_rate": 2.8497409326424867e-09, "loss": 0.001, "reward": 2.499996781349182, "reward_std": 3.6028842487212387e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3849 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.974093264248705, "grad_norm": 0.39452912377638044, "kl": 0.102294921875, "learning_rate": 2.5906735751295336e-09, "loss": 0.0011, "reward": 2.4999979734420776, "reward_std": 1.9650126432679826e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3850 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.976683937823834, "grad_norm": 2.2497881424985975, "kl": 0.10693359375, "learning_rate": 2.33160621761658e-09, "loss": 0.0004, "reward": 2.4999911785125732, "reward_std": 5.426493714821845e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999910593032837, "step": 3851 }, { "clip_ratio": 0.0, "completion_length": 45.625, "epoch": 9.979274611398964, "grad_norm": 0.18175581664420132, "kl": 0.09375, "learning_rate": 2.0725388601036265e-09, "loss": 0.0002, "reward": 2.499997138977051, "reward_std": 2.0627428511943435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3852 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.981865284974093, "grad_norm": 0.05445230798105908, "kl": 0.0633544921875, "learning_rate": 1.8134715025906734e-09, "loss": -0.0006, "reward": 2.4999982118606567, "reward_std": 1.2222710097375966e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3853 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.984455958549223, "grad_norm": 0.5135737435963933, "kl": 0.06036376953125, "learning_rate": 1.5544041450777201e-09, "loss": -0.0002, "reward": 2.499983072280884, "reward_std": 4.350587573753728e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999831318855286, "step": 3854 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.987046632124352, "grad_norm": 7.172873160329153, "kl": 0.0906982421875, "learning_rate": 1.2953367875647668e-09, "loss": 0.0008, "reward": 1.8064342141151428, "reward_std": 0.0003627554459626481, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3064343333244324, "step": 3855 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.989637305699482, "grad_norm": 0.5414356371216714, "kl": 0.119873046875, "learning_rate": 1.0362694300518133e-09, "loss": -0.0008, "reward": 2.4999979734420776, "reward_std": 1.8909098002950486e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3856 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.992227979274611, "grad_norm": 0.4227119487710692, "kl": 0.1697998046875, "learning_rate": 7.772020725388601e-10, "loss": 0.0011, "reward": 2.499998927116394, "reward_std": 1.0097894858063228e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 3857 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.994818652849741, "grad_norm": 0.11585945811358273, "kl": 0.03851318359375, "learning_rate": 5.181347150259066e-10, "loss": 0.0018, "reward": 2.4999969005584717, "reward_std": 1.918579926041275e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3858 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.99740932642487, "grad_norm": 0.12062776047917384, "kl": 0.0626220703125, "learning_rate": 2.590673575129533e-10, "loss": 0.0002, "reward": 2.4999982118606567, "reward_std": 1.8869767473006505e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3859 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 10.0, "grad_norm": 0.3670105380284728, "kl": 0.0908203125, "learning_rate": 0.0, "loss": 0.0009, "reward": 2.4999964237213135, "reward_std": 1.6337946817657212e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3860 } ], "logging_steps": 1.0, "max_steps": 3860, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }