{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7008936393902225, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 5.71875, "epoch": 0.00017522340984755565, "grad_norm": 23.31555964997353, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 1.4438997507095337, "reward_std": 0.22532765567302704, "rewards/accuracy_reward_stage2": 0.4438997805118561, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1 }, { "completion_length": 7.21875, "epoch": 0.0003504468196951113, "grad_norm": 22.065290449218974, "kl": -8.940696716308594e-06, "learning_rate": 9.998247765901524e-07, "loss": -0.0, "reward": 1.4763569831848145, "reward_std": 0.2327914535999298, "rewards/accuracy_reward_stage2": 0.47635695338249207, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2 }, { "completion_length": 11.1875, "epoch": 0.0005256702295426669, "grad_norm": 26.456861704069485, "kl": 0.0001163482666015625, "learning_rate": 9.99649553180305e-07, "loss": 0.0, "reward": 1.4022423028945923, "reward_std": 0.2888947129249573, "rewards/accuracy_reward_stage2": 0.4022422432899475, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3 }, { "completion_length": 17.140625, "epoch": 0.0007008936393902226, "grad_norm": 23.624755117010494, "kl": 0.00738525390625, "learning_rate": 9.994743297704572e-07, "loss": 0.0029, "reward": 1.3010417222976685, "reward_std": 0.14182603359222412, "rewards/accuracy_reward_stage2": 0.5510416626930237, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 4 }, { "completion_length": 14.78125, "epoch": 0.0008761170492377782, "grad_norm": 22.987286296145836, "kl": 0.00022792816162109375, "learning_rate": 9.992991063606097e-07, "loss": -0.088, "reward": 1.281743049621582, "reward_std": 0.25567078590393066, "rewards/accuracy_reward_stage2": 0.3129930794239044, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 5 }, { "completion_length": 9.703125, "epoch": 0.0010513404590853338, "grad_norm": 33.07972699991074, "kl": 1.3470649719238281e-05, "learning_rate": 9.991238829507622e-07, "loss": 0.0, "reward": 1.5713826417922974, "reward_std": 0.37181052565574646, "rewards/accuracy_reward_stage2": 0.5713826417922974, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 6 }, { "completion_length": 10.71875, "epoch": 0.0012265638689328894, "grad_norm": 27.974285828488036, "kl": 7.2479248046875e-05, "learning_rate": 9.989486595409147e-07, "loss": 0.0, "reward": 1.3608198165893555, "reward_std": 0.2502235174179077, "rewards/accuracy_reward_stage2": 0.360819935798645, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 7 }, { "completion_length": 8.515625, "epoch": 0.0014017872787804452, "grad_norm": 33.13357399660305, "kl": 0.103515625, "learning_rate": 9.98773436131067e-07, "loss": 0.0521, "reward": 1.3532755374908447, "reward_std": 0.29816100001335144, "rewards/accuracy_reward_stage2": 0.4782755672931671, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 8 }, { "completion_length": 8.359375, "epoch": 0.0015770106886280008, "grad_norm": 24.905794912365085, "kl": -9.1552734375e-05, "learning_rate": 9.985982127212195e-07, "loss": -0.0, "reward": 1.4549081325531006, "reward_std": 0.328605592250824, "rewards/accuracy_reward_stage2": 0.4549080431461334, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 9 }, { "completion_length": 12.96875, "epoch": 0.0017522340984755564, "grad_norm": 67.46304155608689, "kl": 0.03955078125, "learning_rate": 9.98422989311372e-07, "loss": 0.0158, "reward": 1.197622537612915, "reward_std": 0.2530099153518677, "rewards/accuracy_reward_stage2": 0.32262250781059265, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 10 }, { "completion_length": 12.046875, "epoch": 0.001927457508323112, "grad_norm": 20.500299418605792, "kl": 0.000965118408203125, "learning_rate": 9.982477659015245e-07, "loss": -0.0418, "reward": 1.432761311531067, "reward_std": 0.2917559742927551, "rewards/accuracy_reward_stage2": 0.4483863115310669, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 11 }, { "completion_length": 14.453125, "epoch": 0.0021026809181706675, "grad_norm": 167876.85932408215, "kl": 70.5, "learning_rate": 9.980725424916767e-07, "loss": 28.0895, "reward": 1.2883098125457764, "reward_std": 0.1935170292854309, "rewards/accuracy_reward_stage2": 0.5383098125457764, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 12 }, { "completion_length": 9.359375, "epoch": 0.002277904328018223, "grad_norm": 21.528023020071775, "kl": 0.00066375732421875, "learning_rate": 9.978973190818292e-07, "loss": 0.0003, "reward": 1.469854712486267, "reward_std": 0.2446746528148651, "rewards/accuracy_reward_stage2": 0.4698547124862671, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 13 }, { "completion_length": 13.0, "epoch": 0.0024531277378657787, "grad_norm": 23.680931373549402, "kl": 0.002105712890625, "learning_rate": 9.977220956719817e-07, "loss": 0.0008, "reward": 1.2903798818588257, "reward_std": 0.22621138393878937, "rewards/accuracy_reward_stage2": 0.2903798520565033, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 14 }, { "completion_length": 13.421875, "epoch": 0.0026283511477133343, "grad_norm": 20.564036006232385, "kl": 0.00128936767578125, "learning_rate": 9.975468722621342e-07, "loss": 0.0005, "reward": 1.6679387092590332, "reward_std": 0.17764630913734436, "rewards/accuracy_reward_stage2": 0.6679386496543884, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 15 }, { "completion_length": 9.46875, "epoch": 0.0028035745575608903, "grad_norm": 26.089703097200466, "kl": 0.00125885009765625, "learning_rate": 9.973716488522867e-07, "loss": 0.0005, "reward": 1.4319759607315063, "reward_std": 0.27263617515563965, "rewards/accuracy_reward_stage2": 0.43197596073150635, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 16 }, { "completion_length": 7.03125, "epoch": 0.002978797967408446, "grad_norm": 16.17891999936704, "kl": 0.00089263916015625, "learning_rate": 9.97196425442439e-07, "loss": -0.0438, "reward": 1.4551277160644531, "reward_std": 0.12339088320732117, "rewards/accuracy_reward_stage2": 0.5957527160644531, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 17 }, { "completion_length": 7.875, "epoch": 0.0031540213772560015, "grad_norm": 24.111914676200755, "kl": 0.00469970703125, "learning_rate": 9.970212020325915e-07, "loss": 0.0019, "reward": 1.3679840564727783, "reward_std": 0.30560600757598877, "rewards/accuracy_reward_stage2": 0.4929840862751007, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 18 }, { "completion_length": 14.4375, "epoch": 0.003329244787103557, "grad_norm": 20.91886024380496, "kl": 0.002838134765625, "learning_rate": 9.96845978622744e-07, "loss": 0.0011, "reward": 1.330362319946289, "reward_std": 0.1731535941362381, "rewards/accuracy_reward_stage2": 0.3303622603416443, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 19 }, { "completion_length": 15.59375, "epoch": 0.0035044681969511127, "grad_norm": 42139.41750782583, "kl": 428.0, "learning_rate": 9.966707552128965e-07, "loss": 171.9276, "reward": 1.3618611097335815, "reward_std": 0.15837247669696808, "rewards/accuracy_reward_stage2": 0.48686110973358154, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 20 }, { "completion_length": 6.1875, "epoch": 0.0036796916067986683, "grad_norm": 13.026390779558632, "kl": 0.0001850128173828125, "learning_rate": 9.964955318030487e-07, "loss": 0.0001, "reward": 1.7447917461395264, "reward_std": 0.13045889139175415, "rewards/accuracy_reward_stage2": 0.7447916865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 21 }, { "completion_length": 7.0, "epoch": 0.003854915016646224, "grad_norm": 23.78421408951673, "kl": 0.0029754638671875, "learning_rate": 9.963203083932012e-07, "loss": -0.0322, "reward": 1.2731072902679443, "reward_std": 0.18740758299827576, "rewards/accuracy_reward_stage2": 0.28873229026794434, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 22 }, { "completion_length": 11.46875, "epoch": 0.0040301384264937795, "grad_norm": 22.753424574491618, "kl": 0.0037078857421875, "learning_rate": 9.961450849833537e-07, "loss": -0.0449, "reward": 1.2642568349838257, "reward_std": 0.35765278339385986, "rewards/accuracy_reward_stage2": 0.29550686478614807, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 23 }, { "completion_length": 8.703125, "epoch": 0.004205361836341335, "grad_norm": 24.427356484908216, "kl": 0.0021514892578125, "learning_rate": 9.959698615735062e-07, "loss": 0.0009, "reward": 1.3861404657363892, "reward_std": 0.25101006031036377, "rewards/accuracy_reward_stage2": 0.5111405253410339, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 24 }, { "completion_length": 10.578125, "epoch": 0.004380585246188891, "grad_norm": 11105.321622372509, "kl": 11.875, "learning_rate": 9.957946381636585e-07, "loss": 4.7301, "reward": 1.416548490524292, "reward_std": 0.22747981548309326, "rewards/accuracy_reward_stage2": 0.5415483713150024, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 25 }, { "completion_length": 8.78125, "epoch": 0.004555808656036446, "grad_norm": 25.76102538605346, "kl": 0.004913330078125, "learning_rate": 9.95619414753811e-07, "loss": 0.0147, "reward": 1.6482062339782715, "reward_std": 0.191350519657135, "rewards/accuracy_reward_stage2": 0.7732061743736267, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 26 }, { "completion_length": 7.140625, "epoch": 0.004731032065884002, "grad_norm": 26.572344191821514, "kl": 0.003387451171875, "learning_rate": 9.954441913439635e-07, "loss": 0.0014, "reward": 1.5845115184783936, "reward_std": 0.24853208661079407, "rewards/accuracy_reward_stage2": 0.5845115780830383, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 27 }, { "completion_length": 12.734375, "epoch": 0.0049062554757315574, "grad_norm": 24.260368773277218, "kl": 0.00616455078125, "learning_rate": 9.95268967934116e-07, "loss": 0.0025, "reward": 1.5300661325454712, "reward_std": 0.3158077895641327, "rewards/accuracy_reward_stage2": 0.5300660729408264, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 28 }, { "completion_length": 8.765625, "epoch": 0.005081478885579113, "grad_norm": 15.78198731506964, "kl": 0.002166748046875, "learning_rate": 9.950937445242685e-07, "loss": 0.0009, "reward": 1.3041990995407104, "reward_std": 0.12750419974327087, "rewards/accuracy_reward_stage2": 0.42919909954071045, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 29 }, { "completion_length": 11.03125, "epoch": 0.005256702295426669, "grad_norm": 20.63873452976797, "kl": 0.003326416015625, "learning_rate": 9.94918521114421e-07, "loss": 0.0013, "reward": 1.094714641571045, "reward_std": 0.12758338451385498, "rewards/accuracy_reward_stage2": 0.2197147160768509, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 30 }, { "completion_length": 8.859375, "epoch": 0.005431925705274224, "grad_norm": 21.50035066177145, "kl": 0.0181884765625, "learning_rate": 9.947432977045732e-07, "loss": -0.0217, "reward": 1.540401816368103, "reward_std": 0.23430338501930237, "rewards/accuracy_reward_stage2": 0.6810267567634583, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 31 }, { "completion_length": 12.390625, "epoch": 0.005607149115121781, "grad_norm": 29.83623754865519, "kl": 0.0216064453125, "learning_rate": 9.945680742947257e-07, "loss": 0.0087, "reward": 1.25832998752594, "reward_std": 0.26882484555244446, "rewards/accuracy_reward_stage2": 0.3833300471305847, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 32 }, { "completion_length": 10.9375, "epoch": 0.005782372524969336, "grad_norm": 26.143293218614, "kl": 0.0198974609375, "learning_rate": 9.94392850884878e-07, "loss": 0.008, "reward": 1.2494080066680908, "reward_std": 0.2645424008369446, "rewards/accuracy_reward_stage2": 0.49940791726112366, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 33 }, { "completion_length": 9.53125, "epoch": 0.005957595934816892, "grad_norm": 21.480867023451303, "kl": 0.0059814453125, "learning_rate": 9.942176274750305e-07, "loss": 0.0024, "reward": 1.648768663406372, "reward_std": 0.16991084814071655, "rewards/accuracy_reward_stage2": 0.6487685441970825, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 34 }, { "completion_length": 9.6875, "epoch": 0.0061328193446644474, "grad_norm": 22.526380129092917, "kl": 0.00390625, "learning_rate": 9.94042404065183e-07, "loss": 0.0016, "reward": 1.520120620727539, "reward_std": 0.2722627818584442, "rewards/accuracy_reward_stage2": 0.5201205015182495, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 35 }, { "completion_length": 12.875, "epoch": 0.006308042754512003, "grad_norm": 28.92262587524997, "kl": 0.234375, "learning_rate": 9.938671806553355e-07, "loss": 0.0936, "reward": 1.1792454719543457, "reward_std": 0.11877614259719849, "rewards/accuracy_reward_stage2": 0.3042455315589905, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 36 }, { "completion_length": 5.96875, "epoch": 0.006483266164359559, "grad_norm": 21.198214257246605, "kl": 0.00689697265625, "learning_rate": 9.93691957245488e-07, "loss": 0.0028, "reward": 1.6057288646697998, "reward_std": 0.12137105315923691, "rewards/accuracy_reward_stage2": 0.730728805065155, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 37 }, { "completion_length": 7.921875, "epoch": 0.006658489574207114, "grad_norm": 22.106502658208328, "kl": 0.01348876953125, "learning_rate": 9.935167338356405e-07, "loss": 0.0054, "reward": 1.5812971591949463, "reward_std": 0.2364519238471985, "rewards/accuracy_reward_stage2": 0.5812971591949463, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 38 }, { "completion_length": 13.046875, "epoch": 0.00683371298405467, "grad_norm": 24.54112522332538, "kl": 0.00958251953125, "learning_rate": 9.933415104257928e-07, "loss": 0.0038, "reward": 1.3349295854568481, "reward_std": 0.3648528456687927, "rewards/accuracy_reward_stage2": 0.4599296450614929, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 39 }, { "completion_length": 16.109375, "epoch": 0.007008936393902225, "grad_norm": 444.98631596943954, "kl": 1.3125, "learning_rate": 9.931662870159453e-07, "loss": 0.5243, "reward": 1.2739577293395996, "reward_std": 0.1354614943265915, "rewards/accuracy_reward_stage2": 0.5239576697349548, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 40 }, { "completion_length": 15.046875, "epoch": 0.007184159803749781, "grad_norm": 24.708295810309572, "kl": 0.0167236328125, "learning_rate": 9.929910636060978e-07, "loss": 0.0067, "reward": 1.5153954029083252, "reward_std": 0.20246349275112152, "rewards/accuracy_reward_stage2": 0.5153952836990356, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 41 }, { "completion_length": 11.671875, "epoch": 0.007359383213597337, "grad_norm": 19.119381990147836, "kl": 0.007598876953125, "learning_rate": 9.928158401962502e-07, "loss": -0.0066, "reward": 1.4843531847000122, "reward_std": 0.14850273728370667, "rewards/accuracy_reward_stage2": 0.6093531250953674, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 42 }, { "completion_length": 12.296875, "epoch": 0.007534606623444892, "grad_norm": 126.73286516600807, "kl": 0.59375, "learning_rate": 9.926406167864027e-07, "loss": 0.2368, "reward": 1.2725048065185547, "reward_std": 0.2956145703792572, "rewards/accuracy_reward_stage2": 0.3975048065185547, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 43 }, { "completion_length": 7.140625, "epoch": 0.007709830033292448, "grad_norm": 19.271630316643456, "kl": 0.016357421875, "learning_rate": 9.92465393376555e-07, "loss": 0.0066, "reward": 1.5359582901000977, "reward_std": 0.23351669311523438, "rewards/accuracy_reward_stage2": 0.5359582304954529, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 44 }, { "completion_length": 11.546875, "epoch": 0.007885053443140003, "grad_norm": 32.85579727788969, "kl": 0.455078125, "learning_rate": 9.922901699667075e-07, "loss": 0.1818, "reward": 1.4999957084655762, "reward_std": 0.28870946168899536, "rewards/accuracy_reward_stage2": 0.6249956488609314, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 45 }, { "completion_length": 6.046875, "epoch": 0.008060276852987559, "grad_norm": 24.0607272358331, "kl": 0.033203125, "learning_rate": 9.9211494655686e-07, "loss": 0.0132, "reward": 1.4540456533432007, "reward_std": 0.35870805382728577, "rewards/accuracy_reward_stage2": 0.4540456533432007, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 46 }, { "completion_length": 8.8125, "epoch": 0.008235500262835115, "grad_norm": 20.58473435321408, "kl": 0.0022125244140625, "learning_rate": 9.919397231470123e-07, "loss": 0.0009, "reward": 1.4864552021026611, "reward_std": 0.22649237513542175, "rewards/accuracy_reward_stage2": 0.4864552319049835, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 47 }, { "completion_length": 7.9375, "epoch": 0.00841072367268267, "grad_norm": 23.996696185252787, "kl": 0.01416015625, "learning_rate": 9.917644997371648e-07, "loss": 0.0057, "reward": 1.5774058103561401, "reward_std": 0.2729129493236542, "rewards/accuracy_reward_stage2": 0.5774057507514954, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 48 }, { "completion_length": 13.75, "epoch": 0.008585947082530226, "grad_norm": 9198.783068890769, "kl": 12.625, "learning_rate": 9.915892763273173e-07, "loss": 5.0487, "reward": 1.5072916746139526, "reward_std": 0.10488568246364594, "rewards/accuracy_reward_stage2": 0.6322916746139526, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 49 }, { "completion_length": 7.578125, "epoch": 0.008761170492377781, "grad_norm": 21.17089484476723, "kl": 0.028076171875, "learning_rate": 9.914140529174698e-07, "loss": -0.033, "reward": 1.561553716659546, "reward_std": 0.20458553731441498, "rewards/accuracy_reward_stage2": 0.5771787762641907, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 50 }, { "completion_length": 14.546875, "epoch": 0.008936393902225337, "grad_norm": 22.676324556567717, "kl": 0.0269775390625, "learning_rate": 9.912388295076223e-07, "loss": 0.0108, "reward": 1.5551083087921143, "reward_std": 0.24086907505989075, "rewards/accuracy_reward_stage2": 0.5551083087921143, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 51 }, { "completion_length": 19.734375, "epoch": 0.009111617312072893, "grad_norm": 24.090157582750408, "kl": 0.0301513671875, "learning_rate": 9.910636060977745e-07, "loss": 0.012, "reward": 1.5436656475067139, "reward_std": 0.1551145315170288, "rewards/accuracy_reward_stage2": 0.5436656475067139, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 52 }, { "completion_length": 18.359375, "epoch": 0.009286840721920448, "grad_norm": 117.2815317858313, "kl": 0.1787109375, "learning_rate": 9.90888382687927e-07, "loss": 0.0715, "reward": 1.26711106300354, "reward_std": 0.17756909132003784, "rewards/accuracy_reward_stage2": 0.39211103320121765, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 53 }, { "completion_length": 19.640625, "epoch": 0.009462064131768004, "grad_norm": 2670.243259955133, "kl": 4.25, "learning_rate": 9.907131592780795e-07, "loss": 1.6969, "reward": 1.3709712028503418, "reward_std": 0.26806020736694336, "rewards/accuracy_reward_stage2": 0.4959712028503418, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 54 }, { "completion_length": 10.78125, "epoch": 0.00963728754161556, "grad_norm": 24.422978097429848, "kl": 0.031494140625, "learning_rate": 9.90537935868232e-07, "loss": 0.0127, "reward": 1.635439157485962, "reward_std": 0.15717683732509613, "rewards/accuracy_reward_stage2": 0.6354391574859619, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 55 }, { "completion_length": 21.9375, "epoch": 0.009812510951463115, "grad_norm": 58.54729288417099, "kl": 0.94140625, "learning_rate": 9.903627124583845e-07, "loss": 0.3767, "reward": 1.2334133386611938, "reward_std": 0.3513503968715668, "rewards/accuracy_reward_stage2": 0.4834132790565491, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 56 }, { "completion_length": 10.890625, "epoch": 0.00998773436131067, "grad_norm": 24.764202425478974, "kl": 0.01953125, "learning_rate": 9.901874890485368e-07, "loss": 0.0078, "reward": 1.542431116104126, "reward_std": 0.18826258182525635, "rewards/accuracy_reward_stage2": 0.5424311757087708, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 57 }, { "completion_length": 11.078125, "epoch": 0.010162957771158226, "grad_norm": 23.433657092831172, "kl": 0.06005859375, "learning_rate": 9.900122656386893e-07, "loss": 0.024, "reward": 1.4939064979553223, "reward_std": 0.20271292328834534, "rewards/accuracy_reward_stage2": 0.4939064383506775, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 58 }, { "completion_length": 9.875, "epoch": 0.010338181181005782, "grad_norm": 37.05527638038634, "kl": 0.271484375, "learning_rate": 9.898370422288418e-07, "loss": 0.0885, "reward": 1.395346999168396, "reward_std": 0.2753984332084656, "rewards/accuracy_reward_stage2": 0.520346999168396, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 59 }, { "completion_length": 13.921875, "epoch": 0.010513404590853337, "grad_norm": 22.533338631681016, "kl": 0.039794921875, "learning_rate": 9.89661818818994e-07, "loss": 0.0159, "reward": 1.3350149393081665, "reward_std": 0.18372395634651184, "rewards/accuracy_reward_stage2": 0.3350149095058441, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 60 }, { "completion_length": 7.359375, "epoch": 0.010688628000700893, "grad_norm": 21.392543766016747, "kl": 0.0162353515625, "learning_rate": 9.894865954091465e-07, "loss": 0.0065, "reward": 1.627392292022705, "reward_std": 0.27495235204696655, "rewards/accuracy_reward_stage2": 0.6273922324180603, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 61 }, { "completion_length": 16.3125, "epoch": 0.010863851410548448, "grad_norm": 20.580479234767235, "kl": 0.02294921875, "learning_rate": 9.89311371999299e-07, "loss": 0.0092, "reward": 1.7086659669876099, "reward_std": 0.26947683095932007, "rewards/accuracy_reward_stage2": 0.7086660265922546, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 62 }, { "completion_length": 23.265625, "epoch": 0.011039074820396006, "grad_norm": 21260.430375982574, "kl": 25.625, "learning_rate": 9.891361485894515e-07, "loss": 10.2778, "reward": 1.194105863571167, "reward_std": 0.2070472538471222, "rewards/accuracy_reward_stage2": 0.31910592317581177, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 63 }, { "completion_length": 17.859375, "epoch": 0.011214298230243561, "grad_norm": 16.365434222292564, "kl": 0.021728515625, "learning_rate": 9.88960925179604e-07, "loss": 0.0087, "reward": 1.5770893096923828, "reward_std": 0.06625860929489136, "rewards/accuracy_reward_stage2": 0.5770893096923828, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 64 }, { "completion_length": 12.21875, "epoch": 0.011389521640091117, "grad_norm": 21.895183855546207, "kl": 0.1611328125, "learning_rate": 9.887857017697563e-07, "loss": 0.0645, "reward": 1.4588305950164795, "reward_std": 0.199259951710701, "rewards/accuracy_reward_stage2": 0.5838305950164795, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 65 }, { "completion_length": 8.359375, "epoch": 0.011564745049938673, "grad_norm": 914.427024535711, "kl": 1.8125, "learning_rate": 9.886104783599088e-07, "loss": 0.7264, "reward": 1.6872773170471191, "reward_std": 0.19540375471115112, "rewards/accuracy_reward_stage2": 0.8122772574424744, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 66 }, { "completion_length": 11.1875, "epoch": 0.011739968459786228, "grad_norm": 57.98909231313472, "kl": 0.1767578125, "learning_rate": 9.884352549500613e-07, "loss": 0.0707, "reward": 0.883919894695282, "reward_std": 0.12535862624645233, "rewards/accuracy_reward_stage2": 0.1339198797941208, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 67 }, { "completion_length": 14.5, "epoch": 0.011915191869633784, "grad_norm": 21.20461032961743, "kl": 0.0277099609375, "learning_rate": 9.882600315402138e-07, "loss": 0.0111, "reward": 1.3983817100524902, "reward_std": 0.2320813536643982, "rewards/accuracy_reward_stage2": 0.5233815908432007, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 68 }, { "completion_length": 12.296875, "epoch": 0.01209041527948134, "grad_norm": 16.882678958865416, "kl": 0.029296875, "learning_rate": 9.880848081303663e-07, "loss": 0.0117, "reward": 1.76310396194458, "reward_std": 0.1585851013660431, "rewards/accuracy_reward_stage2": 0.7631039619445801, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 69 }, { "completion_length": 14.6875, "epoch": 0.012265638689328895, "grad_norm": 486.1965247861804, "kl": 1.2109375, "learning_rate": 9.879095847205188e-07, "loss": 0.4841, "reward": 1.665351390838623, "reward_std": 0.18253958225250244, "rewards/accuracy_reward_stage2": 0.790351390838623, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 70 }, { "completion_length": 11.265625, "epoch": 0.01244086209917645, "grad_norm": 21.42386015892597, "kl": 0.050537109375, "learning_rate": 9.87734361310671e-07, "loss": 0.0202, "reward": 1.214674472808838, "reward_std": 0.15539926290512085, "rewards/accuracy_reward_stage2": 0.3396745026111603, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 71 }, { "completion_length": 6.109375, "epoch": 0.012616085509024006, "grad_norm": 23.54396927449601, "kl": 0.026123046875, "learning_rate": 9.875591379008235e-07, "loss": 0.0105, "reward": 1.7581977844238281, "reward_std": 0.2139248102903366, "rewards/accuracy_reward_stage2": 0.7581977248191833, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 72 }, { "completion_length": 14.40625, "epoch": 0.012791308918871562, "grad_norm": 19.476130544371653, "kl": 0.035400390625, "learning_rate": 9.873839144909758e-07, "loss": 0.0141, "reward": 1.8426779508590698, "reward_std": 0.16201923787593842, "rewards/accuracy_reward_stage2": 0.842677891254425, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 73 }, { "completion_length": 10.5625, "epoch": 0.012966532328719117, "grad_norm": 19.29894733530793, "kl": 0.08251953125, "learning_rate": 9.872086910811283e-07, "loss": -0.0113, "reward": 1.628914475440979, "reward_std": 0.17053398489952087, "rewards/accuracy_reward_stage2": 0.6445394158363342, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 74 }, { "completion_length": 9.484375, "epoch": 0.013141755738566673, "grad_norm": 26.149212215656444, "kl": 0.205078125, "learning_rate": 9.870334676712808e-07, "loss": 0.0817, "reward": 1.3059927225112915, "reward_std": 0.22208189964294434, "rewards/accuracy_reward_stage2": 0.4309927225112915, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 75 }, { "completion_length": 10.875, "epoch": 0.013316979148414228, "grad_norm": 26.023761326712144, "kl": 0.10791015625, "learning_rate": 9.868582442614333e-07, "loss": 0.0432, "reward": 1.583097219467163, "reward_std": 0.2495778203010559, "rewards/accuracy_reward_stage2": 0.5830972790718079, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 76 }, { "completion_length": 9.453125, "epoch": 0.013492202558261784, "grad_norm": 18.353935777981253, "kl": 0.01904296875, "learning_rate": 9.866830208515858e-07, "loss": 0.0076, "reward": 1.3065390586853027, "reward_std": 0.12740254402160645, "rewards/accuracy_reward_stage2": 0.3065391182899475, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 77 }, { "completion_length": 19.09375, "epoch": 0.01366742596810934, "grad_norm": 43.89053956356925, "kl": 0.65625, "learning_rate": 9.86507797441738e-07, "loss": 0.22, "reward": 1.2945109605789185, "reward_std": 0.13152040541172028, "rewards/accuracy_reward_stage2": 0.45076102018356323, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 78 }, { "completion_length": 14.765625, "epoch": 0.013842649377956895, "grad_norm": 22.686536085830813, "kl": 0.050537109375, "learning_rate": 9.863325740318906e-07, "loss": 0.0202, "reward": 1.2739322185516357, "reward_std": 0.22220373153686523, "rewards/accuracy_reward_stage2": 0.27393215894699097, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 79 }, { "completion_length": 6.734375, "epoch": 0.01401787278780445, "grad_norm": 18.610904618490743, "kl": 0.028076171875, "learning_rate": 9.86157350622043e-07, "loss": 0.0113, "reward": 1.7154624462127686, "reward_std": 0.12328290939331055, "rewards/accuracy_reward_stage2": 0.715462327003479, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 80 }, { "completion_length": 13.640625, "epoch": 0.014193096197652006, "grad_norm": 21.58844190934058, "kl": 0.0125732421875, "learning_rate": 9.859821272121955e-07, "loss": 0.005, "reward": 1.775895118713379, "reward_std": 0.1399209350347519, "rewards/accuracy_reward_stage2": 0.7758949995040894, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 81 }, { "completion_length": 19.65625, "epoch": 0.014368319607499562, "grad_norm": 24.391365722105085, "kl": 0.013671875, "learning_rate": 9.85806903802348e-07, "loss": 0.0055, "reward": 1.445000171661377, "reward_std": 0.2040981948375702, "rewards/accuracy_reward_stage2": 0.44500014185905457, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 82 }, { "completion_length": 8.109375, "epoch": 0.014543543017347118, "grad_norm": 14.35169151732421, "kl": 0.01123046875, "learning_rate": 9.856316803925005e-07, "loss": 0.0045, "reward": 1.4154889583587646, "reward_std": 0.17590636014938354, "rewards/accuracy_reward_stage2": 0.41548892855644226, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 83 }, { "completion_length": 8.9375, "epoch": 0.014718766427194673, "grad_norm": 23.391660041096344, "kl": 0.10693359375, "learning_rate": 9.854564569826528e-07, "loss": 0.0428, "reward": 1.4479758739471436, "reward_std": 0.26807376742362976, "rewards/accuracy_reward_stage2": 0.4479758143424988, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 84 }, { "completion_length": 9.890625, "epoch": 0.014893989837042229, "grad_norm": 25.9165649133999, "kl": 0.2138671875, "learning_rate": 9.852812335728053e-07, "loss": 0.0855, "reward": 1.46875, "reward_std": 0.2619796395301819, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 85 }, { "completion_length": 16.375, "epoch": 0.015069213246889784, "grad_norm": 32.11956895366405, "kl": 0.478515625, "learning_rate": 9.851060101629576e-07, "loss": 0.1917, "reward": 1.3244003057479858, "reward_std": 0.2183195948600769, "rewards/accuracy_reward_stage2": 0.44940024614334106, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 86 }, { "completion_length": 17.109375, "epoch": 0.01524443665673734, "grad_norm": 16.465646866338727, "kl": 0.029052734375, "learning_rate": 9.8493078675311e-07, "loss": 0.0116, "reward": 1.5186080932617188, "reward_std": 0.11561406403779984, "rewards/accuracy_reward_stage2": 0.5186082124710083, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 87 }, { "completion_length": 16.484375, "epoch": 0.015419660066584896, "grad_norm": 26.79288856366434, "kl": 0.0177001953125, "learning_rate": 9.847555633432626e-07, "loss": 0.0232, "reward": 1.3938446044921875, "reward_std": 0.3033265471458435, "rewards/accuracy_reward_stage2": 0.5188446640968323, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 88 }, { "completion_length": 9.96875, "epoch": 0.015594883476432451, "grad_norm": 23.16098970301936, "kl": 0.18359375, "learning_rate": 9.84580339933415e-07, "loss": 0.0734, "reward": 1.452669620513916, "reward_std": 0.1748245358467102, "rewards/accuracy_reward_stage2": 0.5776697397232056, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 89 }, { "completion_length": 6.71875, "epoch": 0.015770106886280007, "grad_norm": 14.171409825663957, "kl": 0.0218505859375, "learning_rate": 9.844051165235676e-07, "loss": 0.0088, "reward": 1.5625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward_stage2": 0.5625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 90 }, { "completion_length": 13.765625, "epoch": 0.015945330296127564, "grad_norm": 14.704475254365626, "kl": 0.062255859375, "learning_rate": 9.8422989311372e-07, "loss": 0.0249, "reward": 1.3633265495300293, "reward_std": 0.11541568487882614, "rewards/accuracy_reward_stage2": 0.4883265197277069, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 91 }, { "completion_length": 8.125, "epoch": 0.016120553705975118, "grad_norm": 20.49408337271478, "kl": 0.05029296875, "learning_rate": 9.840546697038723e-07, "loss": 0.0202, "reward": 1.5179017782211304, "reward_std": 0.07997994124889374, "rewards/accuracy_reward_stage2": 0.5179017782211304, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 92 }, { "completion_length": 24.875, "epoch": 0.016295777115822675, "grad_norm": 17.20056666951234, "kl": 0.0191650390625, "learning_rate": 9.838794462940248e-07, "loss": 0.0076, "reward": 1.5605134963989258, "reward_std": 0.15738126635551453, "rewards/accuracy_reward_stage2": 0.5605135560035706, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 93 }, { "completion_length": 8.984375, "epoch": 0.01647100052567023, "grad_norm": 14.05047417845059, "kl": 0.0242919921875, "learning_rate": 9.837042228841773e-07, "loss": 0.0098, "reward": 1.765625, "reward_std": 0.15992169082164764, "rewards/accuracy_reward_stage2": 0.765625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 94 }, { "completion_length": 13.453125, "epoch": 0.016646223935517786, "grad_norm": 16.066438711048843, "kl": 0.00848388671875, "learning_rate": 9.835289994743298e-07, "loss": 0.0034, "reward": 1.3854167461395264, "reward_std": 0.2431686818599701, "rewards/accuracy_reward_stage2": 0.5104166269302368, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 95 }, { "completion_length": 13.9375, "epoch": 0.01682144734536534, "grad_norm": 20.498967796024022, "kl": 0.0223388671875, "learning_rate": 9.833537760644823e-07, "loss": 0.009, "reward": 1.32749605178833, "reward_std": 0.24064147472381592, "rewards/accuracy_reward_stage2": 0.3274959325790405, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 96 }, { "completion_length": 19.640625, "epoch": 0.016996670755212898, "grad_norm": 23.351279022856694, "kl": 0.048583984375, "learning_rate": 9.831785526546346e-07, "loss": 0.0194, "reward": 1.341355562210083, "reward_std": 0.17758896946907043, "rewards/accuracy_reward_stage2": 0.3413556218147278, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 97 }, { "completion_length": 11.0, "epoch": 0.01717189416506045, "grad_norm": 2424.142283949507, "kl": 5.28125, "learning_rate": 9.83003329244787e-07, "loss": 2.1083, "reward": 1.6076582670211792, "reward_std": 0.2769812345504761, "rewards/accuracy_reward_stage2": 0.732658326625824, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 98 }, { "completion_length": 15.5, "epoch": 0.01734711757490801, "grad_norm": 18.888112369485217, "kl": 0.0106201171875, "learning_rate": 9.828281058349396e-07, "loss": 0.0043, "reward": 1.5364583730697632, "reward_std": 0.13152071833610535, "rewards/accuracy_reward_stage2": 0.6614583730697632, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 99 }, { "completion_length": 9.484375, "epoch": 0.017522340984755563, "grad_norm": 18.63597928328685, "kl": 0.0245361328125, "learning_rate": 9.826528824250918e-07, "loss": 0.0098, "reward": 1.5691524744033813, "reward_std": 0.17309194803237915, "rewards/accuracy_reward_stage2": 0.5691524744033813, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 100 }, { "completion_length": 8.25, "epoch": 0.01769756439460312, "grad_norm": 32.53836968752601, "kl": 0.0244140625, "learning_rate": 9.824776590152443e-07, "loss": 0.0098, "reward": 1.399068832397461, "reward_std": 0.3313966393470764, "rewards/accuracy_reward_stage2": 0.39906883239746094, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 101 }, { "completion_length": 9.90625, "epoch": 0.017872787804450674, "grad_norm": 32.8854408605841, "kl": 0.0155029296875, "learning_rate": 9.823024356053968e-07, "loss": 0.0062, "reward": 1.3821427822113037, "reward_std": 0.19832119345664978, "rewards/accuracy_reward_stage2": 0.5071427226066589, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 102 }, { "completion_length": 10.671875, "epoch": 0.01804801121429823, "grad_norm": 21.58148592368162, "kl": 0.0218505859375, "learning_rate": 9.821272121955493e-07, "loss": 0.0087, "reward": 1.7327183485031128, "reward_std": 0.12665671110153198, "rewards/accuracy_reward_stage2": 0.7327184081077576, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 103 }, { "completion_length": 11.53125, "epoch": 0.018223234624145785, "grad_norm": 18.949315673363365, "kl": 0.03955078125, "learning_rate": 9.819519887857018e-07, "loss": -0.0283, "reward": 1.6874890327453613, "reward_std": 0.1615470051765442, "rewards/accuracy_reward_stage2": 0.7031140923500061, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 104 }, { "completion_length": 11.828125, "epoch": 0.018398458033993342, "grad_norm": 26.76789168361859, "kl": 0.1787109375, "learning_rate": 9.81776765375854e-07, "loss": 0.0716, "reward": 1.625, "reward_std": 0.12910360097885132, "rewards/accuracy_reward_stage2": 0.75, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 105 }, { "completion_length": 11.875, "epoch": 0.018573681443840896, "grad_norm": 23.148900056019155, "kl": 0.0859375, "learning_rate": 9.816015419660066e-07, "loss": 0.0343, "reward": 1.6884129047393799, "reward_std": 0.2642272114753723, "rewards/accuracy_reward_stage2": 0.6884129047393799, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 106 }, { "completion_length": 7.0625, "epoch": 0.018748904853688454, "grad_norm": 26.976366492756323, "kl": 0.056884765625, "learning_rate": 9.81426318556159e-07, "loss": 0.0228, "reward": 1.615952968597412, "reward_std": 0.3104846775531769, "rewards/accuracy_reward_stage2": 0.6159528493881226, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 107 }, { "completion_length": 8.953125, "epoch": 0.018924128263536007, "grad_norm": 16.921272039404386, "kl": 0.0419921875, "learning_rate": 9.812510951463116e-07, "loss": 0.0168, "reward": 1.621179461479187, "reward_std": 0.14682598412036896, "rewards/accuracy_reward_stage2": 0.621179461479187, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 108 }, { "completion_length": 13.15625, "epoch": 0.019099351673383565, "grad_norm": 22.94264744441606, "kl": 0.031494140625, "learning_rate": 9.81075871736464e-07, "loss": 0.0126, "reward": 1.3379876613616943, "reward_std": 0.23553958535194397, "rewards/accuracy_reward_stage2": 0.46298760175704956, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 109 }, { "completion_length": 7.78125, "epoch": 0.01927457508323112, "grad_norm": 18.640754349362595, "kl": 0.03125, "learning_rate": 9.809006483266164e-07, "loss": 0.0126, "reward": 1.6614583730697632, "reward_std": 0.18261326849460602, "rewards/accuracy_reward_stage2": 0.7864583730697632, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 110 }, { "completion_length": 9.515625, "epoch": 0.019449798493078676, "grad_norm": 25.22440686754448, "kl": 0.1572265625, "learning_rate": 9.807254249167688e-07, "loss": 0.0629, "reward": 1.5890928506851196, "reward_std": 0.3712921142578125, "rewards/accuracy_reward_stage2": 0.7140928506851196, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 111 }, { "completion_length": 14.3125, "epoch": 0.01962502190292623, "grad_norm": 18.83358143724199, "kl": 0.038818359375, "learning_rate": 9.805502015069213e-07, "loss": 0.0156, "reward": 1.8106896877288818, "reward_std": 0.10225945711135864, "rewards/accuracy_reward_stage2": 0.8106895685195923, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 112 }, { "completion_length": 7.40625, "epoch": 0.019800245312773787, "grad_norm": 22.13546775107038, "kl": 0.0272216796875, "learning_rate": 9.803749780970736e-07, "loss": 0.0109, "reward": 1.6545759439468384, "reward_std": 0.30723053216934204, "rewards/accuracy_reward_stage2": 0.6545758843421936, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 113 }, { "completion_length": 10.109375, "epoch": 0.01997546872262134, "grad_norm": 23.29532824323836, "kl": 0.039306640625, "learning_rate": 9.801997546872261e-07, "loss": 0.0157, "reward": 1.511056661605835, "reward_std": 0.24191156029701233, "rewards/accuracy_reward_stage2": 0.636056661605835, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 114 }, { "completion_length": 14.5, "epoch": 0.020150692132468898, "grad_norm": 20.883116213967234, "kl": 0.06396484375, "learning_rate": 9.800245312773786e-07, "loss": 0.0256, "reward": 1.700423002243042, "reward_std": 0.11160765588283539, "rewards/accuracy_reward_stage2": 0.7004230618476868, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 115 }, { "completion_length": 13.65625, "epoch": 0.020325915542316452, "grad_norm": 38.41318094469385, "kl": 0.58984375, "learning_rate": 9.79849307867531e-07, "loss": 0.2199, "reward": 1.4143095016479492, "reward_std": 0.21462732553482056, "rewards/accuracy_reward_stage2": 0.5549345016479492, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 116 }, { "completion_length": 7.578125, "epoch": 0.02050113895216401, "grad_norm": 19.121183391928234, "kl": 0.0118408203125, "learning_rate": 9.796740844576836e-07, "loss": 0.0047, "reward": 1.3231756687164307, "reward_std": 0.14878198504447937, "rewards/accuracy_reward_stage2": 0.44817566871643066, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 117 }, { "completion_length": 15.90625, "epoch": 0.020676362362011563, "grad_norm": 21.061263351136542, "kl": 0.038818359375, "learning_rate": 9.794988610478359e-07, "loss": 0.0155, "reward": 1.3059378862380981, "reward_std": 0.14674827456474304, "rewards/accuracy_reward_stage2": 0.4309379458427429, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 118 }, { "completion_length": 9.71875, "epoch": 0.02085158577185912, "grad_norm": 25.236293976052632, "kl": 0.055908203125, "learning_rate": 9.793236376379884e-07, "loss": 0.0223, "reward": 1.4603816270828247, "reward_std": 0.23183618485927582, "rewards/accuracy_reward_stage2": 0.4603816270828247, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 119 }, { "completion_length": 9.265625, "epoch": 0.021026809181706674, "grad_norm": 17.647031631462447, "kl": 0.029541015625, "learning_rate": 9.791484142281409e-07, "loss": 0.0118, "reward": 1.4820407629013062, "reward_std": 0.18687711656093597, "rewards/accuracy_reward_stage2": 0.48204079270362854, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 120 }, { "completion_length": 12.640625, "epoch": 0.021202032591554232, "grad_norm": 17.193468877661324, "kl": 0.08935546875, "learning_rate": 9.789731908182933e-07, "loss": 0.0357, "reward": 1.6092438697814941, "reward_std": 0.1579185426235199, "rewards/accuracy_reward_stage2": 0.7342438697814941, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 121 }, { "completion_length": 11.171875, "epoch": 0.021377256001401786, "grad_norm": 26.77965880367834, "kl": 0.19140625, "learning_rate": 9.787979674084458e-07, "loss": 0.0222, "reward": 1.4083753824234009, "reward_std": 0.4149293303489685, "rewards/accuracy_reward_stage2": 0.5646253824234009, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 122 }, { "completion_length": 8.46875, "epoch": 0.021552479411249343, "grad_norm": 16.66820112097354, "kl": 0.034912109375, "learning_rate": 9.786227439985981e-07, "loss": 0.014, "reward": 1.321736216545105, "reward_std": 0.18148699402809143, "rewards/accuracy_reward_stage2": 0.321736216545105, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 123 }, { "completion_length": 10.453125, "epoch": 0.021727702821096897, "grad_norm": 13.039644804969383, "kl": 0.0274658203125, "learning_rate": 9.784475205887506e-07, "loss": 0.011, "reward": 1.6614583730697632, "reward_std": 0.1530819833278656, "rewards/accuracy_reward_stage2": 0.6614583134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 124 }, { "completion_length": 9.234375, "epoch": 0.021902926230944454, "grad_norm": 11.330738899206423, "kl": 0.01251220703125, "learning_rate": 9.78272297178903e-07, "loss": -0.0239, "reward": 1.53125, "reward_std": 0.1246790662407875, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 125 }, { "completion_length": 13.828125, "epoch": 0.02207814964079201, "grad_norm": 22.438552672241048, "kl": 0.038818359375, "learning_rate": 9.780970737690554e-07, "loss": 0.0155, "reward": 1.3687918186187744, "reward_std": 0.25998741388320923, "rewards/accuracy_reward_stage2": 0.368791788816452, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 126 }, { "completion_length": 7.390625, "epoch": 0.022253373050639565, "grad_norm": 14.108403183062768, "kl": 0.02294921875, "learning_rate": 9.779218503592079e-07, "loss": 0.0092, "reward": 1.7554993629455566, "reward_std": 0.12310698628425598, "rewards/accuracy_reward_stage2": 0.7554993629455566, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 127 }, { "completion_length": 11.9375, "epoch": 0.022428596460487123, "grad_norm": 32.650359349639814, "kl": 0.03857421875, "learning_rate": 9.777466269493604e-07, "loss": 0.0154, "reward": 1.693576455116272, "reward_std": 0.2803168296813965, "rewards/accuracy_reward_stage2": 0.6935763955116272, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 128 }, { "completion_length": 11.3125, "epoch": 0.022603819870334677, "grad_norm": 21.3970061518697, "kl": 0.12109375, "learning_rate": 9.775714035395129e-07, "loss": 0.0484, "reward": 1.5294928550720215, "reward_std": 0.22037862241268158, "rewards/accuracy_reward_stage2": 0.5294929146766663, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 129 }, { "completion_length": 9.1875, "epoch": 0.022779043280182234, "grad_norm": 17.20296674101713, "kl": 0.0439453125, "learning_rate": 9.773961801296654e-07, "loss": 0.0176, "reward": 1.671449065208435, "reward_std": 0.15607139468193054, "rewards/accuracy_reward_stage2": 0.6714490056037903, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 130 }, { "completion_length": 11.109375, "epoch": 0.022954266690029788, "grad_norm": 23.924016993147553, "kl": 0.16796875, "learning_rate": 9.772209567198178e-07, "loss": 0.067, "reward": 1.515625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 131 }, { "completion_length": 8.109375, "epoch": 0.023129490099877345, "grad_norm": 17.871281939724312, "kl": 0.060546875, "learning_rate": 9.770457333099701e-07, "loss": 0.0242, "reward": 1.5422821044921875, "reward_std": 0.2110249251127243, "rewards/accuracy_reward_stage2": 0.5422821640968323, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 132 }, { "completion_length": 8.984375, "epoch": 0.0233047135097249, "grad_norm": 16.237834293008472, "kl": 0.05126953125, "learning_rate": 9.768705099001226e-07, "loss": 0.0205, "reward": 1.4114583730697632, "reward_std": 0.22298547625541687, "rewards/accuracy_reward_stage2": 0.5364583134651184, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 133 }, { "completion_length": 8.234375, "epoch": 0.023479936919572456, "grad_norm": 22.095308233281447, "kl": 0.02685546875, "learning_rate": 9.766952864902751e-07, "loss": -0.031, "reward": 1.8829572200775146, "reward_std": 0.22935430705547333, "rewards/accuracy_reward_stage2": 0.8985822200775146, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 134 }, { "completion_length": 11.203125, "epoch": 0.02365516032942001, "grad_norm": 19.883543612210254, "kl": 0.0498046875, "learning_rate": 9.765200630804274e-07, "loss": 0.0199, "reward": 1.4533112049102783, "reward_std": 0.24766197800636292, "rewards/accuracy_reward_stage2": 0.45331108570098877, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 135 }, { "completion_length": 12.734375, "epoch": 0.023830383739267567, "grad_norm": 22.33848788451206, "kl": 0.26953125, "learning_rate": 9.763448396705799e-07, "loss": 0.0983, "reward": 1.0532407760620117, "reward_std": 0.28661733865737915, "rewards/accuracy_reward_stage2": 0.31886574625968933, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 136 }, { "completion_length": 10.921875, "epoch": 0.02400560714911512, "grad_norm": 22.501605896319216, "kl": 0.1728515625, "learning_rate": 9.761696162607324e-07, "loss": 0.0402, "reward": 1.0, "reward_std": 0.22461533546447754, "rewards/accuracy_reward_stage2": 0.265625, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 137 }, { "completion_length": 11.03125, "epoch": 0.02418083055896268, "grad_norm": 21.43356694594981, "kl": 0.042724609375, "learning_rate": 9.759943928508849e-07, "loss": 0.0171, "reward": 1.4605212211608887, "reward_std": 0.3050932288169861, "rewards/accuracy_reward_stage2": 0.4605211615562439, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 138 }, { "completion_length": 10.484375, "epoch": 0.024356053968810232, "grad_norm": 4150.486203164245, "kl": 9.3125, "learning_rate": 9.758191694410374e-07, "loss": 3.6919, "reward": 1.3730590343475342, "reward_std": 0.19215653836727142, "rewards/accuracy_reward_stage2": 0.5136840343475342, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 139 }, { "completion_length": 10.25, "epoch": 0.02453127737865779, "grad_norm": 19.10514066890567, "kl": 0.07421875, "learning_rate": 9.756439460311896e-07, "loss": 0.0297, "reward": 1.4467504024505615, "reward_std": 0.213166743516922, "rewards/accuracy_reward_stage2": 0.4467504024505615, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 140 }, { "completion_length": 12.265625, "epoch": 0.024706500788505344, "grad_norm": 27.796189272405012, "kl": 0.06591796875, "learning_rate": 9.754687226213421e-07, "loss": 0.0264, "reward": 1.2996835708618164, "reward_std": 0.2103997766971588, "rewards/accuracy_reward_stage2": 0.2996836304664612, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 141 }, { "completion_length": 11.65625, "epoch": 0.0248817241983529, "grad_norm": 15.512147123884196, "kl": 0.0206298828125, "learning_rate": 9.752934992114946e-07, "loss": -0.0359, "reward": 1.52015221118927, "reward_std": 0.09403587877750397, "rewards/accuracy_reward_stage2": 0.5357772707939148, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 142 }, { "completion_length": 10.8125, "epoch": 0.025056947608200455, "grad_norm": 14.26488216251468, "kl": 0.050537109375, "learning_rate": 9.751182758016471e-07, "loss": 0.0202, "reward": 1.4782986640930176, "reward_std": 0.15773266553878784, "rewards/accuracy_reward_stage2": 0.4782986044883728, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 143 }, { "completion_length": 10.3125, "epoch": 0.025232171018048012, "grad_norm": 581.3548172640157, "kl": 2.09375, "learning_rate": 9.749430523917996e-07, "loss": 0.833, "reward": 1.47330904006958, "reward_std": 0.2243586778640747, "rewards/accuracy_reward_stage2": 0.5983090996742249, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 144 }, { "completion_length": 6.640625, "epoch": 0.025407394427895566, "grad_norm": 16.632026818964334, "kl": 0.013427734375, "learning_rate": 9.74767828981952e-07, "loss": 0.0054, "reward": 1.6461806297302246, "reward_std": 0.13736851513385773, "rewards/accuracy_reward_stage2": 0.6461805701255798, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 145 }, { "completion_length": 8.125, "epoch": 0.025582617837743123, "grad_norm": 20.712311357432764, "kl": 0.051025390625, "learning_rate": 9.745926055721044e-07, "loss": 0.0204, "reward": 1.4257996082305908, "reward_std": 0.12755097448825836, "rewards/accuracy_reward_stage2": 0.4257996082305908, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 146 }, { "completion_length": 8.203125, "epoch": 0.025757841247590677, "grad_norm": 20.951278916894317, "kl": 0.03125, "learning_rate": 9.744173821622569e-07, "loss": -0.0316, "reward": 1.5287258625030518, "reward_std": 0.2641984224319458, "rewards/accuracy_reward_stage2": 0.6693509817123413, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 147 }, { "completion_length": 11.953125, "epoch": 0.025933064657438235, "grad_norm": 8.761776615452003, "kl": 0.0230712890625, "learning_rate": 9.742421587524092e-07, "loss": 0.0093, "reward": 1.399897813796997, "reward_std": 0.04742930084466934, "rewards/accuracy_reward_stage2": 0.39989787340164185, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 148 }, { "completion_length": 7.359375, "epoch": 0.02610828806728579, "grad_norm": 15.121517342131725, "kl": 0.027099609375, "learning_rate": 9.740669353425617e-07, "loss": 0.0108, "reward": 1.4454432725906372, "reward_std": 0.04337773099541664, "rewards/accuracy_reward_stage2": 0.4454432725906372, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 149 }, { "completion_length": 15.90625, "epoch": 0.026283511477133346, "grad_norm": 24.934994595131617, "kl": 0.3203125, "learning_rate": 9.738917119327141e-07, "loss": 0.0873, "reward": 1.3222427368164062, "reward_std": 0.29903650283813477, "rewards/accuracy_reward_stage2": 0.4628676772117615, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 150 }, { "completion_length": 13.6875, "epoch": 0.0264587348869809, "grad_norm": 20.45511798008456, "kl": 0.1708984375, "learning_rate": 9.737164885228666e-07, "loss": 0.0684, "reward": 1.582951545715332, "reward_std": 0.07972659170627594, "rewards/accuracy_reward_stage2": 0.7079516053199768, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 151 }, { "completion_length": 17.828125, "epoch": 0.026633958296828457, "grad_norm": 22.159400129993305, "kl": 0.046875, "learning_rate": 9.735412651130191e-07, "loss": 0.0188, "reward": 1.5045561790466309, "reward_std": 0.15542970597743988, "rewards/accuracy_reward_stage2": 0.5045561790466309, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 152 }, { "completion_length": 8.171875, "epoch": 0.02680918170667601, "grad_norm": 16.54412742369763, "kl": 0.1396484375, "learning_rate": 9.733660417031714e-07, "loss": 0.0464, "reward": 1.2883906364440918, "reward_std": 0.06080695986747742, "rewards/accuracy_reward_stage2": 0.5383907556533813, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 153 }, { "completion_length": 10.625, "epoch": 0.026984405116523568, "grad_norm": 21.8872943160035, "kl": 0.054931640625, "learning_rate": 9.73190818293324e-07, "loss": 0.0219, "reward": 1.4988808631896973, "reward_std": 0.23085248470306396, "rewards/accuracy_reward_stage2": 0.4988808333873749, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 154 }, { "completion_length": 17.765625, "epoch": 0.027159628526371122, "grad_norm": 20.512091387289935, "kl": 0.03466796875, "learning_rate": 9.730155948834764e-07, "loss": 0.0139, "reward": 1.479767084121704, "reward_std": 0.23238566517829895, "rewards/accuracy_reward_stage2": 0.6047670841217041, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 155 }, { "completion_length": 8.515625, "epoch": 0.02733485193621868, "grad_norm": 16.26675377437708, "kl": 0.01153564453125, "learning_rate": 9.728403714736289e-07, "loss": 0.0046, "reward": 1.7636384963989258, "reward_std": 0.174021378159523, "rewards/accuracy_reward_stage2": 0.7636384963989258, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 156 }, { "completion_length": 8.71875, "epoch": 0.027510075346066233, "grad_norm": 13.252046634278365, "kl": 0.01556396484375, "learning_rate": 9.726651480637814e-07, "loss": 0.0062, "reward": 1.5384259223937988, "reward_std": 0.13512171804904938, "rewards/accuracy_reward_stage2": 0.5384259223937988, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 157 }, { "completion_length": 9.4375, "epoch": 0.02768529875591379, "grad_norm": 17.87652537935883, "kl": 0.0284423828125, "learning_rate": 9.724899246539337e-07, "loss": 0.0114, "reward": 1.6046476364135742, "reward_std": 0.1472102403640747, "rewards/accuracy_reward_stage2": 0.6046475172042847, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 158 }, { "completion_length": 9.640625, "epoch": 0.027860522165761344, "grad_norm": 255.3729136272485, "kl": 1.1171875, "learning_rate": 9.723147012440862e-07, "loss": 0.4472, "reward": 1.3028572797775269, "reward_std": 0.10992234945297241, "rewards/accuracy_reward_stage2": 0.42785730957984924, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 159 }, { "completion_length": 9.03125, "epoch": 0.0280357455756089, "grad_norm": 24.540647784148998, "kl": 0.07958984375, "learning_rate": 9.721394778342387e-07, "loss": -0.0124, "reward": 1.6068737506866455, "reward_std": 0.2786521315574646, "rewards/accuracy_reward_stage2": 0.6224986910820007, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 160 }, { "completion_length": 9.828125, "epoch": 0.028210968985456455, "grad_norm": 31.622324802367434, "kl": 0.1650390625, "learning_rate": 9.71964254424391e-07, "loss": 0.0659, "reward": 1.5447970628738403, "reward_std": 0.282809317111969, "rewards/accuracy_reward_stage2": 0.5447970628738403, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 161 }, { "completion_length": 5.875, "epoch": 0.028386192395304013, "grad_norm": 18.034072051457613, "kl": 0.037841796875, "learning_rate": 9.717890310145434e-07, "loss": 0.0151, "reward": 1.5959933996200562, "reward_std": 0.17372407019138336, "rewards/accuracy_reward_stage2": 0.7209933400154114, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 162 }, { "completion_length": 14.6875, "epoch": 0.028561415805151567, "grad_norm": 25.02410090499007, "kl": 0.030517578125, "learning_rate": 9.71613807604696e-07, "loss": 0.0122, "reward": 1.4665104150772095, "reward_std": 0.23067545890808105, "rewards/accuracy_reward_stage2": 0.4665104150772095, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 163 }, { "completion_length": 10.578125, "epoch": 0.028736639214999124, "grad_norm": 17.3468175548762, "kl": 0.25, "learning_rate": 9.714385841948484e-07, "loss": 0.0995, "reward": 1.3711848258972168, "reward_std": 0.13964632153511047, "rewards/accuracy_reward_stage2": 0.49618473649024963, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 164 }, { "completion_length": 9.15625, "epoch": 0.028911862624846678, "grad_norm": 23.561247389295385, "kl": 0.08740234375, "learning_rate": 9.71263360785001e-07, "loss": 0.035, "reward": 1.550048589706421, "reward_std": 0.23237371444702148, "rewards/accuracy_reward_stage2": 0.5500486493110657, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 165 }, { "completion_length": 16.734375, "epoch": 0.029087086034694235, "grad_norm": 19.429360315487067, "kl": 0.020263671875, "learning_rate": 9.710881373751532e-07, "loss": 0.0081, "reward": 1.6858422756195068, "reward_std": 0.26720598340034485, "rewards/accuracy_reward_stage2": 0.6858422756195068, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 166 }, { "completion_length": 9.109375, "epoch": 0.029262309444541793, "grad_norm": 20.70039845969837, "kl": 0.051025390625, "learning_rate": 9.709129139653057e-07, "loss": 0.0204, "reward": 1.6000640392303467, "reward_std": 0.34583085775375366, "rewards/accuracy_reward_stage2": 0.6000640392303467, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 167 }, { "completion_length": 12.03125, "epoch": 0.029437532854389346, "grad_norm": 15.176393141742965, "kl": 0.0224609375, "learning_rate": 9.707376905554582e-07, "loss": 0.009, "reward": 1.3509178161621094, "reward_std": 0.0907922238111496, "rewards/accuracy_reward_stage2": 0.4759177565574646, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 168 }, { "completion_length": 7.6875, "epoch": 0.029612756264236904, "grad_norm": 20.90349632158209, "kl": 0.0184326171875, "learning_rate": 9.705624671456107e-07, "loss": 0.0074, "reward": 1.622603416442871, "reward_std": 0.17725098133087158, "rewards/accuracy_reward_stage2": 0.6226034760475159, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 169 }, { "completion_length": 10.609375, "epoch": 0.029787979674084458, "grad_norm": 13.705177811281553, "kl": 0.265625, "learning_rate": 9.703872437357632e-07, "loss": 0.1066, "reward": 1.229966163635254, "reward_std": 0.03953730687499046, "rewards/accuracy_reward_stage2": 0.35496610403060913, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 170 }, { "completion_length": 11.546875, "epoch": 0.029963203083932015, "grad_norm": 19.105963673665826, "kl": 0.04443359375, "learning_rate": 9.702120203259154e-07, "loss": -0.0251, "reward": 1.7006630897521973, "reward_std": 0.10390988737344742, "rewards/accuracy_reward_stage2": 0.7162880301475525, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 171 }, { "completion_length": 7.484375, "epoch": 0.03013842649377957, "grad_norm": 23.78920063928513, "kl": 0.0419921875, "learning_rate": 9.70036796916068e-07, "loss": -0.0274, "reward": 1.514814853668213, "reward_std": 0.35105100274086, "rewards/accuracy_reward_stage2": 0.6554398536682129, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 172 }, { "completion_length": 9.828125, "epoch": 0.030313649903627126, "grad_norm": 17.5747043233872, "kl": 0.1630859375, "learning_rate": 9.698615735062204e-07, "loss": 0.0656, "reward": 1.46875, "reward_std": 0.2619796395301819, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 173 }, { "completion_length": 11.453125, "epoch": 0.03048887331347468, "grad_norm": 18.708571980167633, "kl": 0.033203125, "learning_rate": 9.696863500963727e-07, "loss": 0.0132, "reward": 1.652700424194336, "reward_std": 0.16118405759334564, "rewards/accuracy_reward_stage2": 0.6527003049850464, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 174 }, { "completion_length": 11.0, "epoch": 0.030664096723322237, "grad_norm": 23.207996060963765, "kl": 0.0186767578125, "learning_rate": 9.695111266865252e-07, "loss": 0.0075, "reward": 1.5543544292449951, "reward_std": 0.21840667724609375, "rewards/accuracy_reward_stage2": 0.5543544292449951, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 175 }, { "completion_length": 6.0625, "epoch": 0.03083932013316979, "grad_norm": 17.239276511923176, "kl": 0.05029296875, "learning_rate": 9.693359032766777e-07, "loss": 0.0201, "reward": 1.7345705032348633, "reward_std": 0.13885539770126343, "rewards/accuracy_reward_stage2": 0.7345705032348633, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 176 }, { "completion_length": 14.875, "epoch": 0.03101454354301735, "grad_norm": 17.215969325620332, "kl": 0.064453125, "learning_rate": 9.691606798668302e-07, "loss": 0.0258, "reward": 1.4116889238357544, "reward_std": 0.11823472380638123, "rewards/accuracy_reward_stage2": 0.6616888642311096, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 177 }, { "completion_length": 7.28125, "epoch": 0.031189766952864902, "grad_norm": 12.798355258269762, "kl": 0.02734375, "learning_rate": 9.689854564569827e-07, "loss": 0.0109, "reward": 1.30573308467865, "reward_std": 0.08645682036876678, "rewards/accuracy_reward_stage2": 0.4307331144809723, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 178 }, { "completion_length": 6.765625, "epoch": 0.03136499036271246, "grad_norm": 25.05776951577626, "kl": 0.23828125, "learning_rate": 9.68810233047135e-07, "loss": 0.0953, "reward": 1.3320447206497192, "reward_std": 0.1577865034341812, "rewards/accuracy_reward_stage2": 0.4570447504520416, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 179 }, { "completion_length": 7.96875, "epoch": 0.03154021377256001, "grad_norm": 22.093198419377405, "kl": 0.0634765625, "learning_rate": 9.686350096372874e-07, "loss": 0.0254, "reward": 1.6561169624328613, "reward_std": 0.26356494426727295, "rewards/accuracy_reward_stage2": 0.6561169624328613, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 180 }, { "completion_length": 16.953125, "epoch": 0.03171543718240757, "grad_norm": 30.753363540818007, "kl": 0.26953125, "learning_rate": 9.6845978622744e-07, "loss": 0.1079, "reward": 1.3598719835281372, "reward_std": 0.22879821062088013, "rewards/accuracy_reward_stage2": 0.4848719835281372, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 181 }, { "completion_length": 9.359375, "epoch": 0.03189066059225513, "grad_norm": 21.41239232341035, "kl": 0.0279541015625, "learning_rate": 9.682845628175924e-07, "loss": 0.0112, "reward": 1.6863123178482056, "reward_std": 0.22437290847301483, "rewards/accuracy_reward_stage2": 0.6863122582435608, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 182 }, { "completion_length": 6.484375, "epoch": 0.03206588400210268, "grad_norm": 8.467142001911718, "kl": 0.0150146484375, "learning_rate": 9.68109339407745e-07, "loss": 0.006, "reward": 1.546875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 183 }, { "completion_length": 6.1875, "epoch": 0.032241107411950236, "grad_norm": 18.643982166866053, "kl": 0.0179443359375, "learning_rate": 9.679341159978974e-07, "loss": 0.0072, "reward": 1.4908428192138672, "reward_std": 0.11428863555192947, "rewards/accuracy_reward_stage2": 0.49084287881851196, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 184 }, { "completion_length": 10.3125, "epoch": 0.03241633082179779, "grad_norm": 33.78920896731426, "kl": 0.1728515625, "learning_rate": 9.677588925880497e-07, "loss": 0.0687, "reward": 1.46875, "reward_std": 0.1246790662407875, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 185 }, { "completion_length": 7.203125, "epoch": 0.03259155423164535, "grad_norm": 21.094244218777078, "kl": 0.140625, "learning_rate": 9.675836691782022e-07, "loss": 0.056, "reward": 1.171875, "reward_std": 0.2867125868797302, "rewards/accuracy_reward_stage2": 0.421875, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 186 }, { "completion_length": 9.609375, "epoch": 0.032766777641492904, "grad_norm": 20.530642144368848, "kl": 0.080078125, "learning_rate": 9.674084457683545e-07, "loss": 0.0321, "reward": 1.6582694053649902, "reward_std": 0.22948169708251953, "rewards/accuracy_reward_stage2": 0.658269464969635, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 187 }, { "completion_length": 9.46875, "epoch": 0.03294200105134046, "grad_norm": 18.975229254487072, "kl": 0.056396484375, "learning_rate": 9.67233222358507e-07, "loss": 0.0226, "reward": 1.6822609901428223, "reward_std": 0.09512491524219513, "rewards/accuracy_reward_stage2": 0.682261049747467, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 188 }, { "completion_length": 13.8125, "epoch": 0.03311722446118801, "grad_norm": 20.47557098090093, "kl": 0.236328125, "learning_rate": 9.670579989486595e-07, "loss": 0.0948, "reward": 1.3839223384857178, "reward_std": 0.19412410259246826, "rewards/accuracy_reward_stage2": 0.6339223384857178, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 189 }, { "completion_length": 11.515625, "epoch": 0.03329244787103557, "grad_norm": 21.453820677237367, "kl": 0.0294189453125, "learning_rate": 9.66882775538812e-07, "loss": 0.0118, "reward": 1.6715006828308105, "reward_std": 0.1585531234741211, "rewards/accuracy_reward_stage2": 0.6715006828308105, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 190 }, { "completion_length": 14.421875, "epoch": 0.03346767128088313, "grad_norm": 29.194800757782893, "kl": 0.08544921875, "learning_rate": 9.667075521289644e-07, "loss": 0.0341, "reward": 1.3433187007904053, "reward_std": 0.17038963735103607, "rewards/accuracy_reward_stage2": 0.4683186411857605, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 191 }, { "completion_length": 7.296875, "epoch": 0.03364289469073068, "grad_norm": 20.725013074544748, "kl": 0.0289306640625, "learning_rate": 9.66532328719117e-07, "loss": 0.0115, "reward": 1.4519970417022705, "reward_std": 0.17661163210868835, "rewards/accuracy_reward_stage2": 0.4519970118999481, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 192 }, { "completion_length": 7.984375, "epoch": 0.033818118100578234, "grad_norm": 18.52882236150989, "kl": 0.0810546875, "learning_rate": 9.663571053092692e-07, "loss": -0.0118, "reward": 1.6647088527679443, "reward_std": 0.12946046888828278, "rewards/accuracy_reward_stage2": 0.6803338527679443, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 193 }, { "completion_length": 8.8125, "epoch": 0.033993341510425795, "grad_norm": 26.484601737109703, "kl": 0.06201171875, "learning_rate": 9.661818818994217e-07, "loss": -0.004, "reward": 1.4455125331878662, "reward_std": 0.1841823160648346, "rewards/accuracy_reward_stage2": 0.5861374735832214, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 194 }, { "completion_length": 12.96875, "epoch": 0.03416856492027335, "grad_norm": 15.203542718715298, "kl": 0.0791015625, "learning_rate": 9.660066584895742e-07, "loss": 0.0317, "reward": 1.557002305984497, "reward_std": 0.16922709345817566, "rewards/accuracy_reward_stage2": 0.6820023059844971, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 195 }, { "completion_length": 12.9375, "epoch": 0.0343437883301209, "grad_norm": 15.59409127943019, "kl": 0.037353515625, "learning_rate": 9.658314350797267e-07, "loss": 0.0149, "reward": 1.5619020462036133, "reward_std": 0.07156114280223846, "rewards/accuracy_reward_stage2": 0.5619020462036133, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 196 }, { "completion_length": 7.578125, "epoch": 0.03451901173996846, "grad_norm": 10.595565784687796, "kl": 0.021240234375, "learning_rate": 9.656562116698792e-07, "loss": 0.0085, "reward": 1.5104167461395264, "reward_std": 0.0294627882540226, "rewards/accuracy_reward_stage2": 0.5104166269302368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 197 }, { "completion_length": 12.9375, "epoch": 0.03469423514981602, "grad_norm": 13.055675079262832, "kl": 0.007781982421875, "learning_rate": 9.654809882600315e-07, "loss": 0.0031, "reward": 1.84375, "reward_std": 0.1462521106004715, "rewards/accuracy_reward_stage2": 0.84375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 198 }, { "completion_length": 9.65625, "epoch": 0.03486945855966357, "grad_norm": 1.139007136515326, "kl": 0.007476806640625, "learning_rate": 9.65305764850184e-07, "loss": 0.003, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward_stage2": 0.625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 199 }, { "completion_length": 7.328125, "epoch": 0.035044681969511125, "grad_norm": 16.784491224117573, "kl": 0.0074462890625, "learning_rate": 9.651305414403364e-07, "loss": 0.003, "reward": 1.6219052076339722, "reward_std": 0.155005544424057, "rewards/accuracy_reward_stage2": 0.6219052076339722, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 200 }, { "completion_length": 10.34375, "epoch": 0.03521990537935868, "grad_norm": 20.19629650538392, "kl": 0.0277099609375, "learning_rate": 9.649553180304887e-07, "loss": 0.0111, "reward": 1.394465684890747, "reward_std": 0.08756385743618011, "rewards/accuracy_reward_stage2": 0.39446574449539185, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 201 }, { "completion_length": 8.53125, "epoch": 0.03539512878920624, "grad_norm": 23.718846731751, "kl": 0.041015625, "learning_rate": 9.647800946206412e-07, "loss": 0.0164, "reward": 1.6174988746643066, "reward_std": 0.29005441069602966, "rewards/accuracy_reward_stage2": 0.6174987554550171, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 202 }, { "completion_length": 11.515625, "epoch": 0.035570352199053794, "grad_norm": 17.105844468751872, "kl": 0.046142578125, "learning_rate": 9.646048712107937e-07, "loss": -0.0238, "reward": 1.3539774417877197, "reward_std": 0.20320993661880493, "rewards/accuracy_reward_stage2": 0.36960241198539734, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 203 }, { "completion_length": 11.125, "epoch": 0.03574557560890135, "grad_norm": 24.028616996160352, "kl": 0.0771484375, "learning_rate": 9.644296478009462e-07, "loss": 0.0309, "reward": 1.3534865379333496, "reward_std": 0.17606234550476074, "rewards/accuracy_reward_stage2": 0.3534865975379944, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 204 }, { "completion_length": 26.40625, "epoch": 0.0359207990187489, "grad_norm": 18.689605260335664, "kl": 0.01806640625, "learning_rate": 9.642544243910987e-07, "loss": 0.0072, "reward": 1.4637235403060913, "reward_std": 0.17509686946868896, "rewards/accuracy_reward_stage2": 0.4637235105037689, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 205 }, { "completion_length": 8.421875, "epoch": 0.03609602242859646, "grad_norm": 21.290975767768856, "kl": 0.0205078125, "learning_rate": 9.64079200981251e-07, "loss": 0.0082, "reward": 1.7789130210876465, "reward_std": 0.18055710196495056, "rewards/accuracy_reward_stage2": 0.7789130210876465, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 206 }, { "completion_length": 9.71875, "epoch": 0.036271245838444016, "grad_norm": 25.960035529614007, "kl": 0.0537109375, "learning_rate": 9.639039775714035e-07, "loss": 0.0215, "reward": 1.552076816558838, "reward_std": 0.28908300399780273, "rewards/accuracy_reward_stage2": 0.5520768165588379, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 207 }, { "completion_length": 11.21875, "epoch": 0.03644646924829157, "grad_norm": 16.88448721696107, "kl": 0.036376953125, "learning_rate": 9.63728754161556e-07, "loss": 0.0145, "reward": 1.1257497072219849, "reward_std": 0.1288609355688095, "rewards/accuracy_reward_stage2": 0.2507496774196625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 208 }, { "completion_length": 7.609375, "epoch": 0.03662169265813913, "grad_norm": 16.2274690040401, "kl": 0.0272216796875, "learning_rate": 9.635535307517085e-07, "loss": 0.0109, "reward": 1.4792509078979492, "reward_std": 0.19155427813529968, "rewards/accuracy_reward_stage2": 0.479250967502594, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 209 }, { "completion_length": 13.46875, "epoch": 0.036796916067986685, "grad_norm": 33.784177051978844, "kl": 0.5078125, "learning_rate": 9.63378307341861e-07, "loss": 0.2018, "reward": 1.434826135635376, "reward_std": 0.26847773790359497, "rewards/accuracy_reward_stage2": 0.5598262548446655, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 210 }, { "completion_length": 9.09375, "epoch": 0.03697213947783424, "grad_norm": 20.179490922240536, "kl": 0.048828125, "learning_rate": 9.632030839320132e-07, "loss": 0.0196, "reward": 1.5264551639556885, "reward_std": 0.1224151998758316, "rewards/accuracy_reward_stage2": 0.5264551639556885, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 211 }, { "completion_length": 8.390625, "epoch": 0.03714736288768179, "grad_norm": 20.438576677424336, "kl": 0.02783203125, "learning_rate": 9.630278605221657e-07, "loss": 0.0111, "reward": 1.697406530380249, "reward_std": 0.18822979927062988, "rewards/accuracy_reward_stage2": 0.6974066495895386, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 212 }, { "completion_length": 16.0, "epoch": 0.03732258629752935, "grad_norm": 24.199250122428335, "kl": 0.1708984375, "learning_rate": 9.628526371123182e-07, "loss": 0.0685, "reward": 1.1678051948547363, "reward_std": 0.17951638996601105, "rewards/accuracy_reward_stage2": 0.2928052544593811, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 213 }, { "completion_length": 12.546875, "epoch": 0.03749780970737691, "grad_norm": 13.500767042240465, "kl": 0.00909423828125, "learning_rate": 9.626774137024705e-07, "loss": -0.0253, "reward": 1.5083041191101074, "reward_std": 0.14326375722885132, "rewards/accuracy_reward_stage2": 0.5239291787147522, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 214 }, { "completion_length": 7.265625, "epoch": 0.03767303311722446, "grad_norm": 21.444658879022974, "kl": 0.014404296875, "learning_rate": 9.62502190292623e-07, "loss": 0.0058, "reward": 1.606555461883545, "reward_std": 0.2108723670244217, "rewards/accuracy_reward_stage2": 0.6065554618835449, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 215 }, { "completion_length": 9.359375, "epoch": 0.037848256527072015, "grad_norm": 22.512872765127288, "kl": 0.45703125, "learning_rate": 9.623269668827755e-07, "loss": 0.1825, "reward": 1.3978098630905151, "reward_std": 0.15367010235786438, "rewards/accuracy_reward_stage2": 0.5228098630905151, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 216 }, { "completion_length": 12.921875, "epoch": 0.038023479936919576, "grad_norm": 21.09355853469942, "kl": 0.31640625, "learning_rate": 9.62151743472928e-07, "loss": 0.1267, "reward": 1.358708143234253, "reward_std": 0.23795433342456818, "rewards/accuracy_reward_stage2": 0.6087081432342529, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 217 }, { "completion_length": 9.25, "epoch": 0.03819870334676713, "grad_norm": 19.748781154588258, "kl": 0.294921875, "learning_rate": 9.619765200630805e-07, "loss": 0.1177, "reward": 1.5509915351867676, "reward_std": 0.12086137384176254, "rewards/accuracy_reward_stage2": 0.8009915351867676, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 218 }, { "completion_length": 8.09375, "epoch": 0.03837392675661468, "grad_norm": 35.49436359896782, "kl": 0.205078125, "learning_rate": 9.618012966532327e-07, "loss": 0.0577, "reward": 1.719040870666504, "reward_std": 0.19431626796722412, "rewards/accuracy_reward_stage2": 0.7346658706665039, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 219 }, { "completion_length": 8.953125, "epoch": 0.03854915016646224, "grad_norm": 18.25037389606022, "kl": 0.010498046875, "learning_rate": 9.616260732433852e-07, "loss": -0.04, "reward": 1.566141128540039, "reward_std": 0.20485994219779968, "rewards/accuracy_reward_stage2": 0.5817661881446838, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 220 }, { "completion_length": 9.15625, "epoch": 0.0387243735763098, "grad_norm": 15.805557927333963, "kl": 0.05615234375, "learning_rate": 9.614508498335377e-07, "loss": -0.0205, "reward": 1.4933180809020996, "reward_std": 0.17127634584903717, "rewards/accuracy_reward_stage2": 0.5089430212974548, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 221 }, { "completion_length": 12.390625, "epoch": 0.03889959698615735, "grad_norm": 13.08398417869419, "kl": 0.08154296875, "learning_rate": 9.612756264236902e-07, "loss": -0.0074, "reward": 1.6429219245910645, "reward_std": 0.09408406913280487, "rewards/accuracy_reward_stage2": 0.658547043800354, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 222 }, { "completion_length": 11.0, "epoch": 0.039074820396004906, "grad_norm": 17.417074560822787, "kl": 0.0167236328125, "learning_rate": 9.611004030138427e-07, "loss": 0.0067, "reward": 1.492321252822876, "reward_std": 0.10672628879547119, "rewards/accuracy_reward_stage2": 0.617321252822876, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 223 }, { "completion_length": 8.109375, "epoch": 0.03925004380585246, "grad_norm": 18.883396983473418, "kl": 0.019775390625, "learning_rate": 9.60925179603995e-07, "loss": -0.0363, "reward": 1.439524531364441, "reward_std": 0.20576725900173187, "rewards/accuracy_reward_stage2": 0.4551495909690857, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 224 }, { "completion_length": 11.1875, "epoch": 0.03942526721570002, "grad_norm": 21.665625000254725, "kl": 0.0185546875, "learning_rate": 9.607499561941475e-07, "loss": 0.0074, "reward": 1.6839239597320557, "reward_std": 0.21414509415626526, "rewards/accuracy_reward_stage2": 0.6839240193367004, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 225 }, { "completion_length": 9.28125, "epoch": 0.039600490625547574, "grad_norm": 14.634515141275852, "kl": 0.01007080078125, "learning_rate": 9.605747327843e-07, "loss": 0.004, "reward": 1.6269270181655884, "reward_std": 0.12803037464618683, "rewards/accuracy_reward_stage2": 0.6269270181655884, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 226 }, { "completion_length": 28.96875, "epoch": 0.03977571403539513, "grad_norm": 21.338816841964235, "kl": 0.018310546875, "learning_rate": 9.603995093744523e-07, "loss": 0.0073, "reward": 1.555863618850708, "reward_std": 0.12815237045288086, "rewards/accuracy_reward_stage2": 0.555863618850708, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 227 }, { "completion_length": 9.390625, "epoch": 0.03995093744524268, "grad_norm": 21.852815893340804, "kl": 0.0498046875, "learning_rate": 9.602242859646048e-07, "loss": 0.0199, "reward": 1.7307069301605225, "reward_std": 0.11314516514539719, "rewards/accuracy_reward_stage2": 0.7307069301605225, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 228 }, { "completion_length": 9.40625, "epoch": 0.04012616085509024, "grad_norm": 30.771365408147698, "kl": 0.0225830078125, "learning_rate": 9.600490625547573e-07, "loss": 0.009, "reward": 1.560467004776001, "reward_std": 0.22677713632583618, "rewards/accuracy_reward_stage2": 0.5604670643806458, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 229 }, { "completion_length": 7.4375, "epoch": 0.040301384264937797, "grad_norm": 24.481460538698858, "kl": 0.0966796875, "learning_rate": 9.598738391449097e-07, "loss": 0.0387, "reward": 1.6300032138824463, "reward_std": 0.1686421036720276, "rewards/accuracy_reward_stage2": 0.6300033330917358, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 230 }, { "completion_length": 9.15625, "epoch": 0.04047660767478535, "grad_norm": 28.843071660319026, "kl": 0.045654296875, "learning_rate": 9.596986157350622e-07, "loss": -0.0151, "reward": 1.376204252243042, "reward_std": 0.3445381224155426, "rewards/accuracy_reward_stage2": 0.39182931184768677, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 231 }, { "completion_length": 12.109375, "epoch": 0.040651831084632904, "grad_norm": 23.224141525934847, "kl": 0.0301513671875, "learning_rate": 9.595233923252145e-07, "loss": 0.012, "reward": 1.3188610076904297, "reward_std": 0.17884111404418945, "rewards/accuracy_reward_stage2": 0.3188610076904297, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 232 }, { "completion_length": 19.609375, "epoch": 0.040827054494480465, "grad_norm": 16.86419016940912, "kl": 0.01214599609375, "learning_rate": 9.59348168915367e-07, "loss": 0.0049, "reward": 1.3646348714828491, "reward_std": 0.13505005836486816, "rewards/accuracy_reward_stage2": 0.36463481187820435, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 233 }, { "completion_length": 8.5, "epoch": 0.04100227790432802, "grad_norm": 24.073123884487078, "kl": 0.01123046875, "learning_rate": 9.591729455055195e-07, "loss": 0.0045, "reward": 1.40625, "reward_std": 0.2041158676147461, "rewards/accuracy_reward_stage2": 0.40625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 234 }, { "completion_length": 13.4375, "epoch": 0.04117750131417557, "grad_norm": 4901.515461483319, "kl": 18.375, "learning_rate": 9.58997722095672e-07, "loss": 7.3251, "reward": 1.438122272491455, "reward_std": 0.15549173951148987, "rewards/accuracy_reward_stage2": 0.5631222724914551, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 235 }, { "completion_length": 9.34375, "epoch": 0.04135272472402313, "grad_norm": 14.789976756404938, "kl": 0.009521484375, "learning_rate": 9.588224986858245e-07, "loss": 0.0038, "reward": 1.650240421295166, "reward_std": 0.1713310033082962, "rewards/accuracy_reward_stage2": 0.650240421295166, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 236 }, { "completion_length": 13.609375, "epoch": 0.04152794813387069, "grad_norm": 584.9939223178902, "kl": 1.90625, "learning_rate": 9.586472752759768e-07, "loss": 0.7597, "reward": 1.3631982803344727, "reward_std": 0.2880287170410156, "rewards/accuracy_reward_stage2": 0.48819833993911743, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 237 }, { "completion_length": 10.40625, "epoch": 0.04170317154371824, "grad_norm": 20.92452543043772, "kl": 0.0235595703125, "learning_rate": 9.584720518661293e-07, "loss": 0.0094, "reward": 1.336254358291626, "reward_std": 0.19408045709133148, "rewards/accuracy_reward_stage2": 0.3362542986869812, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 238 }, { "completion_length": 11.390625, "epoch": 0.041878394953565795, "grad_norm": 15.628353567644007, "kl": 0.0079345703125, "learning_rate": 9.582968284562818e-07, "loss": 0.0032, "reward": 1.4190398454666138, "reward_std": 0.12067941576242447, "rewards/accuracy_reward_stage2": 0.41903984546661377, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 239 }, { "completion_length": 9.875, "epoch": 0.04205361836341335, "grad_norm": 23.718856680377566, "kl": 0.018798828125, "learning_rate": 9.58121605046434e-07, "loss": 0.0075, "reward": 1.7033743858337402, "reward_std": 0.2691870629787445, "rewards/accuracy_reward_stage2": 0.7033743858337402, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 240 }, { "completion_length": 7.96875, "epoch": 0.04222884177326091, "grad_norm": 16.06620576115406, "kl": 0.032958984375, "learning_rate": 9.579463816365865e-07, "loss": 0.0132, "reward": 1.472252368927002, "reward_std": 0.21186134219169617, "rewards/accuracy_reward_stage2": 0.5972523093223572, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 241 }, { "completion_length": 12.5625, "epoch": 0.042404065183108464, "grad_norm": 23.444470839653174, "kl": 0.039306640625, "learning_rate": 9.57771158226739e-07, "loss": 0.0157, "reward": 1.5591293573379517, "reward_std": 0.33478352427482605, "rewards/accuracy_reward_stage2": 0.5591292977333069, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 242 }, { "completion_length": 13.71875, "epoch": 0.04257928859295602, "grad_norm": 25.381272213819486, "kl": 0.0235595703125, "learning_rate": 9.575959348168915e-07, "loss": -0.0236, "reward": 1.4567725658416748, "reward_std": 0.3310222923755646, "rewards/accuracy_reward_stage2": 0.4723976254463196, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 243 }, { "completion_length": 14.03125, "epoch": 0.04275451200280357, "grad_norm": 24.936302441450792, "kl": 0.62109375, "learning_rate": 9.57420711407044e-07, "loss": 0.2483, "reward": 1.6432292461395264, "reward_std": 0.3550029397010803, "rewards/accuracy_reward_stage2": 0.7682291865348816, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 244 }, { "completion_length": 9.734375, "epoch": 0.04292973541265113, "grad_norm": 20.441411725972664, "kl": 0.0283203125, "learning_rate": 9.572454879971965e-07, "loss": -0.0229, "reward": 1.5584733486175537, "reward_std": 0.37590664625167847, "rewards/accuracy_reward_stage2": 0.5740982890129089, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 245 }, { "completion_length": 8.3125, "epoch": 0.043104958822498686, "grad_norm": 20.72964347938417, "kl": 0.0201416015625, "learning_rate": 9.570702645873488e-07, "loss": 0.0081, "reward": 1.5848780870437622, "reward_std": 0.2728351056575775, "rewards/accuracy_reward_stage2": 0.584878146648407, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 246 }, { "completion_length": 12.5625, "epoch": 0.04328018223234624, "grad_norm": 15.52199378970145, "kl": 0.007476806640625, "learning_rate": 9.568950411775013e-07, "loss": 0.003, "reward": 1.575636386871338, "reward_std": 0.13075202703475952, "rewards/accuracy_reward_stage2": 0.5756364464759827, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 247 }, { "completion_length": 12.0625, "epoch": 0.043455405642193794, "grad_norm": 19.395159139210442, "kl": 0.1533203125, "learning_rate": 9.567198177676538e-07, "loss": 0.0279, "reward": 1.4184027910232544, "reward_std": 0.3044259250164032, "rewards/accuracy_reward_stage2": 0.6840277910232544, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 248 }, { "completion_length": 5.765625, "epoch": 0.043630629052041354, "grad_norm": 14.48011797195782, "kl": 0.0079345703125, "learning_rate": 9.565445943578063e-07, "loss": 0.0032, "reward": 1.34375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 249 }, { "completion_length": 11.265625, "epoch": 0.04380585246188891, "grad_norm": 17.31147879550402, "kl": 0.0228271484375, "learning_rate": 9.563693709479585e-07, "loss": -0.0238, "reward": 1.598874568939209, "reward_std": 0.233104407787323, "rewards/accuracy_reward_stage2": 0.6144995093345642, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 250 }, { "completion_length": 12.0625, "epoch": 0.04398107587173646, "grad_norm": 16.454194341141637, "kl": 0.0361328125, "learning_rate": 9.56194147538111e-07, "loss": 0.0145, "reward": 1.520371675491333, "reward_std": 0.15112952888011932, "rewards/accuracy_reward_stage2": 0.5203717350959778, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 251 }, { "completion_length": 11.4375, "epoch": 0.04415629928158402, "grad_norm": 17.35487988167322, "kl": 0.0361328125, "learning_rate": 9.560189241282635e-07, "loss": 0.0145, "reward": 1.3840597867965698, "reward_std": 0.06660275906324387, "rewards/accuracy_reward_stage2": 0.3840597867965698, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 252 }, { "completion_length": 8.421875, "epoch": 0.04433152269143158, "grad_norm": 17.687468327124677, "kl": 0.029296875, "learning_rate": 9.55843700718416e-07, "loss": 0.0117, "reward": 1.7530488967895508, "reward_std": 0.05738438665866852, "rewards/accuracy_reward_stage2": 0.7530487775802612, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 253 }, { "completion_length": 12.28125, "epoch": 0.04450674610127913, "grad_norm": 19.913907277473474, "kl": 0.0693359375, "learning_rate": 9.556684773085683e-07, "loss": -0.0166, "reward": 1.416152000427246, "reward_std": 0.25723960995674133, "rewards/accuracy_reward_stage2": 0.5567771196365356, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 254 }, { "completion_length": 9.5625, "epoch": 0.044681969511126685, "grad_norm": 23.441539432847623, "kl": 0.08740234375, "learning_rate": 9.554932538987208e-07, "loss": 0.0349, "reward": 1.4265499114990234, "reward_std": 0.2730960547924042, "rewards/accuracy_reward_stage2": 0.42654991149902344, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 255 }, { "completion_length": 8.796875, "epoch": 0.044857192920974245, "grad_norm": 20.805671934114525, "kl": 0.04541015625, "learning_rate": 9.553180304888733e-07, "loss": 0.0182, "reward": 1.392343282699585, "reward_std": 0.18272897601127625, "rewards/accuracy_reward_stage2": 0.39234328269958496, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 256 }, { "completion_length": 11.71875, "epoch": 0.0450324163308218, "grad_norm": 23.194431431497243, "kl": 0.062255859375, "learning_rate": 9.551428070790258e-07, "loss": 0.0249, "reward": 1.399277687072754, "reward_std": 0.19788572192192078, "rewards/accuracy_reward_stage2": 0.3992777466773987, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 257 }, { "completion_length": 7.0625, "epoch": 0.04520763974066935, "grad_norm": 25.40844788041373, "kl": 0.0712890625, "learning_rate": 9.549675836691783e-07, "loss": 0.0285, "reward": 1.5427207946777344, "reward_std": 0.17167173326015472, "rewards/accuracy_reward_stage2": 0.6677207350730896, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 258 }, { "completion_length": 6.875, "epoch": 0.04538286315051691, "grad_norm": 19.327847215838386, "kl": 0.059326171875, "learning_rate": 9.547923602593305e-07, "loss": 0.0238, "reward": 1.63570237159729, "reward_std": 0.17358574271202087, "rewards/accuracy_reward_stage2": 0.63570237159729, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 259 }, { "completion_length": 10.59375, "epoch": 0.04555808656036447, "grad_norm": 12.462430172624714, "kl": 0.046142578125, "learning_rate": 9.54617136849483e-07, "loss": 0.0184, "reward": 1.4343960285186768, "reward_std": 0.0724828690290451, "rewards/accuracy_reward_stage2": 0.43439605832099915, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 260 }, { "completion_length": 18.265625, "epoch": 0.04573330997021202, "grad_norm": 18.39096279891861, "kl": 0.043212890625, "learning_rate": 9.544419134396355e-07, "loss": 0.0173, "reward": 1.4796587228775024, "reward_std": 0.2298499345779419, "rewards/accuracy_reward_stage2": 0.47965875267982483, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 261 }, { "completion_length": 11.5625, "epoch": 0.045908533380059575, "grad_norm": 18.064444413910984, "kl": 0.051513671875, "learning_rate": 9.54266690029788e-07, "loss": 0.0206, "reward": 1.385161280632019, "reward_std": 0.12799863517284393, "rewards/accuracy_reward_stage2": 0.510161280632019, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 262 }, { "completion_length": 8.75, "epoch": 0.04608375678990713, "grad_norm": 16.59071665301892, "kl": 0.10009765625, "learning_rate": 9.540914666199403e-07, "loss": 0.0401, "reward": 1.6114752292633057, "reward_std": 0.07131287455558777, "rewards/accuracy_reward_stage2": 0.6114752292633057, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 263 }, { "completion_length": 11.390625, "epoch": 0.04625898019975469, "grad_norm": 27.809570183113333, "kl": 0.0888671875, "learning_rate": 9.539162432100928e-07, "loss": 0.0355, "reward": 1.5748344659805298, "reward_std": 0.2848876714706421, "rewards/accuracy_reward_stage2": 0.5748344659805298, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 264 }, { "completion_length": 9.59375, "epoch": 0.046434203609602244, "grad_norm": 24.14804135776577, "kl": 0.04931640625, "learning_rate": 9.537410198002453e-07, "loss": 0.0197, "reward": 1.8005200624465942, "reward_std": 0.22077980637550354, "rewards/accuracy_reward_stage2": 0.8005200624465942, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 265 }, { "completion_length": 6.8125, "epoch": 0.0466094270194498, "grad_norm": 20.759906763884782, "kl": 0.0299072265625, "learning_rate": 9.535657963903977e-07, "loss": 0.0023, "reward": 1.408979058265686, "reward_std": 0.11087541282176971, "rewards/accuracy_reward_stage2": 0.42460405826568604, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 266 }, { "completion_length": 9.265625, "epoch": 0.04678465042929735, "grad_norm": 21.32710865447075, "kl": 0.10791015625, "learning_rate": 9.533905729805502e-07, "loss": 0.0433, "reward": 1.3225247859954834, "reward_std": 0.20090004801750183, "rewards/accuracy_reward_stage2": 0.5725248456001282, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 267 }, { "completion_length": 10.671875, "epoch": 0.04695987383914491, "grad_norm": 24.455639169887064, "kl": 0.345703125, "learning_rate": 9.532153495707026e-07, "loss": 0.1386, "reward": 1.4042267799377441, "reward_std": 0.15356406569480896, "rewards/accuracy_reward_stage2": 0.5292267799377441, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 268 }, { "completion_length": 15.328125, "epoch": 0.047135097248992466, "grad_norm": 21.541350855308803, "kl": 0.035400390625, "learning_rate": 9.53040126160855e-07, "loss": 0.0141, "reward": 1.6411187648773193, "reward_std": 0.166485995054245, "rewards/accuracy_reward_stage2": 0.6411186456680298, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 269 }, { "completion_length": 9.3125, "epoch": 0.04731032065884002, "grad_norm": 22.15774609889451, "kl": 0.0625, "learning_rate": 9.528649027510075e-07, "loss": 0.025, "reward": 1.4429640769958496, "reward_std": 0.19489288330078125, "rewards/accuracy_reward_stage2": 0.5679640173912048, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 270 }, { "completion_length": 11.421875, "epoch": 0.047485544068687574, "grad_norm": 16.491138324168993, "kl": 0.023193359375, "learning_rate": 9.526896793411599e-07, "loss": -0.0349, "reward": 1.5896495580673218, "reward_std": 0.20764687657356262, "rewards/accuracy_reward_stage2": 0.6052745580673218, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 271 }, { "completion_length": 5.953125, "epoch": 0.047660767478535135, "grad_norm": 21.88275639215845, "kl": 0.09716796875, "learning_rate": 9.525144559313124e-07, "loss": 0.0388, "reward": 1.6785914897918701, "reward_std": 0.16737723350524902, "rewards/accuracy_reward_stage2": 0.6785914897918701, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 272 }, { "completion_length": 11.9375, "epoch": 0.04783599088838269, "grad_norm": 21.467567337800396, "kl": 0.62890625, "learning_rate": 9.523392325214649e-07, "loss": 0.2498, "reward": 1.6683162450790405, "reward_std": 0.2646476924419403, "rewards/accuracy_reward_stage2": 0.7933162450790405, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 273 }, { "completion_length": 12.921875, "epoch": 0.04801121429823024, "grad_norm": 25.858447993597725, "kl": 0.56640625, "learning_rate": 9.521640091116173e-07, "loss": 0.2256, "reward": 1.4236572980880737, "reward_std": 0.17377673089504242, "rewards/accuracy_reward_stage2": 0.5486572980880737, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 274 }, { "completion_length": 14.15625, "epoch": 0.048186437708077796, "grad_norm": 46.2527752830865, "kl": 0.03857421875, "learning_rate": 9.519887857017697e-07, "loss": 0.0154, "reward": 1.678868055343628, "reward_std": 0.2035331428050995, "rewards/accuracy_reward_stage2": 0.6788681745529175, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 275 }, { "completion_length": 7.59375, "epoch": 0.04836166111792536, "grad_norm": 32.720924431721514, "kl": 0.16796875, "learning_rate": 9.518135622919221e-07, "loss": 0.0383, "reward": 1.8637468814849854, "reward_std": 0.2025931477546692, "rewards/accuracy_reward_stage2": 0.8793718814849854, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 276 }, { "completion_length": 7.53125, "epoch": 0.04853688452777291, "grad_norm": 21.119273458560386, "kl": 0.076171875, "learning_rate": 9.516383388820746e-07, "loss": -0.0137, "reward": 1.4132182598114014, "reward_std": 0.24847757816314697, "rewards/accuracy_reward_stage2": 0.5538431406021118, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 277 }, { "completion_length": 12.390625, "epoch": 0.048712107937620465, "grad_norm": 14.369302454959714, "kl": 0.08154296875, "learning_rate": 9.514631154722271e-07, "loss": 0.0327, "reward": 1.5831317901611328, "reward_std": 0.12140820920467377, "rewards/accuracy_reward_stage2": 0.583131730556488, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 278 }, { "completion_length": 6.421875, "epoch": 0.04888733134746802, "grad_norm": 12.107063676651114, "kl": 0.01446533203125, "learning_rate": 9.512878920623794e-07, "loss": 0.0058, "reward": 1.71875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward_stage2": 0.71875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 279 }, { "completion_length": 7.96875, "epoch": 0.04906255475731558, "grad_norm": 20.12750865493871, "kl": 0.06884765625, "learning_rate": 9.511126686525319e-07, "loss": 0.0275, "reward": 1.6211915016174316, "reward_std": 0.1551232933998108, "rewards/accuracy_reward_stage2": 0.6211915612220764, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 280 }, { "completion_length": 22.015625, "epoch": 0.04923777816716313, "grad_norm": 118700.45485301006, "kl": 500.0, "learning_rate": 9.509374452426844e-07, "loss": 200.9689, "reward": 1.578223705291748, "reward_std": 0.22198337316513062, "rewards/accuracy_reward_stage2": 0.718848705291748, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 281 }, { "completion_length": 9.75, "epoch": 0.04941300157701069, "grad_norm": 18.937226993599705, "kl": 0.06494140625, "learning_rate": 9.507622218328368e-07, "loss": 0.026, "reward": 1.6166949272155762, "reward_std": 0.2150428295135498, "rewards/accuracy_reward_stage2": 0.6166949272155762, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 282 }, { "completion_length": 16.859375, "epoch": 0.04958822498685824, "grad_norm": 35.92465554139422, "kl": 0.283203125, "learning_rate": 9.505869984229893e-07, "loss": 0.1138, "reward": 1.3042311668395996, "reward_std": 0.24792616069316864, "rewards/accuracy_reward_stage2": 0.5542311072349548, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 283 }, { "completion_length": 9.109375, "epoch": 0.0497634483967058, "grad_norm": 14.33229856110137, "kl": 0.146484375, "learning_rate": 9.504117750131417e-07, "loss": 0.0588, "reward": 1.455843210220337, "reward_std": 0.07982275635004044, "rewards/accuracy_reward_stage2": 0.5808432102203369, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 284 }, { "completion_length": 11.046875, "epoch": 0.049938671806553356, "grad_norm": 16.522511082795745, "kl": 0.06787109375, "learning_rate": 9.502365516032942e-07, "loss": 0.0272, "reward": 1.5573397874832153, "reward_std": 0.21037398278713226, "rewards/accuracy_reward_stage2": 0.5573397874832153, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 285 }, { "completion_length": 7.65625, "epoch": 0.05011389521640091, "grad_norm": 18.612997661361707, "kl": 0.059814453125, "learning_rate": 9.500613281934467e-07, "loss": -0.0106, "reward": 1.5225942134857178, "reward_std": 0.1994466781616211, "rewards/accuracy_reward_stage2": 0.5382192134857178, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 286 }, { "completion_length": 8.34375, "epoch": 0.050289118626248464, "grad_norm": 21.052614034383808, "kl": 0.0654296875, "learning_rate": 9.498861047835991e-07, "loss": 0.0261, "reward": 1.7560055255889893, "reward_std": 0.1533891260623932, "rewards/accuracy_reward_stage2": 0.756005585193634, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 287 }, { "completion_length": 7.671875, "epoch": 0.050464342036096024, "grad_norm": 14.416852702309425, "kl": 0.033935546875, "learning_rate": 9.497108813737515e-07, "loss": 0.0136, "reward": 1.5063834190368652, "reward_std": 0.1799892634153366, "rewards/accuracy_reward_stage2": 0.5063834190368652, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 288 }, { "completion_length": 16.546875, "epoch": 0.05063956544594358, "grad_norm": 23.690892862827162, "kl": 0.197265625, "learning_rate": 9.495356579639038e-07, "loss": 0.0789, "reward": 1.4050755500793457, "reward_std": 0.13149887323379517, "rewards/accuracy_reward_stage2": 0.5300755500793457, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 289 }, { "completion_length": 13.421875, "epoch": 0.05081478885579113, "grad_norm": 24.095865194664135, "kl": 0.0849609375, "learning_rate": 9.493604345540563e-07, "loss": -0.0103, "reward": 1.5336406230926514, "reward_std": 0.23935247957706451, "rewards/accuracy_reward_stage2": 0.5492656826972961, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 290 }, { "completion_length": 9.03125, "epoch": 0.050990012265638686, "grad_norm": 20.282179837458845, "kl": 0.23046875, "learning_rate": 9.491852111442088e-07, "loss": 0.092, "reward": 1.5, "reward_std": 0.1872510462999344, "rewards/accuracy_reward_stage2": 0.625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 291 }, { "completion_length": 7.8125, "epoch": 0.05116523567548625, "grad_norm": 23.860637551007596, "kl": 0.193359375, "learning_rate": 9.490099877343612e-07, "loss": 0.0772, "reward": 1.546875, "reward_std": 0.16887323558330536, "rewards/accuracy_reward_stage2": 0.671875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 292 }, { "completion_length": 11.703125, "epoch": 0.0513404590853338, "grad_norm": 24.13121591753904, "kl": 0.0537109375, "learning_rate": 9.488347643245137e-07, "loss": -0.0227, "reward": 1.5658124685287476, "reward_std": 0.2879348397254944, "rewards/accuracy_reward_stage2": 0.5814374685287476, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 293 }, { "completion_length": 22.875, "epoch": 0.051515682495181354, "grad_norm": 18.675519791516496, "kl": 0.0263671875, "learning_rate": 9.486595409146662e-07, "loss": 0.0106, "reward": 1.2921215295791626, "reward_std": 0.17938996851444244, "rewards/accuracy_reward_stage2": 0.4171214997768402, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 294 }, { "completion_length": 10.9375, "epoch": 0.051690905905028915, "grad_norm": 20.26634765781072, "kl": 0.00799560546875, "learning_rate": 9.484843175048186e-07, "loss": 0.0032, "reward": 1.6875, "reward_std": 0.2041158676147461, "rewards/accuracy_reward_stage2": 0.6875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 295 }, { "completion_length": 8.53125, "epoch": 0.05186612931487647, "grad_norm": 19.78494567228972, "kl": 0.0208740234375, "learning_rate": 9.483090940949711e-07, "loss": 0.0084, "reward": 1.633901834487915, "reward_std": 0.2172580063343048, "rewards/accuracy_reward_stage2": 0.633901834487915, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 296 }, { "completion_length": 21.625, "epoch": 0.05204135272472402, "grad_norm": 17.083860605215293, "kl": 0.0120849609375, "learning_rate": 9.481338706851235e-07, "loss": 0.0048, "reward": 1.3703351020812988, "reward_std": 0.09173109382390976, "rewards/accuracy_reward_stage2": 0.3703351616859436, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 297 }, { "completion_length": 10.234375, "epoch": 0.05221657613457158, "grad_norm": 20.168621744741493, "kl": 0.06640625, "learning_rate": 9.47958647275276e-07, "loss": 0.0265, "reward": 1.5391501188278198, "reward_std": 0.17844170331954956, "rewards/accuracy_reward_stage2": 0.6641501188278198, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 298 }, { "completion_length": 10.703125, "epoch": 0.05239179954441914, "grad_norm": 21.384155893839793, "kl": 0.02685546875, "learning_rate": 9.477834238654284e-07, "loss": 0.0107, "reward": 1.5403645038604736, "reward_std": 0.3316608667373657, "rewards/accuracy_reward_stage2": 0.5403645634651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 299 }, { "completion_length": 15.5, "epoch": 0.05256702295426669, "grad_norm": 67.3784361561861, "kl": 0.478515625, "learning_rate": 9.476082004555808e-07, "loss": 0.1914, "reward": 1.2157280445098877, "reward_std": 0.050552383065223694, "rewards/accuracy_reward_stage2": 0.4657280445098877, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 300 }, { "completion_length": 13.5, "epoch": 0.052742246364114245, "grad_norm": 17.64358996401573, "kl": 0.053466796875, "learning_rate": 9.474329770457332e-07, "loss": 0.0213, "reward": 1.3356982469558716, "reward_std": 0.1435163915157318, "rewards/accuracy_reward_stage2": 0.3356982469558716, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 301 }, { "completion_length": 14.359375, "epoch": 0.0529174697739618, "grad_norm": 22.56834868655897, "kl": 0.03173828125, "learning_rate": 9.472577536358857e-07, "loss": 0.0127, "reward": 1.435058355331421, "reward_std": 0.17073744535446167, "rewards/accuracy_reward_stage2": 0.4350583851337433, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 302 }, { "completion_length": 11.578125, "epoch": 0.05309269318380936, "grad_norm": 25.73115764646642, "kl": 0.072265625, "learning_rate": 9.470825302260381e-07, "loss": 0.0289, "reward": 1.601118564605713, "reward_std": 0.28823572397232056, "rewards/accuracy_reward_stage2": 0.6011185646057129, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 303 }, { "completion_length": 7.828125, "epoch": 0.053267916593656914, "grad_norm": 21.950921128147304, "kl": 0.046142578125, "learning_rate": 9.469073068161906e-07, "loss": -0.0387, "reward": 1.4147183895111084, "reward_std": 0.2501143217086792, "rewards/accuracy_reward_stage2": 0.5709684491157532, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 304 }, { "completion_length": 8.578125, "epoch": 0.05344314000350447, "grad_norm": 77.45405609155354, "kl": 0.376953125, "learning_rate": 9.46732083406343e-07, "loss": 0.106, "reward": 1.5750467777252197, "reward_std": 0.15764901041984558, "rewards/accuracy_reward_stage2": 0.5906718373298645, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 305 }, { "completion_length": 11.578125, "epoch": 0.05361836341335202, "grad_norm": 17.972984505078564, "kl": 0.09375, "learning_rate": 9.465568599964955e-07, "loss": 0.0374, "reward": 1.3125, "reward_std": 0.2314550280570984, "rewards/accuracy_reward_stage2": 0.4375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 306 }, { "completion_length": 9.234375, "epoch": 0.05379358682319958, "grad_norm": 21.731586710719984, "kl": 0.07763671875, "learning_rate": 9.46381636586648e-07, "loss": -0.0044, "reward": 1.406597375869751, "reward_std": 0.2542756199836731, "rewards/accuracy_reward_stage2": 0.42222240567207336, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 307 }, { "completion_length": 10.625, "epoch": 0.053968810233047136, "grad_norm": 49.507632531802265, "kl": 0.349609375, "learning_rate": 9.462064131768004e-07, "loss": 0.1399, "reward": 1.4937288761138916, "reward_std": 0.19214007258415222, "rewards/accuracy_reward_stage2": 0.6187288761138916, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 308 }, { "completion_length": 9.125, "epoch": 0.05414403364289469, "grad_norm": 16.914177137657322, "kl": 0.0255126953125, "learning_rate": 9.460311897669528e-07, "loss": 0.0102, "reward": 1.5506947040557861, "reward_std": 0.1157640889286995, "rewards/accuracy_reward_stage2": 0.5506946444511414, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 309 }, { "completion_length": 15.046875, "epoch": 0.054319257052742244, "grad_norm": 14.960025362961645, "kl": 0.022216796875, "learning_rate": 9.458559663571053e-07, "loss": 0.0089, "reward": 1.4643514156341553, "reward_std": 0.16417983174324036, "rewards/accuracy_reward_stage2": 0.4643513560295105, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 310 }, { "completion_length": 18.390625, "epoch": 0.054494480462589805, "grad_norm": 21.473871880094755, "kl": 0.0419921875, "learning_rate": 9.456807429472577e-07, "loss": 0.0168, "reward": 1.3595951795578003, "reward_std": 0.24870413541793823, "rewards/accuracy_reward_stage2": 0.4845951795578003, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 311 }, { "completion_length": 13.734375, "epoch": 0.05466970387243736, "grad_norm": 14.675544367354817, "kl": 0.0269775390625, "learning_rate": 9.455055195374102e-07, "loss": -0.0334, "reward": 1.4303240776062012, "reward_std": 0.1907956451177597, "rewards/accuracy_reward_stage2": 0.5709490180015564, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 312 }, { "completion_length": 8.640625, "epoch": 0.05484492728228491, "grad_norm": 21.425310572049685, "kl": 0.03125, "learning_rate": 9.453302961275626e-07, "loss": 0.0125, "reward": 1.7457122802734375, "reward_std": 0.2760339379310608, "rewards/accuracy_reward_stage2": 0.745712399482727, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 313 }, { "completion_length": 10.1875, "epoch": 0.055020150692132466, "grad_norm": 23.740271575825194, "kl": 0.1279296875, "learning_rate": 9.45155072717715e-07, "loss": 0.051, "reward": 1.4499235153198242, "reward_std": 0.1825544536113739, "rewards/accuracy_reward_stage2": 0.44992342591285706, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 314 }, { "completion_length": 11.546875, "epoch": 0.05519537410198003, "grad_norm": 24.347783431235495, "kl": 0.046630859375, "learning_rate": 9.449798493078675e-07, "loss": -0.0167, "reward": 1.5818061828613281, "reward_std": 0.36251744627952576, "rewards/accuracy_reward_stage2": 0.5974311828613281, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 315 }, { "completion_length": 15.59375, "epoch": 0.05537059751182758, "grad_norm": 26.675134620684368, "kl": 0.25, "learning_rate": 9.448046258980199e-07, "loss": 0.1, "reward": 1.3757495880126953, "reward_std": 0.25632089376449585, "rewards/accuracy_reward_stage2": 0.6257495284080505, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 316 }, { "completion_length": 14.234375, "epoch": 0.055545820921675135, "grad_norm": 19.637641539670906, "kl": 0.57421875, "learning_rate": 9.446294024881724e-07, "loss": 0.2295, "reward": 1.5519661903381348, "reward_std": 0.14337322115898132, "rewards/accuracy_reward_stage2": 0.6769663095474243, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 317 }, { "completion_length": 10.84375, "epoch": 0.05572104433152269, "grad_norm": 21.498652862245095, "kl": 0.0211181640625, "learning_rate": 9.444541790783249e-07, "loss": -0.0338, "reward": 1.237978219985962, "reward_std": 0.14729207754135132, "rewards/accuracy_reward_stage2": 0.2536032199859619, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 318 }, { "completion_length": 7.671875, "epoch": 0.05589626774137025, "grad_norm": 77.66592293586824, "kl": 0.0712890625, "learning_rate": 9.442789556684772e-07, "loss": -0.0049, "reward": 1.20796537399292, "reward_std": 0.19719843566417694, "rewards/accuracy_reward_stage2": 0.22359028458595276, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 319 }, { "completion_length": 17.5625, "epoch": 0.0560714911512178, "grad_norm": 24.167033254443226, "kl": 0.0576171875, "learning_rate": 9.441037322586297e-07, "loss": 0.023, "reward": 1.460500717163086, "reward_std": 0.17372924089431763, "rewards/accuracy_reward_stage2": 0.46050071716308594, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 320 }, { "completion_length": 7.8125, "epoch": 0.05624671456106536, "grad_norm": 23.422114499103813, "kl": 0.0284423828125, "learning_rate": 9.439285088487821e-07, "loss": 0.0114, "reward": 1.719941258430481, "reward_std": 0.22414857149124146, "rewards/accuracy_reward_stage2": 0.7199413180351257, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 321 }, { "completion_length": 14.03125, "epoch": 0.05642193797091291, "grad_norm": 21.216710069826014, "kl": 0.04443359375, "learning_rate": 9.437532854389346e-07, "loss": 0.0177, "reward": 1.4304391145706177, "reward_std": 0.2377379685640335, "rewards/accuracy_reward_stage2": 0.43043917417526245, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 322 }, { "completion_length": 8.375, "epoch": 0.05659716138076047, "grad_norm": 15.917513479644082, "kl": 0.048583984375, "learning_rate": 9.435780620290871e-07, "loss": 0.0194, "reward": 1.3735301494598389, "reward_std": 0.16220563650131226, "rewards/accuracy_reward_stage2": 0.3735300600528717, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 323 }, { "completion_length": 6.609375, "epoch": 0.056772384790608026, "grad_norm": 15.550109521299955, "kl": 0.0230712890625, "learning_rate": 9.434028386192395e-07, "loss": 0.0092, "reward": 1.5212457180023193, "reward_std": 0.06378524005413055, "rewards/accuracy_reward_stage2": 0.6462457180023193, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 324 }, { "completion_length": 17.5, "epoch": 0.05694760820045558, "grad_norm": 24.026423483219208, "kl": 0.09423828125, "learning_rate": 9.43227615209392e-07, "loss": 0.0088, "reward": 1.6016652584075928, "reward_std": 0.17152510583400726, "rewards/accuracy_reward_stage2": 0.6172903776168823, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 325 }, { "completion_length": 10.5, "epoch": 0.05712283161030313, "grad_norm": 22.942754962486735, "kl": 0.0478515625, "learning_rate": 9.430523917995444e-07, "loss": 0.0192, "reward": 1.5287861824035645, "reward_std": 0.2127273827791214, "rewards/accuracy_reward_stage2": 0.5287861227989197, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 326 }, { "completion_length": 14.0625, "epoch": 0.057298055020150694, "grad_norm": 19.828393084324876, "kl": 0.0859375, "learning_rate": 9.428771683896968e-07, "loss": 0.0055, "reward": 1.4723576307296753, "reward_std": 0.22869396209716797, "rewards/accuracy_reward_stage2": 0.6129826307296753, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 327 }, { "completion_length": 22.0, "epoch": 0.05747327842999825, "grad_norm": 19.45292668071304, "kl": 0.03955078125, "learning_rate": 9.427019449798493e-07, "loss": 0.0158, "reward": 1.2495118379592896, "reward_std": 0.16045579314231873, "rewards/accuracy_reward_stage2": 0.3745118975639343, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 328 }, { "completion_length": 7.421875, "epoch": 0.0576485018398458, "grad_norm": 19.920681409272653, "kl": 0.03564453125, "learning_rate": 9.425267215700016e-07, "loss": 0.0142, "reward": 1.5295330286026, "reward_std": 0.25502684712409973, "rewards/accuracy_reward_stage2": 0.6545330286026001, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 329 }, { "completion_length": 12.46875, "epoch": 0.057823725249693356, "grad_norm": 21.62160329452179, "kl": 0.44921875, "learning_rate": 9.423514981601541e-07, "loss": 0.1801, "reward": 1.4313299655914307, "reward_std": 0.2229369878768921, "rewards/accuracy_reward_stage2": 0.5563299655914307, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 330 }, { "completion_length": 9.734375, "epoch": 0.057998948659540916, "grad_norm": 17.07241858301237, "kl": 0.038818359375, "learning_rate": 9.421762747503066e-07, "loss": -0.0576, "reward": 1.6263515949249268, "reward_std": 0.1974027454853058, "rewards/accuracy_reward_stage2": 0.6576014757156372, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 331 }, { "completion_length": 9.40625, "epoch": 0.05817417206938847, "grad_norm": 20.23984568805099, "kl": 0.1005859375, "learning_rate": 9.42001051340459e-07, "loss": 0.0019, "reward": 1.738767147064209, "reward_std": 0.19900760054588318, "rewards/accuracy_reward_stage2": 0.7543920278549194, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 332 }, { "completion_length": 7.765625, "epoch": 0.058349395479236024, "grad_norm": 25.418557978515302, "kl": 0.060546875, "learning_rate": 9.418258279306115e-07, "loss": -0.0145, "reward": 1.5975984334945679, "reward_std": 0.2777034044265747, "rewards/accuracy_reward_stage2": 0.6132233738899231, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 333 }, { "completion_length": 8.984375, "epoch": 0.058524618889083585, "grad_norm": 39.31755014042999, "kl": 0.2216796875, "learning_rate": 9.41650604520764e-07, "loss": 0.0369, "reward": 1.4411017894744873, "reward_std": 0.2632755935192108, "rewards/accuracy_reward_stage2": 0.47235187888145447, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 334 }, { "completion_length": 6.015625, "epoch": 0.05869984229893114, "grad_norm": 13.464327717734951, "kl": 0.0186767578125, "learning_rate": 9.414753811109164e-07, "loss": -0.0141, "reward": 1.824305534362793, "reward_std": 0.07549665868282318, "rewards/accuracy_reward_stage2": 0.839930534362793, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 335 }, { "completion_length": 9.671875, "epoch": 0.05887506570877869, "grad_norm": 19.3182318322218, "kl": 0.103515625, "learning_rate": 9.413001577010689e-07, "loss": 0.0215, "reward": 1.6234338283538818, "reward_std": 0.20951224863529205, "rewards/accuracy_reward_stage2": 0.7640588283538818, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 336 }, { "completion_length": 14.65625, "epoch": 0.05905028911862625, "grad_norm": 17.7357517309841, "kl": 0.01904296875, "learning_rate": 9.411249342912213e-07, "loss": 0.0076, "reward": 1.3733090162277222, "reward_std": 0.11433231830596924, "rewards/accuracy_reward_stage2": 0.6233089566230774, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 337 }, { "completion_length": 11.671875, "epoch": 0.05922551252847381, "grad_norm": 16.671151395451187, "kl": 0.06982421875, "learning_rate": 9.409497108813738e-07, "loss": -0.0352, "reward": 1.4322497844696045, "reward_std": 0.2082475870847702, "rewards/accuracy_reward_stage2": 0.4634997248649597, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 338 }, { "completion_length": 11.59375, "epoch": 0.05940073593832136, "grad_norm": 19.83687104514724, "kl": 0.060546875, "learning_rate": 9.407744874715261e-07, "loss": -0.02, "reward": 1.4493414163589478, "reward_std": 0.2299998253583908, "rewards/accuracy_reward_stage2": 0.5899664163589478, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 339 }, { "completion_length": 15.78125, "epoch": 0.059575959348168915, "grad_norm": 21.61938135228256, "kl": 0.056884765625, "learning_rate": 9.405992640616785e-07, "loss": 0.0228, "reward": 1.6063339710235596, "reward_std": 0.23589658737182617, "rewards/accuracy_reward_stage2": 0.6063340306282043, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 340 }, { "completion_length": 7.671875, "epoch": 0.05975118275801647, "grad_norm": 16.099045116611332, "kl": 0.033203125, "learning_rate": 9.40424040651831e-07, "loss": -0.0984, "reward": 1.203125, "reward_std": 0.19044627249240875, "rewards/accuracy_reward_stage2": 0.265625, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 341 }, { "completion_length": 10.09375, "epoch": 0.05992640616786403, "grad_norm": 17.02533664481703, "kl": 0.039306640625, "learning_rate": 9.402488172419835e-07, "loss": 0.0157, "reward": 1.7135874032974243, "reward_std": 0.18373428285121918, "rewards/accuracy_reward_stage2": 0.7135874032974243, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 342 }, { "completion_length": 8.0625, "epoch": 0.060101629577711584, "grad_norm": 16.60695937815947, "kl": 0.01458740234375, "learning_rate": 9.400735938321359e-07, "loss": -0.0231, "reward": 1.6499578952789307, "reward_std": 0.1063762977719307, "rewards/accuracy_reward_stage2": 0.6655828952789307, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 343 }, { "completion_length": 12.5, "epoch": 0.06027685298755914, "grad_norm": 21.365233496072975, "kl": 0.56640625, "learning_rate": 9.398983704222884e-07, "loss": 0.2267, "reward": 1.409088134765625, "reward_std": 0.11065279692411423, "rewards/accuracy_reward_stage2": 0.5340880751609802, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 344 }, { "completion_length": 6.984375, "epoch": 0.06045207639740669, "grad_norm": 20.21324822494437, "kl": 0.0289306640625, "learning_rate": 9.397231470124408e-07, "loss": 0.0116, "reward": 1.7402604818344116, "reward_std": 0.20953799784183502, "rewards/accuracy_reward_stage2": 0.7402604818344116, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 345 }, { "completion_length": 12.8125, "epoch": 0.06062729980725425, "grad_norm": 30.088987680319942, "kl": 0.4296875, "learning_rate": 9.395479236025933e-07, "loss": 0.1713, "reward": 1.4402340650558472, "reward_std": 0.3298301696777344, "rewards/accuracy_reward_stage2": 0.5652340650558472, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 346 }, { "completion_length": 17.25, "epoch": 0.060802523217101806, "grad_norm": 16.63132010533935, "kl": 0.056396484375, "learning_rate": 9.393727001927458e-07, "loss": 0.0226, "reward": 1.2534388303756714, "reward_std": 0.12893691658973694, "rewards/accuracy_reward_stage2": 0.37843888998031616, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 347 }, { "completion_length": 12.859375, "epoch": 0.06097774662694936, "grad_norm": 19.911215792220517, "kl": 0.057861328125, "learning_rate": 9.391974767828981e-07, "loss": 0.0231, "reward": 1.7468256950378418, "reward_std": 0.16623491048812866, "rewards/accuracy_reward_stage2": 0.7468256950378418, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 348 }, { "completion_length": 13.375, "epoch": 0.061152970036796914, "grad_norm": 16.62225918229597, "kl": 0.04052734375, "learning_rate": 9.390222533730506e-07, "loss": 0.0162, "reward": 1.4750198125839233, "reward_std": 0.08836042135953903, "rewards/accuracy_reward_stage2": 0.4750198423862457, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 349 }, { "completion_length": 9.453125, "epoch": 0.061328193446644474, "grad_norm": 17.918736470818693, "kl": 0.0264892578125, "learning_rate": 9.388470299632031e-07, "loss": -0.0622, "reward": 1.5052083730697632, "reward_std": 0.2088155895471573, "rewards/accuracy_reward_stage2": 0.5677083134651184, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 350 }, { "completion_length": 16.828125, "epoch": 0.06150341685649203, "grad_norm": 18.93734858541338, "kl": 0.029052734375, "learning_rate": 9.386718065533555e-07, "loss": 0.0116, "reward": 1.6041667461395264, "reward_std": 0.1329318881034851, "rewards/accuracy_reward_stage2": 0.6041666269302368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 351 }, { "completion_length": 14.65625, "epoch": 0.06167864026633958, "grad_norm": 13.115570119136578, "kl": 0.007659912109375, "learning_rate": 9.384965831435079e-07, "loss": -0.0411, "reward": 1.5149922370910645, "reward_std": 0.10126683115959167, "rewards/accuracy_reward_stage2": 0.5306171178817749, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 352 }, { "completion_length": 10.484375, "epoch": 0.061853863676187136, "grad_norm": 23.67866454130065, "kl": 0.0673828125, "learning_rate": 9.383213597336603e-07, "loss": 0.0268, "reward": 1.462594747543335, "reward_std": 0.2419978380203247, "rewards/accuracy_reward_stage2": 0.4625946879386902, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 353 }, { "completion_length": 10.203125, "epoch": 0.0620290870860347, "grad_norm": 28.223532476857716, "kl": 0.054443359375, "learning_rate": 9.381461363238128e-07, "loss": 0.0217, "reward": 1.5003230571746826, "reward_std": 0.18812508881092072, "rewards/accuracy_reward_stage2": 0.6253230571746826, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 354 }, { "completion_length": 12.34375, "epoch": 0.06220431049588225, "grad_norm": 17.401607076165597, "kl": 0.0191650390625, "learning_rate": 9.379709129139653e-07, "loss": 0.0077, "reward": 1.508453369140625, "reward_std": 0.10828704386949539, "rewards/accuracy_reward_stage2": 0.508453369140625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 355 }, { "completion_length": 12.484375, "epoch": 0.062379533905729805, "grad_norm": 23.054852828053292, "kl": 0.142578125, "learning_rate": 9.377956895041177e-07, "loss": 0.0571, "reward": 1.3993406295776367, "reward_std": 0.12222443521022797, "rewards/accuracy_reward_stage2": 0.6493405103683472, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 356 }, { "completion_length": 9.296875, "epoch": 0.06255475731557736, "grad_norm": 19.277548157506466, "kl": 0.125, "learning_rate": 9.376204660942702e-07, "loss": 0.046, "reward": 1.6770656108856201, "reward_std": 0.14195884764194489, "rewards/accuracy_reward_stage2": 0.8020656704902649, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 357 }, { "completion_length": 8.9375, "epoch": 0.06272998072542492, "grad_norm": 14.01438455619914, "kl": 0.0203857421875, "learning_rate": 9.374452426844227e-07, "loss": 0.0082, "reward": 1.296875, "reward_std": 0.1804211586713791, "rewards/accuracy_reward_stage2": 0.328125, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 358 }, { "completion_length": 9.578125, "epoch": 0.06290520413527247, "grad_norm": 18.87541757343168, "kl": 0.06591796875, "learning_rate": 9.37270019274575e-07, "loss": 0.0265, "reward": 1.4920721054077148, "reward_std": 0.13774442672729492, "rewards/accuracy_reward_stage2": 0.49207204580307007, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 359 }, { "completion_length": 10.625, "epoch": 0.06308042754512003, "grad_norm": 19.89411336926994, "kl": 0.0615234375, "learning_rate": 9.370947958647275e-07, "loss": -0.0196, "reward": 1.503807544708252, "reward_std": 0.26093435287475586, "rewards/accuracy_reward_stage2": 0.6444324851036072, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 360 }, { "completion_length": 15.46875, "epoch": 0.06325565095496759, "grad_norm": 23.077935193360588, "kl": 0.130859375, "learning_rate": 9.369195724548799e-07, "loss": 0.0129, "reward": 1.599015235900879, "reward_std": 0.2068370282649994, "rewards/accuracy_reward_stage2": 0.6146402359008789, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 361 }, { "completion_length": 7.875, "epoch": 0.06343087436481513, "grad_norm": 21.35498118118063, "kl": 0.0303955078125, "learning_rate": 9.367443490450324e-07, "loss": 0.0122, "reward": 1.7239583730697632, "reward_std": 0.28599968552589417, "rewards/accuracy_reward_stage2": 0.7239583730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 362 }, { "completion_length": 12.40625, "epoch": 0.0636060977746627, "grad_norm": 1170.2714969237206, "kl": 3.21875, "learning_rate": 9.365691256351849e-07, "loss": 1.2427, "reward": 1.412689208984375, "reward_std": 0.1700247824192047, "rewards/accuracy_reward_stage2": 0.6783140897750854, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 363 }, { "completion_length": 8.703125, "epoch": 0.06378132118451026, "grad_norm": 21.473732277517065, "kl": 0.024658203125, "learning_rate": 9.363939022253373e-07, "loss": 0.0099, "reward": 1.6391968727111816, "reward_std": 0.14921677112579346, "rewards/accuracy_reward_stage2": 0.6391969323158264, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 364 }, { "completion_length": 5.953125, "epoch": 0.0639565445943578, "grad_norm": 24.9623619651563, "kl": 0.08837890625, "learning_rate": 9.362186788154897e-07, "loss": 0.0144, "reward": 1.559525489807129, "reward_std": 0.3174114227294922, "rewards/accuracy_reward_stage2": 0.5751504898071289, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 365 }, { "completion_length": 8.703125, "epoch": 0.06413176800420536, "grad_norm": 25.036012178534264, "kl": 0.12451171875, "learning_rate": 9.360434554056421e-07, "loss": 0.0209, "reward": 1.32099449634552, "reward_std": 0.31950968503952026, "rewards/accuracy_reward_stage2": 0.4616195559501648, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 366 }, { "completion_length": 11.296875, "epoch": 0.06430699141405291, "grad_norm": 21.858208443169964, "kl": 0.119140625, "learning_rate": 9.358682319957946e-07, "loss": -0.0174, "reward": 1.502138614654541, "reward_std": 0.2199799120426178, "rewards/accuracy_reward_stage2": 0.5333885550498962, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 367 }, { "completion_length": 11.859375, "epoch": 0.06448221482390047, "grad_norm": 40.60899905178031, "kl": 0.06689453125, "learning_rate": 9.35693008585947e-07, "loss": 0.0267, "reward": 1.6392221450805664, "reward_std": 0.27685898542404175, "rewards/accuracy_reward_stage2": 0.6392222046852112, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 368 }, { "completion_length": 11.3125, "epoch": 0.06465743823374803, "grad_norm": 18.414448641824837, "kl": 0.10302734375, "learning_rate": 9.355177851760994e-07, "loss": 0.0411, "reward": 1.4526405334472656, "reward_std": 0.155037060379982, "rewards/accuracy_reward_stage2": 0.4526405334472656, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 369 }, { "completion_length": 7.71875, "epoch": 0.06483266164359558, "grad_norm": 24.29576979333205, "kl": 0.0703125, "learning_rate": 9.353425617662519e-07, "loss": -0.1017, "reward": 1.6652096509933472, "reward_std": 0.4130101203918457, "rewards/accuracy_reward_stage2": 0.7277096509933472, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 370 }, { "completion_length": 8.078125, "epoch": 0.06500788505344314, "grad_norm": 17.81540268602905, "kl": 0.061767578125, "learning_rate": 9.351673383564044e-07, "loss": 0.0246, "reward": 1.6285955905914307, "reward_std": 0.16550035774707794, "rewards/accuracy_reward_stage2": 0.6285956501960754, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 371 }, { "completion_length": 10.15625, "epoch": 0.0651831084632907, "grad_norm": 45.18264689694991, "kl": 0.265625, "learning_rate": 9.349921149465568e-07, "loss": 0.0619, "reward": 1.517066478729248, "reward_std": 0.13450536131858826, "rewards/accuracy_reward_stage2": 0.5326914191246033, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 372 }, { "completion_length": 9.140625, "epoch": 0.06535833187313825, "grad_norm": 17.577834154040076, "kl": 0.046875, "learning_rate": 9.348168915367093e-07, "loss": 0.0187, "reward": 1.4530048370361328, "reward_std": 0.07536228746175766, "rewards/accuracy_reward_stage2": 0.45300477743148804, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 373 }, { "completion_length": 12.71875, "epoch": 0.06553355528298581, "grad_norm": 23.31006461032496, "kl": 0.1767578125, "learning_rate": 9.346416681268617e-07, "loss": 0.0032, "reward": 1.6315686702728271, "reward_std": 0.14052413403987885, "rewards/accuracy_reward_stage2": 0.7878186106681824, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 374 }, { "completion_length": 9.421875, "epoch": 0.06570877869283337, "grad_norm": 18.109201679127302, "kl": 0.068359375, "learning_rate": 9.344664447170142e-07, "loss": 0.0273, "reward": 1.3579471111297607, "reward_std": 0.136207714676857, "rewards/accuracy_reward_stage2": 0.3579471707344055, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 375 }, { "completion_length": 7.921875, "epoch": 0.06588400210268092, "grad_norm": 13.861200917194214, "kl": 0.031494140625, "learning_rate": 9.342912213071667e-07, "loss": 0.0126, "reward": 1.6228134632110596, "reward_std": 0.1539314091205597, "rewards/accuracy_reward_stage2": 0.6228134632110596, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 376 }, { "completion_length": 7.234375, "epoch": 0.06605922551252848, "grad_norm": 18.323722906868795, "kl": 0.0107421875, "learning_rate": 9.34115997897319e-07, "loss": 0.0043, "reward": 1.5229077339172363, "reward_std": 0.08434540033340454, "rewards/accuracy_reward_stage2": 0.5229077339172363, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 377 }, { "completion_length": 10.015625, "epoch": 0.06623444892237602, "grad_norm": 21.175979440293844, "kl": 0.045166015625, "learning_rate": 9.339407744874714e-07, "loss": 0.0181, "reward": 1.550042748451233, "reward_std": 0.2471158653497696, "rewards/accuracy_reward_stage2": 0.5500428080558777, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 378 }, { "completion_length": 10.890625, "epoch": 0.06640967233222358, "grad_norm": 16.39289635513477, "kl": 0.04541015625, "learning_rate": 9.337655510776239e-07, "loss": 0.0181, "reward": 1.7313203811645508, "reward_std": 0.1756003499031067, "rewards/accuracy_reward_stage2": 0.7313204407691956, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 379 }, { "completion_length": 8.6875, "epoch": 0.06658489574207115, "grad_norm": 19.84227908797742, "kl": 0.0289306640625, "learning_rate": 9.335903276677763e-07, "loss": 0.0116, "reward": 1.377845048904419, "reward_std": 0.18648402392864227, "rewards/accuracy_reward_stage2": 0.5028449892997742, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 380 }, { "completion_length": 7.9375, "epoch": 0.06676011915191869, "grad_norm": 25.930310165133555, "kl": 0.0693359375, "learning_rate": 9.334151042579288e-07, "loss": 0.0278, "reward": 1.5856380462646484, "reward_std": 0.13614915311336517, "rewards/accuracy_reward_stage2": 0.7106380462646484, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 381 }, { "completion_length": 11.734375, "epoch": 0.06693534256176625, "grad_norm": 20.751357067789666, "kl": 0.10302734375, "learning_rate": 9.332398808480812e-07, "loss": -0.0029, "reward": 1.272355318069458, "reward_std": 0.25995975732803345, "rewards/accuracy_reward_stage2": 0.28798040747642517, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 382 }, { "completion_length": 11.015625, "epoch": 0.06711056597161381, "grad_norm": 17.974724434571502, "kl": 0.07470703125, "learning_rate": 9.330646574382337e-07, "loss": -0.0144, "reward": 1.5834121704101562, "reward_std": 0.19877059757709503, "rewards/accuracy_reward_stage2": 0.5990370512008667, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 383 }, { "completion_length": 8.265625, "epoch": 0.06728578938146136, "grad_norm": 32.14407547624172, "kl": 0.068359375, "learning_rate": 9.328894340283862e-07, "loss": -0.0057, "reward": 1.5889458656311035, "reward_std": 0.2500606179237366, "rewards/accuracy_reward_stage2": 0.6045708656311035, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 384 }, { "completion_length": 11.734375, "epoch": 0.06746101279130892, "grad_norm": 15420.533479337024, "kl": 35.75, "learning_rate": 9.327142106185386e-07, "loss": 14.2103, "reward": 1.4057738780975342, "reward_std": 0.1744045913219452, "rewards/accuracy_reward_stage2": 0.5463988184928894, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 385 }, { "completion_length": 9.234375, "epoch": 0.06763623620115647, "grad_norm": 31.137419894167664, "kl": 0.0439453125, "learning_rate": 9.325389872086911e-07, "loss": -0.02, "reward": 1.6810356378555298, "reward_std": 0.24055354297161102, "rewards/accuracy_reward_stage2": 0.8216606378555298, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 386 }, { "completion_length": 11.359375, "epoch": 0.06781145961100403, "grad_norm": 21.441134645043906, "kl": 0.048583984375, "learning_rate": 9.323637637988436e-07, "loss": -0.0248, "reward": 1.5098655223846436, "reward_std": 0.19568142294883728, "rewards/accuracy_reward_stage2": 0.5254905223846436, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 387 }, { "completion_length": 9.515625, "epoch": 0.06798668302085159, "grad_norm": 18.926669594347928, "kl": 0.0296630859375, "learning_rate": 9.321885403889959e-07, "loss": -0.0195, "reward": 1.563733458518982, "reward_std": 0.26144492626190186, "rewards/accuracy_reward_stage2": 0.5793584585189819, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 388 }, { "completion_length": 17.140625, "epoch": 0.06816190643069914, "grad_norm": 21.75447769013038, "kl": 0.0419921875, "learning_rate": 9.320133169791484e-07, "loss": -0.0271, "reward": 1.5931763648986816, "reward_std": 0.1997213512659073, "rewards/accuracy_reward_stage2": 0.6088013648986816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 389 }, { "completion_length": 8.5625, "epoch": 0.0683371298405467, "grad_norm": 20.67454053924187, "kl": 0.0888671875, "learning_rate": 9.318380935693007e-07, "loss": -0.0701, "reward": 1.4614261388778687, "reward_std": 0.2886839509010315, "rewards/accuracy_reward_stage2": 0.5083011984825134, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 390 }, { "completion_length": 10.359375, "epoch": 0.06851235325039426, "grad_norm": 21.8958041612382, "kl": 0.039306640625, "learning_rate": 9.316628701594532e-07, "loss": 0.0157, "reward": 1.316678524017334, "reward_std": 0.14866000413894653, "rewards/accuracy_reward_stage2": 0.44167858362197876, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 391 }, { "completion_length": 12.953125, "epoch": 0.0686875766602418, "grad_norm": 17.742755062439734, "kl": 0.017822265625, "learning_rate": 9.314876467496057e-07, "loss": 0.0071, "reward": 1.4223427772521973, "reward_std": 0.15768727660179138, "rewards/accuracy_reward_stage2": 0.4223426580429077, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 392 }, { "completion_length": 10.84375, "epoch": 0.06886280007008937, "grad_norm": 18.22403562891549, "kl": 0.0625, "learning_rate": 9.313124233397581e-07, "loss": 0.025, "reward": 1.4049084186553955, "reward_std": 0.2654655873775482, "rewards/accuracy_reward_stage2": 0.5299084186553955, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 393 }, { "completion_length": 9.546875, "epoch": 0.06903802347993691, "grad_norm": 20.273210311381384, "kl": 0.0810546875, "learning_rate": 9.311371999299106e-07, "loss": -0.0119, "reward": 1.4684481620788574, "reward_std": 0.23478779196739197, "rewards/accuracy_reward_stage2": 0.48407310247421265, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 394 }, { "completion_length": 9.21875, "epoch": 0.06921324688978447, "grad_norm": 21.644046040667575, "kl": 0.056884765625, "learning_rate": 9.309619765200631e-07, "loss": 0.0227, "reward": 1.632354736328125, "reward_std": 0.1535060554742813, "rewards/accuracy_reward_stage2": 0.6323546767234802, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 395 }, { "completion_length": 11.6875, "epoch": 0.06938847029963204, "grad_norm": 16.892620916232747, "kl": 0.034912109375, "learning_rate": 9.307867531102155e-07, "loss": 0.0139, "reward": 1.739698886871338, "reward_std": 0.21730005741119385, "rewards/accuracy_reward_stage2": 0.7396988868713379, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 396 }, { "completion_length": 18.265625, "epoch": 0.06956369370947958, "grad_norm": 18.328691362249153, "kl": 54.5, "learning_rate": 9.30611529700368e-07, "loss": 21.8455, "reward": 1.3181016445159912, "reward_std": 0.09811335802078247, "rewards/accuracy_reward_stage2": 0.4587266445159912, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 397 }, { "completion_length": 9.15625, "epoch": 0.06973891711932714, "grad_norm": 25.60445091883008, "kl": 0.130859375, "learning_rate": 9.304363062905203e-07, "loss": 0.0082, "reward": 1.54931640625, "reward_std": 0.23682433366775513, "rewards/accuracy_reward_stage2": 0.56494140625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 398 }, { "completion_length": 27.078125, "epoch": 0.0699141405291747, "grad_norm": 20.710243539061945, "kl": 0.06201171875, "learning_rate": 9.302610828806728e-07, "loss": 0.0036, "reward": 1.312551498413086, "reward_std": 0.16797444224357605, "rewards/accuracy_reward_stage2": 0.3281765580177307, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 399 }, { "completion_length": 9.46875, "epoch": 0.07008936393902225, "grad_norm": 19.818484838510113, "kl": 0.04296875, "learning_rate": 9.300858594708253e-07, "loss": 0.0172, "reward": 1.495539903640747, "reward_std": 0.1228901818394661, "rewards/accuracy_reward_stage2": 0.49553996324539185, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 400 }, { "completion_length": 8.84375, "epoch": 0.07026458734886981, "grad_norm": 59.341330787930666, "kl": 0.0869140625, "learning_rate": 9.299106360609777e-07, "loss": 0.0347, "reward": 1.8055976629257202, "reward_std": 0.2616202235221863, "rewards/accuracy_reward_stage2": 0.8055975437164307, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 401 }, { "completion_length": 14.328125, "epoch": 0.07043981075871736, "grad_norm": 18.6793268588412, "kl": 0.07666015625, "learning_rate": 9.297354126511302e-07, "loss": -0.0076, "reward": 1.4062790870666504, "reward_std": 0.1280801147222519, "rewards/accuracy_reward_stage2": 0.4219040870666504, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 402 }, { "completion_length": 9.515625, "epoch": 0.07061503416856492, "grad_norm": 17.36545255201424, "kl": 0.0859375, "learning_rate": 9.295601892412826e-07, "loss": 0.0343, "reward": 1.599717378616333, "reward_std": 0.23323816061019897, "rewards/accuracy_reward_stage2": 0.724717378616333, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 403 }, { "completion_length": 14.421875, "epoch": 0.07079025757841248, "grad_norm": 22.000875240676198, "kl": 0.072265625, "learning_rate": 9.29384965831435e-07, "loss": -0.036, "reward": 1.3782211542129517, "reward_std": 0.2272408902645111, "rewards/accuracy_reward_stage2": 0.40947121381759644, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 404 }, { "completion_length": 16.359375, "epoch": 0.07096548098826003, "grad_norm": 25.677116196214612, "kl": 63.0, "learning_rate": 9.292097424215875e-07, "loss": 25.2293, "reward": 1.4082450866699219, "reward_std": 0.2820996642112732, "rewards/accuracy_reward_stage2": 0.5488699674606323, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 405 }, { "completion_length": 10.546875, "epoch": 0.07114070439810759, "grad_norm": 17.342161817612745, "kl": 0.02880859375, "learning_rate": 9.290345190117399e-07, "loss": -0.0326, "reward": 1.8020833730697632, "reward_std": 0.19606460630893707, "rewards/accuracy_reward_stage2": 0.8177083730697632, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 406 }, { "completion_length": 13.21875, "epoch": 0.07131592780795515, "grad_norm": 22.355008720403898, "kl": 0.02294921875, "learning_rate": 9.288592956018924e-07, "loss": 0.0092, "reward": 1.617870569229126, "reward_std": 0.2922362983226776, "rewards/accuracy_reward_stage2": 0.6178706288337708, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 407 }, { "completion_length": 8.578125, "epoch": 0.0714911512178027, "grad_norm": 23.973990386827367, "kl": 0.064453125, "learning_rate": 9.286840721920448e-07, "loss": 0.0258, "reward": 1.624790072441101, "reward_std": 0.1845066249370575, "rewards/accuracy_reward_stage2": 0.6247899532318115, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 408 }, { "completion_length": 7.5, "epoch": 0.07166637462765026, "grad_norm": 21.002992389593107, "kl": 0.068359375, "learning_rate": 9.285088487821972e-07, "loss": -0.0159, "reward": 1.509068489074707, "reward_std": 0.25469109416007996, "rewards/accuracy_reward_stage2": 0.524693489074707, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 409 }, { "completion_length": 8.265625, "epoch": 0.0718415980374978, "grad_norm": 24.076299669631023, "kl": 0.0703125, "learning_rate": 9.283336253723497e-07, "loss": 0.0281, "reward": 1.7792377471923828, "reward_std": 0.15023738145828247, "rewards/accuracy_reward_stage2": 0.7792376279830933, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 410 }, { "completion_length": 16.296875, "epoch": 0.07201682144734536, "grad_norm": 68.66799831706935, "kl": 55.25, "learning_rate": 9.281584019625022e-07, "loss": 22.1182, "reward": 1.3476190567016602, "reward_std": 0.3558818995952606, "rewards/accuracy_reward_stage2": 0.48824408650398254, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 411 }, { "completion_length": 13.9375, "epoch": 0.07219204485719292, "grad_norm": 25.391220966759903, "kl": 0.15625, "learning_rate": 9.279831785526546e-07, "loss": 0.0061, "reward": 1.4034576416015625, "reward_std": 0.30644452571868896, "rewards/accuracy_reward_stage2": 0.5597076416015625, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 412 }, { "completion_length": 18.84375, "epoch": 0.07236726826704047, "grad_norm": 23223.505120208472, "kl": 342.0, "learning_rate": 9.278079551428071e-07, "loss": 137.4967, "reward": 1.2346117496490479, "reward_std": 0.1464797854423523, "rewards/accuracy_reward_stage2": 0.35961174964904785, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 413 }, { "completion_length": 8.640625, "epoch": 0.07254249167688803, "grad_norm": 15.935590483914812, "kl": 0.047119140625, "learning_rate": 9.276327317329595e-07, "loss": -0.0254, "reward": 1.4736135005950928, "reward_std": 0.18977496027946472, "rewards/accuracy_reward_stage2": 0.6142385601997375, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 414 }, { "completion_length": 7.765625, "epoch": 0.0727177150867356, "grad_norm": 20.58056946058028, "kl": 0.11328125, "learning_rate": 9.27457508323112e-07, "loss": 0.0453, "reward": 1.2243397235870361, "reward_std": 0.20016539096832275, "rewards/accuracy_reward_stage2": 0.47433966398239136, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 415 }, { "completion_length": 8.671875, "epoch": 0.07289293849658314, "grad_norm": 20.599294957099612, "kl": 0.1865234375, "learning_rate": 9.272822849132644e-07, "loss": 0.0746, "reward": 1.811370849609375, "reward_std": 0.10164359956979752, "rewards/accuracy_reward_stage2": 0.936370849609375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 416 }, { "completion_length": 23.25, "epoch": 0.0730681619064307, "grad_norm": 14.076138054590698, "kl": 0.03369140625, "learning_rate": 9.271070615034167e-07, "loss": 0.0135, "reward": 1.4144988059997559, "reward_std": 0.07342597842216492, "rewards/accuracy_reward_stage2": 0.41449886560440063, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 417 }, { "completion_length": 11.59375, "epoch": 0.07324338531627826, "grad_norm": 20.929553261051325, "kl": 0.0830078125, "learning_rate": 9.269318380935692e-07, "loss": 0.0333, "reward": 1.3908796310424805, "reward_std": 0.14465433359146118, "rewards/accuracy_reward_stage2": 0.5158795714378357, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 418 }, { "completion_length": 11.640625, "epoch": 0.07341860872612581, "grad_norm": 15.50809565451471, "kl": 0.01287841796875, "learning_rate": 9.267566146837217e-07, "loss": 0.0052, "reward": 1.3149559497833252, "reward_std": 0.10769060254096985, "rewards/accuracy_reward_stage2": 0.4399559497833252, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 419 }, { "completion_length": 12.578125, "epoch": 0.07359383213597337, "grad_norm": 19.864710108306696, "kl": 0.0791015625, "learning_rate": 9.265813912738741e-07, "loss": 0.0316, "reward": 1.5394093990325928, "reward_std": 0.13540780544281006, "rewards/accuracy_reward_stage2": 0.539409339427948, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 420 }, { "completion_length": 9.375, "epoch": 0.07376905554582092, "grad_norm": 17.71086059432014, "kl": 0.07080078125, "learning_rate": 9.264061678640266e-07, "loss": -0.0489, "reward": 1.6943539381027222, "reward_std": 0.20985379815101624, "rewards/accuracy_reward_stage2": 0.7256039977073669, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 421 }, { "completion_length": 11.59375, "epoch": 0.07394427895566848, "grad_norm": 20.681192592863816, "kl": 0.126953125, "learning_rate": 9.26230944454179e-07, "loss": -0.0402, "reward": 1.3249560594558716, "reward_std": 0.3091086447238922, "rewards/accuracy_reward_stage2": 0.4812060594558716, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 422 }, { "completion_length": 6.34375, "epoch": 0.07411950236551604, "grad_norm": 12.04230880142454, "kl": 0.040283203125, "learning_rate": 9.260557210443315e-07, "loss": -0.0205, "reward": 1.740767002105713, "reward_std": 0.15345188975334167, "rewards/accuracy_reward_stage2": 0.7563920617103577, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 423 }, { "completion_length": 11.921875, "epoch": 0.07429472577536358, "grad_norm": 20.38539880893731, "kl": 0.09130859375, "learning_rate": 9.25880497634484e-07, "loss": 0.0365, "reward": 1.3158583641052246, "reward_std": 0.2701030969619751, "rewards/accuracy_reward_stage2": 0.31585830450057983, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 424 }, { "completion_length": 7.609375, "epoch": 0.07446994918521115, "grad_norm": 21.084952418921368, "kl": 0.0152587890625, "learning_rate": 9.257052742246364e-07, "loss": -0.0294, "reward": 1.4352679252624512, "reward_std": 0.20226189494132996, "rewards/accuracy_reward_stage2": 0.450892835855484, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 425 }, { "completion_length": 5.6875, "epoch": 0.0746451725950587, "grad_norm": 20.96660867060781, "kl": 0.060546875, "learning_rate": 9.255300508147889e-07, "loss": -0.0199, "reward": 1.4383260011672974, "reward_std": 0.16609863936901093, "rewards/accuracy_reward_stage2": 0.46957600116729736, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 426 }, { "completion_length": 12.34375, "epoch": 0.07482039600490625, "grad_norm": 26.736215847509886, "kl": 0.2099609375, "learning_rate": 9.253548274049414e-07, "loss": 0.0399, "reward": 1.5008306503295898, "reward_std": 0.2757319509983063, "rewards/accuracy_reward_stage2": 0.6414556503295898, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 427 }, { "completion_length": 8.578125, "epoch": 0.07499561941475381, "grad_norm": 20.00071764470865, "kl": 0.0732421875, "learning_rate": 9.251796039950936e-07, "loss": -0.0211, "reward": 1.5251744985580444, "reward_std": 0.15365660190582275, "rewards/accuracy_reward_stage2": 0.5564244985580444, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 428 }, { "completion_length": 16.578125, "epoch": 0.07517084282460136, "grad_norm": 332.3153287615184, "kl": 41.0, "learning_rate": 9.250043805852461e-07, "loss": 16.39, "reward": 1.3449559211730957, "reward_std": 0.4346715807914734, "rewards/accuracy_reward_stage2": 0.5949559211730957, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 429 }, { "completion_length": 8.515625, "epoch": 0.07534606623444892, "grad_norm": 26.579188374251853, "kl": 0.04248046875, "learning_rate": 9.248291571753985e-07, "loss": -0.0163, "reward": 1.3712437152862549, "reward_std": 0.38221314549446106, "rewards/accuracy_reward_stage2": 0.3868686556816101, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 430 }, { "completion_length": 12.109375, "epoch": 0.07552128964429648, "grad_norm": 19.678076223219964, "kl": 0.04736328125, "learning_rate": 9.24653933765551e-07, "loss": -0.0485, "reward": 1.583035945892334, "reward_std": 0.16018790006637573, "rewards/accuracy_reward_stage2": 0.614285945892334, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 431 }, { "completion_length": 12.890625, "epoch": 0.07569651305414403, "grad_norm": 20.088676161617673, "kl": 75.5, "learning_rate": 9.244787103557035e-07, "loss": 30.3652, "reward": 1.371154546737671, "reward_std": 0.17230086028575897, "rewards/accuracy_reward_stage2": 0.5117795467376709, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 432 }, { "completion_length": 14.734375, "epoch": 0.07587173646399159, "grad_norm": 18.981420019902902, "kl": 0.048583984375, "learning_rate": 9.243034869458559e-07, "loss": 0.0193, "reward": 1.3368223905563354, "reward_std": 0.21399948000907898, "rewards/accuracy_reward_stage2": 0.46182242035865784, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 433 }, { "completion_length": 20.46875, "epoch": 0.07604695987383915, "grad_norm": 19.334220522199253, "kl": 53.0, "learning_rate": 9.241282635360084e-07, "loss": 21.2419, "reward": 1.4832494258880615, "reward_std": 0.12704303860664368, "rewards/accuracy_reward_stage2": 0.6082494854927063, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 434 }, { "completion_length": 22.46875, "epoch": 0.0762221832836867, "grad_norm": 23.000925090159036, "kl": 59.0, "learning_rate": 9.239530401261609e-07, "loss": 23.7356, "reward": 1.5550317764282227, "reward_std": 0.22015462815761566, "rewards/accuracy_reward_stage2": 0.6800317168235779, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 435 }, { "completion_length": 9.046875, "epoch": 0.07639740669353426, "grad_norm": 28.476420892876988, "kl": 0.1484375, "learning_rate": 9.237778167163133e-07, "loss": 0.0256, "reward": 1.6102795600891113, "reward_std": 0.20625394582748413, "rewards/accuracy_reward_stage2": 0.6259044408798218, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 436 }, { "completion_length": 12.53125, "epoch": 0.0765726301033818, "grad_norm": 24.19592301194606, "kl": 0.0419921875, "learning_rate": 9.236025933064658e-07, "loss": -0.0274, "reward": 1.4183125495910645, "reward_std": 0.24271854758262634, "rewards/accuracy_reward_stage2": 0.5589376091957092, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 437 }, { "completion_length": 11.46875, "epoch": 0.07674785351322937, "grad_norm": 35.20563518798691, "kl": 0.17578125, "learning_rate": 9.234273698966181e-07, "loss": 0.015, "reward": 1.4013981819152832, "reward_std": 0.27046674489974976, "rewards/accuracy_reward_stage2": 0.4326481819152832, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 438 }, { "completion_length": 10.640625, "epoch": 0.07692307692307693, "grad_norm": 21.04907556045925, "kl": 0.0546875, "learning_rate": 9.232521464867706e-07, "loss": 0.009, "reward": 1.5379629135131836, "reward_std": 0.2769656777381897, "rewards/accuracy_reward_stage2": 0.6785879135131836, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 439 }, { "completion_length": 9.21875, "epoch": 0.07709830033292447, "grad_norm": 18.911621023379887, "kl": 0.024169921875, "learning_rate": 9.230769230769231e-07, "loss": 0.0097, "reward": 1.5322370529174805, "reward_std": 0.18402306735515594, "rewards/accuracy_reward_stage2": 0.5322371125221252, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 440 }, { "completion_length": 10.4375, "epoch": 0.07727352374277204, "grad_norm": 15.798461479357341, "kl": 0.0849609375, "learning_rate": 9.229016996670754e-07, "loss": 0.0341, "reward": 1.6544257402420044, "reward_std": 0.16453927755355835, "rewards/accuracy_reward_stage2": 0.6544257998466492, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 441 }, { "completion_length": 10.0, "epoch": 0.0774487471526196, "grad_norm": 14.276807876532036, "kl": 0.0224609375, "learning_rate": 9.227264762572279e-07, "loss": -0.0352, "reward": 1.59375, "reward_std": 0.1778542846441269, "rewards/accuracy_reward_stage2": 0.609375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 442 }, { "completion_length": 7.03125, "epoch": 0.07762397056246714, "grad_norm": 19.745379817242817, "kl": 0.022705078125, "learning_rate": 9.225512528473803e-07, "loss": 0.0091, "reward": 1.530820608139038, "reward_std": 0.22802403569221497, "rewards/accuracy_reward_stage2": 0.5308204889297485, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 443 }, { "completion_length": 12.546875, "epoch": 0.0777991939723147, "grad_norm": 19.52967803023215, "kl": 0.01397705078125, "learning_rate": 9.223760294375328e-07, "loss": 0.0056, "reward": 1.6203205585479736, "reward_std": 0.10274563729763031, "rewards/accuracy_reward_stage2": 0.6203205585479736, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 444 }, { "completion_length": 15.015625, "epoch": 0.07797441738216225, "grad_norm": 30.33681041142242, "kl": 0.1533203125, "learning_rate": 9.222008060276853e-07, "loss": 0.0174, "reward": 1.2619487047195435, "reward_std": 0.3217325508594513, "rewards/accuracy_reward_stage2": 0.40257370471954346, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 445 }, { "completion_length": 9.765625, "epoch": 0.07814964079200981, "grad_norm": 18.15244970923255, "kl": 0.049072265625, "learning_rate": 9.220255826178377e-07, "loss": 0.0197, "reward": 1.5675026178359985, "reward_std": 0.2025008201599121, "rewards/accuracy_reward_stage2": 0.5675026774406433, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 446 }, { "completion_length": 7.734375, "epoch": 0.07832486420185737, "grad_norm": 22.894618014689563, "kl": 0.08642578125, "learning_rate": 9.218503592079901e-07, "loss": 0.0059, "reward": 1.6023638248443604, "reward_std": 0.27222031354904175, "rewards/accuracy_reward_stage2": 0.6179888248443604, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 447 }, { "completion_length": 11.765625, "epoch": 0.07850008761170492, "grad_norm": 48.019137053035074, "kl": 84.5, "learning_rate": 9.216751357981426e-07, "loss": 33.936, "reward": 1.5308412313461304, "reward_std": 0.21382224559783936, "rewards/accuracy_reward_stage2": 0.6558412313461304, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 448 }, { "completion_length": 10.296875, "epoch": 0.07867531102155248, "grad_norm": 16.88177879480028, "kl": 0.0159912109375, "learning_rate": 9.21499912388295e-07, "loss": 0.0064, "reward": 1.6435894966125488, "reward_std": 0.20435433089733124, "rewards/accuracy_reward_stage2": 0.6435894966125488, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 449 }, { "completion_length": 9.1875, "epoch": 0.07885053443140004, "grad_norm": 19.188572270884954, "kl": 0.06396484375, "learning_rate": 9.213246889784475e-07, "loss": 0.0256, "reward": 1.636287808418274, "reward_std": 0.18115490674972534, "rewards/accuracy_reward_stage2": 0.6362878084182739, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 450 }, { "completion_length": 8.5, "epoch": 0.07902575784124759, "grad_norm": 20.17073402441959, "kl": 0.06884765625, "learning_rate": 9.211494655685999e-07, "loss": 0.0275, "reward": 1.7681330442428589, "reward_std": 0.18841080367565155, "rewards/accuracy_reward_stage2": 0.8931329846382141, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 451 }, { "completion_length": 10.25, "epoch": 0.07920098125109515, "grad_norm": 17.815446994203846, "kl": 0.0218505859375, "learning_rate": 9.209742421587524e-07, "loss": 0.0088, "reward": 1.5982638597488403, "reward_std": 0.18450896441936493, "rewards/accuracy_reward_stage2": 0.5982638597488403, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 452 }, { "completion_length": 9.796875, "epoch": 0.0793762046609427, "grad_norm": 18.242203342119833, "kl": 0.01806640625, "learning_rate": 9.207990187489049e-07, "loss": -0.0232, "reward": 1.6876232624053955, "reward_std": 0.14777256548404694, "rewards/accuracy_reward_stage2": 0.703248143196106, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 453 }, { "completion_length": 8.546875, "epoch": 0.07955142807079026, "grad_norm": 18.606917192384177, "kl": 81.0, "learning_rate": 9.206237953390572e-07, "loss": 32.4161, "reward": 1.496006965637207, "reward_std": 0.09968242049217224, "rewards/accuracy_reward_stage2": 0.621006965637207, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 454 }, { "completion_length": 12.28125, "epoch": 0.07972665148063782, "grad_norm": 21.9000430991336, "kl": 0.06884765625, "learning_rate": 9.204485719292097e-07, "loss": 0.0274, "reward": 1.250139594078064, "reward_std": 0.12326813489198685, "rewards/accuracy_reward_stage2": 0.25013962388038635, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 455 }, { "completion_length": 14.109375, "epoch": 0.07990187489048536, "grad_norm": 19.960024521289824, "kl": 0.0849609375, "learning_rate": 9.202733485193622e-07, "loss": -0.0495, "reward": 1.5685629844665527, "reward_std": 0.1990964710712433, "rewards/accuracy_reward_stage2": 0.5998129844665527, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 456 }, { "completion_length": 9.921875, "epoch": 0.08007709830033292, "grad_norm": 15.223912178548778, "kl": 0.0537109375, "learning_rate": 9.200981251095145e-07, "loss": 0.0215, "reward": 1.7704863548278809, "reward_std": 0.1370040774345398, "rewards/accuracy_reward_stage2": 0.7704862952232361, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 457 }, { "completion_length": 11.484375, "epoch": 0.08025232171018049, "grad_norm": 21.93213543217981, "kl": 0.30078125, "learning_rate": 9.19922901699667e-07, "loss": 0.0903, "reward": 1.4456546306610107, "reward_std": 0.13230293989181519, "rewards/accuracy_reward_stage2": 0.586279571056366, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 458 }, { "completion_length": 9.484375, "epoch": 0.08042754512002803, "grad_norm": 14.798954968368728, "kl": 0.08056640625, "learning_rate": 9.197476782898194e-07, "loss": 0.0323, "reward": 1.6742162704467773, "reward_std": 0.23457954823970795, "rewards/accuracy_reward_stage2": 0.6742162704467773, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 459 }, { "completion_length": 10.1875, "epoch": 0.08060276852987559, "grad_norm": 19.995984059508032, "kl": 0.033203125, "learning_rate": 9.195724548799719e-07, "loss": -0.031, "reward": 1.419159173965454, "reward_std": 0.2401229292154312, "rewards/accuracy_reward_stage2": 0.5597842335700989, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 460 }, { "completion_length": 8.828125, "epoch": 0.08077799193972315, "grad_norm": 27.59647788465417, "kl": 0.068359375, "learning_rate": 9.193972314701244e-07, "loss": 0.0049, "reward": 1.57512366771698, "reward_std": 0.2905910611152649, "rewards/accuracy_reward_stage2": 0.60637366771698, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 461 }, { "completion_length": 12.9375, "epoch": 0.0809532153495707, "grad_norm": 23.438390815015925, "kl": 0.1201171875, "learning_rate": 9.192220080602768e-07, "loss": 0.048, "reward": 1.3983908891677856, "reward_std": 0.19916199147701263, "rewards/accuracy_reward_stage2": 0.39839091897010803, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 462 }, { "completion_length": 10.203125, "epoch": 0.08112843875941826, "grad_norm": 25.905344729385636, "kl": 0.07763671875, "learning_rate": 9.190467846504293e-07, "loss": -0.0377, "reward": 1.351801872253418, "reward_std": 0.1887568235397339, "rewards/accuracy_reward_stage2": 0.5080518126487732, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 463 }, { "completion_length": 8.5625, "epoch": 0.08130366216926581, "grad_norm": 21.852140692838674, "kl": 0.057861328125, "learning_rate": 9.188715612405818e-07, "loss": 0.0231, "reward": 1.613126277923584, "reward_std": 0.19007891416549683, "rewards/accuracy_reward_stage2": 0.613126277923584, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 464 }, { "completion_length": 9.171875, "epoch": 0.08147888557911337, "grad_norm": 16.677989585298285, "kl": 0.12158203125, "learning_rate": 9.186963378307342e-07, "loss": 0.0488, "reward": 1.3942195177078247, "reward_std": 0.13556255400180817, "rewards/accuracy_reward_stage2": 0.5348445177078247, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 465 }, { "completion_length": 13.1875, "epoch": 0.08165410898896093, "grad_norm": 7.684489611981729, "kl": 0.03076171875, "learning_rate": 9.185211144208866e-07, "loss": 0.0123, "reward": 1.4885270595550537, "reward_std": 0.032450269907712936, "rewards/accuracy_reward_stage2": 0.6135270595550537, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 466 }, { "completion_length": 25.828125, "epoch": 0.08182933239880848, "grad_norm": 19.304238267341717, "kl": 0.03662109375, "learning_rate": 9.183458910110389e-07, "loss": 0.0147, "reward": 1.5734906196594238, "reward_std": 0.24000124633312225, "rewards/accuracy_reward_stage2": 0.5734906792640686, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 467 }, { "completion_length": 22.0, "epoch": 0.08200455580865604, "grad_norm": 17.66367688216898, "kl": 41.25, "learning_rate": 9.181706676011914e-07, "loss": 16.6104, "reward": 1.2004015445709229, "reward_std": 0.14673781394958496, "rewards/accuracy_reward_stage2": 0.32540154457092285, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 468 }, { "completion_length": 10.28125, "epoch": 0.0821797792185036, "grad_norm": 16.750027229018684, "kl": 0.01092529296875, "learning_rate": 9.179954441913439e-07, "loss": 0.0044, "reward": 1.825685977935791, "reward_std": 0.18837112188339233, "rewards/accuracy_reward_stage2": 0.825685977935791, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 469 }, { "completion_length": 8.96875, "epoch": 0.08235500262835115, "grad_norm": 19.670475644991413, "kl": 0.062255859375, "learning_rate": 9.178202207814963e-07, "loss": -0.0194, "reward": 1.5610289573669434, "reward_std": 0.2965965270996094, "rewards/accuracy_reward_stage2": 0.5766539573669434, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 470 }, { "completion_length": 14.125, "epoch": 0.0825302260381987, "grad_norm": 22.687628136347087, "kl": 0.1142578125, "learning_rate": 9.176449973716488e-07, "loss": 0.0458, "reward": 1.6705833673477173, "reward_std": 0.21611103415489197, "rewards/accuracy_reward_stage2": 0.6705833673477173, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 471 }, { "completion_length": 9.265625, "epoch": 0.08270544944804625, "grad_norm": 18.9690436720185, "kl": 0.058837890625, "learning_rate": 9.174697739618013e-07, "loss": 0.0236, "reward": 1.789116621017456, "reward_std": 0.07733018696308136, "rewards/accuracy_reward_stage2": 0.789116621017456, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 472 }, { "completion_length": 9.28125, "epoch": 0.08288067285789381, "grad_norm": 18.03798220139904, "kl": 0.062255859375, "learning_rate": 9.172945505519537e-07, "loss": 0.0249, "reward": 1.5140492916107178, "reward_std": 0.24741026759147644, "rewards/accuracy_reward_stage2": 0.514049232006073, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 473 }, { "completion_length": 26.09375, "epoch": 0.08305589626774137, "grad_norm": 5082.5494113753975, "kl": 73.0, "learning_rate": 9.171193271421062e-07, "loss": 29.2608, "reward": 1.3094170093536377, "reward_std": 0.12926608324050903, "rewards/accuracy_reward_stage2": 0.5594170093536377, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 474 }, { "completion_length": 10.515625, "epoch": 0.08323111967758892, "grad_norm": 22.58619197290474, "kl": 0.0986328125, "learning_rate": 9.169441037322586e-07, "loss": 0.0395, "reward": 1.661272406578064, "reward_std": 0.3087402582168579, "rewards/accuracy_reward_stage2": 0.661272406578064, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 475 }, { "completion_length": 9.6875, "epoch": 0.08340634308743648, "grad_norm": 24.06168327767966, "kl": 0.087890625, "learning_rate": 9.167688803224111e-07, "loss": 0.0352, "reward": 1.495906949043274, "reward_std": 0.1737568974494934, "rewards/accuracy_reward_stage2": 0.4959069490432739, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 476 }, { "completion_length": 15.265625, "epoch": 0.08358156649728404, "grad_norm": 16.528081126500286, "kl": 0.5078125, "learning_rate": 9.165936569125636e-07, "loss": 0.1584, "reward": 1.4166667461395264, "reward_std": 0.1257408708333969, "rewards/accuracy_reward_stage2": 0.5572916269302368, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 477 }, { "completion_length": 11.921875, "epoch": 0.08375678990713159, "grad_norm": 19.16101481092754, "kl": 0.03125, "learning_rate": 9.164184335027159e-07, "loss": -0.0205, "reward": 1.5078704357147217, "reward_std": 0.26106423139572144, "rewards/accuracy_reward_stage2": 0.5234953761100769, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 478 }, { "completion_length": 10.90625, "epoch": 0.08393201331697915, "grad_norm": 20.65729096807624, "kl": 0.0291748046875, "learning_rate": 9.162432100928683e-07, "loss": -0.032, "reward": 1.5712745189666748, "reward_std": 0.1589372754096985, "rewards/accuracy_reward_stage2": 0.5868995785713196, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 479 }, { "completion_length": 14.03125, "epoch": 0.0841072367268267, "grad_norm": 23.009481020265948, "kl": 0.08984375, "learning_rate": 9.160679866830208e-07, "loss": -0.063, "reward": 1.1516456604003906, "reward_std": 0.4139564633369446, "rewards/accuracy_reward_stage2": 0.323520690202713, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 480 }, { "completion_length": 10.921875, "epoch": 0.08428246013667426, "grad_norm": 22.887241467709128, "kl": 0.041015625, "learning_rate": 9.158927632731732e-07, "loss": 0.0164, "reward": 1.658280611038208, "reward_std": 0.19431188702583313, "rewards/accuracy_reward_stage2": 0.783280611038208, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 481 }, { "completion_length": 9.046875, "epoch": 0.08445768354652182, "grad_norm": 19.33248398430543, "kl": 0.078125, "learning_rate": 9.157175398633257e-07, "loss": -0.0129, "reward": 1.4629442691802979, "reward_std": 0.2794113755226135, "rewards/accuracy_reward_stage2": 0.4785691797733307, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 482 }, { "completion_length": 7.6875, "epoch": 0.08463290695636937, "grad_norm": 24.134444137143362, "kl": 0.033447265625, "learning_rate": 9.155423164534781e-07, "loss": 0.0134, "reward": 1.601351022720337, "reward_std": 0.28532567620277405, "rewards/accuracy_reward_stage2": 0.6013510227203369, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 483 }, { "completion_length": 6.171875, "epoch": 0.08480813036621693, "grad_norm": 16.767962021551625, "kl": 0.018798828125, "learning_rate": 9.153670930436306e-07, "loss": -0.0463, "reward": 1.4350864887237549, "reward_std": 0.16850194334983826, "rewards/accuracy_reward_stage2": 0.46633651852607727, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 484 }, { "completion_length": 14.046875, "epoch": 0.08498335377606449, "grad_norm": 22.146728503845583, "kl": 0.58203125, "learning_rate": 9.151918696337831e-07, "loss": 0.2322, "reward": 1.4404876232147217, "reward_std": 0.26357513666152954, "rewards/accuracy_reward_stage2": 0.5654876232147217, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 485 }, { "completion_length": 12.8125, "epoch": 0.08515857718591203, "grad_norm": 24.475383994637514, "kl": 0.09619140625, "learning_rate": 9.150166462239355e-07, "loss": 0.0384, "reward": 1.639747977256775, "reward_std": 0.2860134541988373, "rewards/accuracy_reward_stage2": 0.6397479772567749, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 486 }, { "completion_length": 10.453125, "epoch": 0.0853338005957596, "grad_norm": 17.108695939756277, "kl": 0.068359375, "learning_rate": 9.148414228140879e-07, "loss": -0.0169, "reward": 1.7072330713272095, "reward_std": 0.1748734563589096, "rewards/accuracy_reward_stage2": 0.7228580713272095, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 487 }, { "completion_length": 8.265625, "epoch": 0.08550902400560714, "grad_norm": 14.48557849493761, "kl": 0.0252685546875, "learning_rate": 9.146661994042404e-07, "loss": -0.0341, "reward": 1.7925978899002075, "reward_std": 0.11917868256568909, "rewards/accuracy_reward_stage2": 0.8082229495048523, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 488 }, { "completion_length": 11.703125, "epoch": 0.0856842474154547, "grad_norm": 19.639142356206534, "kl": 0.095703125, "learning_rate": 9.144909759943928e-07, "loss": 0.0385, "reward": 1.58922278881073, "reward_std": 0.2474866509437561, "rewards/accuracy_reward_stage2": 0.71422278881073, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 489 }, { "completion_length": 20.53125, "epoch": 0.08585947082530226, "grad_norm": 30.107818801594092, "kl": 0.3515625, "learning_rate": 9.143157525845453e-07, "loss": 0.1409, "reward": 1.3591396808624268, "reward_std": 0.15374769270420074, "rewards/accuracy_reward_stage2": 0.48413965106010437, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 490 }, { "completion_length": 14.578125, "epoch": 0.08603469423514981, "grad_norm": 47.30563364220158, "kl": 0.181640625, "learning_rate": 9.141405291746977e-07, "loss": 0.0283, "reward": 1.171875, "reward_std": 0.19939783215522766, "rewards/accuracy_reward_stage2": 0.328125, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 491 }, { "completion_length": 10.78125, "epoch": 0.08620991764499737, "grad_norm": 13.510331003385526, "kl": 0.06689453125, "learning_rate": 9.139653057648501e-07, "loss": -0.0175, "reward": 1.61344313621521, "reward_std": 0.17493629455566406, "rewards/accuracy_reward_stage2": 0.6290681958198547, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 492 }, { "completion_length": 12.015625, "epoch": 0.08638514105484493, "grad_norm": 15.117382465909863, "kl": 0.14453125, "learning_rate": 9.137900823550026e-07, "loss": 0.0139, "reward": 1.3842592239379883, "reward_std": 0.22201895713806152, "rewards/accuracy_reward_stage2": 0.5248842239379883, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 493 }, { "completion_length": 6.65625, "epoch": 0.08656036446469248, "grad_norm": 17.29696625736238, "kl": 0.02783203125, "learning_rate": 9.13614858945155e-07, "loss": -0.033, "reward": 1.8380773067474365, "reward_std": 0.08552451431751251, "rewards/accuracy_reward_stage2": 0.8537023067474365, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 494 }, { "completion_length": 10.53125, "epoch": 0.08673558787454004, "grad_norm": 18.126590875549933, "kl": 0.025390625, "learning_rate": 9.134396355353075e-07, "loss": -0.034, "reward": 1.5160049200057983, "reward_std": 0.17203427851200104, "rewards/accuracy_reward_stage2": 0.5316299200057983, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 495 }, { "completion_length": 10.3125, "epoch": 0.08691081128438759, "grad_norm": 22.92055570142462, "kl": 0.061767578125, "learning_rate": 9.1326441212546e-07, "loss": 0.0247, "reward": 1.6086578369140625, "reward_std": 0.18672674894332886, "rewards/accuracy_reward_stage2": 0.7336578369140625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 496 }, { "completion_length": 19.0625, "epoch": 0.08708603469423515, "grad_norm": 13.63320237888586, "kl": 0.05615234375, "learning_rate": 9.130891887156123e-07, "loss": -0.0648, "reward": 1.4610896110534668, "reward_std": 0.17531195282936096, "rewards/accuracy_reward_stage2": 0.49233970046043396, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 497 }, { "completion_length": 8.671875, "epoch": 0.08726125810408271, "grad_norm": 20.53693666680177, "kl": 0.10595703125, "learning_rate": 9.129139653057648e-07, "loss": 0.0422, "reward": 1.6972384452819824, "reward_std": 0.20393230020999908, "rewards/accuracy_reward_stage2": 0.6972383856773376, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 498 }, { "completion_length": 16.515625, "epoch": 0.08743648151393026, "grad_norm": 18.100392259301536, "kl": 0.03662109375, "learning_rate": 9.127387418959172e-07, "loss": 0.0051, "reward": 1.575078010559082, "reward_std": 0.12993305921554565, "rewards/accuracy_reward_stage2": 0.590703010559082, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 499 }, { "completion_length": 14.6875, "epoch": 0.08761170492377782, "grad_norm": 21.14943372944215, "kl": 0.024658203125, "learning_rate": 9.125635184860697e-07, "loss": 0.0099, "reward": 1.285620927810669, "reward_std": 0.15270642936229706, "rewards/accuracy_reward_stage2": 0.41062092781066895, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 500 }, { "completion_length": 7.5, "epoch": 0.08778692833362538, "grad_norm": 17.769614304448513, "kl": 0.0198974609375, "learning_rate": 9.123882950762222e-07, "loss": 0.008, "reward": 1.6949687004089355, "reward_std": 0.09590702503919601, "rewards/accuracy_reward_stage2": 0.694968581199646, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 501 }, { "completion_length": 11.265625, "epoch": 0.08796215174347292, "grad_norm": 24.687050847645843, "kl": 0.032958984375, "learning_rate": 9.122130716663746e-07, "loss": -0.0555, "reward": 1.694044589996338, "reward_std": 0.24584685266017914, "rewards/accuracy_reward_stage2": 0.7252947092056274, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 502 }, { "completion_length": 12.28125, "epoch": 0.08813737515332049, "grad_norm": 23.468740520971043, "kl": 0.384765625, "learning_rate": 9.120378482565271e-07, "loss": 0.0656, "reward": 1.4185097217559814, "reward_std": 0.2678026556968689, "rewards/accuracy_reward_stage2": 0.5747597217559814, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 503 }, { "completion_length": 10.421875, "epoch": 0.08831259856316805, "grad_norm": 30.292195286556733, "kl": 0.056396484375, "learning_rate": 9.118626248466796e-07, "loss": 0.0225, "reward": 1.5536742210388184, "reward_std": 0.19536569714546204, "rewards/accuracy_reward_stage2": 0.6786742210388184, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 504 }, { "completion_length": 10.765625, "epoch": 0.08848782197301559, "grad_norm": 22.056329493793715, "kl": 0.55859375, "learning_rate": 9.116874014368319e-07, "loss": 0.2231, "reward": 1.4932494163513184, "reward_std": 0.165345698595047, "rewards/accuracy_reward_stage2": 0.7432493567466736, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 505 }, { "completion_length": 7.78125, "epoch": 0.08866304538286315, "grad_norm": 23.179529678402307, "kl": 0.1494140625, "learning_rate": 9.115121780269844e-07, "loss": 0.0599, "reward": 1.523409128189087, "reward_std": 0.24513620138168335, "rewards/accuracy_reward_stage2": 0.5234091281890869, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 506 }, { "completion_length": 8.671875, "epoch": 0.0888382687927107, "grad_norm": 40.15466178291841, "kl": 0.08447265625, "learning_rate": 9.113369546171367e-07, "loss": 0.0338, "reward": 1.4454511404037476, "reward_std": 0.16291175782680511, "rewards/accuracy_reward_stage2": 0.44545111060142517, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 507 }, { "completion_length": 7.96875, "epoch": 0.08901349220255826, "grad_norm": 16.655840698240784, "kl": 0.052978515625, "learning_rate": 9.111617312072892e-07, "loss": 0.0212, "reward": 1.5639185905456543, "reward_std": 0.10686256736516953, "rewards/accuracy_reward_stage2": 0.5639185905456543, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 508 }, { "completion_length": 12.46875, "epoch": 0.08918871561240582, "grad_norm": 15.858290783911817, "kl": 0.330078125, "learning_rate": 9.109865077974417e-07, "loss": 0.0441, "reward": 1.3085144758224487, "reward_std": 0.2654265761375427, "rewards/accuracy_reward_stage2": 0.46476447582244873, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 509 }, { "completion_length": 8.0, "epoch": 0.08936393902225337, "grad_norm": 16.55163129247764, "kl": 0.042724609375, "learning_rate": 9.108112843875941e-07, "loss": 0.017, "reward": 1.5995845794677734, "reward_std": 0.18738877773284912, "rewards/accuracy_reward_stage2": 0.5995846390724182, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 510 }, { "completion_length": 9.8125, "epoch": 0.08953916243210093, "grad_norm": 17.536117228564414, "kl": 0.087890625, "learning_rate": 9.106360609777466e-07, "loss": 0.0018, "reward": 1.3366703987121582, "reward_std": 0.14628343284130096, "rewards/accuracy_reward_stage2": 0.3522953391075134, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 511 }, { "completion_length": 16.34375, "epoch": 0.08971438584194849, "grad_norm": 26.290486758726278, "kl": 0.1953125, "learning_rate": 9.10460837567899e-07, "loss": 0.034, "reward": 1.3810583353042603, "reward_std": 0.2410065084695816, "rewards/accuracy_reward_stage2": 0.39668336510658264, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 512 }, { "completion_length": 8.40625, "epoch": 0.08988960925179604, "grad_norm": 15.909152671710622, "kl": 0.06201171875, "learning_rate": 9.102856141580515e-07, "loss": 0.0247, "reward": 1.629618763923645, "reward_std": 0.18738989531993866, "rewards/accuracy_reward_stage2": 0.629618763923645, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 513 }, { "completion_length": 11.5, "epoch": 0.0900648326616436, "grad_norm": 15.863809367993417, "kl": 0.0380859375, "learning_rate": 9.10110390748204e-07, "loss": 0.0152, "reward": 1.478208065032959, "reward_std": 0.10647543519735336, "rewards/accuracy_reward_stage2": 0.4782080352306366, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 514 }, { "completion_length": 7.90625, "epoch": 0.09024005607149115, "grad_norm": 21.755133696217552, "kl": 0.1044921875, "learning_rate": 9.099351673383564e-07, "loss": 0.0417, "reward": 1.5345044136047363, "reward_std": 0.3030553460121155, "rewards/accuracy_reward_stage2": 0.5345043540000916, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 515 }, { "completion_length": 18.546875, "epoch": 0.0904152794813387, "grad_norm": 18.702052472544235, "kl": 0.5078125, "learning_rate": 9.097599439285089e-07, "loss": 0.1589, "reward": 1.3089147806167603, "reward_std": 0.16796602308750153, "rewards/accuracy_reward_stage2": 0.44953978061676025, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 516 }, { "completion_length": 8.015625, "epoch": 0.09059050289118627, "grad_norm": 20.65997615387692, "kl": 0.04296875, "learning_rate": 9.095847205186612e-07, "loss": 0.0172, "reward": 1.6913013458251953, "reward_std": 0.2100488543510437, "rewards/accuracy_reward_stage2": 0.6913013458251953, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 517 }, { "completion_length": 8.359375, "epoch": 0.09076572630103381, "grad_norm": 20.015545431572207, "kl": 0.025146484375, "learning_rate": 9.094094971088136e-07, "loss": 0.0101, "reward": 1.6237950325012207, "reward_std": 0.2327117621898651, "rewards/accuracy_reward_stage2": 0.6237950325012207, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 518 }, { "completion_length": 7.8125, "epoch": 0.09094094971088137, "grad_norm": 17.673422161995706, "kl": 0.0299072265625, "learning_rate": 9.092342736989661e-07, "loss": 0.012, "reward": 1.6694855690002441, "reward_std": 0.23738789558410645, "rewards/accuracy_reward_stage2": 0.6694855690002441, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 519 }, { "completion_length": 7.796875, "epoch": 0.09111617312072894, "grad_norm": 20.678175792830373, "kl": 0.076171875, "learning_rate": 9.090590502891185e-07, "loss": -0.0581, "reward": 1.4174991846084595, "reward_std": 0.18197058141231537, "rewards/accuracy_reward_stage2": 0.4487491548061371, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 520 }, { "completion_length": 9.5625, "epoch": 0.09129139653057648, "grad_norm": 16.014094198427838, "kl": 0.1435546875, "learning_rate": 9.08883826879271e-07, "loss": 0.0575, "reward": 1.4942562580108643, "reward_std": 0.18194083869457245, "rewards/accuracy_reward_stage2": 0.6192562580108643, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 521 }, { "completion_length": 14.890625, "epoch": 0.09146661994042404, "grad_norm": 24.857365439555846, "kl": 0.42578125, "learning_rate": 9.087086034694235e-07, "loss": 0.126, "reward": 1.3850722312927246, "reward_std": 0.21721617877483368, "rewards/accuracy_reward_stage2": 0.5256972908973694, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 522 }, { "completion_length": 22.875, "epoch": 0.09164184335027159, "grad_norm": 22.317024704003305, "kl": 0.1328125, "learning_rate": 9.085333800595759e-07, "loss": 0.0091, "reward": 1.3440500497817993, "reward_std": 0.20350778102874756, "rewards/accuracy_reward_stage2": 0.3596750795841217, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 523 }, { "completion_length": 7.859375, "epoch": 0.09181706676011915, "grad_norm": 12.267929545023042, "kl": 0.04931640625, "learning_rate": 9.083581566497284e-07, "loss": 0.0198, "reward": 1.3105697631835938, "reward_std": 0.038502879440784454, "rewards/accuracy_reward_stage2": 0.31056979298591614, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 524 }, { "completion_length": 8.078125, "epoch": 0.09199229016996671, "grad_norm": 16.011879952345478, "kl": 0.06787109375, "learning_rate": 9.081829332398809e-07, "loss": -0.0484, "reward": 1.4427083730697632, "reward_std": 0.303839772939682, "rewards/accuracy_reward_stage2": 0.4739583432674408, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 525 }, { "completion_length": 10.6875, "epoch": 0.09216751357981426, "grad_norm": 21.68307168867549, "kl": 0.06787109375, "learning_rate": 9.080077098300333e-07, "loss": 0.027, "reward": 1.6258351802825928, "reward_std": 0.27005815505981445, "rewards/accuracy_reward_stage2": 0.6258351802825928, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 526 }, { "completion_length": 8.5, "epoch": 0.09234273698966182, "grad_norm": 11.823473795220892, "kl": 0.0419921875, "learning_rate": 9.078324864201857e-07, "loss": -0.0215, "reward": 1.3020833730697632, "reward_std": 0.1627970188856125, "rewards/accuracy_reward_stage2": 0.3177083134651184, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 527 }, { "completion_length": 10.71875, "epoch": 0.09251796039950938, "grad_norm": 23.896112782111985, "kl": 0.0252685546875, "learning_rate": 9.076572630103381e-07, "loss": 0.0101, "reward": 1.6162935495376587, "reward_std": 0.22328680753707886, "rewards/accuracy_reward_stage2": 0.6162935495376587, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 528 }, { "completion_length": 11.203125, "epoch": 0.09269318380935693, "grad_norm": 76.31274697932342, "kl": 0.291015625, "learning_rate": 9.074820396004906e-07, "loss": 0.0999, "reward": 1.5997403860092163, "reward_std": 0.28809764981269836, "rewards/accuracy_reward_stage2": 0.6153653264045715, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 529 }, { "completion_length": 10.890625, "epoch": 0.09286840721920449, "grad_norm": 1213.4011074759428, "kl": 3.171875, "learning_rate": 9.07306816190643e-07, "loss": 1.2073, "reward": 1.527639389038086, "reward_std": 0.2979646921157837, "rewards/accuracy_reward_stage2": 0.6838893890380859, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 530 }, { "completion_length": 12.9375, "epoch": 0.09304363062905203, "grad_norm": 20.3193019482723, "kl": 0.04150390625, "learning_rate": 9.071315927807954e-07, "loss": -0.0619, "reward": 1.526425838470459, "reward_std": 0.2860848307609558, "rewards/accuracy_reward_stage2": 0.5576759576797485, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 531 }, { "completion_length": 8.78125, "epoch": 0.0932188540388996, "grad_norm": 16.39573830704058, "kl": 0.09033203125, "learning_rate": 9.069563693709479e-07, "loss": 0.0, "reward": 1.499305009841919, "reward_std": 0.25076764822006226, "rewards/accuracy_reward_stage2": 0.5149299502372742, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 532 }, { "completion_length": 10.6875, "epoch": 0.09339407744874716, "grad_norm": 18.708267724995316, "kl": 0.0277099609375, "learning_rate": 9.067811459611004e-07, "loss": 0.0111, "reward": 1.4556643962860107, "reward_std": 0.16554230451583862, "rewards/accuracy_reward_stage2": 0.4556644558906555, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 533 }, { "completion_length": 12.296875, "epoch": 0.0935693008585947, "grad_norm": 52816.97086739961, "kl": 704.0, "learning_rate": 9.066059225512528e-07, "loss": 282.7971, "reward": 1.1979167461395264, "reward_std": 0.2623191773891449, "rewards/accuracy_reward_stage2": 0.3541666865348816, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 534 }, { "completion_length": 8.296875, "epoch": 0.09374452426844226, "grad_norm": 24.82478545154244, "kl": 0.08740234375, "learning_rate": 9.064306991414053e-07, "loss": 0.035, "reward": 1.4436891078948975, "reward_std": 0.26873427629470825, "rewards/accuracy_reward_stage2": 0.5686891078948975, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 535 }, { "completion_length": 6.125, "epoch": 0.09391974767828982, "grad_norm": 18.23616939760275, "kl": 0.0167236328125, "learning_rate": 9.062554757315576e-07, "loss": 0.0067, "reward": 1.7154107093811035, "reward_std": 0.19368675351142883, "rewards/accuracy_reward_stage2": 0.7154107093811035, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 536 }, { "completion_length": 15.515625, "epoch": 0.09409497108813737, "grad_norm": 12.11677875558571, "kl": 0.035888671875, "learning_rate": 9.060802523217101e-07, "loss": 0.0144, "reward": 1.44085693359375, "reward_std": 0.06919336318969727, "rewards/accuracy_reward_stage2": 0.4408569931983948, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 537 }, { "completion_length": 7.640625, "epoch": 0.09427019449798493, "grad_norm": 16.886219674799325, "kl": 0.0830078125, "learning_rate": 9.059050289118626e-07, "loss": -0.0274, "reward": 1.2922900915145874, "reward_std": 0.16166669130325317, "rewards/accuracy_reward_stage2": 0.4641650915145874, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 538 }, { "completion_length": 32.59375, "epoch": 0.09444541790783248, "grad_norm": 19.878285295210695, "kl": 0.037109375, "learning_rate": 9.05729805502015e-07, "loss": -0.0293, "reward": 1.4270589351654053, "reward_std": 0.17999550700187683, "rewards/accuracy_reward_stage2": 0.44268399477005005, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 539 }, { "completion_length": 9.0625, "epoch": 0.09462064131768004, "grad_norm": 22.418001970985898, "kl": 0.049560546875, "learning_rate": 9.055545820921675e-07, "loss": -0.0931, "reward": 1.335763931274414, "reward_std": 0.24215462803840637, "rewards/accuracy_reward_stage2": 0.3826389014720917, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 540 }, { "completion_length": 15.3125, "epoch": 0.0947958647275276, "grad_norm": 22.252296229336675, "kl": 0.0986328125, "learning_rate": 9.0537935868232e-07, "loss": 0.0395, "reward": 1.3899197578430176, "reward_std": 0.16463521122932434, "rewards/accuracy_reward_stage2": 0.3899197280406952, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 541 }, { "completion_length": 12.375, "epoch": 0.09497108813737515, "grad_norm": 16.457184450985498, "kl": 0.06396484375, "learning_rate": 9.052041352724724e-07, "loss": 0.0256, "reward": 1.3750066757202148, "reward_std": 0.1883399784564972, "rewards/accuracy_reward_stage2": 0.3750067353248596, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 542 }, { "completion_length": 11.625, "epoch": 0.09514631154722271, "grad_norm": 33.68922560485876, "kl": 0.29296875, "learning_rate": 9.050289118626248e-07, "loss": 0.0647, "reward": 1.2392685413360596, "reward_std": 0.1982816904783249, "rewards/accuracy_reward_stage2": 0.5205184817314148, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 543 }, { "completion_length": 12.71875, "epoch": 0.09532153495707027, "grad_norm": 33.730678368774534, "kl": 0.35546875, "learning_rate": 9.048536884527772e-07, "loss": 0.0475, "reward": 1.0741642713546753, "reward_std": 0.22530388832092285, "rewards/accuracy_reward_stage2": 0.3554142713546753, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 544 }, { "completion_length": 7.859375, "epoch": 0.09549675836691782, "grad_norm": 22.80060588847184, "kl": 0.142578125, "learning_rate": 9.046784650429297e-07, "loss": 0.0569, "reward": 1.6529418230056763, "reward_std": 0.2742685079574585, "rewards/accuracy_reward_stage2": 0.6529418230056763, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 545 }, { "completion_length": 10.65625, "epoch": 0.09567198177676538, "grad_norm": 19.18030606110292, "kl": 0.169921875, "learning_rate": 9.045032416330821e-07, "loss": 0.0303, "reward": 1.33396577835083, "reward_std": 0.19901514053344727, "rewards/accuracy_reward_stage2": 0.47459083795547485, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 546 }, { "completion_length": 24.609375, "epoch": 0.09584720518661294, "grad_norm": 39.36257012429911, "kl": 0.5625, "learning_rate": 9.043280182232345e-07, "loss": 0.2261, "reward": 1.3327341079711914, "reward_std": 0.1748802214860916, "rewards/accuracy_reward_stage2": 0.457734078168869, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 547 }, { "completion_length": 11.921875, "epoch": 0.09602242859646049, "grad_norm": 15.899625187857596, "kl": 0.04345703125, "learning_rate": 9.04152794813387e-07, "loss": -0.0268, "reward": 1.4270386695861816, "reward_std": 0.15212519466876984, "rewards/accuracy_reward_stage2": 0.5676637291908264, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 548 }, { "completion_length": 7.921875, "epoch": 0.09619765200630805, "grad_norm": 20.104482583734367, "kl": 0.06982421875, "learning_rate": 9.039775714035395e-07, "loss": -0.0009, "reward": 1.757695198059082, "reward_std": 0.25138598680496216, "rewards/accuracy_reward_stage2": 0.773320198059082, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 549 }, { "completion_length": 12.4375, "epoch": 0.09637287541615559, "grad_norm": 17.080432216977275, "kl": 0.56640625, "learning_rate": 9.038023479936919e-07, "loss": 0.2259, "reward": 1.4803493022918701, "reward_std": 0.18915359675884247, "rewards/accuracy_reward_stage2": 0.6053494215011597, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 550 }, { "completion_length": 11.1875, "epoch": 0.09654809882600315, "grad_norm": 20.191666928026432, "kl": 0.05859375, "learning_rate": 9.036271245838444e-07, "loss": 0.0105, "reward": 1.8123853206634521, "reward_std": 0.20700375735759735, "rewards/accuracy_reward_stage2": 0.8280103206634521, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 551 }, { "completion_length": 11.421875, "epoch": 0.09672332223585071, "grad_norm": 20.378539040102037, "kl": 0.1962890625, "learning_rate": 9.034519011739968e-07, "loss": 0.0787, "reward": 1.409591555595398, "reward_std": 0.23674771189689636, "rewards/accuracy_reward_stage2": 0.659591555595398, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 552 }, { "completion_length": 11.75, "epoch": 0.09689854564569826, "grad_norm": 18.890258636089598, "kl": 0.051513671875, "learning_rate": 9.032766777641493e-07, "loss": 0.0206, "reward": 1.4163398742675781, "reward_std": 0.23046299815177917, "rewards/accuracy_reward_stage2": 0.6663398742675781, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 553 }, { "completion_length": 13.5, "epoch": 0.09707376905554582, "grad_norm": 18.286306275322975, "kl": 0.0869140625, "learning_rate": 9.031014543543018e-07, "loss": 0.0348, "reward": 1.3902003765106201, "reward_std": 0.15996113419532776, "rewards/accuracy_reward_stage2": 0.3902003765106201, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 554 }, { "completion_length": 21.921875, "epoch": 0.09724899246539338, "grad_norm": 40.26435687211343, "kl": 0.05517578125, "learning_rate": 9.029262309444542e-07, "loss": 0.0221, "reward": 1.505936622619629, "reward_std": 0.17588528990745544, "rewards/accuracy_reward_stage2": 0.5059365034103394, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 555 }, { "completion_length": 9.90625, "epoch": 0.09742421587524093, "grad_norm": 15.957309347414999, "kl": 0.046875, "learning_rate": 9.027510075346065e-07, "loss": -0.0588, "reward": 1.508584976196289, "reward_std": 0.23708796501159668, "rewards/accuracy_reward_stage2": 0.5398349165916443, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 556 }, { "completion_length": 16.015625, "epoch": 0.09759943928508849, "grad_norm": 14.467619029018568, "kl": 0.6953125, "learning_rate": 9.02575784124759e-07, "loss": 0.2386, "reward": 1.5341227054595947, "reward_std": 0.2018691599369049, "rewards/accuracy_reward_stage2": 0.7997477650642395, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 557 }, { "completion_length": 12.59375, "epoch": 0.09777466269493604, "grad_norm": 22.63678621876035, "kl": 0.050537109375, "learning_rate": 9.024005607149114e-07, "loss": -0.1099, "reward": 1.3984155654907227, "reward_std": 0.3438401520252228, "rewards/accuracy_reward_stage2": 0.4609155058860779, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 558 }, { "completion_length": 11.875, "epoch": 0.0979498861047836, "grad_norm": 22.6560160835456, "kl": 0.0751953125, "learning_rate": 9.022253373050639e-07, "loss": -0.0033, "reward": 1.3677244186401367, "reward_std": 0.3097214102745056, "rewards/accuracy_reward_stage2": 0.3833494782447815, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 559 }, { "completion_length": 11.8125, "epoch": 0.09812510951463116, "grad_norm": 22.911396622943528, "kl": 0.73046875, "learning_rate": 9.020501138952163e-07, "loss": 0.249, "reward": 1.5243258476257324, "reward_std": 0.285469114780426, "rewards/accuracy_reward_stage2": 0.6649507284164429, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 560 }, { "completion_length": 12.046875, "epoch": 0.0983003329244787, "grad_norm": 14.34568733429109, "kl": 0.051025390625, "learning_rate": 9.018748904853688e-07, "loss": 0.0205, "reward": 1.4658381938934326, "reward_std": 0.12557768821716309, "rewards/accuracy_reward_stage2": 0.4658382534980774, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 561 }, { "completion_length": 10.0625, "epoch": 0.09847555633432627, "grad_norm": 37.32668085474892, "kl": 0.232421875, "learning_rate": 9.016996670755213e-07, "loss": 0.093, "reward": 1.4536373615264893, "reward_std": 0.22130584716796875, "rewards/accuracy_reward_stage2": 0.703637421131134, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 562 }, { "completion_length": 12.75, "epoch": 0.09865077974417383, "grad_norm": 15.319698390846803, "kl": 0.0546875, "learning_rate": 9.015244436656737e-07, "loss": 0.0219, "reward": 1.364243984222412, "reward_std": 0.14342659711837769, "rewards/accuracy_reward_stage2": 0.36424392461776733, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 563 }, { "completion_length": 11.0625, "epoch": 0.09882600315402137, "grad_norm": 20.899740069261195, "kl": 0.08935546875, "learning_rate": 9.013492202558262e-07, "loss": 0.0357, "reward": 1.557667851448059, "reward_std": 0.31662172079086304, "rewards/accuracy_reward_stage2": 0.8076679110527039, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 564 }, { "completion_length": 12.0, "epoch": 0.09900122656386894, "grad_norm": 21.39614803409284, "kl": 0.1123046875, "learning_rate": 9.011739968459787e-07, "loss": 0.0008, "reward": 1.7577136754989624, "reward_std": 0.20902200043201447, "rewards/accuracy_reward_stage2": 0.7733386754989624, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 565 }, { "completion_length": 11.203125, "epoch": 0.09917644997371648, "grad_norm": 16.244712679238543, "kl": 0.07275390625, "learning_rate": 9.00998773436131e-07, "loss": -0.0574, "reward": 1.4084053039550781, "reward_std": 0.12980613112449646, "rewards/accuracy_reward_stage2": 0.4396553635597229, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 566 }, { "completion_length": 8.796875, "epoch": 0.09935167338356404, "grad_norm": 12.42118208336979, "kl": 0.029541015625, "learning_rate": 9.008235500262835e-07, "loss": 0.0118, "reward": 1.6280958652496338, "reward_std": 0.12311365455389023, "rewards/accuracy_reward_stage2": 0.7530958652496338, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 567 }, { "completion_length": 8.34375, "epoch": 0.0995268967934116, "grad_norm": 19.86622380429182, "kl": 0.07421875, "learning_rate": 9.006483266164358e-07, "loss": 0.0296, "reward": 1.6223640441894531, "reward_std": 0.2801703214645386, "rewards/accuracy_reward_stage2": 0.6223639249801636, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 568 }, { "completion_length": 9.15625, "epoch": 0.09970212020325915, "grad_norm": 23.225550681225148, "kl": 0.052734375, "learning_rate": 9.004731032065883e-07, "loss": 0.0069, "reward": 1.6514757871627808, "reward_std": 0.242259219288826, "rewards/accuracy_reward_stage2": 0.6671008467674255, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 569 }, { "completion_length": 9.234375, "epoch": 0.09987734361310671, "grad_norm": 19.919949196266696, "kl": 0.203125, "learning_rate": 9.002978797967408e-07, "loss": 0.037, "reward": 1.4913837909698486, "reward_std": 0.23644289374351501, "rewards/accuracy_reward_stage2": 0.6320087909698486, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 570 }, { "completion_length": 13.0, "epoch": 0.10005256702295427, "grad_norm": 13.91521551284311, "kl": 0.0673828125, "learning_rate": 9.001226563868932e-07, "loss": -0.0614, "reward": 1.6299842596054077, "reward_std": 0.18330855667591095, "rewards/accuracy_reward_stage2": 0.6612342596054077, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 571 }, { "completion_length": 8.03125, "epoch": 0.10022779043280182, "grad_norm": 19.58232929067739, "kl": 0.0322265625, "learning_rate": 8.999474329770457e-07, "loss": 0.0129, "reward": 1.6614583730697632, "reward_std": 0.19485904276371002, "rewards/accuracy_reward_stage2": 0.6614583134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 572 }, { "completion_length": 9.25, "epoch": 0.10040301384264938, "grad_norm": 19.636258204988145, "kl": 0.0478515625, "learning_rate": 8.997722095671982e-07, "loss": 0.0191, "reward": 1.5884075164794922, "reward_std": 0.31157463788986206, "rewards/accuracy_reward_stage2": 0.5884075164794922, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 573 }, { "completion_length": 8.4375, "epoch": 0.10057823725249693, "grad_norm": 11.776864581244816, "kl": 0.03173828125, "learning_rate": 8.995969861573506e-07, "loss": 0.0126, "reward": 1.8850898742675781, "reward_std": 0.08686178922653198, "rewards/accuracy_reward_stage2": 0.8850897550582886, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 574 }, { "completion_length": 5.484375, "epoch": 0.10075346066234449, "grad_norm": 16.28159485758816, "kl": 0.0203857421875, "learning_rate": 8.994217627475031e-07, "loss": -0.036, "reward": 1.349395513534546, "reward_std": 0.13381871581077576, "rewards/accuracy_reward_stage2": 0.6150203943252563, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 575 }, { "completion_length": 11.203125, "epoch": 0.10092868407219205, "grad_norm": 14.42868534069688, "kl": 0.034423828125, "learning_rate": 8.992465393376554e-07, "loss": 0.0138, "reward": 1.4621574878692627, "reward_std": 0.13635873794555664, "rewards/accuracy_reward_stage2": 0.7121575474739075, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 576 }, { "completion_length": 15.0625, "epoch": 0.1011039074820396, "grad_norm": 26.063053167282682, "kl": 0.08154296875, "learning_rate": 8.990713159278079e-07, "loss": -0.0032, "reward": 1.4650766849517822, "reward_std": 0.3235799968242645, "rewards/accuracy_reward_stage2": 0.4807017147541046, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 577 }, { "completion_length": 8.046875, "epoch": 0.10127913089188716, "grad_norm": 25.109409400455146, "kl": 0.0595703125, "learning_rate": 8.988960925179604e-07, "loss": -0.0436, "reward": 1.6310796737670898, "reward_std": 0.2766677737236023, "rewards/accuracy_reward_stage2": 0.6623297333717346, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 578 }, { "completion_length": 10.265625, "epoch": 0.10145435430173472, "grad_norm": 17.661150618728758, "kl": 0.2021484375, "learning_rate": 8.987208691081128e-07, "loss": 0.0807, "reward": 1.1396028995513916, "reward_std": 0.19701595604419708, "rewards/accuracy_reward_stage2": 0.5146028995513916, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 579 }, { "completion_length": 8.359375, "epoch": 0.10162957771158226, "grad_norm": 18.734631769812072, "kl": 0.058837890625, "learning_rate": 8.985456456982653e-07, "loss": 0.0068, "reward": 1.6147187948226929, "reward_std": 0.2279905080795288, "rewards/accuracy_reward_stage2": 0.6303437948226929, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 580 }, { "completion_length": 9.65625, "epoch": 0.10180480112142982, "grad_norm": 19.790683776159472, "kl": 0.1416015625, "learning_rate": 8.983704222884176e-07, "loss": -0.0755, "reward": 1.4265341758728027, "reward_std": 0.23208576440811157, "rewards/accuracy_reward_stage2": 0.4734092354774475, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 581 }, { "completion_length": 15.9375, "epoch": 0.10198002453127737, "grad_norm": 22.49576694273184, "kl": 0.052001953125, "learning_rate": 8.981951988785701e-07, "loss": 0.0208, "reward": 1.3234437704086304, "reward_std": 0.17618276178836823, "rewards/accuracy_reward_stage2": 0.323443740606308, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 582 }, { "completion_length": 9.5625, "epoch": 0.10215524794112493, "grad_norm": 9.57883485832979, "kl": 0.03076171875, "learning_rate": 8.980199754687226e-07, "loss": -0.0761, "reward": 1.2532668113708496, "reward_std": 0.10138334333896637, "rewards/accuracy_reward_stage2": 0.28451675176620483, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 583 }, { "completion_length": 9.515625, "epoch": 0.1023304713509725, "grad_norm": 13.225909758271314, "kl": 0.0257568359375, "learning_rate": 8.97844752058875e-07, "loss": -0.0339, "reward": 1.6815369129180908, "reward_std": 0.1408383846282959, "rewards/accuracy_reward_stage2": 0.6971619129180908, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 584 }, { "completion_length": 9.90625, "epoch": 0.10250569476082004, "grad_norm": 23.03665624058185, "kl": 0.07666015625, "learning_rate": 8.976695286490275e-07, "loss": -0.0451, "reward": 1.4693918228149414, "reward_std": 0.2618715465068817, "rewards/accuracy_reward_stage2": 0.5006418228149414, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 585 }, { "completion_length": 10.390625, "epoch": 0.1026809181706676, "grad_norm": 21.82393188697414, "kl": 0.06494140625, "learning_rate": 8.974943052391799e-07, "loss": -0.0054, "reward": 1.4567054510116577, "reward_std": 0.28674811124801636, "rewards/accuracy_reward_stage2": 0.47233039140701294, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 586 }, { "completion_length": 9.0625, "epoch": 0.10285614158051516, "grad_norm": 18.317296690283246, "kl": 0.28515625, "learning_rate": 8.973190818293323e-07, "loss": 0.0716, "reward": 1.2668479681015015, "reward_std": 0.20858728885650635, "rewards/accuracy_reward_stage2": 0.4074729084968567, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 587 }, { "completion_length": 16.46875, "epoch": 0.10303136499036271, "grad_norm": 17.62410201653559, "kl": 0.0869140625, "learning_rate": 8.971438584194848e-07, "loss": -0.0094, "reward": 1.4687905311584473, "reward_std": 0.2499588280916214, "rewards/accuracy_reward_stage2": 0.6094154715538025, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 588 }, { "completion_length": 11.140625, "epoch": 0.10320658840021027, "grad_norm": 15.522166737484554, "kl": 0.05859375, "learning_rate": 8.969686350096372e-07, "loss": 0.0234, "reward": 1.6265251636505127, "reward_std": 0.18599528074264526, "rewards/accuracy_reward_stage2": 0.7515252232551575, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 589 }, { "completion_length": 10.84375, "epoch": 0.10338181181005783, "grad_norm": 21.197974304069106, "kl": 0.046142578125, "learning_rate": 8.967934115997897e-07, "loss": 0.0184, "reward": 1.5723726749420166, "reward_std": 0.2601754069328308, "rewards/accuracy_reward_stage2": 0.5723727345466614, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 590 }, { "completion_length": 14.578125, "epoch": 0.10355703521990538, "grad_norm": 24.743867916068385, "kl": 0.0615234375, "learning_rate": 8.966181881899422e-07, "loss": -0.0044, "reward": 1.618594765663147, "reward_std": 0.1704864203929901, "rewards/accuracy_reward_stage2": 0.6342197060585022, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 591 }, { "completion_length": 8.515625, "epoch": 0.10373225862975294, "grad_norm": 22.397575395216897, "kl": 0.0245361328125, "learning_rate": 8.964429647800946e-07, "loss": 0.0098, "reward": 1.7014509439468384, "reward_std": 0.2518218159675598, "rewards/accuracy_reward_stage2": 0.7014508247375488, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 592 }, { "completion_length": 8.609375, "epoch": 0.10390748203960049, "grad_norm": 17.820707192919905, "kl": 0.031982421875, "learning_rate": 8.962677413702471e-07, "loss": 0.0128, "reward": 1.5031335353851318, "reward_std": 0.1464834064245224, "rewards/accuracy_reward_stage2": 0.5031336545944214, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 593 }, { "completion_length": 11.984375, "epoch": 0.10408270544944805, "grad_norm": 21.839142949786588, "kl": 0.06396484375, "learning_rate": 8.960925179603995e-07, "loss": 0.0255, "reward": 1.6463342905044556, "reward_std": 0.25005415081977844, "rewards/accuracy_reward_stage2": 0.6463342308998108, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 594 }, { "completion_length": 12.78125, "epoch": 0.1042579288592956, "grad_norm": 12.309352868315235, "kl": 0.01416015625, "learning_rate": 8.959172945505519e-07, "loss": 0.0057, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward_stage2": 0.71875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 595 }, { "completion_length": 24.0, "epoch": 0.10443315226914315, "grad_norm": 22.16452522559772, "kl": 0.0250244140625, "learning_rate": 8.957420711407043e-07, "loss": 0.01, "reward": 1.3530032634735107, "reward_std": 0.281308650970459, "rewards/accuracy_reward_stage2": 0.3530033528804779, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 596 }, { "completion_length": 6.65625, "epoch": 0.10460837567899071, "grad_norm": 18.573770659631396, "kl": 0.0849609375, "learning_rate": 8.955668477308567e-07, "loss": 0.0006, "reward": 1.5902413129806519, "reward_std": 0.19903446733951569, "rewards/accuracy_reward_stage2": 0.6058663725852966, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 597 }, { "completion_length": 16.921875, "epoch": 0.10478359908883828, "grad_norm": 16.665416052004712, "kl": 0.060302734375, "learning_rate": 8.953916243210092e-07, "loss": -0.0144, "reward": 1.294586181640625, "reward_std": 0.12051106244325638, "rewards/accuracy_reward_stage2": 0.3102111220359802, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 598 }, { "completion_length": 10.5, "epoch": 0.10495882249868582, "grad_norm": 19.600816799377586, "kl": 0.12109375, "learning_rate": 8.952164009111617e-07, "loss": 0.0485, "reward": 1.7329591512680054, "reward_std": 0.24576660990715027, "rewards/accuracy_reward_stage2": 0.7329592108726501, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 599 }, { "completion_length": 13.03125, "epoch": 0.10513404590853338, "grad_norm": 17.31547786408126, "kl": 0.05517578125, "learning_rate": 8.950411775013141e-07, "loss": 0.0221, "reward": 1.1188859939575195, "reward_std": 0.13924580812454224, "rewards/accuracy_reward_stage2": 0.24388596415519714, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 600 }, { "completion_length": 12.59375, "epoch": 0.10530926931838093, "grad_norm": 19.322536755534735, "kl": 0.130859375, "learning_rate": 8.948659540914666e-07, "loss": 0.0525, "reward": 1.4634090662002563, "reward_std": 0.17742908000946045, "rewards/accuracy_reward_stage2": 0.5884091258049011, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 601 }, { "completion_length": 13.96875, "epoch": 0.10548449272822849, "grad_norm": 15.587417098129867, "kl": 0.09326171875, "learning_rate": 8.946907306816191e-07, "loss": 0.0374, "reward": 1.5436468124389648, "reward_std": 0.12240086495876312, "rewards/accuracy_reward_stage2": 0.6686468124389648, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 602 }, { "completion_length": 8.359375, "epoch": 0.10565971613807605, "grad_norm": 20.599422159485304, "kl": 0.045654296875, "learning_rate": 8.945155072717715e-07, "loss": 0.0183, "reward": 1.7965465784072876, "reward_std": 0.24006301164627075, "rewards/accuracy_reward_stage2": 0.7965465188026428, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 603 }, { "completion_length": 8.84375, "epoch": 0.1058349395479236, "grad_norm": 18.210715993732016, "kl": 0.033447265625, "learning_rate": 8.94340283861924e-07, "loss": 0.0134, "reward": 1.558894157409668, "reward_std": 0.14036405086517334, "rewards/accuracy_reward_stage2": 0.5745192170143127, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 604 }, { "completion_length": 13.421875, "epoch": 0.10601016295777116, "grad_norm": 27.94464562579728, "kl": 0.017333984375, "learning_rate": 8.941650604520764e-07, "loss": -0.0264, "reward": 1.4473905563354492, "reward_std": 0.21036802232265472, "rewards/accuracy_reward_stage2": 0.5880155563354492, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 605 }, { "completion_length": 11.34375, "epoch": 0.10618538636761872, "grad_norm": 12.57504645024913, "kl": 0.01263427734375, "learning_rate": 8.939898370422288e-07, "loss": -0.1122, "reward": 1.539158821105957, "reward_std": 0.2022034078836441, "rewards/accuracy_reward_stage2": 0.586033821105957, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 606 }, { "completion_length": 8.15625, "epoch": 0.10636060977746627, "grad_norm": 15.638105890819281, "kl": 0.10546875, "learning_rate": 8.938146136323812e-07, "loss": 0.0422, "reward": 1.3107225894927979, "reward_std": 0.18837109208106995, "rewards/accuracy_reward_stage2": 0.4357225298881531, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 607 }, { "completion_length": 9.515625, "epoch": 0.10653583318731383, "grad_norm": 16.783161632058864, "kl": 0.045654296875, "learning_rate": 8.936393902225336e-07, "loss": -0.0033, "reward": 1.728787899017334, "reward_std": 0.1572287678718567, "rewards/accuracy_reward_stage2": 0.7444128394126892, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 608 }, { "completion_length": 7.296875, "epoch": 0.10671105659716137, "grad_norm": 19.953434160420613, "kl": 0.023681640625, "learning_rate": 8.934641668126861e-07, "loss": 0.0095, "reward": 1.3125, "reward_std": 0.25513991713523865, "rewards/accuracy_reward_stage2": 0.4375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 609 }, { "completion_length": 12.375, "epoch": 0.10688628000700894, "grad_norm": 23.33191958488592, "kl": 0.037353515625, "learning_rate": 8.932889434028386e-07, "loss": 0.0053, "reward": 1.6041978597640991, "reward_std": 0.33646097779273987, "rewards/accuracy_reward_stage2": 0.6198228597640991, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 610 }, { "completion_length": 8.546875, "epoch": 0.1070615034168565, "grad_norm": 14.652227746966123, "kl": 0.01519775390625, "learning_rate": 8.93113719992991e-07, "loss": 0.0061, "reward": 1.7631537914276123, "reward_std": 0.10471543669700623, "rewards/accuracy_reward_stage2": 0.7631537318229675, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 611 }, { "completion_length": 8.390625, "epoch": 0.10723672682670404, "grad_norm": 19.372365428544043, "kl": 0.0311279296875, "learning_rate": 8.929384965831435e-07, "loss": 0.0124, "reward": 1.6221894025802612, "reward_std": 0.14791935682296753, "rewards/accuracy_reward_stage2": 0.622189462184906, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 612 }, { "completion_length": 7.296875, "epoch": 0.1074119502365516, "grad_norm": 12.757778654176331, "kl": 0.02880859375, "learning_rate": 8.927632731732959e-07, "loss": -0.0327, "reward": 1.203125, "reward_std": 0.16887323558330536, "rewards/accuracy_reward_stage2": 0.21875, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 613 }, { "completion_length": 9.234375, "epoch": 0.10758717364639916, "grad_norm": 15.82738195868456, "kl": 0.0301513671875, "learning_rate": 8.925880497634484e-07, "loss": 0.0121, "reward": 1.7813446521759033, "reward_std": 0.16648373007774353, "rewards/accuracy_reward_stage2": 0.9063446521759033, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 614 }, { "completion_length": 9.71875, "epoch": 0.10776239705624671, "grad_norm": 21.46197875618223, "kl": 0.030517578125, "learning_rate": 8.924128263536009e-07, "loss": -0.1204, "reward": 1.430158019065857, "reward_std": 0.289289653301239, "rewards/accuracy_reward_stage2": 0.47703301906585693, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 615 }, { "completion_length": 9.828125, "epoch": 0.10793762046609427, "grad_norm": 24.27795896535574, "kl": 0.0458984375, "learning_rate": 8.922376029437532e-07, "loss": -0.0258, "reward": 1.624479055404663, "reward_std": 0.21070247888565063, "rewards/accuracy_reward_stage2": 0.6401039958000183, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 616 }, { "completion_length": 9.109375, "epoch": 0.10811284387594182, "grad_norm": 16.643265414166162, "kl": 0.1318359375, "learning_rate": 8.920623795339057e-07, "loss": -0.0343, "reward": 1.634692668914795, "reward_std": 0.15342967212200165, "rewards/accuracy_reward_stage2": 0.7909427285194397, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 617 }, { "completion_length": 14.703125, "epoch": 0.10828806728578938, "grad_norm": 21.05809926024803, "kl": 0.061279296875, "learning_rate": 8.918871561240582e-07, "loss": 0.0245, "reward": 1.3503414392471313, "reward_std": 0.15436102449893951, "rewards/accuracy_reward_stage2": 0.35034140944480896, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 618 }, { "completion_length": 6.453125, "epoch": 0.10846329069563694, "grad_norm": 18.701299422305127, "kl": 0.03173828125, "learning_rate": 8.917119327142105e-07, "loss": 0.0127, "reward": 1.5458629131317139, "reward_std": 0.19350674748420715, "rewards/accuracy_reward_stage2": 0.5458628535270691, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 619 }, { "completion_length": 9.9375, "epoch": 0.10863851410548449, "grad_norm": 19.58448716432124, "kl": 0.0159912109375, "learning_rate": 8.91536709304363e-07, "loss": -0.0378, "reward": 1.4166667461395264, "reward_std": 0.2630414366722107, "rewards/accuracy_reward_stage2": 0.5572916865348816, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 620 }, { "completion_length": 10.640625, "epoch": 0.10881373751533205, "grad_norm": 20.55948510580831, "kl": 0.162109375, "learning_rate": 8.913614858945154e-07, "loss": 0.0649, "reward": 1.4507501125335693, "reward_std": 0.12407205998897552, "rewards/accuracy_reward_stage2": 0.5757502317428589, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 621 }, { "completion_length": 9.578125, "epoch": 0.10898896092517961, "grad_norm": 22.69764250164246, "kl": 0.1083984375, "learning_rate": 8.911862624846679e-07, "loss": 0.0432, "reward": 1.4806079864501953, "reward_std": 0.16957543790340424, "rewards/accuracy_reward_stage2": 0.6056080460548401, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 622 }, { "completion_length": 20.53125, "epoch": 0.10916418433502716, "grad_norm": 18.42436068010624, "kl": 0.034423828125, "learning_rate": 8.910110390748204e-07, "loss": 0.0138, "reward": 1.778219223022461, "reward_std": 0.19961267709732056, "rewards/accuracy_reward_stage2": 0.7782192230224609, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 623 }, { "completion_length": 12.84375, "epoch": 0.10933940774487472, "grad_norm": 29.281130200071576, "kl": 0.263671875, "learning_rate": 8.908358156649728e-07, "loss": 0.0613, "reward": 1.4601194858551025, "reward_std": 0.11638569831848145, "rewards/accuracy_reward_stage2": 0.6007444858551025, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 624 }, { "completion_length": 9.890625, "epoch": 0.10951463115472228, "grad_norm": 24.147239619968612, "kl": 0.0712890625, "learning_rate": 8.906605922551253e-07, "loss": 0.0285, "reward": 1.625582218170166, "reward_std": 0.15535606443881989, "rewards/accuracy_reward_stage2": 0.6255822777748108, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 625 }, { "completion_length": 9.84375, "epoch": 0.10968985456456982, "grad_norm": 16.839339018289216, "kl": 0.091796875, "learning_rate": 8.904853688452777e-07, "loss": -0.0074, "reward": 1.8911480903625488, "reward_std": 0.1472635567188263, "rewards/accuracy_reward_stage2": 0.9067729711532593, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 626 }, { "completion_length": 12.75, "epoch": 0.10986507797441739, "grad_norm": 16.885588058948453, "kl": 0.06689453125, "learning_rate": 8.903101454354301e-07, "loss": 0.0269, "reward": 1.6324387788772583, "reward_std": 0.11915861815214157, "rewards/accuracy_reward_stage2": 0.6324387788772583, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 627 }, { "completion_length": 11.296875, "epoch": 0.11004030138426493, "grad_norm": 19.57572697577605, "kl": 0.05419921875, "learning_rate": 8.901349220255826e-07, "loss": 0.0217, "reward": 1.3621962070465088, "reward_std": 0.26940637826919556, "rewards/accuracy_reward_stage2": 0.3621961772441864, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 628 }, { "completion_length": 7.296875, "epoch": 0.1102155247941125, "grad_norm": 15.528399734860603, "kl": 0.03955078125, "learning_rate": 8.89959698615735e-07, "loss": 0.0158, "reward": 1.6868137121200562, "reward_std": 0.10869477689266205, "rewards/accuracy_reward_stage2": 0.6868136525154114, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 629 }, { "completion_length": 9.765625, "epoch": 0.11039074820396005, "grad_norm": 19.224322095125764, "kl": 0.048583984375, "learning_rate": 8.897844752058875e-07, "loss": 0.0194, "reward": 1.6562113761901855, "reward_std": 0.18549120426177979, "rewards/accuracy_reward_stage2": 0.6562113761901855, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 630 }, { "completion_length": 6.1875, "epoch": 0.1105659716138076, "grad_norm": 20.116365737073824, "kl": 0.060791015625, "learning_rate": 8.8960925179604e-07, "loss": 0.0243, "reward": 1.512540578842163, "reward_std": 0.16334936022758484, "rewards/accuracy_reward_stage2": 0.5125405788421631, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 631 }, { "completion_length": 8.84375, "epoch": 0.11074119502365516, "grad_norm": 16.682655445127725, "kl": 0.1279296875, "learning_rate": 8.894340283861923e-07, "loss": 0.0112, "reward": 1.474839210510254, "reward_std": 0.2113642543554306, "rewards/accuracy_reward_stage2": 0.6154641509056091, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 632 }, { "completion_length": 14.390625, "epoch": 0.11091641843350272, "grad_norm": 17.490175729379096, "kl": 0.0233154296875, "learning_rate": 8.892588049763448e-07, "loss": 0.0094, "reward": 1.9157812595367432, "reward_std": 0.12778238952159882, "rewards/accuracy_reward_stage2": 0.9157813191413879, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 633 }, { "completion_length": 16.5625, "epoch": 0.11109164184335027, "grad_norm": 16.11217323057676, "kl": 0.06396484375, "learning_rate": 8.890835815664973e-07, "loss": 0.0255, "reward": 1.3033857345581055, "reward_std": 0.15057168900966644, "rewards/accuracy_reward_stage2": 0.42838579416275024, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 634 }, { "completion_length": 8.015625, "epoch": 0.11126686525319783, "grad_norm": 27.436005651707575, "kl": 0.1728515625, "learning_rate": 8.889083581566496e-07, "loss": -0.0039, "reward": 1.6807503700256348, "reward_std": 0.30542638897895813, "rewards/accuracy_reward_stage2": 0.8370004296302795, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 635 }, { "completion_length": 9.703125, "epoch": 0.11144208866304538, "grad_norm": 26.94592284911264, "kl": 0.07421875, "learning_rate": 8.887331347468021e-07, "loss": 0.0297, "reward": 1.3588056564331055, "reward_std": 0.3892172873020172, "rewards/accuracy_reward_stage2": 0.6088056564331055, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 636 }, { "completion_length": 9.21875, "epoch": 0.11161731207289294, "grad_norm": 18.19530983729694, "kl": 0.036376953125, "learning_rate": 8.885579113369545e-07, "loss": -0.0021, "reward": 1.6655867099761963, "reward_std": 0.25258275866508484, "rewards/accuracy_reward_stage2": 0.6812116503715515, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 637 }, { "completion_length": 12.0625, "epoch": 0.1117925354827405, "grad_norm": 20.711330390595382, "kl": 0.0712890625, "learning_rate": 8.88382687927107e-07, "loss": -0.0048, "reward": 1.4815778732299805, "reward_std": 0.24052470922470093, "rewards/accuracy_reward_stage2": 0.4972028136253357, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 638 }, { "completion_length": 10.1875, "epoch": 0.11196775889258805, "grad_norm": 20.415117056750283, "kl": 0.043212890625, "learning_rate": 8.882074645172595e-07, "loss": -0.0143, "reward": 1.4225728511810303, "reward_std": 0.20394927263259888, "rewards/accuracy_reward_stage2": 0.4381977319717407, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 639 }, { "completion_length": 7.5, "epoch": 0.1121429823024356, "grad_norm": 18.065810871206835, "kl": 0.0400390625, "learning_rate": 8.880322411074119e-07, "loss": 0.016, "reward": 1.529618501663208, "reward_std": 0.19352030754089355, "rewards/accuracy_reward_stage2": 0.5296184420585632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 640 }, { "completion_length": 7.65625, "epoch": 0.11231820571228317, "grad_norm": 17.331584185274014, "kl": 0.0203857421875, "learning_rate": 8.878570176975644e-07, "loss": 0.0082, "reward": 1.4235129356384277, "reward_std": 0.2608225345611572, "rewards/accuracy_reward_stage2": 0.42351287603378296, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 641 }, { "completion_length": 7.953125, "epoch": 0.11249342912213071, "grad_norm": 17.852702136003707, "kl": 0.30078125, "learning_rate": 8.876817942877169e-07, "loss": 0.12, "reward": 1.2848070859909058, "reward_std": 0.1477287858724594, "rewards/accuracy_reward_stage2": 0.534807026386261, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 642 }, { "completion_length": 12.03125, "epoch": 0.11266865253197828, "grad_norm": 15.507208538142622, "kl": 0.015869140625, "learning_rate": 8.875065708778693e-07, "loss": 0.0063, "reward": 1.7227667570114136, "reward_std": 0.2567778527736664, "rewards/accuracy_reward_stage2": 0.7227667570114136, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 643 }, { "completion_length": 16.921875, "epoch": 0.11284387594182582, "grad_norm": 23.081114868964416, "kl": 0.2216796875, "learning_rate": 8.873313474680218e-07, "loss": 0.0887, "reward": 1.27731454372406, "reward_std": 0.20373868942260742, "rewards/accuracy_reward_stage2": 0.5273144841194153, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 644 }, { "completion_length": 5.828125, "epoch": 0.11301909935167338, "grad_norm": 16.313428020485347, "kl": 0.06103515625, "learning_rate": 8.87156124058174e-07, "loss": -0.064, "reward": 1.6435449123382568, "reward_std": 0.19214250147342682, "rewards/accuracy_reward_stage2": 0.6747948527336121, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 645 }, { "completion_length": 29.6875, "epoch": 0.11319432276152094, "grad_norm": 19.13977602308894, "kl": 0.031494140625, "learning_rate": 8.869809006483265e-07, "loss": 0.0126, "reward": 1.8208966255187988, "reward_std": 0.14088435471057892, "rewards/accuracy_reward_stage2": 0.8208966255187988, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 646 }, { "completion_length": 8.078125, "epoch": 0.11336954617136849, "grad_norm": 16.336114992014164, "kl": 0.1513671875, "learning_rate": 8.86805677238479e-07, "loss": 0.0608, "reward": 1.5519046783447266, "reward_std": 0.1638176441192627, "rewards/accuracy_reward_stage2": 0.5519046187400818, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 647 }, { "completion_length": 10.328125, "epoch": 0.11354476958121605, "grad_norm": 19.192940552680728, "kl": 0.11328125, "learning_rate": 8.866304538286314e-07, "loss": 0.0453, "reward": 1.5758169889450073, "reward_std": 0.19315147399902344, "rewards/accuracy_reward_stage2": 0.5758169293403625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 648 }, { "completion_length": 10.109375, "epoch": 0.11371999299106361, "grad_norm": 24.65646749415431, "kl": 0.1298828125, "learning_rate": 8.864552304187839e-07, "loss": 0.0521, "reward": 1.5815494060516357, "reward_std": 0.2213352620601654, "rewards/accuracy_reward_stage2": 0.5815494060516357, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 649 }, { "completion_length": 7.46875, "epoch": 0.11389521640091116, "grad_norm": 17.447375675037648, "kl": 0.0751953125, "learning_rate": 8.862800070089363e-07, "loss": -0.0142, "reward": 1.478639841079712, "reward_std": 0.19840413331985474, "rewards/accuracy_reward_stage2": 0.4942649006843567, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 650 }, { "completion_length": 9.953125, "epoch": 0.11407043981075872, "grad_norm": 18.03776475428187, "kl": 0.1435546875, "learning_rate": 8.861047835990888e-07, "loss": 0.0575, "reward": 1.3064175844192505, "reward_std": 0.19495750963687897, "rewards/accuracy_reward_stage2": 0.4314176142215729, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 651 }, { "completion_length": 10.125, "epoch": 0.11424566322060627, "grad_norm": 14.52019538206722, "kl": 0.0244140625, "learning_rate": 8.859295601892413e-07, "loss": -0.032, "reward": 1.5017303228378296, "reward_std": 0.08835500478744507, "rewards/accuracy_reward_stage2": 0.5173553228378296, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 652 }, { "completion_length": 9.234375, "epoch": 0.11442088663045383, "grad_norm": 20.38124668947182, "kl": 0.05224609375, "learning_rate": 8.857543367793937e-07, "loss": 0.0209, "reward": 1.5765533447265625, "reward_std": 0.22117437422275543, "rewards/accuracy_reward_stage2": 0.5765534043312073, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 653 }, { "completion_length": 8.390625, "epoch": 0.11459611004030139, "grad_norm": 16.921443968716325, "kl": 0.12451171875, "learning_rate": 8.855791133695462e-07, "loss": 0.0497, "reward": 1.603499174118042, "reward_std": 0.20851662755012512, "rewards/accuracy_reward_stage2": 0.603499174118042, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 654 }, { "completion_length": 9.859375, "epoch": 0.11477133345014894, "grad_norm": 18.83286142598205, "kl": 0.0693359375, "learning_rate": 8.854038899596987e-07, "loss": 0.0278, "reward": 1.5485821962356567, "reward_std": 0.2668875753879547, "rewards/accuracy_reward_stage2": 0.5485821962356567, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 655 }, { "completion_length": 10.75, "epoch": 0.1149465568599965, "grad_norm": 20.846478318101642, "kl": 0.06005859375, "learning_rate": 8.85228666549851e-07, "loss": 0.024, "reward": 1.5447120666503906, "reward_std": 0.1853538602590561, "rewards/accuracy_reward_stage2": 0.5447121262550354, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 656 }, { "completion_length": 18.71875, "epoch": 0.11512178026984406, "grad_norm": 17.393770278359163, "kl": 0.01513671875, "learning_rate": 8.850534431400035e-07, "loss": 0.0061, "reward": 1.5744693279266357, "reward_std": 0.1881437748670578, "rewards/accuracy_reward_stage2": 0.5744693279266357, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 657 }, { "completion_length": 9.953125, "epoch": 0.1152970036796916, "grad_norm": 14.001795451142137, "kl": 0.11083984375, "learning_rate": 8.848782197301558e-07, "loss": 0.0444, "reward": 1.5067996978759766, "reward_std": 0.08500517159700394, "rewards/accuracy_reward_stage2": 0.6317996978759766, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 658 }, { "completion_length": 11.25, "epoch": 0.11547222708953916, "grad_norm": 20.467399329823657, "kl": 0.06005859375, "learning_rate": 8.847029963203083e-07, "loss": -0.0189, "reward": 1.3823635578155518, "reward_std": 0.21266907453536987, "rewards/accuracy_reward_stage2": 0.39798852801322937, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 659 }, { "completion_length": 8.96875, "epoch": 0.11564745049938671, "grad_norm": 21.6979069287139, "kl": 0.05224609375, "learning_rate": 8.845277729104608e-07, "loss": 0.0209, "reward": 1.4927724599838257, "reward_std": 0.2009587436914444, "rewards/accuracy_reward_stage2": 0.4927724003791809, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 660 }, { "completion_length": 11.03125, "epoch": 0.11582267390923427, "grad_norm": 20.26879174400301, "kl": 0.10107421875, "learning_rate": 8.843525495006132e-07, "loss": 0.0405, "reward": 1.4508693218231201, "reward_std": 0.18329568207263947, "rewards/accuracy_reward_stage2": 0.5758693218231201, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 661 }, { "completion_length": 11.859375, "epoch": 0.11599789731908183, "grad_norm": 20.977372896175574, "kl": 0.06640625, "learning_rate": 8.841773260907657e-07, "loss": -0.0176, "reward": 1.5426456928253174, "reward_std": 0.2179446965456009, "rewards/accuracy_reward_stage2": 0.5582706928253174, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 662 }, { "completion_length": 9.359375, "epoch": 0.11617312072892938, "grad_norm": 32.37126869962416, "kl": 0.1962890625, "learning_rate": 8.840021026809182e-07, "loss": 0.0789, "reward": 1.342227578163147, "reward_std": 0.20740927755832672, "rewards/accuracy_reward_stage2": 0.4672274887561798, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 663 }, { "completion_length": 10.921875, "epoch": 0.11634834413877694, "grad_norm": 20.284582129799464, "kl": 0.059814453125, "learning_rate": 8.838268792710706e-07, "loss": 0.0239, "reward": 1.6112689971923828, "reward_std": 0.2986975312232971, "rewards/accuracy_reward_stage2": 0.611268937587738, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 664 }, { "completion_length": 15.109375, "epoch": 0.1165235675486245, "grad_norm": 17.454920372904077, "kl": 0.05859375, "learning_rate": 8.83651655861223e-07, "loss": 0.0068, "reward": 1.364925742149353, "reward_std": 0.24069397151470184, "rewards/accuracy_reward_stage2": 0.3805507719516754, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 665 }, { "completion_length": 10.09375, "epoch": 0.11669879095847205, "grad_norm": 22.190376227479046, "kl": 0.064453125, "learning_rate": 8.834764324513754e-07, "loss": 0.0258, "reward": 1.6114730834960938, "reward_std": 0.22121301293373108, "rewards/accuracy_reward_stage2": 0.6114731431007385, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 666 }, { "completion_length": 12.09375, "epoch": 0.11687401436831961, "grad_norm": 24.39844815882228, "kl": 0.0498046875, "learning_rate": 8.833012090415279e-07, "loss": 0.0199, "reward": 1.733590841293335, "reward_std": 0.25560033321380615, "rewards/accuracy_reward_stage2": 0.7335907220840454, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 667 }, { "completion_length": 8.90625, "epoch": 0.11704923777816717, "grad_norm": 20.17060417802813, "kl": 0.06298828125, "learning_rate": 8.831259856316804e-07, "loss": 0.0252, "reward": 1.3896734714508057, "reward_std": 0.1829843521118164, "rewards/accuracy_reward_stage2": 0.3896734118461609, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 668 }, { "completion_length": 8.1875, "epoch": 0.11722446118801472, "grad_norm": 19.397346877665314, "kl": 0.0361328125, "learning_rate": 8.829507622218328e-07, "loss": -0.0298, "reward": 1.5043643712997437, "reward_std": 0.15379469096660614, "rewards/accuracy_reward_stage2": 0.5199893712997437, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 669 }, { "completion_length": 10.453125, "epoch": 0.11739968459786228, "grad_norm": 14.634540399364296, "kl": 0.06640625, "learning_rate": 8.827755388119852e-07, "loss": 0.0265, "reward": 1.3958325386047363, "reward_std": 0.1272956132888794, "rewards/accuracy_reward_stage2": 0.39583244919776917, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 670 }, { "completion_length": 12.5625, "epoch": 0.11757490800770982, "grad_norm": 19.871858144020802, "kl": 0.059814453125, "learning_rate": 8.826003154021377e-07, "loss": -0.0274, "reward": 1.6321520805358887, "reward_std": 0.21269144117832184, "rewards/accuracy_reward_stage2": 0.6634020209312439, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 671 }, { "completion_length": 7.171875, "epoch": 0.11775013141755739, "grad_norm": 17.158601344446673, "kl": 0.0830078125, "learning_rate": 8.824250919922901e-07, "loss": 0.0333, "reward": 1.6193318367004395, "reward_std": 0.17965298891067505, "rewards/accuracy_reward_stage2": 0.6193318963050842, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 672 }, { "completion_length": 16.296875, "epoch": 0.11792535482740495, "grad_norm": 19.247189611936488, "kl": 0.0308837890625, "learning_rate": 8.822498685824426e-07, "loss": 0.0124, "reward": 1.6456239223480225, "reward_std": 0.11864829808473587, "rewards/accuracy_reward_stage2": 0.6456239223480225, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 673 }, { "completion_length": 10.296875, "epoch": 0.1181005782372525, "grad_norm": 16.813617413768394, "kl": 0.01214599609375, "learning_rate": 8.82074645172595e-07, "loss": 0.0049, "reward": 1.1939867734909058, "reward_std": 0.16266238689422607, "rewards/accuracy_reward_stage2": 0.19398674368858337, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 674 }, { "completion_length": 14.578125, "epoch": 0.11827580164710005, "grad_norm": 22.171691413512914, "kl": 0.2373046875, "learning_rate": 8.818994217627474e-07, "loss": 0.0948, "reward": 1.4289348125457764, "reward_std": 0.21636059880256653, "rewards/accuracy_reward_stage2": 0.5539346933364868, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 675 }, { "completion_length": 15.3125, "epoch": 0.11845102505694761, "grad_norm": 16.51389196229471, "kl": 0.060791015625, "learning_rate": 8.817241983528999e-07, "loss": -0.0046, "reward": 1.4913157224655151, "reward_std": 0.1663581132888794, "rewards/accuracy_reward_stage2": 0.6163157224655151, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 676 }, { "completion_length": 8.984375, "epoch": 0.11862624846679516, "grad_norm": 17.816697430476907, "kl": 0.19921875, "learning_rate": 8.815489749430523e-07, "loss": 0.0795, "reward": 1.4166667461395264, "reward_std": 0.2742938995361328, "rewards/accuracy_reward_stage2": 0.6666666865348816, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 677 }, { "completion_length": 9.09375, "epoch": 0.11880147187664272, "grad_norm": 22.066474738023967, "kl": 0.037841796875, "learning_rate": 8.813737515332048e-07, "loss": 0.0152, "reward": 1.782374382019043, "reward_std": 0.2402632236480713, "rewards/accuracy_reward_stage2": 0.7823742628097534, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 678 }, { "completion_length": 11.796875, "epoch": 0.11897669528649027, "grad_norm": 21.59322311043083, "kl": 0.072265625, "learning_rate": 8.811985281233573e-07, "loss": -0.0513, "reward": 1.4617195129394531, "reward_std": 0.2623371481895447, "rewards/accuracy_reward_stage2": 0.4929695129394531, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 679 }, { "completion_length": 19.984375, "epoch": 0.11915191869633783, "grad_norm": 21.791897157131835, "kl": 0.263671875, "learning_rate": 8.810233047135097e-07, "loss": 0.0271, "reward": 1.276839017868042, "reward_std": 0.2978595793247223, "rewards/accuracy_reward_stage2": 0.433089017868042, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 680 }, { "completion_length": 15.28125, "epoch": 0.11932714210618539, "grad_norm": 20.228697346514785, "kl": 0.05859375, "learning_rate": 8.808480813036622e-07, "loss": -0.0547, "reward": 1.6011157035827637, "reward_std": 0.2627042233943939, "rewards/accuracy_reward_stage2": 0.6323657035827637, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 681 }, { "completion_length": 15.8125, "epoch": 0.11950236551603294, "grad_norm": 14.047270245441503, "kl": 0.1142578125, "learning_rate": 8.806728578938146e-07, "loss": 0.0361, "reward": 1.3606376647949219, "reward_std": 0.14999261498451233, "rewards/accuracy_reward_stage2": 0.4856376647949219, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 682 }, { "completion_length": 9.5, "epoch": 0.1196775889258805, "grad_norm": 25.027738822698772, "kl": 0.064453125, "learning_rate": 8.80497634483967e-07, "loss": 0.0257, "reward": 1.554718255996704, "reward_std": 0.17526723444461823, "rewards/accuracy_reward_stage2": 0.5547182559967041, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 683 }, { "completion_length": 18.984375, "epoch": 0.11985281233572806, "grad_norm": 21.291836763245577, "kl": 0.0341796875, "learning_rate": 8.803224110741195e-07, "loss": 0.0136, "reward": 1.7058907747268677, "reward_std": 0.16722923517227173, "rewards/accuracy_reward_stage2": 0.7058907747268677, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 684 }, { "completion_length": 9.9375, "epoch": 0.1200280357455756, "grad_norm": 10.990732505324662, "kl": 0.016845703125, "learning_rate": 8.801471876642718e-07, "loss": 0.0067, "reward": 1.7997299432754517, "reward_std": 0.08413556218147278, "rewards/accuracy_reward_stage2": 0.7997298240661621, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 685 }, { "completion_length": 6.921875, "epoch": 0.12020325915542317, "grad_norm": 11.626939281046583, "kl": 0.050537109375, "learning_rate": 8.799719642544243e-07, "loss": -0.024, "reward": 1.4167120456695557, "reward_std": 0.14424577355384827, "rewards/accuracy_reward_stage2": 0.4323369860649109, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 686 }, { "completion_length": 9.53125, "epoch": 0.12037848256527071, "grad_norm": 23.820420447554763, "kl": 0.0208740234375, "learning_rate": 8.797967408445768e-07, "loss": 0.0084, "reward": 1.7314132452011108, "reward_std": 0.2912830412387848, "rewards/accuracy_reward_stage2": 0.7314131855964661, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 687 }, { "completion_length": 8.421875, "epoch": 0.12055370597511827, "grad_norm": 25.086453810433557, "kl": 0.17578125, "learning_rate": 8.796215174347292e-07, "loss": -0.0079, "reward": 1.603489637374878, "reward_std": 0.16268977522850037, "rewards/accuracy_reward_stage2": 0.6347395181655884, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 688 }, { "completion_length": 18.28125, "epoch": 0.12072892938496584, "grad_norm": 22.8929620555088, "kl": 0.224609375, "learning_rate": 8.794462940248817e-07, "loss": 0.0898, "reward": 1.3817732334136963, "reward_std": 0.3009188175201416, "rewards/accuracy_reward_stage2": 0.5067732334136963, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 689 }, { "completion_length": 7.46875, "epoch": 0.12090415279481338, "grad_norm": 14.703281851075689, "kl": 0.056884765625, "learning_rate": 8.792710706150341e-07, "loss": -0.0184, "reward": 1.5921062231063843, "reward_std": 0.19487033784389496, "rewards/accuracy_reward_stage2": 0.6077312231063843, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 690 }, { "completion_length": 9.34375, "epoch": 0.12107937620466094, "grad_norm": 20.135150925561963, "kl": 0.0712890625, "learning_rate": 8.790958472051866e-07, "loss": 0.0285, "reward": 1.5368764400482178, "reward_std": 0.29632821679115295, "rewards/accuracy_reward_stage2": 0.5368764996528625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 691 }, { "completion_length": 19.84375, "epoch": 0.1212545996145085, "grad_norm": 23.68837901261757, "kl": 0.03955078125, "learning_rate": 8.789206237953391e-07, "loss": 0.0158, "reward": 1.5159263610839844, "reward_std": 0.1295163780450821, "rewards/accuracy_reward_stage2": 0.5159264206886292, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 692 }, { "completion_length": 7.828125, "epoch": 0.12142982302435605, "grad_norm": 20.499563576511616, "kl": 0.078125, "learning_rate": 8.787454003854915e-07, "loss": -0.0018, "reward": 1.6112077236175537, "reward_std": 0.27959445118904114, "rewards/accuracy_reward_stage2": 0.6268327236175537, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 693 }, { "completion_length": 9.171875, "epoch": 0.12160504643420361, "grad_norm": 15.815312044878585, "kl": 0.11474609375, "learning_rate": 8.78570176975644e-07, "loss": 0.0458, "reward": 1.4532971382141113, "reward_std": 0.22508825361728668, "rewards/accuracy_reward_stage2": 0.4532972276210785, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 694 }, { "completion_length": 14.328125, "epoch": 0.12178026984405116, "grad_norm": 21.787091332554773, "kl": 0.051513671875, "learning_rate": 8.783949535657964e-07, "loss": -0.022, "reward": 1.662217617034912, "reward_std": 0.21388936042785645, "rewards/accuracy_reward_stage2": 0.6778424978256226, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 695 }, { "completion_length": 9.140625, "epoch": 0.12195549325389872, "grad_norm": 24.559787032583806, "kl": 0.08447265625, "learning_rate": 8.782197301559487e-07, "loss": -0.0419, "reward": 1.6562061309814453, "reward_std": 0.3602675795555115, "rewards/accuracy_reward_stage2": 0.6874561905860901, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 696 }, { "completion_length": 10.109375, "epoch": 0.12213071666374628, "grad_norm": 20.95511007029554, "kl": 0.0228271484375, "learning_rate": 8.780445067461012e-07, "loss": 0.0091, "reward": 1.3176965713500977, "reward_std": 0.13378259539604187, "rewards/accuracy_reward_stage2": 0.5676966309547424, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 697 }, { "completion_length": 12.0625, "epoch": 0.12230594007359383, "grad_norm": 21.98721888405554, "kl": 0.2421875, "learning_rate": 8.778692833362536e-07, "loss": 0.0969, "reward": 1.233135461807251, "reward_std": 0.12779664993286133, "rewards/accuracy_reward_stage2": 0.483135461807251, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 698 }, { "completion_length": 21.8125, "epoch": 0.12248116348344139, "grad_norm": 16.042916838294357, "kl": 0.020751953125, "learning_rate": 8.776940599264061e-07, "loss": 0.0083, "reward": 1.2561603784561157, "reward_std": 0.10535544157028198, "rewards/accuracy_reward_stage2": 0.3811603784561157, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 699 }, { "completion_length": 7.5, "epoch": 0.12265638689328895, "grad_norm": 6.820091036219988, "kl": 0.0162353515625, "learning_rate": 8.775188365165586e-07, "loss": -0.0377, "reward": 1.359375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward_stage2": 0.375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 700 }, { "completion_length": 19.34375, "epoch": 0.1228316103031365, "grad_norm": 26.132183313864143, "kl": 0.10107421875, "learning_rate": 8.77343613106711e-07, "loss": -0.0037, "reward": 1.4717097282409668, "reward_std": 0.33925962448120117, "rewards/accuracy_reward_stage2": 0.612334668636322, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 701 }, { "completion_length": 12.765625, "epoch": 0.12300683371298406, "grad_norm": 23.06283365798747, "kl": 0.36328125, "learning_rate": 8.771683896968635e-07, "loss": 0.1453, "reward": 1.3122766017913818, "reward_std": 0.2527972161769867, "rewards/accuracy_reward_stage2": 0.5622766017913818, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 702 }, { "completion_length": 14.0625, "epoch": 0.1231820571228316, "grad_norm": 21.00144003227733, "kl": 0.051513671875, "learning_rate": 8.76993166287016e-07, "loss": 0.0206, "reward": 1.542011022567749, "reward_std": 0.21726180613040924, "rewards/accuracy_reward_stage2": 0.542011022567749, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 703 }, { "completion_length": 16.015625, "epoch": 0.12335728053267916, "grad_norm": 19.729022686331653, "kl": 0.03564453125, "learning_rate": 8.768179428771684e-07, "loss": 0.0142, "reward": 1.6703829765319824, "reward_std": 0.14785850048065186, "rewards/accuracy_reward_stage2": 0.6703829169273376, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 704 }, { "completion_length": 12.265625, "epoch": 0.12353250394252673, "grad_norm": 16.84608154286035, "kl": 0.068359375, "learning_rate": 8.766427194673208e-07, "loss": -0.0499, "reward": 1.5830492973327637, "reward_std": 0.22908511757850647, "rewards/accuracy_reward_stage2": 0.6142992973327637, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 705 }, { "completion_length": 9.578125, "epoch": 0.12370772735237427, "grad_norm": 17.416974869118555, "kl": 0.17578125, "learning_rate": 8.764674960574732e-07, "loss": 0.0703, "reward": 1.3125, "reward_std": 0.1552036553621292, "rewards/accuracy_reward_stage2": 0.4375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 706 }, { "completion_length": 11.453125, "epoch": 0.12388295076222183, "grad_norm": 19.79245621798528, "kl": 0.03173828125, "learning_rate": 8.762922726476257e-07, "loss": -0.0604, "reward": 1.5851845741271973, "reward_std": 0.2783687710762024, "rewards/accuracy_reward_stage2": 0.6164345741271973, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 707 }, { "completion_length": 16.046875, "epoch": 0.1240581741720694, "grad_norm": 17.210066397566948, "kl": 0.0751953125, "learning_rate": 8.761170492377782e-07, "loss": -0.0125, "reward": 1.521512508392334, "reward_std": 0.23234155774116516, "rewards/accuracy_reward_stage2": 0.537137508392334, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 708 }, { "completion_length": 13.171875, "epoch": 0.12423339758191694, "grad_norm": 37.86191898225597, "kl": 0.283203125, "learning_rate": 8.759418258279305e-07, "loss": 0.1262, "reward": 1.515625, "reward_std": 0.17358146607875824, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 709 }, { "completion_length": 9.078125, "epoch": 0.1244086209917645, "grad_norm": 20.46731923834019, "kl": 0.02783203125, "learning_rate": 8.75766602418083e-07, "loss": 0.0112, "reward": 1.5376777648925781, "reward_std": 0.19346807897090912, "rewards/accuracy_reward_stage2": 0.6626777648925781, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 710 }, { "completion_length": 7.734375, "epoch": 0.12458384440161206, "grad_norm": 27.163653480328417, "kl": 0.054443359375, "learning_rate": 8.755913790082355e-07, "loss": -0.0225, "reward": 1.4373842477798462, "reward_std": 0.24737679958343506, "rewards/accuracy_reward_stage2": 0.4530092477798462, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 711 }, { "completion_length": 10.1875, "epoch": 0.12475906781145961, "grad_norm": 24.58636403392442, "kl": 0.047607421875, "learning_rate": 8.754161555983879e-07, "loss": 0.0191, "reward": 1.4863896369934082, "reward_std": 0.309207558631897, "rewards/accuracy_reward_stage2": 0.4863896369934082, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 712 }, { "completion_length": 9.984375, "epoch": 0.12493429122130717, "grad_norm": 23.66613349846893, "kl": 0.0390625, "learning_rate": 8.752409321885404e-07, "loss": -0.0281, "reward": 1.6329691410064697, "reward_std": 0.22451892495155334, "rewards/accuracy_reward_stage2": 0.6485941410064697, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 713 }, { "completion_length": 8.921875, "epoch": 0.12510951463115472, "grad_norm": 22.08982170844661, "kl": 0.125, "learning_rate": 8.750657087786927e-07, "loss": 0.0083, "reward": 1.7729127407073975, "reward_std": 0.22566775977611542, "rewards/accuracy_reward_stage2": 0.7885376811027527, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 714 }, { "completion_length": 10.96875, "epoch": 0.1252847380410023, "grad_norm": 15.29384643846092, "kl": 0.059814453125, "learning_rate": 8.748904853688452e-07, "loss": -0.021, "reward": 1.3826444149017334, "reward_std": 0.2546432912349701, "rewards/accuracy_reward_stage2": 0.4138944447040558, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 715 }, { "completion_length": 9.25, "epoch": 0.12545996145084984, "grad_norm": 22.25942061435046, "kl": 0.125, "learning_rate": 8.747152619589977e-07, "loss": -0.016, "reward": 1.588760256767273, "reward_std": 0.27730366587638855, "rewards/accuracy_reward_stage2": 0.854385256767273, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 716 }, { "completion_length": 11.9375, "epoch": 0.12563518486069739, "grad_norm": 14.361927992686452, "kl": 0.0517578125, "learning_rate": 8.745400385491501e-07, "loss": 0.0207, "reward": 1.789434552192688, "reward_std": 0.07365694642066956, "rewards/accuracy_reward_stage2": 0.789434552192688, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 717 }, { "completion_length": 15.640625, "epoch": 0.12581040827054493, "grad_norm": 24.44647306222038, "kl": 0.15625, "learning_rate": 8.743648151393026e-07, "loss": 0.0408, "reward": 1.407584309577942, "reward_std": 0.14255505800247192, "rewards/accuracy_reward_stage2": 0.6732093691825867, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 718 }, { "completion_length": 12.5, "epoch": 0.1259856316803925, "grad_norm": 23.288244008568896, "kl": 0.091796875, "learning_rate": 8.741895917294551e-07, "loss": 0.0052, "reward": 1.5710008144378662, "reward_std": 0.24018634855747223, "rewards/accuracy_reward_stage2": 0.5866257548332214, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 719 }, { "completion_length": 29.15625, "epoch": 0.12616085509024005, "grad_norm": 15.350773622109923, "kl": 0.048828125, "learning_rate": 8.740143683196075e-07, "loss": 0.0196, "reward": 1.360360026359558, "reward_std": 0.15090447664260864, "rewards/accuracy_reward_stage2": 0.4853600561618805, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 720 }, { "completion_length": 9.359375, "epoch": 0.1263360785000876, "grad_norm": 18.454024671060672, "kl": 0.115234375, "learning_rate": 8.738391449097599e-07, "loss": 0.0461, "reward": 1.3818800449371338, "reward_std": 0.16841940581798553, "rewards/accuracy_reward_stage2": 0.5068800449371338, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 721 }, { "completion_length": 8.265625, "epoch": 0.12651130190993518, "grad_norm": 21.160670778734957, "kl": 0.046142578125, "learning_rate": 8.736639214999123e-07, "loss": 0.0185, "reward": 1.6255755424499512, "reward_std": 0.22785469889640808, "rewards/accuracy_reward_stage2": 0.6255755424499512, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 722 }, { "completion_length": 9.59375, "epoch": 0.12668652531978272, "grad_norm": 22.416171315171262, "kl": 0.09521484375, "learning_rate": 8.734886980900648e-07, "loss": -0.0139, "reward": 1.3465453386306763, "reward_std": 0.19358889758586884, "rewards/accuracy_reward_stage2": 0.3777953088283539, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 723 }, { "completion_length": 8.0625, "epoch": 0.12686174872963027, "grad_norm": 19.61083070842855, "kl": 0.0242919921875, "learning_rate": 8.733134746802173e-07, "loss": 0.0097, "reward": 1.5615843534469604, "reward_std": 0.19774232804775238, "rewards/accuracy_reward_stage2": 0.5615843534469604, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 724 }, { "completion_length": 10.0625, "epoch": 0.12703697213947784, "grad_norm": 13.690877366216116, "kl": 0.11279296875, "learning_rate": 8.731382512703696e-07, "loss": 0.0451, "reward": 1.3139506578445435, "reward_std": 0.13442152738571167, "rewards/accuracy_reward_stage2": 0.5639506578445435, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 725 }, { "completion_length": 8.828125, "epoch": 0.1272121955493254, "grad_norm": 18.62268067901579, "kl": 0.267578125, "learning_rate": 8.729630278605221e-07, "loss": 0.094, "reward": 1.4589645862579346, "reward_std": 0.18263386189937592, "rewards/accuracy_reward_stage2": 0.7245896458625793, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 726 }, { "completion_length": 15.21875, "epoch": 0.12738741895917294, "grad_norm": 23.71829613698294, "kl": 0.061279296875, "learning_rate": 8.727878044506745e-07, "loss": 0.0245, "reward": 1.4625771045684814, "reward_std": 0.17948952317237854, "rewards/accuracy_reward_stage2": 0.5875771045684814, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 727 }, { "completion_length": 8.171875, "epoch": 0.1275626423690205, "grad_norm": 15.271887827957038, "kl": 0.03173828125, "learning_rate": 8.72612581040827e-07, "loss": -0.0669, "reward": 1.7864583730697632, "reward_std": 0.23897382616996765, "rewards/accuracy_reward_stage2": 0.8177083134651184, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 728 }, { "completion_length": 14.65625, "epoch": 0.12773786577886806, "grad_norm": 17.48789694547715, "kl": 0.173828125, "learning_rate": 8.724373576309795e-07, "loss": 0.0255, "reward": 1.3286545276641846, "reward_std": 0.13117444515228271, "rewards/accuracy_reward_stage2": 0.46927955746650696, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 729 }, { "completion_length": 12.328125, "epoch": 0.1279130891887156, "grad_norm": 23.96242945197875, "kl": 0.060546875, "learning_rate": 8.722621342211319e-07, "loss": -0.0112, "reward": 1.722947120666504, "reward_std": 0.23015879094600677, "rewards/accuracy_reward_stage2": 0.7385720610618591, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 730 }, { "completion_length": 11.234375, "epoch": 0.12808831259856318, "grad_norm": 14.144418959421241, "kl": 0.06689453125, "learning_rate": 8.720869108112844e-07, "loss": -0.0175, "reward": 1.603621006011963, "reward_std": 0.16533318161964417, "rewards/accuracy_reward_stage2": 0.6192460656166077, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 731 }, { "completion_length": 12.03125, "epoch": 0.12826353600841073, "grad_norm": 18.61312573748659, "kl": 0.052734375, "learning_rate": 8.719116874014369e-07, "loss": 0.0048, "reward": 1.4712536334991455, "reward_std": 0.15344056487083435, "rewards/accuracy_reward_stage2": 0.611878514289856, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 732 }, { "completion_length": 8.625, "epoch": 0.12843875941825827, "grad_norm": 17.441433878109546, "kl": 0.050537109375, "learning_rate": 8.717364639915893e-07, "loss": 0.0202, "reward": 1.3996421098709106, "reward_std": 0.2853778600692749, "rewards/accuracy_reward_stage2": 0.5246421694755554, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 733 }, { "completion_length": 11.15625, "epoch": 0.12861398282810582, "grad_norm": 24.980574455079825, "kl": 0.189453125, "learning_rate": 8.715612405817416e-07, "loss": 0.0045, "reward": 1.6333026885986328, "reward_std": 0.26741790771484375, "rewards/accuracy_reward_stage2": 0.7895527482032776, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 734 }, { "completion_length": 11.109375, "epoch": 0.1287892062379534, "grad_norm": 50.068285086941245, "kl": 0.255859375, "learning_rate": 8.71386017171894e-07, "loss": 0.0624, "reward": 1.6093500852584839, "reward_std": 0.18894584476947784, "rewards/accuracy_reward_stage2": 0.6249750852584839, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 735 }, { "completion_length": 10.46875, "epoch": 0.12896442964780094, "grad_norm": 16.488258189073505, "kl": 0.0198974609375, "learning_rate": 8.712107937620465e-07, "loss": -0.005, "reward": 1.542205572128296, "reward_std": 0.139317587018013, "rewards/accuracy_reward_stage2": 0.5578306913375854, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 736 }, { "completion_length": 11.5625, "epoch": 0.1291396530576485, "grad_norm": 34.105571787658626, "kl": 0.0439453125, "learning_rate": 8.71035570352199e-07, "loss": -0.0257, "reward": 1.604642391204834, "reward_std": 0.18298792839050293, "rewards/accuracy_reward_stage2": 0.7452673316001892, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 737 }, { "completion_length": 11.609375, "epoch": 0.12931487646749606, "grad_norm": 23.348912361605613, "kl": 0.0751953125, "learning_rate": 8.708603469423514e-07, "loss": 0.0301, "reward": 1.808213710784912, "reward_std": 0.1791677325963974, "rewards/accuracy_reward_stage2": 0.8082137107849121, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 738 }, { "completion_length": 11.59375, "epoch": 0.1294900998773436, "grad_norm": 21.941403584613536, "kl": 0.07470703125, "learning_rate": 8.706851235325039e-07, "loss": 0.0299, "reward": 1.4388988018035889, "reward_std": 0.20526067912578583, "rewards/accuracy_reward_stage2": 0.43889886140823364, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 739 }, { "completion_length": 13.375, "epoch": 0.12966532328719116, "grad_norm": 21.150108215201115, "kl": 0.06298828125, "learning_rate": 8.705099001226564e-07, "loss": -0.019, "reward": 1.7087092399597168, "reward_std": 0.2277108132839203, "rewards/accuracy_reward_stage2": 0.7243342399597168, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 740 }, { "completion_length": 8.78125, "epoch": 0.12984054669703873, "grad_norm": 16.65691581922075, "kl": 0.0224609375, "learning_rate": 8.703346767128088e-07, "loss": -0.0352, "reward": 1.3385417461395264, "reward_std": 0.13152071833610535, "rewards/accuracy_reward_stage2": 0.3541666567325592, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 741 }, { "completion_length": 9.9375, "epoch": 0.13001577010688628, "grad_norm": 17.551427941488406, "kl": 0.09033203125, "learning_rate": 8.701594533029613e-07, "loss": 0.0361, "reward": 1.6374205350875854, "reward_std": 0.1315646469593048, "rewards/accuracy_reward_stage2": 0.762420654296875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 742 }, { "completion_length": 11.859375, "epoch": 0.13019099351673383, "grad_norm": 19.7964491837443, "kl": 0.0147705078125, "learning_rate": 8.699842298931137e-07, "loss": 0.0059, "reward": 1.7438607215881348, "reward_std": 0.0982588678598404, "rewards/accuracy_reward_stage2": 0.7438607811927795, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 743 }, { "completion_length": 11.375, "epoch": 0.1303662169265814, "grad_norm": 15.932552856525444, "kl": 0.054931640625, "learning_rate": 8.698090064832662e-07, "loss": 0.022, "reward": 1.5394132137298584, "reward_std": 0.10703323036432266, "rewards/accuracy_reward_stage2": 0.5394132137298584, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 744 }, { "completion_length": 8.40625, "epoch": 0.13054144033642895, "grad_norm": 18.34859237019442, "kl": 0.023193359375, "learning_rate": 8.696337830734186e-07, "loss": 0.0093, "reward": 1.5473082065582275, "reward_std": 0.2439563274383545, "rewards/accuracy_reward_stage2": 0.6723082065582275, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 745 }, { "completion_length": 7.46875, "epoch": 0.1307166637462765, "grad_norm": 24.168096070523397, "kl": 0.13671875, "learning_rate": 8.69458559663571e-07, "loss": 0.0316, "reward": 1.4857832193374634, "reward_std": 0.2854121923446655, "rewards/accuracy_reward_stage2": 0.5014082193374634, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 746 }, { "completion_length": 42.578125, "epoch": 0.13089188715612407, "grad_norm": 23.125506213020195, "kl": 0.1162109375, "learning_rate": 8.692833362537234e-07, "loss": 0.0466, "reward": 1.48673677444458, "reward_std": 0.24000920355319977, "rewards/accuracy_reward_stage2": 0.4867367744445801, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 747 }, { "completion_length": 7.421875, "epoch": 0.13106711056597162, "grad_norm": 18.38991310797759, "kl": 0.05078125, "learning_rate": 8.691081128438759e-07, "loss": 0.0203, "reward": 1.53125, "reward_std": 0.22461533546447754, "rewards/accuracy_reward_stage2": 0.65625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 748 }, { "completion_length": 15.359375, "epoch": 0.13124233397581916, "grad_norm": 30.97799225674079, "kl": 0.1630859375, "learning_rate": 8.689328894340283e-07, "loss": 0.0653, "reward": 1.4413138628005981, "reward_std": 0.22766022384166718, "rewards/accuracy_reward_stage2": 0.5663139224052429, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 749 }, { "completion_length": 9.9375, "epoch": 0.13141755738566674, "grad_norm": 15.125011554316435, "kl": 0.06201171875, "learning_rate": 8.687576660241808e-07, "loss": 0.0248, "reward": 1.7050046920776367, "reward_std": 0.11424589902162552, "rewards/accuracy_reward_stage2": 0.7050046324729919, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 750 }, { "completion_length": 7.0625, "epoch": 0.13159278079551429, "grad_norm": 20.962470991988887, "kl": 0.059814453125, "learning_rate": 8.685824426143332e-07, "loss": 0.0239, "reward": 1.678983211517334, "reward_std": 0.19982093572616577, "rewards/accuracy_reward_stage2": 0.678983211517334, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 751 }, { "completion_length": 14.6875, "epoch": 0.13176800420536183, "grad_norm": 385.8362173270209, "kl": 1.5625, "learning_rate": 8.684072192044857e-07, "loss": 0.6266, "reward": 1.203751564025879, "reward_std": 0.18451911211013794, "rewards/accuracy_reward_stage2": 0.4537515342235565, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 752 }, { "completion_length": 7.046875, "epoch": 0.13194322761520938, "grad_norm": 10.755434994237342, "kl": 0.005523681640625, "learning_rate": 8.682319957946382e-07, "loss": -0.042, "reward": 1.6046037673950195, "reward_std": 0.0576893612742424, "rewards/accuracy_reward_stage2": 0.6202287077903748, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 753 }, { "completion_length": 12.109375, "epoch": 0.13211845102505695, "grad_norm": 20.873092091818556, "kl": 0.1015625, "learning_rate": 8.680567723847905e-07, "loss": -0.0454, "reward": 1.3576622009277344, "reward_std": 0.25579172372817993, "rewards/accuracy_reward_stage2": 0.3889121115207672, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 754 }, { "completion_length": 10.21875, "epoch": 0.1322936744349045, "grad_norm": 23.23743047363629, "kl": 0.08740234375, "learning_rate": 8.67881548974943e-07, "loss": -0.0093, "reward": 1.6699610948562622, "reward_std": 0.2566087543964386, "rewards/accuracy_reward_stage2": 0.6855860948562622, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 755 }, { "completion_length": 7.328125, "epoch": 0.13246889784475205, "grad_norm": 20.70546052020614, "kl": 0.140625, "learning_rate": 8.677063255650955e-07, "loss": 0.0561, "reward": 1.5664639472961426, "reward_std": 0.17884564399719238, "rewards/accuracy_reward_stage2": 0.6914640069007874, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 756 }, { "completion_length": 11.5, "epoch": 0.13264412125459962, "grad_norm": 20.88423913244621, "kl": 0.244140625, "learning_rate": 8.675311021552479e-07, "loss": 0.1123, "reward": 1.1417410373687744, "reward_std": 0.1422545462846756, "rewards/accuracy_reward_stage2": 0.3917410969734192, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 757 }, { "completion_length": 9.015625, "epoch": 0.13281934466444717, "grad_norm": 18.603362315080933, "kl": 0.07763671875, "learning_rate": 8.673558787454004e-07, "loss": -0.0318, "reward": 1.6441401243209839, "reward_std": 0.2794819474220276, "rewards/accuracy_reward_stage2": 0.6753901243209839, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 758 }, { "completion_length": 10.109375, "epoch": 0.13299456807429472, "grad_norm": 20.419572301243445, "kl": 0.048828125, "learning_rate": 8.671806553355527e-07, "loss": 0.0195, "reward": 1.6706733703613281, "reward_std": 0.17810383439064026, "rewards/accuracy_reward_stage2": 0.6706732511520386, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 759 }, { "completion_length": 11.234375, "epoch": 0.1331697914841423, "grad_norm": 25.885192159188893, "kl": 0.04833984375, "learning_rate": 8.670054319257052e-07, "loss": -0.0248, "reward": 1.4116337299346924, "reward_std": 0.2717921733856201, "rewards/accuracy_reward_stage2": 0.5522586107254028, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 760 }, { "completion_length": 11.96875, "epoch": 0.13334501489398984, "grad_norm": 18.97680260993474, "kl": 0.068359375, "learning_rate": 8.668302085158577e-07, "loss": -0.0168, "reward": 1.7507286071777344, "reward_std": 0.17205798625946045, "rewards/accuracy_reward_stage2": 0.7663537263870239, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 761 }, { "completion_length": 9.796875, "epoch": 0.13352023830383739, "grad_norm": 16.862781802573416, "kl": 0.04833984375, "learning_rate": 8.666549851060101e-07, "loss": -0.014, "reward": 1.2919607162475586, "reward_std": 0.22537767887115479, "rewards/accuracy_reward_stage2": 0.3075857162475586, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 762 }, { "completion_length": 9.0, "epoch": 0.13369546171368496, "grad_norm": 13.026380521166306, "kl": 0.0380859375, "learning_rate": 8.664797616961626e-07, "loss": -0.0596, "reward": 1.5166369676589966, "reward_std": 0.19904598593711853, "rewards/accuracy_reward_stage2": 0.6728869676589966, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 763 }, { "completion_length": 16.875, "epoch": 0.1338706851235325, "grad_norm": 20.06725931064897, "kl": 0.09423828125, "learning_rate": 8.66304538286315e-07, "loss": 0.0377, "reward": 1.4519935846328735, "reward_std": 0.21910575032234192, "rewards/accuracy_reward_stage2": 0.5769935846328735, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 764 }, { "completion_length": 12.96875, "epoch": 0.13404590853338005, "grad_norm": 21.98710630877217, "kl": 0.0361328125, "learning_rate": 8.661293148764674e-07, "loss": 0.0144, "reward": 1.4242117404937744, "reward_std": 0.29334086179733276, "rewards/accuracy_reward_stage2": 0.6742118000984192, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 765 }, { "completion_length": 17.03125, "epoch": 0.13422113194322763, "grad_norm": 20.425342566894855, "kl": 0.12451171875, "learning_rate": 8.659540914666199e-07, "loss": 0.0499, "reward": 1.2799370288848877, "reward_std": 0.18060024082660675, "rewards/accuracy_reward_stage2": 0.4049370586872101, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 766 }, { "completion_length": 10.234375, "epoch": 0.13439635535307518, "grad_norm": 18.339545297817114, "kl": 0.0380859375, "learning_rate": 8.657788680567723e-07, "loss": 0.0152, "reward": 1.4003806114196777, "reward_std": 0.18729178607463837, "rewards/accuracy_reward_stage2": 0.5253806710243225, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 767 }, { "completion_length": 11.953125, "epoch": 0.13457157876292272, "grad_norm": 25.01143507989166, "kl": 0.0458984375, "learning_rate": 8.656036446469248e-07, "loss": 0.0184, "reward": 1.4315991401672363, "reward_std": 0.2495567500591278, "rewards/accuracy_reward_stage2": 0.5565991401672363, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 768 }, { "completion_length": 8.5, "epoch": 0.13474680217277027, "grad_norm": 22.864818091288953, "kl": 0.05224609375, "learning_rate": 8.654284212370773e-07, "loss": 0.0209, "reward": 1.5170884132385254, "reward_std": 0.2796719968318939, "rewards/accuracy_reward_stage2": 0.5170884132385254, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 769 }, { "completion_length": 8.375, "epoch": 0.13492202558261784, "grad_norm": 19.1641610611154, "kl": 0.08349609375, "learning_rate": 8.652531978272297e-07, "loss": 0.0334, "reward": 1.553787350654602, "reward_std": 0.22851644456386566, "rewards/accuracy_reward_stage2": 0.553787350654602, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 770 }, { "completion_length": 10.21875, "epoch": 0.1350972489924654, "grad_norm": 19.934467521338437, "kl": 0.0537109375, "learning_rate": 8.650779744173822e-07, "loss": -0.0216, "reward": 1.4445722103118896, "reward_std": 0.2468641996383667, "rewards/accuracy_reward_stage2": 0.47582218050956726, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 771 }, { "completion_length": 10.9375, "epoch": 0.13527247240231294, "grad_norm": 13.701970829268843, "kl": 0.06982421875, "learning_rate": 8.649027510075346e-07, "loss": 0.0279, "reward": 1.4476916790008545, "reward_std": 0.12115681171417236, "rewards/accuracy_reward_stage2": 0.5726916790008545, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 772 }, { "completion_length": 10.109375, "epoch": 0.1354476958121605, "grad_norm": 18.066531116861547, "kl": 0.29296875, "learning_rate": 8.64727527597687e-07, "loss": 0.0854, "reward": 1.426032543182373, "reward_std": 0.21603938937187195, "rewards/accuracy_reward_stage2": 0.566657543182373, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 773 }, { "completion_length": 9.09375, "epoch": 0.13562291922200806, "grad_norm": 16.71484519661642, "kl": 0.1220703125, "learning_rate": 8.645523041878394e-07, "loss": -0.0187, "reward": 1.326295256614685, "reward_std": 0.24095875024795532, "rewards/accuracy_reward_stage2": 0.48254525661468506, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 774 }, { "completion_length": 7.9375, "epoch": 0.1357981426318556, "grad_norm": 19.917155663833235, "kl": 0.09326171875, "learning_rate": 8.643770807779918e-07, "loss": -0.0468, "reward": 1.4075981378555298, "reward_std": 0.23457500338554382, "rewards/accuracy_reward_stage2": 0.563848078250885, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 775 }, { "completion_length": 5.484375, "epoch": 0.13597336604170318, "grad_norm": 21.14923073069886, "kl": 0.1435546875, "learning_rate": 8.642018573681443e-07, "loss": 0.0134, "reward": 1.250086784362793, "reward_std": 0.19084270298480988, "rewards/accuracy_reward_stage2": 0.26571181416511536, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 776 }, { "completion_length": 10.0, "epoch": 0.13614858945155073, "grad_norm": 24.381992165789516, "kl": 0.07763671875, "learning_rate": 8.640266339582968e-07, "loss": -0.0125, "reward": 1.502030611038208, "reward_std": 0.2354775220155716, "rewards/accuracy_reward_stage2": 0.517655611038208, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 777 }, { "completion_length": 12.875, "epoch": 0.13632381286139827, "grad_norm": 19.508155852089544, "kl": 0.0478515625, "learning_rate": 8.638514105484492e-07, "loss": -0.0251, "reward": 1.6137158870697021, "reward_std": 0.21956676244735718, "rewards/accuracy_reward_stage2": 0.6293408870697021, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 778 }, { "completion_length": 13.265625, "epoch": 0.13649903627124585, "grad_norm": 16.56376238689246, "kl": 0.06640625, "learning_rate": 8.636761871386017e-07, "loss": 0.0266, "reward": 0.9685095548629761, "reward_std": 0.17524616420269012, "rewards/accuracy_reward_stage2": 0.2185094952583313, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 779 }, { "completion_length": 7.4375, "epoch": 0.1366742596810934, "grad_norm": 16.918923047432216, "kl": 0.0693359375, "learning_rate": 8.635009637287542e-07, "loss": 0.0278, "reward": 1.5394725799560547, "reward_std": 0.1202455535531044, "rewards/accuracy_reward_stage2": 0.6644724607467651, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 780 }, { "completion_length": 8.359375, "epoch": 0.13684948309094094, "grad_norm": 25.712473621519223, "kl": 0.0223388671875, "learning_rate": 8.633257403189066e-07, "loss": 0.0089, "reward": 1.6351406574249268, "reward_std": 0.1843852698802948, "rewards/accuracy_reward_stage2": 0.6351406574249268, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 781 }, { "completion_length": 9.328125, "epoch": 0.13702470650078852, "grad_norm": 15.671890042035438, "kl": 0.021484375, "learning_rate": 8.631505169090591e-07, "loss": 0.0023, "reward": 1.4858630895614624, "reward_std": 0.12837354838848114, "rewards/accuracy_reward_stage2": 0.6108630895614624, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 782 }, { "completion_length": 10.984375, "epoch": 0.13719992991063606, "grad_norm": 20.30097845606878, "kl": 0.12255859375, "learning_rate": 8.629752934992115e-07, "loss": 0.0126, "reward": 1.5040788650512695, "reward_std": 0.2858182489871979, "rewards/accuracy_reward_stage2": 0.5197038650512695, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 783 }, { "completion_length": 9.203125, "epoch": 0.1373751533204836, "grad_norm": 26.037028675184818, "kl": 0.10009765625, "learning_rate": 8.62800070089364e-07, "loss": -0.0872, "reward": 1.5257666110992432, "reward_std": 0.29896122217178345, "rewards/accuracy_reward_stage2": 0.5726416110992432, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 784 }, { "completion_length": 11.796875, "epoch": 0.13755037673033116, "grad_norm": 22.34753247816572, "kl": 0.061279296875, "learning_rate": 8.626248466795163e-07, "loss": 0.0246, "reward": 1.5781292915344238, "reward_std": 0.14735251665115356, "rewards/accuracy_reward_stage2": 0.5781292915344238, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 785 }, { "completion_length": 9.640625, "epoch": 0.13772560014017873, "grad_norm": 21.799564810232653, "kl": 0.01904296875, "learning_rate": 8.624496232696687e-07, "loss": 0.0076, "reward": 1.606956958770752, "reward_std": 0.2769896984100342, "rewards/accuracy_reward_stage2": 0.6069568395614624, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 786 }, { "completion_length": 8.125, "epoch": 0.13790082355002628, "grad_norm": 15.977768888328422, "kl": 0.05078125, "learning_rate": 8.622743998598212e-07, "loss": 0.0202, "reward": 1.5291006565093994, "reward_std": 0.22113242745399475, "rewards/accuracy_reward_stage2": 0.5291005373001099, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 787 }, { "completion_length": 9.625, "epoch": 0.13807604695987383, "grad_norm": 24.547348319862273, "kl": 0.10791015625, "learning_rate": 8.620991764499737e-07, "loss": 0.0431, "reward": 1.7083591222763062, "reward_std": 0.20269346237182617, "rewards/accuracy_reward_stage2": 0.7083591222763062, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 788 }, { "completion_length": 7.203125, "epoch": 0.1382512703697214, "grad_norm": 9.08047735946729, "kl": 0.009521484375, "learning_rate": 8.619239530401261e-07, "loss": 0.0038, "reward": 1.3945484161376953, "reward_std": 0.012436339631676674, "rewards/accuracy_reward_stage2": 0.3945484161376953, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 789 }, { "completion_length": 8.859375, "epoch": 0.13842649377956895, "grad_norm": 22.58653722630102, "kl": 0.0859375, "learning_rate": 8.617487296302786e-07, "loss": 0.001, "reward": 1.6167235374450684, "reward_std": 0.26534900069236755, "rewards/accuracy_reward_stage2": 0.6323485374450684, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 790 }, { "completion_length": 7.90625, "epoch": 0.1386017171894165, "grad_norm": 25.853314882203968, "kl": 0.1376953125, "learning_rate": 8.61573506220431e-07, "loss": -0.032, "reward": 1.5990020036697388, "reward_std": 0.3643280267715454, "rewards/accuracy_reward_stage2": 0.6458768844604492, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 791 }, { "completion_length": 17.46875, "epoch": 0.13877694059926407, "grad_norm": 22.407136209035535, "kl": 0.09716796875, "learning_rate": 8.613982828105835e-07, "loss": -0.0052, "reward": 1.6806354522705078, "reward_std": 0.2374819815158844, "rewards/accuracy_reward_stage2": 0.6962604522705078, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 792 }, { "completion_length": 7.734375, "epoch": 0.13895216400911162, "grad_norm": 22.541418888160006, "kl": 0.1689453125, "learning_rate": 8.61223059400736e-07, "loss": 0.0677, "reward": 1.4915530681610107, "reward_std": 0.15434393286705017, "rewards/accuracy_reward_stage2": 0.6165530681610107, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 793 }, { "completion_length": 9.453125, "epoch": 0.13912738741895916, "grad_norm": 20.819147207825274, "kl": 0.2265625, "learning_rate": 8.610478359908883e-07, "loss": 0.011, "reward": 1.2268931865692139, "reward_std": 0.28804537653923035, "rewards/accuracy_reward_stage2": 0.38314324617385864, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 794 }, { "completion_length": 8.765625, "epoch": 0.13930261082880674, "grad_norm": 18.854480783612633, "kl": 0.0595703125, "learning_rate": 8.608726125810408e-07, "loss": -0.0646, "reward": 1.582197904586792, "reward_std": 0.2034272849559784, "rewards/accuracy_reward_stage2": 0.613447904586792, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 795 }, { "completion_length": 12.375, "epoch": 0.13947783423865429, "grad_norm": 19.002903839692486, "kl": 0.078125, "learning_rate": 8.606973891711933e-07, "loss": 0.0312, "reward": 1.512641429901123, "reward_std": 0.20425836741924286, "rewards/accuracy_reward_stage2": 0.5126413702964783, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 796 }, { "completion_length": 9.9375, "epoch": 0.13965305764850183, "grad_norm": 24.877281554306364, "kl": 0.240234375, "learning_rate": 8.605221657613457e-07, "loss": 0.0961, "reward": 1.546425700187683, "reward_std": 0.27219367027282715, "rewards/accuracy_reward_stage2": 0.6714255809783936, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 797 }, { "completion_length": 8.984375, "epoch": 0.1398282810583494, "grad_norm": 23.992827145250047, "kl": 0.07958984375, "learning_rate": 8.603469423514981e-07, "loss": -0.0549, "reward": 1.747768521308899, "reward_std": 0.25258371233940125, "rewards/accuracy_reward_stage2": 0.7790185213088989, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 798 }, { "completion_length": 5.484375, "epoch": 0.14000350446819695, "grad_norm": 21.729766925610043, "kl": 0.06689453125, "learning_rate": 8.601717189416505e-07, "loss": 0.0267, "reward": 1.8072917461395264, "reward_std": 0.1921348124742508, "rewards/accuracy_reward_stage2": 0.8072916865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 799 }, { "completion_length": 9.21875, "epoch": 0.1401787278780445, "grad_norm": 21.856434941735007, "kl": 0.047607421875, "learning_rate": 8.59996495531803e-07, "loss": 0.019, "reward": 1.5184874534606934, "reward_std": 0.10601860284805298, "rewards/accuracy_reward_stage2": 0.5184873938560486, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 800 }, { "completion_length": 11.5, "epoch": 0.14035395128789208, "grad_norm": 24.821815379910944, "kl": 0.0869140625, "learning_rate": 8.598212721219555e-07, "loss": -0.0094, "reward": 1.6020259857177734, "reward_std": 0.25857317447662354, "rewards/accuracy_reward_stage2": 0.7426510453224182, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 801 }, { "completion_length": 10.890625, "epoch": 0.14052917469773962, "grad_norm": 16.76642314310539, "kl": 0.041748046875, "learning_rate": 8.596460487121079e-07, "loss": 0.0167, "reward": 1.5585997104644775, "reward_std": 0.10192655771970749, "rewards/accuracy_reward_stage2": 0.5585997104644775, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 802 }, { "completion_length": 13.1875, "epoch": 0.14070439810758717, "grad_norm": 41.3087590593791, "kl": 0.2392578125, "learning_rate": 8.594708253022604e-07, "loss": 0.0622, "reward": 1.2422547340393066, "reward_std": 0.24303309619426727, "rewards/accuracy_reward_stage2": 0.3828798234462738, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 803 }, { "completion_length": 10.078125, "epoch": 0.14087962151743472, "grad_norm": 21.448868861727423, "kl": 0.1103515625, "learning_rate": 8.592956018924127e-07, "loss": -0.0151, "reward": 1.6200649738311768, "reward_std": 0.2951427698135376, "rewards/accuracy_reward_stage2": 0.6513150334358215, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 804 }, { "completion_length": 11.8125, "epoch": 0.1410548449272823, "grad_norm": 20.462640582577773, "kl": 0.041259765625, "learning_rate": 8.591203784825652e-07, "loss": 0.0166, "reward": 1.4762279987335205, "reward_std": 0.20103763043880463, "rewards/accuracy_reward_stage2": 0.4762280285358429, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 805 }, { "completion_length": 9.0, "epoch": 0.14123006833712984, "grad_norm": 17.430891887761103, "kl": 0.2119140625, "learning_rate": 8.589451550727177e-07, "loss": 0.0849, "reward": 1.347902774810791, "reward_std": 0.15564867854118347, "rewards/accuracy_reward_stage2": 0.5979026556015015, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 806 }, { "completion_length": 9.484375, "epoch": 0.14140529174697739, "grad_norm": 21.318720238994356, "kl": 0.12451171875, "learning_rate": 8.587699316628701e-07, "loss": -0.0666, "reward": 1.6182410717010498, "reward_std": 0.2888680100440979, "rewards/accuracy_reward_stage2": 0.665116012096405, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 807 }, { "completion_length": 9.640625, "epoch": 0.14158051515682496, "grad_norm": 1125.174475735105, "kl": 0.703125, "learning_rate": 8.585947082530226e-07, "loss": 0.2047, "reward": 1.3404356241226196, "reward_std": 0.15762722492218018, "rewards/accuracy_reward_stage2": 0.49668562412261963, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 808 }, { "completion_length": 13.03125, "epoch": 0.1417557385666725, "grad_norm": 24.047162978568288, "kl": 0.1376953125, "learning_rate": 8.584194848431751e-07, "loss": 0.0551, "reward": 1.5736751556396484, "reward_std": 0.2321637123823166, "rewards/accuracy_reward_stage2": 0.5736752152442932, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 809 }, { "completion_length": 14.578125, "epoch": 0.14193096197652005, "grad_norm": 16.987052420807263, "kl": 0.0308837890625, "learning_rate": 8.582442614333274e-07, "loss": -0.0231, "reward": 1.4605519771575928, "reward_std": 0.2577477991580963, "rewards/accuracy_reward_stage2": 0.6011769771575928, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 810 }, { "completion_length": 8.625, "epoch": 0.14210618538636763, "grad_norm": 23.109130665223027, "kl": 0.2392578125, "learning_rate": 8.580690380234799e-07, "loss": 0.0646, "reward": 1.4605047702789307, "reward_std": 0.27034828066825867, "rewards/accuracy_reward_stage2": 0.7261297106742859, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 811 }, { "completion_length": 10.140625, "epoch": 0.14228140879621518, "grad_norm": 16.502025071920517, "kl": 0.11474609375, "learning_rate": 8.578938146136323e-07, "loss": -0.0425, "reward": 1.5705902576446533, "reward_std": 0.1804032325744629, "rewards/accuracy_reward_stage2": 0.6018401980400085, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 812 }, { "completion_length": 7.875, "epoch": 0.14245663220606272, "grad_norm": 7.228563122762854, "kl": 0.00823974609375, "learning_rate": 8.577185912037847e-07, "loss": -0.0409, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward_stage2": 0.765625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 813 }, { "completion_length": 12.6875, "epoch": 0.1426318556159103, "grad_norm": 18.22673671823769, "kl": 0.10693359375, "learning_rate": 8.575433677939372e-07, "loss": -0.0646, "reward": 1.5301176309585571, "reward_std": 0.2613670229911804, "rewards/accuracy_reward_stage2": 0.5769926309585571, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 814 }, { "completion_length": 13.328125, "epoch": 0.14280707902575784, "grad_norm": 17.23263122771429, "kl": 0.037353515625, "learning_rate": 8.573681443840896e-07, "loss": 0.0149, "reward": 1.6051459312438965, "reward_std": 0.15718679130077362, "rewards/accuracy_reward_stage2": 0.605146050453186, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 815 }, { "completion_length": 6.40625, "epoch": 0.1429823024356054, "grad_norm": 16.364062402792296, "kl": 0.08642578125, "learning_rate": 8.571929209742421e-07, "loss": -0.0096, "reward": 1.6421375274658203, "reward_std": 0.16563481092453003, "rewards/accuracy_reward_stage2": 0.6577625274658203, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 816 }, { "completion_length": 8.078125, "epoch": 0.14315752584545297, "grad_norm": 21.1458214999794, "kl": 0.12109375, "learning_rate": 8.570176975643946e-07, "loss": 0.0485, "reward": 1.6554884910583496, "reward_std": 0.18612952530384064, "rewards/accuracy_reward_stage2": 0.6554884910583496, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 817 }, { "completion_length": 14.75, "epoch": 0.1433327492553005, "grad_norm": 26.119798669726237, "kl": 0.01556396484375, "learning_rate": 8.56842474154547e-07, "loss": 0.0062, "reward": 1.6673200130462646, "reward_std": 0.29138341546058655, "rewards/accuracy_reward_stage2": 0.6673198938369751, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 818 }, { "completion_length": 12.75, "epoch": 0.14350797266514806, "grad_norm": 17.38674288999391, "kl": 0.0150146484375, "learning_rate": 8.566672507446995e-07, "loss": 0.006, "reward": 1.7542085647583008, "reward_std": 0.29505500197410583, "rewards/accuracy_reward_stage2": 0.7542085647583008, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 819 }, { "completion_length": 8.4375, "epoch": 0.1436831960749956, "grad_norm": 21.875096570193925, "kl": 0.038330078125, "learning_rate": 8.564920273348519e-07, "loss": 0.0153, "reward": 1.4888389110565186, "reward_std": 0.27264735102653503, "rewards/accuracy_reward_stage2": 0.6138389706611633, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 820 }, { "completion_length": 10.6875, "epoch": 0.14385841948484318, "grad_norm": 10.463498557034104, "kl": 0.03759765625, "learning_rate": 8.563168039250044e-07, "loss": 0.015, "reward": 1.4326601028442383, "reward_std": 0.030810590833425522, "rewards/accuracy_reward_stage2": 0.4326601028442383, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 821 }, { "completion_length": 10.765625, "epoch": 0.14403364289469073, "grad_norm": 20.18488498768583, "kl": 0.1806640625, "learning_rate": 8.561415805151569e-07, "loss": 0.0725, "reward": 1.5884678363800049, "reward_std": 0.2005475014448166, "rewards/accuracy_reward_stage2": 0.7134678363800049, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 822 }, { "completion_length": 10.9375, "epoch": 0.14420886630453827, "grad_norm": 16.441621674165216, "kl": 0.02734375, "learning_rate": 8.559663571053091e-07, "loss": 0.011, "reward": 1.583137035369873, "reward_std": 0.22141794860363007, "rewards/accuracy_reward_stage2": 0.583137035369873, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 823 }, { "completion_length": 11.796875, "epoch": 0.14438408971438585, "grad_norm": 17.290747179596657, "kl": 0.07568359375, "learning_rate": 8.557911336954616e-07, "loss": -0.0138, "reward": 1.5539031028747559, "reward_std": 0.15610679984092712, "rewards/accuracy_reward_stage2": 0.6945281624794006, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 824 }, { "completion_length": 9.421875, "epoch": 0.1445593131242334, "grad_norm": 13.948054655485015, "kl": 0.0281982421875, "learning_rate": 8.556159102856141e-07, "loss": -0.0329, "reward": 1.8782212734222412, "reward_std": 0.12385688722133636, "rewards/accuracy_reward_stage2": 0.8938462734222412, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 825 }, { "completion_length": 13.625, "epoch": 0.14473453653408094, "grad_norm": 16.201832903904826, "kl": 0.1796875, "learning_rate": 8.554406868757665e-07, "loss": 0.0718, "reward": 1.442164421081543, "reward_std": 0.09635643661022186, "rewards/accuracy_reward_stage2": 0.5671643614768982, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 826 }, { "completion_length": 8.296875, "epoch": 0.14490975994392852, "grad_norm": 24.460771275572338, "kl": 0.05029296875, "learning_rate": 8.55265463465919e-07, "loss": 0.0201, "reward": 1.6931114196777344, "reward_std": 0.35366058349609375, "rewards/accuracy_reward_stage2": 0.6931114196777344, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 827 }, { "completion_length": 10.234375, "epoch": 0.14508498335377606, "grad_norm": 24.600109281918645, "kl": 0.1005859375, "learning_rate": 8.550902400560714e-07, "loss": -0.0306, "reward": 1.4568983316421509, "reward_std": 0.3055163323879242, "rewards/accuracy_reward_stage2": 0.4881483018398285, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 828 }, { "completion_length": 14.203125, "epoch": 0.1452602067636236, "grad_norm": 20.48543817048711, "kl": 0.049560546875, "learning_rate": 8.549150166462239e-07, "loss": 0.0198, "reward": 1.5052090883255005, "reward_std": 0.1374053657054901, "rewards/accuracy_reward_stage2": 0.5052090287208557, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 829 }, { "completion_length": 8.015625, "epoch": 0.1454354301734712, "grad_norm": 18.253365710861136, "kl": 0.12060546875, "learning_rate": 8.547397932363764e-07, "loss": 0.0483, "reward": 1.5600733757019043, "reward_std": 0.14701932668685913, "rewards/accuracy_reward_stage2": 0.5600734949111938, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 830 }, { "completion_length": 10.484375, "epoch": 0.14561065358331873, "grad_norm": 15.571074165782326, "kl": 0.0458984375, "learning_rate": 8.545645698265288e-07, "loss": 0.012, "reward": 1.5902339220046997, "reward_std": 0.07367925345897675, "rewards/accuracy_reward_stage2": 0.7152339220046997, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 831 }, { "completion_length": 14.140625, "epoch": 0.14578587699316628, "grad_norm": 51.077344535450706, "kl": 0.1884765625, "learning_rate": 8.543893464166813e-07, "loss": 0.0313, "reward": 1.1851284503936768, "reward_std": 0.21774765849113464, "rewards/accuracy_reward_stage2": 0.3257533311843872, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 832 }, { "completion_length": 10.328125, "epoch": 0.14596110040301385, "grad_norm": 24.55740055745479, "kl": 0.09814453125, "learning_rate": 8.542141230068338e-07, "loss": 0.0062, "reward": 1.5574613809585571, "reward_std": 0.357519268989563, "rewards/accuracy_reward_stage2": 0.5730863213539124, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 833 }, { "completion_length": 16.515625, "epoch": 0.1461363238128614, "grad_norm": 15.594915814044413, "kl": 0.0546875, "learning_rate": 8.540388995969861e-07, "loss": -0.0201, "reward": 1.5986196994781494, "reward_std": 0.13833385705947876, "rewards/accuracy_reward_stage2": 0.6142447590827942, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 834 }, { "completion_length": 8.84375, "epoch": 0.14631154722270895, "grad_norm": 14.683705793659195, "kl": 0.052734375, "learning_rate": 8.538636761871386e-07, "loss": 0.0211, "reward": 1.8413678407669067, "reward_std": 0.10930629074573517, "rewards/accuracy_reward_stage2": 0.8413679003715515, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 835 }, { "completion_length": 14.640625, "epoch": 0.14648677063255652, "grad_norm": 16.716252647384675, "kl": 0.044921875, "learning_rate": 8.536884527772909e-07, "loss": -0.0704, "reward": 1.5871949195861816, "reward_std": 0.18210293352603912, "rewards/accuracy_reward_stage2": 0.6184448599815369, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 836 }, { "completion_length": 8.703125, "epoch": 0.14666199404240407, "grad_norm": 14.968499708337697, "kl": 0.0625, "learning_rate": 8.535132293674434e-07, "loss": 0.025, "reward": 1.6192355155944824, "reward_std": 0.13152600824832916, "rewards/accuracy_reward_stage2": 0.619235634803772, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 837 }, { "completion_length": 14.25, "epoch": 0.14683721745225162, "grad_norm": 22.841885046960563, "kl": 0.050048828125, "learning_rate": 8.533380059575959e-07, "loss": -0.0335, "reward": 1.5910993814468384, "reward_std": 0.18110352754592896, "rewards/accuracy_reward_stage2": 0.6223493814468384, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 838 }, { "completion_length": 9.828125, "epoch": 0.14701244086209916, "grad_norm": 20.15835105543159, "kl": 0.0908203125, "learning_rate": 8.531627825477483e-07, "loss": 0.0363, "reward": 1.7946337461471558, "reward_std": 0.18121492862701416, "rewards/accuracy_reward_stage2": 0.794633686542511, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 839 }, { "completion_length": 12.734375, "epoch": 0.14718766427194674, "grad_norm": 14.433049590453853, "kl": 0.06884765625, "learning_rate": 8.529875591379008e-07, "loss": 0.0276, "reward": 1.7972837686538696, "reward_std": 0.14100028574466705, "rewards/accuracy_reward_stage2": 0.7972837686538696, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 840 }, { "completion_length": 10.140625, "epoch": 0.14736288768179429, "grad_norm": 26.699421426615817, "kl": 0.1845703125, "learning_rate": 8.528123357280533e-07, "loss": 0.0182, "reward": 1.4089363813400269, "reward_std": 0.26331624388694763, "rewards/accuracy_reward_stage2": 0.44018638134002686, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 841 }, { "completion_length": 9.59375, "epoch": 0.14753811109164183, "grad_norm": 17.884239561013167, "kl": 0.0615234375, "learning_rate": 8.526371123182057e-07, "loss": -0.0197, "reward": 1.5981876850128174, "reward_std": 0.2483270913362503, "rewards/accuracy_reward_stage2": 0.6138126254081726, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 842 }, { "completion_length": 9.78125, "epoch": 0.1477133345014894, "grad_norm": 15.32685599876304, "kl": 0.0400390625, "learning_rate": 8.524618889083582e-07, "loss": 0.016, "reward": 1.6972908973693848, "reward_std": 0.09243768453598022, "rewards/accuracy_reward_stage2": 0.6972908973693848, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 843 }, { "completion_length": 23.53125, "epoch": 0.14788855791133695, "grad_norm": 20.172619708324884, "kl": 0.072265625, "learning_rate": 8.522866654985105e-07, "loss": -0.0064, "reward": 1.3043758869171143, "reward_std": 0.23132845759391785, "rewards/accuracy_reward_stage2": 0.44500094652175903, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 844 }, { "completion_length": 6.78125, "epoch": 0.1480637813211845, "grad_norm": 14.049334173065237, "kl": 0.08935546875, "learning_rate": 8.52111442088663e-07, "loss": 0.0357, "reward": 1.6144025325775146, "reward_std": 0.06398695707321167, "rewards/accuracy_reward_stage2": 0.6144025325775146, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 845 }, { "completion_length": 6.671875, "epoch": 0.14823900473103208, "grad_norm": 9.94233896025104, "kl": 0.03564453125, "learning_rate": 8.519362186788155e-07, "loss": -0.0299, "reward": 1.5922174453735352, "reward_std": 0.07707421481609344, "rewards/accuracy_reward_stage2": 0.6078425049781799, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 846 }, { "completion_length": 24.484375, "epoch": 0.14841422814087962, "grad_norm": 20.63713262817355, "kl": 0.16796875, "learning_rate": 8.517609952689679e-07, "loss": -0.0195, "reward": 1.3995568752288818, "reward_std": 0.3477005660533905, "rewards/accuracy_reward_stage2": 0.5558068156242371, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 847 }, { "completion_length": 15.78125, "epoch": 0.14858945155072717, "grad_norm": 20.264020472537457, "kl": 0.044677734375, "learning_rate": 8.515857718591204e-07, "loss": 0.0179, "reward": 1.6210176944732666, "reward_std": 0.20520731806755066, "rewards/accuracy_reward_stage2": 0.621017575263977, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 848 }, { "completion_length": 14.3125, "epoch": 0.14876467496057474, "grad_norm": 9.600883326034557, "kl": 0.06640625, "learning_rate": 8.514105484492728e-07, "loss": 0.0266, "reward": 1.5826388597488403, "reward_std": 0.08929072320461273, "rewards/accuracy_reward_stage2": 0.5826388597488403, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 849 }, { "completion_length": 12.046875, "epoch": 0.1489398983704223, "grad_norm": 16.59982588749903, "kl": 0.1318359375, "learning_rate": 8.512353250394252e-07, "loss": -0.0356, "reward": 1.71842360496521, "reward_std": 0.25606924295425415, "rewards/accuracy_reward_stage2": 0.76529860496521, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 850 }, { "completion_length": 15.203125, "epoch": 0.14911512178026984, "grad_norm": 25.532052798300754, "kl": 0.2490234375, "learning_rate": 8.510601016295777e-07, "loss": 0.066, "reward": 1.5083532333374023, "reward_std": 0.27988600730895996, "rewards/accuracy_reward_stage2": 0.6489783525466919, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 851 }, { "completion_length": 10.5, "epoch": 0.1492903451901174, "grad_norm": 12.56724203301642, "kl": 0.0213623046875, "learning_rate": 8.508848782197301e-07, "loss": -0.0204, "reward": 1.546875, "reward_std": 0.16887323558330536, "rewards/accuracy_reward_stage2": 0.6875, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 852 }, { "completion_length": 10.78125, "epoch": 0.14946556859996496, "grad_norm": 17.700019567042943, "kl": 0.01123046875, "learning_rate": 8.507096548098825e-07, "loss": -0.0397, "reward": 1.685826063156128, "reward_std": 0.13276247680187225, "rewards/accuracy_reward_stage2": 0.7014511227607727, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 853 }, { "completion_length": 10.015625, "epoch": 0.1496407920098125, "grad_norm": 13.651544327069582, "kl": 0.01806640625, "learning_rate": 8.50534431400035e-07, "loss": -0.037, "reward": 1.03125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward_stage2": 0.296875, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 854 }, { "completion_length": 11.953125, "epoch": 0.14981601541966005, "grad_norm": 24.999448901305616, "kl": 0.05517578125, "learning_rate": 8.503592079901874e-07, "loss": -0.0788, "reward": 1.5360831022262573, "reward_std": 0.34290796518325806, "rewards/accuracy_reward_stage2": 0.5829581618309021, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 855 }, { "completion_length": 9.078125, "epoch": 0.14999123882950763, "grad_norm": 16.70769597449124, "kl": 0.05859375, "learning_rate": 8.501839845803399e-07, "loss": -0.0649, "reward": 1.453125, "reward_std": 0.308285653591156, "rewards/accuracy_reward_stage2": 0.484375, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 856 }, { "completion_length": 13.453125, "epoch": 0.15016646223935518, "grad_norm": 17.40194584377364, "kl": 0.111328125, "learning_rate": 8.500087611704924e-07, "loss": 0.0166, "reward": 1.473811149597168, "reward_std": 0.22556474804878235, "rewards/accuracy_reward_stage2": 0.6144360303878784, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 857 }, { "completion_length": 7.65625, "epoch": 0.15034168564920272, "grad_norm": 22.17923847228564, "kl": 0.07421875, "learning_rate": 8.498335377606448e-07, "loss": -0.0146, "reward": 1.602588415145874, "reward_std": 0.2952241003513336, "rewards/accuracy_reward_stage2": 0.7432133555412292, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 858 }, { "completion_length": 7.796875, "epoch": 0.1505169090590503, "grad_norm": 15.660354355947947, "kl": 0.03759765625, "learning_rate": 8.496583143507973e-07, "loss": 0.015, "reward": 1.6695375442504883, "reward_std": 0.17127437889575958, "rewards/accuracy_reward_stage2": 0.6695374846458435, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 859 }, { "completion_length": 11.90625, "epoch": 0.15069213246889784, "grad_norm": 21.1108280984021, "kl": 0.048583984375, "learning_rate": 8.494830909409497e-07, "loss": -0.0199, "reward": 1.5457628965377808, "reward_std": 0.22555433213710785, "rewards/accuracy_reward_stage2": 0.561387836933136, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 860 }, { "completion_length": 9.328125, "epoch": 0.1508673558787454, "grad_norm": 19.86410560849335, "kl": 0.0634765625, "learning_rate": 8.493078675311021e-07, "loss": -0.0189, "reward": 1.4820592403411865, "reward_std": 0.20549719035625458, "rewards/accuracy_reward_stage2": 0.6226842403411865, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 861 }, { "completion_length": 11.46875, "epoch": 0.15104257928859297, "grad_norm": 15.593853856372352, "kl": 0.04296875, "learning_rate": 8.491326441212546e-07, "loss": -0.0765, "reward": 1.6471900939941406, "reward_std": 0.2422153800725937, "rewards/accuracy_reward_stage2": 0.6940651535987854, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 862 }, { "completion_length": 12.953125, "epoch": 0.1512178026984405, "grad_norm": 18.740518266665386, "kl": 0.11865234375, "learning_rate": 8.489574207114069e-07, "loss": 0.0186, "reward": 1.422258734703064, "reward_std": 0.24776926636695862, "rewards/accuracy_reward_stage2": 0.4378837049007416, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 863 }, { "completion_length": 10.734375, "epoch": 0.15139302610828806, "grad_norm": 20.18460613831639, "kl": 0.052734375, "learning_rate": 8.487821973015594e-07, "loss": 0.0211, "reward": 1.3923512697219849, "reward_std": 0.31910377740859985, "rewards/accuracy_reward_stage2": 0.39235132932662964, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 864 }, { "completion_length": 20.421875, "epoch": 0.15156824951813563, "grad_norm": 19.765628340117512, "kl": 0.1044921875, "learning_rate": 8.486069738917118e-07, "loss": 0.0021, "reward": 1.2905793190002441, "reward_std": 0.1945052146911621, "rewards/accuracy_reward_stage2": 0.5562041997909546, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 865 }, { "completion_length": 11.65625, "epoch": 0.15174347292798318, "grad_norm": 23.360094504906623, "kl": 0.171875, "learning_rate": 8.484317504818643e-07, "loss": -0.0099, "reward": 1.372206449508667, "reward_std": 0.3116145730018616, "rewards/accuracy_reward_stage2": 0.544081449508667, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 866 }, { "completion_length": 8.109375, "epoch": 0.15191869633783073, "grad_norm": 21.28420476628764, "kl": 0.12255859375, "learning_rate": 8.482565270720168e-07, "loss": -0.0343, "reward": 1.772420883178711, "reward_std": 0.27512839436531067, "rewards/accuracy_reward_stage2": 0.8036710619926453, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 867 }, { "completion_length": 12.890625, "epoch": 0.1520939197476783, "grad_norm": 13.792981077834618, "kl": 0.0111083984375, "learning_rate": 8.480813036621692e-07, "loss": 0.0044, "reward": 1.7508642673492432, "reward_std": 0.050229497253894806, "rewards/accuracy_reward_stage2": 0.7508642673492432, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 868 }, { "completion_length": 9.0, "epoch": 0.15226914315752585, "grad_norm": 20.417553629433254, "kl": 0.04248046875, "learning_rate": 8.479060802523217e-07, "loss": -0.0271, "reward": 1.6638281345367432, "reward_std": 0.1942887008190155, "rewards/accuracy_reward_stage2": 0.6794531941413879, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 869 }, { "completion_length": 10.046875, "epoch": 0.1524443665673734, "grad_norm": 15.65798750344904, "kl": 0.0546875, "learning_rate": 8.477308568424742e-07, "loss": 0.0219, "reward": 1.4875478744506836, "reward_std": 0.13235189020633698, "rewards/accuracy_reward_stage2": 0.4875478744506836, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 870 }, { "completion_length": 8.609375, "epoch": 0.15261958997722094, "grad_norm": 18.692786962291954, "kl": 0.107421875, "learning_rate": 8.475556334326266e-07, "loss": -0.0761, "reward": 1.7316548824310303, "reward_std": 0.24129626154899597, "rewards/accuracy_reward_stage2": 0.778529942035675, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 871 }, { "completion_length": 11.21875, "epoch": 0.15279481338706852, "grad_norm": 36.74883998202966, "kl": 0.1533203125, "learning_rate": 8.473804100227791e-07, "loss": -0.0271, "reward": 1.359375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward_stage2": 0.515625, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 872 }, { "completion_length": 19.890625, "epoch": 0.15297003679691606, "grad_norm": 19.386833518301408, "kl": 0.017333984375, "learning_rate": 8.472051866129316e-07, "loss": 0.0069, "reward": 1.4666125774383545, "reward_std": 0.1620863825082779, "rewards/accuracy_reward_stage2": 0.4666125476360321, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 873 }, { "completion_length": 10.96875, "epoch": 0.1531452602067636, "grad_norm": 24.861548593450724, "kl": 0.064453125, "learning_rate": 8.470299632030838e-07, "loss": 0.0258, "reward": 1.605391502380371, "reward_std": 0.24627715349197388, "rewards/accuracy_reward_stage2": 0.6053914427757263, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 874 }, { "completion_length": 9.25, "epoch": 0.1533204836166112, "grad_norm": 23.9841767692309, "kl": 0.08740234375, "learning_rate": 8.468547397932363e-07, "loss": -0.0403, "reward": 1.6885817050933838, "reward_std": 0.39420628547668457, "rewards/accuracy_reward_stage2": 0.7198317050933838, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 875 }, { "completion_length": 11.59375, "epoch": 0.15349570702645873, "grad_norm": 18.208345060683527, "kl": 0.11279296875, "learning_rate": 8.466795163833887e-07, "loss": 0.0008, "reward": 1.663696050643921, "reward_std": 0.1911657601594925, "rewards/accuracy_reward_stage2": 0.6793211102485657, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 876 }, { "completion_length": 9.3125, "epoch": 0.15367093043630628, "grad_norm": 18.970527826997365, "kl": 0.1513671875, "learning_rate": 8.465042929735412e-07, "loss": 0.0607, "reward": 1.5123016834259033, "reward_std": 0.21880435943603516, "rewards/accuracy_reward_stage2": 0.6373016834259033, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 877 }, { "completion_length": 8.625, "epoch": 0.15384615384615385, "grad_norm": 23.75217986780291, "kl": 0.11279296875, "learning_rate": 8.463290695636937e-07, "loss": -0.0357, "reward": 1.7432327270507812, "reward_std": 0.30611133575439453, "rewards/accuracy_reward_stage2": 0.7744826078414917, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 878 }, { "completion_length": 9.25, "epoch": 0.1540213772560014, "grad_norm": 20.18899090894407, "kl": 0.07421875, "learning_rate": 8.461538461538461e-07, "loss": -0.0626, "reward": 1.6393646001815796, "reward_std": 0.16706180572509766, "rewards/accuracy_reward_stage2": 0.6862396597862244, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 879 }, { "completion_length": 11.59375, "epoch": 0.15419660066584895, "grad_norm": 22.636159987310997, "kl": 0.1337890625, "learning_rate": 8.459786227439986e-07, "loss": 0.0534, "reward": 1.5441811084747314, "reward_std": 0.28956350684165955, "rewards/accuracy_reward_stage2": 0.6691809892654419, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 880 }, { "completion_length": 7.578125, "epoch": 0.15437182407569652, "grad_norm": 90.53936940551583, "kl": 0.349609375, "learning_rate": 8.45803399334151e-07, "loss": 0.0987, "reward": 1.4019708633422852, "reward_std": 0.2559443712234497, "rewards/accuracy_reward_stage2": 0.5425958037376404, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 881 }, { "completion_length": 11.4375, "epoch": 0.15454704748554407, "grad_norm": 19.73605396407536, "kl": 0.1201171875, "learning_rate": 8.456281759243035e-07, "loss": 0.048, "reward": 1.716138243675232, "reward_std": 0.25902819633483887, "rewards/accuracy_reward_stage2": 0.7161382436752319, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 882 }, { "completion_length": 13.046875, "epoch": 0.15472227089539162, "grad_norm": 25.45542294657479, "kl": 0.140625, "learning_rate": 8.45452952514456e-07, "loss": 0.0561, "reward": 1.733203411102295, "reward_std": 0.29068833589553833, "rewards/accuracy_reward_stage2": 0.7332033514976501, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 883 }, { "completion_length": 9.1875, "epoch": 0.1548974943052392, "grad_norm": 20.634460745406873, "kl": 0.054931640625, "learning_rate": 8.452777291046083e-07, "loss": 0.022, "reward": 1.4329566955566406, "reward_std": 0.2863396406173706, "rewards/accuracy_reward_stage2": 0.5579568147659302, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 884 }, { "completion_length": 8.6875, "epoch": 0.15507271771508674, "grad_norm": 24.606510940394227, "kl": 0.07421875, "learning_rate": 8.451025056947608e-07, "loss": 0.0047, "reward": 1.554135799407959, "reward_std": 0.2531256675720215, "rewards/accuracy_reward_stage2": 0.5697606801986694, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 885 }, { "completion_length": 11.875, "epoch": 0.15524794112493429, "grad_norm": 21.4730685645679, "kl": 0.138671875, "learning_rate": 8.449272822849133e-07, "loss": 0.0334, "reward": 1.5125616788864136, "reward_std": 0.1945052444934845, "rewards/accuracy_reward_stage2": 0.5281866788864136, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 886 }, { "completion_length": 11.328125, "epoch": 0.15542316453478186, "grad_norm": 21.448196825835407, "kl": 0.03125, "learning_rate": 8.447520588750656e-07, "loss": -0.0308, "reward": 1.57716965675354, "reward_std": 0.2582663893699646, "rewards/accuracy_reward_stage2": 0.59279465675354, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 887 }, { "completion_length": 10.234375, "epoch": 0.1555983879446294, "grad_norm": 23.030439291257906, "kl": 0.205078125, "learning_rate": 8.445768354652181e-07, "loss": 0.0097, "reward": 1.4215422868728638, "reward_std": 0.2988489866256714, "rewards/accuracy_reward_stage2": 0.7027922868728638, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 888 }, { "completion_length": 7.484375, "epoch": 0.15577361135447695, "grad_norm": 17.137018589062855, "kl": 0.09228515625, "learning_rate": 8.444016120553705e-07, "loss": 0.0307, "reward": 1.6334525346755981, "reward_std": 0.19943121075630188, "rewards/accuracy_reward_stage2": 0.6490775942802429, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 889 }, { "completion_length": 7.46875, "epoch": 0.1559488347643245, "grad_norm": 21.648857530110167, "kl": 0.10595703125, "learning_rate": 8.44226388645523e-07, "loss": 0.0069, "reward": 1.5306628942489624, "reward_std": 0.2571268379688263, "rewards/accuracy_reward_stage2": 0.5462879538536072, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 890 }, { "completion_length": 8.65625, "epoch": 0.15612405817417208, "grad_norm": 20.660084693200343, "kl": 0.011474609375, "learning_rate": 8.440511652356755e-07, "loss": 0.0046, "reward": 1.5625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward_stage2": 0.5625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 891 }, { "completion_length": 10.84375, "epoch": 0.15629928158401962, "grad_norm": 16.66053317450666, "kl": 0.333984375, "learning_rate": 8.438759418258279e-07, "loss": 0.089, "reward": 1.17367422580719, "reward_std": 0.2187984734773636, "rewards/accuracy_reward_stage2": 0.31429922580718994, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 892 }, { "completion_length": 7.75, "epoch": 0.15647450499386717, "grad_norm": 16.696832713661816, "kl": 0.0986328125, "learning_rate": 8.437007184159803e-07, "loss": -0.0046, "reward": 1.5517808198928833, "reward_std": 0.1454845815896988, "rewards/accuracy_reward_stage2": 0.5674057602882385, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 893 }, { "completion_length": 13.90625, "epoch": 0.15664972840371474, "grad_norm": 18.134303764193838, "kl": 0.0184326171875, "learning_rate": 8.435254950061328e-07, "loss": -0.0657, "reward": 1.4824566841125488, "reward_std": 0.10660809278488159, "rewards/accuracy_reward_stage2": 0.5137066841125488, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 894 }, { "completion_length": 11.546875, "epoch": 0.1568249518135623, "grad_norm": 17.47601020099436, "kl": 0.193359375, "learning_rate": 8.433502715962852e-07, "loss": 0.0331, "reward": 1.3741912841796875, "reward_std": 0.16278903186321259, "rewards/accuracy_reward_stage2": 0.514816164970398, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 895 }, { "completion_length": 11.109375, "epoch": 0.15700017522340984, "grad_norm": 21.664510515701206, "kl": 0.08544921875, "learning_rate": 8.431750481864377e-07, "loss": 0.0051, "reward": 1.4644746780395508, "reward_std": 0.28877538442611694, "rewards/accuracy_reward_stage2": 0.4800996482372284, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 896 }, { "completion_length": 8.21875, "epoch": 0.1571753986332574, "grad_norm": 20.922720090845928, "kl": 0.08203125, "learning_rate": 8.429998247765901e-07, "loss": 0.001, "reward": 1.4854588508605957, "reward_std": 0.16852207481861115, "rewards/accuracy_reward_stage2": 0.5010839104652405, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 897 }, { "completion_length": 10.09375, "epoch": 0.15735062204310496, "grad_norm": 20.48289200755099, "kl": 0.08203125, "learning_rate": 8.428246013667426e-07, "loss": 0.0329, "reward": 1.5313962697982788, "reward_std": 0.14356286823749542, "rewards/accuracy_reward_stage2": 0.6563963294029236, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 898 }, { "completion_length": 15.921875, "epoch": 0.1575258454529525, "grad_norm": 21.49636538136034, "kl": 0.11376953125, "learning_rate": 8.426493779568951e-07, "loss": 0.0085, "reward": 1.56718111038208, "reward_std": 0.15211787819862366, "rewards/accuracy_reward_stage2": 0.5828061103820801, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 899 }, { "completion_length": 9.859375, "epoch": 0.15770106886280008, "grad_norm": 24.25647930376624, "kl": 0.0615234375, "learning_rate": 8.424741545470474e-07, "loss": -0.0028, "reward": 1.715989589691162, "reward_std": 0.2878607511520386, "rewards/accuracy_reward_stage2": 0.7316147089004517, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 900 }, { "completion_length": 10.75, "epoch": 0.15787629227264763, "grad_norm": 19.215338734644746, "kl": 0.0179443359375, "learning_rate": 8.422989311371999e-07, "loss": -0.0262, "reward": 1.4582719802856445, "reward_std": 0.32733142375946045, "rewards/accuracy_reward_stage2": 0.5988969802856445, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 901 }, { "completion_length": 14.046875, "epoch": 0.15805151568249518, "grad_norm": 23.475241913665684, "kl": 0.05029296875, "learning_rate": 8.421237077273524e-07, "loss": -0.024, "reward": 1.646759033203125, "reward_std": 0.231346994638443, "rewards/accuracy_reward_stage2": 0.662384033203125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 902 }, { "completion_length": 8.859375, "epoch": 0.15822673909234275, "grad_norm": 19.976502771099383, "kl": 0.1640625, "learning_rate": 8.419484843175047e-07, "loss": -0.1622, "reward": 1.475749135017395, "reward_std": 0.40651267766952515, "rewards/accuracy_reward_stage2": 0.569499135017395, "rewards/format_reward_stage1_pointerpad": 0.90625, "scores/accuracy_reward_stage2": 0.90625, "step": 903 }, { "completion_length": 16.453125, "epoch": 0.1584019625021903, "grad_norm": 11.343613400181686, "kl": 0.058837890625, "learning_rate": 8.417732609076572e-07, "loss": -0.0648, "reward": 1.4348111152648926, "reward_std": 0.1632448136806488, "rewards/accuracy_reward_stage2": 0.4660611152648926, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 904 }, { "completion_length": 9.171875, "epoch": 0.15857718591203784, "grad_norm": 19.639344625728686, "kl": 0.083984375, "learning_rate": 8.415980374978096e-07, "loss": -0.0086, "reward": 1.4668049812316895, "reward_std": 0.10977669060230255, "rewards/accuracy_reward_stage2": 0.4824299216270447, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 905 }, { "completion_length": 18.25, "epoch": 0.1587524093218854, "grad_norm": 20.04975456715171, "kl": 0.107421875, "learning_rate": 8.414228140879621e-07, "loss": 0.0046, "reward": 1.3382542133331299, "reward_std": 0.23458905518054962, "rewards/accuracy_reward_stage2": 0.47887933254241943, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 906 }, { "completion_length": 11.21875, "epoch": 0.15892763273173297, "grad_norm": 20.769525897884474, "kl": 0.0625, "learning_rate": 8.412475906781146e-07, "loss": -0.0192, "reward": 1.5473453998565674, "reward_std": 0.1797097623348236, "rewards/accuracy_reward_stage2": 0.5629702806472778, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 907 }, { "completion_length": 9.15625, "epoch": 0.1591028561415805, "grad_norm": 19.600389057571586, "kl": 0.06298828125, "learning_rate": 8.41072367268267e-07, "loss": -0.0082, "reward": 1.6035189628601074, "reward_std": 0.16768498718738556, "rewards/accuracy_reward_stage2": 0.6191439628601074, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 908 }, { "completion_length": 8.765625, "epoch": 0.15927807955142806, "grad_norm": 42.576306532869665, "kl": 0.38671875, "learning_rate": 8.408971438584195e-07, "loss": 0.1551, "reward": 1.4866942167282104, "reward_std": 0.23718145489692688, "rewards/accuracy_reward_stage2": 0.6116942763328552, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 909 }, { "completion_length": 11.09375, "epoch": 0.15945330296127563, "grad_norm": 20.471116146597804, "kl": 0.0286865234375, "learning_rate": 8.40721920448572e-07, "loss": 0.0115, "reward": 1.4572649002075195, "reward_std": 0.15716366469860077, "rewards/accuracy_reward_stage2": 0.4572649598121643, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 910 }, { "completion_length": 12.765625, "epoch": 0.15962852637112318, "grad_norm": 19.459024514081843, "kl": 0.1103515625, "learning_rate": 8.405466970387244e-07, "loss": 0.0006, "reward": 1.7537041902542114, "reward_std": 0.11763329803943634, "rewards/accuracy_reward_stage2": 0.7693291902542114, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 911 }, { "completion_length": 11.390625, "epoch": 0.15980374978097073, "grad_norm": 25.867143049061717, "kl": 0.126953125, "learning_rate": 8.403714736288767e-07, "loss": 0.0117, "reward": 1.6886465549468994, "reward_std": 0.22849583625793457, "rewards/accuracy_reward_stage2": 0.7042715549468994, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 912 }, { "completion_length": 10.375, "epoch": 0.1599789731908183, "grad_norm": 24.869482371507228, "kl": 0.036376953125, "learning_rate": 8.401962502190291e-07, "loss": -0.0296, "reward": 1.4990651607513428, "reward_std": 0.38119715452194214, "rewards/accuracy_reward_stage2": 0.6396902203559875, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 913 }, { "completion_length": 11.8125, "epoch": 0.16015419660066585, "grad_norm": 15.483895225098495, "kl": 0.130859375, "learning_rate": 8.400210268091816e-07, "loss": 0.0523, "reward": 1.4544987678527832, "reward_std": 0.18442535400390625, "rewards/accuracy_reward_stage2": 0.5794986486434937, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 914 }, { "completion_length": 18.265625, "epoch": 0.1603294200105134, "grad_norm": 23.179724791748395, "kl": 0.050048828125, "learning_rate": 8.398458033993341e-07, "loss": 0.02, "reward": 1.7503162622451782, "reward_std": 0.1276683211326599, "rewards/accuracy_reward_stage2": 0.7503161430358887, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 915 }, { "completion_length": 6.546875, "epoch": 0.16050464342036097, "grad_norm": 18.057542812191738, "kl": 0.0390625, "learning_rate": 8.396705799894865e-07, "loss": -0.1062, "reward": 1.418050765991211, "reward_std": 0.23071405291557312, "rewards/accuracy_reward_stage2": 0.46492570638656616, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 916 }, { "completion_length": 9.3125, "epoch": 0.16067986683020852, "grad_norm": 19.653550716601625, "kl": 0.0546875, "learning_rate": 8.39495356579639e-07, "loss": -0.007, "reward": 1.4791667461395264, "reward_std": 0.2661178410053253, "rewards/accuracy_reward_stage2": 0.4947916865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 917 }, { "completion_length": 19.21875, "epoch": 0.16085509024005606, "grad_norm": 14.689042788013323, "kl": 0.06396484375, "learning_rate": 8.393201331697915e-07, "loss": -0.0184, "reward": 1.3778043985366821, "reward_std": 0.18908536434173584, "rewards/accuracy_reward_stage2": 0.5184293985366821, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 918 }, { "completion_length": 10.53125, "epoch": 0.16103031364990364, "grad_norm": 24.26621589117923, "kl": 0.126953125, "learning_rate": 8.391449097599439e-07, "loss": 0.0219, "reward": 1.4894888401031494, "reward_std": 0.2673390507698059, "rewards/accuracy_reward_stage2": 0.505113959312439, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 919 }, { "completion_length": 7.390625, "epoch": 0.16120553705975119, "grad_norm": 18.517823793201053, "kl": 0.07470703125, "learning_rate": 8.389696863500964e-07, "loss": 0.0132, "reward": 1.5432384014129639, "reward_std": 0.24379214644432068, "rewards/accuracy_reward_stage2": 0.5588634014129639, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 920 }, { "completion_length": 12.3125, "epoch": 0.16138076046959873, "grad_norm": 17.44714427018299, "kl": 0.25390625, "learning_rate": 8.387944629402488e-07, "loss": 0.0665, "reward": 1.4797606468200684, "reward_std": 0.20411017537117004, "rewards/accuracy_reward_stage2": 0.6203855276107788, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 921 }, { "completion_length": 12.015625, "epoch": 0.1615559838794463, "grad_norm": 18.831065124465535, "kl": 0.060302734375, "learning_rate": 8.386192395304013e-07, "loss": 0.0241, "reward": 1.7214339971542358, "reward_std": 0.19486960768699646, "rewards/accuracy_reward_stage2": 0.7214340567588806, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 922 }, { "completion_length": 8.40625, "epoch": 0.16173120728929385, "grad_norm": 16.571351956065644, "kl": 0.08251953125, "learning_rate": 8.384440161205537e-07, "loss": -0.0004, "reward": 1.573890209197998, "reward_std": 0.15192250907421112, "rewards/accuracy_reward_stage2": 0.589515209197998, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 923 }, { "completion_length": 10.5625, "epoch": 0.1619064306991414, "grad_norm": 21.658736598327444, "kl": 0.026123046875, "learning_rate": 8.382687927107061e-07, "loss": 0.0104, "reward": 1.7161989212036133, "reward_std": 0.1691042184829712, "rewards/accuracy_reward_stage2": 0.7161988019943237, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 924 }, { "completion_length": 5.546875, "epoch": 0.16208165410898895, "grad_norm": 15.044499024913339, "kl": 0.025634765625, "learning_rate": 8.380935693008585e-07, "loss": 0.0102, "reward": 1.7326302528381348, "reward_std": 0.18306495249271393, "rewards/accuracy_reward_stage2": 0.7326301336288452, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 925 }, { "completion_length": 14.03125, "epoch": 0.16225687751883652, "grad_norm": 16.145061763378322, "kl": 0.050048828125, "learning_rate": 8.37918345891011e-07, "loss": 0.02, "reward": 1.646541953086853, "reward_std": 0.09941836446523666, "rewards/accuracy_reward_stage2": 0.771541953086853, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 926 }, { "completion_length": 12.109375, "epoch": 0.16243210092868407, "grad_norm": 22.317007705230505, "kl": 0.01904296875, "learning_rate": 8.377431224811634e-07, "loss": 0.0077, "reward": 1.5018961429595947, "reward_std": 0.20814120769500732, "rewards/accuracy_reward_stage2": 0.5018961429595947, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 927 }, { "completion_length": 11.640625, "epoch": 0.16260732433853162, "grad_norm": 15.262903698390526, "kl": 0.036376953125, "learning_rate": 8.375678990713159e-07, "loss": -0.0296, "reward": 1.6556651592254639, "reward_std": 0.18859100341796875, "rewards/accuracy_reward_stage2": 0.6712901592254639, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 928 }, { "completion_length": 8.03125, "epoch": 0.1627825477483792, "grad_norm": 17.09237768184502, "kl": 0.04638671875, "learning_rate": 8.373926756614683e-07, "loss": 0.0185, "reward": 1.5406862497329712, "reward_std": 0.18566709756851196, "rewards/accuracy_reward_stage2": 0.5406862497329712, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 929 }, { "completion_length": 14.859375, "epoch": 0.16295777115822674, "grad_norm": 18.91427736792877, "kl": 0.06787109375, "learning_rate": 8.372174522516208e-07, "loss": 0.0271, "reward": 1.4712986946105957, "reward_std": 0.097105011343956, "rewards/accuracy_reward_stage2": 0.47129860520362854, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 930 }, { "completion_length": 9.75, "epoch": 0.16313299456807429, "grad_norm": 22.340739109880076, "kl": 0.0123291015625, "learning_rate": 8.370422288417733e-07, "loss": 0.0049, "reward": 1.4322266578674316, "reward_std": 0.3248811960220337, "rewards/accuracy_reward_stage2": 0.43222665786743164, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 931 }, { "completion_length": 8.71875, "epoch": 0.16330821797792186, "grad_norm": 19.42803457345458, "kl": 0.07275390625, "learning_rate": 8.368670054319256e-07, "loss": 0.0291, "reward": 1.2958667278289795, "reward_std": 0.1573130041360855, "rewards/accuracy_reward_stage2": 0.29586678743362427, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 932 }, { "completion_length": 12.328125, "epoch": 0.1634834413877694, "grad_norm": 15.734435814984927, "kl": 0.12451171875, "learning_rate": 8.366917820220781e-07, "loss": -0.035, "reward": 1.3563389778137207, "reward_std": 0.21657343208789825, "rewards/accuracy_reward_stage2": 0.5125889778137207, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 933 }, { "completion_length": 11.765625, "epoch": 0.16365866479761695, "grad_norm": 19.72464887053107, "kl": 0.04541015625, "learning_rate": 8.365165586122306e-07, "loss": 0.0182, "reward": 1.3644483089447021, "reward_std": 0.20462146401405334, "rewards/accuracy_reward_stage2": 0.36444830894470215, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 934 }, { "completion_length": 8.5, "epoch": 0.16383388820746453, "grad_norm": 17.043259339917427, "kl": 0.0595703125, "learning_rate": 8.36341335202383e-07, "loss": 0.0239, "reward": 1.6852599382400513, "reward_std": 0.15038639307022095, "rewards/accuracy_reward_stage2": 0.6852599382400513, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 935 }, { "completion_length": 9.65625, "epoch": 0.16400911161731208, "grad_norm": 17.623879303196922, "kl": 0.0673828125, "learning_rate": 8.361661117925355e-07, "loss": 0.0269, "reward": 1.4992897510528564, "reward_std": 0.14591173827648163, "rewards/accuracy_reward_stage2": 0.49928975105285645, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 936 }, { "completion_length": 10.71875, "epoch": 0.16418433502715962, "grad_norm": 20.812568655675257, "kl": 0.051513671875, "learning_rate": 8.359908883826879e-07, "loss": 0.0206, "reward": 1.5781810283660889, "reward_std": 0.2800504267215729, "rewards/accuracy_reward_stage2": 0.5781811475753784, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 937 }, { "completion_length": 10.171875, "epoch": 0.1643595584370072, "grad_norm": 17.797227923047306, "kl": 0.0172119140625, "learning_rate": 8.358156649728403e-07, "loss": 0.0069, "reward": 1.610327124595642, "reward_std": 0.06501858681440353, "rewards/accuracy_reward_stage2": 0.7353270649909973, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 938 }, { "completion_length": 11.09375, "epoch": 0.16453478184685474, "grad_norm": 13.469802078992393, "kl": 0.048583984375, "learning_rate": 8.356404415629928e-07, "loss": 0.0194, "reward": 1.607242465019226, "reward_std": 0.07041595876216888, "rewards/accuracy_reward_stage2": 0.6072424054145813, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 939 }, { "completion_length": 8.734375, "epoch": 0.1647100052567023, "grad_norm": 17.907630506038068, "kl": 0.10595703125, "learning_rate": 8.354652181531452e-07, "loss": -0.0203, "reward": 1.6189165115356445, "reward_std": 0.20212598145008087, "rewards/accuracy_reward_stage2": 0.7751665115356445, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 940 }, { "completion_length": 22.96875, "epoch": 0.16488522866654984, "grad_norm": 24.025578235260905, "kl": 0.054931640625, "learning_rate": 8.352899947432977e-07, "loss": 0.022, "reward": 1.6079862117767334, "reward_std": 0.24116870760917664, "rewards/accuracy_reward_stage2": 0.6079861521720886, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 941 }, { "completion_length": 9.046875, "epoch": 0.1650604520763974, "grad_norm": 19.68392029533682, "kl": 0.17578125, "learning_rate": 8.3511477133345e-07, "loss": 0.0702, "reward": 1.2127020359039307, "reward_std": 0.26052191853523254, "rewards/accuracy_reward_stage2": 0.33770206570625305, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 942 }, { "completion_length": 14.4375, "epoch": 0.16523567548624496, "grad_norm": 15.842916725269227, "kl": 0.04345703125, "learning_rate": 8.349395479236025e-07, "loss": 0.0174, "reward": 1.5224876403808594, "reward_std": 0.16868887841701508, "rewards/accuracy_reward_stage2": 0.5224875807762146, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 943 }, { "completion_length": 9.125, "epoch": 0.1654108988960925, "grad_norm": 14.834285405679069, "kl": 0.04345703125, "learning_rate": 8.34764324513755e-07, "loss": 0.0174, "reward": 1.6600130796432495, "reward_std": 0.0960196852684021, "rewards/accuracy_reward_stage2": 0.6600130796432495, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 944 }, { "completion_length": 17.140625, "epoch": 0.16558612230594008, "grad_norm": 16.652320078685968, "kl": 0.0152587890625, "learning_rate": 8.345891011039074e-07, "loss": 0.0061, "reward": 1.234375, "reward_std": 0.23144522309303284, "rewards/accuracy_reward_stage2": 0.359375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 945 }, { "completion_length": 15.6875, "epoch": 0.16576134571578763, "grad_norm": 15.300518563687973, "kl": 0.052490234375, "learning_rate": 8.344138776940599e-07, "loss": 0.021, "reward": 1.638373851776123, "reward_std": 0.1297706514596939, "rewards/accuracy_reward_stage2": 0.6383737921714783, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 946 }, { "completion_length": 19.375, "epoch": 0.16593656912563517, "grad_norm": 21.360435306543938, "kl": 0.11328125, "learning_rate": 8.342386542842124e-07, "loss": 0.0452, "reward": 1.4355685710906982, "reward_std": 0.14541111886501312, "rewards/accuracy_reward_stage2": 0.5605685710906982, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 947 }, { "completion_length": 8.15625, "epoch": 0.16611179253548275, "grad_norm": 20.031436354993605, "kl": 0.06787109375, "learning_rate": 8.340634308743648e-07, "loss": 0.0272, "reward": 1.5705227851867676, "reward_std": 0.1583014577627182, "rewards/accuracy_reward_stage2": 0.5705227255821228, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 948 }, { "completion_length": 12.109375, "epoch": 0.1662870159453303, "grad_norm": 18.51744509104956, "kl": 0.04736328125, "learning_rate": 8.338882074645173e-07, "loss": 0.0189, "reward": 1.668715476989746, "reward_std": 0.2258095145225525, "rewards/accuracy_reward_stage2": 0.6687155961990356, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 949 }, { "completion_length": 7.25, "epoch": 0.16646223935517784, "grad_norm": 21.24530602480943, "kl": 0.1337890625, "learning_rate": 8.337129840546698e-07, "loss": 0.0094, "reward": 1.4853681325912476, "reward_std": 0.21900716423988342, "rewards/accuracy_reward_stage2": 0.6259931325912476, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 950 }, { "completion_length": 7.953125, "epoch": 0.16663746276502542, "grad_norm": 18.566300126795337, "kl": 0.060546875, "learning_rate": 8.335377606448221e-07, "loss": -0.02, "reward": 1.4947917461395264, "reward_std": 0.2537845969200134, "rewards/accuracy_reward_stage2": 0.5104166865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 951 }, { "completion_length": 11.390625, "epoch": 0.16681268617487297, "grad_norm": 17.93695120274296, "kl": 0.0308837890625, "learning_rate": 8.333625372349745e-07, "loss": 0.0123, "reward": 1.4117063283920288, "reward_std": 0.21608895063400269, "rewards/accuracy_reward_stage2": 0.5367063283920288, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 952 }, { "completion_length": 12.546875, "epoch": 0.1669879095847205, "grad_norm": 17.191049053892815, "kl": 0.08056640625, "learning_rate": 8.331873138251269e-07, "loss": 0.0321, "reward": 1.3560242652893066, "reward_std": 0.19221973419189453, "rewards/accuracy_reward_stage2": 0.4810241460800171, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 953 }, { "completion_length": 11.21875, "epoch": 0.1671631329945681, "grad_norm": 14.71217338958482, "kl": 0.0576171875, "learning_rate": 8.330120904152794e-07, "loss": 0.0231, "reward": 1.5724213123321533, "reward_std": 0.11429198831319809, "rewards/accuracy_reward_stage2": 0.5724212527275085, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 954 }, { "completion_length": 9.203125, "epoch": 0.16733835640441563, "grad_norm": 17.366397954858957, "kl": 0.0830078125, "learning_rate": 8.328368670054319e-07, "loss": 0.0087, "reward": 1.718414306640625, "reward_std": 0.16092431545257568, "rewards/accuracy_reward_stage2": 0.7340393662452698, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 955 }, { "completion_length": 10.0625, "epoch": 0.16751357981426318, "grad_norm": 19.431897152020653, "kl": 0.0255126953125, "learning_rate": 8.326616435955843e-07, "loss": 0.0102, "reward": 1.502739667892456, "reward_std": 0.2304997444152832, "rewards/accuracy_reward_stage2": 0.5027396082878113, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 956 }, { "completion_length": 9.4375, "epoch": 0.16768880322411073, "grad_norm": 20.418223177440694, "kl": 0.2353515625, "learning_rate": 8.324864201857368e-07, "loss": 0.0057, "reward": 1.495002269744873, "reward_std": 0.1964532732963562, "rewards/accuracy_reward_stage2": 0.651252269744873, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 957 }, { "completion_length": 8.375, "epoch": 0.1678640266339583, "grad_norm": 15.965850962778783, "kl": 0.0233154296875, "learning_rate": 8.323111967758892e-07, "loss": 0.0093, "reward": 1.4397320747375488, "reward_std": 0.22162576019763947, "rewards/accuracy_reward_stage2": 0.4397321343421936, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 958 }, { "completion_length": 8.15625, "epoch": 0.16803925004380585, "grad_norm": 18.8964180423816, "kl": 0.12060546875, "learning_rate": 8.321359733660417e-07, "loss": 0.012, "reward": 1.560694694519043, "reward_std": 0.3037213087081909, "rewards/accuracy_reward_stage2": 0.5763195753097534, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 959 }, { "completion_length": 11.453125, "epoch": 0.1682144734536534, "grad_norm": 13.521559828752565, "kl": 0.040771484375, "learning_rate": 8.319607499561942e-07, "loss": -0.0185, "reward": 1.584068775177002, "reward_std": 0.15472474694252014, "rewards/accuracy_reward_stage2": 0.599693775177002, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 960 }, { "completion_length": 11.578125, "epoch": 0.16838969686350097, "grad_norm": 26.76151915493076, "kl": 0.119140625, "learning_rate": 8.317855265463466e-07, "loss": 0.0478, "reward": 1.4074745178222656, "reward_std": 0.27609723806381226, "rewards/accuracy_reward_stage2": 0.4074746072292328, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 961 }, { "completion_length": 6.59375, "epoch": 0.16856492027334852, "grad_norm": 13.049430572037243, "kl": 0.16015625, "learning_rate": 8.31610303136499e-07, "loss": 0.0638, "reward": 1.546875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.796875, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 962 }, { "completion_length": 8.734375, "epoch": 0.16874014368319606, "grad_norm": 15.147946605641504, "kl": 0.2216796875, "learning_rate": 8.314350797266514e-07, "loss": 0.0887, "reward": 1.2102738618850708, "reward_std": 0.09944657981395721, "rewards/accuracy_reward_stage2": 0.3352738320827484, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 963 }, { "completion_length": 13.09375, "epoch": 0.16891536709304364, "grad_norm": 23.67007480900033, "kl": 0.17578125, "learning_rate": 8.312598563168038e-07, "loss": 0.0703, "reward": 1.0486290454864502, "reward_std": 0.28356683254241943, "rewards/accuracy_reward_stage2": 0.4236289858818054, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 964 }, { "completion_length": 23.234375, "epoch": 0.16909059050289119, "grad_norm": 14.766032051622496, "kl": 0.04248046875, "learning_rate": 8.310846329069563e-07, "loss": 0.017, "reward": 1.5564314126968384, "reward_std": 0.15278059244155884, "rewards/accuracy_reward_stage2": 0.6814314723014832, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 965 }, { "completion_length": 10.578125, "epoch": 0.16926581391273873, "grad_norm": 19.302599742311433, "kl": 0.1376953125, "learning_rate": 8.309094094971087e-07, "loss": 0.0112, "reward": 1.621635913848877, "reward_std": 0.28337401151657104, "rewards/accuracy_reward_stage2": 0.6372608542442322, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 966 }, { "completion_length": 7.296875, "epoch": 0.1694410373225863, "grad_norm": 20.30266988191586, "kl": 0.11181640625, "learning_rate": 8.307341860872612e-07, "loss": 0.0449, "reward": 1.5777642726898193, "reward_std": 0.18352115154266357, "rewards/accuracy_reward_stage2": 0.5777642726898193, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 967 }, { "completion_length": 10.15625, "epoch": 0.16961626073243385, "grad_norm": 25.58995419244153, "kl": 0.03515625, "learning_rate": 8.305589626774137e-07, "loss": -0.0632, "reward": 1.4791667461395264, "reward_std": 0.34395408630371094, "rewards/accuracy_reward_stage2": 0.6354166865348816, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 968 }, { "completion_length": 23.40625, "epoch": 0.1697914841422814, "grad_norm": 19.70117347558748, "kl": 0.0712890625, "learning_rate": 8.303837392675661e-07, "loss": 0.0285, "reward": 1.3151779174804688, "reward_std": 0.16388216614723206, "rewards/accuracy_reward_stage2": 0.31517791748046875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 969 }, { "completion_length": 7.390625, "epoch": 0.16996670755212898, "grad_norm": 13.108220090014575, "kl": 0.04150390625, "learning_rate": 8.302085158577186e-07, "loss": 0.0166, "reward": 1.4289811849594116, "reward_std": 0.14381805062294006, "rewards/accuracy_reward_stage2": 0.42898115515708923, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 970 }, { "completion_length": 9.09375, "epoch": 0.17014193096197652, "grad_norm": 15.970366158750508, "kl": 0.037841796875, "learning_rate": 8.300332924478711e-07, "loss": -0.029, "reward": 1.5887812376022339, "reward_std": 0.10244224965572357, "rewards/accuracy_reward_stage2": 0.6044061779975891, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 971 }, { "completion_length": 7.359375, "epoch": 0.17031715437182407, "grad_norm": 17.71757056881451, "kl": 0.10107421875, "learning_rate": 8.298580690380234e-07, "loss": -0.0037, "reward": 1.5167280435562134, "reward_std": 0.16086438298225403, "rewards/accuracy_reward_stage2": 0.5323530435562134, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 972 }, { "completion_length": 9.125, "epoch": 0.17049237778167164, "grad_norm": 15.226591161141798, "kl": 0.053955078125, "learning_rate": 8.296828456281759e-07, "loss": -0.0073, "reward": 1.4342520236968994, "reward_std": 0.18709491193294525, "rewards/accuracy_reward_stage2": 0.5592520833015442, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 973 }, { "completion_length": 11.28125, "epoch": 0.1706676011915192, "grad_norm": 23.202770013131985, "kl": 0.037109375, "learning_rate": 8.295076222183283e-07, "loss": -0.0268, "reward": 1.741548776626587, "reward_std": 0.2346932739019394, "rewards/accuracy_reward_stage2": 0.7571737766265869, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 974 }, { "completion_length": 10.46875, "epoch": 0.17084282460136674, "grad_norm": 20.769549195676348, "kl": 0.08349609375, "learning_rate": 8.293323988084808e-07, "loss": 0.0334, "reward": 1.4408023357391357, "reward_std": 0.36021432280540466, "rewards/accuracy_reward_stage2": 0.4408022463321686, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 975 }, { "completion_length": 9.5625, "epoch": 0.17101804801121429, "grad_norm": 15.462756460452283, "kl": 0.05419921875, "learning_rate": 8.291571753986332e-07, "loss": -0.0226, "reward": 1.5718014240264893, "reward_std": 0.17255185544490814, "rewards/accuracy_reward_stage2": 0.7124263644218445, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 976 }, { "completion_length": 23.484375, "epoch": 0.17119327142106186, "grad_norm": 17.53958845633692, "kl": 0.051025390625, "learning_rate": 8.289819519887856e-07, "loss": -0.0238, "reward": 1.3811914920806885, "reward_std": 0.2425132691860199, "rewards/accuracy_reward_stage2": 0.3968164622783661, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 977 }, { "completion_length": 7.828125, "epoch": 0.1713684948309094, "grad_norm": 22.068611094884353, "kl": 0.08349609375, "learning_rate": 8.288067285789381e-07, "loss": -0.0325, "reward": 1.6086739301681519, "reward_std": 0.31228816509246826, "rewards/accuracy_reward_stage2": 0.6399239301681519, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 978 }, { "completion_length": 8.265625, "epoch": 0.17154371824075695, "grad_norm": 19.72837271320354, "kl": 0.060791015625, "learning_rate": 8.286315051690906e-07, "loss": 0.0243, "reward": 1.3992738723754883, "reward_std": 0.26514434814453125, "rewards/accuracy_reward_stage2": 0.3992738425731659, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 979 }, { "completion_length": 19.65625, "epoch": 0.17171894165060453, "grad_norm": 20.081802914517276, "kl": 0.035400390625, "learning_rate": 8.28456281759243e-07, "loss": 0.0142, "reward": 1.3881545066833496, "reward_std": 0.09738902747631073, "rewards/accuracy_reward_stage2": 0.5131544470787048, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 980 }, { "completion_length": 10.109375, "epoch": 0.17189416506045208, "grad_norm": 151.70299207827858, "kl": 0.82421875, "learning_rate": 8.282810583493955e-07, "loss": 0.3295, "reward": 1.504793643951416, "reward_std": 0.12093257904052734, "rewards/accuracy_reward_stage2": 0.6297937631607056, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 981 }, { "completion_length": 13.046875, "epoch": 0.17206938847029962, "grad_norm": 15.725761134737253, "kl": 0.048828125, "learning_rate": 8.281058349395478e-07, "loss": -0.0246, "reward": 1.321092128753662, "reward_std": 0.25256600975990295, "rewards/accuracy_reward_stage2": 0.3367171585559845, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 982 }, { "completion_length": 11.515625, "epoch": 0.1722446118801472, "grad_norm": 19.530650469401575, "kl": 0.072265625, "learning_rate": 8.279306115297003e-07, "loss": 0.0027, "reward": 1.491995096206665, "reward_std": 0.2790702283382416, "rewards/accuracy_reward_stage2": 0.507620096206665, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 983 }, { "completion_length": 6.390625, "epoch": 0.17241983528999474, "grad_norm": 18.369641912161285, "kl": 0.07958984375, "learning_rate": 8.277553881198528e-07, "loss": -0.1004, "reward": 1.7411483526229858, "reward_std": 0.2977067530155182, "rewards/accuracy_reward_stage2": 0.7880233526229858, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 984 }, { "completion_length": 12.4375, "epoch": 0.1725950586998423, "grad_norm": 21.629425902778692, "kl": 0.10302734375, "learning_rate": 8.275801647100052e-07, "loss": -0.0549, "reward": 1.7416812181472778, "reward_std": 0.29112160205841064, "rewards/accuracy_reward_stage2": 0.7885562777519226, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 985 }, { "completion_length": 8.890625, "epoch": 0.17277028210968987, "grad_norm": 14.636965958095939, "kl": 0.0673828125, "learning_rate": 8.274049413001577e-07, "loss": 0.0269, "reward": 1.5075805187225342, "reward_std": 0.12128952145576477, "rewards/accuracy_reward_stage2": 0.5075805187225342, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 986 }, { "completion_length": 11.28125, "epoch": 0.1729455055195374, "grad_norm": 16.800360042148473, "kl": 0.1484375, "learning_rate": 8.272297178903102e-07, "loss": -0.0479, "reward": 1.6010843515396118, "reward_std": 0.21340158581733704, "rewards/accuracy_reward_stage2": 0.647959291934967, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 987 }, { "completion_length": 9.359375, "epoch": 0.17312072892938496, "grad_norm": 19.29185552280403, "kl": 0.0673828125, "learning_rate": 8.270544944804626e-07, "loss": -0.0171, "reward": 1.329951286315918, "reward_std": 0.21836236119270325, "rewards/accuracy_reward_stage2": 0.47057637572288513, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 988 }, { "completion_length": 10.828125, "epoch": 0.17329595233923253, "grad_norm": 17.61933481117207, "kl": 0.162109375, "learning_rate": 8.26879271070615e-07, "loss": 0.0647, "reward": 1.7506136894226074, "reward_std": 0.11140866577625275, "rewards/accuracy_reward_stage2": 0.7506136894226074, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 989 }, { "completion_length": 15.75, "epoch": 0.17347117574908008, "grad_norm": 16.550933351259392, "kl": 0.0216064453125, "learning_rate": 8.267040476607674e-07, "loss": -0.074, "reward": 1.5046889781951904, "reward_std": 0.23458629846572876, "rewards/accuracy_reward_stage2": 0.5359390377998352, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 990 }, { "completion_length": 10.5625, "epoch": 0.17364639915892763, "grad_norm": 18.65045253777362, "kl": 0.0810546875, "learning_rate": 8.265288242509199e-07, "loss": 0.0323, "reward": 1.4617888927459717, "reward_std": 0.3118630647659302, "rewards/accuracy_reward_stage2": 0.4617888927459717, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 991 }, { "completion_length": 10.171875, "epoch": 0.17382162256877517, "grad_norm": 21.35463925406452, "kl": 0.228515625, "learning_rate": 8.263536008410723e-07, "loss": 0.0918, "reward": 1.6203045845031738, "reward_std": 0.1743711233139038, "rewards/accuracy_reward_stage2": 0.7453045845031738, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 992 }, { "completion_length": 8.78125, "epoch": 0.17399684597862275, "grad_norm": 16.277081608292, "kl": 0.1943359375, "learning_rate": 8.261783774312247e-07, "loss": 0.0777, "reward": 1.471451997756958, "reward_std": 0.17242538928985596, "rewards/accuracy_reward_stage2": 0.721451997756958, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 993 }, { "completion_length": 8.671875, "epoch": 0.1741720693884703, "grad_norm": 20.263183545075382, "kl": 0.049560546875, "learning_rate": 8.260031540213772e-07, "loss": 0.0084, "reward": 1.3886260986328125, "reward_std": 0.3107958137989044, "rewards/accuracy_reward_stage2": 0.5136260986328125, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 994 }, { "completion_length": 9.296875, "epoch": 0.17434729279831784, "grad_norm": 19.59160552675768, "kl": 0.09033203125, "learning_rate": 8.258279306115297e-07, "loss": 0.0362, "reward": 1.3483185768127441, "reward_std": 0.2468905746936798, "rewards/accuracy_reward_stage2": 0.34831857681274414, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 995 }, { "completion_length": 10.65625, "epoch": 0.17452251620816542, "grad_norm": 18.529946519101127, "kl": 0.11474609375, "learning_rate": 8.256527072016821e-07, "loss": 0.0459, "reward": 1.512170433998108, "reward_std": 0.12816551327705383, "rewards/accuracy_reward_stage2": 0.6371704339981079, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 996 }, { "completion_length": 8.078125, "epoch": 0.17469773961801296, "grad_norm": 24.896906912430655, "kl": 0.310546875, "learning_rate": 8.254774837918346e-07, "loss": 0.1239, "reward": 1.4386553764343262, "reward_std": 0.1602693349123001, "rewards/accuracy_reward_stage2": 0.5636553764343262, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 997 }, { "completion_length": 8.09375, "epoch": 0.1748729630278605, "grad_norm": 24.22248073843513, "kl": 0.0458984375, "learning_rate": 8.25302260381987e-07, "loss": 0.0183, "reward": 1.6866368055343628, "reward_std": 0.20650362968444824, "rewards/accuracy_reward_stage2": 0.6866368055343628, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 998 }, { "completion_length": 11.0625, "epoch": 0.1750481864377081, "grad_norm": 18.275631526717135, "kl": 0.037353515625, "learning_rate": 8.251270369721395e-07, "loss": 0.0149, "reward": 1.853606939315796, "reward_std": 0.21327659487724304, "rewards/accuracy_reward_stage2": 0.8536069393157959, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 999 }, { "completion_length": 7.328125, "epoch": 0.17522340984755563, "grad_norm": 20.812654899712545, "kl": 0.07080078125, "learning_rate": 8.24951813562292e-07, "loss": -0.016, "reward": 1.7883012294769287, "reward_std": 0.17655321955680847, "rewards/accuracy_reward_stage2": 0.8039262294769287, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1000 }, { "completion_length": 12.421875, "epoch": 0.17539863325740318, "grad_norm": 20.401303313595758, "kl": 0.0537109375, "learning_rate": 8.247765901524442e-07, "loss": -0.0227, "reward": 1.476668119430542, "reward_std": 0.19259199500083923, "rewards/accuracy_reward_stage2": 0.6172930002212524, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1001 }, { "completion_length": 8.328125, "epoch": 0.17557385666725076, "grad_norm": 16.901598014848172, "kl": 0.0576171875, "learning_rate": 8.246013667425967e-07, "loss": -0.0211, "reward": 1.639136552810669, "reward_std": 0.27491295337677, "rewards/accuracy_reward_stage2": 0.6547614336013794, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1002 }, { "completion_length": 7.671875, "epoch": 0.1757490800770983, "grad_norm": 22.6402288692317, "kl": 0.0791015625, "learning_rate": 8.244261433327491e-07, "loss": -0.0026, "reward": 1.515639305114746, "reward_std": 0.28191104531288147, "rewards/accuracy_reward_stage2": 0.6562642455101013, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1003 }, { "completion_length": 11.640625, "epoch": 0.17592430348694585, "grad_norm": 20.971542924107762, "kl": 0.04443359375, "learning_rate": 8.242509199229016e-07, "loss": 0.0178, "reward": 1.5278730392456055, "reward_std": 0.22605562210083008, "rewards/accuracy_reward_stage2": 0.5278730392456055, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1004 }, { "completion_length": 9.796875, "epoch": 0.17609952689679342, "grad_norm": 16.783145596403724, "kl": 0.23046875, "learning_rate": 8.240756965130541e-07, "loss": 0.0924, "reward": 1.3827335834503174, "reward_std": 0.1471167504787445, "rewards/accuracy_reward_stage2": 0.5077335834503174, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1005 }, { "completion_length": 8.453125, "epoch": 0.17627475030664097, "grad_norm": 21.650322269214524, "kl": 0.2451171875, "learning_rate": 8.239004731032065e-07, "loss": 0.065, "reward": 1.471717119216919, "reward_std": 0.31655657291412354, "rewards/accuracy_reward_stage2": 0.6123421788215637, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1006 }, { "completion_length": 11.453125, "epoch": 0.17644997371648852, "grad_norm": 20.356875919340375, "kl": 0.140625, "learning_rate": 8.23725249693359e-07, "loss": 0.0562, "reward": 1.6293368339538574, "reward_std": 0.22882044315338135, "rewards/accuracy_reward_stage2": 0.7543368339538574, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1007 }, { "completion_length": 18.09375, "epoch": 0.1766251971263361, "grad_norm": 14.100834330812283, "kl": 0.1318359375, "learning_rate": 8.235500262835115e-07, "loss": 0.0525, "reward": 1.4905918836593628, "reward_std": 0.07823127508163452, "rewards/accuracy_reward_stage2": 0.6155918836593628, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1008 }, { "completion_length": 7.3125, "epoch": 0.17680042053618364, "grad_norm": 21.476913992157833, "kl": 0.09130859375, "learning_rate": 8.233748028736639e-07, "loss": -0.0076, "reward": 1.5450880527496338, "reward_std": 0.23528814315795898, "rewards/accuracy_reward_stage2": 0.560712993144989, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1009 }, { "completion_length": 9.3125, "epoch": 0.17697564394603119, "grad_norm": 16.438543339296952, "kl": 0.02978515625, "learning_rate": 8.231995794638164e-07, "loss": 0.0119, "reward": 1.4612115621566772, "reward_std": 0.12375926971435547, "rewards/accuracy_reward_stage2": 0.46121156215667725, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1010 }, { "completion_length": 10.734375, "epoch": 0.17715086735587873, "grad_norm": 26.435355144409534, "kl": 0.0908203125, "learning_rate": 8.230243560539689e-07, "loss": 0.0148, "reward": 1.6115930080413818, "reward_std": 0.27439337968826294, "rewards/accuracy_reward_stage2": 0.6272180676460266, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1011 }, { "completion_length": 10.921875, "epoch": 0.1773260907657263, "grad_norm": 47.63378987105592, "kl": 0.0283203125, "learning_rate": 8.228491326441212e-07, "loss": 0.0113, "reward": 1.6302083730697632, "reward_std": 0.13152070343494415, "rewards/accuracy_reward_stage2": 0.7552083134651184, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1012 }, { "completion_length": 11.03125, "epoch": 0.17750131417557385, "grad_norm": 18.101819542619275, "kl": 0.134765625, "learning_rate": 8.226739092342737e-07, "loss": 0.0181, "reward": 1.4931879043579102, "reward_std": 0.24275703728199005, "rewards/accuracy_reward_stage2": 0.6338127851486206, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1013 }, { "completion_length": 9.15625, "epoch": 0.1776765375854214, "grad_norm": 16.102376244932238, "kl": 0.07373046875, "learning_rate": 8.22498685824426e-07, "loss": 0.0295, "reward": 1.5167219638824463, "reward_std": 0.15145519375801086, "rewards/accuracy_reward_stage2": 0.5167218446731567, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1014 }, { "completion_length": 8.9375, "epoch": 0.17785176099526898, "grad_norm": 17.936021523356104, "kl": 0.0546875, "learning_rate": 8.223234624145785e-07, "loss": -0.0061, "reward": 1.4813058376312256, "reward_std": 0.2451373040676117, "rewards/accuracy_reward_stage2": 0.4969308078289032, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1015 }, { "completion_length": 14.453125, "epoch": 0.17802698440511652, "grad_norm": 18.444340845595043, "kl": 0.06298828125, "learning_rate": 8.22148239004731e-07, "loss": 0.0251, "reward": 1.5098751783370972, "reward_std": 0.251526415348053, "rewards/accuracy_reward_stage2": 0.5098751783370972, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1016 }, { "completion_length": 9.234375, "epoch": 0.17820220781496407, "grad_norm": 15.845520083072273, "kl": 0.0250244140625, "learning_rate": 8.219730155948834e-07, "loss": 0.01, "reward": 1.5925273895263672, "reward_std": 0.1007295772433281, "rewards/accuracy_reward_stage2": 0.7175273895263672, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1017 }, { "completion_length": 8.515625, "epoch": 0.17837743122481164, "grad_norm": 16.774498969471463, "kl": 0.06787109375, "learning_rate": 8.217977921850359e-07, "loss": -0.0613, "reward": 1.5572917461395264, "reward_std": 0.27777281403541565, "rewards/accuracy_reward_stage2": 0.5885416865348816, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1018 }, { "completion_length": 12.453125, "epoch": 0.1785526546346592, "grad_norm": 18.360925497310827, "kl": 0.0595703125, "learning_rate": 8.216225687751883e-07, "loss": -0.0179, "reward": 1.7169744968414307, "reward_std": 0.23899121582508087, "rewards/accuracy_reward_stage2": 0.7325994372367859, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1019 }, { "completion_length": 17.859375, "epoch": 0.17872787804450674, "grad_norm": 19.006972211077844, "kl": 0.029296875, "learning_rate": 8.214473453653408e-07, "loss": 0.0117, "reward": 1.5444114208221436, "reward_std": 0.08622656762599945, "rewards/accuracy_reward_stage2": 0.5444114804267883, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1020 }, { "completion_length": 6.828125, "epoch": 0.1789031014543543, "grad_norm": 20.900425709632742, "kl": 0.130859375, "learning_rate": 8.212721219554933e-07, "loss": -0.0142, "reward": 1.8227179050445557, "reward_std": 0.19578629732131958, "rewards/accuracy_reward_stage2": 0.8539679050445557, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1021 }, { "completion_length": 10.40625, "epoch": 0.17907832486420186, "grad_norm": 13.043553359133417, "kl": 0.03271484375, "learning_rate": 8.210968985456456e-07, "loss": 0.013, "reward": 1.7412645816802979, "reward_std": 0.1495451033115387, "rewards/accuracy_reward_stage2": 0.7412645816802979, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1022 }, { "completion_length": 12.03125, "epoch": 0.1792535482740494, "grad_norm": 20.33707074188944, "kl": 0.047119140625, "learning_rate": 8.209216751357981e-07, "loss": 0.0188, "reward": 1.577462077140808, "reward_std": 0.18190613389015198, "rewards/accuracy_reward_stage2": 0.5774620771408081, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1023 }, { "completion_length": 12.734375, "epoch": 0.17942877168389698, "grad_norm": 30.677168098528167, "kl": 0.0458984375, "learning_rate": 8.207464517259506e-07, "loss": 0.0183, "reward": 1.5067112445831299, "reward_std": 0.12453323602676392, "rewards/accuracy_reward_stage2": 0.5067112445831299, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1024 }, { "completion_length": 12.296875, "epoch": 0.17960399509374453, "grad_norm": 19.917300368702044, "kl": 0.12109375, "learning_rate": 8.20571228316103e-07, "loss": 0.0485, "reward": 1.4255033731460571, "reward_std": 0.12041162699460983, "rewards/accuracy_reward_stage2": 0.5505033731460571, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1025 }, { "completion_length": 5.90625, "epoch": 0.17977921850359208, "grad_norm": 14.500310886298085, "kl": 0.0322265625, "learning_rate": 8.203960049062555e-07, "loss": -0.0313, "reward": 1.8072917461395264, "reward_std": 0.1236192062497139, "rewards/accuracy_reward_stage2": 0.8229166865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1026 }, { "completion_length": 10.703125, "epoch": 0.17995444191343962, "grad_norm": 16.120327694641297, "kl": 0.140625, "learning_rate": 8.202207814964078e-07, "loss": 0.0119, "reward": 1.4947917461395264, "reward_std": 0.2251920998096466, "rewards/accuracy_reward_stage2": 0.6354166865348816, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1027 }, { "completion_length": 11.1875, "epoch": 0.1801296653232872, "grad_norm": 17.841738073436776, "kl": 0.020751953125, "learning_rate": 8.200455580865603e-07, "loss": 0.0083, "reward": 1.6875, "reward_std": 0.2540663480758667, "rewards/accuracy_reward_stage2": 0.6875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1028 }, { "completion_length": 9.5, "epoch": 0.18030488873313474, "grad_norm": 14.691140252447976, "kl": 0.07470703125, "learning_rate": 8.198703346767128e-07, "loss": -0.0121, "reward": 1.6741013526916504, "reward_std": 0.252849280834198, "rewards/accuracy_reward_stage2": 0.6897263526916504, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1029 }, { "completion_length": 7.140625, "epoch": 0.1804801121429823, "grad_norm": 15.284062287517404, "kl": 0.0159912109375, "learning_rate": 8.196951112668652e-07, "loss": 0.0064, "reward": 1.6623451709747314, "reward_std": 0.12901227176189423, "rewards/accuracy_reward_stage2": 0.6623451709747314, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1030 }, { "completion_length": 11.671875, "epoch": 0.18065533555282987, "grad_norm": 20.77060669760343, "kl": 0.1162109375, "learning_rate": 8.195198878570176e-07, "loss": -0.0166, "reward": 1.5759544372558594, "reward_std": 0.3065808415412903, "rewards/accuracy_reward_stage2": 0.6072044968605042, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1031 }, { "completion_length": 9.328125, "epoch": 0.1808305589626774, "grad_norm": 22.582798418645474, "kl": 0.265625, "learning_rate": 8.193446644471701e-07, "loss": 0.1057, "reward": 1.3950915336608887, "reward_std": 0.31523001194000244, "rewards/accuracy_reward_stage2": 0.6450915336608887, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1032 }, { "completion_length": 12.5, "epoch": 0.18100578237252496, "grad_norm": 23.123052879635157, "kl": 0.169921875, "learning_rate": 8.191694410373225e-07, "loss": 0.0238, "reward": 1.5907235145568848, "reward_std": 0.24119962751865387, "rewards/accuracy_reward_stage2": 0.7313483953475952, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1033 }, { "completion_length": 8.921875, "epoch": 0.18118100578237253, "grad_norm": 16.125358149381995, "kl": 0.0908203125, "learning_rate": 8.18994217627475e-07, "loss": 0.0363, "reward": 1.5684726238250732, "reward_std": 0.2518884241580963, "rewards/accuracy_reward_stage2": 0.5684726238250732, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1034 }, { "completion_length": 11.71875, "epoch": 0.18135622919222008, "grad_norm": 17.152327203738277, "kl": 0.10205078125, "learning_rate": 8.188189942176274e-07, "loss": 0.0075, "reward": 1.4885659217834473, "reward_std": 0.2130601704120636, "rewards/accuracy_reward_stage2": 0.504190981388092, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1035 }, { "completion_length": 8.5625, "epoch": 0.18153145260206763, "grad_norm": 12.17071070502903, "kl": 0.044921875, "learning_rate": 8.186437708077799e-07, "loss": -0.0263, "reward": 1.5416667461395264, "reward_std": 0.16781337559223175, "rewards/accuracy_reward_stage2": 0.5572916865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1036 }, { "completion_length": 9.765625, "epoch": 0.1817066760119152, "grad_norm": 15.998631392491633, "kl": 0.06103515625, "learning_rate": 8.184685473979324e-07, "loss": -0.0197, "reward": 1.7481575012207031, "reward_std": 0.12084738910198212, "rewards/accuracy_reward_stage2": 0.7637824416160583, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1037 }, { "completion_length": 11.734375, "epoch": 0.18188189942176275, "grad_norm": 15.049644417962663, "kl": 0.08203125, "learning_rate": 8.182933239880848e-07, "loss": 0.0328, "reward": 1.6555730104446411, "reward_std": 0.14731593430042267, "rewards/accuracy_reward_stage2": 0.6555730104446411, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1038 }, { "completion_length": 32.265625, "epoch": 0.1820571228316103, "grad_norm": 20.944749777286034, "kl": 0.10546875, "learning_rate": 8.181181005782373e-07, "loss": 0.0004, "reward": 1.3465502262115479, "reward_std": 0.11534099280834198, "rewards/accuracy_reward_stage2": 0.48717522621154785, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1039 }, { "completion_length": 7.0625, "epoch": 0.18223234624145787, "grad_norm": 18.674889870148007, "kl": 0.06396484375, "learning_rate": 8.179428771683897e-07, "loss": 0.0256, "reward": 1.4174654483795166, "reward_std": 0.17789804935455322, "rewards/accuracy_reward_stage2": 0.5424654483795166, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1040 }, { "completion_length": 12.375, "epoch": 0.18240756965130542, "grad_norm": 15.68352013594671, "kl": 0.0181884765625, "learning_rate": 8.17767653758542e-07, "loss": 0.0073, "reward": 1.6470057964324951, "reward_std": 0.15792571008205414, "rewards/accuracy_reward_stage2": 0.6470057368278503, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1041 }, { "completion_length": 11.78125, "epoch": 0.18258279306115296, "grad_norm": 17.814980651109845, "kl": 0.07568359375, "learning_rate": 8.175924303486945e-07, "loss": -0.0027, "reward": 1.720482349395752, "reward_std": 0.28667330741882324, "rewards/accuracy_reward_stage2": 0.736107349395752, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1042 }, { "completion_length": 17.796875, "epoch": 0.18275801647100054, "grad_norm": 21.4349531985855, "kl": 0.059326171875, "learning_rate": 8.174172069388469e-07, "loss": 0.0237, "reward": 1.185448169708252, "reward_std": 0.19691388309001923, "rewards/accuracy_reward_stage2": 0.31044822931289673, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1043 }, { "completion_length": 12.515625, "epoch": 0.1829332398808481, "grad_norm": 17.88483618320598, "kl": 0.078125, "learning_rate": 8.172419835289994e-07, "loss": -0.0127, "reward": 1.491915225982666, "reward_std": 0.18654459714889526, "rewards/accuracy_reward_stage2": 0.5075401663780212, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1044 }, { "completion_length": 10.296875, "epoch": 0.18310846329069563, "grad_norm": 22.59994444123705, "kl": 0.027099609375, "learning_rate": 8.170667601191519e-07, "loss": 0.0108, "reward": 1.53125, "reward_std": 0.3119301199913025, "rewards/accuracy_reward_stage2": 0.53125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1045 }, { "completion_length": 5.375, "epoch": 0.18328368670054318, "grad_norm": 9.016221394372321, "kl": 0.035400390625, "learning_rate": 8.168915367093043e-07, "loss": 0.0141, "reward": 1.695914387702942, "reward_std": 0.025557324290275574, "rewards/accuracy_reward_stage2": 0.6959144473075867, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1046 }, { "completion_length": 10.8125, "epoch": 0.18345891011039075, "grad_norm": 24.762047009674582, "kl": 0.1064453125, "learning_rate": 8.167163132994568e-07, "loss": -0.0191, "reward": 1.7301406860351562, "reward_std": 0.292434424161911, "rewards/accuracy_reward_stage2": 0.7613905668258667, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1047 }, { "completion_length": 10.859375, "epoch": 0.1836341335202383, "grad_norm": 35.97968114566077, "kl": 0.291015625, "learning_rate": 8.165410898896093e-07, "loss": 0.0854, "reward": 1.3661143779754639, "reward_std": 0.21945153176784515, "rewards/accuracy_reward_stage2": 0.6317393779754639, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1048 }, { "completion_length": 8.78125, "epoch": 0.18380935693008585, "grad_norm": 14.925876419456184, "kl": 0.0269775390625, "learning_rate": 8.163658664797617e-07, "loss": 0.0108, "reward": 1.5434216260910034, "reward_std": 0.04544178768992424, "rewards/accuracy_reward_stage2": 0.5434216260910034, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1049 }, { "completion_length": 11.859375, "epoch": 0.18398458033993342, "grad_norm": 27.873893943359306, "kl": 0.1494140625, "learning_rate": 8.161906430699142e-07, "loss": 0.0696, "reward": 1.3853431940078735, "reward_std": 0.20109063386917114, "rewards/accuracy_reward_stage2": 0.510343074798584, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1050 }, { "completion_length": 7.90625, "epoch": 0.18415980374978097, "grad_norm": 18.855523440997825, "kl": 0.07666015625, "learning_rate": 8.160154196600665e-07, "loss": 0.0306, "reward": 1.6734848022460938, "reward_std": 0.2122466266155243, "rewards/accuracy_reward_stage2": 0.6734848022460938, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1051 }, { "completion_length": 13.25, "epoch": 0.18433502715962852, "grad_norm": 22.084077966222758, "kl": 0.06787109375, "learning_rate": 8.158401962502189e-07, "loss": 0.0144, "reward": 1.6123440265655518, "reward_std": 0.1913946568965912, "rewards/accuracy_reward_stage2": 0.627968966960907, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1052 }, { "completion_length": 9.28125, "epoch": 0.1845102505694761, "grad_norm": 21.391734235107787, "kl": 0.06787109375, "learning_rate": 8.156649728403714e-07, "loss": 0.0273, "reward": 1.7385876178741455, "reward_std": 0.15550118684768677, "rewards/accuracy_reward_stage2": 0.7385876178741455, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1053 }, { "completion_length": 9.21875, "epoch": 0.18468547397932364, "grad_norm": 22.317421155534973, "kl": 0.0615234375, "learning_rate": 8.154897494305238e-07, "loss": 0.0246, "reward": 1.710514783859253, "reward_std": 0.30146676301956177, "rewards/accuracy_reward_stage2": 0.7105147838592529, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1054 }, { "completion_length": 6.28125, "epoch": 0.18486069738917119, "grad_norm": 17.435864193181782, "kl": 0.125, "learning_rate": 8.153145260206763e-07, "loss": -0.0174, "reward": 1.7268104553222656, "reward_std": 0.16393327713012695, "rewards/accuracy_reward_stage2": 0.7580605745315552, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1055 }, { "completion_length": 11.96875, "epoch": 0.18503592079901876, "grad_norm": 51.940177880679094, "kl": 0.357421875, "learning_rate": 8.151393026108288e-07, "loss": 0.181, "reward": 1.4652410745620728, "reward_std": 0.19520485401153564, "rewards/accuracy_reward_stage2": 0.5902410745620728, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1056 }, { "completion_length": 9.171875, "epoch": 0.1852111442088663, "grad_norm": 15.935770458923814, "kl": 0.083984375, "learning_rate": 8.149640792009812e-07, "loss": 0.0337, "reward": 1.5994462966918945, "reward_std": 0.12744741141796112, "rewards/accuracy_reward_stage2": 0.5994464159011841, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1057 }, { "completion_length": 7.703125, "epoch": 0.18538636761871385, "grad_norm": 19.994315655368574, "kl": 0.042724609375, "learning_rate": 8.147888557911337e-07, "loss": 0.017, "reward": 1.5583534240722656, "reward_std": 0.16728171706199646, "rewards/accuracy_reward_stage2": 0.5583534836769104, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1058 }, { "completion_length": 9.046875, "epoch": 0.18556159102856143, "grad_norm": 27.119082446679823, "kl": 0.095703125, "learning_rate": 8.146136323812861e-07, "loss": 0.0319, "reward": 1.4803056716918945, "reward_std": 0.30267736315727234, "rewards/accuracy_reward_stage2": 0.4959307312965393, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1059 }, { "completion_length": 10.65625, "epoch": 0.18573681443840898, "grad_norm": 14.351166107837374, "kl": 0.0693359375, "learning_rate": 8.144384089714386e-07, "loss": 0.0276, "reward": 1.7468219995498657, "reward_std": 0.07223241031169891, "rewards/accuracy_reward_stage2": 0.7468219995498657, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1060 }, { "completion_length": 9.484375, "epoch": 0.18591203784825652, "grad_norm": 18.448592069829477, "kl": 0.10107421875, "learning_rate": 8.14263185561591e-07, "loss": 0.0019, "reward": 1.4668774604797363, "reward_std": 0.147735133767128, "rewards/accuracy_reward_stage2": 0.4825023412704468, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1061 }, { "completion_length": 8.078125, "epoch": 0.18608726125810407, "grad_norm": 18.84877306794847, "kl": 0.07861328125, "learning_rate": 8.140879621517434e-07, "loss": -0.0128, "reward": 1.6677536964416504, "reward_std": 0.16473901271820068, "rewards/accuracy_reward_stage2": 0.8083786368370056, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1062 }, { "completion_length": 10.328125, "epoch": 0.18626248466795164, "grad_norm": 20.90295345468802, "kl": 0.103515625, "learning_rate": 8.139127387418959e-07, "loss": -0.0027, "reward": 1.4815478324890137, "reward_std": 0.2598685026168823, "rewards/accuracy_reward_stage2": 0.4971729516983032, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1063 }, { "completion_length": 17.0, "epoch": 0.1864377080777992, "grad_norm": 23.64176392441905, "kl": 0.07568359375, "learning_rate": 8.137375153320484e-07, "loss": 0.0302, "reward": 1.4952113628387451, "reward_std": 0.1622573733329773, "rewards/accuracy_reward_stage2": 0.49521133303642273, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1064 }, { "completion_length": 7.640625, "epoch": 0.18661293148764674, "grad_norm": 18.15053174337808, "kl": 0.275390625, "learning_rate": 8.135622919222007e-07, "loss": 0.1104, "reward": 1.2247974872589111, "reward_std": 0.14716273546218872, "rewards/accuracy_reward_stage2": 0.47479745745658875, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1065 }, { "completion_length": 7.171875, "epoch": 0.1867881548974943, "grad_norm": 20.80874547850332, "kl": 0.087890625, "learning_rate": 8.133870685123532e-07, "loss": 0.0351, "reward": 1.516782283782959, "reward_std": 0.19680972397327423, "rewards/accuracy_reward_stage2": 0.7667823433876038, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1066 }, { "completion_length": 9.59375, "epoch": 0.18696337830734186, "grad_norm": 18.272476449941177, "kl": 0.197265625, "learning_rate": 8.132118451025056e-07, "loss": 0.0066, "reward": 1.2306230068206787, "reward_std": 0.2887798845767975, "rewards/accuracy_reward_stage2": 0.5118729472160339, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1067 }, { "completion_length": 7.90625, "epoch": 0.1871386017171894, "grad_norm": 18.320944156003165, "kl": 0.0908203125, "learning_rate": 8.130366216926581e-07, "loss": -0.008, "reward": 1.4491363763809204, "reward_std": 0.26713356375694275, "rewards/accuracy_reward_stage2": 0.5897614359855652, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1068 }, { "completion_length": 5.15625, "epoch": 0.18731382512703698, "grad_norm": 5.6172327173402214, "kl": 0.030517578125, "learning_rate": 8.128613982828106e-07, "loss": 0.0122, "reward": 1.34375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward_stage2": 0.46875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1069 }, { "completion_length": 10.78125, "epoch": 0.18748904853688453, "grad_norm": 13.968815746488696, "kl": 0.07763671875, "learning_rate": 8.12686174872963e-07, "loss": 0.031, "reward": 1.6498345136642456, "reward_std": 0.10637789964675903, "rewards/accuracy_reward_stage2": 0.6498345136642456, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1070 }, { "completion_length": 8.9375, "epoch": 0.18766427194673208, "grad_norm": 12.70060536444373, "kl": 0.05615234375, "learning_rate": 8.125109514631154e-07, "loss": -0.0206, "reward": 1.636265754699707, "reward_std": 0.16409549117088318, "rewards/accuracy_reward_stage2": 0.651890754699707, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1071 }, { "completion_length": 10.46875, "epoch": 0.18783949535657965, "grad_norm": 16.49288173549537, "kl": 0.0595703125, "learning_rate": 8.123357280532679e-07, "loss": -0.0119, "reward": 1.7313894033432007, "reward_std": 0.21923525631427765, "rewards/accuracy_reward_stage2": 0.7470144033432007, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1072 }, { "completion_length": 12.359375, "epoch": 0.1880147187664272, "grad_norm": 19.559616270302996, "kl": 0.1982421875, "learning_rate": 8.121605046434203e-07, "loss": 0.0018, "reward": 1.7189399003982544, "reward_std": 0.16802644729614258, "rewards/accuracy_reward_stage2": 0.750190019607544, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1073 }, { "completion_length": 8.484375, "epoch": 0.18818994217627474, "grad_norm": 21.374718124967487, "kl": 0.08935546875, "learning_rate": 8.119852812335728e-07, "loss": -0.0418, "reward": 1.489177942276001, "reward_std": 0.21345767378807068, "rewards/accuracy_reward_stage2": 0.629802942276001, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1074 }, { "completion_length": 9.78125, "epoch": 0.18836516558612232, "grad_norm": 13.154409710619879, "kl": 0.01904296875, "learning_rate": 8.118100578237252e-07, "loss": 0.0076, "reward": 1.484375, "reward_std": 0.16887325048446655, "rewards/accuracy_reward_stage2": 0.609375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1075 }, { "completion_length": 9.265625, "epoch": 0.18854038899596987, "grad_norm": 20.690764265380892, "kl": 0.1337890625, "learning_rate": 8.116348344138777e-07, "loss": 0.0095, "reward": 1.607621669769287, "reward_std": 0.2766973674297333, "rewards/accuracy_reward_stage2": 0.6232466697692871, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1076 }, { "completion_length": 10.46875, "epoch": 0.1887156124058174, "grad_norm": 22.12174077968315, "kl": 0.1279296875, "learning_rate": 8.114596110040302e-07, "loss": 0.0222, "reward": 1.641465425491333, "reward_std": 0.22600020468235016, "rewards/accuracy_reward_stage2": 0.6570904850959778, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1077 }, { "completion_length": 12.671875, "epoch": 0.18889083581566496, "grad_norm": 18.34750471892336, "kl": 0.0458984375, "learning_rate": 8.112843875941825e-07, "loss": 0.0183, "reward": 1.5698916912078857, "reward_std": 0.1975686103105545, "rewards/accuracy_reward_stage2": 0.5698915719985962, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1078 }, { "completion_length": 10.953125, "epoch": 0.18906605922551253, "grad_norm": 29.019484580187594, "kl": 0.0283203125, "learning_rate": 8.11109164184335e-07, "loss": 0.0113, "reward": 1.8850083351135254, "reward_std": 0.14863698184490204, "rewards/accuracy_reward_stage2": 0.8850083351135254, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1079 }, { "completion_length": 6.34375, "epoch": 0.18924128263536008, "grad_norm": 18.771028895482477, "kl": 0.0859375, "learning_rate": 8.109339407744873e-07, "loss": -0.0387, "reward": 1.7000000476837158, "reward_std": 0.1992851197719574, "rewards/accuracy_reward_stage2": 0.8562500476837158, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1080 }, { "completion_length": 8.953125, "epoch": 0.18941650604520763, "grad_norm": 26.366143313068836, "kl": 0.08935546875, "learning_rate": 8.107587173646398e-07, "loss": -0.0084, "reward": 1.587983250617981, "reward_std": 0.23805229365825653, "rewards/accuracy_reward_stage2": 0.603608250617981, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1081 }, { "completion_length": 6.390625, "epoch": 0.1895917294550552, "grad_norm": 10.240158184262638, "kl": 0.004119873046875, "learning_rate": 8.105834939547923e-07, "loss": 0.0016, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.703125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1082 }, { "completion_length": 10.34375, "epoch": 0.18976695286490275, "grad_norm": 15.66851708461809, "kl": 0.1318359375, "learning_rate": 8.104082705449447e-07, "loss": 0.0528, "reward": 1.4515047073364258, "reward_std": 0.16618230938911438, "rewards/accuracy_reward_stage2": 0.5765047073364258, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1083 }, { "completion_length": 13.140625, "epoch": 0.1899421762747503, "grad_norm": 19.031333710317966, "kl": 0.0703125, "learning_rate": 8.102330471350972e-07, "loss": 0.0282, "reward": 1.310152530670166, "reward_std": 0.12037193030118942, "rewards/accuracy_reward_stage2": 0.31015244126319885, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1084 }, { "completion_length": 6.3125, "epoch": 0.19011739968459787, "grad_norm": 19.075249545090323, "kl": 0.056884765625, "learning_rate": 8.100578237252497e-07, "loss": 0.0227, "reward": 1.6960257291793823, "reward_std": 0.1582869291305542, "rewards/accuracy_reward_stage2": 0.6960256695747375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1085 }, { "completion_length": 8.609375, "epoch": 0.19029262309444542, "grad_norm": 22.746907077342723, "kl": 0.1337890625, "learning_rate": 8.098826003154021e-07, "loss": -0.0, "reward": 1.8112388849258423, "reward_std": 0.22928564250469208, "rewards/accuracy_reward_stage2": 0.8424888849258423, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1086 }, { "completion_length": 8.734375, "epoch": 0.19046784650429296, "grad_norm": 14.58116819960654, "kl": 0.0162353515625, "learning_rate": 8.097073769055546e-07, "loss": 0.0065, "reward": 1.4780704975128174, "reward_std": 0.1812051385641098, "rewards/accuracy_reward_stage2": 0.4780704975128174, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1087 }, { "completion_length": 15.671875, "epoch": 0.19064306991414054, "grad_norm": 13.710019110530325, "kl": 0.060302734375, "learning_rate": 8.095321534957071e-07, "loss": -0.0181, "reward": 1.3493430614471436, "reward_std": 0.12040011584758759, "rewards/accuracy_reward_stage2": 0.36496806144714355, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1088 }, { "completion_length": 9.65625, "epoch": 0.1908182933239881, "grad_norm": 18.8022343699883, "kl": 0.0205078125, "learning_rate": 8.093569300858595e-07, "loss": 0.0082, "reward": 1.6593749523162842, "reward_std": 0.16405992209911346, "rewards/accuracy_reward_stage2": 0.6593749523162842, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1089 }, { "completion_length": 12.40625, "epoch": 0.19099351673383563, "grad_norm": 144.1566137298228, "kl": 0.66796875, "learning_rate": 8.09181706676012e-07, "loss": 0.2231, "reward": 1.3865861892700195, "reward_std": 0.2962217926979065, "rewards/accuracy_reward_stage2": 0.5272111296653748, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1090 }, { "completion_length": 9.625, "epoch": 0.1911687401436832, "grad_norm": 16.503467268366663, "kl": 0.05859375, "learning_rate": 8.090064832661642e-07, "loss": -0.0207, "reward": 1.767581820487976, "reward_std": 0.2052784264087677, "rewards/accuracy_reward_stage2": 0.7832068204879761, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1091 }, { "completion_length": 9.9375, "epoch": 0.19134396355353075, "grad_norm": 22.629686000416033, "kl": 0.09130859375, "learning_rate": 8.088312598563167e-07, "loss": -0.078, "reward": 1.576542615890503, "reward_std": 0.2770374119281769, "rewards/accuracy_reward_stage2": 0.6234177350997925, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1092 }, { "completion_length": 11.890625, "epoch": 0.1915191869633783, "grad_norm": 19.855391563751468, "kl": 0.12353515625, "learning_rate": 8.086560364464692e-07, "loss": -0.0374, "reward": 1.4056425094604492, "reward_std": 0.17462682723999023, "rewards/accuracy_reward_stage2": 0.43689244985580444, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1093 }, { "completion_length": 5.71875, "epoch": 0.19169441037322588, "grad_norm": 17.63095417389011, "kl": 0.06201171875, "learning_rate": 8.084808130366216e-07, "loss": 0.0247, "reward": 1.6471445560455322, "reward_std": 0.18837015330791473, "rewards/accuracy_reward_stage2": 0.6471446752548218, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1094 }, { "completion_length": 18.28125, "epoch": 0.19186963378307342, "grad_norm": 25.517680180087687, "kl": 0.1513671875, "learning_rate": 8.083055896267741e-07, "loss": 0.0571, "reward": 1.354015588760376, "reward_std": 0.2112516313791275, "rewards/accuracy_reward_stage2": 0.4946404993534088, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1095 }, { "completion_length": 10.90625, "epoch": 0.19204485719292097, "grad_norm": 31.255011942515864, "kl": 0.05029296875, "learning_rate": 8.081303662169265e-07, "loss": 0.0201, "reward": 1.5470197200775146, "reward_std": 0.26016050577163696, "rewards/accuracy_reward_stage2": 0.5470197796821594, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1096 }, { "completion_length": 8.625, "epoch": 0.19222008060276852, "grad_norm": 20.68222206734905, "kl": 0.1376953125, "learning_rate": 8.07955142807079e-07, "loss": 0.0214, "reward": 1.4428706169128418, "reward_std": 0.2221945971250534, "rewards/accuracy_reward_stage2": 0.5834957361221313, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1097 }, { "completion_length": 8.015625, "epoch": 0.1923953040126161, "grad_norm": 56.24965357387562, "kl": 0.41015625, "learning_rate": 8.077799193972315e-07, "loss": 0.1637, "reward": 1.48624587059021, "reward_std": 0.09766960889101028, "rewards/accuracy_reward_stage2": 0.6112458109855652, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1098 }, { "completion_length": 10.640625, "epoch": 0.19257052742246364, "grad_norm": 12.780882453737103, "kl": 0.007537841796875, "learning_rate": 8.076046959873839e-07, "loss": 0.003, "reward": 1.65625, "reward_std": 0.1462521106004715, "rewards/accuracy_reward_stage2": 0.65625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1099 }, { "completion_length": 9.265625, "epoch": 0.19274575083231119, "grad_norm": 20.199563575560575, "kl": 0.09228515625, "learning_rate": 8.074294725775364e-07, "loss": -0.0072, "reward": 1.8605185747146606, "reward_std": 0.15882590413093567, "rewards/accuracy_reward_stage2": 0.8761435747146606, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1100 }, { "completion_length": 6.984375, "epoch": 0.19292097424215876, "grad_norm": 18.088546653390626, "kl": 0.061767578125, "learning_rate": 8.072542491676888e-07, "loss": 0.0248, "reward": 1.664806604385376, "reward_std": 0.14888063073158264, "rewards/accuracy_reward_stage2": 0.6648065447807312, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1101 }, { "completion_length": 12.1875, "epoch": 0.1930961976520063, "grad_norm": 21.685034859130564, "kl": 0.068359375, "learning_rate": 8.070790257578412e-07, "loss": -0.011, "reward": 1.5130434036254883, "reward_std": 0.2805604338645935, "rewards/accuracy_reward_stage2": 0.5442932844161987, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1102 }, { "completion_length": 8.390625, "epoch": 0.19327142106185385, "grad_norm": 19.693827267743735, "kl": 0.2265625, "learning_rate": 8.069038023479936e-07, "loss": 0.0145, "reward": 1.4980933666229248, "reward_std": 0.3202298879623413, "rewards/accuracy_reward_stage2": 0.54496830701828, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1103 }, { "completion_length": 8.46875, "epoch": 0.19344664447170143, "grad_norm": 14.74632575144857, "kl": 0.04248046875, "learning_rate": 8.06728578938146e-07, "loss": -0.0141, "reward": 1.7438368797302246, "reward_std": 0.20524749159812927, "rewards/accuracy_reward_stage2": 0.7594619393348694, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1104 }, { "completion_length": 10.671875, "epoch": 0.19362186788154898, "grad_norm": 13.627598413449007, "kl": 0.060791015625, "learning_rate": 8.065533555282985e-07, "loss": 0.0243, "reward": 1.5157394409179688, "reward_std": 0.1647764891386032, "rewards/accuracy_reward_stage2": 0.5157395005226135, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1105 }, { "completion_length": 12.28125, "epoch": 0.19379709129139652, "grad_norm": 20.02739663095883, "kl": 0.1943359375, "learning_rate": 8.06378132118451e-07, "loss": 0.0777, "reward": 1.4368054866790771, "reward_std": 0.27020564675331116, "rewards/accuracy_reward_stage2": 0.5618055462837219, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1106 }, { "completion_length": 13.40625, "epoch": 0.1939723147012441, "grad_norm": 14.013964860105306, "kl": 0.02294921875, "learning_rate": 8.062029087086034e-07, "loss": -0.0327, "reward": 1.385578989982605, "reward_std": 0.132847398519516, "rewards/accuracy_reward_stage2": 0.401203989982605, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1107 }, { "completion_length": 13.703125, "epoch": 0.19414753811109164, "grad_norm": 21.173019061399607, "kl": 0.205078125, "learning_rate": 8.060276852987559e-07, "loss": 0.0657, "reward": 1.1857408285140991, "reward_std": 0.24654100835323334, "rewards/accuracy_reward_stage2": 0.5763658285140991, "rewards/format_reward_stage1_pointerpad": 0.609375, "scores/accuracy_reward_stage2": 0.609375, "step": 1108 }, { "completion_length": 17.8125, "epoch": 0.1943227615209392, "grad_norm": 14.208974126460554, "kl": 0.045654296875, "learning_rate": 8.058524618889084e-07, "loss": -0.0934, "reward": 1.469606876373291, "reward_std": 0.14302250742912292, "rewards/accuracy_reward_stage2": 0.5164818167686462, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1109 }, { "completion_length": 12.609375, "epoch": 0.19449798493078677, "grad_norm": 20.904985420063163, "kl": 0.2041015625, "learning_rate": 8.056772384790608e-07, "loss": 0.0817, "reward": 1.2993509769439697, "reward_std": 0.12314928323030472, "rewards/accuracy_reward_stage2": 0.5493509769439697, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1110 }, { "completion_length": 9.53125, "epoch": 0.1946732083406343, "grad_norm": 19.75010243636568, "kl": 0.057861328125, "learning_rate": 8.055020150692132e-07, "loss": 0.0231, "reward": 1.7738691568374634, "reward_std": 0.14646458625793457, "rewards/accuracy_reward_stage2": 0.7738690972328186, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1111 }, { "completion_length": 18.140625, "epoch": 0.19484843175048186, "grad_norm": 16.508736731493936, "kl": 0.0849609375, "learning_rate": 8.053267916593656e-07, "loss": -0.0515, "reward": 1.4216002225875854, "reward_std": 0.19240014255046844, "rewards/accuracy_reward_stage2": 0.45285022258758545, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1112 }, { "completion_length": 9.75, "epoch": 0.1950236551603294, "grad_norm": 19.065605788174146, "kl": 0.115234375, "learning_rate": 8.051515682495181e-07, "loss": -0.106, "reward": 1.5236477851867676, "reward_std": 0.2754653990268707, "rewards/accuracy_reward_stage2": 0.7111477851867676, "rewards/format_reward_stage1_pointerpad": 0.8125, "scores/accuracy_reward_stage2": 0.8125, "step": 1113 }, { "completion_length": 9.65625, "epoch": 0.19519887857017698, "grad_norm": 18.696100390969345, "kl": 0.061279296875, "learning_rate": 8.049763448396706e-07, "loss": -0.0196, "reward": 1.6031370162963867, "reward_std": 0.27101612091064453, "rewards/accuracy_reward_stage2": 0.6187620162963867, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1114 }, { "completion_length": 11.3125, "epoch": 0.19537410198002453, "grad_norm": 16.51187072505447, "kl": 0.046142578125, "learning_rate": 8.04801121429823e-07, "loss": -0.0233, "reward": 1.6047179698944092, "reward_std": 0.10883722454309464, "rewards/accuracy_reward_stage2": 0.8703429102897644, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1115 }, { "completion_length": 9.578125, "epoch": 0.19554932538987208, "grad_norm": 13.640423112555817, "kl": 0.040283203125, "learning_rate": 8.046258980199754e-07, "loss": -0.0718, "reward": 1.6661803722381592, "reward_std": 0.14868390560150146, "rewards/accuracy_reward_stage2": 0.6974303722381592, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1116 }, { "completion_length": 10.15625, "epoch": 0.19572454879971965, "grad_norm": 22.98236965229761, "kl": 0.1357421875, "learning_rate": 8.044506746101279e-07, "loss": 0.0115, "reward": 1.5336987972259521, "reward_std": 0.22573234140872955, "rewards/accuracy_reward_stage2": 0.5493239164352417, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1117 }, { "completion_length": 8.515625, "epoch": 0.1958997722095672, "grad_norm": 15.808069400903483, "kl": 0.0693359375, "learning_rate": 8.042754512002803e-07, "loss": 0.0277, "reward": 1.6645541191101074, "reward_std": 0.18062806129455566, "rewards/accuracy_reward_stage2": 0.6645541787147522, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1118 }, { "completion_length": 18.234375, "epoch": 0.19607499561941474, "grad_norm": 18.34917213634432, "kl": 0.046875, "learning_rate": 8.041002277904328e-07, "loss": -0.0234, "reward": 1.5347862243652344, "reward_std": 0.2595851719379425, "rewards/accuracy_reward_stage2": 0.5504111051559448, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1119 }, { "completion_length": 10.296875, "epoch": 0.19625021902926232, "grad_norm": 39.987764020386464, "kl": 0.2099609375, "learning_rate": 8.039250043805851e-07, "loss": 0.0216, "reward": 1.5678634643554688, "reward_std": 0.2782178819179535, "rewards/accuracy_reward_stage2": 0.5991134643554688, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1120 }, { "completion_length": 15.1875, "epoch": 0.19642544243910987, "grad_norm": 8.103500483325902, "kl": 0.00994873046875, "learning_rate": 8.037497809707376e-07, "loss": 0.004, "reward": 1.453125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward_stage2": 0.453125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1121 }, { "completion_length": 9.9375, "epoch": 0.1966006658489574, "grad_norm": 21.186066478652183, "kl": 0.296875, "learning_rate": 8.035745575608901e-07, "loss": 0.1183, "reward": 1.4764658212661743, "reward_std": 0.217693030834198, "rewards/accuracy_reward_stage2": 0.8514657020568848, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1122 }, { "completion_length": 10.5625, "epoch": 0.196775889258805, "grad_norm": 14.332701954367625, "kl": 0.024169921875, "learning_rate": 8.033993341510425e-07, "loss": 0.0096, "reward": 1.5120658874511719, "reward_std": 0.1503337323665619, "rewards/accuracy_reward_stage2": 0.5120658874511719, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1123 }, { "completion_length": 5.0625, "epoch": 0.19695111266865253, "grad_norm": 14.324934707265692, "kl": 0.021728515625, "learning_rate": 8.03224110741195e-07, "loss": 0.0087, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward_stage2": 0.8125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1124 }, { "completion_length": 10.390625, "epoch": 0.19712633607850008, "grad_norm": 16.529729841606866, "kl": 0.08056640625, "learning_rate": 8.030488873313475e-07, "loss": -0.0786, "reward": 1.5661100149154663, "reward_std": 0.1992965042591095, "rewards/accuracy_reward_stage2": 0.6129850149154663, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1125 }, { "completion_length": 7.421875, "epoch": 0.19730155948834766, "grad_norm": 17.60318401877113, "kl": 0.0654296875, "learning_rate": 8.028736639214999e-07, "loss": 0.0263, "reward": 1.5065890550613403, "reward_std": 0.19483307003974915, "rewards/accuracy_reward_stage2": 0.5065890550613403, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1126 }, { "completion_length": 10.359375, "epoch": 0.1974767828981952, "grad_norm": 13.272451524780983, "kl": 0.083984375, "learning_rate": 8.026984405116524e-07, "loss": -0.0231, "reward": 1.5966994762420654, "reward_std": 0.16520971059799194, "rewards/accuracy_reward_stage2": 0.6279494762420654, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1127 }, { "completion_length": 8.453125, "epoch": 0.19765200630804275, "grad_norm": 19.345697034013444, "kl": 0.046875, "learning_rate": 8.025232171018048e-07, "loss": -0.0255, "reward": 1.320874810218811, "reward_std": 0.1730649173259735, "rewards/accuracy_reward_stage2": 0.33649978041648865, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1128 }, { "completion_length": 10.71875, "epoch": 0.19782722971789032, "grad_norm": 22.72010394636336, "kl": 0.12158203125, "learning_rate": 8.023479936919572e-07, "loss": -0.0043, "reward": 1.314929723739624, "reward_std": 0.2263418585062027, "rewards/accuracy_reward_stage2": 0.596179723739624, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1129 }, { "completion_length": 12.515625, "epoch": 0.19800245312773787, "grad_norm": 21.640102442615657, "kl": 0.212890625, "learning_rate": 8.021727702821096e-07, "loss": -0.0032, "reward": 1.4405428171157837, "reward_std": 0.33114007115364075, "rewards/accuracy_reward_stage2": 0.7217926979064941, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1130 }, { "completion_length": 8.921875, "epoch": 0.19817767653758542, "grad_norm": 16.96837016642232, "kl": 0.049072265625, "learning_rate": 8.01997546872262e-07, "loss": 0.0196, "reward": 1.5607819557189941, "reward_std": 0.2285585105419159, "rewards/accuracy_reward_stage2": 0.6857819557189941, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1131 }, { "completion_length": 11.03125, "epoch": 0.19835289994743296, "grad_norm": 20.74462179618685, "kl": 0.08642578125, "learning_rate": 8.018223234624145e-07, "loss": 0.0057, "reward": 1.6989161968231201, "reward_std": 0.18676617741584778, "rewards/accuracy_reward_stage2": 0.7145411968231201, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1132 }, { "completion_length": 9.90625, "epoch": 0.19852812335728054, "grad_norm": 22.856482124259504, "kl": 0.1015625, "learning_rate": 8.01647100052567e-07, "loss": -0.0499, "reward": 1.6860418319702148, "reward_std": 0.2933948040008545, "rewards/accuracy_reward_stage2": 0.7329168319702148, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1133 }, { "completion_length": 9.578125, "epoch": 0.1987033467671281, "grad_norm": 10.13055659742841, "kl": 0.2255859375, "learning_rate": 8.014718766427194e-07, "loss": 0.0901, "reward": 1.4931246042251587, "reward_std": 0.06938232481479645, "rewards/accuracy_reward_stage2": 0.7431246042251587, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1134 }, { "completion_length": 13.234375, "epoch": 0.19887857017697563, "grad_norm": 21.296227271607425, "kl": 0.06591796875, "learning_rate": 8.012966532328719e-07, "loss": 0.0263, "reward": 1.3034307956695557, "reward_std": 0.2627410888671875, "rewards/accuracy_reward_stage2": 0.42843079566955566, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1135 }, { "completion_length": 11.03125, "epoch": 0.1990537935868232, "grad_norm": 19.25565519002808, "kl": 0.1337890625, "learning_rate": 8.011214298230243e-07, "loss": 0.0093, "reward": 1.3711471557617188, "reward_std": 0.26397189497947693, "rewards/accuracy_reward_stage2": 0.5117720365524292, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1136 }, { "completion_length": 12.6875, "epoch": 0.19922901699667075, "grad_norm": 16.864201753153427, "kl": 0.0703125, "learning_rate": 8.009462064131768e-07, "loss": -0.0496, "reward": 1.494227647781372, "reward_std": 0.22331348061561584, "rewards/accuracy_reward_stage2": 0.5254777073860168, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1137 }, { "completion_length": 24.53125, "epoch": 0.1994042404065183, "grad_norm": 22.756765542105306, "kl": 0.0283203125, "learning_rate": 8.007709830033293e-07, "loss": -0.0079, "reward": 1.4776358604431152, "reward_std": 0.29558607935905457, "rewards/accuracy_reward_stage2": 0.49326080083847046, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1138 }, { "completion_length": 9.28125, "epoch": 0.19957946381636588, "grad_norm": 23.208713725384428, "kl": 0.0986328125, "learning_rate": 8.005957595934817e-07, "loss": -0.0951, "reward": 1.4900455474853516, "reward_std": 0.29236066341400146, "rewards/accuracy_reward_stage2": 0.5525454878807068, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 1139 }, { "completion_length": 8.265625, "epoch": 0.19975468722621342, "grad_norm": 17.131297352821566, "kl": 0.0634765625, "learning_rate": 8.004205361836342e-07, "loss": 0.0253, "reward": 1.5214886665344238, "reward_std": 0.21488597989082336, "rewards/accuracy_reward_stage2": 0.5214886665344238, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1140 }, { "completion_length": 14.359375, "epoch": 0.19992991063606097, "grad_norm": 20.921675496913988, "kl": 0.1806640625, "learning_rate": 8.002453127737866e-07, "loss": -0.0048, "reward": 1.1669869422912598, "reward_std": 0.22576582431793213, "rewards/accuracy_reward_stage2": 0.4482370615005493, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1141 }, { "completion_length": 10.484375, "epoch": 0.20010513404590854, "grad_norm": 20.421989513086785, "kl": 0.051025390625, "learning_rate": 8.000700893639389e-07, "loss": 0.0204, "reward": 1.701317310333252, "reward_std": 0.13738197088241577, "rewards/accuracy_reward_stage2": 0.7013173699378967, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1142 }, { "completion_length": 8.890625, "epoch": 0.2002803574557561, "grad_norm": 11.718815490151254, "kl": 0.034423828125, "learning_rate": 7.998948659540914e-07, "loss": -0.0276, "reward": 1.5396586656570435, "reward_std": 0.09947885572910309, "rewards/accuracy_reward_stage2": 0.5552836656570435, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1143 }, { "completion_length": 10.265625, "epoch": 0.20045558086560364, "grad_norm": 18.279605016875873, "kl": 0.126953125, "learning_rate": 7.997196425442438e-07, "loss": 0.051, "reward": 1.6181336641311646, "reward_std": 0.15057526528835297, "rewards/accuracy_reward_stage2": 0.7431336641311646, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1144 }, { "completion_length": 8.828125, "epoch": 0.2006308042754512, "grad_norm": 26.921773092556197, "kl": 0.25, "learning_rate": 7.995444191343963e-07, "loss": 0.1002, "reward": 1.542344570159912, "reward_std": 0.2922079563140869, "rewards/accuracy_reward_stage2": 0.5423445105552673, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1145 }, { "completion_length": 9.140625, "epoch": 0.20080602768529876, "grad_norm": 19.701859450137892, "kl": 0.111328125, "learning_rate": 7.993691957245488e-07, "loss": -0.0622, "reward": 1.340492606163025, "reward_std": 0.24292413890361786, "rewards/accuracy_reward_stage2": 0.4029926657676697, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 1146 }, { "completion_length": 9.890625, "epoch": 0.2009812510951463, "grad_norm": 17.120098765809743, "kl": 0.0233154296875, "learning_rate": 7.991939723147012e-07, "loss": 0.0093, "reward": 1.623031497001648, "reward_std": 0.15811499953269958, "rewards/accuracy_reward_stage2": 0.623031497001648, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1147 }, { "completion_length": 10.0625, "epoch": 0.20115647450499385, "grad_norm": 16.414510733056197, "kl": 0.0194091796875, "learning_rate": 7.990187489048537e-07, "loss": -0.0364, "reward": 1.7732515335083008, "reward_std": 0.17782685160636902, "rewards/accuracy_reward_stage2": 0.788876473903656, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1148 }, { "completion_length": 11.65625, "epoch": 0.20133169791484143, "grad_norm": 21.158467097461774, "kl": 0.107421875, "learning_rate": 7.988435254950062e-07, "loss": -0.0013, "reward": 1.5274202823638916, "reward_std": 0.2117118239402771, "rewards/accuracy_reward_stage2": 0.5430452823638916, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1149 }, { "completion_length": 8.703125, "epoch": 0.20150692132468898, "grad_norm": 21.15810096062497, "kl": 0.11474609375, "learning_rate": 7.986683020851585e-07, "loss": 0.046, "reward": 1.2723078727722168, "reward_std": 0.282155305147171, "rewards/accuracy_reward_stage2": 0.3973078727722168, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1150 }, { "completion_length": 10.46875, "epoch": 0.20168214473453652, "grad_norm": 19.455305358264336, "kl": 0.0615234375, "learning_rate": 7.98493078675311e-07, "loss": 0.0246, "reward": 1.549159049987793, "reward_std": 0.1756734549999237, "rewards/accuracy_reward_stage2": 0.549159049987793, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1151 }, { "completion_length": 29.0, "epoch": 0.2018573681443841, "grad_norm": 18.905730682719934, "kl": 0.1669921875, "learning_rate": 7.983178552654634e-07, "loss": 0.0669, "reward": 1.4018785953521729, "reward_std": 0.14914950728416443, "rewards/accuracy_reward_stage2": 0.5268786549568176, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1152 }, { "completion_length": 12.53125, "epoch": 0.20203259155423164, "grad_norm": 26.675549604761773, "kl": 0.181640625, "learning_rate": 7.981426318556159e-07, "loss": 0.0728, "reward": 1.3172643184661865, "reward_std": 0.2368382215499878, "rewards/accuracy_reward_stage2": 0.5672642588615417, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1153 }, { "completion_length": 20.140625, "epoch": 0.2022078149640792, "grad_norm": 22.721907098494817, "kl": 0.171875, "learning_rate": 7.979674084457683e-07, "loss": 0.0687, "reward": 1.3109116554260254, "reward_std": 0.16556920111179352, "rewards/accuracy_reward_stage2": 0.43591174483299255, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1154 }, { "completion_length": 11.921875, "epoch": 0.20238303837392677, "grad_norm": 20.857879895249667, "kl": 0.1240234375, "learning_rate": 7.977921850359207e-07, "loss": -0.0352, "reward": 1.4702496528625488, "reward_std": 0.24368052184581757, "rewards/accuracy_reward_stage2": 0.5014996528625488, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1155 }, { "completion_length": 12.453125, "epoch": 0.2025582617837743, "grad_norm": 20.071391682574166, "kl": 0.0458984375, "learning_rate": 7.976169616260732e-07, "loss": 0.0183, "reward": 1.1320414543151855, "reward_std": 0.1970067322254181, "rewards/accuracy_reward_stage2": 0.25704148411750793, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1156 }, { "completion_length": 11.859375, "epoch": 0.20273348519362186, "grad_norm": 29.57065952608139, "kl": 0.06201171875, "learning_rate": 7.974417382162256e-07, "loss": -0.1075, "reward": 1.6288851499557495, "reward_std": 0.263146311044693, "rewards/accuracy_reward_stage2": 0.6757600903511047, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1157 }, { "completion_length": 11.078125, "epoch": 0.20290870860346943, "grad_norm": 20.549246695365362, "kl": 0.07177734375, "learning_rate": 7.972665148063781e-07, "loss": 0.0287, "reward": 1.6410633325576782, "reward_std": 0.2141045480966568, "rewards/accuracy_reward_stage2": 0.6410633325576782, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1158 }, { "completion_length": 10.640625, "epoch": 0.20308393201331698, "grad_norm": 18.21590409691188, "kl": 0.07861328125, "learning_rate": 7.970912913965306e-07, "loss": 0.0315, "reward": 1.7376164197921753, "reward_std": 0.146676704287529, "rewards/accuracy_reward_stage2": 0.7376164793968201, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1159 }, { "completion_length": 12.03125, "epoch": 0.20325915542316453, "grad_norm": 16.926699738446533, "kl": 0.024658203125, "learning_rate": 7.969160679866829e-07, "loss": -0.0343, "reward": 1.622064471244812, "reward_std": 0.19049572944641113, "rewards/accuracy_reward_stage2": 0.6376894116401672, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1160 }, { "completion_length": 12.203125, "epoch": 0.2034343788330121, "grad_norm": 17.57185430695999, "kl": 0.0400390625, "learning_rate": 7.967408445768354e-07, "loss": 0.0159, "reward": 1.5260417461395264, "reward_std": 0.2976905405521393, "rewards/accuracy_reward_stage2": 0.5260416269302368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1161 }, { "completion_length": 15.5625, "epoch": 0.20360960224285965, "grad_norm": 20.624347335706137, "kl": 0.1328125, "learning_rate": 7.965656211669879e-07, "loss": 0.053, "reward": 1.6277599334716797, "reward_std": 0.20827616751194, "rewards/accuracy_reward_stage2": 0.6277599334716797, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1162 }, { "completion_length": 8.734375, "epoch": 0.2037848256527072, "grad_norm": 20.888572067895698, "kl": 0.09912109375, "learning_rate": 7.963903977571403e-07, "loss": 0.0396, "reward": 1.7622499465942383, "reward_std": 0.2863914966583252, "rewards/accuracy_reward_stage2": 0.7622500658035278, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1163 }, { "completion_length": 15.09375, "epoch": 0.20396004906255474, "grad_norm": 17.51885785997261, "kl": 0.068359375, "learning_rate": 7.962151743472928e-07, "loss": 0.0272, "reward": 1.4528738260269165, "reward_std": 0.17152443528175354, "rewards/accuracy_reward_stage2": 0.5778738260269165, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1164 }, { "completion_length": 12.234375, "epoch": 0.20413527247240232, "grad_norm": 257.1767706508973, "kl": 1.2421875, "learning_rate": 7.960399509374453e-07, "loss": 0.4498, "reward": 1.5711274147033691, "reward_std": 0.19762010872364044, "rewards/accuracy_reward_stage2": 0.7117522954940796, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1165 }, { "completion_length": 9.75, "epoch": 0.20431049588224987, "grad_norm": 23.04372942522331, "kl": 0.20703125, "learning_rate": 7.958647275275977e-07, "loss": 0.0419, "reward": 1.4802830219268799, "reward_std": 0.21664901077747345, "rewards/accuracy_reward_stage2": 0.6209080219268799, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1166 }, { "completion_length": 9.0, "epoch": 0.2044857192920974, "grad_norm": 25.731014770808162, "kl": 0.17578125, "learning_rate": 7.956895041177501e-07, "loss": 0.0223, "reward": 1.528602123260498, "reward_std": 0.1948903203010559, "rewards/accuracy_reward_stage2": 0.669227123260498, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1167 }, { "completion_length": 12.0625, "epoch": 0.204660942701945, "grad_norm": 23.776557724679314, "kl": 0.044677734375, "learning_rate": 7.955142807079025e-07, "loss": 0.0178, "reward": 1.709090232849121, "reward_std": 0.1950330287218094, "rewards/accuracy_reward_stage2": 0.7090902328491211, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1168 }, { "completion_length": 9.765625, "epoch": 0.20483616611179253, "grad_norm": 15.919983081141853, "kl": 0.032470703125, "learning_rate": 7.95339057298055e-07, "loss": 0.013, "reward": 1.5694223642349243, "reward_std": 0.08031494915485382, "rewards/accuracy_reward_stage2": 0.5694223046302795, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1169 }, { "completion_length": 18.828125, "epoch": 0.20501138952164008, "grad_norm": 21.286745727148634, "kl": 0.08154296875, "learning_rate": 7.951638338882074e-07, "loss": 0.0262, "reward": 1.190812110900879, "reward_std": 0.18454615771770477, "rewards/accuracy_reward_stage2": 0.2064370959997177, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1170 }, { "completion_length": 8.125, "epoch": 0.20518661293148766, "grad_norm": 13.777441072502276, "kl": 0.013916015625, "learning_rate": 7.949886104783598e-07, "loss": 0.0056, "reward": 1.5902838706970215, "reward_std": 0.11751788854598999, "rewards/accuracy_reward_stage2": 0.590283989906311, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1171 }, { "completion_length": 6.140625, "epoch": 0.2053618363413352, "grad_norm": 16.392015761201815, "kl": 0.0284423828125, "learning_rate": 7.948133870685123e-07, "loss": 0.0114, "reward": 1.8313522338867188, "reward_std": 0.09155251830816269, "rewards/accuracy_reward_stage2": 0.8313522338867188, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1172 }, { "completion_length": 8.25, "epoch": 0.20553705975118275, "grad_norm": 20.217002311756907, "kl": 0.11962890625, "learning_rate": 7.946381636586647e-07, "loss": 0.0479, "reward": 1.2145620584487915, "reward_std": 0.21878552436828613, "rewards/accuracy_reward_stage2": 0.4645621180534363, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1173 }, { "completion_length": 11.671875, "epoch": 0.20571228316103032, "grad_norm": 16.081412388073563, "kl": 0.038330078125, "learning_rate": 7.944629402488172e-07, "loss": 0.0153, "reward": 1.54155695438385, "reward_std": 0.1580614596605301, "rewards/accuracy_reward_stage2": 0.5415569543838501, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1174 }, { "completion_length": 10.984375, "epoch": 0.20588750657087787, "grad_norm": 22.670026601608498, "kl": 0.024658203125, "learning_rate": 7.942877168389697e-07, "loss": 0.0099, "reward": 1.6144170761108398, "reward_std": 0.2522222399711609, "rewards/accuracy_reward_stage2": 0.6144170761108398, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1175 }, { "completion_length": 8.3125, "epoch": 0.20606272998072542, "grad_norm": 17.841750146010423, "kl": 0.06396484375, "learning_rate": 7.941124934291221e-07, "loss": 0.0255, "reward": 1.8633146286010742, "reward_std": 0.21237066388130188, "rewards/accuracy_reward_stage2": 0.8633145689964294, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1176 }, { "completion_length": 13.203125, "epoch": 0.206237953390573, "grad_norm": 19.983949385651133, "kl": 0.1552734375, "learning_rate": 7.939372700192746e-07, "loss": 0.0684, "reward": 1.2808198928833008, "reward_std": 0.039055947214365005, "rewards/accuracy_reward_stage2": 0.405819833278656, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1177 }, { "completion_length": 8.46875, "epoch": 0.20641317680042054, "grad_norm": 19.892008522113688, "kl": 0.052001953125, "learning_rate": 7.937620466094271e-07, "loss": 0.0208, "reward": 1.7783706188201904, "reward_std": 0.18616268038749695, "rewards/accuracy_reward_stage2": 0.7783706188201904, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1178 }, { "completion_length": 11.15625, "epoch": 0.20658840021026809, "grad_norm": 14.320012639942059, "kl": 0.296875, "learning_rate": 7.935868231995795e-07, "loss": 0.0851, "reward": 1.4547843933105469, "reward_std": 0.12215742468833923, "rewards/accuracy_reward_stage2": 0.5954092741012573, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1179 }, { "completion_length": 13.421875, "epoch": 0.20676362362011566, "grad_norm": 22.20258067675751, "kl": 0.0771484375, "learning_rate": 7.934115997897318e-07, "loss": 0.0308, "reward": 1.740135908126831, "reward_std": 0.22515861690044403, "rewards/accuracy_reward_stage2": 0.7401360273361206, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1180 }, { "completion_length": 6.859375, "epoch": 0.2069388470299632, "grad_norm": 12.56155836903543, "kl": 0.048095703125, "learning_rate": 7.932363763798842e-07, "loss": 0.0129, "reward": 1.2953832149505615, "reward_std": 0.1523396372795105, "rewards/accuracy_reward_stage2": 0.4203832149505615, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1181 }, { "completion_length": 8.40625, "epoch": 0.20711407043981075, "grad_norm": 24.90537422362036, "kl": 0.1455078125, "learning_rate": 7.930611529700367e-07, "loss": 0.0302, "reward": 1.7267037630081177, "reward_std": 0.26153823733329773, "rewards/accuracy_reward_stage2": 0.7423287630081177, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1182 }, { "completion_length": 12.484375, "epoch": 0.2072892938496583, "grad_norm": 21.687780028830804, "kl": 0.046630859375, "learning_rate": 7.928859295601892e-07, "loss": 0.0187, "reward": 1.5232006311416626, "reward_std": 0.20009201765060425, "rewards/accuracy_reward_stage2": 0.5232006311416626, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1183 }, { "completion_length": 14.703125, "epoch": 0.20746451725950588, "grad_norm": 29.365889674879565, "kl": 0.057861328125, "learning_rate": 7.927107061503416e-07, "loss": -0.0271, "reward": 1.4428789615631104, "reward_std": 0.24434049427509308, "rewards/accuracy_reward_stage2": 0.5991290807723999, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1184 }, { "completion_length": 13.59375, "epoch": 0.20763974066935342, "grad_norm": 18.703243296211422, "kl": 0.01495361328125, "learning_rate": 7.925354827404941e-07, "loss": 0.006, "reward": 1.7748501300811768, "reward_std": 0.16142131388187408, "rewards/accuracy_reward_stage2": 0.7748501300811768, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1185 }, { "completion_length": 8.0625, "epoch": 0.20781496407920097, "grad_norm": 14.896985352193926, "kl": 0.052001953125, "learning_rate": 7.923602593306466e-07, "loss": 0.0208, "reward": 1.5153526067733765, "reward_std": 0.23511351644992828, "rewards/accuracy_reward_stage2": 0.5153526067733765, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1186 }, { "completion_length": 9.828125, "epoch": 0.20799018748904854, "grad_norm": 15.184774328423192, "kl": 0.060546875, "learning_rate": 7.92185035920799e-07, "loss": 0.0242, "reward": 1.5610758066177368, "reward_std": 0.2549077570438385, "rewards/accuracy_reward_stage2": 0.686075747013092, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1187 }, { "completion_length": 12.5, "epoch": 0.2081654108988961, "grad_norm": 28.950774793478022, "kl": 0.052001953125, "learning_rate": 7.920098125109515e-07, "loss": 0.0208, "reward": 1.5520353317260742, "reward_std": 0.3283025622367859, "rewards/accuracy_reward_stage2": 0.6770353317260742, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1188 }, { "completion_length": 15.640625, "epoch": 0.20834063430874364, "grad_norm": 20.61263740578102, "kl": 0.059814453125, "learning_rate": 7.918345891011039e-07, "loss": -0.0683, "reward": 1.24334716796875, "reward_std": 0.31474292278289795, "rewards/accuracy_reward_stage2": 0.3995971381664276, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1189 }, { "completion_length": 8.890625, "epoch": 0.2085158577185912, "grad_norm": 23.985866977400512, "kl": 0.1953125, "learning_rate": 7.916593656912563e-07, "loss": 0.0932, "reward": 1.5583475828170776, "reward_std": 0.2929634153842926, "rewards/accuracy_reward_stage2": 0.6833474636077881, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1190 }, { "completion_length": 38.8125, "epoch": 0.20869108112843876, "grad_norm": 17.87360965600017, "kl": 0.1220703125, "learning_rate": 7.914841422814088e-07, "loss": 0.0489, "reward": 1.627746820449829, "reward_std": 0.1904383897781372, "rewards/accuracy_reward_stage2": 0.6277468204498291, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1191 }, { "completion_length": 10.953125, "epoch": 0.2088663045382863, "grad_norm": 18.58192828258147, "kl": 0.05712890625, "learning_rate": 7.913089188715612e-07, "loss": -0.0061, "reward": 1.4184027910232544, "reward_std": 0.2182597517967224, "rewards/accuracy_reward_stage2": 0.434027761220932, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1192 }, { "completion_length": 7.671875, "epoch": 0.20904152794813388, "grad_norm": 1.353597979884702, "kl": 0.025634765625, "learning_rate": 7.911336954617136e-07, "loss": 0.0103, "reward": 1.597916603088379, "reward_std": 0.0, "rewards/accuracy_reward_stage2": 0.5979166626930237, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1193 }, { "completion_length": 15.390625, "epoch": 0.20921675135798143, "grad_norm": 16.06473391090549, "kl": 0.046875, "learning_rate": 7.909584720518661e-07, "loss": 0.0188, "reward": 1.383378505706787, "reward_std": 0.21060232818126678, "rewards/accuracy_reward_stage2": 0.3833785355091095, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1194 }, { "completion_length": 10.828125, "epoch": 0.20939197476782898, "grad_norm": 14.175264395757077, "kl": 0.11279296875, "learning_rate": 7.907832486420185e-07, "loss": 0.045, "reward": 1.3746671676635742, "reward_std": 0.1269018054008484, "rewards/accuracy_reward_stage2": 0.49966704845428467, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1195 }, { "completion_length": 9.578125, "epoch": 0.20956719817767655, "grad_norm": 22.205500444522485, "kl": 0.107421875, "learning_rate": 7.90608025232171e-07, "loss": 0.0431, "reward": 1.5016958713531494, "reward_std": 0.23775088787078857, "rewards/accuracy_reward_stage2": 0.5016958117485046, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1196 }, { "completion_length": 9.765625, "epoch": 0.2097424215875241, "grad_norm": 18.843440652713703, "kl": 0.232421875, "learning_rate": 7.904328018223234e-07, "loss": 0.0927, "reward": 1.6348192691802979, "reward_std": 0.14728645980358124, "rewards/accuracy_reward_stage2": 0.7598193287849426, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1197 }, { "completion_length": 17.21875, "epoch": 0.20991764499737164, "grad_norm": 25.964716086110375, "kl": 0.1455078125, "learning_rate": 7.902575784124759e-07, "loss": 0.0583, "reward": 1.3283579349517822, "reward_std": 0.19651073217391968, "rewards/accuracy_reward_stage2": 0.45335784554481506, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1198 }, { "completion_length": 8.6875, "epoch": 0.2100928684072192, "grad_norm": 17.365380157487795, "kl": 0.043701171875, "learning_rate": 7.900823550026284e-07, "loss": 0.0175, "reward": 1.6050076484680176, "reward_std": 0.16576728224754333, "rewards/accuracy_reward_stage2": 0.6050077080726624, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1199 }, { "completion_length": 7.40625, "epoch": 0.21026809181706677, "grad_norm": 17.84908015245948, "kl": 0.046875, "learning_rate": 7.899071315927807e-07, "loss": 0.0188, "reward": 1.2840076684951782, "reward_std": 0.17957797646522522, "rewards/accuracy_reward_stage2": 0.40900763869285583, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1200 }, { "completion_length": 11.421875, "epoch": 0.2104433152269143, "grad_norm": 22.669593862703703, "kl": 0.039306640625, "learning_rate": 7.897319081829332e-07, "loss": -0.0284, "reward": 1.7310082912445068, "reward_std": 0.16316458582878113, "rewards/accuracy_reward_stage2": 0.7466332912445068, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1201 }, { "completion_length": 9.828125, "epoch": 0.21061853863676186, "grad_norm": 30.363784239849362, "kl": 0.1806640625, "learning_rate": 7.895566847730857e-07, "loss": 0.0535, "reward": 1.4026780128479004, "reward_std": 0.15883180499076843, "rewards/accuracy_reward_stage2": 0.4183030128479004, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1202 }, { "completion_length": 6.46875, "epoch": 0.21079376204660943, "grad_norm": 16.30063889671874, "kl": 0.027587890625, "learning_rate": 7.893814613632381e-07, "loss": -0.0331, "reward": 1.6259620189666748, "reward_std": 0.1683189868927002, "rewards/accuracy_reward_stage2": 0.64158695936203, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1203 }, { "completion_length": 9.59375, "epoch": 0.21096898545645698, "grad_norm": 16.699157121205253, "kl": 0.06298828125, "learning_rate": 7.892062379533906e-07, "loss": 0.0158, "reward": 1.4885568618774414, "reward_std": 0.12854987382888794, "rewards/accuracy_reward_stage2": 0.6135568618774414, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1204 }, { "completion_length": 8.9375, "epoch": 0.21114420886630453, "grad_norm": 14.78587528185712, "kl": 0.04296875, "learning_rate": 7.890310145435429e-07, "loss": 0.0172, "reward": 1.6338293552398682, "reward_std": 0.12834054231643677, "rewards/accuracy_reward_stage2": 0.7588293552398682, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1205 }, { "completion_length": 12.3125, "epoch": 0.2113194322761521, "grad_norm": 19.546222852809866, "kl": 0.060791015625, "learning_rate": 7.888557911336954e-07, "loss": 0.0244, "reward": 1.721550464630127, "reward_std": 0.17753317952156067, "rewards/accuracy_reward_stage2": 0.7215505242347717, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1206 }, { "completion_length": 7.703125, "epoch": 0.21149465568599965, "grad_norm": 20.43738576748487, "kl": 0.091796875, "learning_rate": 7.886805677238479e-07, "loss": 0.0054, "reward": 1.551939606666565, "reward_std": 0.2607957422733307, "rewards/accuracy_reward_stage2": 0.5675646066665649, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1207 }, { "completion_length": 11.953125, "epoch": 0.2116698790958472, "grad_norm": 17.672123595755643, "kl": 0.169921875, "learning_rate": 7.885053443140003e-07, "loss": 0.0312, "reward": 1.431882381439209, "reward_std": 0.2033139318227768, "rewards/accuracy_reward_stage2": 0.5725074410438538, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1208 }, { "completion_length": 11.28125, "epoch": 0.21184510250569477, "grad_norm": 17.456375274733755, "kl": 0.12255859375, "learning_rate": 7.883301209041528e-07, "loss": -0.0308, "reward": 1.4818658828735352, "reward_std": 0.2782401740550995, "rewards/accuracy_reward_stage2": 0.5131158828735352, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1209 }, { "completion_length": 22.578125, "epoch": 0.21202032591554232, "grad_norm": 19.43033143294077, "kl": 0.111328125, "learning_rate": 7.881548974943052e-07, "loss": 0.0113, "reward": 1.272879719734192, "reward_std": 0.1519315093755722, "rewards/accuracy_reward_stage2": 0.4135046601295471, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1210 }, { "completion_length": 10.390625, "epoch": 0.21219554932538987, "grad_norm": 21.513740888250048, "kl": 0.068359375, "learning_rate": 7.879796740844576e-07, "loss": 0.0274, "reward": 1.415919542312622, "reward_std": 0.20253877341747284, "rewards/accuracy_reward_stage2": 0.41591957211494446, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1211 }, { "completion_length": 7.140625, "epoch": 0.21237077273523744, "grad_norm": 18.76383666387577, "kl": 0.0181884765625, "learning_rate": 7.878044506746101e-07, "loss": 0.0073, "reward": 1.9206148386001587, "reward_std": 0.16348612308502197, "rewards/accuracy_reward_stage2": 0.9206147789955139, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1212 }, { "completion_length": 16.046875, "epoch": 0.212545996145085, "grad_norm": 5.082339464017986, "kl": 0.015625, "learning_rate": 7.876292272647625e-07, "loss": 0.0062, "reward": 1.40625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward_stage2": 0.40625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1213 }, { "completion_length": 11.53125, "epoch": 0.21272121955493253, "grad_norm": 16.474658851171878, "kl": 0.017822265625, "learning_rate": 7.87454003854915e-07, "loss": 0.0071, "reward": 1.4888964891433716, "reward_std": 0.23521284759044647, "rewards/accuracy_reward_stage2": 0.48889651894569397, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1214 }, { "completion_length": 14.90625, "epoch": 0.2128964429647801, "grad_norm": 17.607668248281357, "kl": 0.007659912109375, "learning_rate": 7.872787804450675e-07, "loss": -0.0218, "reward": 1.5345828533172607, "reward_std": 0.16702640056610107, "rewards/accuracy_reward_stage2": 0.5502078533172607, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1215 }, { "completion_length": 15.578125, "epoch": 0.21307166637462766, "grad_norm": 17.512366393353393, "kl": 0.078125, "learning_rate": 7.871035570352199e-07, "loss": -0.013, "reward": 1.507341980934143, "reward_std": 0.10357113182544708, "rewards/accuracy_reward_stage2": 0.5229669809341431, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1216 }, { "completion_length": 11.40625, "epoch": 0.2132468897844752, "grad_norm": 17.65170516624198, "kl": 0.056640625, "learning_rate": 7.869283336253724e-07, "loss": 0.0226, "reward": 1.2889139652252197, "reward_std": 0.12118306756019592, "rewards/accuracy_reward_stage2": 0.2889139652252197, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1217 }, { "completion_length": 9.5625, "epoch": 0.21342211319432275, "grad_norm": 22.66795441655081, "kl": 0.1474609375, "learning_rate": 7.867531102155247e-07, "loss": 0.059, "reward": 1.6077473163604736, "reward_std": 0.1633305847644806, "rewards/accuracy_reward_stage2": 0.6077473759651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1218 }, { "completion_length": 9.359375, "epoch": 0.21359733660417032, "grad_norm": 14.421986062058423, "kl": 0.04833984375, "learning_rate": 7.865778868056771e-07, "loss": 0.0193, "reward": 1.2509424686431885, "reward_std": 0.1293540596961975, "rewards/accuracy_reward_stage2": 0.5009424686431885, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1219 }, { "completion_length": 8.921875, "epoch": 0.21377256001401787, "grad_norm": 17.1096991822818, "kl": 0.07568359375, "learning_rate": 7.864026633958296e-07, "loss": 0.0302, "reward": 1.1151213645935059, "reward_std": 0.13514229655265808, "rewards/accuracy_reward_stage2": 0.11512142419815063, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1220 }, { "completion_length": 27.3125, "epoch": 0.21394778342386542, "grad_norm": 16.857618479926604, "kl": 0.025146484375, "learning_rate": 7.86227439985982e-07, "loss": 0.0101, "reward": 1.5993309020996094, "reward_std": 0.13720698654651642, "rewards/accuracy_reward_stage2": 0.5993307828903198, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1221 }, { "completion_length": 8.5, "epoch": 0.214123006833713, "grad_norm": 18.28127660466508, "kl": 0.0245361328125, "learning_rate": 7.860522165761345e-07, "loss": 0.0098, "reward": 1.831869125366211, "reward_std": 0.20181122422218323, "rewards/accuracy_reward_stage2": 0.8318691253662109, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1222 }, { "completion_length": 12.78125, "epoch": 0.21429823024356054, "grad_norm": 14.07413822583387, "kl": 0.06494140625, "learning_rate": 7.85876993166287e-07, "loss": -0.0619, "reward": 1.5926318168640137, "reward_std": 0.21398116648197174, "rewards/accuracy_reward_stage2": 0.6238818168640137, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1223 }, { "completion_length": 10.984375, "epoch": 0.21447345365340809, "grad_norm": 20.191094882292095, "kl": 0.1181640625, "learning_rate": 7.857017697564394e-07, "loss": 0.0067, "reward": 1.5521167516708374, "reward_std": 0.1629626452922821, "rewards/accuracy_reward_stage2": 0.5677417516708374, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1224 }, { "completion_length": 8.5, "epoch": 0.21464867706325566, "grad_norm": 19.044635303856023, "kl": 0.06591796875, "learning_rate": 7.855265463465919e-07, "loss": -0.0304, "reward": 1.675663709640503, "reward_std": 0.2704814374446869, "rewards/accuracy_reward_stage2": 0.7069137096405029, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1225 }, { "completion_length": 9.546875, "epoch": 0.2148239004731032, "grad_norm": 12.892689964479764, "kl": 0.205078125, "learning_rate": 7.853513229367444e-07, "loss": -0.0004, "reward": 1.5591697692871094, "reward_std": 0.1387302577495575, "rewards/accuracy_reward_stage2": 0.7154197096824646, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1226 }, { "completion_length": 9.65625, "epoch": 0.21499912388295075, "grad_norm": 24.68468555159018, "kl": 0.203125, "learning_rate": 7.851760995268968e-07, "loss": 0.0811, "reward": 1.470663070678711, "reward_std": 0.3228445053100586, "rewards/accuracy_reward_stage2": 0.5956631302833557, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1227 }, { "completion_length": 10.375, "epoch": 0.21517434729279833, "grad_norm": 19.866727346698354, "kl": 0.05859375, "learning_rate": 7.850008761170493e-07, "loss": 0.0234, "reward": 1.6230573654174805, "reward_std": 0.2438812255859375, "rewards/accuracy_reward_stage2": 0.6230573654174805, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1228 }, { "completion_length": 11.9375, "epoch": 0.21534957070264588, "grad_norm": 323.85482590509264, "kl": 0.58984375, "learning_rate": 7.848256527072016e-07, "loss": 0.1682, "reward": 1.5124356746673584, "reward_std": 0.1736292541027069, "rewards/accuracy_reward_stage2": 0.6686856746673584, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1229 }, { "completion_length": 9.3125, "epoch": 0.21552479411249342, "grad_norm": 18.499353836268497, "kl": 0.064453125, "learning_rate": 7.846504292973541e-07, "loss": -0.0072, "reward": 1.6767133474349976, "reward_std": 0.15823645889759064, "rewards/accuracy_reward_stage2": 0.6923382878303528, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1230 }, { "completion_length": 6.359375, "epoch": 0.215700017522341, "grad_norm": 17.16666874669316, "kl": 0.05322265625, "learning_rate": 7.844752058875065e-07, "loss": 0.0212, "reward": 1.5173816680908203, "reward_std": 0.21086975932121277, "rewards/accuracy_reward_stage2": 0.5173816680908203, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1231 }, { "completion_length": 7.25, "epoch": 0.21587524093218854, "grad_norm": 18.453030271685524, "kl": 0.07421875, "learning_rate": 7.842999824776589e-07, "loss": 0.0296, "reward": 1.5241796970367432, "reward_std": 0.16060179471969604, "rewards/accuracy_reward_stage2": 0.5241796970367432, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1232 }, { "completion_length": 11.09375, "epoch": 0.2160504643420361, "grad_norm": 20.878559833643852, "kl": 0.10986328125, "learning_rate": 7.841247590678114e-07, "loss": 0.0439, "reward": 1.5652143955230713, "reward_std": 0.23995351791381836, "rewards/accuracy_reward_stage2": 0.5652143955230713, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1233 }, { "completion_length": 7.875, "epoch": 0.21622568775188364, "grad_norm": 10.92429529273633, "kl": 0.02197265625, "learning_rate": 7.839495356579638e-07, "loss": 0.0088, "reward": 1.546875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1234 }, { "completion_length": 9.625, "epoch": 0.2164009111617312, "grad_norm": 22.419497200292163, "kl": 0.12451171875, "learning_rate": 7.837743122481163e-07, "loss": 0.0614, "reward": 1.256831407546997, "reward_std": 0.3416307270526886, "rewards/accuracy_reward_stage2": 0.3818313181400299, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1235 }, { "completion_length": 8.53125, "epoch": 0.21657613457157876, "grad_norm": 43.141486161066105, "kl": 0.283203125, "learning_rate": 7.835990888382688e-07, "loss": 0.0824, "reward": 1.4279000759124756, "reward_std": 0.19505921006202698, "rewards/accuracy_reward_stage2": 0.568524956703186, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1236 }, { "completion_length": 20.890625, "epoch": 0.2167513579814263, "grad_norm": 22.738050502586738, "kl": 0.08251953125, "learning_rate": 7.834238654284212e-07, "loss": 0.033, "reward": 1.67447829246521, "reward_std": 0.2933582365512848, "rewards/accuracy_reward_stage2": 0.67447829246521, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1237 }, { "completion_length": 12.828125, "epoch": 0.21692658139127388, "grad_norm": 19.737080524499078, "kl": 0.09912109375, "learning_rate": 7.832486420185737e-07, "loss": 0.0396, "reward": 1.4670445919036865, "reward_std": 0.1632198989391327, "rewards/accuracy_reward_stage2": 0.46704450249671936, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1238 }, { "completion_length": 9.71875, "epoch": 0.21710180480112143, "grad_norm": 18.02312666630985, "kl": 0.1259765625, "learning_rate": 7.830734186087262e-07, "loss": 0.0504, "reward": 1.2427325248718262, "reward_std": 0.1590673327445984, "rewards/accuracy_reward_stage2": 0.49273252487182617, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1239 }, { "completion_length": 11.515625, "epoch": 0.21727702821096898, "grad_norm": 16.867177885985814, "kl": 0.0888671875, "learning_rate": 7.828981951988785e-07, "loss": -0.0482, "reward": 1.6229031085968018, "reward_std": 0.26744771003723145, "rewards/accuracy_reward_stage2": 0.6541531085968018, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1240 }, { "completion_length": 14.78125, "epoch": 0.21745225162081655, "grad_norm": 14.87929735312399, "kl": 0.0242919921875, "learning_rate": 7.82722971789031e-07, "loss": 0.0097, "reward": 1.7083333730697632, "reward_std": 0.10346909612417221, "rewards/accuracy_reward_stage2": 0.7083333134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1241 }, { "completion_length": 8.53125, "epoch": 0.2176274750306641, "grad_norm": 23.08135670217196, "kl": 0.05908203125, "learning_rate": 7.825477483791834e-07, "loss": 0.0236, "reward": 1.829587697982788, "reward_std": 0.12539014220237732, "rewards/accuracy_reward_stage2": 0.8295876979827881, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1242 }, { "completion_length": 9.484375, "epoch": 0.21780269844051164, "grad_norm": 18.350211164053533, "kl": 0.04638671875, "learning_rate": 7.823725249693359e-07, "loss": 0.0186, "reward": 1.3802655935287476, "reward_std": 0.17971576750278473, "rewards/accuracy_reward_stage2": 0.38026559352874756, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1243 }, { "completion_length": 16.0, "epoch": 0.21797792185035922, "grad_norm": 17.128190072983514, "kl": 0.051025390625, "learning_rate": 7.821973015594883e-07, "loss": -0.0085, "reward": 1.6977180242538452, "reward_std": 0.1646936684846878, "rewards/accuracy_reward_stage2": 0.7133429646492004, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1244 }, { "completion_length": 26.921875, "epoch": 0.21815314526020677, "grad_norm": 12.042671829195628, "kl": 0.0634765625, "learning_rate": 7.820220781496407e-07, "loss": -0.1071, "reward": 1.4990663528442383, "reward_std": 0.17997947335243225, "rewards/accuracy_reward_stage2": 0.5459413528442383, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1245 }, { "completion_length": 10.296875, "epoch": 0.2183283686700543, "grad_norm": 21.10255172023999, "kl": 0.09375, "learning_rate": 7.818468547397932e-07, "loss": 0.0374, "reward": 1.6483008861541748, "reward_std": 0.2902987599372864, "rewards/accuracy_reward_stage2": 0.6483009457588196, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1246 }, { "completion_length": 9.03125, "epoch": 0.2185035920799019, "grad_norm": 21.7959643852111, "kl": 0.09033203125, "learning_rate": 7.816716313299457e-07, "loss": -0.0081, "reward": 1.255446434020996, "reward_std": 0.2230614423751831, "rewards/accuracy_reward_stage2": 0.2710713744163513, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1247 }, { "completion_length": 17.5, "epoch": 0.21867881548974943, "grad_norm": 19.42697648439749, "kl": 0.076171875, "learning_rate": 7.814964079200981e-07, "loss": -0.0075, "reward": 1.437554121017456, "reward_std": 0.12493880093097687, "rewards/accuracy_reward_stage2": 0.45317918062210083, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1248 }, { "completion_length": 12.71875, "epoch": 0.21885403889959698, "grad_norm": 17.63240550187539, "kl": 0.057373046875, "learning_rate": 7.813211845102505e-07, "loss": 0.0229, "reward": 1.2602894306182861, "reward_std": 0.11471651494503021, "rewards/accuracy_reward_stage2": 0.38528940081596375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1249 }, { "completion_length": 10.71875, "epoch": 0.21902926230944456, "grad_norm": 20.388080966035275, "kl": 0.06787109375, "learning_rate": 7.811459611004029e-07, "loss": -0.0611, "reward": 1.572366714477539, "reward_std": 0.14886946976184845, "rewards/accuracy_reward_stage2": 0.6036166548728943, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1250 }, { "completion_length": 8.734375, "epoch": 0.2192044857192921, "grad_norm": 13.291214271289837, "kl": 0.0986328125, "learning_rate": 7.809707376905554e-07, "loss": 0.0107, "reward": 1.7387276887893677, "reward_std": 0.16672708094120026, "rewards/accuracy_reward_stage2": 0.7543526887893677, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1251 }, { "completion_length": 13.453125, "epoch": 0.21937970912913965, "grad_norm": 20.069113743487662, "kl": 0.020751953125, "learning_rate": 7.807955142807079e-07, "loss": 0.0083, "reward": 1.3933387994766235, "reward_std": 0.3103262484073639, "rewards/accuracy_reward_stage2": 0.39333879947662354, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1252 }, { "completion_length": 10.015625, "epoch": 0.2195549325389872, "grad_norm": 16.660368209284208, "kl": 0.064453125, "learning_rate": 7.806202908708603e-07, "loss": 0.0257, "reward": 1.4384819269180298, "reward_std": 0.16353179514408112, "rewards/accuracy_reward_stage2": 0.4384819269180298, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1253 }, { "completion_length": 7.375, "epoch": 0.21973015594883477, "grad_norm": 7.523798852671286, "kl": 0.0115966796875, "learning_rate": 7.804450674610128e-07, "loss": 0.0047, "reward": 1.6167200803756714, "reward_std": 0.0578637570142746, "rewards/accuracy_reward_stage2": 0.6167200803756714, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1254 }, { "completion_length": 12.671875, "epoch": 0.21990537935868232, "grad_norm": 14.542038526776427, "kl": 0.044189453125, "learning_rate": 7.802698440511653e-07, "loss": -0.0266, "reward": 1.515625, "reward_std": 0.19044628739356995, "rewards/accuracy_reward_stage2": 0.65625, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1255 }, { "completion_length": 10.8125, "epoch": 0.22008060276852986, "grad_norm": 17.201209817778793, "kl": 0.055419921875, "learning_rate": 7.800946206413176e-07, "loss": 0.0221, "reward": 1.429174780845642, "reward_std": 0.26050835847854614, "rewards/accuracy_reward_stage2": 0.4291748106479645, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1256 }, { "completion_length": 9.296875, "epoch": 0.22025582617837744, "grad_norm": 21.338852752745392, "kl": 0.265625, "learning_rate": 7.799193972314701e-07, "loss": 0.0755, "reward": 1.5294039249420166, "reward_std": 0.27403104305267334, "rewards/accuracy_reward_stage2": 0.6700288653373718, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1257 }, { "completion_length": 11.15625, "epoch": 0.220431049588225, "grad_norm": 22.23905656233593, "kl": 0.060302734375, "learning_rate": 7.797441738216225e-07, "loss": -0.0201, "reward": 1.869028091430664, "reward_std": 0.17269808053970337, "rewards/accuracy_reward_stage2": 0.8846530318260193, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1258 }, { "completion_length": 9.984375, "epoch": 0.22060627299807253, "grad_norm": 19.399824642943887, "kl": 0.1103515625, "learning_rate": 7.795689504117749e-07, "loss": 0.0441, "reward": 1.223279595375061, "reward_std": 0.2005893588066101, "rewards/accuracy_reward_stage2": 0.5982796549797058, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1259 }, { "completion_length": 15.875, "epoch": 0.2207814964079201, "grad_norm": 24.27584542312224, "kl": 0.322265625, "learning_rate": 7.793937270019274e-07, "loss": 0.0848, "reward": 1.191131830215454, "reward_std": 0.23304371535778046, "rewards/accuracy_reward_stage2": 0.33175671100616455, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1260 }, { "completion_length": 10.6875, "epoch": 0.22095671981776766, "grad_norm": 20.4149420187181, "kl": 0.1142578125, "learning_rate": 7.792185035920798e-07, "loss": 0.0455, "reward": 1.4384210109710693, "reward_std": 0.22694987058639526, "rewards/accuracy_reward_stage2": 0.5634210109710693, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1261 }, { "completion_length": 7.359375, "epoch": 0.2211319432276152, "grad_norm": 12.284447759112405, "kl": 0.0673828125, "learning_rate": 7.790432801822323e-07, "loss": -0.0046, "reward": 1.6275393962860107, "reward_std": 0.15898236632347107, "rewards/accuracy_reward_stage2": 0.6431642770767212, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1262 }, { "completion_length": 9.53125, "epoch": 0.22130716663746278, "grad_norm": 19.201363227623997, "kl": 0.0791015625, "learning_rate": 7.788680567723848e-07, "loss": -0.0125, "reward": 1.521449089050293, "reward_std": 0.2546595335006714, "rewards/accuracy_reward_stage2": 0.5370742082595825, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1263 }, { "completion_length": 13.0625, "epoch": 0.22148239004731032, "grad_norm": 17.83997321001346, "kl": 0.1513671875, "learning_rate": 7.786928333625372e-07, "loss": 0.0164, "reward": 1.5619481801986694, "reward_std": 0.27089670300483704, "rewards/accuracy_reward_stage2": 0.5775731801986694, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1264 }, { "completion_length": 9.421875, "epoch": 0.22165761345715787, "grad_norm": 18.783396663696035, "kl": 0.07373046875, "learning_rate": 7.785176099526897e-07, "loss": 0.0006, "reward": 1.6510417461395264, "reward_std": 0.2089996337890625, "rewards/accuracy_reward_stage2": 0.6822916269302368, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1265 }, { "completion_length": 16.65625, "epoch": 0.22183283686700545, "grad_norm": 23.50690187272042, "kl": 0.11083984375, "learning_rate": 7.783423865428421e-07, "loss": 0.012, "reward": 1.626581072807312, "reward_std": 0.2647040784358978, "rewards/accuracy_reward_stage2": 0.6422061324119568, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1266 }, { "completion_length": 15.734375, "epoch": 0.222008060276853, "grad_norm": 19.475958082591323, "kl": 0.09423828125, "learning_rate": 7.781671631329946e-07, "loss": 0.0294, "reward": 1.0696529150009155, "reward_std": 0.07191064208745956, "rewards/accuracy_reward_stage2": 0.3196529150009155, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1267 }, { "completion_length": 8.125, "epoch": 0.22218328368670054, "grad_norm": 23.898456160776146, "kl": 0.123046875, "learning_rate": 7.779919397231471e-07, "loss": 0.0493, "reward": 1.5231192111968994, "reward_std": 0.20473015308380127, "rewards/accuracy_reward_stage2": 0.5231192111968994, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1268 }, { "completion_length": 8.1875, "epoch": 0.22235850709654809, "grad_norm": 13.176356060911349, "kl": 0.0703125, "learning_rate": 7.778167163132993e-07, "loss": 0.0282, "reward": 1.746025800704956, "reward_std": 0.14434634149074554, "rewards/accuracy_reward_stage2": 0.7460259199142456, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1269 }, { "completion_length": 13.078125, "epoch": 0.22253373050639566, "grad_norm": 12.526083904205054, "kl": 0.04931640625, "learning_rate": 7.776414929034518e-07, "loss": -0.0245, "reward": 1.6939078569412231, "reward_std": 0.13472694158554077, "rewards/accuracy_reward_stage2": 0.7095328569412231, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1270 }, { "completion_length": 19.03125, "epoch": 0.2227089539162432, "grad_norm": 15.443856012437893, "kl": 0.06201171875, "learning_rate": 7.774662694936043e-07, "loss": 0.0248, "reward": 1.3781781196594238, "reward_std": 0.12267878651618958, "rewards/accuracy_reward_stage2": 0.378178209066391, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1271 }, { "completion_length": 12.859375, "epoch": 0.22288417732609075, "grad_norm": 21.835257227780716, "kl": 0.140625, "learning_rate": 7.772910460837567e-07, "loss": 0.0224, "reward": 1.552232265472412, "reward_std": 0.2832726240158081, "rewards/accuracy_reward_stage2": 0.5678572654724121, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1272 }, { "completion_length": 9.0, "epoch": 0.22305940073593833, "grad_norm": 20.60717806692071, "kl": 0.07177734375, "learning_rate": 7.771158226739092e-07, "loss": -0.0155, "reward": 1.5970072746276855, "reward_std": 0.30943915247917175, "rewards/accuracy_reward_stage2": 0.6126322746276855, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1273 }, { "completion_length": 11.46875, "epoch": 0.22323462414578588, "grad_norm": 19.321778472115792, "kl": 0.07861328125, "learning_rate": 7.769405992640616e-07, "loss": -0.0128, "reward": 1.7938032150268555, "reward_std": 0.20492716133594513, "rewards/accuracy_reward_stage2": 0.8094281554222107, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1274 }, { "completion_length": 7.265625, "epoch": 0.22340984755563342, "grad_norm": 17.917894454080603, "kl": 0.1162109375, "learning_rate": 7.767653758542141e-07, "loss": 0.0464, "reward": 1.718109130859375, "reward_std": 0.22331853210926056, "rewards/accuracy_reward_stage2": 0.7181090116500854, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1275 }, { "completion_length": 8.25, "epoch": 0.223585070965481, "grad_norm": 18.712665480373918, "kl": 0.2451171875, "learning_rate": 7.765901524443666e-07, "loss": -0.0028, "reward": 1.2784273624420166, "reward_std": 0.35105907917022705, "rewards/accuracy_reward_stage2": 0.4503024220466614, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1276 }, { "completion_length": 11.296875, "epoch": 0.22376029437532854, "grad_norm": 29.74251938860607, "kl": 0.06494140625, "learning_rate": 7.76414929034519e-07, "loss": 0.0259, "reward": 1.7380640506744385, "reward_std": 0.2355988770723343, "rewards/accuracy_reward_stage2": 0.7380639910697937, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1277 }, { "completion_length": 11.359375, "epoch": 0.2239355177851761, "grad_norm": 19.119455330636775, "kl": 0.058349609375, "learning_rate": 7.762397056246715e-07, "loss": 0.0234, "reward": 1.6018481254577637, "reward_std": 0.14455029368400574, "rewards/accuracy_reward_stage2": 0.6018481254577637, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1278 }, { "completion_length": 14.21875, "epoch": 0.22411074119502367, "grad_norm": 18.31859900135893, "kl": 0.05078125, "learning_rate": 7.76064482214824e-07, "loss": 0.0203, "reward": 1.4314525127410889, "reward_std": 0.16612425446510315, "rewards/accuracy_reward_stage2": 0.4314524531364441, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1279 }, { "completion_length": 10.578125, "epoch": 0.2242859646048712, "grad_norm": 14.4401733471884, "kl": 0.0216064453125, "learning_rate": 7.758892588049763e-07, "loss": -0.0355, "reward": 1.5586047172546387, "reward_std": 0.1084030419588089, "rewards/accuracy_reward_stage2": 0.5742297172546387, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1280 }, { "completion_length": 9.953125, "epoch": 0.22446118801471876, "grad_norm": 12.963432488454982, "kl": 0.07666015625, "learning_rate": 7.757140353951288e-07, "loss": -0.0135, "reward": 1.4432322978973389, "reward_std": 0.08829830586910248, "rewards/accuracy_reward_stage2": 0.45885732769966125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1281 }, { "completion_length": 12.75, "epoch": 0.22463641142456633, "grad_norm": 18.285143581142435, "kl": 0.072265625, "learning_rate": 7.755388119852811e-07, "loss": 0.029, "reward": 1.6616626977920532, "reward_std": 0.16342474520206451, "rewards/accuracy_reward_stage2": 0.7866626977920532, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1282 }, { "completion_length": 10.828125, "epoch": 0.22481163483441388, "grad_norm": 25.22350216516759, "kl": 0.24609375, "learning_rate": 7.753635885754336e-07, "loss": 0.069, "reward": 1.5536048412322998, "reward_std": 0.22804740071296692, "rewards/accuracy_reward_stage2": 0.694229781627655, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1283 }, { "completion_length": 15.65625, "epoch": 0.22498685824426143, "grad_norm": 13.930013600400436, "kl": 0.0537109375, "learning_rate": 7.751883651655861e-07, "loss": -0.0542, "reward": 1.5508832931518555, "reward_std": 0.18876992166042328, "rewards/accuracy_reward_stage2": 0.7071333527565002, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1284 }, { "completion_length": 14.03125, "epoch": 0.22516208165410898, "grad_norm": 24.37646426933664, "kl": 0.044921875, "learning_rate": 7.750131417557385e-07, "loss": -0.021, "reward": 1.4661248922348022, "reward_std": 0.2995033264160156, "rewards/accuracy_reward_stage2": 0.48174989223480225, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1285 }, { "completion_length": 24.984375, "epoch": 0.22533730506395655, "grad_norm": 21.56030833367531, "kl": 0.0751953125, "learning_rate": 7.74837918345891e-07, "loss": -0.0029, "reward": 1.5520787239074707, "reward_std": 0.163404643535614, "rewards/accuracy_reward_stage2": 0.5677036046981812, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1286 }, { "completion_length": 11.484375, "epoch": 0.2255125284738041, "grad_norm": 20.697940818242685, "kl": 0.1787109375, "learning_rate": 7.746626949360435e-07, "loss": -0.0314, "reward": 1.4901411533355713, "reward_std": 0.3281293213367462, "rewards/accuracy_reward_stage2": 0.5370161533355713, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1287 }, { "completion_length": 12.859375, "epoch": 0.22568775188365164, "grad_norm": 17.43483491224403, "kl": 0.0260009765625, "learning_rate": 7.744874715261959e-07, "loss": 0.0104, "reward": 1.7811851501464844, "reward_std": 0.20009824633598328, "rewards/accuracy_reward_stage2": 0.7811851501464844, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1288 }, { "completion_length": 9.796875, "epoch": 0.22586297529349922, "grad_norm": 18.708135005309416, "kl": 0.06298828125, "learning_rate": 7.743122481163483e-07, "loss": -0.0178, "reward": 1.512305736541748, "reward_std": 0.21341320872306824, "rewards/accuracy_reward_stage2": 0.5279307961463928, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1289 }, { "completion_length": 10.828125, "epoch": 0.22603819870334677, "grad_norm": 16.460682608123427, "kl": 0.04638671875, "learning_rate": 7.741370247065007e-07, "loss": 0.0186, "reward": 1.7030662298202515, "reward_std": 0.15714354813098907, "rewards/accuracy_reward_stage2": 0.7030661106109619, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1290 }, { "completion_length": 12.8125, "epoch": 0.2262134221131943, "grad_norm": 17.695886350155817, "kl": 0.04296875, "learning_rate": 7.739618012966532e-07, "loss": 0.0172, "reward": 1.398033618927002, "reward_std": 0.1854480504989624, "rewards/accuracy_reward_stage2": 0.39803367853164673, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1291 }, { "completion_length": 7.71875, "epoch": 0.2263886455230419, "grad_norm": 13.61432730167237, "kl": 0.0252685546875, "learning_rate": 7.737865778868057e-07, "loss": 0.0101, "reward": 1.7604167461395264, "reward_std": 0.1167893186211586, "rewards/accuracy_reward_stage2": 0.7604166865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1292 }, { "completion_length": 9.109375, "epoch": 0.22656386893288943, "grad_norm": 15.963181167643066, "kl": 0.091796875, "learning_rate": 7.736113544769581e-07, "loss": -0.0023, "reward": 1.4881972074508667, "reward_std": 0.11426497250795364, "rewards/accuracy_reward_stage2": 0.5038222074508667, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1293 }, { "completion_length": 9.3125, "epoch": 0.22673909234273698, "grad_norm": 22.062928498011892, "kl": 0.20703125, "learning_rate": 7.734361310671105e-07, "loss": 0.0224, "reward": 1.3682682514190674, "reward_std": 0.2822038233280182, "rewards/accuracy_reward_stage2": 0.6495183110237122, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1294 }, { "completion_length": 11.046875, "epoch": 0.22691431575258456, "grad_norm": 15.61429349005288, "kl": 0.099609375, "learning_rate": 7.732609076572629e-07, "loss": -0.0333, "reward": 1.3251614570617676, "reward_std": 0.3028058409690857, "rewards/accuracy_reward_stage2": 0.4814113974571228, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1295 }, { "completion_length": 7.9375, "epoch": 0.2270895391624321, "grad_norm": 15.122875763850107, "kl": 0.08544921875, "learning_rate": 7.730856842474154e-07, "loss": 0.0341, "reward": 1.5477509498596191, "reward_std": 0.14309881627559662, "rewards/accuracy_reward_stage2": 0.5477508902549744, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1296 }, { "completion_length": 12.703125, "epoch": 0.22726476257227965, "grad_norm": 19.00074915929077, "kl": 0.02685546875, "learning_rate": 7.729104608375679e-07, "loss": 0.0107, "reward": 1.6535483598709106, "reward_std": 0.1936911642551422, "rewards/accuracy_reward_stage2": 0.7785484790802002, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1297 }, { "completion_length": 11.4375, "epoch": 0.22743998598212722, "grad_norm": 15.655251618086494, "kl": 0.1328125, "learning_rate": 7.727352374277202e-07, "loss": 0.0528, "reward": 1.3991637229919434, "reward_std": 0.19552862644195557, "rewards/accuracy_reward_stage2": 0.6491636633872986, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1298 }, { "completion_length": 10.171875, "epoch": 0.22761520939197477, "grad_norm": 23.824028580200068, "kl": 0.048095703125, "learning_rate": 7.725600140178727e-07, "loss": -0.0142, "reward": 1.5, "reward_std": 0.35824593901634216, "rewards/accuracy_reward_stage2": 0.515625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1299 }, { "completion_length": 9.296875, "epoch": 0.22779043280182232, "grad_norm": 9.523524049899153, "kl": 0.007110595703125, "learning_rate": 7.723847906080252e-07, "loss": 0.0029, "reward": 1.641369104385376, "reward_std": 0.10163542628288269, "rewards/accuracy_reward_stage2": 0.6413690447807312, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1300 }, { "completion_length": 9.796875, "epoch": 0.2279656562116699, "grad_norm": 14.469235347826526, "kl": 0.02587890625, "learning_rate": 7.722095671981776e-07, "loss": 0.0103, "reward": 1.6671922206878662, "reward_std": 0.16057150065898895, "rewards/accuracy_reward_stage2": 0.6671922206878662, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1301 }, { "completion_length": 14.734375, "epoch": 0.22814087962151744, "grad_norm": 19.157426995880886, "kl": 0.09033203125, "learning_rate": 7.720343437883301e-07, "loss": -0.0081, "reward": 1.5279631614685059, "reward_std": 0.30399948358535767, "rewards/accuracy_reward_stage2": 0.5435882210731506, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1302 }, { "completion_length": 6.640625, "epoch": 0.228316103031365, "grad_norm": 19.37342779340734, "kl": 0.1357421875, "learning_rate": 7.718591203784826e-07, "loss": -0.0341, "reward": 1.626155138015747, "reward_std": 0.21683211624622345, "rewards/accuracy_reward_stage2": 0.7824052572250366, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1303 }, { "completion_length": 10.671875, "epoch": 0.22849132644121253, "grad_norm": 16.792216050058997, "kl": 0.061767578125, "learning_rate": 7.71683896968635e-07, "loss": -0.0508, "reward": 1.5081243515014648, "reward_std": 0.2527463436126709, "rewards/accuracy_reward_stage2": 0.6643743515014648, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1304 }, { "completion_length": 11.03125, "epoch": 0.2286665498510601, "grad_norm": 18.21345290572861, "kl": 0.1474609375, "learning_rate": 7.715086735587875e-07, "loss": 0.0376, "reward": 1.1700676679611206, "reward_std": 0.2538905739784241, "rewards/accuracy_reward_stage2": 0.3106927275657654, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1305 }, { "completion_length": 9.78125, "epoch": 0.22884177326090765, "grad_norm": 25.267173688859856, "kl": 0.1357421875, "learning_rate": 7.713334501489399e-07, "loss": 0.0214, "reward": 1.5649652481079102, "reward_std": 0.2843823730945587, "rewards/accuracy_reward_stage2": 0.5805902481079102, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1306 }, { "completion_length": 14.625, "epoch": 0.2290169966707552, "grad_norm": 17.060570429206486, "kl": 0.053466796875, "learning_rate": 7.711582267390923e-07, "loss": -0.0228, "reward": 1.5452286005020142, "reward_std": 0.20523126423358917, "rewards/accuracy_reward_stage2": 0.5608536005020142, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1307 }, { "completion_length": 10.859375, "epoch": 0.22919222008060278, "grad_norm": 17.399015919757105, "kl": 0.053955078125, "learning_rate": 7.709830033292448e-07, "loss": -0.0227, "reward": 1.46875, "reward_std": 0.2845909595489502, "rewards/accuracy_reward_stage2": 0.484375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1308 }, { "completion_length": 13.828125, "epoch": 0.22936744349045032, "grad_norm": 19.079762022572663, "kl": 0.08447265625, "learning_rate": 7.708077799193971e-07, "loss": -0.0535, "reward": 1.495568871498108, "reward_std": 0.2711693048477173, "rewards/accuracy_reward_stage2": 0.5268188118934631, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1309 }, { "completion_length": 10.3125, "epoch": 0.22954266690029787, "grad_norm": 21.410291018790012, "kl": 0.0830078125, "learning_rate": 7.706325565095496e-07, "loss": -0.0103, "reward": 1.66116201877594, "reward_std": 0.2185249626636505, "rewards/accuracy_reward_stage2": 0.6767870187759399, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1310 }, { "completion_length": 5.578125, "epoch": 0.22971789031014545, "grad_norm": 23.749171801536022, "kl": 0.06787109375, "learning_rate": 7.70457333099702e-07, "loss": -0.0018, "reward": 1.860271692276001, "reward_std": 0.19402723014354706, "rewards/accuracy_reward_stage2": 0.8758968114852905, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1311 }, { "completion_length": 7.546875, "epoch": 0.229893113719993, "grad_norm": 16.81050162349492, "kl": 0.050048828125, "learning_rate": 7.702821096898545e-07, "loss": 0.0201, "reward": 1.6367621421813965, "reward_std": 0.12695074081420898, "rewards/accuracy_reward_stage2": 0.7617621421813965, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1312 }, { "completion_length": 10.46875, "epoch": 0.23006833712984054, "grad_norm": 18.882004316200497, "kl": 0.091796875, "learning_rate": 7.70106886280007e-07, "loss": 0.0368, "reward": 1.577049970626831, "reward_std": 0.28842049837112427, "rewards/accuracy_reward_stage2": 0.7020500302314758, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1313 }, { "completion_length": 7.609375, "epoch": 0.2302435605396881, "grad_norm": 18.144597604562488, "kl": 0.0869140625, "learning_rate": 7.699316628701594e-07, "loss": 0.0348, "reward": 1.497538685798645, "reward_std": 0.23568907380104065, "rewards/accuracy_reward_stage2": 0.49753862619400024, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1314 }, { "completion_length": 7.125, "epoch": 0.23041878394953566, "grad_norm": 32.72252036712734, "kl": 0.25390625, "learning_rate": 7.697564394603119e-07, "loss": 0.101, "reward": 1.4892593622207642, "reward_std": 0.09943016618490219, "rewards/accuracy_reward_stage2": 0.6142593622207642, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1315 }, { "completion_length": 14.59375, "epoch": 0.2305940073593832, "grad_norm": 23.14253804576249, "kl": 0.11279296875, "learning_rate": 7.695812160504644e-07, "loss": 0.045, "reward": 1.3638789653778076, "reward_std": 0.26762983202934265, "rewards/accuracy_reward_stage2": 0.48887893557548523, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1316 }, { "completion_length": 6.953125, "epoch": 0.23076923076923078, "grad_norm": 16.876861475002965, "kl": 0.04736328125, "learning_rate": 7.694059926406168e-07, "loss": -0.0486, "reward": 1.484375, "reward_std": 0.15992169082164764, "rewards/accuracy_reward_stage2": 0.515625, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1317 }, { "completion_length": 8.375, "epoch": 0.23094445417907833, "grad_norm": 16.856181738487386, "kl": 0.15625, "learning_rate": 7.692307692307693e-07, "loss": 0.0186, "reward": 1.605655550956726, "reward_std": 0.2316904067993164, "rewards/accuracy_reward_stage2": 0.7462804317474365, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1318 }, { "completion_length": 8.65625, "epoch": 0.23111967758892588, "grad_norm": 21.496554558856914, "kl": 0.1220703125, "learning_rate": 7.690555458209216e-07, "loss": 0.0047, "reward": 1.4125896692276, "reward_std": 0.2990317940711975, "rewards/accuracy_reward_stage2": 0.4282146990299225, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1319 }, { "completion_length": 13.046875, "epoch": 0.23129490099877342, "grad_norm": 22.226835762626035, "kl": 0.1640625, "learning_rate": 7.68880322411074e-07, "loss": -0.0087, "reward": 1.5795575380325317, "reward_std": 0.34644702076911926, "rewards/accuracy_reward_stage2": 0.6108075976371765, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1320 }, { "completion_length": 9.28125, "epoch": 0.231470124408621, "grad_norm": 18.118326027742203, "kl": 0.0361328125, "learning_rate": 7.687050990012265e-07, "loss": 0.0145, "reward": 1.8229167461395264, "reward_std": 0.21129511296749115, "rewards/accuracy_reward_stage2": 0.8229166269302368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1321 }, { "completion_length": 10.765625, "epoch": 0.23164534781846854, "grad_norm": 14.929477462062813, "kl": 0.1298828125, "learning_rate": 7.685298755913789e-07, "loss": 0.0075, "reward": 1.837185263633728, "reward_std": 0.22572728991508484, "rewards/accuracy_reward_stage2": 0.852810263633728, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1322 }, { "completion_length": 6.34375, "epoch": 0.2318205712283161, "grad_norm": 16.885100313656853, "kl": 0.040771484375, "learning_rate": 7.683546521815314e-07, "loss": 0.0163, "reward": 1.641325831413269, "reward_std": 0.2138734608888626, "rewards/accuracy_reward_stage2": 0.641325831413269, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1323 }, { "completion_length": 9.765625, "epoch": 0.23199579463816367, "grad_norm": 21.435876816815696, "kl": 0.11572265625, "learning_rate": 7.681794287716839e-07, "loss": 0.0066, "reward": 1.4557948112487793, "reward_std": 0.37678366899490356, "rewards/accuracy_reward_stage2": 0.4714197516441345, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1324 }, { "completion_length": 9.640625, "epoch": 0.2321710180480112, "grad_norm": 19.439716800962223, "kl": 0.04638671875, "learning_rate": 7.680042053618363e-07, "loss": -0.032, "reward": 1.5691554546356201, "reward_std": 0.2068910300731659, "rewards/accuracy_reward_stage2": 0.6004054546356201, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1325 }, { "completion_length": 7.359375, "epoch": 0.23234624145785876, "grad_norm": 17.288923569988533, "kl": 0.0751953125, "learning_rate": 7.678289819519888e-07, "loss": -0.014, "reward": 1.7781198024749756, "reward_std": 0.15428794920444489, "rewards/accuracy_reward_stage2": 0.7937447428703308, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1326 }, { "completion_length": 9.0625, "epoch": 0.23252146486770633, "grad_norm": 17.876378399465597, "kl": 0.1533203125, "learning_rate": 7.676537585421412e-07, "loss": 0.0171, "reward": 1.6630034446716309, "reward_std": 0.15936848521232605, "rewards/accuracy_reward_stage2": 0.8036285042762756, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1327 }, { "completion_length": 11.75, "epoch": 0.23269668827755388, "grad_norm": 17.121016301907396, "kl": 0.2197265625, "learning_rate": 7.674785351322936e-07, "loss": 0.0876, "reward": 1.5210437774658203, "reward_std": 0.18795861303806305, "rewards/accuracy_reward_stage2": 0.6460438966751099, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1328 }, { "completion_length": 9.46875, "epoch": 0.23287191168740143, "grad_norm": 13.808690989852431, "kl": 0.05322265625, "learning_rate": 7.673033117224461e-07, "loss": 0.0213, "reward": 1.421875, "reward_std": 0.13258251547813416, "rewards/accuracy_reward_stage2": 0.421875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1329 }, { "completion_length": 8.21875, "epoch": 0.233047135097249, "grad_norm": 16.41141039654268, "kl": 0.1396484375, "learning_rate": 7.671280883125985e-07, "loss": 0.0561, "reward": 1.5604474544525146, "reward_std": 0.2582498788833618, "rewards/accuracy_reward_stage2": 0.5604474544525146, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1330 }, { "completion_length": 6.734375, "epoch": 0.23322235850709655, "grad_norm": 13.308302497276948, "kl": 0.03662109375, "learning_rate": 7.66952864902751e-07, "loss": 0.0147, "reward": 1.5406548976898193, "reward_std": 0.16961881518363953, "rewards/accuracy_reward_stage2": 0.5406548976898193, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1331 }, { "completion_length": 6.015625, "epoch": 0.2333975819169441, "grad_norm": 18.64517313452226, "kl": 0.060791015625, "learning_rate": 7.667776414929035e-07, "loss": -0.0506, "reward": 1.6535872220993042, "reward_std": 0.22229741513729095, "rewards/accuracy_reward_stage2": 0.8098372220993042, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1332 }, { "completion_length": 8.125, "epoch": 0.23357280532679167, "grad_norm": 15.161287245748868, "kl": 0.06298828125, "learning_rate": 7.666024180830558e-07, "loss": -0.0138, "reward": 1.6737689971923828, "reward_std": 0.30542173981666565, "rewards/accuracy_reward_stage2": 0.689393937587738, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1333 }, { "completion_length": 6.5625, "epoch": 0.23374802873663922, "grad_norm": 19.129160927851164, "kl": 0.169921875, "learning_rate": 7.664271946732083e-07, "loss": -0.0139, "reward": 1.543736457824707, "reward_std": 0.14306402206420898, "rewards/accuracy_reward_stage2": 0.5749865770339966, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1334 }, { "completion_length": 8.8125, "epoch": 0.23392325214648677, "grad_norm": 20.079367351448898, "kl": 0.16015625, "learning_rate": 7.662519712633607e-07, "loss": 0.0195, "reward": 1.2349598407745361, "reward_std": 0.11756610125303268, "rewards/accuracy_reward_stage2": 0.3755849003791809, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1335 }, { "completion_length": 11.703125, "epoch": 0.23409847555633434, "grad_norm": 16.942394215976904, "kl": 0.06396484375, "learning_rate": 7.660767478535132e-07, "loss": 0.0256, "reward": 1.2687785625457764, "reward_std": 0.22143125534057617, "rewards/accuracy_reward_stage2": 0.5187786221504211, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1336 }, { "completion_length": 10.421875, "epoch": 0.2342736989661819, "grad_norm": 18.10765071993869, "kl": 0.06689453125, "learning_rate": 7.659015244436657e-07, "loss": -0.0175, "reward": 1.4496071338653564, "reward_std": 0.11517606675624847, "rewards/accuracy_reward_stage2": 0.46523213386535645, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1337 }, { "completion_length": 8.40625, "epoch": 0.23444892237602943, "grad_norm": 18.622226862732052, "kl": 0.13671875, "learning_rate": 7.65726301033818e-07, "loss": -0.0338, "reward": 1.515625, "reward_std": 0.28778618574142456, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1338 }, { "completion_length": 10.953125, "epoch": 0.23462414578587698, "grad_norm": 20.568536347605033, "kl": 0.16796875, "learning_rate": 7.655510776239705e-07, "loss": 0.023, "reward": 1.3898448944091797, "reward_std": 0.24200965464115143, "rewards/accuracy_reward_stage2": 0.5304698944091797, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1339 }, { "completion_length": 15.25, "epoch": 0.23479936919572456, "grad_norm": 20.4797597642588, "kl": 0.041259765625, "learning_rate": 7.65375854214123e-07, "loss": -0.0277, "reward": 1.3560901880264282, "reward_std": 0.1921404004096985, "rewards/accuracy_reward_stage2": 0.3717151880264282, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1340 }, { "completion_length": 9.8125, "epoch": 0.2349745926055721, "grad_norm": 21.944151781117228, "kl": 0.09326171875, "learning_rate": 7.652006308042754e-07, "loss": -0.0051, "reward": 1.6057844161987305, "reward_std": 0.29269227385520935, "rewards/accuracy_reward_stage2": 0.6214094161987305, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1341 }, { "completion_length": 9.359375, "epoch": 0.23514981601541965, "grad_norm": 12.40325893539299, "kl": 0.08154296875, "learning_rate": 7.650254073944279e-07, "loss": -0.0117, "reward": 1.6456931829452515, "reward_std": 0.08253457397222519, "rewards/accuracy_reward_stage2": 0.6613181233406067, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1342 }, { "completion_length": 8.234375, "epoch": 0.23532503942526722, "grad_norm": 19.208591877134815, "kl": 0.1220703125, "learning_rate": 7.648501839845803e-07, "loss": 0.0489, "reward": 1.7316572666168213, "reward_std": 0.14084160327911377, "rewards/accuracy_reward_stage2": 0.8566572070121765, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1343 }, { "completion_length": 11.65625, "epoch": 0.23550026283511477, "grad_norm": 17.0946448588103, "kl": 0.1279296875, "learning_rate": 7.646749605747328e-07, "loss": -0.0369, "reward": 1.2201182842254639, "reward_std": 0.23437920212745667, "rewards/accuracy_reward_stage2": 0.37636837363243103, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1344 }, { "completion_length": 6.921875, "epoch": 0.23567548624496232, "grad_norm": 10.852411604630273, "kl": 0.06201171875, "learning_rate": 7.644997371648852e-07, "loss": 0.0248, "reward": 1.7287862300872803, "reward_std": 0.09362059086561203, "rewards/accuracy_reward_stage2": 0.7287862300872803, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1345 }, { "completion_length": 11.640625, "epoch": 0.2358507096548099, "grad_norm": 17.03273169319801, "kl": 0.028076171875, "learning_rate": 7.643245137550376e-07, "loss": 0.0112, "reward": 1.5563149452209473, "reward_std": 0.21286053955554962, "rewards/accuracy_reward_stage2": 0.5563148856163025, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1346 }, { "completion_length": 16.9375, "epoch": 0.23602593306465744, "grad_norm": 23.584844649922754, "kl": 0.1455078125, "learning_rate": 7.641492903451901e-07, "loss": -0.0076, "reward": 1.402522325515747, "reward_std": 0.36791902780532837, "rewards/accuracy_reward_stage2": 0.43377232551574707, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1347 }, { "completion_length": 9.015625, "epoch": 0.236201156474505, "grad_norm": 14.40621714172456, "kl": 0.123046875, "learning_rate": 7.639740669353425e-07, "loss": -0.0711, "reward": 1.4438755512237549, "reward_std": 0.3174234628677368, "rewards/accuracy_reward_stage2": 0.4907504916191101, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1348 }, { "completion_length": 6.453125, "epoch": 0.23637637988435256, "grad_norm": 17.89363169126588, "kl": 0.02734375, "learning_rate": 7.637988435254949e-07, "loss": 0.011, "reward": 1.8724802732467651, "reward_std": 0.13984158635139465, "rewards/accuracy_reward_stage2": 0.8724802136421204, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1349 }, { "completion_length": 10.96875, "epoch": 0.2365516032942001, "grad_norm": 20.342646184614797, "kl": 0.041748046875, "learning_rate": 7.636236201156474e-07, "loss": -0.0717, "reward": 1.6354167461395264, "reward_std": 0.2051776647567749, "rewards/accuracy_reward_stage2": 0.6666666269302368, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1350 }, { "completion_length": 8.375, "epoch": 0.23672682670404765, "grad_norm": 29.274146939478232, "kl": 0.21484375, "learning_rate": 7.634483967057998e-07, "loss": 0.0107, "reward": 1.5348470211029053, "reward_std": 0.24751737713813782, "rewards/accuracy_reward_stage2": 0.56609708070755, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1351 }, { "completion_length": 7.140625, "epoch": 0.23690205011389523, "grad_norm": 15.083267265104967, "kl": 0.1201171875, "learning_rate": 7.632731732959523e-07, "loss": 0.048, "reward": 1.5895814895629883, "reward_std": 0.15502366423606873, "rewards/accuracy_reward_stage2": 0.5895814895629883, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1352 }, { "completion_length": 13.671875, "epoch": 0.23707727352374278, "grad_norm": 23.24766828227649, "kl": 0.2080078125, "learning_rate": 7.630979498861048e-07, "loss": 0.0387, "reward": 1.4341652393341064, "reward_std": 0.2539823651313782, "rewards/accuracy_reward_stage2": 0.5747902393341064, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1353 }, { "completion_length": 23.40625, "epoch": 0.23725249693359032, "grad_norm": 18.812711379812775, "kl": 0.05859375, "learning_rate": 7.629227264762572e-07, "loss": -0.0052, "reward": 1.6507964134216309, "reward_std": 0.12219469249248505, "rewards/accuracy_reward_stage2": 0.6664214134216309, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1354 }, { "completion_length": 12.15625, "epoch": 0.23742772034343787, "grad_norm": 20.974248217569382, "kl": 0.0888671875, "learning_rate": 7.627475030664097e-07, "loss": 0.0355, "reward": 1.4771928787231445, "reward_std": 0.2657541334629059, "rewards/accuracy_reward_stage2": 0.47719287872314453, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1355 }, { "completion_length": 7.59375, "epoch": 0.23760294375328544, "grad_norm": 14.147872821836522, "kl": 0.0751953125, "learning_rate": 7.625722796565622e-07, "loss": 0.0302, "reward": 1.8256654739379883, "reward_std": 0.16003847122192383, "rewards/accuracy_reward_stage2": 0.8256654739379883, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1356 }, { "completion_length": 8.4375, "epoch": 0.237778167163133, "grad_norm": 23.388735536574366, "kl": 0.146484375, "learning_rate": 7.623970562467146e-07, "loss": -0.0669, "reward": 1.6211612224578857, "reward_std": 0.28670769929885864, "rewards/accuracy_reward_stage2": 0.6680362224578857, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1357 }, { "completion_length": 8.140625, "epoch": 0.23795339057298054, "grad_norm": 1026.974996111092, "kl": 5.0625, "learning_rate": 7.622218328368669e-07, "loss": 1.9528, "reward": 1.3219940662384033, "reward_std": 0.2393598109483719, "rewards/accuracy_reward_stage2": 0.4938691258430481, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1358 }, { "completion_length": 11.953125, "epoch": 0.2381286139828281, "grad_norm": 25.40176586494032, "kl": 0.12060546875, "learning_rate": 7.620466094270193e-07, "loss": -0.0242, "reward": 1.5815457105636597, "reward_std": 0.29680225253105164, "rewards/accuracy_reward_stage2": 0.6127958297729492, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1359 }, { "completion_length": 6.578125, "epoch": 0.23830383739267566, "grad_norm": 18.639160968237462, "kl": 0.0693359375, "learning_rate": 7.618713860171718e-07, "loss": -0.0165, "reward": 1.541497826576233, "reward_std": 0.24797038733959198, "rewards/accuracy_reward_stage2": 0.5571227669715881, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1360 }, { "completion_length": 16.125, "epoch": 0.2384790608025232, "grad_norm": 18.559946755867603, "kl": 0.2578125, "learning_rate": 7.616961626073243e-07, "loss": 0.0012, "reward": 1.537853717803955, "reward_std": 0.2887413203716278, "rewards/accuracy_reward_stage2": 0.7097286581993103, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1361 }, { "completion_length": 5.953125, "epoch": 0.23865428421237078, "grad_norm": 15.60884525772603, "kl": 0.057373046875, "learning_rate": 7.615209391974767e-07, "loss": -0.0105, "reward": 1.5885417461395264, "reward_std": 0.24286779761314392, "rewards/accuracy_reward_stage2": 0.6041666269302368, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1362 }, { "completion_length": 9.46875, "epoch": 0.23882950762221833, "grad_norm": 24.530372357681458, "kl": 0.10009765625, "learning_rate": 7.613457157876292e-07, "loss": -0.0649, "reward": 1.209742784500122, "reward_std": 0.3205549716949463, "rewards/accuracy_reward_stage2": 0.38161781430244446, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1363 }, { "completion_length": 10.71875, "epoch": 0.23900473103206588, "grad_norm": 18.32753281244687, "kl": 0.1005859375, "learning_rate": 7.611704923777817e-07, "loss": 0.0403, "reward": 1.592026710510254, "reward_std": 0.1409560739994049, "rewards/accuracy_reward_stage2": 0.7170267105102539, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1364 }, { "completion_length": 10.78125, "epoch": 0.23917995444191345, "grad_norm": 18.624788183315577, "kl": 0.1630859375, "learning_rate": 7.609952689679341e-07, "loss": -0.0905, "reward": 1.6103694438934326, "reward_std": 0.23033568263053894, "rewards/accuracy_reward_stage2": 0.6728694438934326, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 1365 }, { "completion_length": 11.890625, "epoch": 0.239355177851761, "grad_norm": 17.579634638243046, "kl": 0.16015625, "learning_rate": 7.608200455580866e-07, "loss": 0.0284, "reward": 1.3834011554718018, "reward_std": 0.2378893941640854, "rewards/accuracy_reward_stage2": 0.3990260362625122, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1366 }, { "completion_length": 16.71875, "epoch": 0.23953040126160854, "grad_norm": 21.561179085008593, "kl": 0.10107421875, "learning_rate": 7.60644822148239e-07, "loss": 0.0404, "reward": 1.5523983240127563, "reward_std": 0.19578197598457336, "rewards/accuracy_reward_stage2": 0.5523982644081116, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1367 }, { "completion_length": 6.9375, "epoch": 0.23970562467145612, "grad_norm": 21.208497450352066, "kl": 0.1767578125, "learning_rate": 7.604695987383914e-07, "loss": 0.0009, "reward": 1.471160650253296, "reward_std": 0.20998351275920868, "rewards/accuracy_reward_stage2": 0.6274106502532959, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1368 }, { "completion_length": 9.859375, "epoch": 0.23988084808130367, "grad_norm": 15.544292327181843, "kl": 0.11669921875, "learning_rate": 7.602943753285439e-07, "loss": 0.0025, "reward": 1.3777086734771729, "reward_std": 0.14471843838691711, "rewards/accuracy_reward_stage2": 0.39333376288414, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1369 }, { "completion_length": 12.6875, "epoch": 0.2400560714911512, "grad_norm": 20.253928877390308, "kl": 0.05908203125, "learning_rate": 7.601191519186963e-07, "loss": 0.0236, "reward": 1.4667612314224243, "reward_std": 0.2103850245475769, "rewards/accuracy_reward_stage2": 0.4667612612247467, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1370 }, { "completion_length": 10.09375, "epoch": 0.24023129490099876, "grad_norm": 18.476012392364993, "kl": 0.1513671875, "learning_rate": 7.599439285088487e-07, "loss": 0.0163, "reward": 1.645654320716858, "reward_std": 0.13692879676818848, "rewards/accuracy_reward_stage2": 0.6612793803215027, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1371 }, { "completion_length": 14.140625, "epoch": 0.24040651831084633, "grad_norm": 15.660536838371636, "kl": 0.07177734375, "learning_rate": 7.597687050990011e-07, "loss": 0.0288, "reward": 1.5812649726867676, "reward_std": 0.17887036502361298, "rewards/accuracy_reward_stage2": 0.5812650918960571, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1372 }, { "completion_length": 10.96875, "epoch": 0.24058174172069388, "grad_norm": 20.22050450069256, "kl": 0.08154296875, "learning_rate": 7.595934816891536e-07, "loss": 0.0088, "reward": 1.6261334419250488, "reward_std": 0.14943452179431915, "rewards/accuracy_reward_stage2": 0.6417584419250488, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1373 }, { "completion_length": 18.953125, "epoch": 0.24075696513054143, "grad_norm": 15.262480413366804, "kl": 0.06494140625, "learning_rate": 7.594182582793061e-07, "loss": -0.0157, "reward": 1.6218539476394653, "reward_std": 0.18022483587265015, "rewards/accuracy_reward_stage2": 0.6374789476394653, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1374 }, { "completion_length": 12.796875, "epoch": 0.240932188540389, "grad_norm": 21.09257487877772, "kl": 0.11962890625, "learning_rate": 7.592430348694585e-07, "loss": 0.0119, "reward": 1.6718034744262695, "reward_std": 0.36308354139328003, "rewards/accuracy_reward_stage2": 0.68742835521698, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1375 }, { "completion_length": 8.25, "epoch": 0.24110741195023655, "grad_norm": 19.02651654352321, "kl": 0.059814453125, "learning_rate": 7.59067811459611e-07, "loss": 0.0239, "reward": 1.585327386856079, "reward_std": 0.21454137563705444, "rewards/accuracy_reward_stage2": 0.5853273868560791, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1376 }, { "completion_length": 9.0, "epoch": 0.2412826353600841, "grad_norm": 19.488328207175734, "kl": 0.0927734375, "learning_rate": 7.588925880497635e-07, "loss": -0.0065, "reward": 1.418020486831665, "reward_std": 0.3297243118286133, "rewards/accuracy_reward_stage2": 0.5586454272270203, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1377 }, { "completion_length": 11.3125, "epoch": 0.24145785876993167, "grad_norm": 21.364367900934393, "kl": 0.244140625, "learning_rate": 7.587173646399158e-07, "loss": 0.0623, "reward": 1.37074613571167, "reward_std": 0.20727571845054626, "rewards/accuracy_reward_stage2": 0.5113711357116699, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1378 }, { "completion_length": 16.78125, "epoch": 0.24163308217977922, "grad_norm": 15.906057313581638, "kl": 0.018310546875, "learning_rate": 7.585421412300683e-07, "loss": 0.0073, "reward": 1.6640890836715698, "reward_std": 0.12738674879074097, "rewards/accuracy_reward_stage2": 0.7890890836715698, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1379 }, { "completion_length": 7.4375, "epoch": 0.24180830558962677, "grad_norm": 17.62827161364021, "kl": 0.111328125, "learning_rate": 7.583669178202208e-07, "loss": 0.0005, "reward": 1.6967413425445557, "reward_std": 0.13932161033153534, "rewards/accuracy_reward_stage2": 0.7123663425445557, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1380 }, { "completion_length": 13.5, "epoch": 0.24198352899947434, "grad_norm": 24.47590495691864, "kl": 0.232421875, "learning_rate": 7.581916944103732e-07, "loss": 0.0992, "reward": 1.4481374025344849, "reward_std": 0.3019064962863922, "rewards/accuracy_reward_stage2": 0.5731374621391296, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1381 }, { "completion_length": 8.296875, "epoch": 0.2421587524093219, "grad_norm": 27.811506620771162, "kl": 0.1357421875, "learning_rate": 7.580164710005257e-07, "loss": 0.0543, "reward": 1.6210970878601074, "reward_std": 0.22113262116909027, "rewards/accuracy_reward_stage2": 0.6210970878601074, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1382 }, { "completion_length": 14.390625, "epoch": 0.24233397581916943, "grad_norm": 16.569060349237095, "kl": 0.052734375, "learning_rate": 7.578412475906781e-07, "loss": 0.0211, "reward": 1.3238930702209473, "reward_std": 0.1760530024766922, "rewards/accuracy_reward_stage2": 0.3238930106163025, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1383 }, { "completion_length": 14.671875, "epoch": 0.242509199229017, "grad_norm": 9.046751874397618, "kl": 0.02685546875, "learning_rate": 7.576660241808305e-07, "loss": -0.0334, "reward": 1.515625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.53125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1384 }, { "completion_length": 12.09375, "epoch": 0.24268442263886456, "grad_norm": 18.023372522650554, "kl": 0.072265625, "learning_rate": 7.57490800770983e-07, "loss": 0.0289, "reward": 1.1849263906478882, "reward_std": 0.14363038539886475, "rewards/accuracy_reward_stage2": 0.4349263310432434, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1385 }, { "completion_length": 10.625, "epoch": 0.2428596460487121, "grad_norm": 18.403407774950747, "kl": 0.138671875, "learning_rate": 7.573155773611354e-07, "loss": 0.0259, "reward": 1.6670591831207275, "reward_std": 0.20200037956237793, "rewards/accuracy_reward_stage2": 0.682684063911438, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1386 }, { "completion_length": 13.890625, "epoch": 0.24303486945855968, "grad_norm": 20.79383527590382, "kl": 0.357421875, "learning_rate": 7.571403539512879e-07, "loss": 0.15, "reward": 1.3685318231582642, "reward_std": 0.25985872745513916, "rewards/accuracy_reward_stage2": 0.6185318231582642, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1387 }, { "completion_length": 10.03125, "epoch": 0.24321009286840722, "grad_norm": 18.86950437113182, "kl": 0.126953125, "learning_rate": 7.569651305414402e-07, "loss": 0.0067, "reward": 1.7178502082824707, "reward_std": 0.1606583297252655, "rewards/accuracy_reward_stage2": 0.7334751486778259, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1388 }, { "completion_length": 6.796875, "epoch": 0.24338531627825477, "grad_norm": 22.82460816866117, "kl": 0.1572265625, "learning_rate": 7.567899071315927e-07, "loss": 0.0632, "reward": 1.465291976928711, "reward_std": 0.38037338852882385, "rewards/accuracy_reward_stage2": 0.46529191732406616, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1389 }, { "completion_length": 10.1875, "epoch": 0.24356053968810232, "grad_norm": 24.642991253870704, "kl": 0.11279296875, "learning_rate": 7.566146837217452e-07, "loss": 0.0009, "reward": 1.7565643787384033, "reward_std": 0.19197356700897217, "rewards/accuracy_reward_stage2": 0.8971893787384033, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1390 }, { "completion_length": 11.515625, "epoch": 0.2437357630979499, "grad_norm": 20.364150468143993, "kl": 0.29296875, "learning_rate": 7.564394603118976e-07, "loss": 0.0728, "reward": 1.351882815361023, "reward_std": 0.29016733169555664, "rewards/accuracy_reward_stage2": 0.4925077557563782, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1391 }, { "completion_length": 13.09375, "epoch": 0.24391098650779744, "grad_norm": 20.290293173883178, "kl": 0.345703125, "learning_rate": 7.562642369020501e-07, "loss": 0.1379, "reward": 1.0814404487609863, "reward_std": 0.20886757969856262, "rewards/accuracy_reward_stage2": 0.45644041895866394, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1392 }, { "completion_length": 9.453125, "epoch": 0.244086209917645, "grad_norm": 19.97602043621162, "kl": 0.296875, "learning_rate": 7.560890134922026e-07, "loss": 0.0348, "reward": 1.3737891912460327, "reward_std": 0.2441052496433258, "rewards/accuracy_reward_stage2": 0.5456641912460327, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1393 }, { "completion_length": 9.953125, "epoch": 0.24426143332749256, "grad_norm": 21.983461650003445, "kl": 0.083984375, "learning_rate": 7.55913790082355e-07, "loss": 0.0334, "reward": 1.4830906391143799, "reward_std": 0.21525192260742188, "rewards/accuracy_reward_stage2": 0.6080905795097351, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1394 }, { "completion_length": 8.1875, "epoch": 0.2444366567373401, "grad_norm": 21.998258110612007, "kl": 0.1337890625, "learning_rate": 7.557385666725075e-07, "loss": 0.0248, "reward": 1.452605128288269, "reward_std": 0.28352901339530945, "rewards/accuracy_reward_stage2": 0.46823009848594666, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1395 }, { "completion_length": 7.84375, "epoch": 0.24461188014718765, "grad_norm": 19.909572146545123, "kl": 0.08056640625, "learning_rate": 7.555633432626598e-07, "loss": 0.0323, "reward": 1.5185023546218872, "reward_std": 0.2108551412820816, "rewards/accuracy_reward_stage2": 0.5185023546218872, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1396 }, { "completion_length": 9.53125, "epoch": 0.24478710355703523, "grad_norm": 12.74340987455773, "kl": 0.0174560546875, "learning_rate": 7.553881198528122e-07, "loss": 0.007, "reward": 1.3910613059997559, "reward_std": 0.15033581852912903, "rewards/accuracy_reward_stage2": 0.5160612463951111, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1397 }, { "completion_length": 10.5, "epoch": 0.24496232696688278, "grad_norm": 18.40246201674656, "kl": 0.11181640625, "learning_rate": 7.552128964429647e-07, "loss": -0.026, "reward": 1.5389893054962158, "reward_std": 0.20025639235973358, "rewards/accuracy_reward_stage2": 0.570239245891571, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1398 }, { "completion_length": 10.296875, "epoch": 0.24513755037673032, "grad_norm": 13.009913705130106, "kl": 0.055419921875, "learning_rate": 7.550376730331171e-07, "loss": 0.0222, "reward": 1.4153332710266113, "reward_std": 0.13312244415283203, "rewards/accuracy_reward_stage2": 0.41533327102661133, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1399 }, { "completion_length": 8.1875, "epoch": 0.2453127737865779, "grad_norm": 18.3058249826422, "kl": 0.032470703125, "learning_rate": 7.548624496232696e-07, "loss": -0.0312, "reward": 1.7342438697814941, "reward_std": 0.2937045693397522, "rewards/accuracy_reward_stage2": 0.7498688697814941, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1400 }, { "completion_length": 9.171875, "epoch": 0.24548799719642544, "grad_norm": 23.01036824762215, "kl": 0.052001953125, "learning_rate": 7.546872262134221e-07, "loss": 0.0208, "reward": 1.603689432144165, "reward_std": 0.320762038230896, "rewards/accuracy_reward_stage2": 0.6036894917488098, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1401 }, { "completion_length": 12.171875, "epoch": 0.245663220606273, "grad_norm": 19.966849327865962, "kl": 0.05224609375, "learning_rate": 7.545120028035745e-07, "loss": 0.0209, "reward": 1.2532411813735962, "reward_std": 0.23318162560462952, "rewards/accuracy_reward_stage2": 0.3782411813735962, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1402 }, { "completion_length": 11.25, "epoch": 0.24583844401612057, "grad_norm": 32.40288041494598, "kl": 0.1103515625, "learning_rate": 7.54336779393727e-07, "loss": 0.0442, "reward": 1.524993658065796, "reward_std": 0.18178695440292358, "rewards/accuracy_reward_stage2": 0.6499937176704407, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1403 }, { "completion_length": 12.96875, "epoch": 0.2460136674259681, "grad_norm": 13.635345821381598, "kl": 0.0189208984375, "learning_rate": 7.541615559838794e-07, "loss": 0.0076, "reward": 1.600611925125122, "reward_std": 0.11209513992071152, "rewards/accuracy_reward_stage2": 0.6006119251251221, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1404 }, { "completion_length": 7.671875, "epoch": 0.24618889083581566, "grad_norm": 16.000875268863858, "kl": 0.193359375, "learning_rate": 7.539863325740319e-07, "loss": 0.0774, "reward": 1.7346065044403076, "reward_std": 0.09460826218128204, "rewards/accuracy_reward_stage2": 0.8596064448356628, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1405 }, { "completion_length": 8.84375, "epoch": 0.2463641142456632, "grad_norm": 16.47259843018963, "kl": 0.08203125, "learning_rate": 7.538111091641844e-07, "loss": 0.0327, "reward": 1.893869161605835, "reward_std": 0.1298605501651764, "rewards/accuracy_reward_stage2": 0.8938692212104797, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1406 }, { "completion_length": 15.265625, "epoch": 0.24653933765551078, "grad_norm": 17.276454759130427, "kl": 0.0546875, "learning_rate": 7.536358857543368e-07, "loss": -0.0223, "reward": 1.3059229850769043, "reward_std": 0.17960919439792633, "rewards/accuracy_reward_stage2": 0.3215479254722595, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1407 }, { "completion_length": 11.796875, "epoch": 0.24671456106535833, "grad_norm": 23.06353585461065, "kl": 0.061767578125, "learning_rate": 7.534606623444892e-07, "loss": -0.0092, "reward": 1.5690124034881592, "reward_std": 0.2022152692079544, "rewards/accuracy_reward_stage2": 0.5846374034881592, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1408 }, { "completion_length": 10.28125, "epoch": 0.24688978447520588, "grad_norm": 16.828407269287137, "kl": 0.0306396484375, "learning_rate": 7.532854389346416e-07, "loss": -0.0319, "reward": 1.7287945747375488, "reward_std": 0.23023012280464172, "rewards/accuracy_reward_stage2": 0.7444195747375488, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1409 }, { "completion_length": 16.59375, "epoch": 0.24706500788505345, "grad_norm": 15.25377782281171, "kl": 0.10546875, "learning_rate": 7.53110215524794e-07, "loss": 0.0423, "reward": 1.3533527851104736, "reward_std": 0.13732056319713593, "rewards/accuracy_reward_stage2": 0.47835278511047363, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1410 }, { "completion_length": 12.125, "epoch": 0.247240231294901, "grad_norm": 21.936185903908854, "kl": 0.06982421875, "learning_rate": 7.529349921149465e-07, "loss": 0.028, "reward": 1.7008384466171265, "reward_std": 0.2988817095756531, "rewards/accuracy_reward_stage2": 0.7008384466171265, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1411 }, { "completion_length": 4.3125, "epoch": 0.24741545470474854, "grad_norm": 25.73348775481858, "kl": 0.232421875, "learning_rate": 7.527597687050989e-07, "loss": 0.0256, "reward": 1.4827898740768433, "reward_std": 0.125931054353714, "rewards/accuracy_reward_stage2": 0.5140398740768433, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1412 }, { "completion_length": 11.59375, "epoch": 0.24759067811459612, "grad_norm": 21.094142841819078, "kl": 0.11376953125, "learning_rate": 7.525845452952514e-07, "loss": 0.0141, "reward": 1.4062385559082031, "reward_std": 0.16928933560848236, "rewards/accuracy_reward_stage2": 0.4218636453151703, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1413 }, { "completion_length": 11.296875, "epoch": 0.24776590152444367, "grad_norm": 19.519594526717743, "kl": 0.0732421875, "learning_rate": 7.524093218854039e-07, "loss": 0.0292, "reward": 1.7651951313018799, "reward_std": 0.2411029040813446, "rewards/accuracy_reward_stage2": 0.7651951909065247, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1414 }, { "completion_length": 10.484375, "epoch": 0.2479411249342912, "grad_norm": 16.276921952277196, "kl": 0.072265625, "learning_rate": 7.522340984755563e-07, "loss": -0.0152, "reward": 1.505523920059204, "reward_std": 0.2533569037914276, "rewards/accuracy_reward_stage2": 0.5211489796638489, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1415 }, { "completion_length": 6.9375, "epoch": 0.2481163483441388, "grad_norm": 17.29022823380657, "kl": 0.0400390625, "learning_rate": 7.520588750657088e-07, "loss": -0.0611, "reward": 1.5840046405792236, "reward_std": 0.29173704981803894, "rewards/accuracy_reward_stage2": 0.6152546405792236, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1416 }, { "completion_length": 9.0, "epoch": 0.24829157175398633, "grad_norm": 20.546807374945057, "kl": 0.08203125, "learning_rate": 7.518836516558613e-07, "loss": -0.0403, "reward": 1.6551421880722046, "reward_std": 0.2729400098323822, "rewards/accuracy_reward_stage2": 0.6863921880722046, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1417 }, { "completion_length": 10.171875, "epoch": 0.24846679516383388, "grad_norm": 16.351239914609064, "kl": 0.095703125, "learning_rate": 7.517084282460136e-07, "loss": 0.0383, "reward": 1.6802784204483032, "reward_std": 0.20798176527023315, "rewards/accuracy_reward_stage2": 0.6802783608436584, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1418 }, { "completion_length": 11.15625, "epoch": 0.24864201857368146, "grad_norm": 15.355436275669865, "kl": 0.09716796875, "learning_rate": 7.515332048361661e-07, "loss": 0.0389, "reward": 1.7547039985656738, "reward_std": 0.19607709348201752, "rewards/accuracy_reward_stage2": 0.7547039985656738, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1419 }, { "completion_length": 14.3125, "epoch": 0.248817241983529, "grad_norm": 26.208048459448875, "kl": 0.14453125, "learning_rate": 7.513579814263185e-07, "loss": 0.0579, "reward": 1.5602327585220337, "reward_std": 0.19080065190792084, "rewards/accuracy_reward_stage2": 0.6852326989173889, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1420 }, { "completion_length": 6.875, "epoch": 0.24899246539337655, "grad_norm": 20.93084846517713, "kl": 0.0439453125, "learning_rate": 7.51182758016471e-07, "loss": -0.0266, "reward": 1.5314676761627197, "reward_std": 0.1645144373178482, "rewards/accuracy_reward_stage2": 0.5470925569534302, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1421 }, { "completion_length": 12.65625, "epoch": 0.24916768880322412, "grad_norm": 22.8515591698515, "kl": 0.06689453125, "learning_rate": 7.510075346066234e-07, "loss": -0.0175, "reward": 1.425929069519043, "reward_std": 0.34283581376075745, "rewards/accuracy_reward_stage2": 0.44155409932136536, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1422 }, { "completion_length": 10.265625, "epoch": 0.24934291221307167, "grad_norm": 20.10466292441384, "kl": 0.119140625, "learning_rate": 7.508323111967758e-07, "loss": 0.0308, "reward": 1.6480023860931396, "reward_std": 0.2787425220012665, "rewards/accuracy_reward_stage2": 0.6636273860931396, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1423 }, { "completion_length": 7.53125, "epoch": 0.24951813562291922, "grad_norm": 18.74219789011929, "kl": 0.181640625, "learning_rate": 7.506570877869283e-07, "loss": -0.0252, "reward": 1.5641932487487793, "reward_std": 0.1766408234834671, "rewards/accuracy_reward_stage2": 0.6110682487487793, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1424 }, { "completion_length": 12.25, "epoch": 0.24969335903276677, "grad_norm": 21.74483115845773, "kl": 0.11572265625, "learning_rate": 7.504818643770808e-07, "loss": 0.0464, "reward": 1.3805147409439087, "reward_std": 0.08259022235870361, "rewards/accuracy_reward_stage2": 0.5055146813392639, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1425 }, { "completion_length": 7.53125, "epoch": 0.24986858244261434, "grad_norm": 28.82118561441724, "kl": 0.25390625, "learning_rate": 7.503066409672332e-07, "loss": 0.0226, "reward": 1.5050541162490845, "reward_std": 0.23860237002372742, "rewards/accuracy_reward_stage2": 0.5363041162490845, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1426 }, { "completion_length": 15.484375, "epoch": 0.2500438058524619, "grad_norm": 14.751929645062313, "kl": 0.029541015625, "learning_rate": 7.501314175573856e-07, "loss": 0.0118, "reward": 1.4637627601623535, "reward_std": 0.1221727728843689, "rewards/accuracy_reward_stage2": 0.46376264095306396, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1427 }, { "completion_length": 9.953125, "epoch": 0.25021902926230943, "grad_norm": 20.848579861484406, "kl": 0.06298828125, "learning_rate": 7.49956194147538e-07, "loss": 0.0251, "reward": 1.6132272481918335, "reward_std": 0.24379153549671173, "rewards/accuracy_reward_stage2": 0.6132272481918335, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1428 }, { "completion_length": 10.484375, "epoch": 0.250394252672157, "grad_norm": 12.875223980001039, "kl": 0.04638671875, "learning_rate": 7.497809707376905e-07, "loss": -0.0246, "reward": 1.5445420742034912, "reward_std": 0.10616233944892883, "rewards/accuracy_reward_stage2": 0.5601670742034912, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1429 }, { "completion_length": 11.40625, "epoch": 0.2505694760820046, "grad_norm": 20.47718883591059, "kl": 0.1630859375, "learning_rate": 7.49605747327843e-07, "loss": 0.0323, "reward": 1.5031461715698242, "reward_std": 0.21613532304763794, "rewards/accuracy_reward_stage2": 0.6437711715698242, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1430 }, { "completion_length": 7.171875, "epoch": 0.25074469949185213, "grad_norm": 16.85359317480518, "kl": 0.0830078125, "learning_rate": 7.494305239179954e-07, "loss": 0.0333, "reward": 1.4066383838653564, "reward_std": 0.13117088377475739, "rewards/accuracy_reward_stage2": 0.40663841366767883, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1431 }, { "completion_length": 8.234375, "epoch": 0.2509199229016997, "grad_norm": 16.17823877879919, "kl": 0.02392578125, "learning_rate": 7.492553005081479e-07, "loss": 0.0096, "reward": 1.5708149671554565, "reward_std": 0.175631582736969, "rewards/accuracy_reward_stage2": 0.5708150267601013, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1432 }, { "completion_length": 10.921875, "epoch": 0.2510951463115472, "grad_norm": 26.57272633941125, "kl": 0.06591796875, "learning_rate": 7.490800770983004e-07, "loss": 0.0264, "reward": 1.4339896440505981, "reward_std": 0.2509709894657135, "rewards/accuracy_reward_stage2": 0.43398961424827576, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1433 }, { "completion_length": 9.953125, "epoch": 0.25127036972139477, "grad_norm": 17.38723444203358, "kl": 0.1396484375, "learning_rate": 7.489048536884528e-07, "loss": 0.056, "reward": 1.7741584777832031, "reward_std": 0.19155940413475037, "rewards/accuracy_reward_stage2": 0.7741584181785583, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1434 }, { "completion_length": 6.390625, "epoch": 0.2514455931312423, "grad_norm": 28.928724087604284, "kl": 0.240234375, "learning_rate": 7.487296302786052e-07, "loss": 0.0556, "reward": 1.4168357849121094, "reward_std": 0.19237574934959412, "rewards/accuracy_reward_stage2": 0.4480857849121094, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1435 }, { "completion_length": 32.203125, "epoch": 0.25162081654108986, "grad_norm": 17.414773448675557, "kl": 0.07373046875, "learning_rate": 7.485544068687576e-07, "loss": 0.0295, "reward": 1.1635406017303467, "reward_std": 0.22368814051151276, "rewards/accuracy_reward_stage2": 0.4135405719280243, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1436 }, { "completion_length": 12.75, "epoch": 0.25179603995093747, "grad_norm": 14.257635558313762, "kl": 0.07275390625, "learning_rate": 7.4837918345891e-07, "loss": 0.0291, "reward": 1.6434564590454102, "reward_std": 0.10967773199081421, "rewards/accuracy_reward_stage2": 0.6434564590454102, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1437 }, { "completion_length": 8.65625, "epoch": 0.251971263360785, "grad_norm": 23.17077825610406, "kl": 0.166015625, "learning_rate": 7.482039600490625e-07, "loss": -0.0219, "reward": 1.4026463031768799, "reward_std": 0.28032857179641724, "rewards/accuracy_reward_stage2": 0.4495212733745575, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1438 }, { "completion_length": 12.28125, "epoch": 0.25214648677063256, "grad_norm": 24.845948232527043, "kl": 0.11962890625, "learning_rate": 7.480287366392149e-07, "loss": 0.0477, "reward": 1.4077582359313965, "reward_std": 0.24148832261562347, "rewards/accuracy_reward_stage2": 0.5327582359313965, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1439 }, { "completion_length": 16.875, "epoch": 0.2523217101804801, "grad_norm": 19.20260433661532, "kl": 0.048828125, "learning_rate": 7.478535132293674e-07, "loss": 0.0196, "reward": 1.3172272443771362, "reward_std": 0.14055949449539185, "rewards/accuracy_reward_stage2": 0.31722724437713623, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1440 }, { "completion_length": 11.921875, "epoch": 0.25249693359032765, "grad_norm": 19.72320501947569, "kl": 0.099609375, "learning_rate": 7.476782898195199e-07, "loss": 0.0001, "reward": 1.4920825958251953, "reward_std": 0.2956010699272156, "rewards/accuracy_reward_stage2": 0.5077076554298401, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1441 }, { "completion_length": 7.734375, "epoch": 0.2526721570001752, "grad_norm": 19.618789319973512, "kl": 0.07763671875, "learning_rate": 7.475030664096723e-07, "loss": -0.0043, "reward": 1.4340300559997559, "reward_std": 0.2557186782360077, "rewards/accuracy_reward_stage2": 0.5746550559997559, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1442 }, { "completion_length": 12.609375, "epoch": 0.2528473804100228, "grad_norm": 17.932373270098704, "kl": 0.068359375, "learning_rate": 7.473278429998248e-07, "loss": -0.0061, "reward": 1.3050273656845093, "reward_std": 0.2368200719356537, "rewards/accuracy_reward_stage2": 0.3206523656845093, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1443 }, { "completion_length": 10.3125, "epoch": 0.25302260381987035, "grad_norm": 17.592314152355126, "kl": 0.015625, "learning_rate": 7.471526195899772e-07, "loss": 0.0063, "reward": 1.6176997423171997, "reward_std": 0.23842398822307587, "rewards/accuracy_reward_stage2": 0.6176997423171997, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1444 }, { "completion_length": 7.828125, "epoch": 0.2531978272297179, "grad_norm": 17.54833559818638, "kl": 0.0625, "learning_rate": 7.469773961801297e-07, "loss": -0.0066, "reward": 1.583640456199646, "reward_std": 0.2253027856349945, "rewards/accuracy_reward_stage2": 0.599265456199646, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1445 }, { "completion_length": 7.34375, "epoch": 0.25337305063956544, "grad_norm": 18.76032144403276, "kl": 0.02783203125, "learning_rate": 7.468021727702822e-07, "loss": -0.033, "reward": 1.8142331838607788, "reward_std": 0.22606132924556732, "rewards/accuracy_reward_stage2": 0.829858124256134, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1446 }, { "completion_length": 13.171875, "epoch": 0.253548274049413, "grad_norm": 19.281481149835077, "kl": 0.21875, "learning_rate": 7.466269493604344e-07, "loss": 0.012, "reward": 1.2332404851913452, "reward_std": 0.2107694447040558, "rewards/accuracy_reward_stage2": 0.3894904851913452, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1447 }, { "completion_length": 12.515625, "epoch": 0.25372349745926054, "grad_norm": 16.387474225377876, "kl": 0.06005859375, "learning_rate": 7.464517259505869e-07, "loss": -0.0201, "reward": 1.6585381031036377, "reward_std": 0.18613138794898987, "rewards/accuracy_reward_stage2": 0.6741631031036377, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1448 }, { "completion_length": 8.734375, "epoch": 0.2538987208691081, "grad_norm": 23.012981197680062, "kl": 0.1591796875, "learning_rate": 7.462765025407393e-07, "loss": 0.0196, "reward": 1.4184439182281494, "reward_std": 0.311229944229126, "rewards/accuracy_reward_stage2": 0.5590689182281494, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1449 }, { "completion_length": 12.015625, "epoch": 0.2540739442789557, "grad_norm": 20.388117986982774, "kl": 0.0966796875, "learning_rate": 7.461012791308918e-07, "loss": 0.0387, "reward": 1.2674095630645752, "reward_std": 0.179796040058136, "rewards/accuracy_reward_stage2": 0.3924095034599304, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1450 }, { "completion_length": 11.859375, "epoch": 0.25424916768880323, "grad_norm": 23.739127987190525, "kl": 0.08740234375, "learning_rate": 7.459260557210443e-07, "loss": -0.0093, "reward": 1.3971586227416992, "reward_std": 0.24291972815990448, "rewards/accuracy_reward_stage2": 0.4127836227416992, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1451 }, { "completion_length": 9.953125, "epoch": 0.2544243910986508, "grad_norm": 18.38959476405202, "kl": 0.04541015625, "learning_rate": 7.457508323111967e-07, "loss": 0.0182, "reward": 1.6254558563232422, "reward_std": 0.16483592987060547, "rewards/accuracy_reward_stage2": 0.750455915927887, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1452 }, { "completion_length": 8.953125, "epoch": 0.25459961450849833, "grad_norm": 18.41502038751911, "kl": 0.07080078125, "learning_rate": 7.455756089013492e-07, "loss": -0.0368, "reward": 1.500500202178955, "reward_std": 0.23161879181861877, "rewards/accuracy_reward_stage2": 0.5317501425743103, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1453 }, { "completion_length": 8.484375, "epoch": 0.2547748379183459, "grad_norm": 21.173867795926054, "kl": 0.080078125, "learning_rate": 7.454003854915017e-07, "loss": 0.032, "reward": 1.5174919366836548, "reward_std": 0.19462689757347107, "rewards/accuracy_reward_stage2": 0.5174919962882996, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1454 }, { "completion_length": 10.890625, "epoch": 0.2549500613281934, "grad_norm": 24.184911493436093, "kl": 0.1875, "learning_rate": 7.452251620816541e-07, "loss": 0.0846, "reward": 1.4352514743804932, "reward_std": 0.25682583451271057, "rewards/accuracy_reward_stage2": 0.5602514743804932, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1455 }, { "completion_length": 17.625, "epoch": 0.255125284738041, "grad_norm": 14.765625590550027, "kl": 0.0595703125, "learning_rate": 7.450499386718066e-07, "loss": 0.011, "reward": 1.5301792621612549, "reward_std": 0.14312390983104706, "rewards/accuracy_reward_stage2": 0.6708042025566101, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1456 }, { "completion_length": 14.546875, "epoch": 0.25530050814788857, "grad_norm": 26.303298887600445, "kl": 0.12109375, "learning_rate": 7.448747152619589e-07, "loss": 0.0105, "reward": 1.30367112159729, "reward_std": 0.248937726020813, "rewards/accuracy_reward_stage2": 0.5692960619926453, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1457 }, { "completion_length": 10.71875, "epoch": 0.2554757315577361, "grad_norm": 15.87483540806158, "kl": 0.279296875, "learning_rate": 7.446994918521114e-07, "loss": 0.0236, "reward": 1.452156662940979, "reward_std": 0.25209495425224304, "rewards/accuracy_reward_stage2": 0.608406662940979, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1458 }, { "completion_length": 23.171875, "epoch": 0.25565095496758367, "grad_norm": 16.989660761895617, "kl": 0.10400390625, "learning_rate": 7.445242684422639e-07, "loss": -0.0027, "reward": 1.3787615299224854, "reward_std": 0.1453903168439865, "rewards/accuracy_reward_stage2": 0.5193865299224854, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1459 }, { "completion_length": 18.28125, "epoch": 0.2558261783774312, "grad_norm": 17.646162678744076, "kl": 0.04736328125, "learning_rate": 7.443490450324162e-07, "loss": 0.0189, "reward": 1.3753581047058105, "reward_std": 0.16490060091018677, "rewards/accuracy_reward_stage2": 0.37535810470581055, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1460 }, { "completion_length": 11.796875, "epoch": 0.25600140178727876, "grad_norm": 17.721008191652405, "kl": 0.09423828125, "learning_rate": 7.441738216225687e-07, "loss": 0.0376, "reward": 1.703101396560669, "reward_std": 0.1289398968219757, "rewards/accuracy_reward_stage2": 0.8281015157699585, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1461 }, { "completion_length": 7.8125, "epoch": 0.25617662519712636, "grad_norm": 18.96483793561938, "kl": 0.031494140625, "learning_rate": 7.439985982127212e-07, "loss": 0.0126, "reward": 1.644444465637207, "reward_std": 0.18405601382255554, "rewards/accuracy_reward_stage2": 0.644444465637207, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1462 }, { "completion_length": 13.390625, "epoch": 0.2563518486069739, "grad_norm": 17.659693979455977, "kl": 0.0439453125, "learning_rate": 7.438233748028736e-07, "loss": -0.0195, "reward": 1.7559140920639038, "reward_std": 0.2098643183708191, "rewards/accuracy_reward_stage2": 0.7715390920639038, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1463 }, { "completion_length": 5.984375, "epoch": 0.25652707201682146, "grad_norm": 15.24752737199156, "kl": 0.07763671875, "learning_rate": 7.436481513930261e-07, "loss": -0.0131, "reward": 1.7054529190063477, "reward_std": 0.20000435411930084, "rewards/accuracy_reward_stage2": 0.7210779786109924, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1464 }, { "completion_length": 10.046875, "epoch": 0.256702295426669, "grad_norm": 19.832224778307896, "kl": 0.053955078125, "learning_rate": 7.434729279831785e-07, "loss": 0.0215, "reward": 1.7961781024932861, "reward_std": 0.22936266660690308, "rewards/accuracy_reward_stage2": 0.7961781024932861, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1465 }, { "completion_length": 12.28125, "epoch": 0.25687751883651655, "grad_norm": 18.569052563883726, "kl": 0.0830078125, "learning_rate": 7.43297704573331e-07, "loss": -0.0106, "reward": 1.6345622539520264, "reward_std": 0.22242167592048645, "rewards/accuracy_reward_stage2": 0.6501872539520264, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1466 }, { "completion_length": 9.953125, "epoch": 0.2570527422463641, "grad_norm": 19.164318358894214, "kl": 0.130859375, "learning_rate": 7.431224811634834e-07, "loss": -0.0306, "reward": 1.5217496156692505, "reward_std": 0.26683294773101807, "rewards/accuracy_reward_stage2": 0.5529996156692505, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1467 }, { "completion_length": 9.828125, "epoch": 0.25722796565621164, "grad_norm": 23.655118472656955, "kl": 0.11376953125, "learning_rate": 7.429472577536358e-07, "loss": 0.026, "reward": 1.4828197956085205, "reward_std": 0.2148694396018982, "rewards/accuracy_reward_stage2": 0.4984448552131653, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1468 }, { "completion_length": 12.0625, "epoch": 0.25740318906605925, "grad_norm": 19.835704810324053, "kl": 0.09912109375, "learning_rate": 7.427720343437883e-07, "loss": 0.0397, "reward": 1.1822917461395264, "reward_std": 0.3223046064376831, "rewards/accuracy_reward_stage2": 0.4322916865348816, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1469 }, { "completion_length": 8.0625, "epoch": 0.2575784124759068, "grad_norm": 16.988692252928395, "kl": 0.15234375, "learning_rate": 7.425968109339408e-07, "loss": 0.0609, "reward": 1.390625, "reward_std": 0.16887323558330536, "rewards/accuracy_reward_stage2": 0.515625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1470 }, { "completion_length": 13.828125, "epoch": 0.25775363588575434, "grad_norm": 19.896280413945497, "kl": 0.171875, "learning_rate": 7.424215875240932e-07, "loss": -0.0636, "reward": 1.4487630128860474, "reward_std": 0.24810229241847992, "rewards/accuracy_reward_stage2": 0.49563801288604736, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1471 }, { "completion_length": 10.984375, "epoch": 0.2579288592956019, "grad_norm": 24.565980627282247, "kl": 0.16015625, "learning_rate": 7.422463641142457e-07, "loss": 0.0641, "reward": 1.600043773651123, "reward_std": 0.23149800300598145, "rewards/accuracy_reward_stage2": 0.600043773651123, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1472 }, { "completion_length": 11.328125, "epoch": 0.25810408270544943, "grad_norm": 16.323516653761956, "kl": 0.08935546875, "learning_rate": 7.42071140704398e-07, "loss": -0.0014, "reward": 1.389192819595337, "reward_std": 0.18605825304985046, "rewards/accuracy_reward_stage2": 0.4048178791999817, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1473 }, { "completion_length": 12.875, "epoch": 0.258279306115297, "grad_norm": 13.089491093183998, "kl": 0.0458984375, "learning_rate": 7.418959172945505e-07, "loss": 0.0184, "reward": 1.8300971984863281, "reward_std": 0.10606367141008377, "rewards/accuracy_reward_stage2": 0.8300973176956177, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1474 }, { "completion_length": 7.40625, "epoch": 0.2584545295251446, "grad_norm": 21.708425799782695, "kl": 0.09765625, "learning_rate": 7.41720693884703e-07, "loss": 0.0101, "reward": 1.613210916519165, "reward_std": 0.24434758722782135, "rewards/accuracy_reward_stage2": 0.6288357973098755, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1475 }, { "completion_length": 7.609375, "epoch": 0.25862975293499213, "grad_norm": 22.158316501149994, "kl": 0.07177734375, "learning_rate": 7.415454704748554e-07, "loss": -0.0089, "reward": 1.8659720420837402, "reward_std": 0.14032378792762756, "rewards/accuracy_reward_stage2": 0.8815969824790955, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1476 }, { "completion_length": 10.140625, "epoch": 0.2588049763448397, "grad_norm": 20.15236047195376, "kl": 0.05029296875, "learning_rate": 7.413702470650078e-07, "loss": 0.0201, "reward": 1.6414086818695068, "reward_std": 0.16311705112457275, "rewards/accuracy_reward_stage2": 0.6414086222648621, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1477 }, { "completion_length": 8.625, "epoch": 0.2589801997546872, "grad_norm": 11.965558883144823, "kl": 0.0380859375, "learning_rate": 7.411950236551603e-07, "loss": 0.0153, "reward": 1.6470980644226074, "reward_std": 0.09230685234069824, "rewards/accuracy_reward_stage2": 0.6470980644226074, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1478 }, { "completion_length": 13.5, "epoch": 0.25915542316453477, "grad_norm": 19.52953275782209, "kl": 0.09912109375, "learning_rate": 7.410198002453127e-07, "loss": 0.0396, "reward": 1.4840954542160034, "reward_std": 0.16965684294700623, "rewards/accuracy_reward_stage2": 0.6090954542160034, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1479 }, { "completion_length": 24.25, "epoch": 0.2593306465743823, "grad_norm": 21.65207615786653, "kl": 0.09375, "learning_rate": 7.408445768354652e-07, "loss": 0.0374, "reward": 1.379471778869629, "reward_std": 0.26678863167762756, "rewards/accuracy_reward_stage2": 0.5044718980789185, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1480 }, { "completion_length": 10.28125, "epoch": 0.2595058699842299, "grad_norm": 18.114718995075766, "kl": 0.046142578125, "learning_rate": 7.406693534256176e-07, "loss": 0.0185, "reward": 1.7144222259521484, "reward_std": 0.24147561192512512, "rewards/accuracy_reward_stage2": 0.7144221067428589, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1481 }, { "completion_length": 9.171875, "epoch": 0.25968109339407747, "grad_norm": 23.559305769873887, "kl": 0.2314453125, "learning_rate": 7.404941300157701e-07, "loss": 0.0559, "reward": 1.7861251831054688, "reward_std": 0.2582182288169861, "rewards/accuracy_reward_stage2": 0.8017500638961792, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1482 }, { "completion_length": 12.125, "epoch": 0.259856316803925, "grad_norm": 16.380539897990786, "kl": 0.09814453125, "learning_rate": 7.403189066059226e-07, "loss": 0.0392, "reward": 1.516692876815796, "reward_std": 0.14989466965198517, "rewards/accuracy_reward_stage2": 0.5166928172111511, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1483 }, { "completion_length": 6.640625, "epoch": 0.26003154021377256, "grad_norm": 14.736981869826542, "kl": 0.11181640625, "learning_rate": 7.40143683196075e-07, "loss": 0.0004, "reward": 1.8959097862243652, "reward_std": 0.15828779339790344, "rewards/accuracy_reward_stage2": 0.9115347862243652, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1484 }, { "completion_length": 7.328125, "epoch": 0.2602067636236201, "grad_norm": 15.499096500204017, "kl": 0.072265625, "learning_rate": 7.399684597862275e-07, "loss": -0.004, "reward": 1.6398365497589111, "reward_std": 0.17402216792106628, "rewards/accuracy_reward_stage2": 0.6554616093635559, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1485 }, { "completion_length": 10.8125, "epoch": 0.26038198703346765, "grad_norm": 18.499317534039804, "kl": 0.06640625, "learning_rate": 7.397932363763799e-07, "loss": 0.0265, "reward": 1.4845848083496094, "reward_std": 0.10854353755712509, "rewards/accuracy_reward_stage2": 0.4845846891403198, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1486 }, { "completion_length": 12.578125, "epoch": 0.2605572104433152, "grad_norm": 17.074920587332564, "kl": 0.12060546875, "learning_rate": 7.396180129665322e-07, "loss": 0.0481, "reward": 1.523491621017456, "reward_std": 0.0827716588973999, "rewards/accuracy_reward_stage2": 0.648491621017456, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1487 }, { "completion_length": 6.8125, "epoch": 0.2607324338531628, "grad_norm": 16.74247186108171, "kl": 0.10400390625, "learning_rate": 7.394427895566847e-07, "loss": 0.0031, "reward": 1.5125616788864136, "reward_std": 0.2064923644065857, "rewards/accuracy_reward_stage2": 0.6531867384910583, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1488 }, { "completion_length": 13.6875, "epoch": 0.26090765726301035, "grad_norm": 23.25587684438963, "kl": 0.1923828125, "learning_rate": 7.392675661468371e-07, "loss": 0.0769, "reward": 1.2328770160675049, "reward_std": 0.23550641536712646, "rewards/accuracy_reward_stage2": 0.4828770160675049, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1489 }, { "completion_length": 12.75, "epoch": 0.2610828806728579, "grad_norm": 16.46879874350823, "kl": 0.08984375, "learning_rate": 7.390923427369896e-07, "loss": 0.0359, "reward": 1.3310246467590332, "reward_std": 0.21695610880851746, "rewards/accuracy_reward_stage2": 0.4560246169567108, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1490 }, { "completion_length": 9.53125, "epoch": 0.26125810408270544, "grad_norm": 18.813317538380165, "kl": 0.150390625, "learning_rate": 7.389171193271421e-07, "loss": 0.0603, "reward": 1.631639003753662, "reward_std": 0.21653306484222412, "rewards/accuracy_reward_stage2": 0.7566390633583069, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1491 }, { "completion_length": 8.5625, "epoch": 0.261433327492553, "grad_norm": 14.57852348656557, "kl": 0.1455078125, "learning_rate": 7.387418959172945e-07, "loss": 0.0141, "reward": 1.3212279081344604, "reward_std": 0.20050299167633057, "rewards/accuracy_reward_stage2": 0.33685290813446045, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1492 }, { "completion_length": 9.71875, "epoch": 0.26160855090240054, "grad_norm": 19.213824798628277, "kl": 0.1484375, "learning_rate": 7.38566672507447e-07, "loss": 0.0593, "reward": 1.7892357110977173, "reward_std": 0.31280261278152466, "rewards/accuracy_reward_stage2": 0.7892358303070068, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1493 }, { "completion_length": 7.71875, "epoch": 0.26178377431224814, "grad_norm": 17.296368273180498, "kl": 0.11572265625, "learning_rate": 7.383914490975995e-07, "loss": 0.0462, "reward": 1.476837158203125, "reward_std": 0.15431980788707733, "rewards/accuracy_reward_stage2": 0.4768372178077698, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1494 }, { "completion_length": 6.453125, "epoch": 0.2619589977220957, "grad_norm": 15.24005128245555, "kl": 0.11328125, "learning_rate": 7.382162256877519e-07, "loss": -0.0341, "reward": 1.624133586883545, "reward_std": 0.1425294131040573, "rewards/accuracy_reward_stage2": 0.6553836464881897, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1495 }, { "completion_length": 9.375, "epoch": 0.26213422113194323, "grad_norm": 24.641600506436927, "kl": 0.045654296875, "learning_rate": 7.380410022779044e-07, "loss": 0.0182, "reward": 1.531754493713379, "reward_std": 0.2628553509712219, "rewards/accuracy_reward_stage2": 0.5317546129226685, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1496 }, { "completion_length": 12.90625, "epoch": 0.2623094445417908, "grad_norm": 17.434987564564846, "kl": 0.057373046875, "learning_rate": 7.378657788680567e-07, "loss": 0.0229, "reward": 1.3142204284667969, "reward_std": 0.08087074756622314, "rewards/accuracy_reward_stage2": 0.3142204284667969, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1497 }, { "completion_length": 9.0, "epoch": 0.26248466795163833, "grad_norm": 21.262342736610243, "kl": 0.0634765625, "learning_rate": 7.376905554582091e-07, "loss": 0.0254, "reward": 1.4722018241882324, "reward_std": 0.24770504236221313, "rewards/accuracy_reward_stage2": 0.47220176458358765, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1498 }, { "completion_length": 10.125, "epoch": 0.2626598913614859, "grad_norm": 26.079360544594778, "kl": 0.10693359375, "learning_rate": 7.375153320483616e-07, "loss": 0.0428, "reward": 1.614469289779663, "reward_std": 0.23577997088432312, "rewards/accuracy_reward_stage2": 0.6144692897796631, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1499 }, { "completion_length": 8.953125, "epoch": 0.2628351147713335, "grad_norm": 13.188425069955375, "kl": 0.07177734375, "learning_rate": 7.37340108638514e-07, "loss": 0.0287, "reward": 1.6197917461395264, "reward_std": 0.17163534462451935, "rewards/accuracy_reward_stage2": 0.7447916865348816, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1500 }, { "completion_length": 10.0625, "epoch": 0.263010338181181, "grad_norm": 25.910669248652916, "kl": 0.1787109375, "learning_rate": 7.371648852286665e-07, "loss": 0.0382, "reward": 1.2576980590820312, "reward_std": 0.421562522649765, "rewards/accuracy_reward_stage2": 0.39832305908203125, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1501 }, { "completion_length": 9.515625, "epoch": 0.26318556159102857, "grad_norm": 36.427488731530936, "kl": 0.173828125, "learning_rate": 7.36989661818819e-07, "loss": 0.0317, "reward": 1.3031154870986938, "reward_std": 0.3199523091316223, "rewards/accuracy_reward_stage2": 0.5687404870986938, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1502 }, { "completion_length": 9.640625, "epoch": 0.2633607850008761, "grad_norm": 19.300581958634684, "kl": 0.236328125, "learning_rate": 7.368144384089714e-07, "loss": 0.0945, "reward": 1.5580551624298096, "reward_std": 0.18352779746055603, "rewards/accuracy_reward_stage2": 0.6830551624298096, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1503 }, { "completion_length": 10.375, "epoch": 0.26353600841072367, "grad_norm": 20.598441592058176, "kl": 0.025390625, "learning_rate": 7.366392149991239e-07, "loss": 0.0102, "reward": 1.4132441282272339, "reward_std": 0.34681200981140137, "rewards/accuracy_reward_stage2": 0.4132440388202667, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1504 }, { "completion_length": 15.46875, "epoch": 0.2637112318205712, "grad_norm": 16.923225965838604, "kl": 0.0595703125, "learning_rate": 7.364639915892763e-07, "loss": -0.0203, "reward": 1.5988796949386597, "reward_std": 0.22254234552383423, "rewards/accuracy_reward_stage2": 0.6145046949386597, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1505 }, { "completion_length": 8.015625, "epoch": 0.26388645523041876, "grad_norm": 50.729038664876875, "kl": 0.220703125, "learning_rate": 7.362887681794288e-07, "loss": 0.0882, "reward": 1.7398931980133057, "reward_std": 0.1275492012500763, "rewards/accuracy_reward_stage2": 0.7398930788040161, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1506 }, { "completion_length": 11.875, "epoch": 0.26406167864026636, "grad_norm": 24.131657758244867, "kl": 0.072265625, "learning_rate": 7.361135447695812e-07, "loss": 0.0289, "reward": 1.7232370376586914, "reward_std": 0.17531737685203552, "rewards/accuracy_reward_stage2": 0.7232369184494019, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1507 }, { "completion_length": 11.984375, "epoch": 0.2642369020501139, "grad_norm": 18.52551523920455, "kl": 0.04638671875, "learning_rate": 7.359383213597336e-07, "loss": 0.0186, "reward": 1.78243887424469, "reward_std": 0.06022557616233826, "rewards/accuracy_reward_stage2": 0.7824387550354004, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1508 }, { "completion_length": 12.34375, "epoch": 0.26441212545996146, "grad_norm": 12.259126050769588, "kl": 0.04052734375, "learning_rate": 7.357630979498861e-07, "loss": -0.0206, "reward": 1.5980116128921509, "reward_std": 0.06384958326816559, "rewards/accuracy_reward_stage2": 0.6136366128921509, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1509 }, { "completion_length": 14.21875, "epoch": 0.264587348869809, "grad_norm": 113.84216603325396, "kl": 0.5625, "learning_rate": 7.355878745400386e-07, "loss": 0.2247, "reward": 1.5885417461395264, "reward_std": 0.20276054739952087, "rewards/accuracy_reward_stage2": 0.7135416269302368, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1510 }, { "completion_length": 14.0625, "epoch": 0.26476257227965655, "grad_norm": 22.257783257834138, "kl": 0.04443359375, "learning_rate": 7.354126511301909e-07, "loss": -0.0152, "reward": 1.488447666168213, "reward_std": 0.2847330570220947, "rewards/accuracy_reward_stage2": 0.5040726661682129, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1511 }, { "completion_length": 13.375, "epoch": 0.2649377956895041, "grad_norm": 19.024628703462216, "kl": 0.10986328125, "learning_rate": 7.352374277203434e-07, "loss": 0.0441, "reward": 1.3006612062454224, "reward_std": 0.16250282526016235, "rewards/accuracy_reward_stage2": 0.42566123604774475, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1512 }, { "completion_length": 8.421875, "epoch": 0.2651130190993517, "grad_norm": 23.432281970637504, "kl": 0.11669921875, "learning_rate": 7.350622043104958e-07, "loss": 0.0467, "reward": 1.5951578617095947, "reward_std": 0.2725307047367096, "rewards/accuracy_reward_stage2": 0.5951578617095947, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1513 }, { "completion_length": 13.90625, "epoch": 0.26528824250919925, "grad_norm": 15.405200044290508, "kl": 0.1337890625, "learning_rate": 7.348869809006483e-07, "loss": 0.0533, "reward": 1.2321314811706543, "reward_std": 0.09853121638298035, "rewards/accuracy_reward_stage2": 0.3571315109729767, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1514 }, { "completion_length": 9.671875, "epoch": 0.2654634659190468, "grad_norm": 14.970455451581415, "kl": 0.0169677734375, "learning_rate": 7.347117574908008e-07, "loss": 0.0068, "reward": 1.8090277910232544, "reward_std": 0.15713483095169067, "rewards/accuracy_reward_stage2": 0.8090277910232544, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1515 }, { "completion_length": 10.015625, "epoch": 0.26563868932889434, "grad_norm": 20.289129130031835, "kl": 0.126953125, "learning_rate": 7.345365340809531e-07, "loss": 0.0192, "reward": 1.2822504043579102, "reward_std": 0.1558304727077484, "rewards/accuracy_reward_stage2": 0.29787540435791016, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1516 }, { "completion_length": 7.984375, "epoch": 0.2658139127387419, "grad_norm": 12.479550386762886, "kl": 0.0517578125, "learning_rate": 7.343613106711056e-07, "loss": -0.0175, "reward": 1.6450035572052002, "reward_std": 0.17588838934898376, "rewards/accuracy_reward_stage2": 0.6606285572052002, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1517 }, { "completion_length": 17.109375, "epoch": 0.26598913614858943, "grad_norm": 19.38559620135752, "kl": 0.1162109375, "learning_rate": 7.341860872612581e-07, "loss": 0.0463, "reward": 1.4216383695602417, "reward_std": 0.1845116913318634, "rewards/accuracy_reward_stage2": 0.5466383695602417, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1518 }, { "completion_length": 12.25, "epoch": 0.266164359558437, "grad_norm": 13.140767714807309, "kl": 0.041748046875, "learning_rate": 7.340108638514105e-07, "loss": -0.0122, "reward": 1.6374698877334595, "reward_std": 0.16872502863407135, "rewards/accuracy_reward_stage2": 0.6530948877334595, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1519 }, { "completion_length": 12.8125, "epoch": 0.2663395829682846, "grad_norm": 17.16037416358004, "kl": 0.08837890625, "learning_rate": 7.33835640441563e-07, "loss": 0.0353, "reward": 1.5358409881591797, "reward_std": 0.26287949085235596, "rewards/accuracy_reward_stage2": 0.7858409881591797, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1520 }, { "completion_length": 12.1875, "epoch": 0.26651480637813213, "grad_norm": 20.780652134014453, "kl": 0.10498046875, "learning_rate": 7.336604170317154e-07, "loss": -0.0256, "reward": 1.5530736446380615, "reward_std": 0.22033998370170593, "rewards/accuracy_reward_stage2": 0.7093237042427063, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1521 }, { "completion_length": 9.15625, "epoch": 0.2666900297879797, "grad_norm": 21.789248697651665, "kl": 0.095703125, "learning_rate": 7.334851936218679e-07, "loss": -0.0756, "reward": 1.4975124597549438, "reward_std": 0.32140904664993286, "rewards/accuracy_reward_stage2": 0.6693874597549438, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1522 }, { "completion_length": 12.203125, "epoch": 0.2668652531978272, "grad_norm": 22.67098179945106, "kl": 0.125, "learning_rate": 7.333099702120204e-07, "loss": 0.05, "reward": 1.5546128749847412, "reward_std": 0.25176021456718445, "rewards/accuracy_reward_stage2": 0.679612934589386, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1523 }, { "completion_length": 16.75, "epoch": 0.26704047660767477, "grad_norm": 17.754697761796134, "kl": 0.1806640625, "learning_rate": 7.331347468021727e-07, "loss": 0.0722, "reward": 1.5393863916397095, "reward_std": 0.2093697190284729, "rewards/accuracy_reward_stage2": 0.5393863916397095, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1524 }, { "completion_length": 16.9375, "epoch": 0.2672157000175223, "grad_norm": 17.132183072177597, "kl": 0.0625, "learning_rate": 7.329595233923252e-07, "loss": 0.0251, "reward": 1.7056671380996704, "reward_std": 0.12278222292661667, "rewards/accuracy_reward_stage2": 0.7056670784950256, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1525 }, { "completion_length": 11.9375, "epoch": 0.2673909234273699, "grad_norm": 21.318333986059294, "kl": 0.14453125, "learning_rate": 7.327842999824775e-07, "loss": 0.0393, "reward": 1.4820630550384521, "reward_std": 0.1880171000957489, "rewards/accuracy_reward_stage2": 0.4976881146430969, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1526 }, { "completion_length": 6.453125, "epoch": 0.26756614683721747, "grad_norm": 15.136268546383176, "kl": 0.12060546875, "learning_rate": 7.3260907657263e-07, "loss": -0.0378, "reward": 1.5672743320465088, "reward_std": 0.22971659898757935, "rewards/accuracy_reward_stage2": 0.6141493320465088, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1527 }, { "completion_length": 6.796875, "epoch": 0.267741370247065, "grad_norm": 20.14827169845298, "kl": 0.08544921875, "learning_rate": 7.324338531627825e-07, "loss": -0.0082, "reward": 1.6023056507110596, "reward_std": 0.26246657967567444, "rewards/accuracy_reward_stage2": 0.6179307103157043, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1528 }, { "completion_length": 8.84375, "epoch": 0.26791659365691256, "grad_norm": 10.198922534215432, "kl": 0.0478515625, "learning_rate": 7.322586297529349e-07, "loss": 0.0192, "reward": 1.6613264083862305, "reward_std": 0.07768907397985458, "rewards/accuracy_reward_stage2": 0.6613264083862305, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1529 }, { "completion_length": 9.140625, "epoch": 0.2680918170667601, "grad_norm": 16.248910050517573, "kl": 0.08203125, "learning_rate": 7.320834063430874e-07, "loss": -0.0078, "reward": 1.5019550323486328, "reward_std": 0.19075937569141388, "rewards/accuracy_reward_stage2": 0.6425800323486328, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1530 }, { "completion_length": 7.859375, "epoch": 0.26826704047660765, "grad_norm": 20.561452330667606, "kl": 0.033447265625, "learning_rate": 7.319081829332399e-07, "loss": 0.0134, "reward": 1.5087047815322876, "reward_std": 0.26103299856185913, "rewards/accuracy_reward_stage2": 0.5087048411369324, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1531 }, { "completion_length": 21.0625, "epoch": 0.26844226388645526, "grad_norm": 18.92039677260036, "kl": 0.01806640625, "learning_rate": 7.317329595233923e-07, "loss": 0.0072, "reward": 1.5572808980941772, "reward_std": 0.2519490718841553, "rewards/accuracy_reward_stage2": 0.6822808980941772, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1532 }, { "completion_length": 8.46875, "epoch": 0.2686174872963028, "grad_norm": 16.95047090037599, "kl": 0.078125, "learning_rate": 7.315577361135448e-07, "loss": -0.0128, "reward": 1.5476700067520142, "reward_std": 0.22458161413669586, "rewards/accuracy_reward_stage2": 0.5632950067520142, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1533 }, { "completion_length": 11.53125, "epoch": 0.26879271070615035, "grad_norm": 13.105746686622085, "kl": 0.1044921875, "learning_rate": 7.313825127036972e-07, "loss": 0.0011, "reward": 1.5887963771820068, "reward_std": 0.20041370391845703, "rewards/accuracy_reward_stage2": 0.6044213771820068, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1534 }, { "completion_length": 12.71875, "epoch": 0.2689679341159979, "grad_norm": 22.064501649963276, "kl": 0.064453125, "learning_rate": 7.312072892938497e-07, "loss": 0.0163, "reward": 1.5054750442504883, "reward_std": 0.15925616025924683, "rewards/accuracy_reward_stage2": 0.5210999846458435, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1535 }, { "completion_length": 10.84375, "epoch": 0.26914315752584544, "grad_norm": 16.184712854528378, "kl": 0.1103515625, "learning_rate": 7.310320658840022e-07, "loss": 0.0001, "reward": 1.243492603302002, "reward_std": 0.18201014399528503, "rewards/accuracy_reward_stage2": 0.38411760330200195, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1536 }, { "completion_length": 15.578125, "epoch": 0.269318380935693, "grad_norm": 18.757460190128715, "kl": 0.078125, "learning_rate": 7.308568424741544e-07, "loss": 0.0312, "reward": 1.6463735103607178, "reward_std": 0.17724749445915222, "rewards/accuracy_reward_stage2": 0.646373450756073, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1537 }, { "completion_length": 12.90625, "epoch": 0.26949360434554054, "grad_norm": 12.440359363732176, "kl": 0.053466796875, "learning_rate": 7.306816190643069e-07, "loss": 0.0214, "reward": 1.3152844905853271, "reward_std": 0.08948960155248642, "rewards/accuracy_reward_stage2": 0.31528446078300476, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1538 }, { "completion_length": 8.828125, "epoch": 0.26966882775538814, "grad_norm": 23.314871830339797, "kl": 0.10693359375, "learning_rate": 7.305063956544594e-07, "loss": 0.0031, "reward": 1.6845183372497559, "reward_std": 0.31178730726242065, "rewards/accuracy_reward_stage2": 0.7001434564590454, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1539 }, { "completion_length": 8.265625, "epoch": 0.2698440511652357, "grad_norm": 15.128750411864983, "kl": 0.197265625, "learning_rate": 7.303311722446118e-07, "loss": 0.079, "reward": 1.5312914848327637, "reward_std": 0.1368497908115387, "rewards/accuracy_reward_stage2": 0.6562913656234741, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1540 }, { "completion_length": 9.453125, "epoch": 0.27001927457508323, "grad_norm": 9.897014047607549, "kl": 0.06982421875, "learning_rate": 7.301559488347643e-07, "loss": 0.028, "reward": 1.6693193912506104, "reward_std": 0.1036425307393074, "rewards/accuracy_reward_stage2": 0.7943194508552551, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1541 }, { "completion_length": 12.96875, "epoch": 0.2701944979849308, "grad_norm": 17.61779482500366, "kl": 0.0260009765625, "learning_rate": 7.299807254249167e-07, "loss": 0.0104, "reward": 1.4325488805770874, "reward_std": 0.23638816177845, "rewards/accuracy_reward_stage2": 0.4325488805770874, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1542 }, { "completion_length": 10.296875, "epoch": 0.27036972139477833, "grad_norm": 21.49084583990938, "kl": 0.240234375, "learning_rate": 7.298055020150692e-07, "loss": 0.096, "reward": 1.5804922580718994, "reward_std": 0.23495854437351227, "rewards/accuracy_reward_stage2": 0.7054921984672546, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1543 }, { "completion_length": 9.703125, "epoch": 0.2705449448046259, "grad_norm": 25.659390561790094, "kl": 0.1953125, "learning_rate": 7.296302786052217e-07, "loss": 0.0779, "reward": 1.3564759492874146, "reward_std": 0.2723844647407532, "rewards/accuracy_reward_stage2": 0.48147594928741455, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1544 }, { "completion_length": 11.140625, "epoch": 0.2707201682144735, "grad_norm": 21.15270962339563, "kl": 0.12060546875, "learning_rate": 7.294550551953741e-07, "loss": 0.0751, "reward": 1.6740481853485107, "reward_std": 0.16617107391357422, "rewards/accuracy_reward_stage2": 0.799048125743866, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1545 }, { "completion_length": 6.625, "epoch": 0.270895391624321, "grad_norm": 25.622727181263297, "kl": 0.095703125, "learning_rate": 7.292798317855265e-07, "loss": 0.0384, "reward": 1.722312331199646, "reward_std": 0.19830942153930664, "rewards/accuracy_reward_stage2": 0.847312331199646, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1546 }, { "completion_length": 9.796875, "epoch": 0.27107061503416857, "grad_norm": 27.517916045250036, "kl": 0.453125, "learning_rate": 7.29104608375679e-07, "loss": 0.1811, "reward": 1.391111135482788, "reward_std": 0.3434157371520996, "rewards/accuracy_reward_stage2": 0.6411112546920776, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1547 }, { "completion_length": 7.59375, "epoch": 0.2712458384440161, "grad_norm": 14.200419137196972, "kl": 0.0751953125, "learning_rate": 7.289293849658314e-07, "loss": 0.0301, "reward": 1.612762212753296, "reward_std": 0.1002715528011322, "rewards/accuracy_reward_stage2": 0.7377622127532959, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1548 }, { "completion_length": 15.734375, "epoch": 0.27142106185386367, "grad_norm": 19.81161434968578, "kl": 0.012939453125, "learning_rate": 7.287541615559838e-07, "loss": 0.0052, "reward": 1.5175888538360596, "reward_std": 0.219430610537529, "rewards/accuracy_reward_stage2": 0.5175889134407043, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1549 }, { "completion_length": 15.28125, "epoch": 0.2715962852637112, "grad_norm": 21.63402553561504, "kl": 0.1640625, "learning_rate": 7.285789381461362e-07, "loss": 0.0034, "reward": 1.5669314861297607, "reward_std": 0.19692623615264893, "rewards/accuracy_reward_stage2": 0.6138066053390503, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1550 }, { "completion_length": 12.21875, "epoch": 0.2717715086735588, "grad_norm": 22.15076349499154, "kl": 0.1708984375, "learning_rate": 7.284037147362887e-07, "loss": 0.0324, "reward": 1.4736478328704834, "reward_std": 0.13731749355793, "rewards/accuracy_reward_stage2": 0.4892728924751282, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1551 }, { "completion_length": 6.09375, "epoch": 0.27194673208340636, "grad_norm": 17.382817812945337, "kl": 0.1005859375, "learning_rate": 7.282284913264412e-07, "loss": 0.0404, "reward": 1.7686809301376343, "reward_std": 0.19698965549468994, "rewards/accuracy_reward_stage2": 0.7686809301376343, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1552 }, { "completion_length": 8.046875, "epoch": 0.2721219554932539, "grad_norm": 20.083754257329154, "kl": 0.1650390625, "learning_rate": 7.280532679165936e-07, "loss": 0.0661, "reward": 1.3247437477111816, "reward_std": 0.12370945513248444, "rewards/accuracy_reward_stage2": 0.44974377751350403, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1553 }, { "completion_length": 10.109375, "epoch": 0.27229717890310146, "grad_norm": 16.555514649268687, "kl": 0.0751953125, "learning_rate": 7.278780445067461e-07, "loss": -0.0325, "reward": 1.443509578704834, "reward_std": 0.3313031792640686, "rewards/accuracy_reward_stage2": 0.474759578704834, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1554 }, { "completion_length": 9.859375, "epoch": 0.272472402312949, "grad_norm": 17.971828702025466, "kl": 0.060302734375, "learning_rate": 7.277028210968986e-07, "loss": 0.0241, "reward": 1.5731000900268555, "reward_std": 0.29691874980926514, "rewards/accuracy_reward_stage2": 0.5731000900268555, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1555 }, { "completion_length": 15.796875, "epoch": 0.27264762572279655, "grad_norm": 14.684691166296368, "kl": 0.04541015625, "learning_rate": 7.275275976870509e-07, "loss": 0.0182, "reward": 1.428015947341919, "reward_std": 0.14021556079387665, "rewards/accuracy_reward_stage2": 0.42801591753959656, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1556 }, { "completion_length": 10.421875, "epoch": 0.2728228491326441, "grad_norm": 21.497354071634, "kl": 0.1943359375, "learning_rate": 7.273523742772034e-07, "loss": 0.0378, "reward": 1.5667054653167725, "reward_std": 0.31732115149497986, "rewards/accuracy_reward_stage2": 0.5823305249214172, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1557 }, { "completion_length": 7.53125, "epoch": 0.2729980725424917, "grad_norm": 12.187444232549343, "kl": 0.03515625, "learning_rate": 7.271771508673558e-07, "loss": 0.0141, "reward": 1.6551628112792969, "reward_std": 0.11131031066179276, "rewards/accuracy_reward_stage2": 0.6551628708839417, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1558 }, { "completion_length": 10.0625, "epoch": 0.27317329595233925, "grad_norm": 17.247051556669394, "kl": 0.037841796875, "learning_rate": 7.270019274575083e-07, "loss": 0.0152, "reward": 1.6297712326049805, "reward_std": 0.1473989188671112, "rewards/accuracy_reward_stage2": 0.6297712922096252, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1559 }, { "completion_length": 8.84375, "epoch": 0.2733485193621868, "grad_norm": 18.867441244462896, "kl": 0.0458984375, "learning_rate": 7.268267040476608e-07, "loss": 0.0184, "reward": 1.7025973796844482, "reward_std": 0.19792814552783966, "rewards/accuracy_reward_stage2": 0.7025973796844482, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1560 }, { "completion_length": 7.921875, "epoch": 0.27352374277203434, "grad_norm": 16.56831611531489, "kl": 0.11376953125, "learning_rate": 7.266514806378132e-07, "loss": -0.0846, "reward": 1.465267300605774, "reward_std": 0.2277645468711853, "rewards/accuracy_reward_stage2": 0.6371423006057739, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1561 }, { "completion_length": 10.59375, "epoch": 0.2736989661818819, "grad_norm": 21.307744690181806, "kl": 0.04248046875, "learning_rate": 7.264762572279656e-07, "loss": -0.0164, "reward": 1.3307263851165771, "reward_std": 0.30374133586883545, "rewards/accuracy_reward_stage2": 0.34635138511657715, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1562 }, { "completion_length": 10.109375, "epoch": 0.27387418959172943, "grad_norm": 20.814389143707405, "kl": 0.08544921875, "learning_rate": 7.263010338181181e-07, "loss": -0.0101, "reward": 1.476302146911621, "reward_std": 0.35749757289886475, "rewards/accuracy_reward_stage2": 0.4919270873069763, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1563 }, { "completion_length": 7.890625, "epoch": 0.27404941300157704, "grad_norm": 16.58609657046578, "kl": 0.050537109375, "learning_rate": 7.261258104082705e-07, "loss": 0.0202, "reward": 1.630878210067749, "reward_std": 0.21472877264022827, "rewards/accuracy_reward_stage2": 0.6308783292770386, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1564 }, { "completion_length": 12.390625, "epoch": 0.2742246364114246, "grad_norm": 13.702475279669857, "kl": 0.04296875, "learning_rate": 7.25950586998423e-07, "loss": 0.0172, "reward": 1.6737043857574463, "reward_std": 0.07372551411390305, "rewards/accuracy_reward_stage2": 0.6737043857574463, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1565 }, { "completion_length": 12.09375, "epoch": 0.27439985982127213, "grad_norm": 22.677346899697838, "kl": 0.109375, "learning_rate": 7.257753635885753e-07, "loss": 0.0437, "reward": 1.6756170988082886, "reward_std": 0.24650396406650543, "rewards/accuracy_reward_stage2": 0.6756170988082886, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1566 }, { "completion_length": 13.390625, "epoch": 0.2745750832311197, "grad_norm": 23.576489442401506, "kl": 0.026611328125, "learning_rate": 7.256001401787278e-07, "loss": 0.0107, "reward": 1.3751254081726074, "reward_std": 0.23284590244293213, "rewards/accuracy_reward_stage2": 0.5001254677772522, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1567 }, { "completion_length": 11.375, "epoch": 0.2747503066409672, "grad_norm": 25.478991718817426, "kl": 0.038818359375, "learning_rate": 7.254249167688803e-07, "loss": -0.0511, "reward": 1.4270833730697632, "reward_std": 0.37081876397132874, "rewards/accuracy_reward_stage2": 0.4583333134651184, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1568 }, { "completion_length": 10.4375, "epoch": 0.27492553005081477, "grad_norm": 17.76540063991035, "kl": 0.078125, "learning_rate": 7.252496933590327e-07, "loss": 0.0314, "reward": 1.5698776245117188, "reward_std": 0.2609265446662903, "rewards/accuracy_reward_stage2": 0.5698776245117188, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1569 }, { "completion_length": 9.96875, "epoch": 0.2751007534606623, "grad_norm": 20.370761525766085, "kl": 0.04638671875, "learning_rate": 7.250744699491852e-07, "loss": 0.0186, "reward": 1.5809662342071533, "reward_std": 0.2464766502380371, "rewards/accuracy_reward_stage2": 0.5809662342071533, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1570 }, { "completion_length": 8.59375, "epoch": 0.2752759768705099, "grad_norm": 21.541839159331936, "kl": 0.1123046875, "learning_rate": 7.248992465393377e-07, "loss": 0.0235, "reward": 1.5938735008239746, "reward_std": 0.25188708305358887, "rewards/accuracy_reward_stage2": 0.6094985008239746, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1571 }, { "completion_length": 9.015625, "epoch": 0.27545120028035747, "grad_norm": 11.988388116776276, "kl": 0.0615234375, "learning_rate": 7.247240231294901e-07, "loss": -0.0146, "reward": 1.7149364948272705, "reward_std": 0.2069161832332611, "rewards/accuracy_reward_stage2": 0.7305614948272705, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1572 }, { "completion_length": 10.125, "epoch": 0.275626423690205, "grad_norm": 22.24484166978493, "kl": 0.1923828125, "learning_rate": 7.245487997196426e-07, "loss": 0.0769, "reward": 1.0984094142913818, "reward_std": 0.1601712703704834, "rewards/accuracy_reward_stage2": 0.348409503698349, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1573 }, { "completion_length": 8.546875, "epoch": 0.27580164710005256, "grad_norm": 19.80297995187602, "kl": 0.1552734375, "learning_rate": 7.24373576309795e-07, "loss": 0.0012, "reward": 1.5901460647583008, "reward_std": 0.2562961280345917, "rewards/accuracy_reward_stage2": 0.6213959455490112, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1574 }, { "completion_length": 7.484375, "epoch": 0.2759768705099001, "grad_norm": 18.965599614704047, "kl": 0.076171875, "learning_rate": 7.241983528999474e-07, "loss": 0.0304, "reward": 1.657869815826416, "reward_std": 0.15086671710014343, "rewards/accuracy_reward_stage2": 0.6578697562217712, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1575 }, { "completion_length": 10.65625, "epoch": 0.27615209391974765, "grad_norm": 16.051869215816296, "kl": 0.07373046875, "learning_rate": 7.240231294900998e-07, "loss": -0.0147, "reward": 1.4174940586090088, "reward_std": 0.2014116644859314, "rewards/accuracy_reward_stage2": 0.5581189393997192, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1576 }, { "completion_length": 11.71875, "epoch": 0.27632731732959526, "grad_norm": 28.32712176310248, "kl": 0.04150390625, "learning_rate": 7.238479060802522e-07, "loss": 0.0166, "reward": 1.4083119630813599, "reward_std": 0.32629430294036865, "rewards/accuracy_reward_stage2": 0.5333119630813599, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1577 }, { "completion_length": 20.421875, "epoch": 0.2765025407394428, "grad_norm": 18.039033791626412, "kl": 0.0634765625, "learning_rate": 7.236726826704047e-07, "loss": 0.0254, "reward": 1.5804221630096436, "reward_std": 0.16351595520973206, "rewards/accuracy_reward_stage2": 0.5804222226142883, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1578 }, { "completion_length": 9.171875, "epoch": 0.27667776414929035, "grad_norm": 21.66543564045071, "kl": 0.212890625, "learning_rate": 7.234974592605572e-07, "loss": 0.0416, "reward": 1.7025885581970215, "reward_std": 0.3499017357826233, "rewards/accuracy_reward_stage2": 0.7182136178016663, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1579 }, { "completion_length": 17.84375, "epoch": 0.2768529875591379, "grad_norm": 18.73158665742104, "kl": 0.05224609375, "learning_rate": 7.233222358507096e-07, "loss": -0.0233, "reward": 1.4649728536605835, "reward_std": 0.19344475865364075, "rewards/accuracy_reward_stage2": 0.4805978834629059, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1580 }, { "completion_length": 10.453125, "epoch": 0.27702821096898544, "grad_norm": 19.02839268753434, "kl": 0.08740234375, "learning_rate": 7.231470124408621e-07, "loss": 0.0349, "reward": 1.4809317588806152, "reward_std": 0.26230835914611816, "rewards/accuracy_reward_stage2": 0.6059318780899048, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1581 }, { "completion_length": 12.21875, "epoch": 0.277203434378833, "grad_norm": 17.77505642063965, "kl": 0.052734375, "learning_rate": 7.229717890310145e-07, "loss": -0.0187, "reward": 1.6692399978637695, "reward_std": 0.23491568863391876, "rewards/accuracy_reward_stage2": 0.80986487865448, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1582 }, { "completion_length": 8.609375, "epoch": 0.2773786577886806, "grad_norm": 14.097807207383974, "kl": 0.16796875, "learning_rate": 7.22796565621167e-07, "loss": 0.0002, "reward": 1.224002480506897, "reward_std": 0.1800319105386734, "rewards/accuracy_reward_stage2": 0.255252480506897, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1583 }, { "completion_length": 8.390625, "epoch": 0.27755388119852814, "grad_norm": 18.685066348919854, "kl": 0.2158203125, "learning_rate": 7.226213422113195e-07, "loss": 0.086, "reward": 1.3312758207321167, "reward_std": 0.2569296658039093, "rewards/accuracy_reward_stage2": 0.4562758803367615, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1584 }, { "completion_length": 12.75, "epoch": 0.2777291046083757, "grad_norm": 24.130001829627535, "kl": 0.1962890625, "learning_rate": 7.224461188014719e-07, "loss": 0.0452, "reward": 1.5319864749908447, "reward_std": 0.31532320380210876, "rewards/accuracy_reward_stage2": 0.5476114749908447, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1585 }, { "completion_length": 9.25, "epoch": 0.27790432801822323, "grad_norm": 19.15274013812705, "kl": 0.03759765625, "learning_rate": 7.222708953916243e-07, "loss": 0.008, "reward": 1.5639368295669556, "reward_std": 0.1757792979478836, "rewards/accuracy_reward_stage2": 0.6889368295669556, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1586 }, { "completion_length": 7.546875, "epoch": 0.2780795514280708, "grad_norm": 21.16513159458688, "kl": 0.0576171875, "learning_rate": 7.220956719817766e-07, "loss": 0.0001, "reward": 1.4314806461334229, "reward_std": 0.2914354205131531, "rewards/accuracy_reward_stage2": 0.44710561633110046, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1587 }, { "completion_length": 7.875, "epoch": 0.27825477483791833, "grad_norm": 28.105192038087445, "kl": 0.267578125, "learning_rate": 7.219204485719291e-07, "loss": 0.107, "reward": 1.2831439971923828, "reward_std": 0.19191929697990417, "rewards/accuracy_reward_stage2": 0.533143937587738, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1588 }, { "completion_length": 7.71875, "epoch": 0.2784299982477659, "grad_norm": 20.911599804476825, "kl": 0.08642578125, "learning_rate": 7.217452251620816e-07, "loss": 0.0346, "reward": 1.5567564964294434, "reward_std": 0.18814368546009064, "rewards/accuracy_reward_stage2": 0.6817566156387329, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1589 }, { "completion_length": 6.484375, "epoch": 0.2786052216576135, "grad_norm": 18.09001762878497, "kl": 0.057861328125, "learning_rate": 7.21570001752234e-07, "loss": 0.0232, "reward": 1.5766370296478271, "reward_std": 0.16156907379627228, "rewards/accuracy_reward_stage2": 0.5766370296478271, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1590 }, { "completion_length": 8.90625, "epoch": 0.278780445067461, "grad_norm": 12.938723311552208, "kl": 0.036865234375, "learning_rate": 7.213947783423865e-07, "loss": 0.0147, "reward": 1.4394537210464478, "reward_std": 0.09740547835826874, "rewards/accuracy_reward_stage2": 0.43945372104644775, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1591 }, { "completion_length": 11.921875, "epoch": 0.27895566847730857, "grad_norm": 17.96221751888626, "kl": 0.0341796875, "learning_rate": 7.21219554932539e-07, "loss": -0.0079, "reward": 1.4706923961639404, "reward_std": 0.18699324131011963, "rewards/accuracy_reward_stage2": 0.48631754517555237, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1592 }, { "completion_length": 9.015625, "epoch": 0.2791308918871561, "grad_norm": 18.842290019196636, "kl": 0.07373046875, "learning_rate": 7.210443315226914e-07, "loss": 0.0295, "reward": 1.8209922313690186, "reward_std": 0.2510858178138733, "rewards/accuracy_reward_stage2": 0.8209922909736633, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1593 }, { "completion_length": 11.796875, "epoch": 0.27930611529700367, "grad_norm": 20.724838234141696, "kl": 0.020263671875, "learning_rate": 7.208691081128439e-07, "loss": 0.0081, "reward": 1.7598631381988525, "reward_std": 0.1263093203306198, "rewards/accuracy_reward_stage2": 0.7598632574081421, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1594 }, { "completion_length": 11.890625, "epoch": 0.2794813387068512, "grad_norm": 21.423489400476658, "kl": 0.068359375, "learning_rate": 7.206938847029962e-07, "loss": 0.0274, "reward": 1.5535926818847656, "reward_std": 0.16711866855621338, "rewards/accuracy_reward_stage2": 0.5535926818847656, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1595 }, { "completion_length": 10.515625, "epoch": 0.2796565621166988, "grad_norm": 14.707843957438799, "kl": 0.03271484375, "learning_rate": 7.205186612931487e-07, "loss": 0.013, "reward": 1.8087513446807861, "reward_std": 0.07620933651924133, "rewards/accuracy_reward_stage2": 0.8087514042854309, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1596 }, { "completion_length": 6.96875, "epoch": 0.27983178552654636, "grad_norm": 20.941631632466663, "kl": 0.083984375, "learning_rate": 7.203434378833012e-07, "loss": 0.0048, "reward": 1.7164759635925293, "reward_std": 0.19856837391853333, "rewards/accuracy_reward_stage2": 0.8571010828018188, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1597 }, { "completion_length": 12.390625, "epoch": 0.2800070089363939, "grad_norm": 13.708390554967615, "kl": 0.1943359375, "learning_rate": 7.201682144734536e-07, "loss": 0.0436, "reward": 1.4508538246154785, "reward_std": 0.21650546789169312, "rewards/accuracy_reward_stage2": 0.5914788246154785, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1598 }, { "completion_length": 17.421875, "epoch": 0.28018223234624146, "grad_norm": 15.884490536295282, "kl": 0.01385498046875, "learning_rate": 7.199929910636061e-07, "loss": 0.0055, "reward": 1.6412231922149658, "reward_std": 0.1582455039024353, "rewards/accuracy_reward_stage2": 0.6412231922149658, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1599 }, { "completion_length": 12.75, "epoch": 0.280357455756089, "grad_norm": 19.54963850066304, "kl": 0.2392578125, "learning_rate": 7.198177676537585e-07, "loss": 0.0515, "reward": 1.5843968391418457, "reward_std": 0.3087541460990906, "rewards/accuracy_reward_stage2": 0.7250218391418457, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1600 }, { "completion_length": 8.796875, "epoch": 0.28053267916593655, "grad_norm": 11.990420921934714, "kl": 0.1044921875, "learning_rate": 7.196425442439109e-07, "loss": -0.0466, "reward": 1.8261520862579346, "reward_std": 0.12564031779766083, "rewards/accuracy_reward_stage2": 0.8574021458625793, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1601 }, { "completion_length": 9.625, "epoch": 0.28070790257578415, "grad_norm": 16.45410163995305, "kl": 0.07177734375, "learning_rate": 7.194673208340634e-07, "loss": -0.0153, "reward": 1.3370466232299805, "reward_std": 0.2418329268693924, "rewards/accuracy_reward_stage2": 0.35267162322998047, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1602 }, { "completion_length": 7.515625, "epoch": 0.2808831259856317, "grad_norm": 15.062055007504009, "kl": 0.0269775390625, "learning_rate": 7.192920974242158e-07, "loss": 0.0108, "reward": 1.6208202838897705, "reward_std": 0.11473879963159561, "rewards/accuracy_reward_stage2": 0.6208202242851257, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1603 }, { "completion_length": 15.46875, "epoch": 0.28105834939547925, "grad_norm": 15.064200762093972, "kl": 0.045654296875, "learning_rate": 7.191168740143683e-07, "loss": -0.0235, "reward": 1.4908268451690674, "reward_std": 0.14759297668933868, "rewards/accuracy_reward_stage2": 0.5064517855644226, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1604 }, { "completion_length": 9.53125, "epoch": 0.2812335728053268, "grad_norm": 18.400798164712032, "kl": 0.28125, "learning_rate": 7.189416506045208e-07, "loss": 0.0809, "reward": 1.1706050634384155, "reward_std": 0.21874907612800598, "rewards/accuracy_reward_stage2": 0.4362300634384155, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1605 }, { "completion_length": 9.75, "epoch": 0.28140879621517434, "grad_norm": 16.819137140159572, "kl": 0.07080078125, "learning_rate": 7.187664271946731e-07, "loss": 0.0284, "reward": 1.6809344291687012, "reward_std": 0.2468591034412384, "rewards/accuracy_reward_stage2": 0.6809343099594116, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1606 }, { "completion_length": 12.28125, "epoch": 0.2815840196250219, "grad_norm": 24.427891880001116, "kl": 0.2197265625, "learning_rate": 7.185912037848256e-07, "loss": 0.0876, "reward": 1.5013978481292725, "reward_std": 0.2571167051792145, "rewards/accuracy_reward_stage2": 0.6263978481292725, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1607 }, { "completion_length": 9.8125, "epoch": 0.28175924303486943, "grad_norm": 10.410467714252185, "kl": 0.059814453125, "learning_rate": 7.184159803749781e-07, "loss": 0.0143, "reward": 1.71875, "reward_std": 0.1246790662407875, "rewards/accuracy_reward_stage2": 0.734375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1608 }, { "completion_length": 14.9375, "epoch": 0.28193446644471704, "grad_norm": 29.89187810370791, "kl": 0.050537109375, "learning_rate": 7.182407569651305e-07, "loss": 0.014, "reward": 1.3833160400390625, "reward_std": 0.2767779231071472, "rewards/accuracy_reward_stage2": 0.3989410996437073, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1609 }, { "completion_length": 12.703125, "epoch": 0.2821096898545646, "grad_norm": 22.31757950701303, "kl": 0.3203125, "learning_rate": 7.18065533555283e-07, "loss": 0.0685, "reward": 1.545056700706482, "reward_std": 0.20747987926006317, "rewards/accuracy_reward_stage2": 0.6856817007064819, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1610 }, { "completion_length": 11.03125, "epoch": 0.28228491326441213, "grad_norm": 15.946911886957992, "kl": 0.03125, "learning_rate": 7.178903101454354e-07, "loss": 0.0125, "reward": 1.7586082220077515, "reward_std": 0.13714845478534698, "rewards/accuracy_reward_stage2": 0.7586082220077515, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1611 }, { "completion_length": 23.9375, "epoch": 0.2824601366742597, "grad_norm": 21.26908901702462, "kl": 0.26953125, "learning_rate": 7.177150867355879e-07, "loss": 0.1081, "reward": 1.35392427444458, "reward_std": 0.16395366191864014, "rewards/accuracy_reward_stage2": 0.4789242446422577, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1612 }, { "completion_length": 10.5625, "epoch": 0.2826353600841072, "grad_norm": 21.791260706148368, "kl": 0.03759765625, "learning_rate": 7.175398633257403e-07, "loss": -0.0184, "reward": 1.765136480331421, "reward_std": 0.20289413630962372, "rewards/accuracy_reward_stage2": 0.9057614803314209, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1613 }, { "completion_length": 8.8125, "epoch": 0.28281058349395477, "grad_norm": 23.844684700591703, "kl": 0.060791015625, "learning_rate": 7.173646399158927e-07, "loss": -0.0091, "reward": 1.355473518371582, "reward_std": 0.3000277876853943, "rewards/accuracy_reward_stage2": 0.37109851837158203, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1614 }, { "completion_length": 14.484375, "epoch": 0.2829858069038024, "grad_norm": 14.153123323249732, "kl": 0.06591796875, "learning_rate": 7.171894165060451e-07, "loss": -0.0179, "reward": 1.6771589517593384, "reward_std": 0.14484737813472748, "rewards/accuracy_reward_stage2": 0.6927839517593384, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1615 }, { "completion_length": 13.734375, "epoch": 0.2831610303136499, "grad_norm": 17.854073307258215, "kl": 0.0947265625, "learning_rate": 7.170141930961976e-07, "loss": 0.0379, "reward": 1.5207949876785278, "reward_std": 0.15267062187194824, "rewards/accuracy_reward_stage2": 0.6457949876785278, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1616 }, { "completion_length": 7.21875, "epoch": 0.28333625372349747, "grad_norm": 14.907211623261963, "kl": 0.205078125, "learning_rate": 7.1683896968635e-07, "loss": -0.0062, "reward": 1.416919231414795, "reward_std": 0.2574279308319092, "rewards/accuracy_reward_stage2": 0.5731692314147949, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1617 }, { "completion_length": 9.59375, "epoch": 0.283511477133345, "grad_norm": 23.825957362220002, "kl": 0.115234375, "learning_rate": 7.166637462765025e-07, "loss": 0.046, "reward": 1.5133368968963623, "reward_std": 0.1434442102909088, "rewards/accuracy_reward_stage2": 0.7633370161056519, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1618 }, { "completion_length": 9.671875, "epoch": 0.28368670054319256, "grad_norm": 12.868527094628538, "kl": 0.05615234375, "learning_rate": 7.164885228666549e-07, "loss": -0.0482, "reward": 1.6809239387512207, "reward_std": 0.20865806937217712, "rewards/accuracy_reward_stage2": 0.7121739387512207, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1619 }, { "completion_length": 13.109375, "epoch": 0.2838619239530401, "grad_norm": 16.658702628163443, "kl": 0.0118408203125, "learning_rate": 7.163132994568074e-07, "loss": 0.0047, "reward": 1.34375, "reward_std": 0.2630179226398468, "rewards/accuracy_reward_stage2": 0.46875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1620 }, { "completion_length": 12.203125, "epoch": 0.28403714736288765, "grad_norm": 16.104471025041327, "kl": 0.14453125, "learning_rate": 7.161380760469599e-07, "loss": 0.0582, "reward": 1.785825490951538, "reward_std": 0.10489936918020248, "rewards/accuracy_reward_stage2": 0.9108256101608276, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1621 }, { "completion_length": 20.03125, "epoch": 0.28421237077273526, "grad_norm": 15.819631796987974, "kl": 0.1083984375, "learning_rate": 7.159628526371123e-07, "loss": -0.0419, "reward": 1.4067251682281494, "reward_std": 0.20268367230892181, "rewards/accuracy_reward_stage2": 0.5629751086235046, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1622 }, { "completion_length": 7.140625, "epoch": 0.2843875941825828, "grad_norm": 37.538471453661955, "kl": 0.06884765625, "learning_rate": 7.157876292272648e-07, "loss": 0.0275, "reward": 1.5744768381118774, "reward_std": 0.1758725643157959, "rewards/accuracy_reward_stage2": 0.5744768977165222, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1623 }, { "completion_length": 9.6875, "epoch": 0.28456281759243035, "grad_norm": 18.377628674736215, "kl": 0.18359375, "learning_rate": 7.156124058174173e-07, "loss": 0.0447, "reward": 1.4850250482559204, "reward_std": 0.2172246277332306, "rewards/accuracy_reward_stage2": 0.6256501078605652, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1624 }, { "completion_length": 12.078125, "epoch": 0.2847380410022779, "grad_norm": 18.698228548749846, "kl": 0.09130859375, "learning_rate": 7.154371824075697e-07, "loss": -0.052, "reward": 1.5879881381988525, "reward_std": 0.14833402633666992, "rewards/accuracy_reward_stage2": 0.6192381381988525, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1625 }, { "completion_length": 7.59375, "epoch": 0.28491326441212544, "grad_norm": 13.655609339465885, "kl": 0.296875, "learning_rate": 7.15261958997722e-07, "loss": 0.031, "reward": 1.6834176778793335, "reward_std": 0.18731647729873657, "rewards/accuracy_reward_stage2": 0.7146677374839783, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1626 }, { "completion_length": 10.671875, "epoch": 0.285088487821973, "grad_norm": 23.757278282718293, "kl": 0.13671875, "learning_rate": 7.150867355878744e-07, "loss": 0.0546, "reward": 1.4906599521636963, "reward_std": 0.10720989108085632, "rewards/accuracy_reward_stage2": 0.4906599223613739, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1627 }, { "completion_length": 10.25, "epoch": 0.2852637112318206, "grad_norm": 55.79335785557363, "kl": 0.0211181640625, "learning_rate": 7.149115121780269e-07, "loss": -0.0306, "reward": 1.618137240409851, "reward_std": 0.24807269871234894, "rewards/accuracy_reward_stage2": 0.7587622404098511, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1628 }, { "completion_length": 6.921875, "epoch": 0.28543893464166814, "grad_norm": 18.74178145843626, "kl": 0.1328125, "learning_rate": 7.147362887681794e-07, "loss": -0.0351, "reward": 1.3534233570098877, "reward_std": 0.1923113763332367, "rewards/accuracy_reward_stage2": 0.3846732974052429, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1629 }, { "completion_length": 9.359375, "epoch": 0.2856141580515157, "grad_norm": 11.461917013351266, "kl": 0.2080078125, "learning_rate": 7.145610653583318e-07, "loss": 0.0833, "reward": 1.4668022394180298, "reward_std": 0.08062316477298737, "rewards/accuracy_reward_stage2": 0.466802179813385, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1630 }, { "completion_length": 13.796875, "epoch": 0.28578938146136323, "grad_norm": 22.880906576728375, "kl": 0.1318359375, "learning_rate": 7.143858419484843e-07, "loss": 0.0087, "reward": 1.6901664733886719, "reward_std": 0.30371004343032837, "rewards/accuracy_reward_stage2": 0.8307914733886719, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1631 }, { "completion_length": 7.015625, "epoch": 0.2859646048712108, "grad_norm": 19.54098394102929, "kl": 0.07275390625, "learning_rate": 7.142106185386368e-07, "loss": 0.0292, "reward": 1.4868440628051758, "reward_std": 0.1463613510131836, "rewards/accuracy_reward_stage2": 0.48684412240982056, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1632 }, { "completion_length": 10.375, "epoch": 0.28613982828105833, "grad_norm": 21.269052597139854, "kl": 0.06494140625, "learning_rate": 7.140353951287892e-07, "loss": -0.0137, "reward": 1.6641602516174316, "reward_std": 0.3177988529205322, "rewards/accuracy_reward_stage2": 0.6797853112220764, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1633 }, { "completion_length": 5.234375, "epoch": 0.28631505169090593, "grad_norm": 16.2130956464234, "kl": 0.0673828125, "learning_rate": 7.138601717189417e-07, "loss": -0.0235, "reward": 1.3314732313156128, "reward_std": 0.19711866974830627, "rewards/accuracy_reward_stage2": 0.4720982015132904, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1634 }, { "completion_length": 7.21875, "epoch": 0.2864902751007535, "grad_norm": 45.213911068141066, "kl": 0.46484375, "learning_rate": 7.13684948309094e-07, "loss": 0.1418, "reward": 1.2447587251663208, "reward_std": 0.23937861621379852, "rewards/accuracy_reward_stage2": 0.3853837251663208, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1635 }, { "completion_length": 19.984375, "epoch": 0.286665498510601, "grad_norm": 16.039437320650997, "kl": 0.09912109375, "learning_rate": 7.135097248992465e-07, "loss": -0.0685, "reward": 1.687240719795227, "reward_std": 0.16071754693984985, "rewards/accuracy_reward_stage2": 0.7341156601905823, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1636 }, { "completion_length": 13.296875, "epoch": 0.28684072192044857, "grad_norm": 14.849111870421927, "kl": 0.185546875, "learning_rate": 7.13334501489399e-07, "loss": 0.074, "reward": 1.274766445159912, "reward_std": 0.1772993505001068, "rewards/accuracy_reward_stage2": 0.5247663855552673, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1637 }, { "completion_length": 13.046875, "epoch": 0.2870159453302961, "grad_norm": 23.99212122858826, "kl": 0.07373046875, "learning_rate": 7.131592780795513e-07, "loss": -0.0338, "reward": 1.5409257411956787, "reward_std": 0.31629616022109985, "rewards/accuracy_reward_stage2": 0.5721758008003235, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1638 }, { "completion_length": 34.140625, "epoch": 0.28719116874014367, "grad_norm": 20.01264481300106, "kl": 0.09375, "learning_rate": 7.129840546697038e-07, "loss": 0.0375, "reward": 1.5125254392623901, "reward_std": 0.23700213432312012, "rewards/accuracy_reward_stage2": 0.5125254392623901, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1639 }, { "completion_length": 9.0625, "epoch": 0.2873663921499912, "grad_norm": 11.016581012451287, "kl": 0.061767578125, "learning_rate": 7.128088312598563e-07, "loss": 0.0247, "reward": 1.7490017414093018, "reward_std": 0.10017408430576324, "rewards/accuracy_reward_stage2": 0.7490018010139465, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1640 }, { "completion_length": 12.375, "epoch": 0.2875416155598388, "grad_norm": 25.213433901660334, "kl": 0.23046875, "learning_rate": 7.126336078500087e-07, "loss": 0.1007, "reward": 1.5582143068313599, "reward_std": 0.2488112449645996, "rewards/accuracy_reward_stage2": 0.8082143068313599, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1641 }, { "completion_length": 7.5625, "epoch": 0.28771683896968636, "grad_norm": 24.57818343120992, "kl": 0.201171875, "learning_rate": 7.124583844401612e-07, "loss": -0.0179, "reward": 1.7404444217681885, "reward_std": 0.25796931982040405, "rewards/accuracy_reward_stage2": 0.7873194217681885, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1642 }, { "completion_length": 27.875, "epoch": 0.2878920623795339, "grad_norm": 22.31498698944003, "kl": 0.05712890625, "learning_rate": 7.122831610303136e-07, "loss": 0.0229, "reward": 1.6970582008361816, "reward_std": 0.2050331085920334, "rewards/accuracy_reward_stage2": 0.6970581412315369, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1643 }, { "completion_length": 16.09375, "epoch": 0.28806728578938146, "grad_norm": 19.38288084898735, "kl": 0.0439453125, "learning_rate": 7.121079376204661e-07, "loss": 0.0175, "reward": 1.7315735816955566, "reward_std": 0.19836972653865814, "rewards/accuracy_reward_stage2": 0.7315736413002014, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1644 }, { "completion_length": 7.296875, "epoch": 0.288242509199229, "grad_norm": 20.68800808708774, "kl": 0.04296875, "learning_rate": 7.119327142106185e-07, "loss": 0.0172, "reward": 1.6252111196517944, "reward_std": 0.2738679349422455, "rewards/accuracy_reward_stage2": 0.6252111196517944, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1645 }, { "completion_length": 9.328125, "epoch": 0.28841773260907655, "grad_norm": 25.418979208262513, "kl": 0.263671875, "learning_rate": 7.117574908007709e-07, "loss": 0.0062, "reward": 1.490492820739746, "reward_std": 0.32653310894966125, "rewards/accuracy_reward_stage2": 0.6623678207397461, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1646 }, { "completion_length": 10.515625, "epoch": 0.28859295601892415, "grad_norm": 17.77005226261, "kl": 0.142578125, "learning_rate": 7.115822673909234e-07, "loss": 0.0256, "reward": 1.305906057357788, "reward_std": 0.2853769063949585, "rewards/accuracy_reward_stage2": 0.5715309381484985, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1647 }, { "completion_length": 8.328125, "epoch": 0.2887681794287717, "grad_norm": 21.53141032151713, "kl": 0.1259765625, "learning_rate": 7.114070439810759e-07, "loss": 0.0504, "reward": 1.5774625539779663, "reward_std": 0.19729429483413696, "rewards/accuracy_reward_stage2": 0.5774626135826111, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1648 }, { "completion_length": 9.765625, "epoch": 0.28894340283861925, "grad_norm": 22.13505312706258, "kl": 0.029541015625, "learning_rate": 7.112318205712283e-07, "loss": -0.0273, "reward": 1.4544023275375366, "reward_std": 0.2900089621543884, "rewards/accuracy_reward_stage2": 0.5950272679328918, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1649 }, { "completion_length": 10.4375, "epoch": 0.2891186262484668, "grad_norm": 16.66684202268035, "kl": 0.030517578125, "learning_rate": 7.110565971613808e-07, "loss": 0.0122, "reward": 1.1875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward_stage2": 0.3125, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1650 }, { "completion_length": 6.1875, "epoch": 0.28929384965831434, "grad_norm": 7.4505253530629245, "kl": 0.041748046875, "learning_rate": 7.108813737515331e-07, "loss": 0.0167, "reward": 1.6225404739379883, "reward_std": 0.0069564878940582275, "rewards/accuracy_reward_stage2": 0.7475404739379883, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1651 }, { "completion_length": 7.53125, "epoch": 0.2894690730681619, "grad_norm": 28.55696586333797, "kl": 0.1796875, "learning_rate": 7.107061503416856e-07, "loss": 0.0279, "reward": 1.49065363407135, "reward_std": 0.29641100764274597, "rewards/accuracy_reward_stage2": 0.5062786340713501, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1652 }, { "completion_length": 11.75, "epoch": 0.2896442964780095, "grad_norm": 20.033273583846604, "kl": 0.042236328125, "learning_rate": 7.105309269318381e-07, "loss": 0.0169, "reward": 1.4843425750732422, "reward_std": 0.209964781999588, "rewards/accuracy_reward_stage2": 0.6093425750732422, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1653 }, { "completion_length": 13.109375, "epoch": 0.28981951988785704, "grad_norm": 19.51626253625671, "kl": 0.11669921875, "learning_rate": 7.103557035219905e-07, "loss": -0.1495, "reward": 1.3871873617172241, "reward_std": 0.32389187812805176, "rewards/accuracy_reward_stage2": 0.4653124511241913, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 1654 }, { "completion_length": 7.59375, "epoch": 0.2899947432977046, "grad_norm": 14.401028332479045, "kl": 0.0184326171875, "learning_rate": 7.101804801121429e-07, "loss": 0.0074, "reward": 1.8791757822036743, "reward_std": 0.12005934864282608, "rewards/accuracy_reward_stage2": 0.8791757225990295, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1655 }, { "completion_length": 16.75, "epoch": 0.29016996670755213, "grad_norm": 24.425108474466274, "kl": 0.1767578125, "learning_rate": 7.100052567022954e-07, "loss": 0.0268, "reward": 1.4417062997817993, "reward_std": 0.24762201309204102, "rewards/accuracy_reward_stage2": 0.45733126997947693, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1656 }, { "completion_length": 9.125, "epoch": 0.2903451901173997, "grad_norm": 15.885346234048205, "kl": 0.08251953125, "learning_rate": 7.098300332924478e-07, "loss": -0.0457, "reward": 1.750290870666504, "reward_std": 0.21685898303985596, "rewards/accuracy_reward_stage2": 0.7815408706665039, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1657 }, { "completion_length": 7.921875, "epoch": 0.2905204135272472, "grad_norm": 14.827047303372789, "kl": 0.07177734375, "learning_rate": 7.096548098826003e-07, "loss": 0.0286, "reward": 1.8020378351211548, "reward_std": 0.13259246945381165, "rewards/accuracy_reward_stage2": 0.8020378351211548, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1658 }, { "completion_length": 10.53125, "epoch": 0.29069563693709477, "grad_norm": 21.85151316980941, "kl": 0.09912109375, "learning_rate": 7.094795864727527e-07, "loss": -0.0489, "reward": 1.6325395107269287, "reward_std": 0.3101848363876343, "rewards/accuracy_reward_stage2": 0.7887896299362183, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1659 }, { "completion_length": 6.265625, "epoch": 0.2908708603469424, "grad_norm": 18.287190634942217, "kl": 0.0615234375, "learning_rate": 7.093043630629052e-07, "loss": 0.0246, "reward": 1.5881173610687256, "reward_std": 0.10431988537311554, "rewards/accuracy_reward_stage2": 0.5881173610687256, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1660 }, { "completion_length": 8.65625, "epoch": 0.2910460837567899, "grad_norm": 15.895496389770273, "kl": 0.061279296875, "learning_rate": 7.091291396530577e-07, "loss": 0.0245, "reward": 1.6019673347473145, "reward_std": 0.12393862009048462, "rewards/accuracy_reward_stage2": 0.6019672155380249, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1661 }, { "completion_length": 13.390625, "epoch": 0.29122130716663747, "grad_norm": 10.816467295801187, "kl": 0.0908203125, "learning_rate": 7.089539162432101e-07, "loss": -0.0078, "reward": 1.8275662660598755, "reward_std": 0.13626514375209808, "rewards/accuracy_reward_stage2": 0.8431912660598755, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1662 }, { "completion_length": 8.546875, "epoch": 0.291396530576485, "grad_norm": 15.694381347608815, "kl": 0.1865234375, "learning_rate": 7.087786928333626e-07, "loss": -0.0344, "reward": 1.4604032039642334, "reward_std": 0.21337604522705078, "rewards/accuracy_reward_stage2": 0.6322782039642334, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1663 }, { "completion_length": 10.875, "epoch": 0.29157175398633256, "grad_norm": 16.364034081347608, "kl": 0.068359375, "learning_rate": 7.086034694235148e-07, "loss": -0.0073, "reward": 1.4936509132385254, "reward_std": 0.21664029359817505, "rewards/accuracy_reward_stage2": 0.6342759728431702, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1664 }, { "completion_length": 10.09375, "epoch": 0.2917469773961801, "grad_norm": 26.763510925051527, "kl": 0.35546875, "learning_rate": 7.084282460136673e-07, "loss": 0.0305, "reward": 1.3588550090789795, "reward_std": 0.25715306401252747, "rewards/accuracy_reward_stage2": 0.5307300090789795, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1665 }, { "completion_length": 10.859375, "epoch": 0.2919222008060277, "grad_norm": 29.69703159052776, "kl": 0.283203125, "learning_rate": 7.082530226038198e-07, "loss": 0.0743, "reward": 1.75, "reward_std": 0.3197399973869324, "rewards/accuracy_reward_stage2": 0.765625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1666 }, { "completion_length": 9.53125, "epoch": 0.29209742421587526, "grad_norm": 22.667034575376125, "kl": 0.251953125, "learning_rate": 7.080777991939722e-07, "loss": 0.0564, "reward": 1.568906545639038, "reward_std": 0.211813285946846, "rewards/accuracy_reward_stage2": 0.8345316052436829, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1667 }, { "completion_length": 17.21875, "epoch": 0.2922726476257228, "grad_norm": 22.412634113582342, "kl": 0.11279296875, "learning_rate": 7.079025757841247e-07, "loss": 0.0021, "reward": 1.3446143865585327, "reward_std": 0.2523040771484375, "rewards/accuracy_reward_stage2": 0.5008643865585327, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1668 }, { "completion_length": 8.1875, "epoch": 0.29244787103557035, "grad_norm": 17.11055627315877, "kl": 0.0849609375, "learning_rate": 7.077273523742772e-07, "loss": 0.0339, "reward": 1.4724442958831787, "reward_std": 0.19884838163852692, "rewards/accuracy_reward_stage2": 0.47244423627853394, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1669 }, { "completion_length": 8.828125, "epoch": 0.2926230944454179, "grad_norm": 18.118629529042312, "kl": 0.115234375, "learning_rate": 7.075521289644296e-07, "loss": 0.0459, "reward": 1.6591260433197021, "reward_std": 0.16784465312957764, "rewards/accuracy_reward_stage2": 0.6591259241104126, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1670 }, { "completion_length": 15.046875, "epoch": 0.29279831785526544, "grad_norm": 17.44795830979665, "kl": 0.04541015625, "learning_rate": 7.073769055545821e-07, "loss": -0.0482, "reward": 1.4219658374786377, "reward_std": 0.11053664982318878, "rewards/accuracy_reward_stage2": 0.5782157778739929, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1671 }, { "completion_length": 17.296875, "epoch": 0.29297354126511305, "grad_norm": 16.374551074871473, "kl": 0.03271484375, "learning_rate": 7.072016821447345e-07, "loss": 0.013, "reward": 1.5180377960205078, "reward_std": 0.18597131967544556, "rewards/accuracy_reward_stage2": 0.5180378556251526, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1672 }, { "completion_length": 6.328125, "epoch": 0.2931487646749606, "grad_norm": 12.337377058119465, "kl": 0.033935546875, "learning_rate": 7.07026458734887e-07, "loss": 0.0136, "reward": 1.8476905822753906, "reward_std": 0.08578959107398987, "rewards/accuracy_reward_stage2": 0.8476906418800354, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1673 }, { "completion_length": 11.5, "epoch": 0.29332398808480814, "grad_norm": 22.60062112863413, "kl": 0.294921875, "learning_rate": 7.068512353250395e-07, "loss": -0.0613, "reward": 1.504547119140625, "reward_std": 0.2656328082084656, "rewards/accuracy_reward_stage2": 0.582672119140625, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 1674 }, { "completion_length": 10.234375, "epoch": 0.2934992114946557, "grad_norm": 19.392245624011455, "kl": 0.06787109375, "learning_rate": 7.066760119151918e-07, "loss": 0.0273, "reward": 1.4372313022613525, "reward_std": 0.12203386425971985, "rewards/accuracy_reward_stage2": 0.4372313618659973, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1675 }, { "completion_length": 13.59375, "epoch": 0.29367443490450323, "grad_norm": 24.5100935157014, "kl": 0.1240234375, "learning_rate": 7.065007885053443e-07, "loss": 0.0054, "reward": 1.2867053747177124, "reward_std": 0.23883283138275146, "rewards/accuracy_reward_stage2": 0.30233034491539, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1676 }, { "completion_length": 14.296875, "epoch": 0.2938496583143508, "grad_norm": 21.009404224525824, "kl": 0.2119140625, "learning_rate": 7.063255650954967e-07, "loss": 0.0469, "reward": 1.4832779169082642, "reward_std": 0.22361421585083008, "rewards/accuracy_reward_stage2": 0.6239029169082642, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1677 }, { "completion_length": 5.453125, "epoch": 0.29402488172419833, "grad_norm": 18.31135062907212, "kl": 0.072265625, "learning_rate": 7.061503416856491e-07, "loss": -0.0152, "reward": 1.8268635272979736, "reward_std": 0.11123533546924591, "rewards/accuracy_reward_stage2": 0.8424884080886841, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1678 }, { "completion_length": 8.625, "epoch": 0.29420010513404593, "grad_norm": 20.70682730963805, "kl": 0.1884765625, "learning_rate": 7.059751182758016e-07, "loss": -0.0091, "reward": 1.6978143453598022, "reward_std": 0.3025016486644745, "rewards/accuracy_reward_stage2": 0.7446893453598022, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1679 }, { "completion_length": 8.453125, "epoch": 0.2943753285438935, "grad_norm": 18.365155253399227, "kl": 0.06689453125, "learning_rate": 7.05799894865954e-07, "loss": -0.0174, "reward": 1.580909252166748, "reward_std": 0.19966112077236176, "rewards/accuracy_reward_stage2": 0.5965343117713928, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1680 }, { "completion_length": 8.1875, "epoch": 0.294550551953741, "grad_norm": 25.531188137039162, "kl": 0.30859375, "learning_rate": 7.056246714561065e-07, "loss": 0.1237, "reward": 1.1729844808578491, "reward_std": 0.16541114449501038, "rewards/accuracy_reward_stage2": 0.4229844808578491, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1681 }, { "completion_length": 13.703125, "epoch": 0.29472577536358857, "grad_norm": 17.57522237064251, "kl": 0.055908203125, "learning_rate": 7.05449448046259e-07, "loss": 0.0222, "reward": 1.7702456712722778, "reward_std": 0.1767645925283432, "rewards/accuracy_reward_stage2": 0.7702457308769226, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1682 }, { "completion_length": 9.515625, "epoch": 0.2949009987734361, "grad_norm": 14.555393826814768, "kl": 0.06494140625, "learning_rate": 7.052742246364114e-07, "loss": -0.0164, "reward": 1.7005529403686523, "reward_std": 0.09703925251960754, "rewards/accuracy_reward_stage2": 0.8411779999732971, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1683 }, { "completion_length": 9.78125, "epoch": 0.29507622218328367, "grad_norm": 22.386526662514495, "kl": 0.142578125, "learning_rate": 7.050990012265639e-07, "loss": 0.0569, "reward": 1.701306939125061, "reward_std": 0.26653289794921875, "rewards/accuracy_reward_stage2": 0.7013068795204163, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1684 }, { "completion_length": 15.75, "epoch": 0.29525144559313127, "grad_norm": 18.142136706976988, "kl": 0.0576171875, "learning_rate": 7.049237778167163e-07, "loss": 0.023, "reward": 1.2344558238983154, "reward_std": 0.1106235533952713, "rewards/accuracy_reward_stage2": 0.23445577919483185, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1685 }, { "completion_length": 9.953125, "epoch": 0.2954266690029788, "grad_norm": 16.073543348250226, "kl": 0.07275390625, "learning_rate": 7.047485544068687e-07, "loss": 0.0343, "reward": 1.5708773136138916, "reward_std": 0.16317957639694214, "rewards/accuracy_reward_stage2": 0.6958773732185364, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1686 }, { "completion_length": 12.640625, "epoch": 0.29560189241282636, "grad_norm": 23.8509920791397, "kl": 0.011962890625, "learning_rate": 7.045733309970212e-07, "loss": 0.0048, "reward": 1.6665546894073486, "reward_std": 0.2417851835489273, "rewards/accuracy_reward_stage2": 0.6665546894073486, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1687 }, { "completion_length": 9.9375, "epoch": 0.2957771158226739, "grad_norm": 20.34102299001797, "kl": 0.2451171875, "learning_rate": 7.043981075871736e-07, "loss": 0.0975, "reward": 1.5598974227905273, "reward_std": 0.1706065535545349, "rewards/accuracy_reward_stage2": 0.6848974227905273, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1688 }, { "completion_length": 27.171875, "epoch": 0.29595233923252146, "grad_norm": 24.778954933179413, "kl": 0.20703125, "learning_rate": 7.04222884177326e-07, "loss": 0.0437, "reward": 1.4397300481796265, "reward_std": 0.20912772417068481, "rewards/accuracy_reward_stage2": 0.5803550481796265, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1689 }, { "completion_length": 9.984375, "epoch": 0.296127562642369, "grad_norm": 18.30052895572527, "kl": 0.1416015625, "learning_rate": 7.040476607674785e-07, "loss": 0.015, "reward": 1.3897716999053955, "reward_std": 0.20197075605392456, "rewards/accuracy_reward_stage2": 0.4053967595100403, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1690 }, { "completion_length": 9.328125, "epoch": 0.29630278605221655, "grad_norm": 21.721276007608935, "kl": 0.1328125, "learning_rate": 7.038724373576309e-07, "loss": 0.0532, "reward": 1.6735678911209106, "reward_std": 0.32562583684921265, "rewards/accuracy_reward_stage2": 0.6735677719116211, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1691 }, { "completion_length": 8.046875, "epoch": 0.29647800946206415, "grad_norm": 20.41959856286839, "kl": 0.12890625, "learning_rate": 7.036972139477834e-07, "loss": 0.0513, "reward": 1.5956950187683105, "reward_std": 0.18200629949569702, "rewards/accuracy_reward_stage2": 0.5956949591636658, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1692 }, { "completion_length": 12.109375, "epoch": 0.2966532328719117, "grad_norm": 17.309247466482873, "kl": 0.1044921875, "learning_rate": 7.035219905379359e-07, "loss": 0.0252, "reward": 1.6495327949523926, "reward_std": 0.2783252000808716, "rewards/accuracy_reward_stage2": 0.7901579141616821, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1693 }, { "completion_length": 10.421875, "epoch": 0.29682845628175925, "grad_norm": 11.817725673933843, "kl": 0.07763671875, "learning_rate": 7.033467671280882e-07, "loss": -0.011, "reward": 1.3649306297302246, "reward_std": 0.07383120805025101, "rewards/accuracy_reward_stage2": 0.38055557012557983, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1694 }, { "completion_length": 10.984375, "epoch": 0.2970036796916068, "grad_norm": 20.20455995667041, "kl": 0.052978515625, "learning_rate": 7.031715437182407e-07, "loss": 0.0211, "reward": 1.615881085395813, "reward_std": 0.1898079514503479, "rewards/accuracy_reward_stage2": 0.6158811450004578, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1695 }, { "completion_length": 5.859375, "epoch": 0.29717890310145434, "grad_norm": 20.723780937675922, "kl": 0.1044921875, "learning_rate": 7.029963203083931e-07, "loss": 0.0418, "reward": 1.6208045482635498, "reward_std": 0.10398261994123459, "rewards/accuracy_reward_stage2": 0.6208046078681946, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1696 }, { "completion_length": 8.390625, "epoch": 0.2973541265113019, "grad_norm": 23.222706582027712, "kl": 0.09765625, "learning_rate": 7.028210968985456e-07, "loss": 0.0391, "reward": 1.7384235858917236, "reward_std": 0.19045890867710114, "rewards/accuracy_reward_stage2": 0.7384235858917236, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1697 }, { "completion_length": 9.453125, "epoch": 0.2975293499211495, "grad_norm": 15.779788156610927, "kl": 0.078125, "learning_rate": 7.026458734886981e-07, "loss": -0.013, "reward": 1.3911956548690796, "reward_std": 0.242641419172287, "rewards/accuracy_reward_stage2": 0.5318205952644348, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1698 }, { "completion_length": 9.53125, "epoch": 0.29770457333099704, "grad_norm": 22.91272507487531, "kl": 0.1201171875, "learning_rate": 7.024706500788505e-07, "loss": -0.0927, "reward": 1.5430138111114502, "reward_std": 0.25438836216926575, "rewards/accuracy_reward_stage2": 0.6055138111114502, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 1699 }, { "completion_length": 10.625, "epoch": 0.2978797967408446, "grad_norm": 19.778327412160536, "kl": 0.11328125, "learning_rate": 7.02295426669003e-07, "loss": 0.0453, "reward": 1.5060014724731445, "reward_std": 0.24018427729606628, "rewards/accuracy_reward_stage2": 0.5060014128684998, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1700 }, { "completion_length": 11.984375, "epoch": 0.29805502015069213, "grad_norm": 19.40855478090295, "kl": 0.08447265625, "learning_rate": 7.021202032591555e-07, "loss": -0.0105, "reward": 1.603383183479309, "reward_std": 0.3217662572860718, "rewards/accuracy_reward_stage2": 0.6190081238746643, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1701 }, { "completion_length": 8.796875, "epoch": 0.2982302435605397, "grad_norm": 18.222665367205884, "kl": 0.058837890625, "learning_rate": 7.019449798493078e-07, "loss": 0.0235, "reward": 1.5814367532730103, "reward_std": 0.1338120698928833, "rewards/accuracy_reward_stage2": 0.5814367532730103, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1702 }, { "completion_length": 11.953125, "epoch": 0.2984054669703872, "grad_norm": 24.934665597811787, "kl": 0.1875, "learning_rate": 7.017697564394603e-07, "loss": 0.0748, "reward": 1.6412934064865112, "reward_std": 0.3221040666103363, "rewards/accuracy_reward_stage2": 0.7662933468818665, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1703 }, { "completion_length": 10.609375, "epoch": 0.2985806903802348, "grad_norm": 15.026115592854442, "kl": 0.09814453125, "learning_rate": 7.015945330296126e-07, "loss": -0.0049, "reward": 1.6630749702453613, "reward_std": 0.16869525611400604, "rewards/accuracy_reward_stage2": 0.6786999702453613, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1704 }, { "completion_length": 11.046875, "epoch": 0.2987559137900824, "grad_norm": 25.677616356620522, "kl": 0.30859375, "learning_rate": 7.014193096197651e-07, "loss": 0.1236, "reward": 1.0827093124389648, "reward_std": 0.2146224081516266, "rewards/accuracy_reward_stage2": 0.5827093720436096, "rewards/format_reward_stage1_pointerpad": 0.5, "scores/accuracy_reward_stage2": 0.5, "step": 1705 }, { "completion_length": 8.890625, "epoch": 0.2989311371999299, "grad_norm": 16.767615751833127, "kl": 0.008544921875, "learning_rate": 7.012440862099176e-07, "loss": 0.0034, "reward": 1.5104167461395264, "reward_std": 0.1473139077425003, "rewards/accuracy_reward_stage2": 0.6354166269302368, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1706 }, { "completion_length": 8.046875, "epoch": 0.29910636060977747, "grad_norm": 21.753461446874425, "kl": 0.08447265625, "learning_rate": 7.0106886280007e-07, "loss": 0.0009, "reward": 1.4956333637237549, "reward_std": 0.26299524307250977, "rewards/accuracy_reward_stage2": 0.5112583637237549, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1707 }, { "completion_length": 10.3125, "epoch": 0.299281584019625, "grad_norm": 20.184864788281224, "kl": 0.224609375, "learning_rate": 7.008936393902225e-07, "loss": 0.0509, "reward": 1.5954148769378662, "reward_std": 0.29817402362823486, "rewards/accuracy_reward_stage2": 0.7360398769378662, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1708 }, { "completion_length": 12.6875, "epoch": 0.29945680742947256, "grad_norm": 15.485288813738334, "kl": 0.083984375, "learning_rate": 7.00718415980375e-07, "loss": -0.0064, "reward": 1.474943995475769, "reward_std": 0.19505015015602112, "rewards/accuracy_reward_stage2": 0.615568995475769, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1709 }, { "completion_length": 27.671875, "epoch": 0.2996320308393201, "grad_norm": 17.649240525297987, "kl": 0.041748046875, "learning_rate": 7.005431925705274e-07, "loss": 0.0167, "reward": 1.5623822212219238, "reward_std": 0.2002073973417282, "rewards/accuracy_reward_stage2": 0.5623822212219238, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1710 }, { "completion_length": 7.203125, "epoch": 0.2998072542491677, "grad_norm": 23.44514836779095, "kl": 0.10498046875, "learning_rate": 7.003679691606799e-07, "loss": 0.042, "reward": 1.682141661643982, "reward_std": 0.28925785422325134, "rewards/accuracy_reward_stage2": 0.8071417808532715, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1711 }, { "completion_length": 9.28125, "epoch": 0.29998247765901526, "grad_norm": 18.377371253216243, "kl": 0.045166015625, "learning_rate": 7.001927457508323e-07, "loss": 0.018, "reward": 1.7067670822143555, "reward_std": 0.17415907979011536, "rewards/accuracy_reward_stage2": 0.7067670822143555, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1712 }, { "completion_length": 9.296875, "epoch": 0.3001577010688628, "grad_norm": 20.942129163532762, "kl": 0.053955078125, "learning_rate": 7.000175223409848e-07, "loss": -0.0009, "reward": 1.6164193153381348, "reward_std": 0.21394112706184387, "rewards/accuracy_reward_stage2": 0.74141925573349, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1713 }, { "completion_length": 9.40625, "epoch": 0.30033292447871035, "grad_norm": 15.992685381176873, "kl": 0.150390625, "learning_rate": 6.998422989311373e-07, "loss": 0.0267, "reward": 1.441511869430542, "reward_std": 0.2877542972564697, "rewards/accuracy_reward_stage2": 0.45713692903518677, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1714 }, { "completion_length": 18.140625, "epoch": 0.3005081478885579, "grad_norm": 20.477620250403163, "kl": 0.0654296875, "learning_rate": 6.996670755212895e-07, "loss": 0.0262, "reward": 1.519256353378296, "reward_std": 0.20742157101631165, "rewards/accuracy_reward_stage2": 0.5192563533782959, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1715 }, { "completion_length": 12.03125, "epoch": 0.30068337129840544, "grad_norm": 15.828782229925874, "kl": 0.236328125, "learning_rate": 6.99491852111442e-07, "loss": 0.006, "reward": 1.2528257369995117, "reward_std": 0.16298237442970276, "rewards/accuracy_reward_stage2": 0.5184507966041565, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1716 }, { "completion_length": 16.984375, "epoch": 0.30085859470825305, "grad_norm": 19.351136914365497, "kl": 0.11376953125, "learning_rate": 6.993166287015945e-07, "loss": -0.0378, "reward": 1.2651151418685913, "reward_std": 0.26157358288764954, "rewards/accuracy_reward_stage2": 0.4213651418685913, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1717 }, { "completion_length": 16.40625, "epoch": 0.3010338181181006, "grad_norm": 22.03734358648279, "kl": 0.0439453125, "learning_rate": 6.991414052917469e-07, "loss": 0.0176, "reward": 1.6531357765197754, "reward_std": 0.21220947802066803, "rewards/accuracy_reward_stage2": 0.7781356573104858, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1718 }, { "completion_length": 7.046875, "epoch": 0.30120904152794814, "grad_norm": 18.753470433630223, "kl": 0.126953125, "learning_rate": 6.989661818818994e-07, "loss": -0.0375, "reward": 1.7411688566207886, "reward_std": 0.19873298704624176, "rewards/accuracy_reward_stage2": 0.7724189162254333, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1719 }, { "completion_length": 8.8125, "epoch": 0.3013842649377957, "grad_norm": 15.466518379156236, "kl": 0.07080078125, "learning_rate": 6.987909584720518e-07, "loss": 0.0284, "reward": 1.7029595375061035, "reward_std": 0.12291057407855988, "rewards/accuracy_reward_stage2": 0.827959418296814, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1720 }, { "completion_length": 7.796875, "epoch": 0.30155948834764323, "grad_norm": 19.501181455296493, "kl": 0.06298828125, "learning_rate": 6.986157350622043e-07, "loss": 0.0252, "reward": 1.5771433115005493, "reward_std": 0.17379529774188995, "rewards/accuracy_reward_stage2": 0.7021433115005493, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1721 }, { "completion_length": 10.421875, "epoch": 0.3017347117574908, "grad_norm": 22.98391134712126, "kl": 0.2119140625, "learning_rate": 6.984405116523568e-07, "loss": -0.03, "reward": 1.4428976774215698, "reward_std": 0.2138996124267578, "rewards/accuracy_reward_stage2": 0.5053976774215698, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 1722 }, { "completion_length": 24.34375, "epoch": 0.3019099351673384, "grad_norm": 24.585847348472083, "kl": 0.04638671875, "learning_rate": 6.982652882425092e-07, "loss": 0.0185, "reward": 1.6923989057540894, "reward_std": 0.1567954421043396, "rewards/accuracy_reward_stage2": 0.8173988461494446, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1723 }, { "completion_length": 10.015625, "epoch": 0.30208515857718593, "grad_norm": 29.306608426277936, "kl": 0.154296875, "learning_rate": 6.980900648326617e-07, "loss": 0.0173, "reward": 1.7728705406188965, "reward_std": 0.2877151072025299, "rewards/accuracy_reward_stage2": 0.7884955406188965, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1724 }, { "completion_length": 9.578125, "epoch": 0.3022603819870335, "grad_norm": 20.790074690488744, "kl": 0.11669921875, "learning_rate": 6.979148414228141e-07, "loss": 0.0466, "reward": 1.8588755130767822, "reward_std": 0.1497223675251007, "rewards/accuracy_reward_stage2": 0.8588753938674927, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1725 }, { "completion_length": 10.875, "epoch": 0.302435605396881, "grad_norm": 28.348773800842704, "kl": 0.205078125, "learning_rate": 6.977396180129665e-07, "loss": 0.0505, "reward": 1.4835188388824463, "reward_std": 0.2762864828109741, "rewards/accuracy_reward_stage2": 0.4991438388824463, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1726 }, { "completion_length": 7.296875, "epoch": 0.30261082880672857, "grad_norm": 19.30322415984756, "kl": 0.0859375, "learning_rate": 6.97564394603119e-07, "loss": 0.0344, "reward": 1.5627442598342896, "reward_std": 0.13243502378463745, "rewards/accuracy_reward_stage2": 0.5627442598342896, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1727 }, { "completion_length": 8.953125, "epoch": 0.3027860522165761, "grad_norm": 13.272637700139425, "kl": 0.138671875, "learning_rate": 6.973891711932713e-07, "loss": 0.0557, "reward": 1.1412205696105957, "reward_std": 0.20045800507068634, "rewards/accuracy_reward_stage2": 0.5162205696105957, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1728 }, { "completion_length": 7.765625, "epoch": 0.30296127562642367, "grad_norm": 18.56909415470759, "kl": 0.07958984375, "learning_rate": 6.972139477834238e-07, "loss": 0.0319, "reward": 1.6612675189971924, "reward_std": 0.19378286600112915, "rewards/accuracy_reward_stage2": 0.6612674593925476, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1729 }, { "completion_length": 8.296875, "epoch": 0.30313649903627127, "grad_norm": 19.996812828528885, "kl": 0.1953125, "learning_rate": 6.970387243735763e-07, "loss": -0.003, "reward": 1.4460539817810059, "reward_std": 0.3024117946624756, "rewards/accuracy_reward_stage2": 0.6023039221763611, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1730 }, { "completion_length": 10.875, "epoch": 0.3033117224461188, "grad_norm": 20.133568474644466, "kl": 0.10009765625, "learning_rate": 6.968635009637287e-07, "loss": -0.064, "reward": 1.5166605710983276, "reward_std": 0.3427537679672241, "rewards/accuracy_reward_stage2": 0.5635355710983276, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1731 }, { "completion_length": 10.078125, "epoch": 0.30348694585596636, "grad_norm": 26.844065801862286, "kl": 0.263671875, "learning_rate": 6.966882775538812e-07, "loss": 0.1057, "reward": 1.6550629138946533, "reward_std": 0.18192754685878754, "rewards/accuracy_reward_stage2": 0.7800629138946533, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1732 }, { "completion_length": 7.453125, "epoch": 0.3036621692658139, "grad_norm": 72.76331505491501, "kl": 0.53125, "learning_rate": 6.965130541440337e-07, "loss": 0.169, "reward": 1.6019704341888428, "reward_std": 0.19926634430885315, "rewards/accuracy_reward_stage2": 0.7425954937934875, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1733 }, { "completion_length": 10.34375, "epoch": 0.30383739267566146, "grad_norm": 18.421882393675595, "kl": 0.07275390625, "learning_rate": 6.96337830734186e-07, "loss": -0.0591, "reward": 1.6065335273742676, "reward_std": 0.3355046510696411, "rewards/accuracy_reward_stage2": 0.6377835869789124, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1734 }, { "completion_length": 10.125, "epoch": 0.304012616085509, "grad_norm": 11.942224141003042, "kl": 0.058837890625, "learning_rate": 6.961626073243385e-07, "loss": -0.0207, "reward": 1.4618998765945435, "reward_std": 0.07960294187068939, "rewards/accuracy_reward_stage2": 0.47752487659454346, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1735 }, { "completion_length": 9.703125, "epoch": 0.3041878394953566, "grad_norm": 19.31158166873272, "kl": 0.14453125, "learning_rate": 6.959873839144909e-07, "loss": -0.0482, "reward": 1.4449491500854492, "reward_std": 0.35838043689727783, "rewards/accuracy_reward_stage2": 0.4918241500854492, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1736 }, { "completion_length": 8.84375, "epoch": 0.30436306290520415, "grad_norm": 26.716362549948762, "kl": 0.0673828125, "learning_rate": 6.958121605046434e-07, "loss": 0.027, "reward": 1.786990761756897, "reward_std": 0.2307921200990677, "rewards/accuracy_reward_stage2": 0.786990761756897, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1737 }, { "completion_length": 9.578125, "epoch": 0.3045382863150517, "grad_norm": 13.812922311399504, "kl": 0.1044921875, "learning_rate": 6.956369370947959e-07, "loss": 0.0418, "reward": 1.4359885454177856, "reward_std": 0.22620894014835358, "rewards/accuracy_reward_stage2": 0.43598854541778564, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1738 }, { "completion_length": 7.609375, "epoch": 0.30471350972489925, "grad_norm": 20.11713933240359, "kl": 0.0908203125, "learning_rate": 6.954617136849483e-07, "loss": 0.0363, "reward": 1.5567574501037598, "reward_std": 0.1984756886959076, "rewards/accuracy_reward_stage2": 0.5567575097084045, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1739 }, { "completion_length": 19.1875, "epoch": 0.3048887331347468, "grad_norm": 20.825688050186635, "kl": 0.0595703125, "learning_rate": 6.952864902751007e-07, "loss": 0.0238, "reward": 1.6817889213562012, "reward_std": 0.1523667573928833, "rewards/accuracy_reward_stage2": 0.6817888021469116, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1740 }, { "completion_length": 11.75, "epoch": 0.30506395654459434, "grad_norm": 24.818617359584426, "kl": 0.3046875, "learning_rate": 6.951112668652531e-07, "loss": 0.0934, "reward": 1.1904207468032837, "reward_std": 0.2955701947212219, "rewards/accuracy_reward_stage2": 0.4560457468032837, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1741 }, { "completion_length": 14.953125, "epoch": 0.3052391799544419, "grad_norm": 17.298674576478618, "kl": 0.1513671875, "learning_rate": 6.949360434554056e-07, "loss": 0.0605, "reward": 1.2696678638458252, "reward_std": 0.19690418243408203, "rewards/accuracy_reward_stage2": 0.3946678340435028, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1742 }, { "completion_length": 13.5, "epoch": 0.3054144033642895, "grad_norm": 18.43912249339227, "kl": 0.287109375, "learning_rate": 6.947608200455581e-07, "loss": 0.0735, "reward": 1.2864735126495361, "reward_std": 0.29236263036727905, "rewards/accuracy_reward_stage2": 0.4270986318588257, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1743 }, { "completion_length": 14.703125, "epoch": 0.30558962677413704, "grad_norm": 20.53686398022577, "kl": 0.09423828125, "learning_rate": 6.945855966357104e-07, "loss": 0.0377, "reward": 1.43604576587677, "reward_std": 0.21306610107421875, "rewards/accuracy_reward_stage2": 0.43604573607444763, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1744 }, { "completion_length": 10.375, "epoch": 0.3057648501839846, "grad_norm": 21.82032382446694, "kl": 0.046875, "learning_rate": 6.944103732258629e-07, "loss": -0.0146, "reward": 1.505760669708252, "reward_std": 0.21576610207557678, "rewards/accuracy_reward_stage2": 0.5213857293128967, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1745 }, { "completion_length": 8.75, "epoch": 0.30594007359383213, "grad_norm": 16.48715253308781, "kl": 0.08056640625, "learning_rate": 6.942351498160154e-07, "loss": -0.0041, "reward": 1.5899578332901, "reward_std": 0.23990005254745483, "rewards/accuracy_reward_stage2": 0.6055828332901001, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1746 }, { "completion_length": 13.5625, "epoch": 0.3061152970036797, "grad_norm": 25.359738427813962, "kl": 0.10107421875, "learning_rate": 6.940599264061678e-07, "loss": 0.0405, "reward": 1.7109923362731934, "reward_std": 0.25352123379707336, "rewards/accuracy_reward_stage2": 0.7109923362731934, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1747 }, { "completion_length": 9.34375, "epoch": 0.3062905204135272, "grad_norm": 20.257427202231625, "kl": 0.06884765625, "learning_rate": 6.938847029963203e-07, "loss": 0.0275, "reward": 1.3980247974395752, "reward_std": 0.181054025888443, "rewards/accuracy_reward_stage2": 0.3980247974395752, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1748 }, { "completion_length": 12.9375, "epoch": 0.3064657438233748, "grad_norm": 17.564095255403075, "kl": 0.294921875, "learning_rate": 6.937094795864727e-07, "loss": 0.1018, "reward": 1.5810174942016602, "reward_std": 0.2686046361923218, "rewards/accuracy_reward_stage2": 0.7216424942016602, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1749 }, { "completion_length": 6.25, "epoch": 0.3066409672332224, "grad_norm": 14.375651726004179, "kl": 0.046630859375, "learning_rate": 6.935342561766252e-07, "loss": -0.0255, "reward": 1.6467738151550293, "reward_std": 0.1914321780204773, "rewards/accuracy_reward_stage2": 0.6623987555503845, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1750 }, { "completion_length": 12.09375, "epoch": 0.3068161906430699, "grad_norm": 12.345937734240898, "kl": 0.01513671875, "learning_rate": 6.933590327667777e-07, "loss": -0.0376, "reward": 1.5015289783477783, "reward_std": 0.14592652022838593, "rewards/accuracy_reward_stage2": 0.5171540379524231, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1751 }, { "completion_length": 11.0, "epoch": 0.30699141405291747, "grad_norm": 16.665879700484737, "kl": 0.1904296875, "learning_rate": 6.931838093569301e-07, "loss": 0.0377, "reward": 1.5536797046661377, "reward_std": 0.3037300109863281, "rewards/accuracy_reward_stage2": 0.5693047046661377, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1752 }, { "completion_length": 12.0, "epoch": 0.307166637462765, "grad_norm": 18.09600917797856, "kl": 0.1474609375, "learning_rate": 6.930085859470825e-07, "loss": 0.0147, "reward": 1.3697917461395264, "reward_std": 0.17622756958007812, "rewards/accuracy_reward_stage2": 0.5104166269302368, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1753 }, { "completion_length": 8.421875, "epoch": 0.30734186087261256, "grad_norm": 17.718409175442556, "kl": 0.062255859375, "learning_rate": 6.928333625372349e-07, "loss": -0.0193, "reward": 1.8201448917388916, "reward_std": 0.188689187169075, "rewards/accuracy_reward_stage2": 0.8357699513435364, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1754 }, { "completion_length": 10.234375, "epoch": 0.30751708428246016, "grad_norm": 17.51134890833407, "kl": 0.1787109375, "learning_rate": 6.926581391273873e-07, "loss": 0.0714, "reward": 1.1596169471740723, "reward_std": 0.09221327304840088, "rewards/accuracy_reward_stage2": 0.40961694717407227, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1755 }, { "completion_length": 10.390625, "epoch": 0.3076923076923077, "grad_norm": 21.006838236257735, "kl": 0.224609375, "learning_rate": 6.924829157175398e-07, "loss": 0.0447, "reward": 1.1669657230377197, "reward_std": 0.3041993975639343, "rewards/accuracy_reward_stage2": 0.5732156038284302, "rewards/format_reward_stage1_pointerpad": 0.59375, "scores/accuracy_reward_stage2": 0.59375, "step": 1756 }, { "completion_length": 10.5625, "epoch": 0.30786753110215526, "grad_norm": 17.512426542672337, "kl": 0.08740234375, "learning_rate": 6.923076923076922e-07, "loss": 0.035, "reward": 1.471048355102539, "reward_std": 0.21887768805027008, "rewards/accuracy_reward_stage2": 0.47104835510253906, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1757 }, { "completion_length": 10.09375, "epoch": 0.3080427545120028, "grad_norm": 23.961599386130843, "kl": 0.2197265625, "learning_rate": 6.921324688978447e-07, "loss": 0.0881, "reward": 1.472118854522705, "reward_std": 0.1884707510471344, "rewards/accuracy_reward_stage2": 0.5971187949180603, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1758 }, { "completion_length": 8.671875, "epoch": 0.30821797792185035, "grad_norm": 11.807947327421346, "kl": 0.0185546875, "learning_rate": 6.919572454879972e-07, "loss": 0.0074, "reward": 1.8020833730697632, "reward_std": 0.15872615575790405, "rewards/accuracy_reward_stage2": 0.8020833730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1759 }, { "completion_length": 11.125, "epoch": 0.3083932013316979, "grad_norm": 21.153286980872252, "kl": 0.28125, "learning_rate": 6.917820220781496e-07, "loss": 0.1127, "reward": 1.3018338680267334, "reward_std": 0.21189749240875244, "rewards/accuracy_reward_stage2": 0.5518338680267334, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1760 }, { "completion_length": 16.3125, "epoch": 0.30856842474154544, "grad_norm": 16.93507853922392, "kl": 0.0927734375, "learning_rate": 6.916067986683021e-07, "loss": 0.0062, "reward": 1.6600141525268555, "reward_std": 0.14201043546199799, "rewards/accuracy_reward_stage2": 0.6756391525268555, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1761 }, { "completion_length": 12.46875, "epoch": 0.30874364815139305, "grad_norm": 25.109481914584034, "kl": 0.12255859375, "learning_rate": 6.914315752584546e-07, "loss": 0.0048, "reward": 1.365222692489624, "reward_std": 0.28223758935928345, "rewards/accuracy_reward_stage2": 0.5058478116989136, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1762 }, { "completion_length": 12.46875, "epoch": 0.3089188715612406, "grad_norm": 17.783663309948565, "kl": 0.10498046875, "learning_rate": 6.91256351848607e-07, "loss": 0.042, "reward": 1.6931748390197754, "reward_std": 0.31448644399642944, "rewards/accuracy_reward_stage2": 0.6931748986244202, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1763 }, { "completion_length": 9.75, "epoch": 0.30909409497108814, "grad_norm": 14.788676137177648, "kl": 0.076171875, "learning_rate": 6.910811284387594e-07, "loss": 0.0304, "reward": 1.4861619472503662, "reward_std": 0.19302460551261902, "rewards/accuracy_reward_stage2": 0.4861619770526886, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1764 }, { "completion_length": 17.140625, "epoch": 0.3092693183809357, "grad_norm": 16.180602672992737, "kl": 0.0595703125, "learning_rate": 6.909059050289118e-07, "loss": -0.0073, "reward": 1.5507076978683472, "reward_std": 0.14039362967014313, "rewards/accuracy_reward_stage2": 0.5663327574729919, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1765 }, { "completion_length": 7.6875, "epoch": 0.30944454179078323, "grad_norm": 18.944751242113163, "kl": 0.181640625, "learning_rate": 6.907306816190642e-07, "loss": 0.0006, "reward": 1.7289048433303833, "reward_std": 0.18334315717220306, "rewards/accuracy_reward_stage2": 0.760154664516449, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1766 }, { "completion_length": 11.421875, "epoch": 0.3096197652006308, "grad_norm": 19.16973729447749, "kl": 0.1494140625, "learning_rate": 6.905554582092167e-07, "loss": -0.052, "reward": 1.4670138359069824, "reward_std": 0.3148658871650696, "rewards/accuracy_reward_stage2": 0.5138888955116272, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1767 }, { "completion_length": 9.71875, "epoch": 0.3097949886104784, "grad_norm": 17.309743318297446, "kl": 0.044921875, "learning_rate": 6.903802347993691e-07, "loss": 0.018, "reward": 1.6352248191833496, "reward_std": 0.28032034635543823, "rewards/accuracy_reward_stage2": 0.6352247595787048, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1768 }, { "completion_length": 9.5625, "epoch": 0.30997021202032593, "grad_norm": 18.063172656913363, "kl": 0.11181640625, "learning_rate": 6.902050113895216e-07, "loss": -0.0092, "reward": 1.2916667461395264, "reward_std": 0.31406551599502563, "rewards/accuracy_reward_stage2": 0.3229166865348816, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1769 }, { "completion_length": 8.296875, "epoch": 0.3101454354301735, "grad_norm": 16.536948904301017, "kl": 0.035888671875, "learning_rate": 6.900297879796741e-07, "loss": 0.0143, "reward": 1.728659987449646, "reward_std": 0.14718098938465118, "rewards/accuracy_reward_stage2": 0.728659987449646, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1770 }, { "completion_length": 7.84375, "epoch": 0.310320658840021, "grad_norm": 11.889469909971977, "kl": 0.111328125, "learning_rate": 6.898545645698265e-07, "loss": -0.0151, "reward": 1.8219847679138184, "reward_std": 0.21930548548698425, "rewards/accuracy_reward_stage2": 0.8532347083091736, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1771 }, { "completion_length": 15.546875, "epoch": 0.31049588224986857, "grad_norm": 15.999502540982576, "kl": 0.11328125, "learning_rate": 6.89679341159979e-07, "loss": -0.0765, "reward": 1.328125, "reward_std": 0.19939783215522766, "rewards/accuracy_reward_stage2": 0.375, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1772 }, { "completion_length": 11.1875, "epoch": 0.3106711056597161, "grad_norm": 15.775649129635754, "kl": 0.049072265625, "learning_rate": 6.895041177501314e-07, "loss": 0.0196, "reward": 1.67244553565979, "reward_std": 0.1583077609539032, "rewards/accuracy_reward_stage2": 0.6724455952644348, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1773 }, { "completion_length": 11.671875, "epoch": 0.3108463290695637, "grad_norm": 17.128809859484743, "kl": 0.1494140625, "learning_rate": 6.893288943402838e-07, "loss": -0.0652, "reward": 1.5411221981048584, "reward_std": 0.3266652524471283, "rewards/accuracy_reward_stage2": 0.5879971981048584, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1774 }, { "completion_length": 9.84375, "epoch": 0.31102155247941127, "grad_norm": 21.203604381045565, "kl": 0.0947265625, "learning_rate": 6.891536709304363e-07, "loss": 0.0378, "reward": 1.4142817258834839, "reward_std": 0.2929390072822571, "rewards/accuracy_reward_stage2": 0.5392817258834839, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1775 }, { "completion_length": 10.1875, "epoch": 0.3111967758892588, "grad_norm": 18.289077348581767, "kl": 0.1865234375, "learning_rate": 6.889784475205887e-07, "loss": 0.0554, "reward": 1.5055460929870605, "reward_std": 0.18611961603164673, "rewards/accuracy_reward_stage2": 0.6461710333824158, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1776 }, { "completion_length": 8.21875, "epoch": 0.31137199929910636, "grad_norm": 20.461632497981842, "kl": 0.08984375, "learning_rate": 6.888032241107412e-07, "loss": 0.0358, "reward": 1.6237359046936035, "reward_std": 0.21165037155151367, "rewards/accuracy_reward_stage2": 0.6393609046936035, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1777 }, { "completion_length": 13.5625, "epoch": 0.3115472227089539, "grad_norm": 15.923489710242217, "kl": 0.138671875, "learning_rate": 6.886280007008937e-07, "loss": 0.0555, "reward": 1.4647328853607178, "reward_std": 0.19382403790950775, "rewards/accuracy_reward_stage2": 0.5897328853607178, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1778 }, { "completion_length": 11.40625, "epoch": 0.31172244611880146, "grad_norm": 23.491850619048765, "kl": 0.1572265625, "learning_rate": 6.88452777291046e-07, "loss": -0.0162, "reward": 1.7011396884918213, "reward_std": 0.21986907720565796, "rewards/accuracy_reward_stage2": 0.8417646288871765, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1779 }, { "completion_length": 21.65625, "epoch": 0.311897669528649, "grad_norm": 17.648231242026238, "kl": 0.166015625, "learning_rate": 6.882775538811985e-07, "loss": 0.0309, "reward": 1.2604795694351196, "reward_std": 0.2557189464569092, "rewards/accuracy_reward_stage2": 0.40110456943511963, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1780 }, { "completion_length": 33.84375, "epoch": 0.3120728929384966, "grad_norm": 20.279169259088963, "kl": 0.06689453125, "learning_rate": 6.881023304713509e-07, "loss": 0.0204, "reward": 1.2711501121520996, "reward_std": 0.11733835190534592, "rewards/accuracy_reward_stage2": 0.5367749929428101, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1781 }, { "completion_length": 13.0625, "epoch": 0.31224811634834415, "grad_norm": 19.349168319136698, "kl": 0.064453125, "learning_rate": 6.879271070615034e-07, "loss": -0.0022, "reward": 1.6423496007919312, "reward_std": 0.23339146375656128, "rewards/accuracy_reward_stage2": 0.7829747200012207, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1782 }, { "completion_length": 6.5625, "epoch": 0.3124233397581917, "grad_norm": 19.729708472837455, "kl": 0.11572265625, "learning_rate": 6.877518836516559e-07, "loss": 0.0109, "reward": 1.6721187829971313, "reward_std": 0.27952146530151367, "rewards/accuracy_reward_stage2": 0.6877437233924866, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1783 }, { "completion_length": 15.03125, "epoch": 0.31259856316803925, "grad_norm": 16.744165717397728, "kl": 0.037841796875, "learning_rate": 6.875766602418082e-07, "loss": -0.0263, "reward": 1.404069185256958, "reward_std": 0.1922588050365448, "rewards/accuracy_reward_stage2": 0.5446941256523132, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1784 }, { "completion_length": 9.359375, "epoch": 0.3127737865778868, "grad_norm": 19.139819261721463, "kl": 0.1142578125, "learning_rate": 6.874014368319607e-07, "loss": 0.0458, "reward": 1.667823314666748, "reward_std": 0.19142280519008636, "rewards/accuracy_reward_stage2": 0.6678231954574585, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1785 }, { "completion_length": 13.546875, "epoch": 0.31294900998773434, "grad_norm": 18.638743470121007, "kl": 0.087890625, "learning_rate": 6.872262134221132e-07, "loss": 0.0351, "reward": 1.5992480516433716, "reward_std": 0.09372645616531372, "rewards/accuracy_reward_stage2": 0.5992480516433716, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1786 }, { "completion_length": 11.5, "epoch": 0.31312423339758194, "grad_norm": 15.238512705166738, "kl": 0.08447265625, "learning_rate": 6.870509900122656e-07, "loss": 0.0338, "reward": 1.466585636138916, "reward_std": 0.13504785299301147, "rewards/accuracy_reward_stage2": 0.591585636138916, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1787 }, { "completion_length": 14.46875, "epoch": 0.3132994568074295, "grad_norm": 22.201999425814748, "kl": 0.1064453125, "learning_rate": 6.868757666024181e-07, "loss": -0.0017, "reward": 1.3887823820114136, "reward_std": 0.24121630191802979, "rewards/accuracy_reward_stage2": 0.5294073820114136, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1788 }, { "completion_length": 12.140625, "epoch": 0.31347468021727704, "grad_norm": 16.71526611503449, "kl": 0.061767578125, "learning_rate": 6.867005431925705e-07, "loss": 0.0248, "reward": 1.3335199356079102, "reward_std": 0.16721726953983307, "rewards/accuracy_reward_stage2": 0.45851996541023254, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1789 }, { "completion_length": 13.109375, "epoch": 0.3136499036271246, "grad_norm": 65.37486727195878, "kl": 0.369140625, "learning_rate": 6.86525319782723e-07, "loss": 0.1083, "reward": 1.7268718481063843, "reward_std": 0.1315089464187622, "rewards/accuracy_reward_stage2": 0.8831217885017395, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1790 }, { "completion_length": 12.203125, "epoch": 0.31382512703697213, "grad_norm": 21.970841856829036, "kl": 0.0751953125, "learning_rate": 6.863500963728754e-07, "loss": 0.0237, "reward": 1.3490945100784302, "reward_std": 0.28697898983955383, "rewards/accuracy_reward_stage2": 0.4897195100784302, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1791 }, { "completion_length": 5.40625, "epoch": 0.3140003504468197, "grad_norm": 14.203849936543449, "kl": 0.06640625, "learning_rate": 6.861748729630278e-07, "loss": -0.0175, "reward": 1.6782519817352295, "reward_std": 0.12433382123708725, "rewards/accuracy_reward_stage2": 0.6938769817352295, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1792 }, { "completion_length": 12.09375, "epoch": 0.3141755738566673, "grad_norm": 19.317828271285112, "kl": 0.17578125, "learning_rate": 6.859996495531802e-07, "loss": 0.026, "reward": 1.4357094764709473, "reward_std": 0.252302348613739, "rewards/accuracy_reward_stage2": 0.5763345956802368, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1793 }, { "completion_length": 10.25, "epoch": 0.3143507972665148, "grad_norm": 18.615946255677432, "kl": 0.3046875, "learning_rate": 6.858244261433327e-07, "loss": 0.0775, "reward": 1.4579432010650635, "reward_std": 0.19797083735466003, "rewards/accuracy_reward_stage2": 0.5985681414604187, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1794 }, { "completion_length": 8.609375, "epoch": 0.3145260206763624, "grad_norm": 19.174787497092378, "kl": 0.16015625, "learning_rate": 6.856492027334851e-07, "loss": 0.0213, "reward": 1.614447832107544, "reward_std": 0.2662124037742615, "rewards/accuracy_reward_stage2": 0.630072832107544, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1795 }, { "completion_length": 10.734375, "epoch": 0.3147012440862099, "grad_norm": 15.360421940382055, "kl": 0.06787109375, "learning_rate": 6.854739793236376e-07, "loss": -0.0331, "reward": 1.457749605178833, "reward_std": 0.19533106684684753, "rewards/accuracy_reward_stage2": 0.48899969458580017, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1796 }, { "completion_length": 20.28125, "epoch": 0.31487646749605747, "grad_norm": 658.9442180431508, "kl": 1.140625, "learning_rate": 6.8529875591379e-07, "loss": 0.409, "reward": 1.5153186321258545, "reward_std": 0.22788935899734497, "rewards/accuracy_reward_stage2": 0.6559436917304993, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1797 }, { "completion_length": 8.671875, "epoch": 0.315051690905905, "grad_norm": 20.79543805564879, "kl": 0.21484375, "learning_rate": 6.851235325039425e-07, "loss": 0.0206, "reward": 1.3769316673278809, "reward_std": 0.2010369449853897, "rewards/accuracy_reward_stage2": 0.5331815481185913, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1798 }, { "completion_length": 9.5625, "epoch": 0.31522691431575256, "grad_norm": 29.209511348518387, "kl": 0.150390625, "learning_rate": 6.84948309094095e-07, "loss": 0.0603, "reward": 1.6429357528686523, "reward_std": 0.2625489830970764, "rewards/accuracy_reward_stage2": 0.7679356932640076, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1799 }, { "completion_length": 9.71875, "epoch": 0.31540213772560016, "grad_norm": 16.85321625670232, "kl": 0.0712890625, "learning_rate": 6.847730856842474e-07, "loss": -0.0156, "reward": 1.6711848974227905, "reward_std": 0.19875484704971313, "rewards/accuracy_reward_stage2": 0.6868098974227905, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1800 }, { "completion_length": 6.875, "epoch": 0.3155773611354477, "grad_norm": 20.034299414918525, "kl": 0.1259765625, "learning_rate": 6.845978622743999e-07, "loss": 0.0015, "reward": 1.737661600112915, "reward_std": 0.2608322203159332, "rewards/accuracy_reward_stage2": 0.768911600112915, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1801 }, { "completion_length": 9.515625, "epoch": 0.31575258454529526, "grad_norm": 16.12460743639783, "kl": 0.0986328125, "learning_rate": 6.844226388645524e-07, "loss": 0.0394, "reward": 1.5717509984970093, "reward_std": 0.15481790900230408, "rewards/accuracy_reward_stage2": 0.6967509984970093, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1802 }, { "completion_length": 10.875, "epoch": 0.3159278079551428, "grad_norm": 15.839550154054333, "kl": 0.12158203125, "learning_rate": 6.842474154547048e-07, "loss": 0.0046, "reward": 1.812552809715271, "reward_std": 0.1441907435655594, "rewards/accuracy_reward_stage2": 0.828177809715271, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1803 }, { "completion_length": 9.453125, "epoch": 0.31610303136499035, "grad_norm": 13.23966129910894, "kl": 0.03515625, "learning_rate": 6.840721920448571e-07, "loss": 0.0141, "reward": 1.5005505084991455, "reward_std": 0.1589551568031311, "rewards/accuracy_reward_stage2": 0.5005505084991455, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1804 }, { "completion_length": 17.421875, "epoch": 0.3162782547748379, "grad_norm": 20.892304554616636, "kl": 0.087890625, "learning_rate": 6.838969686350095e-07, "loss": 0.0351, "reward": 1.4543895721435547, "reward_std": 0.18670448660850525, "rewards/accuracy_reward_stage2": 0.45438963174819946, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1805 }, { "completion_length": 8.203125, "epoch": 0.3164534781846855, "grad_norm": 20.082354567975813, "kl": 0.0634765625, "learning_rate": 6.83721745225162e-07, "loss": 0.0253, "reward": 1.6109774112701416, "reward_std": 0.20097583532333374, "rewards/accuracy_reward_stage2": 0.6109773516654968, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1806 }, { "completion_length": 9.359375, "epoch": 0.31662870159453305, "grad_norm": 13.84361945350731, "kl": 0.81640625, "learning_rate": 6.835465218153145e-07, "loss": 0.3261, "reward": 1.1875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward_stage2": 0.5625, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1807 }, { "completion_length": 9.46875, "epoch": 0.3168039250043806, "grad_norm": 22.82033708517315, "kl": 0.208984375, "learning_rate": 6.833712984054669e-07, "loss": 0.0014, "reward": 1.634639859199524, "reward_std": 0.24602361023426056, "rewards/accuracy_reward_stage2": 0.7908898591995239, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1808 }, { "completion_length": 10.515625, "epoch": 0.31697914841422814, "grad_norm": 20.550993721079408, "kl": 0.158203125, "learning_rate": 6.831960749956194e-07, "loss": -0.0, "reward": 1.1504223346710205, "reward_std": 0.26793819665908813, "rewards/accuracy_reward_stage2": 0.29104727506637573, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1809 }, { "completion_length": 9.90625, "epoch": 0.3171543718240757, "grad_norm": 12.17435510264999, "kl": 0.051025390625, "learning_rate": 6.830208515857718e-07, "loss": -0.0215, "reward": 1.8293263912200928, "reward_std": 0.09863705933094025, "rewards/accuracy_reward_stage2": 0.8449514508247375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1810 }, { "completion_length": 9.828125, "epoch": 0.31732959523392323, "grad_norm": 16.92668722984298, "kl": 0.0966796875, "learning_rate": 6.828456281759243e-07, "loss": -0.0056, "reward": 1.7856630086898804, "reward_std": 0.26676464080810547, "rewards/accuracy_reward_stage2": 0.8012880086898804, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1811 }, { "completion_length": 6.65625, "epoch": 0.3175048186437708, "grad_norm": 17.29778111581455, "kl": 0.12890625, "learning_rate": 6.826704047660768e-07, "loss": 0.0225, "reward": 1.6117510795593262, "reward_std": 0.21526718139648438, "rewards/accuracy_reward_stage2": 0.6273760795593262, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1812 }, { "completion_length": 9.734375, "epoch": 0.3176800420536184, "grad_norm": 13.89711888030353, "kl": 0.037353515625, "learning_rate": 6.824951813562291e-07, "loss": 0.015, "reward": 1.5531659126281738, "reward_std": 0.0688503310084343, "rewards/accuracy_reward_stage2": 0.553165853023529, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1813 }, { "completion_length": 9.328125, "epoch": 0.31785526546346593, "grad_norm": 20.5060681062909, "kl": 0.1376953125, "learning_rate": 6.823199579463816e-07, "loss": 0.0135, "reward": 1.1958703994750977, "reward_std": 0.17446595430374146, "rewards/accuracy_reward_stage2": 0.33649545907974243, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1814 }, { "completion_length": 6.484375, "epoch": 0.3180304888733135, "grad_norm": 16.327339834068823, "kl": 0.0322265625, "learning_rate": 6.821447345365341e-07, "loss": 0.0129, "reward": 1.6876778602600098, "reward_std": 0.13662895560264587, "rewards/accuracy_reward_stage2": 0.687677800655365, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1815 }, { "completion_length": 11.546875, "epoch": 0.318205712283161, "grad_norm": 20.985490210673987, "kl": 0.0771484375, "learning_rate": 6.819695111266865e-07, "loss": 0.0309, "reward": 1.6510412693023682, "reward_std": 0.22464652359485626, "rewards/accuracy_reward_stage2": 0.7760413289070129, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1816 }, { "completion_length": 14.3125, "epoch": 0.31838093569300857, "grad_norm": 25.15709767548847, "kl": 0.08203125, "learning_rate": 6.817942877168389e-07, "loss": 0.0329, "reward": 1.381837248802185, "reward_std": 0.2987816333770752, "rewards/accuracy_reward_stage2": 0.6318372488021851, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1817 }, { "completion_length": 9.5625, "epoch": 0.3185561591028561, "grad_norm": 21.45777273299857, "kl": 0.2041015625, "learning_rate": 6.816190643069913e-07, "loss": 0.0816, "reward": 1.4881311655044556, "reward_std": 0.17458751797676086, "rewards/accuracy_reward_stage2": 0.6131311058998108, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1818 }, { "completion_length": 10.890625, "epoch": 0.3187313825127037, "grad_norm": 22.804407689637223, "kl": 0.34765625, "learning_rate": 6.814438408971438e-07, "loss": 0.1395, "reward": 1.1429061889648438, "reward_std": 0.2962269186973572, "rewards/accuracy_reward_stage2": 0.39290618896484375, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1819 }, { "completion_length": 11.046875, "epoch": 0.31890660592255127, "grad_norm": 18.228809141794585, "kl": 0.0625, "learning_rate": 6.812686174872963e-07, "loss": 0.0251, "reward": 1.617333173751831, "reward_std": 0.19772064685821533, "rewards/accuracy_reward_stage2": 0.742333173751831, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1820 }, { "completion_length": 5.90625, "epoch": 0.3190818293323988, "grad_norm": 14.05008775941103, "kl": 0.04296875, "learning_rate": 6.810933940774487e-07, "loss": 0.0172, "reward": 1.57807457447052, "reward_std": 0.1071772426366806, "rewards/accuracy_reward_stage2": 0.57807457447052, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1821 }, { "completion_length": 5.4375, "epoch": 0.31925705274224636, "grad_norm": 8.770429021155142, "kl": 0.006134033203125, "learning_rate": 6.809181706676012e-07, "loss": 0.0025, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1822 }, { "completion_length": 8.78125, "epoch": 0.3194322761520939, "grad_norm": 16.03661282647896, "kl": 0.0947265625, "learning_rate": 6.807429472577537e-07, "loss": 0.0379, "reward": 1.28269624710083, "reward_std": 0.1685888171195984, "rewards/accuracy_reward_stage2": 0.2826962471008301, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1823 }, { "completion_length": 9.390625, "epoch": 0.31960749956194145, "grad_norm": 20.147449873716155, "kl": 0.138671875, "learning_rate": 6.80567723847906e-07, "loss": -0.0222, "reward": 1.3721352815628052, "reward_std": 0.2498307228088379, "rewards/accuracy_reward_stage2": 0.4033852517604828, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1824 }, { "completion_length": 11.140625, "epoch": 0.31978272297178906, "grad_norm": 22.691989026804517, "kl": 0.2353515625, "learning_rate": 6.803925004380585e-07, "loss": 0.0609, "reward": 1.6414058208465576, "reward_std": 0.16309432685375214, "rewards/accuracy_reward_stage2": 0.7820307016372681, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1825 }, { "completion_length": 11.609375, "epoch": 0.3199579463816366, "grad_norm": 17.33336402597771, "kl": 0.0947265625, "learning_rate": 6.802172770282109e-07, "loss": -0.0064, "reward": 1.430842638015747, "reward_std": 0.23729614913463593, "rewards/accuracy_reward_stage2": 0.5714677572250366, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1826 }, { "completion_length": 9.8125, "epoch": 0.32013316979148415, "grad_norm": 10.20804098641941, "kl": 0.08056640625, "learning_rate": 6.800420536183634e-07, "loss": -0.012, "reward": 1.8573908805847168, "reward_std": 0.2043817788362503, "rewards/accuracy_reward_stage2": 0.8730158805847168, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1827 }, { "completion_length": 11.15625, "epoch": 0.3203083932013317, "grad_norm": 16.427886870766105, "kl": 0.076171875, "learning_rate": 6.798668302085159e-07, "loss": -0.0138, "reward": 1.3155391216278076, "reward_std": 0.19373847544193268, "rewards/accuracy_reward_stage2": 0.33116400241851807, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1828 }, { "completion_length": 15.734375, "epoch": 0.32048361661117925, "grad_norm": 23.683707696314503, "kl": 0.0966796875, "learning_rate": 6.796916067986683e-07, "loss": -0.0056, "reward": 1.4888830184936523, "reward_std": 0.3248387575149536, "rewards/accuracy_reward_stage2": 0.5045079588890076, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1829 }, { "completion_length": 12.828125, "epoch": 0.3206588400210268, "grad_norm": 22.327109222125593, "kl": 0.0869140625, "learning_rate": 6.795163833888207e-07, "loss": 0.0349, "reward": 1.7786725759506226, "reward_std": 0.19240732491016388, "rewards/accuracy_reward_stage2": 0.7786725163459778, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1830 }, { "completion_length": 7.171875, "epoch": 0.32083406343087434, "grad_norm": 21.46194590157146, "kl": 0.236328125, "learning_rate": 6.793411599789732e-07, "loss": 0.0848, "reward": 1.3829593658447266, "reward_std": 0.21105429530143738, "rewards/accuracy_reward_stage2": 0.5235843658447266, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1831 }, { "completion_length": 11.828125, "epoch": 0.32100928684072194, "grad_norm": 22.802588704471336, "kl": 0.01806640625, "learning_rate": 6.791659365691256e-07, "loss": 0.0072, "reward": 1.7552083730697632, "reward_std": 0.2507331371307373, "rewards/accuracy_reward_stage2": 0.7552083730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1832 }, { "completion_length": 8.40625, "epoch": 0.3211845102505695, "grad_norm": 17.738512216103807, "kl": 0.06787109375, "learning_rate": 6.78990713159278e-07, "loss": 0.0272, "reward": 1.3199025392532349, "reward_std": 0.22648808360099792, "rewards/accuracy_reward_stage2": 0.44490253925323486, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1833 }, { "completion_length": 12.53125, "epoch": 0.32135973366041704, "grad_norm": 17.953454178427567, "kl": 0.1552734375, "learning_rate": 6.788154897494304e-07, "loss": 0.0624, "reward": 1.4585304260253906, "reward_std": 0.19420871138572693, "rewards/accuracy_reward_stage2": 0.5835304856300354, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1834 }, { "completion_length": 7.953125, "epoch": 0.3215349570702646, "grad_norm": 19.099478848368094, "kl": 0.08740234375, "learning_rate": 6.786402663395829e-07, "loss": 0.0349, "reward": 1.5200002193450928, "reward_std": 0.24521487951278687, "rewards/accuracy_reward_stage2": 0.6450002789497375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1835 }, { "completion_length": 9.296875, "epoch": 0.32171018048011213, "grad_norm": 14.022198227726895, "kl": 0.07470703125, "learning_rate": 6.784650429297354e-07, "loss": 0.03, "reward": 1.5641283988952637, "reward_std": 0.1534004509449005, "rewards/accuracy_reward_stage2": 0.5641283392906189, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1836 }, { "completion_length": 7.875, "epoch": 0.3218854038899597, "grad_norm": 15.95768049558501, "kl": 0.1650390625, "learning_rate": 6.782898195198878e-07, "loss": 0.0661, "reward": 1.0655455589294434, "reward_std": 0.13622140884399414, "rewards/accuracy_reward_stage2": 0.3155454993247986, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1837 }, { "completion_length": 13.5, "epoch": 0.3220606272998073, "grad_norm": 30.588601350925913, "kl": 0.244140625, "learning_rate": 6.781145961100403e-07, "loss": 0.0353, "reward": 1.3970437049865723, "reward_std": 0.23561923205852509, "rewards/accuracy_reward_stage2": 0.662668764591217, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1838 }, { "completion_length": 11.9375, "epoch": 0.3222358507096548, "grad_norm": 23.826287338434923, "kl": 0.060546875, "learning_rate": 6.779393727001928e-07, "loss": 0.0242, "reward": 1.3648169040679932, "reward_std": 0.24242788553237915, "rewards/accuracy_reward_stage2": 0.4898168742656708, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1839 }, { "completion_length": 12.015625, "epoch": 0.32241107411950237, "grad_norm": 16.304101239213786, "kl": 0.11572265625, "learning_rate": 6.777641492903452e-07, "loss": 0.0314, "reward": 1.4843683242797852, "reward_std": 0.15928740799427032, "rewards/accuracy_reward_stage2": 0.49999332427978516, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1840 }, { "completion_length": 12.0625, "epoch": 0.3225862975293499, "grad_norm": 19.02631795490684, "kl": 0.2158203125, "learning_rate": 6.775889258804977e-07, "loss": 0.0419, "reward": 1.5703623294830322, "reward_std": 0.3020603358745575, "rewards/accuracy_reward_stage2": 0.710987389087677, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1841 }, { "completion_length": 9.875, "epoch": 0.32276152093919747, "grad_norm": 11.51782773806621, "kl": 0.051513671875, "learning_rate": 6.7741370247065e-07, "loss": -0.0237, "reward": 1.4839015007019043, "reward_std": 0.1462521106004715, "rewards/accuracy_reward_stage2": 0.4995265007019043, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1842 }, { "completion_length": 21.875, "epoch": 0.322936744349045, "grad_norm": 22.112513404630846, "kl": 0.1435546875, "learning_rate": 6.772384790608024e-07, "loss": 0.0572, "reward": 1.2472364902496338, "reward_std": 0.11612291634082794, "rewards/accuracy_reward_stage2": 0.4972364008426666, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1843 }, { "completion_length": 12.421875, "epoch": 0.3231119677588926, "grad_norm": 19.85265521887095, "kl": 0.08251953125, "learning_rate": 6.770632556509549e-07, "loss": 0.0329, "reward": 1.3744330406188965, "reward_std": 0.17216874659061432, "rewards/accuracy_reward_stage2": 0.3744330406188965, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1844 }, { "completion_length": 8.078125, "epoch": 0.32328719116874016, "grad_norm": 11.854489737566515, "kl": 0.0576171875, "learning_rate": 6.768880322411073e-07, "loss": 0.0231, "reward": 1.5750248432159424, "reward_std": 0.10127189010381699, "rewards/accuracy_reward_stage2": 0.7000248432159424, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1845 }, { "completion_length": 10.046875, "epoch": 0.3234624145785877, "grad_norm": 213.56026056990504, "kl": 1.328125, "learning_rate": 6.767128088312598e-07, "loss": 0.5308, "reward": 1.5306633710861206, "reward_std": 0.20582614839076996, "rewards/accuracy_reward_stage2": 0.6556633114814758, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1846 }, { "completion_length": 7.46875, "epoch": 0.32363763798843526, "grad_norm": 24.61251417703207, "kl": 0.1279296875, "learning_rate": 6.765375854214123e-07, "loss": 0.0512, "reward": 1.7689133882522583, "reward_std": 0.2784450054168701, "rewards/accuracy_reward_stage2": 0.7689133882522583, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1847 }, { "completion_length": 11.625, "epoch": 0.3238128613982828, "grad_norm": 21.46606276764324, "kl": 0.279296875, "learning_rate": 6.763623620115647e-07, "loss": 0.0673, "reward": 1.1931835412979126, "reward_std": 0.28681859374046326, "rewards/accuracy_reward_stage2": 0.4588085412979126, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1848 }, { "completion_length": 9.5625, "epoch": 0.32398808480813035, "grad_norm": 24.369027773600227, "kl": 0.1552734375, "learning_rate": 6.761871386017172e-07, "loss": 0.062, "reward": 1.7190744876861572, "reward_std": 0.25476139783859253, "rewards/accuracy_reward_stage2": 0.7190744280815125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1849 }, { "completion_length": 12.609375, "epoch": 0.3241633082179779, "grad_norm": 18.611472393606462, "kl": 0.09423828125, "learning_rate": 6.760119151918696e-07, "loss": 0.0378, "reward": 1.5120489597320557, "reward_std": 0.2501325011253357, "rewards/accuracy_reward_stage2": 0.5120488405227661, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1850 }, { "completion_length": 11.40625, "epoch": 0.3243385316278255, "grad_norm": 16.87913889310574, "kl": 0.036376953125, "learning_rate": 6.758366917820221e-07, "loss": 0.0145, "reward": 1.5233901739120483, "reward_std": 0.29552075266838074, "rewards/accuracy_reward_stage2": 0.5233901739120483, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1851 }, { "completion_length": 9.9375, "epoch": 0.32451375503767305, "grad_norm": 19.573316617620065, "kl": 0.060546875, "learning_rate": 6.756614683721746e-07, "loss": 0.0242, "reward": 1.6326043605804443, "reward_std": 0.23380601406097412, "rewards/accuracy_reward_stage2": 0.6326042413711548, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1852 }, { "completion_length": 12.109375, "epoch": 0.3246889784475206, "grad_norm": 17.457107762068464, "kl": 0.0654296875, "learning_rate": 6.754862449623269e-07, "loss": 0.0262, "reward": 1.4427083730697632, "reward_std": 0.17677412927150726, "rewards/accuracy_reward_stage2": 0.4427083134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1853 }, { "completion_length": 11.34375, "epoch": 0.32486420185736814, "grad_norm": 18.220680601291093, "kl": 0.109375, "learning_rate": 6.753110215524794e-07, "loss": 0.0016, "reward": 1.3575248718261719, "reward_std": 0.229498952627182, "rewards/accuracy_reward_stage2": 0.4981498122215271, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1854 }, { "completion_length": 9.390625, "epoch": 0.3250394252672157, "grad_norm": 21.931580448265247, "kl": 0.271484375, "learning_rate": 6.751357981426318e-07, "loss": 0.1087, "reward": 1.392435908317566, "reward_std": 0.2234029769897461, "rewards/accuracy_reward_stage2": 0.6424359083175659, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1855 }, { "completion_length": 6.515625, "epoch": 0.32521464867706323, "grad_norm": 13.46112374907508, "kl": 0.05322265625, "learning_rate": 6.749605747327842e-07, "loss": 0.0212, "reward": 1.5464448928833008, "reward_std": 0.08465109765529633, "rewards/accuracy_reward_stage2": 0.5464448928833008, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1856 }, { "completion_length": 4.796875, "epoch": 0.32538987208691084, "grad_norm": 16.306716966882355, "kl": 0.1513671875, "learning_rate": 6.747853513229367e-07, "loss": -0.0126, "reward": 1.6233456134796143, "reward_std": 0.17143097519874573, "rewards/accuracy_reward_stage2": 0.6545956134796143, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1857 }, { "completion_length": 12.9375, "epoch": 0.3255650954967584, "grad_norm": 14.849624955126052, "kl": 0.2412109375, "learning_rate": 6.746101279130891e-07, "loss": 0.0526, "reward": 1.2325433492660522, "reward_std": 0.1239112988114357, "rewards/accuracy_reward_stage2": 0.37316834926605225, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1858 }, { "completion_length": 10.8125, "epoch": 0.32574031890660593, "grad_norm": 24.755112462709235, "kl": 0.18359375, "learning_rate": 6.744349045032416e-07, "loss": 0.0399, "reward": 1.4605742692947388, "reward_std": 0.29862216114997864, "rewards/accuracy_reward_stage2": 0.6011992692947388, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1859 }, { "completion_length": 10.1875, "epoch": 0.3259155423164535, "grad_norm": 19.06098095277532, "kl": 0.2119140625, "learning_rate": 6.742596810933941e-07, "loss": 0.0409, "reward": 1.6791045665740967, "reward_std": 0.21750207245349884, "rewards/accuracy_reward_stage2": 0.8197296261787415, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1860 }, { "completion_length": 7.484375, "epoch": 0.326090765726301, "grad_norm": 13.58793029092376, "kl": 0.0289306640625, "learning_rate": 6.740844576835465e-07, "loss": 0.0116, "reward": 1.6041667461395264, "reward_std": 0.1746530830860138, "rewards/accuracy_reward_stage2": 0.6041666865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1861 }, { "completion_length": 8.828125, "epoch": 0.32626598913614857, "grad_norm": 17.06926939592597, "kl": 0.287109375, "learning_rate": 6.73909234273699e-07, "loss": 0.1147, "reward": 1.2411842346191406, "reward_std": 0.24955028295516968, "rewards/accuracy_reward_stage2": 0.49118414521217346, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1862 }, { "completion_length": 6.296875, "epoch": 0.3264412125459961, "grad_norm": 21.228109264490534, "kl": 0.05517578125, "learning_rate": 6.737340108638514e-07, "loss": 0.0221, "reward": 1.6191439628601074, "reward_std": 0.2491680532693863, "rewards/accuracy_reward_stage2": 0.6191439032554626, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1863 }, { "completion_length": 7.84375, "epoch": 0.3266164359558437, "grad_norm": 15.003439247671935, "kl": 0.06298828125, "learning_rate": 6.735587874540038e-07, "loss": 0.0251, "reward": 1.6949942111968994, "reward_std": 0.1083931177854538, "rewards/accuracy_reward_stage2": 0.694994330406189, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1864 }, { "completion_length": 11.71875, "epoch": 0.32679165936569127, "grad_norm": 28.267395553834955, "kl": 0.1904296875, "learning_rate": 6.733835640441563e-07, "loss": 0.076, "reward": 1.5004314184188843, "reward_std": 0.27763575315475464, "rewards/accuracy_reward_stage2": 0.7504312992095947, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1865 }, { "completion_length": 10.265625, "epoch": 0.3269668827755388, "grad_norm": 26.120695411410892, "kl": 0.107421875, "learning_rate": 6.732083406343087e-07, "loss": 0.0431, "reward": 1.693007230758667, "reward_std": 0.2572506070137024, "rewards/accuracy_reward_stage2": 0.6930071711540222, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1866 }, { "completion_length": 12.21875, "epoch": 0.32714210618538636, "grad_norm": 15.245095474259111, "kl": 0.0244140625, "learning_rate": 6.730331172244612e-07, "loss": 0.0098, "reward": 1.7239583730697632, "reward_std": 0.17329776287078857, "rewards/accuracy_reward_stage2": 0.7239583730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1867 }, { "completion_length": 10.046875, "epoch": 0.3273173295952339, "grad_norm": 16.955249727198467, "kl": 0.0213623046875, "learning_rate": 6.728578938146136e-07, "loss": 0.0085, "reward": 1.2291667461395264, "reward_std": 0.15133953094482422, "rewards/accuracy_reward_stage2": 0.3541666567325592, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1868 }, { "completion_length": 9.015625, "epoch": 0.32749255300508145, "grad_norm": 32.29386433854243, "kl": 0.16796875, "learning_rate": 6.72682670404766e-07, "loss": 0.0229, "reward": 1.3476319313049316, "reward_std": 0.156138077378273, "rewards/accuracy_reward_stage2": 0.36325690150260925, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1869 }, { "completion_length": 9.5, "epoch": 0.32766777641492906, "grad_norm": 12.37315895680729, "kl": 0.0634765625, "learning_rate": 6.725074469949185e-07, "loss": 0.0254, "reward": 1.488959550857544, "reward_std": 0.07612180709838867, "rewards/accuracy_reward_stage2": 0.6139594316482544, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1870 }, { "completion_length": 9.1875, "epoch": 0.3278429998247766, "grad_norm": 20.2049118237751, "kl": 0.1611328125, "learning_rate": 6.72332223585071e-07, "loss": 0.0203, "reward": 1.7136660814285278, "reward_std": 0.15727868676185608, "rewards/accuracy_reward_stage2": 0.7292912006378174, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1871 }, { "completion_length": 12.28125, "epoch": 0.32801822323462415, "grad_norm": 20.781043538536267, "kl": 0.1396484375, "learning_rate": 6.721570001752234e-07, "loss": 0.0561, "reward": 1.3187334537506104, "reward_std": 0.1667216718196869, "rewards/accuracy_reward_stage2": 0.5687333941459656, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1872 }, { "completion_length": 12.671875, "epoch": 0.3281934466444717, "grad_norm": 15.177500825022513, "kl": 0.043701171875, "learning_rate": 6.719817767653758e-07, "loss": 0.0175, "reward": 1.38564133644104, "reward_std": 0.12951165437698364, "rewards/accuracy_reward_stage2": 0.38564133644104004, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1873 }, { "completion_length": 10.21875, "epoch": 0.32836867005431924, "grad_norm": 16.988081404749867, "kl": 0.1904296875, "learning_rate": 6.718065533555282e-07, "loss": 0.0763, "reward": 1.4357813596725464, "reward_std": 0.11432722210884094, "rewards/accuracy_reward_stage2": 0.5607813596725464, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1874 }, { "completion_length": 6.078125, "epoch": 0.3285438934641668, "grad_norm": 7.521942053848321, "kl": 0.03759765625, "learning_rate": 6.716313299456807e-07, "loss": 0.015, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward_stage2": 0.796875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1875 }, { "completion_length": 8.515625, "epoch": 0.3287191168740144, "grad_norm": 18.43664423428494, "kl": 0.0810546875, "learning_rate": 6.714561065358332e-07, "loss": -0.0119, "reward": 1.522249460220337, "reward_std": 0.2793046236038208, "rewards/accuracy_reward_stage2": 0.5378744602203369, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1876 }, { "completion_length": 10.78125, "epoch": 0.32889434028386194, "grad_norm": 23.785278163967817, "kl": 0.044921875, "learning_rate": 6.712808831259856e-07, "loss": 0.018, "reward": 1.4928114414215088, "reward_std": 0.32112449407577515, "rewards/accuracy_reward_stage2": 0.49281150102615356, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1877 }, { "completion_length": 9.75, "epoch": 0.3290695636937095, "grad_norm": 10.868982497146204, "kl": 0.037841796875, "learning_rate": 6.711056597161381e-07, "loss": 0.0151, "reward": 1.6688337326049805, "reward_std": 0.055166780948638916, "rewards/accuracy_reward_stage2": 0.6688336730003357, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1878 }, { "completion_length": 11.125, "epoch": 0.32924478710355704, "grad_norm": 17.995101435846504, "kl": 0.10302734375, "learning_rate": 6.709304363062906e-07, "loss": -0.0028, "reward": 1.4962878227233887, "reward_std": 0.22154772281646729, "rewards/accuracy_reward_stage2": 0.5119128227233887, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1879 }, { "completion_length": 10.375, "epoch": 0.3294200105134046, "grad_norm": 15.528621184241214, "kl": 0.06591796875, "learning_rate": 6.707552128964429e-07, "loss": 0.0263, "reward": 1.7492897510528564, "reward_std": 0.18351225554943085, "rewards/accuracy_reward_stage2": 0.7492897510528564, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1880 }, { "completion_length": 13.71875, "epoch": 0.32959523392325213, "grad_norm": 14.708818987692135, "kl": 0.0751953125, "learning_rate": 6.705799894865954e-07, "loss": 0.03, "reward": 1.515625, "reward_std": 0.19939783215522766, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1881 }, { "completion_length": 12.234375, "epoch": 0.3297704573330997, "grad_norm": 16.61658553061757, "kl": 0.064453125, "learning_rate": 6.704047660767477e-07, "loss": 0.0258, "reward": 1.690694808959961, "reward_std": 0.19719509780406952, "rewards/accuracy_reward_stage2": 0.8156948089599609, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1882 }, { "completion_length": 22.75, "epoch": 0.3299456807429473, "grad_norm": 17.117396245079153, "kl": 0.119140625, "learning_rate": 6.702295426669002e-07, "loss": 0.0477, "reward": 1.3262156248092651, "reward_std": 0.13321471214294434, "rewards/accuracy_reward_stage2": 0.3262156844139099, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1883 }, { "completion_length": 11.265625, "epoch": 0.3301209041527948, "grad_norm": 17.60683011247102, "kl": 0.04541015625, "learning_rate": 6.700543192570527e-07, "loss": -0.0702, "reward": 1.4391136169433594, "reward_std": 0.3226979374885559, "rewards/accuracy_reward_stage2": 0.5953635573387146, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1884 }, { "completion_length": 8.390625, "epoch": 0.33029612756264237, "grad_norm": 21.465121692233303, "kl": 0.04248046875, "learning_rate": 6.698790958472051e-07, "loss": 0.017, "reward": 1.71236252784729, "reward_std": 0.3065722584724426, "rewards/accuracy_reward_stage2": 0.71236252784729, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1885 }, { "completion_length": 11.09375, "epoch": 0.3304713509724899, "grad_norm": 14.173433662062596, "kl": 0.07666015625, "learning_rate": 6.697038724373576e-07, "loss": 0.0307, "reward": 1.0344958305358887, "reward_std": 0.1303577572107315, "rewards/accuracy_reward_stage2": 0.2844958007335663, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1886 }, { "completion_length": 12.640625, "epoch": 0.33064657438233747, "grad_norm": 15.206400035440623, "kl": 0.05078125, "learning_rate": 6.6952864902751e-07, "loss": 0.0203, "reward": 1.540507435798645, "reward_std": 0.1567876935005188, "rewards/accuracy_reward_stage2": 0.6655075550079346, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1887 }, { "completion_length": 12.21875, "epoch": 0.330821797792185, "grad_norm": 16.109514600038697, "kl": 0.283203125, "learning_rate": 6.693534256176625e-07, "loss": 0.1129, "reward": 1.6333606243133545, "reward_std": 0.15332239866256714, "rewards/accuracy_reward_stage2": 0.8833605647087097, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1888 }, { "completion_length": 7.90625, "epoch": 0.3309970212020326, "grad_norm": 16.047303236328855, "kl": 0.052001953125, "learning_rate": 6.69178202207815e-07, "loss": 0.0208, "reward": 1.561516284942627, "reward_std": 0.25483137369155884, "rewards/accuracy_reward_stage2": 0.561516284942627, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1889 }, { "completion_length": 18.453125, "epoch": 0.33117224461188016, "grad_norm": 16.776042014411114, "kl": 0.11669921875, "learning_rate": 6.690029787979674e-07, "loss": 0.0467, "reward": 1.7119944095611572, "reward_std": 0.1588687002658844, "rewards/accuracy_reward_stage2": 0.7119944095611572, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1890 }, { "completion_length": 12.15625, "epoch": 0.3313474680217277, "grad_norm": 17.90016040850292, "kl": 0.2080078125, "learning_rate": 6.688277553881199e-07, "loss": 0.039, "reward": 1.617673635482788, "reward_std": 0.17623630166053772, "rewards/accuracy_reward_stage2": 0.7582985758781433, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1891 }, { "completion_length": 10.703125, "epoch": 0.33152269143157526, "grad_norm": 23.42697168203776, "kl": 0.1708984375, "learning_rate": 6.686525319782724e-07, "loss": 0.0394, "reward": 1.5168395042419434, "reward_std": 0.23415003716945648, "rewards/accuracy_reward_stage2": 0.6574645042419434, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1892 }, { "completion_length": 20.15625, "epoch": 0.3316979148414228, "grad_norm": 16.298440462486333, "kl": 0.0279541015625, "learning_rate": 6.684773085684246e-07, "loss": -0.0242, "reward": 1.6867103576660156, "reward_std": 0.17012272775173187, "rewards/accuracy_reward_stage2": 0.7023352384567261, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1893 }, { "completion_length": 14.203125, "epoch": 0.33187313825127035, "grad_norm": 22.995134068750403, "kl": 0.1376953125, "learning_rate": 6.683020851585771e-07, "loss": 0.0107, "reward": 1.4166667461395264, "reward_std": 0.3733384609222412, "rewards/accuracy_reward_stage2": 0.4322916865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1894 }, { "completion_length": 8.578125, "epoch": 0.33204836166111795, "grad_norm": 19.63325924492353, "kl": 0.050537109375, "learning_rate": 6.681268617487295e-07, "loss": -0.0148, "reward": 1.6966354846954346, "reward_std": 0.20570926368236542, "rewards/accuracy_reward_stage2": 0.8372604250907898, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1895 }, { "completion_length": 13.875, "epoch": 0.3322235850709655, "grad_norm": 19.72172048639019, "kl": 0.1982421875, "learning_rate": 6.67951638338882e-07, "loss": 0.0793, "reward": 1.390139102935791, "reward_std": 0.20460231602191925, "rewards/accuracy_reward_stage2": 0.515139102935791, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1896 }, { "completion_length": 10.734375, "epoch": 0.33239880848081305, "grad_norm": 17.034895000067117, "kl": 0.1162109375, "learning_rate": 6.677764149290345e-07, "loss": 0.0024, "reward": 1.743318796157837, "reward_std": 0.25681620836257935, "rewards/accuracy_reward_stage2": 0.7589437961578369, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1897 }, { "completion_length": 8.3125, "epoch": 0.3325740318906606, "grad_norm": 13.096367317694307, "kl": 0.09521484375, "learning_rate": 6.676011915191869e-07, "loss": 0.0382, "reward": 1.7255065441131592, "reward_std": 0.19382783770561218, "rewards/accuracy_reward_stage2": 0.7255065441131592, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1898 }, { "completion_length": 12.8125, "epoch": 0.33274925530050814, "grad_norm": 18.103895780983308, "kl": 0.11376953125, "learning_rate": 6.674259681093394e-07, "loss": 0.0014, "reward": 1.4381917715072632, "reward_std": 0.2100490778684616, "rewards/accuracy_reward_stage2": 0.4538167119026184, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1899 }, { "completion_length": 12.359375, "epoch": 0.3329244787103557, "grad_norm": 14.372391687615288, "kl": 0.0537109375, "learning_rate": 6.672507446994919e-07, "loss": -0.0226, "reward": 1.0958333015441895, "reward_std": 0.1708841323852539, "rewards/accuracy_reward_stage2": 0.36145833134651184, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1900 }, { "completion_length": 10.390625, "epoch": 0.33309970212020323, "grad_norm": 18.28746014331557, "kl": 0.07421875, "learning_rate": 6.670755212896443e-07, "loss": -0.0066, "reward": 1.7098007202148438, "reward_std": 0.21772566437721252, "rewards/accuracy_reward_stage2": 0.725425660610199, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1901 }, { "completion_length": 22.21875, "epoch": 0.33327492553005084, "grad_norm": 33.34818803522203, "kl": 0.0517578125, "learning_rate": 6.669002978797968e-07, "loss": -0.0234, "reward": 1.6470694541931152, "reward_std": 0.17796316742897034, "rewards/accuracy_reward_stage2": 0.6626943349838257, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1902 }, { "completion_length": 14.875, "epoch": 0.3334501489398984, "grad_norm": 19.340941210498876, "kl": 0.08642578125, "learning_rate": 6.667250744699491e-07, "loss": -0.0514, "reward": 1.5270521640777588, "reward_std": 0.32001104950904846, "rewards/accuracy_reward_stage2": 0.5583021640777588, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1903 }, { "completion_length": 8.75, "epoch": 0.33362537234974593, "grad_norm": 19.62612387488368, "kl": 0.1708984375, "learning_rate": 6.665498510601016e-07, "loss": 0.0683, "reward": 1.6385350227355957, "reward_std": 0.0839998871088028, "rewards/accuracy_reward_stage2": 0.7635350227355957, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1904 }, { "completion_length": 12.109375, "epoch": 0.3338005957595935, "grad_norm": 13.862894810729566, "kl": 0.0184326171875, "learning_rate": 6.663746276502541e-07, "loss": 0.0074, "reward": 1.5989583730697632, "reward_std": 0.13459712266921997, "rewards/accuracy_reward_stage2": 0.5989583134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1905 }, { "completion_length": 14.015625, "epoch": 0.333975819169441, "grad_norm": 21.690891130630494, "kl": 0.275390625, "learning_rate": 6.661994042404064e-07, "loss": 0.0112, "reward": 1.396896243095398, "reward_std": 0.23912891745567322, "rewards/accuracy_reward_stage2": 0.5687711834907532, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1906 }, { "completion_length": 10.25, "epoch": 0.33415104257928857, "grad_norm": 16.18886844350906, "kl": 0.045654296875, "learning_rate": 6.660241808305589e-07, "loss": 0.0182, "reward": 1.7281818389892578, "reward_std": 0.18544113636016846, "rewards/accuracy_reward_stage2": 0.728181779384613, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1907 }, { "completion_length": 11.53125, "epoch": 0.3343262659891362, "grad_norm": 8.774059095589669, "kl": 0.06494140625, "learning_rate": 6.658489574207114e-07, "loss": -0.0182, "reward": 1.3096954822540283, "reward_std": 0.11100947856903076, "rewards/accuracy_reward_stage2": 0.3253205120563507, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1908 }, { "completion_length": 8.671875, "epoch": 0.3345014893989837, "grad_norm": 26.765411058532464, "kl": 0.181640625, "learning_rate": 6.656737340108638e-07, "loss": 0.0438, "reward": 1.8735935688018799, "reward_std": 0.25263744592666626, "rewards/accuracy_reward_stage2": 0.8892185688018799, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1909 }, { "completion_length": 13.78125, "epoch": 0.33467671280883127, "grad_norm": 16.654080115758653, "kl": 0.042724609375, "learning_rate": 6.654985106010163e-07, "loss": 0.0171, "reward": 1.5321245193481445, "reward_std": 0.16976571083068848, "rewards/accuracy_reward_stage2": 0.5321245193481445, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1910 }, { "completion_length": 10.578125, "epoch": 0.3348519362186788, "grad_norm": 22.42678703367036, "kl": 0.279296875, "learning_rate": 6.653232871911687e-07, "loss": 0.0479, "reward": 1.2796870470046997, "reward_std": 0.23620405793190002, "rewards/accuracy_reward_stage2": 0.5609370470046997, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1911 }, { "completion_length": 10.859375, "epoch": 0.33502715962852636, "grad_norm": 29.145456284362375, "kl": 0.053955078125, "learning_rate": 6.651480637813211e-07, "loss": 0.0216, "reward": 1.7155293226242065, "reward_std": 0.15784859657287598, "rewards/accuracy_reward_stage2": 0.8405293226242065, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1912 }, { "completion_length": 29.1875, "epoch": 0.3352023830383739, "grad_norm": 17.007278396942223, "kl": 0.0556640625, "learning_rate": 6.649728403714736e-07, "loss": -0.0637, "reward": 1.624013900756836, "reward_std": 0.24514907598495483, "rewards/accuracy_reward_stage2": 0.6552638411521912, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1913 }, { "completion_length": 13.125, "epoch": 0.33537760644822145, "grad_norm": 22.398287940378502, "kl": 0.142578125, "learning_rate": 6.64797616961626e-07, "loss": 0.0262, "reward": 1.3815398216247559, "reward_std": 0.299510657787323, "rewards/accuracy_reward_stage2": 0.5221648216247559, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1914 }, { "completion_length": 11.09375, "epoch": 0.33555282985806906, "grad_norm": 21.564872684074892, "kl": 0.2177734375, "learning_rate": 6.646223935517785e-07, "loss": 0.0304, "reward": 1.7198235988616943, "reward_std": 0.27606385946273804, "rewards/accuracy_reward_stage2": 0.7510735988616943, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1915 }, { "completion_length": 10.8125, "epoch": 0.3357280532679166, "grad_norm": 24.75448251141867, "kl": 0.107421875, "learning_rate": 6.64447170141931e-07, "loss": 0.0214, "reward": 1.3904714584350586, "reward_std": 0.2850678861141205, "rewards/accuracy_reward_stage2": 0.5310965180397034, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1916 }, { "completion_length": 20.8125, "epoch": 0.33590327667776415, "grad_norm": 18.343565887362885, "kl": 0.14453125, "learning_rate": 6.642719467320834e-07, "loss": 0.0578, "reward": 1.4843181371688843, "reward_std": 0.11950040608644485, "rewards/accuracy_reward_stage2": 0.6093181371688843, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1917 }, { "completion_length": 15.0, "epoch": 0.3360785000876117, "grad_norm": 16.500056251343526, "kl": 0.05224609375, "learning_rate": 6.640967233222359e-07, "loss": 0.0209, "reward": 1.7818292379379272, "reward_std": 0.15213115513324738, "rewards/accuracy_reward_stage2": 0.7818291783332825, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1918 }, { "completion_length": 11.5, "epoch": 0.33625372349745924, "grad_norm": 23.112114220771804, "kl": 0.158203125, "learning_rate": 6.639214999123882e-07, "loss": 0.0634, "reward": 1.6982868909835815, "reward_std": 0.29553136229515076, "rewards/accuracy_reward_stage2": 0.6982868313789368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1919 }, { "completion_length": 13.28125, "epoch": 0.3364289469073068, "grad_norm": 26.025706676782015, "kl": 0.294921875, "learning_rate": 6.637462765025407e-07, "loss": 0.0736, "reward": 1.2600409984588623, "reward_std": 0.25321489572525024, "rewards/accuracy_reward_stage2": 0.4006659686565399, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1920 }, { "completion_length": 18.734375, "epoch": 0.3366041703171544, "grad_norm": 16.870784749615115, "kl": 0.1953125, "learning_rate": 6.635710530926932e-07, "loss": 0.0007, "reward": 1.2553613185882568, "reward_std": 0.16248267889022827, "rewards/accuracy_reward_stage2": 0.41161128878593445, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1921 }, { "completion_length": 5.203125, "epoch": 0.33677939372700194, "grad_norm": 11.88597080650647, "kl": 0.08740234375, "learning_rate": 6.633958296828455e-07, "loss": -0.0092, "reward": 1.796875, "reward_std": 0.19044628739356995, "rewards/accuracy_reward_stage2": 0.8125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1922 }, { "completion_length": 8.8125, "epoch": 0.3369546171368495, "grad_norm": 18.00565963799991, "kl": 0.1044921875, "learning_rate": 6.63220606272998e-07, "loss": -0.0396, "reward": 1.3730442523956299, "reward_std": 0.24149462580680847, "rewards/accuracy_reward_stage2": 0.40429437160491943, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1923 }, { "completion_length": 11.953125, "epoch": 0.33712984054669703, "grad_norm": 21.296992883781495, "kl": 0.24609375, "learning_rate": 6.630453828631505e-07, "loss": 0.0543, "reward": 1.3399149179458618, "reward_std": 0.19342654943466187, "rewards/accuracy_reward_stage2": 0.4805399179458618, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1924 }, { "completion_length": 6.296875, "epoch": 0.3373050639565446, "grad_norm": 18.698121586193267, "kl": 0.1923828125, "learning_rate": 6.628701594533029e-07, "loss": -0.0437, "reward": 1.8635075092315674, "reward_std": 0.2860381007194519, "rewards/accuracy_reward_stage2": 0.9103825092315674, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1925 }, { "completion_length": 11.796875, "epoch": 0.33748028736639213, "grad_norm": 15.837593428346915, "kl": 0.059326171875, "learning_rate": 6.626949360434554e-07, "loss": -0.0205, "reward": 1.519178867340088, "reward_std": 0.15417931973934174, "rewards/accuracy_reward_stage2": 0.5348039269447327, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1926 }, { "completion_length": 10.15625, "epoch": 0.33765551077623973, "grad_norm": 22.356629840600462, "kl": 0.201171875, "learning_rate": 6.625197126336078e-07, "loss": -0.0407, "reward": 1.5961397886276245, "reward_std": 0.3365238308906555, "rewards/accuracy_reward_stage2": 0.6430148482322693, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1927 }, { "completion_length": 9.640625, "epoch": 0.3378307341860873, "grad_norm": 19.981589308553318, "kl": 0.228515625, "learning_rate": 6.623444892237603e-07, "loss": 0.0523, "reward": 1.538138508796692, "reward_std": 0.3606716990470886, "rewards/accuracy_reward_stage2": 0.6787635087966919, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1928 }, { "completion_length": 9.125, "epoch": 0.3380059575959348, "grad_norm": 21.779944693905204, "kl": 0.19921875, "learning_rate": 6.621692658139128e-07, "loss": 0.0246, "reward": 1.6251780986785889, "reward_std": 0.30405157804489136, "rewards/accuracy_reward_stage2": 0.6564280986785889, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1929 }, { "completion_length": 10.90625, "epoch": 0.33818118100578237, "grad_norm": 17.360682645924115, "kl": 0.1181640625, "learning_rate": 6.619940424040652e-07, "loss": 0.0074, "reward": 1.4416460990905762, "reward_std": 0.15304405987262726, "rewards/accuracy_reward_stage2": 0.5822710394859314, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1930 }, { "completion_length": 9.421875, "epoch": 0.3383564044156299, "grad_norm": 15.773455335638124, "kl": 0.1123046875, "learning_rate": 6.618188189942176e-07, "loss": 0.0009, "reward": 1.5871254205703735, "reward_std": 0.14930549263954163, "rewards/accuracy_reward_stage2": 0.6027504205703735, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1931 }, { "completion_length": 11.0, "epoch": 0.33853162782547747, "grad_norm": 18.534239535030164, "kl": 0.04296875, "learning_rate": 6.6164359558437e-07, "loss": 0.0172, "reward": 1.6630206108093262, "reward_std": 0.1501636952161789, "rewards/accuracy_reward_stage2": 0.6630206108093262, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1932 }, { "completion_length": 11.328125, "epoch": 0.338706851235325, "grad_norm": 16.023803016884205, "kl": 0.0341796875, "learning_rate": 6.614683721745224e-07, "loss": 0.0137, "reward": 1.7510204315185547, "reward_std": 0.07846297323703766, "rewards/accuracy_reward_stage2": 0.7510203123092651, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1933 }, { "completion_length": 11.453125, "epoch": 0.3388820746451726, "grad_norm": 21.742080959952997, "kl": 0.076171875, "learning_rate": 6.612931487646749e-07, "loss": 0.0304, "reward": 1.41162109375, "reward_std": 0.3468879461288452, "rewards/accuracy_reward_stage2": 0.4116211533546448, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1934 }, { "completion_length": 10.625, "epoch": 0.33905729805502016, "grad_norm": 15.14521136054857, "kl": 0.18359375, "learning_rate": 6.611179253548273e-07, "loss": 0.0733, "reward": 1.47281813621521, "reward_std": 0.13576127588748932, "rewards/accuracy_reward_stage2": 0.59781813621521, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1935 }, { "completion_length": 9.671875, "epoch": 0.3392325214648677, "grad_norm": 14.546117917073072, "kl": 0.09619140625, "learning_rate": 6.609427019449798e-07, "loss": -0.0056, "reward": 1.7761536836624146, "reward_std": 0.09021301567554474, "rewards/accuracy_reward_stage2": 0.7917786240577698, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1936 }, { "completion_length": 9.8125, "epoch": 0.33940774487471526, "grad_norm": 13.956049087009886, "kl": 0.12451171875, "learning_rate": 6.607674785351323e-07, "loss": 0.0497, "reward": 1.6214237213134766, "reward_std": 0.11793522536754608, "rewards/accuracy_reward_stage2": 0.7464236617088318, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1937 }, { "completion_length": 11.5625, "epoch": 0.3395829682845628, "grad_norm": 10.727183178173318, "kl": 0.07958984375, "learning_rate": 6.605922551252847e-07, "loss": -0.0124, "reward": 1.488537073135376, "reward_std": 0.1613566279411316, "rewards/accuracy_reward_stage2": 0.629162073135376, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1938 }, { "completion_length": 6.078125, "epoch": 0.33975819169441035, "grad_norm": 20.65504262119513, "kl": 0.068359375, "learning_rate": 6.604170317154372e-07, "loss": 0.0274, "reward": 1.7564607858657837, "reward_std": 0.16308678686618805, "rewards/accuracy_reward_stage2": 0.7564607262611389, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1939 }, { "completion_length": 8.40625, "epoch": 0.33993341510425795, "grad_norm": 17.045082527702256, "kl": 0.1298828125, "learning_rate": 6.602418083055897e-07, "loss": -0.035, "reward": 1.672278642654419, "reward_std": 0.2258690595626831, "rewards/accuracy_reward_stage2": 0.7035285830497742, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1940 }, { "completion_length": 9.40625, "epoch": 0.3401086385141055, "grad_norm": 20.15182148776487, "kl": 0.1943359375, "learning_rate": 6.600665848957421e-07, "loss": 0.0778, "reward": 1.5685465335845947, "reward_std": 0.2734663784503937, "rewards/accuracy_reward_stage2": 0.6935466527938843, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1941 }, { "completion_length": 14.28125, "epoch": 0.34028386192395305, "grad_norm": 16.56521479852848, "kl": 0.111328125, "learning_rate": 6.598913614858945e-07, "loss": 0.0445, "reward": 1.4376866817474365, "reward_std": 0.13336199522018433, "rewards/accuracy_reward_stage2": 0.4376866817474365, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1942 }, { "completion_length": 9.390625, "epoch": 0.3404590853338006, "grad_norm": 18.12517967513942, "kl": 0.0771484375, "learning_rate": 6.597161380760469e-07, "loss": 0.0308, "reward": 1.4623807668685913, "reward_std": 0.18650619685649872, "rewards/accuracy_reward_stage2": 0.5873807668685913, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1943 }, { "completion_length": 11.359375, "epoch": 0.34063430874364814, "grad_norm": 20.774634464244237, "kl": 0.11083984375, "learning_rate": 6.595409146661993e-07, "loss": 0.0164, "reward": 1.4170732498168945, "reward_std": 0.26271551847457886, "rewards/accuracy_reward_stage2": 0.43269819021224976, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1944 }, { "completion_length": 10.3125, "epoch": 0.3408095321534957, "grad_norm": 21.154764502543014, "kl": 0.1630859375, "learning_rate": 6.593656912563518e-07, "loss": 0.0655, "reward": 1.6541085243225098, "reward_std": 0.20481356978416443, "rewards/accuracy_reward_stage2": 0.779108464717865, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1945 }, { "completion_length": 15.8125, "epoch": 0.3409847555633433, "grad_norm": 26.25091427777858, "kl": 0.046875, "learning_rate": 6.591904678465042e-07, "loss": 0.0188, "reward": 1.4044055938720703, "reward_std": 0.3343871831893921, "rewards/accuracy_reward_stage2": 0.4044056236743927, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1946 }, { "completion_length": 7.578125, "epoch": 0.34115997897319084, "grad_norm": 21.537045200934948, "kl": 0.0576171875, "learning_rate": 6.590152444366567e-07, "loss": -0.021, "reward": 1.3350812196731567, "reward_std": 0.2705545425415039, "rewards/accuracy_reward_stage2": 0.35070618987083435, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1947 }, { "completion_length": 9.828125, "epoch": 0.3413352023830384, "grad_norm": 15.870683875822307, "kl": 0.119140625, "learning_rate": 6.588400210268091e-07, "loss": -0.0296, "reward": 1.4924291372299194, "reward_std": 0.3276829123497009, "rewards/accuracy_reward_stage2": 0.5236790776252747, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1948 }, { "completion_length": 11.640625, "epoch": 0.34151042579288593, "grad_norm": 19.496774629125934, "kl": 0.1337890625, "learning_rate": 6.586647976169616e-07, "loss": -0.0063, "reward": 1.6220420598983765, "reward_std": 0.2530994415283203, "rewards/accuracy_reward_stage2": 0.7782920002937317, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1949 }, { "completion_length": 9.765625, "epoch": 0.3416856492027335, "grad_norm": 21.680354742630147, "kl": 0.1611328125, "learning_rate": 6.584895742071141e-07, "loss": -0.0087, "reward": 1.5224058628082275, "reward_std": 0.2604309916496277, "rewards/accuracy_reward_stage2": 0.6786558628082275, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1950 }, { "completion_length": 14.59375, "epoch": 0.341860872612581, "grad_norm": 20.994977503450045, "kl": 0.046142578125, "learning_rate": 6.583143507972665e-07, "loss": 0.0184, "reward": 1.4522783756256104, "reward_std": 0.3166598677635193, "rewards/accuracy_reward_stage2": 0.4522784352302551, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1951 }, { "completion_length": 9.96875, "epoch": 0.34203609602242857, "grad_norm": 27.082783672558996, "kl": 0.1552734375, "learning_rate": 6.581391273874189e-07, "loss": 0.062, "reward": 1.4804890155792236, "reward_std": 0.19034737348556519, "rewards/accuracy_reward_stage2": 0.7304890751838684, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1952 }, { "completion_length": 13.4375, "epoch": 0.3422113194322762, "grad_norm": 21.054283543273684, "kl": 0.158203125, "learning_rate": 6.579639039775714e-07, "loss": -0.0083, "reward": 1.4764494895935059, "reward_std": 0.28760117292404175, "rewards/accuracy_reward_stage2": 0.5076994299888611, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1953 }, { "completion_length": 7.21875, "epoch": 0.3423865428421237, "grad_norm": 13.542532358252753, "kl": 0.0361328125, "learning_rate": 6.577886805677238e-07, "loss": 0.0145, "reward": 1.671720266342163, "reward_std": 0.11243344098329544, "rewards/accuracy_reward_stage2": 0.6717202663421631, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1954 }, { "completion_length": 8.5, "epoch": 0.34256176625197127, "grad_norm": 27.08696220045124, "kl": 0.1416015625, "learning_rate": 6.576134571578763e-07, "loss": -0.0228, "reward": 1.2418166399002075, "reward_std": 0.2573709487915039, "rewards/accuracy_reward_stage2": 0.5230665802955627, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1955 }, { "completion_length": 17.078125, "epoch": 0.3427369896618188, "grad_norm": 14.514008667861964, "kl": 0.043701171875, "learning_rate": 6.574382337480288e-07, "loss": 0.0174, "reward": 1.6762876510620117, "reward_std": 0.12784245610237122, "rewards/accuracy_reward_stage2": 0.6762876510620117, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1956 }, { "completion_length": 12.046875, "epoch": 0.34291221307166636, "grad_norm": 11.57712868725116, "kl": 0.0712890625, "learning_rate": 6.572630103381811e-07, "loss": 0.0285, "reward": 1.4849703311920166, "reward_std": 0.0639050081372261, "rewards/accuracy_reward_stage2": 0.48497024178504944, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1957 }, { "completion_length": 10.484375, "epoch": 0.3430874364815139, "grad_norm": 17.600663870878677, "kl": 0.0810546875, "learning_rate": 6.570877869283336e-07, "loss": 0.0324, "reward": 1.7086526155471802, "reward_std": 0.1977860927581787, "rewards/accuracy_reward_stage2": 0.7086526155471802, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1958 }, { "completion_length": 11.53125, "epoch": 0.3432626598913615, "grad_norm": 21.655825213453603, "kl": 0.17578125, "learning_rate": 6.56912563518486e-07, "loss": 0.0306, "reward": 1.259408712387085, "reward_std": 0.23448513448238373, "rewards/accuracy_reward_stage2": 0.5250337719917297, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1959 }, { "completion_length": 10.09375, "epoch": 0.34343788330120906, "grad_norm": 14.008372896408902, "kl": 0.0634765625, "learning_rate": 6.567373401086385e-07, "loss": 0.0254, "reward": 1.745302438735962, "reward_std": 0.04648788273334503, "rewards/accuracy_reward_stage2": 0.7453025579452515, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1960 }, { "completion_length": 11.453125, "epoch": 0.3436131067110566, "grad_norm": 26.769825802585252, "kl": 0.11083984375, "learning_rate": 6.56562116698791e-07, "loss": 0.0444, "reward": 1.5379596948623657, "reward_std": 0.06265933811664581, "rewards/accuracy_reward_stage2": 0.6629596948623657, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1961 }, { "completion_length": 12.53125, "epoch": 0.34378833012090415, "grad_norm": 25.466248398573278, "kl": 0.16796875, "learning_rate": 6.563868932889433e-07, "loss": 0.023, "reward": 1.312827229499817, "reward_std": 0.31713223457336426, "rewards/accuracy_reward_stage2": 0.32845228910446167, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1962 }, { "completion_length": 12.515625, "epoch": 0.3439635535307517, "grad_norm": 19.458826083401085, "kl": 0.10400390625, "learning_rate": 6.562116698790958e-07, "loss": 0.0415, "reward": 1.4178786277770996, "reward_std": 0.06989157199859619, "rewards/accuracy_reward_stage2": 0.4178787171840668, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1963 }, { "completion_length": 12.296875, "epoch": 0.34413877694059924, "grad_norm": 28.785121368008614, "kl": 0.0927734375, "learning_rate": 6.560364464692482e-07, "loss": 0.037, "reward": 1.6619970798492432, "reward_std": 0.2762282192707062, "rewards/accuracy_reward_stage2": 0.6619970202445984, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1964 }, { "completion_length": 12.734375, "epoch": 0.34431400035044685, "grad_norm": 17.61475593043083, "kl": 0.162109375, "learning_rate": 6.558612230594007e-07, "loss": 0.0056, "reward": 1.4520516395568848, "reward_std": 0.20958861708641052, "rewards/accuracy_reward_stage2": 0.48330157995224, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1965 }, { "completion_length": 7.171875, "epoch": 0.3444892237602944, "grad_norm": 15.181443481188113, "kl": 0.12353515625, "learning_rate": 6.556859996495532e-07, "loss": 0.0052, "reward": 1.7034376859664917, "reward_std": 0.15864676237106323, "rewards/accuracy_reward_stage2": 0.8440626859664917, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1966 }, { "completion_length": 8.703125, "epoch": 0.34466444717014194, "grad_norm": 18.706753461209544, "kl": 0.0732421875, "learning_rate": 6.555107762397056e-07, "loss": 0.0293, "reward": 1.7096948623657227, "reward_std": 0.15963897109031677, "rewards/accuracy_reward_stage2": 0.8346949219703674, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1967 }, { "completion_length": 9.28125, "epoch": 0.3448396705799895, "grad_norm": 20.080951676498536, "kl": 0.05908203125, "learning_rate": 6.553355528298581e-07, "loss": 0.0236, "reward": 1.6409235000610352, "reward_std": 0.18434467911720276, "rewards/accuracy_reward_stage2": 0.6409235000610352, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1968 }, { "completion_length": 19.921875, "epoch": 0.34501489398983703, "grad_norm": 15.480120800250623, "kl": 0.08984375, "learning_rate": 6.551603294200106e-07, "loss": 0.036, "reward": 1.6779283285140991, "reward_std": 0.12816794216632843, "rewards/accuracy_reward_stage2": 0.6779283285140991, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1969 }, { "completion_length": 13.625, "epoch": 0.3451901173996846, "grad_norm": 9.019538711132414, "kl": 0.0255126953125, "learning_rate": 6.549851060101629e-07, "loss": 0.0102, "reward": 1.7239811420440674, "reward_std": 0.03791867941617966, "rewards/accuracy_reward_stage2": 0.7239811420440674, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1970 }, { "completion_length": 12.53125, "epoch": 0.34536534080953213, "grad_norm": 18.72549133371684, "kl": 0.076171875, "learning_rate": 6.548098826003154e-07, "loss": 0.0304, "reward": 1.5978374481201172, "reward_std": 0.24242404103279114, "rewards/accuracy_reward_stage2": 0.5978374481201172, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1971 }, { "completion_length": 13.5625, "epoch": 0.34554056421937973, "grad_norm": 13.629983423344452, "kl": 0.10595703125, "learning_rate": 6.546346591904677e-07, "loss": -0.0019, "reward": 1.494866132736206, "reward_std": 0.0999663770198822, "rewards/accuracy_reward_stage2": 0.510491132736206, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1972 }, { "completion_length": 12.40625, "epoch": 0.3457157876292273, "grad_norm": 16.544557598029545, "kl": 0.1640625, "learning_rate": 6.544594357806202e-07, "loss": 0.024, "reward": 1.320347547531128, "reward_std": 0.25300195813179016, "rewards/accuracy_reward_stage2": 0.46097254753112793, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1973 }, { "completion_length": 8.65625, "epoch": 0.3458910110390748, "grad_norm": 27.031728264190516, "kl": 0.1015625, "learning_rate": 6.542842123707727e-07, "loss": 0.0407, "reward": 1.6358357667922974, "reward_std": 0.2411271631717682, "rewards/accuracy_reward_stage2": 0.6358357667922974, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1974 }, { "completion_length": 7.03125, "epoch": 0.34606623444892237, "grad_norm": 20.65655761948469, "kl": 0.142578125, "learning_rate": 6.541089889609251e-07, "loss": 0.0572, "reward": 1.5470199584960938, "reward_std": 0.2650681138038635, "rewards/accuracy_reward_stage2": 0.6720199584960938, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1975 }, { "completion_length": 10.140625, "epoch": 0.3462414578587699, "grad_norm": 17.86962987062657, "kl": 0.049072265625, "learning_rate": 6.539337655510776e-07, "loss": 0.0133, "reward": 1.5956544876098633, "reward_std": 0.17510367929935455, "rewards/accuracy_reward_stage2": 0.6112794280052185, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1976 }, { "completion_length": 10.9375, "epoch": 0.34641668126861747, "grad_norm": 14.238587117023814, "kl": 0.060546875, "learning_rate": 6.537585421412301e-07, "loss": 0.0241, "reward": 1.809149980545044, "reward_std": 0.15811999142169952, "rewards/accuracy_reward_stage2": 0.8091498613357544, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1977 }, { "completion_length": 9.203125, "epoch": 0.34659190467846507, "grad_norm": 17.02518771036635, "kl": 0.1845703125, "learning_rate": 6.535833187313825e-07, "loss": -0.0057, "reward": 1.661616325378418, "reward_std": 0.28009992837905884, "rewards/accuracy_reward_stage2": 0.8178663849830627, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1978 }, { "completion_length": 7.75, "epoch": 0.3467671280883126, "grad_norm": 27.107300154566495, "kl": 0.2470703125, "learning_rate": 6.53408095321535e-07, "loss": 0.055, "reward": 1.6364436149597168, "reward_std": 0.17650076746940613, "rewards/accuracy_reward_stage2": 0.6520686149597168, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1979 }, { "completion_length": 10.71875, "epoch": 0.34694235149816016, "grad_norm": 16.632001236146216, "kl": 0.119140625, "learning_rate": 6.532328719116874e-07, "loss": -0.0406, "reward": 1.2066890001296997, "reward_std": 0.15964210033416748, "rewards/accuracy_reward_stage2": 0.3629389703273773, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1980 }, { "completion_length": 11.5625, "epoch": 0.3471175749080077, "grad_norm": 25.372168175931847, "kl": 0.1220703125, "learning_rate": 6.530576485018399e-07, "loss": 0.0047, "reward": 1.4195719957351685, "reward_std": 0.3210878372192383, "rewards/accuracy_reward_stage2": 0.5601969957351685, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1981 }, { "completion_length": 13.484375, "epoch": 0.34729279831785526, "grad_norm": 18.356621675505927, "kl": 0.09765625, "learning_rate": 6.528824250919922e-07, "loss": 0.039, "reward": 1.68565034866333, "reward_std": 0.1419445276260376, "rewards/accuracy_reward_stage2": 0.6856504678726196, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1982 }, { "completion_length": 15.625, "epoch": 0.3474680217277028, "grad_norm": 15.596275383801663, "kl": 0.064453125, "learning_rate": 6.527072016821446e-07, "loss": 0.0258, "reward": 1.618418574333191, "reward_std": 0.22926867008209229, "rewards/accuracy_reward_stage2": 0.6184185147285461, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1983 }, { "completion_length": 10.125, "epoch": 0.34764324513755035, "grad_norm": 17.592549545737327, "kl": 0.16015625, "learning_rate": 6.525319782722971e-07, "loss": 0.064, "reward": 1.397711157798767, "reward_std": 0.20366990566253662, "rewards/accuracy_reward_stage2": 0.6477111577987671, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1984 }, { "completion_length": 9.296875, "epoch": 0.34781846854739795, "grad_norm": 20.029006910207976, "kl": 0.208984375, "learning_rate": 6.523567548624496e-07, "loss": 0.0344, "reward": 1.679081916809082, "reward_std": 0.2575289309024811, "rewards/accuracy_reward_stage2": 0.835331916809082, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1985 }, { "completion_length": 26.0625, "epoch": 0.3479936919572455, "grad_norm": 15.756414153200662, "kl": 0.10400390625, "learning_rate": 6.52181531452602e-07, "loss": -0.0028, "reward": 1.7272648811340332, "reward_std": 0.15835845470428467, "rewards/accuracy_reward_stage2": 0.7428898811340332, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1986 }, { "completion_length": 8.609375, "epoch": 0.34816891536709305, "grad_norm": 25.260937057275694, "kl": 0.1376953125, "learning_rate": 6.520063080427545e-07, "loss": 0.0552, "reward": 1.3206074237823486, "reward_std": 0.30781957507133484, "rewards/accuracy_reward_stage2": 0.44560742378234863, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1987 }, { "completion_length": 8.203125, "epoch": 0.3483441387769406, "grad_norm": 21.79510382885528, "kl": 0.267578125, "learning_rate": 6.518310846329069e-07, "loss": -0.0048, "reward": 1.5288782119750977, "reward_std": 0.19049212336540222, "rewards/accuracy_reward_stage2": 0.7007532715797424, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1988 }, { "completion_length": 8.34375, "epoch": 0.34851936218678814, "grad_norm": 16.02145132189449, "kl": 0.078125, "learning_rate": 6.516558612230594e-07, "loss": -0.0093, "reward": 1.5895137786865234, "reward_std": 0.15934374928474426, "rewards/accuracy_reward_stage2": 0.6051387786865234, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1989 }, { "completion_length": 9.703125, "epoch": 0.3486945855966357, "grad_norm": 14.497867470770592, "kl": 0.0184326171875, "learning_rate": 6.514806378132119e-07, "loss": 0.0074, "reward": 1.5919466018676758, "reward_std": 0.13881367444992065, "rewards/accuracy_reward_stage2": 0.5919466018676758, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1990 }, { "completion_length": 12.875, "epoch": 0.3488698090064833, "grad_norm": 17.89472880207737, "kl": 0.208984375, "learning_rate": 6.513054144033643e-07, "loss": 0.0546, "reward": 1.4190115928649902, "reward_std": 0.1824534833431244, "rewards/accuracy_reward_stage2": 0.6846365928649902, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1991 }, { "completion_length": 12.5625, "epoch": 0.34904503241633084, "grad_norm": 20.924271113255397, "kl": 0.09130859375, "learning_rate": 6.511301909935167e-07, "loss": -0.0075, "reward": 1.6088534593582153, "reward_std": 0.2591401934623718, "rewards/accuracy_reward_stage2": 0.6244784593582153, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1992 }, { "completion_length": 7.453125, "epoch": 0.3492202558261784, "grad_norm": 15.743749637089023, "kl": 0.0267333984375, "learning_rate": 6.509549675836692e-07, "loss": -0.0224, "reward": 1.6175103187561035, "reward_std": 0.11349479854106903, "rewards/accuracy_reward_stage2": 0.6331353187561035, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1993 }, { "completion_length": 9.3125, "epoch": 0.34939547923602593, "grad_norm": 16.28655284774425, "kl": 0.1318359375, "learning_rate": 6.507797441738216e-07, "loss": 0.0157, "reward": 1.5714805126190186, "reward_std": 0.14586706459522247, "rewards/accuracy_reward_stage2": 0.5871055126190186, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1994 }, { "completion_length": 10.640625, "epoch": 0.3495707026458735, "grad_norm": 19.764527745348857, "kl": 0.1494140625, "learning_rate": 6.50604520763974e-07, "loss": 0.0598, "reward": 1.5106749534606934, "reward_std": 0.24963447451591492, "rewards/accuracy_reward_stage2": 0.5106750130653381, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1995 }, { "completion_length": 7.78125, "epoch": 0.349745926055721, "grad_norm": 25.807882325692766, "kl": 0.212890625, "learning_rate": 6.504292973541264e-07, "loss": 0.0412, "reward": 1.402549386024475, "reward_std": 0.22109973430633545, "rewards/accuracy_reward_stage2": 0.4181743860244751, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1996 }, { "completion_length": 10.890625, "epoch": 0.3499211494655686, "grad_norm": 19.0405343018544, "kl": 0.05029296875, "learning_rate": 6.502540739442789e-07, "loss": 0.0201, "reward": 1.6537227630615234, "reward_std": 0.25505587458610535, "rewards/accuracy_reward_stage2": 0.7787227630615234, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1997 }, { "completion_length": 13.421875, "epoch": 0.3500963728754162, "grad_norm": 18.426578115254816, "kl": 0.033447265625, "learning_rate": 6.500788505344314e-07, "loss": -0.0306, "reward": 1.6478216648101807, "reward_std": 0.19639632105827332, "rewards/accuracy_reward_stage2": 0.6634466052055359, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1998 }, { "completion_length": 6.578125, "epoch": 0.3502715962852637, "grad_norm": 16.863516374025938, "kl": 0.10595703125, "learning_rate": 6.499036271245838e-07, "loss": 0.0424, "reward": 1.6519603729248047, "reward_std": 0.1129794791340828, "rewards/accuracy_reward_stage2": 0.6519604325294495, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1999 }, { "completion_length": 9.546875, "epoch": 0.35044681969511127, "grad_norm": 18.840306610077086, "kl": 0.1435546875, "learning_rate": 6.497284037147363e-07, "loss": 0.013, "reward": 1.7694342136383057, "reward_std": 0.15474824607372284, "rewards/accuracy_reward_stage2": 0.7850591540336609, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2000 }, { "completion_length": 11.3125, "epoch": 0.3506220431049588, "grad_norm": 15.429352320389754, "kl": 0.054931640625, "learning_rate": 6.495531803048888e-07, "loss": -0.017, "reward": 1.5891973972320557, "reward_std": 0.2515534460544586, "rewards/accuracy_reward_stage2": 0.6048224568367004, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2001 }, { "completion_length": 10.671875, "epoch": 0.35079726651480636, "grad_norm": 18.638199329131833, "kl": 0.061767578125, "learning_rate": 6.493779568950411e-07, "loss": -0.0654, "reward": 1.734375, "reward_std": 0.3686423897743225, "rewards/accuracy_reward_stage2": 0.78125, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2002 }, { "completion_length": 7.65625, "epoch": 0.3509724899246539, "grad_norm": 21.399105794557666, "kl": 0.15234375, "learning_rate": 6.492027334851936e-07, "loss": 0.061, "reward": 1.5191402435302734, "reward_std": 0.18384888768196106, "rewards/accuracy_reward_stage2": 0.519140362739563, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2003 }, { "completion_length": 17.203125, "epoch": 0.3511477133345015, "grad_norm": 19.156350104124215, "kl": 0.08447265625, "learning_rate": 6.49027510075346e-07, "loss": -0.0482, "reward": 1.5820919275283813, "reward_std": 0.2840636372566223, "rewards/accuracy_reward_stage2": 0.6133419275283813, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2004 }, { "completion_length": 15.453125, "epoch": 0.35132293674434906, "grad_norm": 14.893311472170298, "kl": 0.2255859375, "learning_rate": 6.488522866654985e-07, "loss": 0.0459, "reward": 1.4101537466049194, "reward_std": 0.23728793859481812, "rewards/accuracy_reward_stage2": 0.4257788062095642, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2005 }, { "completion_length": 13.59375, "epoch": 0.3514981601541966, "grad_norm": 22.347330342450295, "kl": 0.07470703125, "learning_rate": 6.48677063255651e-07, "loss": 0.0299, "reward": 1.632706880569458, "reward_std": 0.24964894354343414, "rewards/accuracy_reward_stage2": 0.632706880569458, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2006 }, { "completion_length": 11.484375, "epoch": 0.35167338356404415, "grad_norm": 12.16406297008582, "kl": 0.057861328125, "learning_rate": 6.485018398458034e-07, "loss": 0.0231, "reward": 1.4031907320022583, "reward_std": 0.10199880599975586, "rewards/accuracy_reward_stage2": 0.5281907320022583, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2007 }, { "completion_length": 8.5, "epoch": 0.3518486069738917, "grad_norm": 14.241194495526061, "kl": 0.0576171875, "learning_rate": 6.483266164359558e-07, "loss": 0.023, "reward": 1.467761754989624, "reward_std": 0.1030719205737114, "rewards/accuracy_reward_stage2": 0.467761754989624, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2008 }, { "completion_length": 9.6875, "epoch": 0.35202383038373924, "grad_norm": 13.251755078633238, "kl": 0.1259765625, "learning_rate": 6.481513930261083e-07, "loss": 0.0504, "reward": 1.577303409576416, "reward_std": 0.08948713541030884, "rewards/accuracy_reward_stage2": 0.7023034691810608, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2009 }, { "completion_length": 10.703125, "epoch": 0.35219905379358685, "grad_norm": 19.886765618288138, "kl": 0.18359375, "learning_rate": 6.479761696162607e-07, "loss": 0.0292, "reward": 1.5866138935089111, "reward_std": 0.19587093591690063, "rewards/accuracy_reward_stage2": 0.6022388935089111, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2010 }, { "completion_length": 9.453125, "epoch": 0.3523742772034344, "grad_norm": 13.824131232354484, "kl": 0.10791015625, "learning_rate": 6.478009462064131e-07, "loss": 0.0006, "reward": 1.2880022525787354, "reward_std": 0.2643819749355316, "rewards/accuracy_reward_stage2": 0.3036273121833801, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2011 }, { "completion_length": 12.8125, "epoch": 0.35254950061328194, "grad_norm": 21.65675317994148, "kl": 0.052490234375, "learning_rate": 6.476257227965655e-07, "loss": -0.0368, "reward": 1.6309711933135986, "reward_std": 0.24923476576805115, "rewards/accuracy_reward_stage2": 0.7872211337089539, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2012 }, { "completion_length": 9.296875, "epoch": 0.3527247240231295, "grad_norm": 19.112002940505786, "kl": 0.21875, "learning_rate": 6.47450499386718e-07, "loss": 0.0088, "reward": 1.6558035612106323, "reward_std": 0.27317488193511963, "rewards/accuracy_reward_stage2": 0.6870535612106323, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2013 }, { "completion_length": 5.625, "epoch": 0.35289994743297703, "grad_norm": 18.397807988383157, "kl": 0.06201171875, "learning_rate": 6.472752759768705e-07, "loss": -0.0107, "reward": 1.3729119300842285, "reward_std": 0.20705291628837585, "rewards/accuracy_reward_stage2": 0.38853681087493896, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2014 }, { "completion_length": 6.46875, "epoch": 0.3530751708428246, "grad_norm": 14.86373871431777, "kl": 0.201171875, "learning_rate": 6.471000525670229e-07, "loss": -0.0046, "reward": 1.536039113998413, "reward_std": 0.14704757928848267, "rewards/accuracy_reward_stage2": 0.6922890543937683, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2015 }, { "completion_length": 7.0, "epoch": 0.3532503942526722, "grad_norm": 12.402681811929515, "kl": 0.072265625, "learning_rate": 6.469248291571754e-07, "loss": -0.0154, "reward": 1.7066841125488281, "reward_std": 0.14217713475227356, "rewards/accuracy_reward_stage2": 0.7223089933395386, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2016 }, { "completion_length": 11.3125, "epoch": 0.35342561766251973, "grad_norm": 20.751088060330876, "kl": 0.09228515625, "learning_rate": 6.467496057473279e-07, "loss": 0.037, "reward": 1.7199280261993408, "reward_std": 0.24458998441696167, "rewards/accuracy_reward_stage2": 0.8449280858039856, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2017 }, { "completion_length": 14.578125, "epoch": 0.3536008410723673, "grad_norm": 23.455831102973022, "kl": 0.15234375, "learning_rate": 6.465743823374803e-07, "loss": -0.0263, "reward": 1.6048414707183838, "reward_std": 0.29559487104415894, "rewards/accuracy_reward_stage2": 0.6360914707183838, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2018 }, { "completion_length": 11.46875, "epoch": 0.3537760644822148, "grad_norm": 15.818034624782776, "kl": 0.0537109375, "learning_rate": 6.463991589276328e-07, "loss": 0.0215, "reward": 1.595871925354004, "reward_std": 0.16525067389011383, "rewards/accuracy_reward_stage2": 0.5958719253540039, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2019 }, { "completion_length": 14.140625, "epoch": 0.35395128789206237, "grad_norm": 17.47953156713791, "kl": 0.09765625, "learning_rate": 6.462239355177852e-07, "loss": -0.004, "reward": 1.6608421802520752, "reward_std": 0.19523340463638306, "rewards/accuracy_reward_stage2": 0.6764671802520752, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2020 }, { "completion_length": 9.484375, "epoch": 0.3541265113019099, "grad_norm": 23.25571658748381, "kl": 0.13671875, "learning_rate": 6.460487121079375e-07, "loss": 0.0546, "reward": 1.4660158157348633, "reward_std": 0.38287341594696045, "rewards/accuracy_reward_stage2": 0.4660158157348633, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2021 }, { "completion_length": 6.046875, "epoch": 0.35430173471175747, "grad_norm": 15.16000099952722, "kl": 0.10498046875, "learning_rate": 6.4587348869809e-07, "loss": 0.0066, "reward": 1.6145833730697632, "reward_std": 0.23177990317344666, "rewards/accuracy_reward_stage2": 0.6302083730697632, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2022 }, { "completion_length": 17.296875, "epoch": 0.35447695812160507, "grad_norm": 19.871120104217546, "kl": 0.0517578125, "learning_rate": 6.456982652882424e-07, "loss": -0.0235, "reward": 1.518093466758728, "reward_std": 0.20393085479736328, "rewards/accuracy_reward_stage2": 0.533718466758728, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2023 }, { "completion_length": 9.84375, "epoch": 0.3546521815314526, "grad_norm": 16.326228486631557, "kl": 0.049072265625, "learning_rate": 6.455230418783949e-07, "loss": 0.0196, "reward": 1.6053377389907837, "reward_std": 0.13290469348430634, "rewards/accuracy_reward_stage2": 0.6053377389907837, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2024 }, { "completion_length": 10.578125, "epoch": 0.35482740494130016, "grad_norm": 15.652855871849557, "kl": 0.1875, "learning_rate": 6.453478184685473e-07, "loss": 0.0307, "reward": 1.299550175666809, "reward_std": 0.1480276733636856, "rewards/accuracy_reward_stage2": 0.5651751756668091, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2025 }, { "completion_length": 5.984375, "epoch": 0.3550026283511477, "grad_norm": 17.75877666884954, "kl": 0.10595703125, "learning_rate": 6.451725950586998e-07, "loss": -0.0018, "reward": 1.9087555408477783, "reward_std": 0.18428705632686615, "rewards/accuracy_reward_stage2": 0.9243804812431335, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2026 }, { "completion_length": 7.9375, "epoch": 0.35517785176099526, "grad_norm": 21.16012729432452, "kl": 0.138671875, "learning_rate": 6.449973716488523e-07, "loss": 0.0242, "reward": 1.3229167461395264, "reward_std": 0.31512731313705444, "rewards/accuracy_reward_stage2": 0.3385416865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2027 }, { "completion_length": 8.9375, "epoch": 0.3553530751708428, "grad_norm": 29.08543121262397, "kl": 0.19921875, "learning_rate": 6.448221482390047e-07, "loss": 0.0798, "reward": 1.4590182304382324, "reward_std": 0.21175247430801392, "rewards/accuracy_reward_stage2": 0.5840181708335876, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2028 }, { "completion_length": 7.921875, "epoch": 0.3555282985806904, "grad_norm": 22.709142062225904, "kl": 0.138671875, "learning_rate": 6.446469248291572e-07, "loss": 0.0265, "reward": 1.6932740211486816, "reward_std": 0.2825216054916382, "rewards/accuracy_reward_stage2": 0.7088989019393921, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2029 }, { "completion_length": 12.71875, "epoch": 0.35570352199053795, "grad_norm": 13.691321079425855, "kl": 0.07666015625, "learning_rate": 6.444717014193097e-07, "loss": 0.0023, "reward": 1.057405710220337, "reward_std": 0.08621557056903839, "rewards/accuracy_reward_stage2": 0.4480307996273041, "rewards/format_reward_stage1_pointerpad": 0.609375, "scores/accuracy_reward_stage2": 0.609375, "step": 2030 }, { "completion_length": 7.15625, "epoch": 0.3558787454003855, "grad_norm": 21.772071074721506, "kl": 0.0830078125, "learning_rate": 6.44296478009462e-07, "loss": -0.042, "reward": 1.594986081123352, "reward_std": 0.35093414783477783, "rewards/accuracy_reward_stage2": 0.626236081123352, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2031 }, { "completion_length": 7.9375, "epoch": 0.35605396881023305, "grad_norm": 21.906110366807358, "kl": 0.0947265625, "learning_rate": 6.441212545996145e-07, "loss": 0.0163, "reward": 1.538081407546997, "reward_std": 0.24354253709316254, "rewards/accuracy_reward_stage2": 0.5537062883377075, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2032 }, { "completion_length": 10.59375, "epoch": 0.3562291922200806, "grad_norm": 19.764452082312108, "kl": 0.099609375, "learning_rate": 6.439460311897668e-07, "loss": -0.0043, "reward": 1.6770200729370117, "reward_std": 0.23365044593811035, "rewards/accuracy_reward_stage2": 0.6926450729370117, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2033 }, { "completion_length": 10.0625, "epoch": 0.35640441562992814, "grad_norm": 17.17510404758707, "kl": 0.1435546875, "learning_rate": 6.437708077799193e-07, "loss": -0.0825, "reward": 1.515505313873291, "reward_std": 0.25960028171539307, "rewards/accuracy_reward_stage2": 0.578005313873291, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2034 }, { "completion_length": 9.953125, "epoch": 0.3565796390397757, "grad_norm": 8.404722024682297, "kl": 0.060791015625, "learning_rate": 6.435955843700718e-07, "loss": -0.0198, "reward": 1.578125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2035 }, { "completion_length": 9.734375, "epoch": 0.3567548624496233, "grad_norm": 25.262565696637864, "kl": 0.201171875, "learning_rate": 6.434203609602242e-07, "loss": -0.0703, "reward": 1.5607659816741943, "reward_std": 0.3894246518611908, "rewards/accuracy_reward_stage2": 0.6232660412788391, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2036 }, { "completion_length": 8.875, "epoch": 0.35693008585947084, "grad_norm": 17.54284935941715, "kl": 0.11328125, "learning_rate": 6.432451375503767e-07, "loss": 0.0104, "reward": 1.6565544605255127, "reward_std": 0.14218929409980774, "rewards/accuracy_reward_stage2": 0.6721794605255127, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2037 }, { "completion_length": 10.234375, "epoch": 0.3571053092693184, "grad_norm": 16.722634304165545, "kl": 0.07373046875, "learning_rate": 6.430699141405292e-07, "loss": -0.0147, "reward": 1.6033732891082764, "reward_std": 0.2091490775346756, "rewards/accuracy_reward_stage2": 0.6189983487129211, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2038 }, { "completion_length": 8.078125, "epoch": 0.35728053267916593, "grad_norm": 28.241725109043774, "kl": 0.1826171875, "learning_rate": 6.428946907306816e-07, "loss": 0.0664, "reward": 1.652994155883789, "reward_std": 0.322782039642334, "rewards/accuracy_reward_stage2": 0.6686190366744995, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2039 }, { "completion_length": 10.9375, "epoch": 0.3574557560890135, "grad_norm": 17.023308776735536, "kl": 0.115234375, "learning_rate": 6.427194673208341e-07, "loss": -0.0315, "reward": 1.7313854694366455, "reward_std": 0.2074601650238037, "rewards/accuracy_reward_stage2": 0.762635350227356, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2040 }, { "completion_length": 6.78125, "epoch": 0.357630979498861, "grad_norm": 21.88276224578183, "kl": 0.1611328125, "learning_rate": 6.425442439109864e-07, "loss": -0.0487, "reward": 1.6473379135131836, "reward_std": 0.3075483441352844, "rewards/accuracy_reward_stage2": 0.6942129731178284, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2041 }, { "completion_length": 9.0625, "epoch": 0.3578062029087086, "grad_norm": 22.899850742372728, "kl": 0.04736328125, "learning_rate": 6.423690205011389e-07, "loss": 0.0189, "reward": 1.5322256088256836, "reward_std": 0.13407042622566223, "rewards/accuracy_reward_stage2": 0.5322255492210388, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2042 }, { "completion_length": 6.734375, "epoch": 0.3579814263185562, "grad_norm": 31.881156553500293, "kl": 0.1875, "learning_rate": 6.421937970912914e-07, "loss": 0.0749, "reward": 1.6132948398590088, "reward_std": 0.1358180195093155, "rewards/accuracy_reward_stage2": 0.6132948398590088, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2043 }, { "completion_length": 12.28125, "epoch": 0.3581566497284037, "grad_norm": 20.593066376423288, "kl": 0.06494140625, "learning_rate": 6.420185736814438e-07, "loss": 0.0259, "reward": 1.8108373880386353, "reward_std": 0.1923947036266327, "rewards/accuracy_reward_stage2": 0.8108373880386353, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2044 }, { "completion_length": 7.875, "epoch": 0.35833187313825127, "grad_norm": 20.026577573811554, "kl": 0.2265625, "learning_rate": 6.418433502715963e-07, "loss": 0.0103, "reward": 1.3905614614486694, "reward_std": 0.2854178547859192, "rewards/accuracy_reward_stage2": 0.42181146144866943, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2045 }, { "completion_length": 12.828125, "epoch": 0.3585070965480988, "grad_norm": 20.679212222356824, "kl": 0.1875, "learning_rate": 6.416681268617487e-07, "loss": 0.0394, "reward": 1.506111979484558, "reward_std": 0.41419732570648193, "rewards/accuracy_reward_stage2": 0.5217369794845581, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2046 }, { "completion_length": 10.390625, "epoch": 0.35868231995794636, "grad_norm": 18.808809960877102, "kl": 0.1728515625, "learning_rate": 6.414929034519011e-07, "loss": -0.0093, "reward": 1.079209804534912, "reward_std": 0.29377779364585876, "rewards/accuracy_reward_stage2": 0.37608474493026733, "rewards/format_reward_stage1_pointerpad": 0.703125, "scores/accuracy_reward_stage2": 0.703125, "step": 2047 }, { "completion_length": 7.28125, "epoch": 0.35885754336779396, "grad_norm": 15.818689616065766, "kl": 0.0458984375, "learning_rate": 6.413176800420536e-07, "loss": 0.0183, "reward": 1.321853518486023, "reward_std": 0.20559610426425934, "rewards/accuracy_reward_stage2": 0.32185354828834534, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2048 }, { "completion_length": 10.21875, "epoch": 0.3590327667776415, "grad_norm": 23.13419565702121, "kl": 0.09033203125, "learning_rate": 6.41142456632206e-07, "loss": -0.0523, "reward": 1.5469226837158203, "reward_std": 0.2661864161491394, "rewards/accuracy_reward_stage2": 0.5781725645065308, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2049 }, { "completion_length": 9.8125, "epoch": 0.35920799018748906, "grad_norm": 20.85199762225754, "kl": 0.265625, "learning_rate": 6.409672332223585e-07, "loss": -0.0781, "reward": 1.7552984952926636, "reward_std": 0.37930476665496826, "rewards/accuracy_reward_stage2": 0.8334234952926636, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 2050 }, { "completion_length": 11.328125, "epoch": 0.3593832135973366, "grad_norm": 22.537170988157758, "kl": 0.181640625, "learning_rate": 6.407920098125109e-07, "loss": 0.0028, "reward": 1.5682830810546875, "reward_std": 0.24193453788757324, "rewards/accuracy_reward_stage2": 0.708907961845398, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2051 }, { "completion_length": 28.390625, "epoch": 0.35955843700718415, "grad_norm": 23.631049170169213, "kl": 0.1416015625, "learning_rate": 6.406167864026633e-07, "loss": -0.0653, "reward": 1.147277593612671, "reward_std": 0.26074129343032837, "rewards/accuracy_reward_stage2": 0.3191525340080261, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2052 }, { "completion_length": 8.515625, "epoch": 0.3597336604170317, "grad_norm": 24.54048817390111, "kl": 0.11083984375, "learning_rate": 6.404415629928158e-07, "loss": -0.0462, "reward": 1.5549089908599854, "reward_std": 0.4777141511440277, "rewards/accuracy_reward_stage2": 0.6017839908599854, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2053 }, { "completion_length": 7.734375, "epoch": 0.35990888382687924, "grad_norm": 14.236796163475823, "kl": 0.12255859375, "learning_rate": 6.402663395829683e-07, "loss": -0.0079, "reward": 1.3742108345031738, "reward_std": 0.2040639966726303, "rewards/accuracy_reward_stage2": 0.5304608941078186, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2054 }, { "completion_length": 16.34375, "epoch": 0.36008410723672685, "grad_norm": 15.587263426743462, "kl": 0.083984375, "learning_rate": 6.400911161731207e-07, "loss": 0.0335, "reward": 1.8176294565200806, "reward_std": 0.10821881890296936, "rewards/accuracy_reward_stage2": 0.9426294565200806, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2055 }, { "completion_length": 12.078125, "epoch": 0.3602593306465744, "grad_norm": 21.937235308292358, "kl": 0.125, "learning_rate": 6.399158927632732e-07, "loss": -0.022, "reward": 1.440577745437622, "reward_std": 0.25374239683151245, "rewards/accuracy_reward_stage2": 0.47182780504226685, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2056 }, { "completion_length": 15.578125, "epoch": 0.36043455405642194, "grad_norm": 18.473381578423286, "kl": 0.140625, "learning_rate": 6.397406693534256e-07, "loss": 0.0565, "reward": 1.441630244255066, "reward_std": 0.14738719165325165, "rewards/accuracy_reward_stage2": 0.5666301846504211, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2057 }, { "completion_length": 11.765625, "epoch": 0.3606097774662695, "grad_norm": 18.322017197253313, "kl": 0.150390625, "learning_rate": 6.395654459435781e-07, "loss": 0.0186, "reward": 0.9354975819587708, "reward_std": 0.25476324558258057, "rewards/accuracy_reward_stage2": 0.34174755215644836, "rewards/format_reward_stage1_pointerpad": 0.59375, "scores/accuracy_reward_stage2": 0.59375, "step": 2058 }, { "completion_length": 8.046875, "epoch": 0.36078500087611703, "grad_norm": 12.411016991104368, "kl": 0.04150390625, "learning_rate": 6.393902225337305e-07, "loss": 0.0165, "reward": 1.748430848121643, "reward_std": 0.10755133628845215, "rewards/accuracy_reward_stage2": 0.7484308481216431, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2059 }, { "completion_length": 21.53125, "epoch": 0.3609602242859646, "grad_norm": 25.542296675136917, "kl": 0.13671875, "learning_rate": 6.392149991238828e-07, "loss": 0.0033, "reward": 1.7117555141448975, "reward_std": 0.3055155873298645, "rewards/accuracy_reward_stage2": 0.7430053949356079, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2060 }, { "completion_length": 11.125, "epoch": 0.3611354476958122, "grad_norm": 19.080721242770668, "kl": 0.1513671875, "learning_rate": 6.390397757140353e-07, "loss": -0.0493, "reward": 1.5307211875915527, "reward_std": 0.23762869834899902, "rewards/accuracy_reward_stage2": 0.7025961875915527, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2061 }, { "completion_length": 8.171875, "epoch": 0.36131067110565973, "grad_norm": 24.843716465626326, "kl": 0.173828125, "learning_rate": 6.388645523041878e-07, "loss": 0.0194, "reward": 1.4921178817749023, "reward_std": 0.3609883189201355, "rewards/accuracy_reward_stage2": 0.5233679413795471, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2062 }, { "completion_length": 16.453125, "epoch": 0.3614858945155073, "grad_norm": 17.65292521561761, "kl": 0.11474609375, "learning_rate": 6.386893288943402e-07, "loss": 0.0458, "reward": 1.544228434562683, "reward_std": 0.16590997576713562, "rewards/accuracy_reward_stage2": 0.6692283749580383, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2063 }, { "completion_length": 8.0, "epoch": 0.3616611179253548, "grad_norm": 19.760964980028835, "kl": 0.1259765625, "learning_rate": 6.385141054844927e-07, "loss": 0.0063, "reward": 1.652033805847168, "reward_std": 0.19817985594272614, "rewards/accuracy_reward_stage2": 0.6676587462425232, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2064 }, { "completion_length": 8.109375, "epoch": 0.36183634133520237, "grad_norm": 15.99382134890339, "kl": 0.03125, "learning_rate": 6.383388820746451e-07, "loss": 0.0125, "reward": 1.565201997756958, "reward_std": 0.10470834374427795, "rewards/accuracy_reward_stage2": 0.565201997756958, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2065 }, { "completion_length": 11.234375, "epoch": 0.3620115647450499, "grad_norm": 19.134680179647805, "kl": 0.0322265625, "learning_rate": 6.381636586647976e-07, "loss": 0.0129, "reward": 1.2855641841888428, "reward_std": 0.1369982659816742, "rewards/accuracy_reward_stage2": 0.4105641841888428, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2066 }, { "completion_length": 12.953125, "epoch": 0.3621867881548975, "grad_norm": 17.13585526565401, "kl": 0.07861328125, "learning_rate": 6.379884352549501e-07, "loss": 0.0148, "reward": 1.3759901523590088, "reward_std": 0.1790485382080078, "rewards/accuracy_reward_stage2": 0.516615092754364, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2067 }, { "completion_length": 9.140625, "epoch": 0.36236201156474507, "grad_norm": 19.919764455309917, "kl": 0.1533203125, "learning_rate": 6.378132118451025e-07, "loss": -0.0473, "reward": 1.5150220394134521, "reward_std": 0.2945772409439087, "rewards/accuracy_reward_stage2": 0.5618971586227417, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2068 }, { "completion_length": 10.71875, "epoch": 0.3625372349745926, "grad_norm": 20.46268376467591, "kl": 0.1259765625, "learning_rate": 6.37637988435255e-07, "loss": 0.019, "reward": 1.4374001026153564, "reward_std": 0.30063989758491516, "rewards/accuracy_reward_stage2": 0.4530249834060669, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2069 }, { "completion_length": 11.6875, "epoch": 0.36271245838444016, "grad_norm": 19.75936756902344, "kl": 0.11181640625, "learning_rate": 6.374627650254075e-07, "loss": 0.0447, "reward": 1.327678918838501, "reward_std": 0.17409127950668335, "rewards/accuracy_reward_stage2": 0.4526788890361786, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2070 }, { "completion_length": 8.765625, "epoch": 0.3628876817942877, "grad_norm": 22.704894291915178, "kl": 0.2451171875, "learning_rate": 6.372875416155598e-07, "loss": -0.0568, "reward": 1.6028995513916016, "reward_std": 0.42264601588249207, "rewards/accuracy_reward_stage2": 0.6810245513916016, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 2071 }, { "completion_length": 15.046875, "epoch": 0.36306290520413526, "grad_norm": 22.1590454237176, "kl": 0.1474609375, "learning_rate": 6.371123182057122e-07, "loss": 0.0625, "reward": 1.362949013710022, "reward_std": 0.21775904297828674, "rewards/accuracy_reward_stage2": 0.503574013710022, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2072 }, { "completion_length": 12.078125, "epoch": 0.3632381286139828, "grad_norm": 18.402322297906913, "kl": 0.1318359375, "learning_rate": 6.369370947958646e-07, "loss": -0.1177, "reward": 1.6029870510101318, "reward_std": 0.27655163407325745, "rewards/accuracy_reward_stage2": 0.6654870510101318, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2073 }, { "completion_length": 9.53125, "epoch": 0.3634133520238304, "grad_norm": 14.489973679958617, "kl": 0.138671875, "learning_rate": 6.367618713860171e-07, "loss": -0.0287, "reward": 1.4545139074325562, "reward_std": 0.25849562883377075, "rewards/accuracy_reward_stage2": 0.6107639074325562, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2074 }, { "completion_length": 9.328125, "epoch": 0.36358857543367795, "grad_norm": 15.909096721212883, "kl": 0.099609375, "learning_rate": 6.365866479761696e-07, "loss": -0.0043, "reward": 1.6145410537719727, "reward_std": 0.09930635988712311, "rewards/accuracy_reward_stage2": 0.6301660537719727, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2075 }, { "completion_length": 11.71875, "epoch": 0.3637637988435255, "grad_norm": 23.310846273565204, "kl": 0.2236328125, "learning_rate": 6.36411424566322e-07, "loss": 0.0079, "reward": 1.6138209104537964, "reward_std": 0.2763480246067047, "rewards/accuracy_reward_stage2": 0.6606959104537964, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2076 }, { "completion_length": 8.59375, "epoch": 0.36393902225337305, "grad_norm": 19.359891452547277, "kl": 0.0732421875, "learning_rate": 6.362362011564745e-07, "loss": 0.0293, "reward": 1.501549243927002, "reward_std": 0.20947618782520294, "rewards/accuracy_reward_stage2": 0.5015493631362915, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2077 }, { "completion_length": 8.0, "epoch": 0.3641142456632206, "grad_norm": 55.484372617544224, "kl": 0.3359375, "learning_rate": 6.36060977746627e-07, "loss": 0.1337, "reward": 1.5378704071044922, "reward_std": 0.2450861930847168, "rewards/accuracy_reward_stage2": 0.6628704071044922, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2078 }, { "completion_length": 11.375, "epoch": 0.36428946907306814, "grad_norm": 16.416779187832226, "kl": 0.0181884765625, "learning_rate": 6.358857543367794e-07, "loss": 0.0073, "reward": 1.8072917461395264, "reward_std": 0.1814829707145691, "rewards/accuracy_reward_stage2": 0.8072916865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2079 }, { "completion_length": 10.890625, "epoch": 0.36446469248291574, "grad_norm": 16.328709056063502, "kl": 0.12353515625, "learning_rate": 6.357105309269319e-07, "loss": 0.0198, "reward": 1.7240674495697021, "reward_std": 0.18539977073669434, "rewards/accuracy_reward_stage2": 0.8646925091743469, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2080 }, { "completion_length": 8.96875, "epoch": 0.3646399158927633, "grad_norm": 15.053467686996736, "kl": 0.10546875, "learning_rate": 6.355353075170842e-07, "loss": -0.0015, "reward": 1.6101465225219727, "reward_std": 0.16715562343597412, "rewards/accuracy_reward_stage2": 0.6257715225219727, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2081 }, { "completion_length": 10.75, "epoch": 0.36481513930261084, "grad_norm": 18.342628051433906, "kl": 0.09912109375, "learning_rate": 6.353600841072367e-07, "loss": 0.0005, "reward": 1.6331281661987305, "reward_std": 0.16434608399868011, "rewards/accuracy_reward_stage2": 0.6487530469894409, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2082 }, { "completion_length": 10.359375, "epoch": 0.3649903627124584, "grad_norm": 20.182663981829865, "kl": 0.03466796875, "learning_rate": 6.351848606973892e-07, "loss": 0.0265, "reward": 1.6030964851379395, "reward_std": 0.13247910141944885, "rewards/accuracy_reward_stage2": 0.8530964255332947, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2083 }, { "completion_length": 7.984375, "epoch": 0.36516558612230593, "grad_norm": 15.458876474291781, "kl": 0.0189208984375, "learning_rate": 6.350096372875415e-07, "loss": 0.0076, "reward": 1.7303240299224854, "reward_std": 0.23356689512729645, "rewards/accuracy_reward_stage2": 0.7303240895271301, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2084 }, { "completion_length": 9.46875, "epoch": 0.3653408095321535, "grad_norm": 20.7766939779222, "kl": 0.1767578125, "learning_rate": 6.34834413877694e-07, "loss": 0.0551, "reward": 1.4968338012695312, "reward_std": 0.19729726016521454, "rewards/accuracy_reward_stage2": 0.6374588012695312, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2085 }, { "completion_length": 10.96875, "epoch": 0.3655160329420011, "grad_norm": 17.868613252036152, "kl": 0.1240234375, "learning_rate": 6.346591904678465e-07, "loss": 0.0182, "reward": 1.673816204071045, "reward_std": 0.23597976565361023, "rewards/accuracy_reward_stage2": 0.6894412040710449, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2086 }, { "completion_length": 7.734375, "epoch": 0.3656912563518486, "grad_norm": 17.78504110129761, "kl": 0.0216064453125, "learning_rate": 6.344839670579989e-07, "loss": 0.0086, "reward": 1.8547598123550415, "reward_std": 0.04631902277469635, "rewards/accuracy_reward_stage2": 0.8547598123550415, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2087 }, { "completion_length": 19.875, "epoch": 0.3658664797616962, "grad_norm": 17.125101728063743, "kl": 0.0162353515625, "learning_rate": 6.343087436481514e-07, "loss": 0.0065, "reward": 1.5940463542938232, "reward_std": 0.19404737651348114, "rewards/accuracy_reward_stage2": 0.594046413898468, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2088 }, { "completion_length": 15.6875, "epoch": 0.3660417031715437, "grad_norm": 23.482317824072272, "kl": 0.08837890625, "learning_rate": 6.341335202383038e-07, "loss": 0.0354, "reward": 1.4581522941589355, "reward_std": 0.21087868511676788, "rewards/accuracy_reward_stage2": 0.45815223455429077, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2089 }, { "completion_length": 8.03125, "epoch": 0.36621692658139127, "grad_norm": 13.199062968702936, "kl": 0.03125, "learning_rate": 6.339582968284563e-07, "loss": 0.0125, "reward": 1.7291667461395264, "reward_std": 0.1836046278476715, "rewards/accuracy_reward_stage2": 0.7291666865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2090 }, { "completion_length": 9.40625, "epoch": 0.3663921499912388, "grad_norm": 16.858039673473282, "kl": 0.13671875, "learning_rate": 6.337830734186087e-07, "loss": 0.013, "reward": 1.5398142337799072, "reward_std": 0.21081207692623138, "rewards/accuracy_reward_stage2": 0.5554392337799072, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2091 }, { "completion_length": 8.578125, "epoch": 0.36656737340108636, "grad_norm": 18.827872921303243, "kl": 0.06884765625, "learning_rate": 6.336078500087611e-07, "loss": 0.0274, "reward": 1.4628106355667114, "reward_std": 0.23339983820915222, "rewards/accuracy_reward_stage2": 0.5878106355667114, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2092 }, { "completion_length": 6.765625, "epoch": 0.36674259681093396, "grad_norm": 14.330653120833654, "kl": 0.0234375, "learning_rate": 6.334326265989136e-07, "loss": 0.0094, "reward": 1.6666667461395264, "reward_std": 0.15343135595321655, "rewards/accuracy_reward_stage2": 0.6666666269302368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2093 }, { "completion_length": 9.015625, "epoch": 0.3669178202207815, "grad_norm": 20.552324553426114, "kl": 0.1357421875, "learning_rate": 6.332574031890661e-07, "loss": 0.0543, "reward": 1.632039189338684, "reward_std": 0.24369873106479645, "rewards/accuracy_reward_stage2": 0.6320391297340393, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2094 }, { "completion_length": 17.640625, "epoch": 0.36709304363062906, "grad_norm": 20.42833604517789, "kl": 0.10693359375, "learning_rate": 6.330821797792185e-07, "loss": 0.0427, "reward": 1.4707213640213013, "reward_std": 0.18929457664489746, "rewards/accuracy_reward_stage2": 0.47072139382362366, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2095 }, { "completion_length": 9.3125, "epoch": 0.3672682670404766, "grad_norm": 19.495544878255846, "kl": 0.08349609375, "learning_rate": 6.32906956369371e-07, "loss": -0.0279, "reward": 1.6371318101882935, "reward_std": 0.19539067149162292, "rewards/accuracy_reward_stage2": 0.6683818101882935, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2096 }, { "completion_length": 9.859375, "epoch": 0.36744349045032415, "grad_norm": 24.742390078521236, "kl": 0.30859375, "learning_rate": 6.327317329595233e-07, "loss": 0.1236, "reward": 1.5489616394042969, "reward_std": 0.14816182851791382, "rewards/accuracy_reward_stage2": 0.6739615201950073, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2097 }, { "completion_length": 7.109375, "epoch": 0.3676187138601717, "grad_norm": 17.907769375524044, "kl": 0.06787109375, "learning_rate": 6.325565095496758e-07, "loss": -0.0171, "reward": 1.6017649173736572, "reward_std": 0.18081702291965485, "rewards/accuracy_reward_stage2": 0.6173898577690125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2098 }, { "completion_length": 8.96875, "epoch": 0.3677939372700193, "grad_norm": 10.834800917825248, "kl": 0.09765625, "learning_rate": 6.323812861398283e-07, "loss": -0.005, "reward": 1.584733009338379, "reward_std": 0.10907712578773499, "rewards/accuracy_reward_stage2": 0.6003579497337341, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2099 }, { "completion_length": 12.34375, "epoch": 0.36796916067986685, "grad_norm": 19.662688636320638, "kl": 0.050048828125, "learning_rate": 6.322060627299806e-07, "loss": -0.0131, "reward": 1.6531250476837158, "reward_std": 0.3001001179218292, "rewards/accuracy_reward_stage2": 0.668749988079071, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2100 }, { "completion_length": 10.609375, "epoch": 0.3681443840897144, "grad_norm": 24.23833484560236, "kl": 0.34765625, "learning_rate": 6.320308393201331e-07, "loss": 0.1391, "reward": 1.312018871307373, "reward_std": 0.26352953910827637, "rewards/accuracy_reward_stage2": 0.43701881170272827, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2101 }, { "completion_length": 4.859375, "epoch": 0.36831960749956194, "grad_norm": 14.376870930324024, "kl": 0.08740234375, "learning_rate": 6.318556159102855e-07, "loss": -0.0093, "reward": 1.8119255304336548, "reward_std": 0.12499181926250458, "rewards/accuracy_reward_stage2": 0.8275505304336548, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2102 }, { "completion_length": 18.75, "epoch": 0.3684948309094095, "grad_norm": 17.023075703116874, "kl": 0.023681640625, "learning_rate": 6.31680392500438e-07, "loss": -0.0343, "reward": 1.6704235076904297, "reward_std": 0.2026294320821762, "rewards/accuracy_reward_stage2": 0.6860485076904297, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2103 }, { "completion_length": 9.796875, "epoch": 0.36867005431925703, "grad_norm": 18.01675915994742, "kl": 0.11328125, "learning_rate": 6.315051690905905e-07, "loss": 0.0011, "reward": 1.657438039779663, "reward_std": 0.21235749125480652, "rewards/accuracy_reward_stage2": 0.6730630397796631, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2104 }, { "completion_length": 11.578125, "epoch": 0.3688452777291046, "grad_norm": 20.560044080024486, "kl": 0.15625, "learning_rate": 6.313299456807429e-07, "loss": 0.0184, "reward": 1.146272897720337, "reward_std": 0.21671149134635925, "rewards/accuracy_reward_stage2": 0.2868978977203369, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2105 }, { "completion_length": 9.4375, "epoch": 0.3690205011389522, "grad_norm": 18.96185128228613, "kl": 0.07958984375, "learning_rate": 6.311547222708954e-07, "loss": -0.0178, "reward": 1.6386375427246094, "reward_std": 0.23830300569534302, "rewards/accuracy_reward_stage2": 0.6698874235153198, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2106 }, { "completion_length": 8.796875, "epoch": 0.36919572454879973, "grad_norm": 19.016078689475286, "kl": 0.076171875, "learning_rate": 6.309794988610479e-07, "loss": 0.0304, "reward": 1.6456325054168701, "reward_std": 0.12463878095149994, "rewards/accuracy_reward_stage2": 0.6456325054168701, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2107 }, { "completion_length": 10.734375, "epoch": 0.3693709479586473, "grad_norm": 16.740065149883968, "kl": 0.0294189453125, "learning_rate": 6.308042754512003e-07, "loss": 0.0118, "reward": 1.7477238178253174, "reward_std": 0.10108815133571625, "rewards/accuracy_reward_stage2": 0.7477236986160278, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2108 }, { "completion_length": 12.078125, "epoch": 0.3695461713684948, "grad_norm": 24.223329981252363, "kl": 0.061279296875, "learning_rate": 6.306290520413528e-07, "loss": 0.0245, "reward": 1.6649169921875, "reward_std": 0.1374625563621521, "rewards/accuracy_reward_stage2": 0.6649170517921448, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2109 }, { "completion_length": 9.71875, "epoch": 0.36972139477834237, "grad_norm": 16.487067073275398, "kl": 0.1171875, "learning_rate": 6.30453828631505e-07, "loss": 0.0027, "reward": 1.557640790939331, "reward_std": 0.22457411885261536, "rewards/accuracy_reward_stage2": 0.573265790939331, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2110 }, { "completion_length": 8.796875, "epoch": 0.3698966181881899, "grad_norm": 21.62888433471808, "kl": 0.140625, "learning_rate": 6.302786052216575e-07, "loss": 0.0561, "reward": 1.6204335689544678, "reward_std": 0.22641383111476898, "rewards/accuracy_reward_stage2": 0.6204336881637573, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2111 }, { "completion_length": 11.328125, "epoch": 0.3700718415980375, "grad_norm": 16.87788815845728, "kl": 0.0078125, "learning_rate": 6.3010338181181e-07, "loss": 0.0031, "reward": 1.8675432205200195, "reward_std": 0.11035715043544769, "rewards/accuracy_reward_stage2": 0.8675432205200195, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2112 }, { "completion_length": 9.4375, "epoch": 0.37024706500788507, "grad_norm": 24.39768532767775, "kl": 0.2431640625, "learning_rate": 6.299281584019624e-07, "loss": 0.0091, "reward": 1.415531873703003, "reward_std": 0.25741642713546753, "rewards/accuracy_reward_stage2": 0.5717819929122925, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2113 }, { "completion_length": 21.625, "epoch": 0.3704222884177326, "grad_norm": 15.837403285224246, "kl": 0.06884765625, "learning_rate": 6.297529349921149e-07, "loss": 0.0276, "reward": 1.5362218618392944, "reward_std": 0.1085284948348999, "rewards/accuracy_reward_stage2": 0.6612218022346497, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2114 }, { "completion_length": 12.28125, "epoch": 0.37059751182758016, "grad_norm": 13.917895664990166, "kl": 0.142578125, "learning_rate": 6.295777115822674e-07, "loss": 0.032, "reward": 1.4274253845214844, "reward_std": 0.17158064246177673, "rewards/accuracy_reward_stage2": 0.5680503845214844, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2115 }, { "completion_length": 9.8125, "epoch": 0.3707727352374277, "grad_norm": 16.353481146815447, "kl": 0.326171875, "learning_rate": 6.294024881724198e-07, "loss": 0.1303, "reward": 1.4166667461395264, "reward_std": 0.11135885119438171, "rewards/accuracy_reward_stage2": 0.6666666269302368, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2116 }, { "completion_length": 12.40625, "epoch": 0.37094795864727526, "grad_norm": 23.075007096362388, "kl": 0.20703125, "learning_rate": 6.292272647625723e-07, "loss": 0.0783, "reward": 1.3603246212005615, "reward_std": 0.14933273196220398, "rewards/accuracy_reward_stage2": 0.6103246808052063, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2117 }, { "completion_length": 8.34375, "epoch": 0.37112318205712286, "grad_norm": 24.00052084623189, "kl": 0.140625, "learning_rate": 6.290520413527247e-07, "loss": 0.003, "reward": 1.4961320161819458, "reward_std": 0.32098907232284546, "rewards/accuracy_reward_stage2": 0.5273820161819458, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2118 }, { "completion_length": 9.078125, "epoch": 0.3712984054669704, "grad_norm": 20.123218248950387, "kl": 0.32421875, "learning_rate": 6.288768179428772e-07, "loss": 0.1298, "reward": 1.5326968431472778, "reward_std": 0.24906522035598755, "rewards/accuracy_reward_stage2": 0.6576968431472778, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2119 }, { "completion_length": 9.203125, "epoch": 0.37147362887681795, "grad_norm": 18.376972007455322, "kl": 0.12890625, "learning_rate": 6.287015945330297e-07, "loss": 0.0123, "reward": 1.8160606622695923, "reward_std": 0.270508348941803, "rewards/accuracy_reward_stage2": 0.8316856026649475, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2120 }, { "completion_length": 12.5, "epoch": 0.3716488522866655, "grad_norm": 22.271447130988797, "kl": 0.109375, "learning_rate": 6.28526371123182e-07, "loss": 0.0311, "reward": 1.5290179252624512, "reward_std": 0.22298547625541687, "rewards/accuracy_reward_stage2": 0.6696428656578064, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2121 }, { "completion_length": 10.65625, "epoch": 0.37182407569651305, "grad_norm": 19.405882578543313, "kl": 0.126953125, "learning_rate": 6.283511477133345e-07, "loss": -0.0267, "reward": 1.306645154953003, "reward_std": 0.23632827401161194, "rewards/accuracy_reward_stage2": 0.3378952145576477, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2122 }, { "completion_length": 16.796875, "epoch": 0.3719992991063606, "grad_norm": 20.1689736986702, "kl": 0.053955078125, "learning_rate": 6.281759243034869e-07, "loss": 0.0215, "reward": 1.6409006118774414, "reward_std": 0.14495626091957092, "rewards/accuracy_reward_stage2": 0.6409005522727966, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2123 }, { "completion_length": 7.40625, "epoch": 0.37217452251620814, "grad_norm": 22.532524076188334, "kl": 0.12890625, "learning_rate": 6.280007008936393e-07, "loss": 0.0515, "reward": 1.6531740427017212, "reward_std": 0.21819378435611725, "rewards/accuracy_reward_stage2": 0.6531739830970764, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2124 }, { "completion_length": 7.8125, "epoch": 0.37234974592605574, "grad_norm": 16.457651486905775, "kl": 0.0751953125, "learning_rate": 6.278254774837918e-07, "loss": -0.014, "reward": 1.547957181930542, "reward_std": 0.13146492838859558, "rewards/accuracy_reward_stage2": 0.563582181930542, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2125 }, { "completion_length": 15.34375, "epoch": 0.3725249693359033, "grad_norm": 64.28172636474491, "kl": 0.494140625, "learning_rate": 6.276502540739442e-07, "loss": 0.1979, "reward": 1.3385417461395264, "reward_std": 0.19351094961166382, "rewards/accuracy_reward_stage2": 0.5885416269302368, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2126 }, { "completion_length": 11.328125, "epoch": 0.37270019274575084, "grad_norm": 18.082131358708025, "kl": 0.08251953125, "learning_rate": 6.274750306640967e-07, "loss": 0.001, "reward": 1.559586763381958, "reward_std": 0.1725788712501526, "rewards/accuracy_reward_stage2": 0.575211763381958, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2127 }, { "completion_length": 9.84375, "epoch": 0.3728754161555984, "grad_norm": 20.991156359291157, "kl": 0.1875, "learning_rate": 6.272998072542492e-07, "loss": -0.0102, "reward": 1.5073506832122803, "reward_std": 0.23045286536216736, "rewards/accuracy_reward_stage2": 0.5386006236076355, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2128 }, { "completion_length": 10.84375, "epoch": 0.37305063956544593, "grad_norm": 14.329424297893015, "kl": 0.04052734375, "learning_rate": 6.271245838444016e-07, "loss": -0.028, "reward": 1.6722835302352905, "reward_std": 0.19924761354923248, "rewards/accuracy_reward_stage2": 0.6879085302352905, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2129 }, { "completion_length": 13.828125, "epoch": 0.3732258629752935, "grad_norm": 21.446783046742176, "kl": 0.306640625, "learning_rate": 6.26949360434554e-07, "loss": 0.1223, "reward": 1.3281686305999756, "reward_std": 0.2296711653470993, "rewards/accuracy_reward_stage2": 0.453168660402298, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2130 }, { "completion_length": 7.484375, "epoch": 0.3734010863851411, "grad_norm": 17.776532396212232, "kl": 0.05615234375, "learning_rate": 6.267741370247065e-07, "loss": -0.0217, "reward": 1.695128321647644, "reward_std": 0.215063214302063, "rewards/accuracy_reward_stage2": 0.710753321647644, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2131 }, { "completion_length": 10.359375, "epoch": 0.3735763097949886, "grad_norm": 16.999543337441377, "kl": 0.11376953125, "learning_rate": 6.265989136148589e-07, "loss": 0.0013, "reward": 1.6943080425262451, "reward_std": 0.15610839426517487, "rewards/accuracy_reward_stage2": 0.7099331021308899, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2132 }, { "completion_length": 8.921875, "epoch": 0.3737515332048362, "grad_norm": 16.62115120629419, "kl": 0.111328125, "learning_rate": 6.264236902050114e-07, "loss": 0.0154, "reward": 1.5700395107269287, "reward_std": 0.18041619658470154, "rewards/accuracy_reward_stage2": 0.5856645107269287, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2133 }, { "completion_length": 9.5625, "epoch": 0.3739267566146837, "grad_norm": 24.958913260953675, "kl": 0.296875, "learning_rate": 6.262484667951638e-07, "loss": 0.1185, "reward": 1.5284466743469238, "reward_std": 0.27969932556152344, "rewards/accuracy_reward_stage2": 0.6534466743469238, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2134 }, { "completion_length": 10.578125, "epoch": 0.37410198002453127, "grad_norm": 19.010121391563455, "kl": 0.09130859375, "learning_rate": 6.260732433853162e-07, "loss": -0.0067, "reward": 1.3212120532989502, "reward_std": 0.1627998948097229, "rewards/accuracy_reward_stage2": 0.4618370831012726, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2135 }, { "completion_length": 9.59375, "epoch": 0.3742772034343788, "grad_norm": 15.576660675742353, "kl": 0.0498046875, "learning_rate": 6.258980199754687e-07, "loss": 0.0199, "reward": 1.5728716850280762, "reward_std": 0.17854374647140503, "rewards/accuracy_reward_stage2": 0.6978715658187866, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2136 }, { "completion_length": 9.90625, "epoch": 0.3744524268442264, "grad_norm": 19.01937287035376, "kl": 0.10107421875, "learning_rate": 6.257227965656211e-07, "loss": -0.0038, "reward": 1.3226069211959839, "reward_std": 0.3287656605243683, "rewards/accuracy_reward_stage2": 0.3382318615913391, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2137 }, { "completion_length": 10.28125, "epoch": 0.37462765025407396, "grad_norm": 17.223905236511207, "kl": 0.1328125, "learning_rate": 6.255475731557736e-07, "loss": 0.0092, "reward": 1.548721194267273, "reward_std": 0.196788489818573, "rewards/accuracy_reward_stage2": 0.5643461346626282, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2138 }, { "completion_length": 11.109375, "epoch": 0.3748028736639215, "grad_norm": 18.440782368725525, "kl": 0.09716796875, "learning_rate": 6.253723497459261e-07, "loss": -0.0052, "reward": 1.5827999114990234, "reward_std": 0.22741162776947021, "rewards/accuracy_reward_stage2": 0.5984249114990234, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2139 }, { "completion_length": 9.546875, "epoch": 0.37497809707376906, "grad_norm": 21.842367934370266, "kl": 0.224609375, "learning_rate": 6.251971263360784e-07, "loss": 0.0565, "reward": 1.41269850730896, "reward_std": 0.2589086890220642, "rewards/accuracy_reward_stage2": 0.5533234477043152, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2140 }, { "completion_length": 13.734375, "epoch": 0.3751533204836166, "grad_norm": 23.19318360447688, "kl": 0.11474609375, "learning_rate": 6.250219029262309e-07, "loss": 0.0458, "reward": 1.5469526052474976, "reward_std": 0.23762944340705872, "rewards/accuracy_reward_stage2": 0.5469525456428528, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2141 }, { "completion_length": 13.078125, "epoch": 0.37532854389346415, "grad_norm": 20.666520109212396, "kl": 0.10498046875, "learning_rate": 6.248466795163833e-07, "loss": -0.0003, "reward": 1.459972858428955, "reward_std": 0.26203304529190063, "rewards/accuracy_reward_stage2": 0.4755978584289551, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2142 }, { "completion_length": 11.203125, "epoch": 0.3755037673033117, "grad_norm": 61.93263278879373, "kl": 0.48046875, "learning_rate": 6.246714561065358e-07, "loss": 0.1529, "reward": 1.3742291927337646, "reward_std": 0.282656192779541, "rewards/accuracy_reward_stage2": 0.3898541331291199, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2143 }, { "completion_length": 7.984375, "epoch": 0.3756789907131593, "grad_norm": 15.307871940423698, "kl": 0.016357421875, "learning_rate": 6.244962326966883e-07, "loss": 0.0066, "reward": 1.7660094499588013, "reward_std": 0.20985843241214752, "rewards/accuracy_reward_stage2": 0.7660094499588013, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2144 }, { "completion_length": 13.625, "epoch": 0.37585421412300685, "grad_norm": 17.61172102911123, "kl": 0.1416015625, "learning_rate": 6.243210092868407e-07, "loss": -0.0297, "reward": 1.2838246822357178, "reward_std": 0.20607344806194305, "rewards/accuracy_reward_stage2": 0.44007477164268494, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2145 }, { "completion_length": 20.0, "epoch": 0.3760294375328544, "grad_norm": 21.1168053036622, "kl": 0.058837890625, "learning_rate": 6.241457858769932e-07, "loss": 0.0236, "reward": 1.6720213890075684, "reward_std": 0.15761704742908478, "rewards/accuracy_reward_stage2": 0.6720214486122131, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2146 }, { "completion_length": 7.796875, "epoch": 0.37620466094270194, "grad_norm": 19.460637478896743, "kl": 0.060791015625, "learning_rate": 6.239705624671457e-07, "loss": -0.0062, "reward": 1.7685046195983887, "reward_std": 0.25962045788764954, "rewards/accuracy_reward_stage2": 0.7841296792030334, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2147 }, { "completion_length": 9.984375, "epoch": 0.3763798843525495, "grad_norm": 15.049930214687446, "kl": 0.10009765625, "learning_rate": 6.23795339057298e-07, "loss": -0.0192, "reward": 1.5843532085418701, "reward_std": 0.18276052176952362, "rewards/accuracy_reward_stage2": 0.6156031489372253, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2148 }, { "completion_length": 12.6875, "epoch": 0.37655510776239703, "grad_norm": 32.813654287269, "kl": 0.27734375, "learning_rate": 6.236201156474505e-07, "loss": 0.0208, "reward": 1.2662172317504883, "reward_std": 0.28443384170532227, "rewards/accuracy_reward_stage2": 0.5474672317504883, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2149 }, { "completion_length": 16.234375, "epoch": 0.37673033117224464, "grad_norm": 21.62062817649044, "kl": 0.1865234375, "learning_rate": 6.234448922376028e-07, "loss": 0.0533, "reward": 1.4966247081756592, "reward_std": 0.3166399598121643, "rewards/accuracy_reward_stage2": 0.5122496485710144, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2150 }, { "completion_length": 11.390625, "epoch": 0.3769055545820922, "grad_norm": 34.291704019048645, "kl": 0.21875, "learning_rate": 6.232696688277553e-07, "loss": 0.0125, "reward": 1.5817184448242188, "reward_std": 0.2617461681365967, "rewards/accuracy_reward_stage2": 0.6129684448242188, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2151 }, { "completion_length": 5.9375, "epoch": 0.37708077799193973, "grad_norm": 18.933549522426574, "kl": 0.11572265625, "learning_rate": 6.230944454179078e-07, "loss": 0.0109, "reward": 1.6863667964935303, "reward_std": 0.24173963069915771, "rewards/accuracy_reward_stage2": 0.7019917964935303, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2152 }, { "completion_length": 12.671875, "epoch": 0.3772560014017873, "grad_norm": 20.28812624382422, "kl": 0.1416015625, "learning_rate": 6.229192220080602e-07, "loss": -0.0256, "reward": 1.601873517036438, "reward_std": 0.15559428930282593, "rewards/accuracy_reward_stage2": 0.633123517036438, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2153 }, { "completion_length": 6.96875, "epoch": 0.3774312248116348, "grad_norm": 20.570384731892993, "kl": 0.302734375, "learning_rate": 6.227439985982127e-07, "loss": 0.0875, "reward": 1.512142300605774, "reward_std": 0.22807812690734863, "rewards/accuracy_reward_stage2": 0.6527671813964844, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2154 }, { "completion_length": 11.296875, "epoch": 0.37760644822148237, "grad_norm": 22.071858525335955, "kl": 0.1376953125, "learning_rate": 6.225687751883652e-07, "loss": 0.0133, "reward": 1.5736531019210815, "reward_std": 0.26310211420059204, "rewards/accuracy_reward_stage2": 0.7142781615257263, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2155 }, { "completion_length": 10.296875, "epoch": 0.3777816716313299, "grad_norm": 18.90253121748625, "kl": 0.038330078125, "learning_rate": 6.223935517785176e-07, "loss": 0.0153, "reward": 1.4485113620758057, "reward_std": 0.20023290812969208, "rewards/accuracy_reward_stage2": 0.4485113024711609, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2156 }, { "completion_length": 8.4375, "epoch": 0.3779568950411775, "grad_norm": 14.504422161072734, "kl": 0.0693359375, "learning_rate": 6.222183283686701e-07, "loss": -0.0536, "reward": 1.6381034851074219, "reward_std": 0.19941505789756775, "rewards/accuracy_reward_stage2": 0.7943534851074219, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2157 }, { "completion_length": 10.375, "epoch": 0.37813211845102507, "grad_norm": 15.329958007937897, "kl": 0.0986328125, "learning_rate": 6.220431049588225e-07, "loss": -0.0046, "reward": 1.8430249691009521, "reward_std": 0.17351368069648743, "rewards/accuracy_reward_stage2": 0.8586499691009521, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2158 }, { "completion_length": 12.71875, "epoch": 0.3783073418608726, "grad_norm": 15.56534467237157, "kl": 0.04296875, "learning_rate": 6.21867881548975e-07, "loss": 0.0171, "reward": 1.3171195983886719, "reward_std": 0.21159344911575317, "rewards/accuracy_reward_stage2": 0.44211962819099426, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2159 }, { "completion_length": 9.0625, "epoch": 0.37848256527072016, "grad_norm": 48.364954367785906, "kl": 0.0859375, "learning_rate": 6.216926581391274e-07, "loss": 0.0343, "reward": 1.4293932914733887, "reward_std": 0.2702651023864746, "rewards/accuracy_reward_stage2": 0.5543933510780334, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2160 }, { "completion_length": 9.0, "epoch": 0.3786577886805677, "grad_norm": 18.0001437695822, "kl": 0.0751953125, "learning_rate": 6.215174347292797e-07, "loss": 0.03, "reward": 1.7156038284301758, "reward_std": 0.1932457685470581, "rewards/accuracy_reward_stage2": 0.7156038284301758, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2161 }, { "completion_length": 11.046875, "epoch": 0.37883301209041526, "grad_norm": 19.107410028406957, "kl": 0.142578125, "learning_rate": 6.213422113194322e-07, "loss": 0.0131, "reward": 1.3738727569580078, "reward_std": 0.2558482885360718, "rewards/accuracy_reward_stage2": 0.3894977271556854, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2162 }, { "completion_length": 12.40625, "epoch": 0.37900823550026286, "grad_norm": 21.945046097805324, "kl": 0.28515625, "learning_rate": 6.211669879095846e-07, "loss": 0.0744, "reward": 1.4311002492904663, "reward_std": 0.19576802849769592, "rewards/accuracy_reward_stage2": 0.7123501896858215, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2163 }, { "completion_length": 7.078125, "epoch": 0.3791834589101104, "grad_norm": 13.820927562745863, "kl": 0.09814453125, "learning_rate": 6.209917644997371e-07, "loss": -0.0338, "reward": 1.875470757484436, "reward_std": 0.1988440304994583, "rewards/accuracy_reward_stage2": 0.906720757484436, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2164 }, { "completion_length": 5.375, "epoch": 0.37935868231995795, "grad_norm": 18.20756335986232, "kl": 0.1865234375, "learning_rate": 6.208165410898896e-07, "loss": 0.0307, "reward": 1.600242018699646, "reward_std": 0.19384321570396423, "rewards/accuracy_reward_stage2": 0.740867018699646, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2165 }, { "completion_length": 8.90625, "epoch": 0.3795339057298055, "grad_norm": 19.02185897001305, "kl": 0.1474609375, "learning_rate": 6.20641317680042e-07, "loss": -0.0998, "reward": 1.5020967721939087, "reward_std": 0.30960702896118164, "rewards/accuracy_reward_stage2": 0.5645967125892639, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2166 }, { "completion_length": 18.328125, "epoch": 0.37970912913965305, "grad_norm": 18.567459094184585, "kl": 0.1826171875, "learning_rate": 6.204660942701945e-07, "loss": 0.0729, "reward": 1.330150842666626, "reward_std": 0.11160098016262054, "rewards/accuracy_reward_stage2": 0.580150842666626, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2167 }, { "completion_length": 9.109375, "epoch": 0.3798843525495006, "grad_norm": 16.303999370260797, "kl": 0.16015625, "learning_rate": 6.20290870860347e-07, "loss": -0.0299, "reward": 1.5709354877471924, "reward_std": 0.3122965097427368, "rewards/accuracy_reward_stage2": 0.6178104877471924, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2168 }, { "completion_length": 7.546875, "epoch": 0.3800595759593482, "grad_norm": 17.12522028504591, "kl": 0.189453125, "learning_rate": 6.201156474504994e-07, "loss": 0.0314, "reward": 1.2758276462554932, "reward_std": 0.24823029339313507, "rewards/accuracy_reward_stage2": 0.2914525866508484, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2169 }, { "completion_length": 13.0, "epoch": 0.38023479936919574, "grad_norm": 18.81176063318736, "kl": 0.1513671875, "learning_rate": 6.199404240406518e-07, "loss": 0.0165, "reward": 1.4827790260314941, "reward_std": 0.21302473545074463, "rewards/accuracy_reward_stage2": 0.6234040260314941, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2170 }, { "completion_length": 9.21875, "epoch": 0.3804100227790433, "grad_norm": 16.121698472462196, "kl": 0.134765625, "learning_rate": 6.197652006308043e-07, "loss": -0.0314, "reward": 1.466298222541809, "reward_std": 0.17698809504508972, "rewards/accuracy_reward_stage2": 0.4975482225418091, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2171 }, { "completion_length": 9.203125, "epoch": 0.38058524618889084, "grad_norm": 22.993006711543188, "kl": 0.2265625, "learning_rate": 6.195899772209567e-07, "loss": -0.028, "reward": 1.6543668508529663, "reward_std": 0.3108653426170349, "rewards/accuracy_reward_stage2": 0.8262418508529663, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2172 }, { "completion_length": 20.59375, "epoch": 0.3807604695987384, "grad_norm": 22.070196626977605, "kl": 0.07080078125, "learning_rate": 6.194147538111091e-07, "loss": 0.0284, "reward": 1.2055113315582275, "reward_std": 0.15062808990478516, "rewards/accuracy_reward_stage2": 0.20551134645938873, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2173 }, { "completion_length": 8.578125, "epoch": 0.38093569300858593, "grad_norm": 79.3112215970755, "kl": 0.34375, "learning_rate": 6.192395304012615e-07, "loss": 0.1048, "reward": 1.4546903371810913, "reward_std": 0.2561033368110657, "rewards/accuracy_reward_stage2": 0.5953153371810913, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2174 }, { "completion_length": 14.328125, "epoch": 0.3811109164184335, "grad_norm": 13.769850844797942, "kl": 0.047607421875, "learning_rate": 6.19064306991414e-07, "loss": -0.0237, "reward": 1.4421863555908203, "reward_std": 0.18708321452140808, "rewards/accuracy_reward_stage2": 0.4578113257884979, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2175 }, { "completion_length": 11.84375, "epoch": 0.3812861398282811, "grad_norm": 17.77441489979722, "kl": 0.2734375, "learning_rate": 6.188890835815665e-07, "loss": -0.0233, "reward": 1.3345986604690552, "reward_std": 0.21731144189834595, "rewards/accuracy_reward_stage2": 0.6314736604690552, "rewards/format_reward_stage1_pointerpad": 0.703125, "scores/accuracy_reward_stage2": 0.703125, "step": 2176 }, { "completion_length": 7.21875, "epoch": 0.3814613632381286, "grad_norm": 14.420432239876597, "kl": 0.08544921875, "learning_rate": 6.187138601717189e-07, "loss": -0.0099, "reward": 1.375, "reward_std": 0.2756394147872925, "rewards/accuracy_reward_stage2": 0.390625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2177 }, { "completion_length": 4.53125, "epoch": 0.3816365866479762, "grad_norm": 16.752767383625837, "kl": 0.02490234375, "learning_rate": 6.185386367618714e-07, "loss": 0.0099, "reward": 1.7901811599731445, "reward_std": 0.18780627846717834, "rewards/accuracy_reward_stage2": 0.7901811599731445, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2178 }, { "completion_length": 11.984375, "epoch": 0.3818118100578237, "grad_norm": 19.134729548097912, "kl": 0.138671875, "learning_rate": 6.183634133520237e-07, "loss": 0.0163, "reward": 1.8155899047851562, "reward_std": 0.20211075246334076, "rewards/accuracy_reward_stage2": 0.8312147855758667, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2179 }, { "completion_length": 10.1875, "epoch": 0.38198703346767127, "grad_norm": 20.542028737132014, "kl": 0.10595703125, "learning_rate": 6.181881899421762e-07, "loss": -0.0018, "reward": 1.8165841102600098, "reward_std": 0.24220114946365356, "rewards/accuracy_reward_stage2": 0.8322091102600098, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2180 }, { "completion_length": 8.953125, "epoch": 0.3821622568775188, "grad_norm": 15.416932155893878, "kl": 0.087890625, "learning_rate": 6.180129665323287e-07, "loss": 0.0351, "reward": 1.6418914794921875, "reward_std": 0.26054543256759644, "rewards/accuracy_reward_stage2": 0.6418914198875427, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2181 }, { "completion_length": 9.34375, "epoch": 0.3823374802873664, "grad_norm": 14.650724234460618, "kl": 0.162109375, "learning_rate": 6.178377431224811e-07, "loss": -0.0025, "reward": 1.7068524360656738, "reward_std": 0.19050803780555725, "rewards/accuracy_reward_stage2": 0.7381024360656738, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2182 }, { "completion_length": 9.671875, "epoch": 0.38251270369721396, "grad_norm": 18.73601022394114, "kl": 0.1962890625, "learning_rate": 6.176625197126336e-07, "loss": 0.0369, "reward": 1.4021036624908447, "reward_std": 0.19758348166942596, "rewards/accuracy_reward_stage2": 0.5427286028862, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2183 }, { "completion_length": 10.015625, "epoch": 0.3826879271070615, "grad_norm": 16.57620370026051, "kl": 0.1494140625, "learning_rate": 6.174872963027861e-07, "loss": -0.0179, "reward": 1.5209438800811768, "reward_std": 0.25040262937545776, "rewards/accuracy_reward_stage2": 0.552193820476532, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2184 }, { "completion_length": 9.140625, "epoch": 0.38286315051690906, "grad_norm": 16.435430776689625, "kl": 0.1328125, "learning_rate": 6.173120728929385e-07, "loss": -0.0297, "reward": 1.881011962890625, "reward_std": 0.1751585155725479, "rewards/accuracy_reward_stage2": 0.9122620224952698, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2185 }, { "completion_length": 7.359375, "epoch": 0.3830383739267566, "grad_norm": 12.327616399469395, "kl": 0.07470703125, "learning_rate": 6.171368494830909e-07, "loss": 0.0299, "reward": 1.6680908203125, "reward_std": 0.1365250200033188, "rewards/accuracy_reward_stage2": 0.6680908799171448, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2186 }, { "completion_length": 9.34375, "epoch": 0.38321359733660415, "grad_norm": 23.12163155547108, "kl": 0.126953125, "learning_rate": 6.169616260732433e-07, "loss": -0.0032, "reward": 1.4950852394104004, "reward_std": 0.20893503725528717, "rewards/accuracy_reward_stage2": 0.5263352394104004, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2187 }, { "completion_length": 14.5625, "epoch": 0.38338882074645175, "grad_norm": 12.708155648632918, "kl": 0.0595703125, "learning_rate": 6.167864026633958e-07, "loss": -0.0552, "reward": 1.695472002029419, "reward_std": 0.11602778732776642, "rewards/accuracy_reward_stage2": 0.7267219424247742, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2188 }, { "completion_length": 8.0625, "epoch": 0.3835640441562993, "grad_norm": 19.159424648991013, "kl": 0.1455078125, "learning_rate": 6.166111792535483e-07, "loss": 0.0141, "reward": 1.589672565460205, "reward_std": 0.26173871755599976, "rewards/accuracy_reward_stage2": 0.6052975654602051, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2189 }, { "completion_length": 10.921875, "epoch": 0.38373926756614685, "grad_norm": 14.923624480497136, "kl": 0.04443359375, "learning_rate": 6.164359558437006e-07, "loss": 0.0178, "reward": 1.6148505210876465, "reward_std": 0.1436673402786255, "rewards/accuracy_reward_stage2": 0.6148505210876465, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2190 }, { "completion_length": 8.90625, "epoch": 0.3839144909759944, "grad_norm": 17.51066119218728, "kl": 0.12353515625, "learning_rate": 6.162607324338531e-07, "loss": 0.0053, "reward": 1.3896396160125732, "reward_std": 0.29294657707214355, "rewards/accuracy_reward_stage2": 0.6552645564079285, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2191 }, { "completion_length": 11.859375, "epoch": 0.38408971438584194, "grad_norm": 13.80016364569103, "kl": 0.1953125, "learning_rate": 6.160855090240056e-07, "loss": -0.0815, "reward": 1.6285045146942139, "reward_std": 0.3758038282394409, "rewards/accuracy_reward_stage2": 0.6910045146942139, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2192 }, { "completion_length": 9.546875, "epoch": 0.3842649377956895, "grad_norm": 22.35046974856224, "kl": 0.1201171875, "learning_rate": 6.15910285614158e-07, "loss": -0.0138, "reward": 1.4999234676361084, "reward_std": 0.2815472483634949, "rewards/accuracy_reward_stage2": 0.6561734676361084, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2193 }, { "completion_length": 10.109375, "epoch": 0.38444016120553703, "grad_norm": 16.895726320418603, "kl": 0.0830078125, "learning_rate": 6.157350622043105e-07, "loss": 0.0331, "reward": 1.51102614402771, "reward_std": 0.1778338998556137, "rewards/accuracy_reward_stage2": 0.51102614402771, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2194 }, { "completion_length": 9.78125, "epoch": 0.38461538461538464, "grad_norm": 14.773306048930774, "kl": 0.1298828125, "learning_rate": 6.155598387944629e-07, "loss": -0.0137, "reward": 1.846685767173767, "reward_std": 0.21043790876865387, "rewards/accuracy_reward_stage2": 0.8779357671737671, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2195 }, { "completion_length": 9.265625, "epoch": 0.3847906080252322, "grad_norm": 20.128255005421288, "kl": 0.08984375, "learning_rate": 6.153846153846154e-07, "loss": 0.0005, "reward": 1.0712401866912842, "reward_std": 0.06993351876735687, "rewards/accuracy_reward_stage2": 0.3524901568889618, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2196 }, { "completion_length": 10.171875, "epoch": 0.38496583143507973, "grad_norm": 14.97086397468025, "kl": 0.2001953125, "learning_rate": 6.152093919747679e-07, "loss": 0.0801, "reward": 1.7159042358398438, "reward_std": 0.058444324880838394, "rewards/accuracy_reward_stage2": 0.8409042954444885, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2197 }, { "completion_length": 6.96875, "epoch": 0.3851410548449273, "grad_norm": 14.820380862297538, "kl": 0.0703125, "learning_rate": 6.150341685649203e-07, "loss": 0.0002, "reward": 1.7224714756011963, "reward_std": 0.21219471096992493, "rewards/accuracy_reward_stage2": 0.7380965352058411, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2198 }, { "completion_length": 10.140625, "epoch": 0.3853162782547748, "grad_norm": 17.29181890196777, "kl": 0.12255859375, "learning_rate": 6.148589451550726e-07, "loss": 0.0049, "reward": 1.4856467247009277, "reward_std": 0.1462748497724533, "rewards/accuracy_reward_stage2": 0.5012717843055725, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2199 }, { "completion_length": 13.578125, "epoch": 0.38549150166462237, "grad_norm": 17.56320922297244, "kl": 0.1015625, "learning_rate": 6.146837217452251e-07, "loss": 0.0406, "reward": 1.3396921157836914, "reward_std": 0.24257370829582214, "rewards/accuracy_reward_stage2": 0.3396921157836914, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2200 }, { "completion_length": 12.375, "epoch": 0.38566672507447, "grad_norm": 21.98389020678823, "kl": 0.10986328125, "learning_rate": 6.145084983353775e-07, "loss": -0.0367, "reward": 1.409691572189331, "reward_std": 0.2644343078136444, "rewards/accuracy_reward_stage2": 0.44094154238700867, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2201 }, { "completion_length": 8.609375, "epoch": 0.3858419484843175, "grad_norm": 20.532977914563833, "kl": 0.17578125, "learning_rate": 6.1433327492553e-07, "loss": -0.0015, "reward": 1.5434110164642334, "reward_std": 0.370197057723999, "rewards/accuracy_reward_stage2": 0.5746610760688782, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2202 }, { "completion_length": 11.953125, "epoch": 0.38601717189416507, "grad_norm": 20.94024842889363, "kl": 0.1591796875, "learning_rate": 6.141580515156824e-07, "loss": 0.0322, "reward": 1.5620813369750977, "reward_std": 0.3396362066268921, "rewards/accuracy_reward_stage2": 0.7027062773704529, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2203 }, { "completion_length": 9.671875, "epoch": 0.3861923953040126, "grad_norm": 22.13104501755214, "kl": 0.0233154296875, "learning_rate": 6.139828281058349e-07, "loss": 0.0093, "reward": 1.7760417461395264, "reward_std": 0.20276054739952087, "rewards/accuracy_reward_stage2": 0.7760416269302368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2204 }, { "completion_length": 10.828125, "epoch": 0.38636761871386016, "grad_norm": 21.760828418146833, "kl": 0.06884765625, "learning_rate": 6.138076046959874e-07, "loss": 0.0276, "reward": 1.4502272605895996, "reward_std": 0.25459161400794983, "rewards/accuracy_reward_stage2": 0.45022720098495483, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2205 }, { "completion_length": 8.5625, "epoch": 0.3865428421237077, "grad_norm": 21.217578680048884, "kl": 0.11376953125, "learning_rate": 6.136323812861398e-07, "loss": 0.0037, "reward": 1.5424821376800537, "reward_std": 0.16328753530979156, "rewards/accuracy_reward_stage2": 0.5581071376800537, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2206 }, { "completion_length": 7.921875, "epoch": 0.38671806553355526, "grad_norm": 14.634238577480422, "kl": 0.1103515625, "learning_rate": 6.134571578762923e-07, "loss": -0.0, "reward": 1.578125, "reward_std": 0.22097086906433105, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2207 }, { "completion_length": 10.40625, "epoch": 0.38689328894340286, "grad_norm": 17.851183119937975, "kl": 0.1376953125, "learning_rate": 6.132819344664448e-07, "loss": 0.0161, "reward": 1.6369647979736328, "reward_std": 0.20657899975776672, "rewards/accuracy_reward_stage2": 0.6525896787643433, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2208 }, { "completion_length": 7.625, "epoch": 0.3870685123532504, "grad_norm": 26.111489744750457, "kl": 0.06396484375, "learning_rate": 6.131067110565971e-07, "loss": 0.0257, "reward": 1.4487724304199219, "reward_std": 0.24807778000831604, "rewards/accuracy_reward_stage2": 0.5737723112106323, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2209 }, { "completion_length": 12.96875, "epoch": 0.38724373576309795, "grad_norm": 16.602603346013872, "kl": 0.115234375, "learning_rate": 6.129314876467496e-07, "loss": 0.046, "reward": 1.612157940864563, "reward_std": 0.18975886702537537, "rewards/accuracy_reward_stage2": 0.7371578812599182, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2210 }, { "completion_length": 8.140625, "epoch": 0.3874189591729455, "grad_norm": 15.491528641737077, "kl": 0.0947265625, "learning_rate": 6.12756264236902e-07, "loss": -0.0461, "reward": 1.6862807273864746, "reward_std": 0.23824840784072876, "rewards/accuracy_reward_stage2": 0.7175307869911194, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2211 }, { "completion_length": 13.265625, "epoch": 0.38759418258279305, "grad_norm": 16.70501103695778, "kl": 0.11767578125, "learning_rate": 6.125810408270544e-07, "loss": -0.11, "reward": 1.2275532484054565, "reward_std": 0.22072014212608337, "rewards/accuracy_reward_stage2": 0.29005324840545654, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2212 }, { "completion_length": 11.84375, "epoch": 0.3877694059926406, "grad_norm": 10.653261541291995, "kl": 0.1669921875, "learning_rate": 6.124058174172069e-07, "loss": 0.0272, "reward": 1.6923514604568481, "reward_std": 0.0814502090215683, "rewards/accuracy_reward_stage2": 0.8329764604568481, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2213 }, { "completion_length": 12.75, "epoch": 0.3879446294024882, "grad_norm": 18.466229184417685, "kl": 0.2265625, "learning_rate": 6.122305940073593e-07, "loss": -0.0294, "reward": 1.4522085189819336, "reward_std": 0.3213249742984772, "rewards/accuracy_reward_stage2": 0.499083548784256, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2214 }, { "completion_length": 9.484375, "epoch": 0.38811985281233574, "grad_norm": 23.07519279033274, "kl": 0.203125, "learning_rate": 6.120553705975118e-07, "loss": 0.0371, "reward": 1.647832989692688, "reward_std": 0.17026206851005554, "rewards/accuracy_reward_stage2": 0.7884579300880432, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2215 }, { "completion_length": 8.671875, "epoch": 0.3882950762221833, "grad_norm": 28.513257819873985, "kl": 0.2001953125, "learning_rate": 6.118801471876643e-07, "loss": 0.0356, "reward": 1.6165659427642822, "reward_std": 0.23038463294506073, "rewards/accuracy_reward_stage2": 0.6321908831596375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2216 }, { "completion_length": 9.765625, "epoch": 0.38847029963203084, "grad_norm": 27.15934863307988, "kl": 0.138671875, "learning_rate": 6.117049237778167e-07, "loss": 0.0113, "reward": 1.5638474225997925, "reward_std": 0.2967807650566101, "rewards/accuracy_reward_stage2": 0.5794724225997925, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2217 }, { "completion_length": 10.890625, "epoch": 0.3886455230418784, "grad_norm": 16.716283148171772, "kl": 0.0673828125, "learning_rate": 6.115297003679692e-07, "loss": -0.0103, "reward": 1.5654242038726807, "reward_std": 0.19815650582313538, "rewards/accuracy_reward_stage2": 0.5810492038726807, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2218 }, { "completion_length": 8.0625, "epoch": 0.38882074645172593, "grad_norm": 13.171143475838283, "kl": 0.162109375, "learning_rate": 6.113544769581215e-07, "loss": 0.0208, "reward": 1.3602125644683838, "reward_std": 0.14076007902622223, "rewards/accuracy_reward_stage2": 0.3758375942707062, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2219 }, { "completion_length": 11.109375, "epoch": 0.38899596986157353, "grad_norm": 22.902761311358915, "kl": 0.1279296875, "learning_rate": 6.11179253548274e-07, "loss": 0.0176, "reward": 1.2605656385421753, "reward_std": 0.3278544843196869, "rewards/accuracy_reward_stage2": 0.5261905789375305, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2220 }, { "completion_length": 10.171875, "epoch": 0.3891711932714211, "grad_norm": 60.59700362265599, "kl": 0.345703125, "learning_rate": 6.110040301384265e-07, "loss": 0.138, "reward": 1.541421890258789, "reward_std": 0.33629554510116577, "rewards/accuracy_reward_stage2": 0.5414219498634338, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2221 }, { "completion_length": 17.390625, "epoch": 0.3893464166812686, "grad_norm": 21.843503785841705, "kl": 0.09765625, "learning_rate": 6.108288067285789e-07, "loss": 0.039, "reward": 1.5664076805114746, "reward_std": 0.17039306461811066, "rewards/accuracy_reward_stage2": 0.5664076805114746, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2222 }, { "completion_length": 22.28125, "epoch": 0.3895216400911162, "grad_norm": 22.159828627899486, "kl": 0.1787109375, "learning_rate": 6.106535833187314e-07, "loss": 0.0059, "reward": 1.702857255935669, "reward_std": 0.2878776788711548, "rewards/accuracy_reward_stage2": 0.734107255935669, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2223 }, { "completion_length": 13.4375, "epoch": 0.3896968635009637, "grad_norm": 16.887738826370676, "kl": 0.2099609375, "learning_rate": 6.104783599088838e-07, "loss": 0.048, "reward": 1.2868878841400146, "reward_std": 0.15713664889335632, "rewards/accuracy_reward_stage2": 0.3025129437446594, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2224 }, { "completion_length": 9.578125, "epoch": 0.38987208691081127, "grad_norm": 21.232434409269125, "kl": 0.1416015625, "learning_rate": 6.103031364990362e-07, "loss": 0.0125, "reward": 1.5615177154541016, "reward_std": 0.21830402314662933, "rewards/accuracy_reward_stage2": 0.5771427154541016, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2225 }, { "completion_length": 8.75, "epoch": 0.3900473103206588, "grad_norm": 20.431913799465242, "kl": 0.087890625, "learning_rate": 6.101279130891887e-07, "loss": -0.009, "reward": 1.498471736907959, "reward_std": 0.23524844646453857, "rewards/accuracy_reward_stage2": 0.514096736907959, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2226 }, { "completion_length": 12.40625, "epoch": 0.3902225337305064, "grad_norm": 25.779614194701836, "kl": 0.1201171875, "learning_rate": 6.099526896793411e-07, "loss": 0.0482, "reward": 1.6931931972503662, "reward_std": 0.19687005877494812, "rewards/accuracy_reward_stage2": 0.6931931376457214, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2227 }, { "completion_length": 13.359375, "epoch": 0.39039775714035396, "grad_norm": 115.30703123252181, "kl": 0.55078125, "learning_rate": 6.097774662694936e-07, "loss": 0.2402, "reward": 1.5276418924331665, "reward_std": 0.11854963004589081, "rewards/accuracy_reward_stage2": 0.6526418924331665, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2228 }, { "completion_length": 10.796875, "epoch": 0.3905729805502015, "grad_norm": 14.790021209721399, "kl": 0.0634765625, "learning_rate": 6.09602242859646e-07, "loss": -0.0188, "reward": 1.562615156173706, "reward_std": 0.14368489384651184, "rewards/accuracy_reward_stage2": 0.578240156173706, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2229 }, { "completion_length": 8.203125, "epoch": 0.39074820396004906, "grad_norm": 21.47388207177197, "kl": 0.16796875, "learning_rate": 6.094270194497984e-07, "loss": 0.0673, "reward": 1.5674675703048706, "reward_std": 0.29743796586990356, "rewards/accuracy_reward_stage2": 0.6924675703048706, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2230 }, { "completion_length": 14.078125, "epoch": 0.3909234273698966, "grad_norm": 18.456799523246854, "kl": 0.1591796875, "learning_rate": 6.092517960399509e-07, "loss": -0.0777, "reward": 1.2790626287460327, "reward_std": 0.2566419243812561, "rewards/accuracy_reward_stage2": 0.3415626287460327, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2231 }, { "completion_length": 11.09375, "epoch": 0.39109865077974415, "grad_norm": 13.925303709250686, "kl": 0.1455078125, "learning_rate": 6.090765726301034e-07, "loss": -0.0635, "reward": 1.607444405555725, "reward_std": 0.2526930570602417, "rewards/accuracy_reward_stage2": 0.7793193459510803, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2232 }, { "completion_length": 24.921875, "epoch": 0.39127387418959175, "grad_norm": 23.813652296058024, "kl": 0.0830078125, "learning_rate": 6.089013492202558e-07, "loss": -0.011, "reward": 1.5801374912261963, "reward_std": 0.28528648614883423, "rewards/accuracy_reward_stage2": 0.5957625508308411, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2233 }, { "completion_length": 11.25, "epoch": 0.3914490975994393, "grad_norm": 17.95375870725181, "kl": 0.07373046875, "learning_rate": 6.087261258104083e-07, "loss": 0.008, "reward": 1.7711431980133057, "reward_std": 0.2443651705980301, "rewards/accuracy_reward_stage2": 0.7867681980133057, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2234 }, { "completion_length": 11.359375, "epoch": 0.39162432100928685, "grad_norm": 14.143073454348915, "kl": 0.10498046875, "learning_rate": 6.085509024005607e-07, "loss": -0.0311, "reward": 1.688242793083191, "reward_std": 0.17107422649860382, "rewards/accuracy_reward_stage2": 0.7194927930831909, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2235 }, { "completion_length": 10.171875, "epoch": 0.3917995444191344, "grad_norm": 18.96235279859737, "kl": 0.212890625, "learning_rate": 6.083756789907132e-07, "loss": -0.0775, "reward": 1.7642440795898438, "reward_std": 0.3353942632675171, "rewards/accuracy_reward_stage2": 0.826744019985199, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2236 }, { "completion_length": 10.6875, "epoch": 0.39197476782898194, "grad_norm": 44.617783503094316, "kl": 0.26953125, "learning_rate": 6.082004555808656e-07, "loss": 0.0191, "reward": 1.5416667461395264, "reward_std": 0.3209052085876465, "rewards/accuracy_reward_stage2": 0.5729166269302368, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2237 }, { "completion_length": 8.359375, "epoch": 0.3921499912388295, "grad_norm": 20.41487185448497, "kl": 0.2158203125, "learning_rate": 6.08025232171018e-07, "loss": 0.0209, "reward": 1.5106935501098633, "reward_std": 0.19475436210632324, "rewards/accuracy_reward_stage2": 0.5419436693191528, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2238 }, { "completion_length": 11.28125, "epoch": 0.3923252146486771, "grad_norm": 21.018475522284266, "kl": 0.24609375, "learning_rate": 6.078500087611704e-07, "loss": 0.055, "reward": 1.3984198570251465, "reward_std": 0.282496839761734, "rewards/accuracy_reward_stage2": 0.6640447378158569, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2239 }, { "completion_length": 15.265625, "epoch": 0.39250043805852464, "grad_norm": 19.170457946095826, "kl": 0.11328125, "learning_rate": 6.076747853513228e-07, "loss": -0.0192, "reward": 1.558809757232666, "reward_std": 0.38950616121292114, "rewards/accuracy_reward_stage2": 0.590059757232666, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2240 }, { "completion_length": 9.5, "epoch": 0.3926756614683722, "grad_norm": 23.118453194182642, "kl": 0.37890625, "learning_rate": 6.074995619414753e-07, "loss": 0.1076, "reward": 1.6901085376739502, "reward_std": 0.21713736653327942, "rewards/accuracy_reward_stage2": 0.830733597278595, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2241 }, { "completion_length": 14.609375, "epoch": 0.39285088487821973, "grad_norm": 15.373361647993663, "kl": 0.024658203125, "learning_rate": 6.073243385316278e-07, "loss": 0.0099, "reward": 1.337906837463379, "reward_std": 0.150638610124588, "rewards/accuracy_reward_stage2": 0.3379068374633789, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2242 }, { "completion_length": 24.15625, "epoch": 0.3930261082880673, "grad_norm": 22.081573519968778, "kl": 0.0625, "learning_rate": 6.071491151217802e-07, "loss": -0.0585, "reward": 1.5403672456741333, "reward_std": 0.2464727759361267, "rewards/accuracy_reward_stage2": 0.5716171860694885, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2243 }, { "completion_length": 8.8125, "epoch": 0.3932013316979148, "grad_norm": 22.833636209238247, "kl": 0.09814453125, "learning_rate": 6.069738917119327e-07, "loss": 0.0077, "reward": 1.6146864891052246, "reward_std": 0.2203153371810913, "rewards/accuracy_reward_stage2": 0.6303114295005798, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2244 }, { "completion_length": 12.921875, "epoch": 0.39337655510776237, "grad_norm": 23.377203815412372, "kl": 0.197265625, "learning_rate": 6.067986683020852e-07, "loss": 0.0264, "reward": 1.4449132680892944, "reward_std": 0.2471257746219635, "rewards/accuracy_reward_stage2": 0.6011632084846497, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2245 }, { "completion_length": 10.421875, "epoch": 0.39355177851761, "grad_norm": 19.72501700323542, "kl": 0.1708984375, "learning_rate": 6.066234448922376e-07, "loss": 0.0244, "reward": 1.5555421113967896, "reward_std": 0.21328100562095642, "rewards/accuracy_reward_stage2": 0.6961670517921448, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2246 }, { "completion_length": 11.375, "epoch": 0.3937270019274575, "grad_norm": 22.98459927286484, "kl": 0.1513671875, "learning_rate": 6.064482214823901e-07, "loss": -0.0156, "reward": 1.5627611875534058, "reward_std": 0.366780549287796, "rewards/accuracy_reward_stage2": 0.5940111875534058, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2247 }, { "completion_length": 10.9375, "epoch": 0.39390222533730507, "grad_norm": 22.93261313656731, "kl": 0.1416015625, "learning_rate": 6.062729980725426e-07, "loss": 0.0231, "reward": 1.6208666563034058, "reward_std": 0.27645695209503174, "rewards/accuracy_reward_stage2": 0.6364917159080505, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2248 }, { "completion_length": 11.59375, "epoch": 0.3940774487471526, "grad_norm": 17.41514520486553, "kl": 0.054931640625, "learning_rate": 6.060977746626949e-07, "loss": 0.022, "reward": 1.6029356718063354, "reward_std": 0.0914289802312851, "rewards/accuracy_reward_stage2": 0.6029355525970459, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2249 }, { "completion_length": 11.65625, "epoch": 0.39425267215700016, "grad_norm": 17.63786857950724, "kl": 0.04736328125, "learning_rate": 6.059225512528473e-07, "loss": 0.0189, "reward": 1.6209280490875244, "reward_std": 0.18614742159843445, "rewards/accuracy_reward_stage2": 0.6209280490875244, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2250 }, { "completion_length": 15.296875, "epoch": 0.3944278955668477, "grad_norm": 15.877845199573407, "kl": 0.06494140625, "learning_rate": 6.057473278429997e-07, "loss": 0.0041, "reward": 1.6792845726013184, "reward_std": 0.17453373968601227, "rewards/accuracy_reward_stage2": 0.6949096322059631, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2251 }, { "completion_length": 6.40625, "epoch": 0.3946031189766953, "grad_norm": 18.188032907525475, "kl": 0.087890625, "learning_rate": 6.055721044331522e-07, "loss": 0.0351, "reward": 1.784743070602417, "reward_std": 0.20735129714012146, "rewards/accuracy_reward_stage2": 0.784743070602417, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2252 }, { "completion_length": 11.40625, "epoch": 0.39477834238654286, "grad_norm": 24.27092280061725, "kl": 0.032470703125, "learning_rate": 6.053968810233047e-07, "loss": 0.013, "reward": 1.7523884773254395, "reward_std": 0.18452656269073486, "rewards/accuracy_reward_stage2": 0.7523884773254395, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2253 }, { "completion_length": 10.8125, "epoch": 0.3949535657963904, "grad_norm": 31.106394878920838, "kl": 0.16796875, "learning_rate": 6.052216576134571e-07, "loss": -0.0084, "reward": 1.6320232152938843, "reward_std": 0.2674624025821686, "rewards/accuracy_reward_stage2": 0.6788982152938843, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2254 }, { "completion_length": 8.015625, "epoch": 0.39512878920623795, "grad_norm": 19.402853907661573, "kl": 0.11181640625, "learning_rate": 6.050464342036096e-07, "loss": 0.0351, "reward": 1.7537932395935059, "reward_std": 0.1687513291835785, "rewards/accuracy_reward_stage2": 0.7694183588027954, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2255 }, { "completion_length": 11.859375, "epoch": 0.3953040126160855, "grad_norm": 23.78392267136443, "kl": 0.06591796875, "learning_rate": 6.04871210793762e-07, "loss": -0.0066, "reward": 1.2469592094421387, "reward_std": 0.21802271902561188, "rewards/accuracy_reward_stage2": 0.38758420944213867, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2256 }, { "completion_length": 8.390625, "epoch": 0.39547923602593305, "grad_norm": 17.136787258060238, "kl": 0.08251953125, "learning_rate": 6.046959873839145e-07, "loss": -0.0112, "reward": 1.6200617551803589, "reward_std": 0.2077975869178772, "rewards/accuracy_reward_stage2": 0.7606866955757141, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2257 }, { "completion_length": 20.28125, "epoch": 0.39565445943578065, "grad_norm": 20.39764284951551, "kl": 0.0869140625, "learning_rate": 6.04520763974067e-07, "loss": 0.0192, "reward": 1.4464240074157715, "reward_std": 0.2652880549430847, "rewards/accuracy_reward_stage2": 0.5870490074157715, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2258 }, { "completion_length": 7.65625, "epoch": 0.3958296828456282, "grad_norm": 16.894150235680982, "kl": 0.1083984375, "learning_rate": 6.043455405642193e-07, "loss": 0.0433, "reward": 1.6962438821792603, "reward_std": 0.14486585557460785, "rewards/accuracy_reward_stage2": 0.6962438225746155, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2259 }, { "completion_length": 25.328125, "epoch": 0.39600490625547574, "grad_norm": 15.769439083890266, "kl": 0.1318359375, "learning_rate": 6.041703171543718e-07, "loss": 0.0219, "reward": 1.3447751998901367, "reward_std": 0.11412560939788818, "rewards/accuracy_reward_stage2": 0.6104001998901367, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2260 }, { "completion_length": 9.328125, "epoch": 0.3961801296653233, "grad_norm": 12.128215396278677, "kl": 0.034912109375, "learning_rate": 6.039950937445243e-07, "loss": 0.0139, "reward": 1.8489582538604736, "reward_std": 0.13258251547813416, "rewards/accuracy_reward_stage2": 0.8489583730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2261 }, { "completion_length": 8.4375, "epoch": 0.39635535307517084, "grad_norm": 18.815808354313415, "kl": 0.05126953125, "learning_rate": 6.038198703346767e-07, "loss": 0.0205, "reward": 1.71771240234375, "reward_std": 0.3155533969402313, "rewards/accuracy_reward_stage2": 0.71771240234375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2262 }, { "completion_length": 12.875, "epoch": 0.3965305764850184, "grad_norm": 19.072455081759877, "kl": 0.2021484375, "learning_rate": 6.036446469248291e-07, "loss": 0.0241, "reward": 1.636966347694397, "reward_std": 0.3073020577430725, "rewards/accuracy_reward_stage2": 0.6682164072990417, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2263 }, { "completion_length": 9.46875, "epoch": 0.39670579989486593, "grad_norm": 14.377658250079735, "kl": 0.056640625, "learning_rate": 6.034694235149815e-07, "loss": 0.0227, "reward": 1.4962797164916992, "reward_std": 0.14264775812625885, "rewards/accuracy_reward_stage2": 0.6212797164916992, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2264 }, { "completion_length": 11.203125, "epoch": 0.39688102330471353, "grad_norm": 22.85198974569718, "kl": 0.20703125, "learning_rate": 6.03294200105134e-07, "loss": -0.0026, "reward": 1.5522370338439941, "reward_std": 0.2735584080219269, "rewards/accuracy_reward_stage2": 0.5834869146347046, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2265 }, { "completion_length": 7.65625, "epoch": 0.3970562467145611, "grad_norm": 18.775607460968544, "kl": 0.052734375, "learning_rate": 6.031189766952865e-07, "loss": 0.0211, "reward": 1.533717155456543, "reward_std": 0.13953596353530884, "rewards/accuracy_reward_stage2": 0.533717155456543, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2266 }, { "completion_length": 9.21875, "epoch": 0.3972314701244086, "grad_norm": 17.46988710403811, "kl": 0.10546875, "learning_rate": 6.029437532854389e-07, "loss": -0.0019, "reward": 1.6571948528289795, "reward_std": 0.20851103961467743, "rewards/accuracy_reward_stage2": 0.6728198528289795, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2267 }, { "completion_length": 9.796875, "epoch": 0.3974066935342562, "grad_norm": 16.101983150034496, "kl": 0.212890625, "learning_rate": 6.027685298755914e-07, "loss": 0.0281, "reward": 1.4166667461395264, "reward_std": 0.16781339049339294, "rewards/accuracy_reward_stage2": 0.5729166865348816, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2268 }, { "completion_length": 7.703125, "epoch": 0.3975819169441037, "grad_norm": 18.603337507415958, "kl": 0.11767578125, "learning_rate": 6.025933064657438e-07, "loss": 0.0028, "reward": 1.6792187690734863, "reward_std": 0.2111339271068573, "rewards/accuracy_reward_stage2": 0.6948437690734863, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2269 }, { "completion_length": 10.34375, "epoch": 0.39775714035395127, "grad_norm": 21.344244615987552, "kl": 0.09912109375, "learning_rate": 6.024180830558962e-07, "loss": 0.0397, "reward": 1.2726788520812988, "reward_std": 0.29367613792419434, "rewards/accuracy_reward_stage2": 0.39767885208129883, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2270 }, { "completion_length": 9.5625, "epoch": 0.39793236376379887, "grad_norm": 18.185559517665077, "kl": 0.048583984375, "learning_rate": 6.022428596460487e-07, "loss": 0.0194, "reward": 1.5903193950653076, "reward_std": 0.22018752992153168, "rewards/accuracy_reward_stage2": 0.5903194546699524, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2271 }, { "completion_length": 8.15625, "epoch": 0.3981075871736464, "grad_norm": 19.903045893225336, "kl": 0.150390625, "learning_rate": 6.020676362362011e-07, "loss": 0.06, "reward": 1.56424081325531, "reward_std": 0.20303234457969666, "rewards/accuracy_reward_stage2": 0.5642408132553101, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2272 }, { "completion_length": 7.953125, "epoch": 0.39828281058349396, "grad_norm": 8.3917061329364, "kl": 0.0556640625, "learning_rate": 6.018924128263536e-07, "loss": 0.0222, "reward": 1.484375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.484375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2273 }, { "completion_length": 10.71875, "epoch": 0.3984580339933415, "grad_norm": 17.9917725511186, "kl": 0.099609375, "learning_rate": 6.017171894165061e-07, "loss": -0.0044, "reward": 1.5761617422103882, "reward_std": 0.10360636562108994, "rewards/accuracy_reward_stage2": 0.5917867422103882, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2274 }, { "completion_length": 16.703125, "epoch": 0.39863325740318906, "grad_norm": 19.771813163606605, "kl": 0.10595703125, "learning_rate": 6.015419660066584e-07, "loss": 0.0057, "reward": 1.7596876621246338, "reward_std": 0.17410920560359955, "rewards/accuracy_reward_stage2": 0.7753127217292786, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2275 }, { "completion_length": 10.15625, "epoch": 0.3988084808130366, "grad_norm": 20.262330099879136, "kl": 0.03662109375, "learning_rate": 6.013667425968109e-07, "loss": 0.0147, "reward": 1.481999397277832, "reward_std": 0.11602069437503815, "rewards/accuracy_reward_stage2": 0.4819994568824768, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2276 }, { "completion_length": 11.34375, "epoch": 0.39898370422288415, "grad_norm": 17.02612667675846, "kl": 0.0673828125, "learning_rate": 6.011915191869634e-07, "loss": 0.027, "reward": 1.57454514503479, "reward_std": 0.18871784210205078, "rewards/accuracy_reward_stage2": 0.57454514503479, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2277 }, { "completion_length": 13.0625, "epoch": 0.39915892763273175, "grad_norm": 22.694585700918292, "kl": 0.09326171875, "learning_rate": 6.010162957771157e-07, "loss": 0.0373, "reward": 1.4261221885681152, "reward_std": 0.23612135648727417, "rewards/accuracy_reward_stage2": 0.42612212896347046, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2278 }, { "completion_length": 13.890625, "epoch": 0.3993341510425793, "grad_norm": 18.353435530372224, "kl": 0.087890625, "learning_rate": 6.008410723672682e-07, "loss": 0.0352, "reward": 1.4055817127227783, "reward_std": 0.1411152333021164, "rewards/accuracy_reward_stage2": 0.4055817723274231, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2279 }, { "completion_length": 11.0, "epoch": 0.39950937445242685, "grad_norm": 20.489860262338013, "kl": 0.17578125, "learning_rate": 6.006658489574206e-07, "loss": 0.0699, "reward": 1.3175370693206787, "reward_std": 0.13264381885528564, "rewards/accuracy_reward_stage2": 0.44253700971603394, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2280 }, { "completion_length": 10.59375, "epoch": 0.3996845978622744, "grad_norm": 19.79224638731199, "kl": 0.154296875, "learning_rate": 6.004906255475731e-07, "loss": 0.0617, "reward": 1.0645630359649658, "reward_std": 0.23653094470500946, "rewards/accuracy_reward_stage2": 0.4395630359649658, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 2281 }, { "completion_length": 7.859375, "epoch": 0.39985982127212194, "grad_norm": 18.134076910029414, "kl": 0.07373046875, "learning_rate": 6.003154021377256e-07, "loss": 0.0296, "reward": 1.6970256567001343, "reward_std": 0.06270062178373337, "rewards/accuracy_reward_stage2": 0.6970256567001343, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2282 }, { "completion_length": 9.625, "epoch": 0.4000350446819695, "grad_norm": 17.74429963997985, "kl": 0.05908203125, "learning_rate": 6.00140178727878e-07, "loss": 0.0237, "reward": 1.6750741004943848, "reward_std": 0.163091778755188, "rewards/accuracy_reward_stage2": 0.6750742197036743, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2283 }, { "completion_length": 10.984375, "epoch": 0.4002102680918171, "grad_norm": 16.65654588186162, "kl": 0.1572265625, "learning_rate": 5.999649553180305e-07, "loss": 0.063, "reward": 1.5580189228057861, "reward_std": 0.2063732147216797, "rewards/accuracy_reward_stage2": 0.5580189228057861, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2284 }, { "completion_length": 8.609375, "epoch": 0.40038549150166464, "grad_norm": 25.418214645346826, "kl": 0.2333984375, "learning_rate": 5.99789731908183e-07, "loss": 0.0503, "reward": 1.7506150007247925, "reward_std": 0.248035728931427, "rewards/accuracy_reward_stage2": 0.7662400007247925, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2285 }, { "completion_length": 9.109375, "epoch": 0.4005607149115122, "grad_norm": 24.176216380847496, "kl": 0.1376953125, "learning_rate": 5.996145084983354e-07, "loss": 0.0109, "reward": 1.501746416091919, "reward_std": 0.2735890746116638, "rewards/accuracy_reward_stage2": 0.517371416091919, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2286 }, { "completion_length": 10.515625, "epoch": 0.40073593832135973, "grad_norm": 23.296848488888745, "kl": 0.055908203125, "learning_rate": 5.994392850884879e-07, "loss": -0.0108, "reward": 1.4851511716842651, "reward_std": 0.2239103764295578, "rewards/accuracy_reward_stage2": 0.5007761716842651, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2287 }, { "completion_length": 14.796875, "epoch": 0.4009111617312073, "grad_norm": 15.295213469553111, "kl": 0.06396484375, "learning_rate": 5.992640616786401e-07, "loss": -0.0134, "reward": 1.421875, "reward_std": 0.2597545385360718, "rewards/accuracy_reward_stage2": 0.4375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2288 }, { "completion_length": 6.953125, "epoch": 0.4010863851410548, "grad_norm": 22.493392811606345, "kl": 0.078125, "learning_rate": 5.990888382687926e-07, "loss": 0.0313, "reward": 1.4117205142974854, "reward_std": 0.3008676767349243, "rewards/accuracy_reward_stage2": 0.5367204546928406, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2289 }, { "completion_length": 10.921875, "epoch": 0.4012616085509024, "grad_norm": 19.29072294069736, "kl": 0.0966796875, "learning_rate": 5.989136148589451e-07, "loss": 0.0387, "reward": 1.4010417461395264, "reward_std": 0.24621228873729706, "rewards/accuracy_reward_stage2": 0.5260416269302368, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2290 }, { "completion_length": 10.375, "epoch": 0.40143683196075, "grad_norm": 17.345347687316323, "kl": 0.10498046875, "learning_rate": 5.987383914490975e-07, "loss": 0.042, "reward": 1.8262851238250732, "reward_std": 0.21577240526676178, "rewards/accuracy_reward_stage2": 0.8262850642204285, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2291 }, { "completion_length": 19.953125, "epoch": 0.4016120553705975, "grad_norm": 22.370097121820592, "kl": 0.1083984375, "learning_rate": 5.9856316803925e-07, "loss": 0.0432, "reward": 1.4479179382324219, "reward_std": 0.32231512665748596, "rewards/accuracy_reward_stage2": 0.44791799783706665, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2292 }, { "completion_length": 10.46875, "epoch": 0.40178727878044507, "grad_norm": 19.718412661688117, "kl": 0.146484375, "learning_rate": 5.983879446294025e-07, "loss": 0.0143, "reward": 1.53125, "reward_std": 0.25513994693756104, "rewards/accuracy_reward_stage2": 0.671875, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2293 }, { "completion_length": 21.390625, "epoch": 0.4019625021902926, "grad_norm": 32.51152529262389, "kl": 0.07861328125, "learning_rate": 5.982127212195549e-07, "loss": 0.0314, "reward": 1.2806692123413086, "reward_std": 0.2547403573989868, "rewards/accuracy_reward_stage2": 0.40566927194595337, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2294 }, { "completion_length": 8.65625, "epoch": 0.40213772560014016, "grad_norm": 32.45483359709806, "kl": 0.078125, "learning_rate": 5.980374978097074e-07, "loss": 0.0312, "reward": 1.7657719850540161, "reward_std": 0.2733202576637268, "rewards/accuracy_reward_stage2": 0.7657719850540161, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2295 }, { "completion_length": 9.671875, "epoch": 0.4023129490099877, "grad_norm": 18.02935670992023, "kl": 0.1005859375, "learning_rate": 5.978622743998598e-07, "loss": 0.0402, "reward": 1.7565680742263794, "reward_std": 0.2380290925502777, "rewards/accuracy_reward_stage2": 0.7565680742263794, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2296 }, { "completion_length": 7.65625, "epoch": 0.4024881724198353, "grad_norm": 16.789220677853052, "kl": 0.1005859375, "learning_rate": 5.976870509900123e-07, "loss": -0.0481, "reward": 1.4566528797149658, "reward_std": 0.22974956035614014, "rewards/accuracy_reward_stage2": 0.48790284991264343, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2297 }, { "completion_length": 8.5, "epoch": 0.40266339582968286, "grad_norm": 19.418589942374066, "kl": 0.0947265625, "learning_rate": 5.975118275801648e-07, "loss": -0.0062, "reward": 1.598454236984253, "reward_std": 0.19560183584690094, "rewards/accuracy_reward_stage2": 0.6140791773796082, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2298 }, { "completion_length": 20.4375, "epoch": 0.4028386192395304, "grad_norm": 14.144931450364158, "kl": 0.0174560546875, "learning_rate": 5.973366041703171e-07, "loss": -0.0372, "reward": 1.5348436832427979, "reward_std": 0.15786908566951752, "rewards/accuracy_reward_stage2": 0.5504686832427979, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2299 }, { "completion_length": 7.109375, "epoch": 0.40301384264937795, "grad_norm": 25.985920547810785, "kl": 0.08740234375, "learning_rate": 5.971613807604696e-07, "loss": 0.035, "reward": 1.701280117034912, "reward_std": 0.3256889581680298, "rewards/accuracy_reward_stage2": 0.7012800574302673, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2300 }, { "completion_length": 9.65625, "epoch": 0.4031890660592255, "grad_norm": 18.26426575589906, "kl": 0.1044921875, "learning_rate": 5.969861573506219e-07, "loss": 0.0417, "reward": 1.7318881750106812, "reward_std": 0.18695884943008423, "rewards/accuracy_reward_stage2": 0.8568881750106812, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2301 }, { "completion_length": 8.34375, "epoch": 0.40336428946907305, "grad_norm": 19.11727770163386, "kl": 0.1044921875, "learning_rate": 5.968109339407744e-07, "loss": 0.0099, "reward": 1.6046810150146484, "reward_std": 0.24243220686912537, "rewards/accuracy_reward_stage2": 0.6203060150146484, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2302 }, { "completion_length": 9.359375, "epoch": 0.40353951287892065, "grad_norm": 23.491406867573176, "kl": 0.050048828125, "learning_rate": 5.966357105309269e-07, "loss": 0.02, "reward": 1.4263932704925537, "reward_std": 0.3580125868320465, "rewards/accuracy_reward_stage2": 0.5513932704925537, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2303 }, { "completion_length": 8.5625, "epoch": 0.4037147362887682, "grad_norm": 15.3331068572184, "kl": 0.041259765625, "learning_rate": 5.964604871210793e-07, "loss": 0.0165, "reward": 1.697649598121643, "reward_std": 0.19590801000595093, "rewards/accuracy_reward_stage2": 0.6976495981216431, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2304 }, { "completion_length": 13.046875, "epoch": 0.40388995969861574, "grad_norm": 11.462365031990299, "kl": 0.024658203125, "learning_rate": 5.962852637112318e-07, "loss": 0.0099, "reward": 1.6736334562301636, "reward_std": 0.063841812312603, "rewards/accuracy_reward_stage2": 0.6736334562301636, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2305 }, { "completion_length": 8.40625, "epoch": 0.4040651831084633, "grad_norm": 16.405845023216393, "kl": 0.0673828125, "learning_rate": 5.961100403013843e-07, "loss": 0.027, "reward": 1.499981164932251, "reward_std": 0.10144259035587311, "rewards/accuracy_reward_stage2": 0.624981164932251, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2306 }, { "completion_length": 10.453125, "epoch": 0.40424040651831084, "grad_norm": 15.784375987182974, "kl": 0.15234375, "learning_rate": 5.959348168915367e-07, "loss": -0.0628, "reward": 1.589674949645996, "reward_std": 0.27123454213142395, "rewards/accuracy_reward_stage2": 0.6365499496459961, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2307 }, { "completion_length": 6.421875, "epoch": 0.4044156299281584, "grad_norm": 18.311048533606936, "kl": 0.087890625, "learning_rate": 5.957595934816891e-07, "loss": 0.0039, "reward": 1.6572329998016357, "reward_std": 0.26454687118530273, "rewards/accuracy_reward_stage2": 0.672857940196991, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2308 }, { "completion_length": 6.46875, "epoch": 0.404590853338006, "grad_norm": 17.68654151954203, "kl": 0.1416015625, "learning_rate": 5.955843700718416e-07, "loss": -0.0318, "reward": 1.57929265499115, "reward_std": 0.13953736424446106, "rewards/accuracy_reward_stage2": 0.6105427145957947, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2309 }, { "completion_length": 12.875, "epoch": 0.40476607674785353, "grad_norm": 19.35674883065028, "kl": 0.12353515625, "learning_rate": 5.95409146661994e-07, "loss": 0.0494, "reward": 1.546875, "reward_std": 0.2867125868797302, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2310 }, { "completion_length": 10.421875, "epoch": 0.4049413001577011, "grad_norm": 19.56185076460949, "kl": 0.1875, "learning_rate": 5.952339232521465e-07, "loss": 0.0361, "reward": 1.436431646347046, "reward_std": 0.3601900339126587, "rewards/accuracy_reward_stage2": 0.7020567059516907, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2311 }, { "completion_length": 7.828125, "epoch": 0.4051165235675486, "grad_norm": 22.221393548545816, "kl": 0.08056640625, "learning_rate": 5.950586998422989e-07, "loss": -0.0119, "reward": 1.5891380310058594, "reward_std": 0.1557682603597641, "rewards/accuracy_reward_stage2": 0.7297629714012146, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2312 }, { "completion_length": 9.546875, "epoch": 0.4052917469773962, "grad_norm": 21.028475980947228, "kl": 0.2265625, "learning_rate": 5.948834764324514e-07, "loss": 0.016, "reward": 1.4670155048370361, "reward_std": 0.20655208826065063, "rewards/accuracy_reward_stage2": 0.6232655048370361, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2313 }, { "completion_length": 7.890625, "epoch": 0.4054669703872437, "grad_norm": 18.888146521184282, "kl": 0.1494140625, "learning_rate": 5.947082530226038e-07, "loss": 0.0155, "reward": 1.5098905563354492, "reward_std": 0.2939804792404175, "rewards/accuracy_reward_stage2": 0.5255155563354492, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2314 }, { "completion_length": 8.03125, "epoch": 0.40564219379709127, "grad_norm": 18.779321483798796, "kl": 0.0888671875, "learning_rate": 5.945330296127562e-07, "loss": 0.0355, "reward": 1.5954867601394653, "reward_std": 0.27154141664505005, "rewards/accuracy_reward_stage2": 0.7204867601394653, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2315 }, { "completion_length": 8.671875, "epoch": 0.40581741720693887, "grad_norm": 18.485802326126173, "kl": 0.13671875, "learning_rate": 5.943578062029087e-07, "loss": -0.0229, "reward": 1.25, "reward_std": 0.24359199404716492, "rewards/accuracy_reward_stage2": 0.40625, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2316 }, { "completion_length": 10.015625, "epoch": 0.4059926406167864, "grad_norm": 28.565253954106154, "kl": 0.283203125, "learning_rate": 5.941825827930611e-07, "loss": 0.0337, "reward": 1.1393563747406006, "reward_std": 0.27226772904396057, "rewards/accuracy_reward_stage2": 0.4206062853336334, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2317 }, { "completion_length": 10.875, "epoch": 0.40616786402663396, "grad_norm": 18.355085386228502, "kl": 0.12890625, "learning_rate": 5.940073593832135e-07, "loss": 0.0146, "reward": 1.6291195154190063, "reward_std": 0.2628851532936096, "rewards/accuracy_reward_stage2": 0.6447445154190063, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2318 }, { "completion_length": 9.0, "epoch": 0.4063430874364815, "grad_norm": 19.009951811781782, "kl": 0.087890625, "learning_rate": 5.93832135973366e-07, "loss": 0.0351, "reward": 1.7401200532913208, "reward_std": 0.21193233132362366, "rewards/accuracy_reward_stage2": 0.740119993686676, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2319 }, { "completion_length": 7.84375, "epoch": 0.40651831084632906, "grad_norm": 16.304528563870416, "kl": 0.0439453125, "learning_rate": 5.936569125635184e-07, "loss": 0.0175, "reward": 1.640625, "reward_std": 0.32878512144088745, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2320 }, { "completion_length": 15.15625, "epoch": 0.4066935342561766, "grad_norm": 15.160611953847901, "kl": 0.037109375, "learning_rate": 5.934816891536709e-07, "loss": 0.0148, "reward": 1.5934399366378784, "reward_std": 0.07537385821342468, "rewards/accuracy_reward_stage2": 0.5934398770332336, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2321 }, { "completion_length": 9.328125, "epoch": 0.4068687576660242, "grad_norm": 17.838232315322752, "kl": 0.0400390625, "learning_rate": 5.933064657438234e-07, "loss": 0.0159, "reward": 1.8585360050201416, "reward_std": 0.12488370388746262, "rewards/accuracy_reward_stage2": 0.8585360050201416, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2322 }, { "completion_length": 11.21875, "epoch": 0.40704398107587175, "grad_norm": 18.85560208158748, "kl": 0.076171875, "learning_rate": 5.931312423339758e-07, "loss": 0.0306, "reward": 1.368743658065796, "reward_std": 0.16224510967731476, "rewards/accuracy_reward_stage2": 0.3687437176704407, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2323 }, { "completion_length": 12.53125, "epoch": 0.4072192044857193, "grad_norm": 19.332274366874387, "kl": 0.056884765625, "learning_rate": 5.929560189241283e-07, "loss": 0.0228, "reward": 1.4458041191101074, "reward_std": 0.19428783655166626, "rewards/accuracy_reward_stage2": 0.5708041787147522, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2324 }, { "completion_length": 10.421875, "epoch": 0.40739442789556685, "grad_norm": 18.71638987852407, "kl": 0.0732421875, "learning_rate": 5.927807955142807e-07, "loss": -0.0591, "reward": 1.426041603088379, "reward_std": 0.261094331741333, "rewards/accuracy_reward_stage2": 0.4572916626930237, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2325 }, { "completion_length": 9.234375, "epoch": 0.4075696513054144, "grad_norm": 11.874790893429372, "kl": 0.06396484375, "learning_rate": 5.926055721044331e-07, "loss": 0.0256, "reward": 1.5826786756515503, "reward_std": 0.08243951201438904, "rewards/accuracy_reward_stage2": 0.5826787948608398, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2326 }, { "completion_length": 7.5625, "epoch": 0.40774487471526194, "grad_norm": 20.37028603700548, "kl": 0.09033203125, "learning_rate": 5.924303486945856e-07, "loss": 0.0362, "reward": 1.448150634765625, "reward_std": 0.20492343604564667, "rewards/accuracy_reward_stage2": 0.4481506943702698, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2327 }, { "completion_length": 10.703125, "epoch": 0.4079200981251095, "grad_norm": 17.434897565743288, "kl": 0.1474609375, "learning_rate": 5.922551252847379e-07, "loss": 0.0147, "reward": 1.474340558052063, "reward_std": 0.2231925129890442, "rewards/accuracy_reward_stage2": 0.4899655282497406, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2328 }, { "completion_length": 9.25, "epoch": 0.4080953215349571, "grad_norm": 16.34813909602192, "kl": 0.06201171875, "learning_rate": 5.920799018748904e-07, "loss": 0.0248, "reward": 1.3414571285247803, "reward_std": 0.09716267138719559, "rewards/accuracy_reward_stage2": 0.3414571285247803, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2329 }, { "completion_length": 10.53125, "epoch": 0.40827054494480464, "grad_norm": 22.23078087484954, "kl": 0.091796875, "learning_rate": 5.919046784650429e-07, "loss": 0.0368, "reward": 1.6912882328033447, "reward_std": 0.2064078152179718, "rewards/accuracy_reward_stage2": 0.8162882328033447, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2330 }, { "completion_length": 16.40625, "epoch": 0.4084457683546522, "grad_norm": 24.824321168016088, "kl": 0.04248046875, "learning_rate": 5.917294550551953e-07, "loss": 0.017, "reward": 1.4140516519546509, "reward_std": 0.2925530672073364, "rewards/accuracy_reward_stage2": 0.5390516519546509, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2331 }, { "completion_length": 16.28125, "epoch": 0.40862099176449973, "grad_norm": 18.417940232007503, "kl": 0.10205078125, "learning_rate": 5.915542316453478e-07, "loss": 0.0408, "reward": 1.3555889129638672, "reward_std": 0.1305437982082367, "rewards/accuracy_reward_stage2": 0.48058879375457764, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2332 }, { "completion_length": 10.234375, "epoch": 0.4087962151743473, "grad_norm": 18.67898869078281, "kl": 0.158203125, "learning_rate": 5.913790082355002e-07, "loss": 0.0191, "reward": 1.4799710512161255, "reward_std": 0.2335529327392578, "rewards/accuracy_reward_stage2": 0.6205961108207703, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2333 }, { "completion_length": 9.296875, "epoch": 0.4089714385841948, "grad_norm": 20.62020309808911, "kl": 0.119140625, "learning_rate": 5.912037848256527e-07, "loss": 0.0477, "reward": 1.198518991470337, "reward_std": 0.3195599317550659, "rewards/accuracy_reward_stage2": 0.4485190510749817, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2334 }, { "completion_length": 9.1875, "epoch": 0.4091466619940424, "grad_norm": 15.477641285474997, "kl": 0.048583984375, "learning_rate": 5.910285614158052e-07, "loss": 0.0194, "reward": 1.541133999824524, "reward_std": 0.17401783168315887, "rewards/accuracy_reward_stage2": 0.5411341190338135, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2335 }, { "completion_length": 22.546875, "epoch": 0.40932188540389, "grad_norm": 16.15447248636756, "kl": 0.171875, "learning_rate": 5.908533380059576e-07, "loss": 0.025, "reward": 1.6095988750457764, "reward_std": 0.27953198552131653, "rewards/accuracy_reward_stage2": 0.6252239942550659, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2336 }, { "completion_length": 7.8125, "epoch": 0.4094971088137375, "grad_norm": 14.97687341250913, "kl": 0.060791015625, "learning_rate": 5.906781145961101e-07, "loss": 0.0244, "reward": 1.546875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward_stage2": 0.671875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2337 }, { "completion_length": 9.25, "epoch": 0.40967233222358507, "grad_norm": 20.206446418912705, "kl": 0.055908203125, "learning_rate": 5.905028911862626e-07, "loss": 0.0224, "reward": 1.4281362295150757, "reward_std": 0.1827203780412674, "rewards/accuracy_reward_stage2": 0.4281362295150757, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2338 }, { "completion_length": 14.078125, "epoch": 0.4098475556334326, "grad_norm": 18.36376649072188, "kl": 0.080078125, "learning_rate": 5.903276677764148e-07, "loss": -0.0034, "reward": 1.5158047676086426, "reward_std": 0.28939586877822876, "rewards/accuracy_reward_stage2": 0.6564297080039978, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2339 }, { "completion_length": 7.484375, "epoch": 0.41002277904328016, "grad_norm": 19.562603721538245, "kl": 0.08447265625, "learning_rate": 5.901524443665673e-07, "loss": 0.0014, "reward": 1.4270833730697632, "reward_std": 0.32159775495529175, "rewards/accuracy_reward_stage2": 0.4427083432674408, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2340 }, { "completion_length": 14.015625, "epoch": 0.41019800245312776, "grad_norm": 77.16144099299103, "kl": 0.37109375, "learning_rate": 5.899772209567197e-07, "loss": 0.0955, "reward": 1.4920990467071533, "reward_std": 0.1890915185213089, "rewards/accuracy_reward_stage2": 0.6483490467071533, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2341 }, { "completion_length": 9.3125, "epoch": 0.4103732258629753, "grad_norm": 21.558413026021842, "kl": 0.2490234375, "learning_rate": 5.898019975468722e-07, "loss": 0.0554, "reward": 1.7347142696380615, "reward_std": 0.21133540570735931, "rewards/accuracy_reward_stage2": 0.7503393888473511, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2342 }, { "completion_length": 10.125, "epoch": 0.41054844927282286, "grad_norm": 22.687231445611207, "kl": 0.16796875, "learning_rate": 5.896267741370247e-07, "loss": 0.0671, "reward": 1.3260161876678467, "reward_std": 0.19513970613479614, "rewards/accuracy_reward_stage2": 0.5760161876678467, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2343 }, { "completion_length": 9.734375, "epoch": 0.4107236726826704, "grad_norm": 18.09101369627577, "kl": 0.0947265625, "learning_rate": 5.894515507271771e-07, "loss": 0.0018, "reward": 1.4208391904830933, "reward_std": 0.28820210695266724, "rewards/accuracy_reward_stage2": 0.43646419048309326, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2344 }, { "completion_length": 9.921875, "epoch": 0.41089889609251795, "grad_norm": 13.054076648742582, "kl": 0.047607421875, "learning_rate": 5.892763273173296e-07, "loss": 0.0191, "reward": 1.3740955591201782, "reward_std": 0.06014459952712059, "rewards/accuracy_reward_stage2": 0.49909549951553345, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2345 }, { "completion_length": 11.375, "epoch": 0.4110741195023655, "grad_norm": 12.061553291437448, "kl": 0.07275390625, "learning_rate": 5.891011039074821e-07, "loss": 0.0292, "reward": 1.8303499221801758, "reward_std": 0.11832354962825775, "rewards/accuracy_reward_stage2": 0.8303500413894653, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2346 }, { "completion_length": 17.546875, "epoch": 0.41124934291221305, "grad_norm": 15.867100667252611, "kl": 0.050537109375, "learning_rate": 5.889258804976345e-07, "loss": -0.024, "reward": 1.6114246845245361, "reward_std": 0.2627890110015869, "rewards/accuracy_reward_stage2": 0.6270497441291809, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2347 }, { "completion_length": 12.171875, "epoch": 0.41142456632206065, "grad_norm": 20.27755333621668, "kl": 0.0703125, "learning_rate": 5.887506570877869e-07, "loss": 0.0282, "reward": 1.559401035308838, "reward_std": 0.2290632128715515, "rewards/accuracy_reward_stage2": 0.5594009160995483, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2348 }, { "completion_length": 11.90625, "epoch": 0.4115997897319082, "grad_norm": 23.597797783521305, "kl": 0.07470703125, "learning_rate": 5.885754336779393e-07, "loss": 0.0299, "reward": 1.4080188274383545, "reward_std": 0.2232939600944519, "rewards/accuracy_reward_stage2": 0.5330188274383545, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2349 }, { "completion_length": 17.25, "epoch": 0.41177501314175574, "grad_norm": 9.278143499351788, "kl": 0.06640625, "learning_rate": 5.884002102680918e-07, "loss": -0.0134, "reward": 1.5017169713974, "reward_std": 0.0607755072414875, "rewards/accuracy_reward_stage2": 0.5173419117927551, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2350 }, { "completion_length": 12.953125, "epoch": 0.4119502365516033, "grad_norm": 23.03371984485041, "kl": 0.10595703125, "learning_rate": 5.882249868582443e-07, "loss": 0.0423, "reward": 1.599036693572998, "reward_std": 0.14129234850406647, "rewards/accuracy_reward_stage2": 0.5990367531776428, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2351 }, { "completion_length": 15.171875, "epoch": 0.41212545996145084, "grad_norm": 18.022645161745377, "kl": 0.054931640625, "learning_rate": 5.880497634483966e-07, "loss": -0.0222, "reward": 1.696798324584961, "reward_std": 0.17698150873184204, "rewards/accuracy_reward_stage2": 0.7124233245849609, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2352 }, { "completion_length": 9.84375, "epoch": 0.4123006833712984, "grad_norm": 12.495739623364917, "kl": 0.034912109375, "learning_rate": 5.878745400385491e-07, "loss": 0.014, "reward": 1.3732178211212158, "reward_std": 0.17081034183502197, "rewards/accuracy_reward_stage2": 0.37321779131889343, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2353 }, { "completion_length": 17.03125, "epoch": 0.412475906781146, "grad_norm": 20.169046026792728, "kl": 0.0478515625, "learning_rate": 5.876993166287016e-07, "loss": -0.0167, "reward": 1.5913221836090088, "reward_std": 0.24479767680168152, "rewards/accuracy_reward_stage2": 0.6069472432136536, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2354 }, { "completion_length": 9.75, "epoch": 0.41265113019099353, "grad_norm": 19.99644756259702, "kl": 0.1630859375, "learning_rate": 5.87524093218854e-07, "loss": 0.0653, "reward": 1.5812091827392578, "reward_std": 0.24733664095401764, "rewards/accuracy_reward_stage2": 0.581209123134613, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2355 }, { "completion_length": 10.484375, "epoch": 0.4128263536008411, "grad_norm": 17.799954110495957, "kl": 0.1416015625, "learning_rate": 5.873488698090065e-07, "loss": 0.0207, "reward": 1.379780888557434, "reward_std": 0.19786177575588226, "rewards/accuracy_reward_stage2": 0.5204058289527893, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2356 }, { "completion_length": 8.453125, "epoch": 0.4130015770106886, "grad_norm": 20.148851520890076, "kl": 0.08203125, "learning_rate": 5.871736463991589e-07, "loss": -0.0114, "reward": 1.6698863506317139, "reward_std": 0.26759451627731323, "rewards/accuracy_reward_stage2": 0.6855113506317139, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2357 }, { "completion_length": 8.734375, "epoch": 0.41317680042053617, "grad_norm": 10.823426387963599, "kl": 0.1337890625, "learning_rate": 5.869984229893113e-07, "loss": -0.0319, "reward": 1.6802399158477783, "reward_std": 0.14132839441299438, "rewards/accuracy_reward_stage2": 0.7114899754524231, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2358 }, { "completion_length": 7.265625, "epoch": 0.4133520238303837, "grad_norm": 19.50099203681281, "kl": 0.1533203125, "learning_rate": 5.868231995794638e-07, "loss": -0.0199, "reward": 1.6012730598449707, "reward_std": 0.31847089529037476, "rewards/accuracy_reward_stage2": 0.6481481194496155, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2359 }, { "completion_length": 7.390625, "epoch": 0.4135272472402313, "grad_norm": 24.237481396274326, "kl": 0.369140625, "learning_rate": 5.866479761696162e-07, "loss": 0.0941, "reward": 1.1497249603271484, "reward_std": 0.31039959192276, "rewards/accuracy_reward_stage2": 0.4153498709201813, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2360 }, { "completion_length": 9.921875, "epoch": 0.41370247065007887, "grad_norm": 16.566273456490055, "kl": 0.1748046875, "learning_rate": 5.864727527597687e-07, "loss": 0.0308, "reward": 1.4874138832092285, "reward_std": 0.17307858169078827, "rewards/accuracy_reward_stage2": 0.6280390024185181, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2361 }, { "completion_length": 8.625, "epoch": 0.4138776940599264, "grad_norm": 11.367175152946368, "kl": 0.037109375, "learning_rate": 5.862975293499212e-07, "loss": 0.0148, "reward": 1.4189951419830322, "reward_std": 0.07283923774957657, "rewards/accuracy_reward_stage2": 0.41899508237838745, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2362 }, { "completion_length": 14.890625, "epoch": 0.41405291746977396, "grad_norm": 23.14965223243577, "kl": 0.11279296875, "learning_rate": 5.861223059400736e-07, "loss": 0.0601, "reward": 1.551171064376831, "reward_std": 0.19559019804000854, "rewards/accuracy_reward_stage2": 0.6761711239814758, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2363 }, { "completion_length": 12.125, "epoch": 0.4142281408796215, "grad_norm": 13.994240957679672, "kl": 0.1396484375, "learning_rate": 5.859470825302261e-07, "loss": 0.0115, "reward": 1.59100341796875, "reward_std": 0.14728805422782898, "rewards/accuracy_reward_stage2": 0.6066284775733948, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2364 }, { "completion_length": 9.78125, "epoch": 0.41440336428946906, "grad_norm": 21.323259031008334, "kl": 0.1328125, "learning_rate": 5.857718591203784e-07, "loss": 0.0089, "reward": 1.6100748777389526, "reward_std": 0.28893154859542847, "rewards/accuracy_reward_stage2": 0.6256999373435974, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2365 }, { "completion_length": 14.53125, "epoch": 0.4145785876993166, "grad_norm": 27.66782558140099, "kl": 0.1904296875, "learning_rate": 5.855966357105309e-07, "loss": 0.0761, "reward": 1.4185956716537476, "reward_std": 0.2524084746837616, "rewards/accuracy_reward_stage2": 0.5435957312583923, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2366 }, { "completion_length": 6.34375, "epoch": 0.4147538111091642, "grad_norm": 18.43566490915197, "kl": 0.040283203125, "learning_rate": 5.854214123006834e-07, "loss": 0.0161, "reward": 1.792750358581543, "reward_std": 0.12878680229187012, "rewards/accuracy_reward_stage2": 0.7927502393722534, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2367 }, { "completion_length": 14.59375, "epoch": 0.41492903451901175, "grad_norm": 13.91831957839429, "kl": 0.083984375, "learning_rate": 5.852461888908357e-07, "loss": 0.0056, "reward": 1.3589837551116943, "reward_std": 0.1351110190153122, "rewards/accuracy_reward_stage2": 0.37460869550704956, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2368 }, { "completion_length": 10.5, "epoch": 0.4151042579288593, "grad_norm": 16.737366881330868, "kl": 0.1142578125, "learning_rate": 5.850709654809882e-07, "loss": 0.0019, "reward": 1.6517565250396729, "reward_std": 0.16313903033733368, "rewards/accuracy_reward_stage2": 0.6673814654350281, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2369 }, { "completion_length": 14.828125, "epoch": 0.41527948133870685, "grad_norm": 17.248787576180106, "kl": 0.220703125, "learning_rate": 5.848957420711407e-07, "loss": -0.0111, "reward": 1.3924638032913208, "reward_std": 0.207502081990242, "rewards/accuracy_reward_stage2": 0.6893388032913208, "rewards/format_reward_stage1_pointerpad": 0.703125, "scores/accuracy_reward_stage2": 0.703125, "step": 2370 }, { "completion_length": 20.984375, "epoch": 0.4154547047485544, "grad_norm": 20.195264394665625, "kl": 0.06982421875, "learning_rate": 5.847205186612931e-07, "loss": 0.0116, "reward": 1.4419444799423218, "reward_std": 0.1619192659854889, "rewards/accuracy_reward_stage2": 0.582569420337677, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2371 }, { "completion_length": 8.703125, "epoch": 0.41562992815840194, "grad_norm": 23.724709031407677, "kl": 0.1279296875, "learning_rate": 5.845452952514456e-07, "loss": 0.0069, "reward": 1.5115091800689697, "reward_std": 0.25413644313812256, "rewards/accuracy_reward_stage2": 0.6521342396736145, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2372 }, { "completion_length": 11.46875, "epoch": 0.41580515156824954, "grad_norm": 24.900258630232408, "kl": 0.2255859375, "learning_rate": 5.84370071841598e-07, "loss": 0.0916, "reward": 1.3193838596343994, "reward_std": 0.2140500545501709, "rewards/accuracy_reward_stage2": 0.44438380002975464, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2373 }, { "completion_length": 12.390625, "epoch": 0.4159803749780971, "grad_norm": 13.963494157079705, "kl": 0.134765625, "learning_rate": 5.841948484317505e-07, "loss": 0.0195, "reward": 1.5970426797866821, "reward_std": 0.18137384951114655, "rewards/accuracy_reward_stage2": 0.6126677989959717, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2374 }, { "completion_length": 10.265625, "epoch": 0.41615559838794464, "grad_norm": 26.68313987733041, "kl": 0.0576171875, "learning_rate": 5.84019625021903e-07, "loss": 0.023, "reward": 1.7117927074432373, "reward_std": 0.14283138513565063, "rewards/accuracy_reward_stage2": 0.7117927670478821, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2375 }, { "completion_length": 6.828125, "epoch": 0.4163308217977922, "grad_norm": 19.976590487399438, "kl": 0.038330078125, "learning_rate": 5.838444016120554e-07, "loss": 0.0153, "reward": 1.5760102272033691, "reward_std": 0.1239052414894104, "rewards/accuracy_reward_stage2": 0.5760102868080139, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2376 }, { "completion_length": 13.8125, "epoch": 0.41650604520763973, "grad_norm": 24.706278623692512, "kl": 0.109375, "learning_rate": 5.836691782022077e-07, "loss": 0.0437, "reward": 1.5468864440917969, "reward_std": 0.24405643343925476, "rewards/accuracy_reward_stage2": 0.5468865036964417, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2377 }, { "completion_length": 9.078125, "epoch": 0.4166812686174873, "grad_norm": 23.394912393901517, "kl": 0.171875, "learning_rate": 5.834939547923601e-07, "loss": 0.0247, "reward": 1.769432544708252, "reward_std": 0.27068987488746643, "rewards/accuracy_reward_stage2": 0.785057544708252, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2378 }, { "completion_length": 11.515625, "epoch": 0.4168564920273349, "grad_norm": 14.7960789603516, "kl": 0.0257568359375, "learning_rate": 5.833187313825126e-07, "loss": 0.0103, "reward": 1.6770833730697632, "reward_std": 0.13835059106349945, "rewards/accuracy_reward_stage2": 0.6770833730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2379 }, { "completion_length": 14.671875, "epoch": 0.4170317154371824, "grad_norm": 14.686541826227506, "kl": 0.04296875, "learning_rate": 5.831435079726651e-07, "loss": -0.0271, "reward": 1.234375, "reward_std": 0.19044627249240875, "rewards/accuracy_reward_stage2": 0.25, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2380 }, { "completion_length": 9.390625, "epoch": 0.41720693884703, "grad_norm": 16.51517195261775, "kl": 0.1875, "learning_rate": 5.829682845628175e-07, "loss": 0.0748, "reward": 1.4675534963607788, "reward_std": 0.16185539960861206, "rewards/accuracy_reward_stage2": 0.4675534665584564, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2381 }, { "completion_length": 21.046875, "epoch": 0.4173821622568775, "grad_norm": 19.278164592265732, "kl": 0.091796875, "learning_rate": 5.8279306115297e-07, "loss": 0.0366, "reward": 1.4626104831695557, "reward_std": 0.12916041910648346, "rewards/accuracy_reward_stage2": 0.5876104831695557, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2382 }, { "completion_length": 10.4375, "epoch": 0.41755738566672507, "grad_norm": 17.740228123681344, "kl": 0.1787109375, "learning_rate": 5.826178377431225e-07, "loss": 0.0323, "reward": 1.526477336883545, "reward_std": 0.1329089105129242, "rewards/accuracy_reward_stage2": 0.6671023964881897, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2383 }, { "completion_length": 13.078125, "epoch": 0.4177326090765726, "grad_norm": 38.03817298589301, "kl": 0.09716796875, "learning_rate": 5.824426143332749e-07, "loss": 0.0174, "reward": 1.5752477645874023, "reward_std": 0.19630175828933716, "rewards/accuracy_reward_stage2": 0.5908727645874023, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2384 }, { "completion_length": 10.453125, "epoch": 0.41790783248642016, "grad_norm": 20.96990598678958, "kl": 0.1630859375, "learning_rate": 5.822673909234274e-07, "loss": 0.022, "reward": 1.7025611400604248, "reward_std": 0.22266198694705963, "rewards/accuracy_reward_stage2": 0.7181861400604248, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2385 }, { "completion_length": 11.296875, "epoch": 0.41808305589626776, "grad_norm": 11.484983822112438, "kl": 0.050048828125, "learning_rate": 5.820921675135799e-07, "loss": -0.0242, "reward": 1.604400873184204, "reward_std": 0.05826300382614136, "rewards/accuracy_reward_stage2": 0.6200259327888489, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2386 }, { "completion_length": 14.078125, "epoch": 0.4182582793061153, "grad_norm": 18.563600984589137, "kl": 0.1982421875, "learning_rate": 5.819169441037323e-07, "loss": -0.009, "reward": 1.8097143173217773, "reward_std": 0.20771729946136475, "rewards/accuracy_reward_stage2": 0.8409643173217773, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2387 }, { "completion_length": 13.890625, "epoch": 0.41843350271596286, "grad_norm": 19.485960652460495, "kl": 0.125, "learning_rate": 5.817417206938847e-07, "loss": 0.05, "reward": 1.6337437629699707, "reward_std": 0.258608341217041, "rewards/accuracy_reward_stage2": 0.6337437033653259, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2388 }, { "completion_length": 9.421875, "epoch": 0.4186087261258104, "grad_norm": 14.326349224837957, "kl": 0.045654296875, "learning_rate": 5.815664972840371e-07, "loss": 0.0183, "reward": 1.6053376197814941, "reward_std": 0.13150310516357422, "rewards/accuracy_reward_stage2": 0.6053376197814941, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2389 }, { "completion_length": 16.984375, "epoch": 0.41878394953565795, "grad_norm": 14.73197467489026, "kl": 0.1318359375, "learning_rate": 5.813912738741895e-07, "loss": 0.0025, "reward": 1.8244647979736328, "reward_std": 0.14700627326965332, "rewards/accuracy_reward_stage2": 0.8557147979736328, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2390 }, { "completion_length": 7.65625, "epoch": 0.4189591729455055, "grad_norm": 18.481419837735878, "kl": 0.158203125, "learning_rate": 5.81216050464342e-07, "loss": 0.0193, "reward": 1.6163908243179321, "reward_std": 0.2330131232738495, "rewards/accuracy_reward_stage2": 0.7570158243179321, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2391 }, { "completion_length": 12.140625, "epoch": 0.4191343963553531, "grad_norm": 18.545690051338077, "kl": 0.1328125, "learning_rate": 5.810408270544944e-07, "loss": 0.0532, "reward": 1.7197383642196655, "reward_std": 0.2599320113658905, "rewards/accuracy_reward_stage2": 0.7197383046150208, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2392 }, { "completion_length": 7.421875, "epoch": 0.41930961976520065, "grad_norm": 21.630837755379087, "kl": 0.20703125, "learning_rate": 5.808656036446469e-07, "loss": 0.0539, "reward": 1.6220672130584717, "reward_std": 0.21142326295375824, "rewards/accuracy_reward_stage2": 0.6376922130584717, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2393 }, { "completion_length": 15.875, "epoch": 0.4194848431750482, "grad_norm": 22.5945277461123, "kl": 0.1240234375, "learning_rate": 5.806903802347993e-07, "loss": 0.0053, "reward": 1.5958011150360107, "reward_std": 0.2829567790031433, "rewards/accuracy_reward_stage2": 0.6114259958267212, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2394 }, { "completion_length": 8.8125, "epoch": 0.41966006658489574, "grad_norm": 22.150968540044698, "kl": 0.16015625, "learning_rate": 5.805151568249518e-07, "loss": 0.0473, "reward": 1.3075213432312012, "reward_std": 0.33274227380752563, "rewards/accuracy_reward_stage2": 0.4481462836265564, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2395 }, { "completion_length": 15.859375, "epoch": 0.4198352899947433, "grad_norm": 58.532722707916875, "kl": 0.275390625, "learning_rate": 5.803399334151043e-07, "loss": 0.1387, "reward": 1.5200127363204956, "reward_std": 0.15271207690238953, "rewards/accuracy_reward_stage2": 0.6450127363204956, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2396 }, { "completion_length": 11.015625, "epoch": 0.42001051340459084, "grad_norm": 15.509242191723505, "kl": 0.07861328125, "learning_rate": 5.801647100052566e-07, "loss": -0.0127, "reward": 1.5883493423461914, "reward_std": 0.15838760137557983, "rewards/accuracy_reward_stage2": 0.6039743423461914, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2397 }, { "completion_length": 7.078125, "epoch": 0.4201857368144384, "grad_norm": 15.094137285652373, "kl": 0.1279296875, "learning_rate": 5.799894865954091e-07, "loss": -0.006, "reward": 1.484375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2398 }, { "completion_length": 12.078125, "epoch": 0.420360960224286, "grad_norm": 19.19974924019669, "kl": 0.306640625, "learning_rate": 5.798142631855616e-07, "loss": 0.0341, "reward": 1.3660731315612793, "reward_std": 0.29376906156539917, "rewards/accuracy_reward_stage2": 0.5223231315612793, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2399 }, { "completion_length": 11.984375, "epoch": 0.42053618363413353, "grad_norm": 14.368830703427102, "kl": 0.062255859375, "learning_rate": 5.79639039775714e-07, "loss": 0.0249, "reward": 1.515123724937439, "reward_std": 0.2202598750591278, "rewards/accuracy_reward_stage2": 0.5151236653327942, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2400 }, { "completion_length": 8.796875, "epoch": 0.4207114070439811, "grad_norm": 19.06424785954498, "kl": 0.11865234375, "learning_rate": 5.794638163658665e-07, "loss": 0.0065, "reward": 1.5015455484390259, "reward_std": 0.3422601819038391, "rewards/accuracy_reward_stage2": 0.5327955484390259, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2401 }, { "completion_length": 8.953125, "epoch": 0.4208866304538286, "grad_norm": 19.732187174356824, "kl": 0.0625, "learning_rate": 5.792885929560189e-07, "loss": 0.025, "reward": 1.4542206525802612, "reward_std": 0.17934425175189972, "rewards/accuracy_reward_stage2": 0.5792206525802612, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2402 }, { "completion_length": 7.515625, "epoch": 0.42106185386367617, "grad_norm": 17.66601038639952, "kl": 0.07861328125, "learning_rate": 5.791133695461713e-07, "loss": -0.0128, "reward": 1.6888391971588135, "reward_std": 0.20628750324249268, "rewards/accuracy_reward_stage2": 0.7044641375541687, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2403 }, { "completion_length": 8.359375, "epoch": 0.4212370772735237, "grad_norm": 22.577077448823406, "kl": 0.068359375, "learning_rate": 5.789381461363238e-07, "loss": -0.0056, "reward": 1.601570725440979, "reward_std": 0.3855169117450714, "rewards/accuracy_reward_stage2": 0.617195725440979, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2404 }, { "completion_length": 15.1875, "epoch": 0.4214123006833713, "grad_norm": 8.304898189657713, "kl": 0.0174560546875, "learning_rate": 5.787629227264762e-07, "loss": 0.007, "reward": 1.7204861640930176, "reward_std": 0.054775021970272064, "rewards/accuracy_reward_stage2": 0.7204861044883728, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2405 }, { "completion_length": 9.78125, "epoch": 0.42158752409321887, "grad_norm": 21.344555928253595, "kl": 0.068359375, "learning_rate": 5.785876993166287e-07, "loss": 0.0274, "reward": 1.5492045879364014, "reward_std": 0.21212232112884521, "rewards/accuracy_reward_stage2": 0.6742044687271118, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2406 }, { "completion_length": 12.0625, "epoch": 0.4217627475030664, "grad_norm": 16.746001989010427, "kl": 0.140625, "learning_rate": 5.784124759067811e-07, "loss": 0.0188, "reward": 1.6990876197814941, "reward_std": 0.20534729957580566, "rewards/accuracy_reward_stage2": 0.7147126197814941, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2407 }, { "completion_length": 28.90625, "epoch": 0.42193797091291396, "grad_norm": 18.15173262927103, "kl": 0.0257568359375, "learning_rate": 5.782372524969335e-07, "loss": -0.0337, "reward": 1.58535635471344, "reward_std": 0.2794279158115387, "rewards/accuracy_reward_stage2": 0.6009812951087952, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2408 }, { "completion_length": 9.203125, "epoch": 0.4221131943227615, "grad_norm": 21.973624282288817, "kl": 0.11083984375, "learning_rate": 5.78062029087086e-07, "loss": 0.0086, "reward": 1.8336703777313232, "reward_std": 0.14962394535541534, "rewards/accuracy_reward_stage2": 0.8492953777313232, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2409 }, { "completion_length": 8.8125, "epoch": 0.42228841773260906, "grad_norm": 18.160116177629245, "kl": 0.061279296875, "learning_rate": 5.778868056772384e-07, "loss": 0.0245, "reward": 1.5550178289413452, "reward_std": 0.17941808700561523, "rewards/accuracy_reward_stage2": 0.5550177693367004, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2410 }, { "completion_length": 10.015625, "epoch": 0.42246364114245666, "grad_norm": 14.5878725632844, "kl": 0.0771484375, "learning_rate": 5.777115822673909e-07, "loss": -0.0131, "reward": 1.8403640985488892, "reward_std": 0.15390296280384064, "rewards/accuracy_reward_stage2": 0.8559890985488892, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2411 }, { "completion_length": 8.1875, "epoch": 0.4226388645523042, "grad_norm": 18.022444942085286, "kl": 0.19921875, "learning_rate": 5.775363588575434e-07, "loss": -0.0373, "reward": 1.6093087196350098, "reward_std": 0.26443642377853394, "rewards/accuracy_reward_stage2": 0.6561837196350098, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2412 }, { "completion_length": 9.109375, "epoch": 0.42281408796215175, "grad_norm": 14.90491302423205, "kl": 0.1328125, "learning_rate": 5.773611354476958e-07, "loss": 0.053, "reward": 1.486379861831665, "reward_std": 0.10205523669719696, "rewards/accuracy_reward_stage2": 0.4863799214363098, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2413 }, { "completion_length": 9.453125, "epoch": 0.4229893113719993, "grad_norm": 19.41092038526134, "kl": 0.33203125, "learning_rate": 5.771859120378483e-07, "loss": 0.0286, "reward": 1.4454668760299683, "reward_std": 0.23624253273010254, "rewards/accuracy_reward_stage2": 0.6173418760299683, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2414 }, { "completion_length": 7.765625, "epoch": 0.42316453478184685, "grad_norm": 18.530930531417717, "kl": 0.2451171875, "learning_rate": 5.770106886280008e-07, "loss": 0.0182, "reward": 1.3699134588241577, "reward_std": 0.2863275408744812, "rewards/accuracy_reward_stage2": 0.5261634588241577, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2415 }, { "completion_length": 7.953125, "epoch": 0.4233397581916944, "grad_norm": 19.940209981188683, "kl": 0.23828125, "learning_rate": 5.768354652181531e-07, "loss": -0.0313, "reward": 1.5837643146514893, "reward_std": 0.2522251605987549, "rewards/accuracy_reward_stage2": 0.6306391954421997, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2416 }, { "completion_length": 12.578125, "epoch": 0.42351498160154194, "grad_norm": 20.306264227490708, "kl": 0.2490234375, "learning_rate": 5.766602418083055e-07, "loss": -0.0231, "reward": 1.3948123455047607, "reward_std": 0.3546648919582367, "rewards/accuracy_reward_stage2": 0.5666873455047607, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2417 }, { "completion_length": 11.53125, "epoch": 0.42369020501138954, "grad_norm": 20.154209703831523, "kl": 0.12890625, "learning_rate": 5.764850183984579e-07, "loss": 0.0349, "reward": 1.2792103290557861, "reward_std": 0.2176147699356079, "rewards/accuracy_reward_stage2": 0.4198353886604309, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2418 }, { "completion_length": 12.59375, "epoch": 0.4238654284212371, "grad_norm": 15.276341266991679, "kl": 0.068359375, "learning_rate": 5.763097949886104e-07, "loss": -0.0056, "reward": 1.411638617515564, "reward_std": 0.21439909934997559, "rewards/accuracy_reward_stage2": 0.4272635877132416, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2419 }, { "completion_length": 13.375, "epoch": 0.42404065183108464, "grad_norm": 25.83525267716047, "kl": 0.166015625, "learning_rate": 5.761345715787629e-07, "loss": 0.009, "reward": 1.4438656568527222, "reward_std": 0.40633732080459595, "rewards/accuracy_reward_stage2": 0.47511565685272217, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2420 }, { "completion_length": 30.53125, "epoch": 0.4242158752409322, "grad_norm": 19.080239012482636, "kl": 0.1708984375, "learning_rate": 5.759593481689153e-07, "loss": 0.0242, "reward": 1.6823811531066895, "reward_std": 0.23308956623077393, "rewards/accuracy_reward_stage2": 0.6980061531066895, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2421 }, { "completion_length": 14.0625, "epoch": 0.42439109865077973, "grad_norm": 17.221754256398736, "kl": 0.1279296875, "learning_rate": 5.757841247590678e-07, "loss": 0.0179, "reward": 1.3743314743041992, "reward_std": 0.16007143259048462, "rewards/accuracy_reward_stage2": 0.5149564743041992, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2422 }, { "completion_length": 10.765625, "epoch": 0.4245663220606273, "grad_norm": 17.92302781537834, "kl": 0.13671875, "learning_rate": 5.756089013492203e-07, "loss": -0.0002, "reward": 1.69929838180542, "reward_std": 0.11890119314193726, "rewards/accuracy_reward_stage2": 0.8399233222007751, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2423 }, { "completion_length": 8.21875, "epoch": 0.4247415454704749, "grad_norm": 19.420077030170336, "kl": 0.2158203125, "learning_rate": 5.754336779393727e-07, "loss": -0.0744, "reward": 1.741911768913269, "reward_std": 0.3283819556236267, "rewards/accuracy_reward_stage2": 0.804411768913269, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2424 }, { "completion_length": 13.265625, "epoch": 0.4249167688803224, "grad_norm": 20.955694465793627, "kl": 0.2451171875, "learning_rate": 5.752584545295252e-07, "loss": 0.0293, "reward": 1.463441252708435, "reward_std": 0.2972392439842224, "rewards/accuracy_reward_stage2": 0.49469125270843506, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2425 }, { "completion_length": 15.734375, "epoch": 0.42509199229017, "grad_norm": 21.360009650947255, "kl": 0.044677734375, "learning_rate": 5.750832311196776e-07, "loss": 0.0179, "reward": 1.5914231538772583, "reward_std": 0.16339072585105896, "rewards/accuracy_reward_stage2": 0.7164231538772583, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2426 }, { "completion_length": 9.859375, "epoch": 0.4252672157000175, "grad_norm": 13.966446126628513, "kl": 0.1357421875, "learning_rate": 5.7490800770983e-07, "loss": 0.0545, "reward": 1.5450599193572998, "reward_std": 0.10719159245491028, "rewards/accuracy_reward_stage2": 0.6700599789619446, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2427 }, { "completion_length": 11.625, "epoch": 0.42544243910986507, "grad_norm": 19.51937307944042, "kl": 0.15234375, "learning_rate": 5.747327842999824e-07, "loss": 0.0611, "reward": 1.6079663038253784, "reward_std": 0.1823212057352066, "rewards/accuracy_reward_stage2": 0.7329663038253784, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2428 }, { "completion_length": 9.90625, "epoch": 0.4256176625197126, "grad_norm": 14.795791933246239, "kl": 0.201171875, "learning_rate": 5.745575608901348e-07, "loss": -0.0012, "reward": 1.6733319759368896, "reward_std": 0.28265178203582764, "rewards/accuracy_reward_stage2": 0.7045819759368896, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2429 }, { "completion_length": 12.796875, "epoch": 0.4257928859295602, "grad_norm": 22.40763134045858, "kl": 0.130859375, "learning_rate": 5.743823374802873e-07, "loss": 0.0081, "reward": 1.3852684497833252, "reward_std": 0.20095396041870117, "rewards/accuracy_reward_stage2": 0.4008934497833252, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2430 }, { "completion_length": 12.4375, "epoch": 0.42596810933940776, "grad_norm": 17.35337680952318, "kl": 0.1767578125, "learning_rate": 5.742071140704398e-07, "loss": -0.0526, "reward": 1.5718660354614258, "reward_std": 0.3506343364715576, "rewards/accuracy_reward_stage2": 0.6187410354614258, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2431 }, { "completion_length": 7.578125, "epoch": 0.4261433327492553, "grad_norm": 19.778617819967412, "kl": 0.0419921875, "learning_rate": 5.740318906605922e-07, "loss": 0.0168, "reward": 1.5186505317687988, "reward_std": 0.27518704533576965, "rewards/accuracy_reward_stage2": 0.518650472164154, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2432 }, { "completion_length": 16.5, "epoch": 0.42631855615910286, "grad_norm": 15.30317538140248, "kl": 0.04345703125, "learning_rate": 5.738566672507447e-07, "loss": 0.0174, "reward": 1.603685736656189, "reward_std": 0.08226728439331055, "rewards/accuracy_reward_stage2": 0.603685736656189, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2433 }, { "completion_length": 11.25, "epoch": 0.4264937795689504, "grad_norm": 20.356807009322075, "kl": 0.2041015625, "learning_rate": 5.736814438408971e-07, "loss": 0.0398, "reward": 1.5039076805114746, "reward_std": 0.2799040675163269, "rewards/accuracy_reward_stage2": 0.5195327401161194, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2434 }, { "completion_length": 8.015625, "epoch": 0.42666900297879795, "grad_norm": 14.960749798331605, "kl": 0.053466796875, "learning_rate": 5.735062204310496e-07, "loss": 0.0214, "reward": 1.6923701763153076, "reward_std": 0.10996302962303162, "rewards/accuracy_reward_stage2": 0.6923701763153076, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2435 }, { "completion_length": 11.65625, "epoch": 0.4268442263886455, "grad_norm": 17.103308154395133, "kl": 0.05419921875, "learning_rate": 5.733309970212021e-07, "loss": 0.0216, "reward": 1.2994627952575684, "reward_std": 0.1993103176355362, "rewards/accuracy_reward_stage2": 0.29946279525756836, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2436 }, { "completion_length": 9.296875, "epoch": 0.4270194497984931, "grad_norm": 29.964224876637175, "kl": 0.11474609375, "learning_rate": 5.731557736113544e-07, "loss": -0.0325, "reward": 1.4375, "reward_std": 0.303472638130188, "rewards/accuracy_reward_stage2": 0.46875, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2437 }, { "completion_length": 7.3125, "epoch": 0.42719467320834065, "grad_norm": 17.02039408599391, "kl": 0.1513671875, "learning_rate": 5.729805502015069e-07, "loss": 0.0035, "reward": 1.6351255178451538, "reward_std": 0.1954609751701355, "rewards/accuracy_reward_stage2": 0.666375458240509, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2438 }, { "completion_length": 7.84375, "epoch": 0.4273698966181882, "grad_norm": 17.43461625512826, "kl": 0.1484375, "learning_rate": 5.728053267916594e-07, "loss": -0.0059, "reward": 1.4844474792480469, "reward_std": 0.20424222946166992, "rewards/accuracy_reward_stage2": 0.5156975984573364, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2439 }, { "completion_length": 7.171875, "epoch": 0.42754512002803574, "grad_norm": 12.179955934075089, "kl": 0.0947265625, "learning_rate": 5.726301033818118e-07, "loss": -0.0061, "reward": 1.6276520490646362, "reward_std": 0.09620348364114761, "rewards/accuracy_reward_stage2": 0.6432770490646362, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2440 }, { "completion_length": 8.4375, "epoch": 0.4277203434378833, "grad_norm": 19.9623888755173, "kl": 0.14453125, "learning_rate": 5.724548799719642e-07, "loss": 0.0579, "reward": 1.653172254562378, "reward_std": 0.1910550743341446, "rewards/accuracy_reward_stage2": 0.6531723737716675, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2441 }, { "completion_length": 9.46875, "epoch": 0.42789556684773083, "grad_norm": 23.38544505707681, "kl": 0.1513671875, "learning_rate": 5.722796565621166e-07, "loss": -0.0094, "reward": 1.4377069473266602, "reward_std": 0.3157484829425812, "rewards/accuracy_reward_stage2": 0.4689568877220154, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2442 }, { "completion_length": 6.890625, "epoch": 0.42807079025757844, "grad_norm": 19.817111523775036, "kl": 0.059814453125, "learning_rate": 5.721044331522691e-07, "loss": 0.0239, "reward": 1.669250249862671, "reward_std": 0.2423802763223648, "rewards/accuracy_reward_stage2": 0.6692502498626709, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2443 }, { "completion_length": 10.296875, "epoch": 0.428246013667426, "grad_norm": 17.613478551009646, "kl": 0.10302734375, "learning_rate": 5.719292097424216e-07, "loss": -0.0006, "reward": 1.6044498682022095, "reward_std": 0.27946341037750244, "rewards/accuracy_reward_stage2": 0.6200748682022095, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2444 }, { "completion_length": 10.3125, "epoch": 0.42842123707727353, "grad_norm": 22.91097469403058, "kl": 0.2734375, "learning_rate": 5.71753986332574e-07, "loss": 0.0262, "reward": 1.5360618829727173, "reward_std": 0.28243163228034973, "rewards/accuracy_reward_stage2": 0.5673118829727173, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2445 }, { "completion_length": 6.71875, "epoch": 0.4285964604871211, "grad_norm": 23.222689234508, "kl": 0.06640625, "learning_rate": 5.715787629227265e-07, "loss": -0.0014, "reward": 1.4264297485351562, "reward_std": 0.2040741741657257, "rewards/accuracy_reward_stage2": 0.6920547485351562, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2446 }, { "completion_length": 13.90625, "epoch": 0.4287716838969686, "grad_norm": 19.669761903072615, "kl": 0.150390625, "learning_rate": 5.714035395128789e-07, "loss": 0.0601, "reward": 1.4472002983093262, "reward_std": 0.2749719023704529, "rewards/accuracy_reward_stage2": 0.6972004175186157, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2447 }, { "completion_length": 9.671875, "epoch": 0.42894690730681617, "grad_norm": 16.145787029771405, "kl": 0.1640625, "learning_rate": 5.712283161030313e-07, "loss": 0.0152, "reward": 1.766361951828003, "reward_std": 0.14357280731201172, "rewards/accuracy_reward_stage2": 0.7976118922233582, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2448 }, { "completion_length": 12.640625, "epoch": 0.4291221307166637, "grad_norm": 21.49360682772942, "kl": 0.30078125, "learning_rate": 5.710530926931838e-07, "loss": 0.0347, "reward": 1.4998382329940796, "reward_std": 0.23182250559329987, "rewards/accuracy_reward_stage2": 0.6560881733894348, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2449 }, { "completion_length": 11.59375, "epoch": 0.4292973541265113, "grad_norm": 24.51503238020705, "kl": 0.203125, "learning_rate": 5.708778692833362e-07, "loss": 0.0413, "reward": 1.3393845558166504, "reward_std": 0.26047730445861816, "rewards/accuracy_reward_stage2": 0.6050096154212952, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2450 }, { "completion_length": 22.6875, "epoch": 0.42947257753635887, "grad_norm": 20.936424541852354, "kl": 0.30078125, "learning_rate": 5.707026458734887e-07, "loss": 0.026, "reward": 1.1758679151535034, "reward_std": 0.2872629165649414, "rewards/accuracy_reward_stage2": 0.3477429151535034, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2451 }, { "completion_length": 9.09375, "epoch": 0.4296478009462064, "grad_norm": 14.344676006512493, "kl": 0.072265625, "learning_rate": 5.705274224636412e-07, "loss": -0.0042, "reward": 1.702180027961731, "reward_std": 0.12235504388809204, "rewards/accuracy_reward_stage2": 0.7178049683570862, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2452 }, { "completion_length": 12.453125, "epoch": 0.42982302435605396, "grad_norm": 21.047318587932917, "kl": 0.1376953125, "learning_rate": 5.703521990537936e-07, "loss": -0.0279, "reward": 1.59375, "reward_std": 0.354972779750824, "rewards/accuracy_reward_stage2": 0.75, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2453 }, { "completion_length": 10.015625, "epoch": 0.4299982477659015, "grad_norm": 18.28343107689635, "kl": 0.0693359375, "learning_rate": 5.70176975643946e-07, "loss": -0.0035, "reward": 1.4171864986419678, "reward_std": 0.26342296600341797, "rewards/accuracy_reward_stage2": 0.4328114986419678, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2454 }, { "completion_length": 9.765625, "epoch": 0.43017347117574906, "grad_norm": 22.09226668402471, "kl": 0.142578125, "learning_rate": 5.700017522340984e-07, "loss": 0.0131, "reward": 1.3958333730697632, "reward_std": 0.3385624885559082, "rewards/accuracy_reward_stage2": 0.6614583730697632, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2455 }, { "completion_length": 8.984375, "epoch": 0.43034869458559666, "grad_norm": 16.869530097214756, "kl": 0.2099609375, "learning_rate": 5.698265288242509e-07, "loss": -0.093, "reward": 1.744043231010437, "reward_std": 0.2645736336708069, "rewards/accuracy_reward_stage2": 0.931543231010437, "rewards/format_reward_stage1_pointerpad": 0.8125, "scores/accuracy_reward_stage2": 0.8125, "step": 2456 }, { "completion_length": 8.28125, "epoch": 0.4305239179954442, "grad_norm": 18.478454670590494, "kl": 0.25, "learning_rate": 5.696513054144033e-07, "loss": -0.0083, "reward": 1.5073299407958984, "reward_std": 0.231063574552536, "rewards/accuracy_reward_stage2": 0.5542050004005432, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2457 }, { "completion_length": 12.03125, "epoch": 0.43069914140529175, "grad_norm": 16.2323364003327, "kl": 0.130859375, "learning_rate": 5.694760820045557e-07, "loss": 0.0108, "reward": 1.563699722290039, "reward_std": 0.20556166768074036, "rewards/accuracy_reward_stage2": 0.5793246030807495, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2458 }, { "completion_length": 8.9375, "epoch": 0.4308743648151393, "grad_norm": 18.48203471270133, "kl": 0.19140625, "learning_rate": 5.693008585947082e-07, "loss": 0.0595, "reward": 1.7979397773742676, "reward_std": 0.07218047231435776, "rewards/accuracy_reward_stage2": 0.8291897773742676, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2459 }, { "completion_length": 12.96875, "epoch": 0.43104958822498685, "grad_norm": 21.08552299144359, "kl": 0.08984375, "learning_rate": 5.691256351848607e-07, "loss": 0.0359, "reward": 1.418628215789795, "reward_std": 0.1847638040781021, "rewards/accuracy_reward_stage2": 0.4186283051967621, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2460 }, { "completion_length": 9.75, "epoch": 0.4312248116348344, "grad_norm": 9.440874169401996, "kl": 0.0859375, "learning_rate": 5.689504117750131e-07, "loss": -0.054, "reward": 1.639630913734436, "reward_std": 0.0937529057264328, "rewards/accuracy_reward_stage2": 0.795880913734436, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2461 }, { "completion_length": 10.859375, "epoch": 0.431400035044682, "grad_norm": 20.556394887831715, "kl": 0.0654296875, "learning_rate": 5.687751883651656e-07, "loss": 0.0262, "reward": 1.713507056236267, "reward_std": 0.21507734060287476, "rewards/accuracy_reward_stage2": 0.7135070562362671, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2462 }, { "completion_length": 8.765625, "epoch": 0.43157525845452954, "grad_norm": 16.17523527305367, "kl": 0.115234375, "learning_rate": 5.685999649553181e-07, "loss": 0.0405, "reward": 1.5567264556884766, "reward_std": 0.10888748615980148, "rewards/accuracy_reward_stage2": 0.6817264556884766, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2463 }, { "completion_length": 7.734375, "epoch": 0.4317504818643771, "grad_norm": 20.05211199152606, "kl": 0.2216796875, "learning_rate": 5.684247415454705e-07, "loss": 0.0134, "reward": 1.4463741779327393, "reward_std": 0.3639988303184509, "rewards/accuracy_reward_stage2": 0.47762417793273926, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2464 }, { "completion_length": 10.96875, "epoch": 0.43192570527422464, "grad_norm": 19.119738395479125, "kl": 0.056640625, "learning_rate": 5.68249518135623e-07, "loss": 0.0226, "reward": 1.7154829502105713, "reward_std": 0.20708030462265015, "rewards/accuracy_reward_stage2": 0.7154829502105713, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2465 }, { "completion_length": 23.453125, "epoch": 0.4321009286840722, "grad_norm": 16.74592932741985, "kl": 0.06396484375, "learning_rate": 5.680742947257752e-07, "loss": -0.0185, "reward": 1.7202057838439941, "reward_std": 0.13266494870185852, "rewards/accuracy_reward_stage2": 0.7358307838439941, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2466 }, { "completion_length": 11.796875, "epoch": 0.43227615209391973, "grad_norm": 19.32836770464296, "kl": 0.11279296875, "learning_rate": 5.678990713159277e-07, "loss": 0.0023, "reward": 1.5369431972503662, "reward_std": 0.2311927080154419, "rewards/accuracy_reward_stage2": 0.5525681376457214, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2467 }, { "completion_length": 6.59375, "epoch": 0.4324513755037673, "grad_norm": 15.640132670257081, "kl": 0.236328125, "learning_rate": 5.677238479060802e-07, "loss": 0.0211, "reward": 1.5886476039886475, "reward_std": 0.2265038788318634, "rewards/accuracy_reward_stage2": 0.6198976635932922, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2468 }, { "completion_length": 8.421875, "epoch": 0.4326265989136149, "grad_norm": 22.452989347045982, "kl": 0.1298828125, "learning_rate": 5.675486244962326e-07, "loss": 0.0077, "reward": 1.7098286151885986, "reward_std": 0.256533682346344, "rewards/accuracy_reward_stage2": 0.7254536151885986, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2469 }, { "completion_length": 12.65625, "epoch": 0.4328018223234624, "grad_norm": 22.12282683860938, "kl": 0.12353515625, "learning_rate": 5.673734010863851e-07, "loss": 0.0181, "reward": 1.2503278255462646, "reward_std": 0.3297951817512512, "rewards/accuracy_reward_stage2": 0.5159528255462646, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2470 }, { "completion_length": 12.390625, "epoch": 0.43297704573331, "grad_norm": 13.672982165021379, "kl": 0.126953125, "learning_rate": 5.671981776765375e-07, "loss": -0.0294, "reward": 1.3744654655456543, "reward_std": 0.14306974411010742, "rewards/accuracy_reward_stage2": 0.5307154059410095, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2471 }, { "completion_length": 9.171875, "epoch": 0.4331522691431575, "grad_norm": 20.3049908134314, "kl": 0.1943359375, "learning_rate": 5.6702295426669e-07, "loss": 0.0004, "reward": 1.2787423133850098, "reward_std": 0.34057146310806274, "rewards/accuracy_reward_stage2": 0.4349922835826874, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2472 }, { "completion_length": 8.375, "epoch": 0.43332749255300507, "grad_norm": 15.57066797986624, "kl": 0.1728515625, "learning_rate": 5.668477308568425e-07, "loss": -0.0273, "reward": 1.227855920791626, "reward_std": 0.2600477337837219, "rewards/accuracy_reward_stage2": 0.649730920791626, "rewards/format_reward_stage1_pointerpad": 0.578125, "scores/accuracy_reward_stage2": 0.578125, "step": 2473 }, { "completion_length": 9.015625, "epoch": 0.4335027159628526, "grad_norm": 16.92854945026168, "kl": 0.06689453125, "learning_rate": 5.666725074469949e-07, "loss": 0.0267, "reward": 1.605328917503357, "reward_std": 0.1650751680135727, "rewards/accuracy_reward_stage2": 0.6053289175033569, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2474 }, { "completion_length": 9.140625, "epoch": 0.4336779393727002, "grad_norm": 36.341656245826364, "kl": 0.32421875, "learning_rate": 5.664972840371474e-07, "loss": 0.0901, "reward": 1.2757874727249146, "reward_std": 0.2893187999725342, "rewards/accuracy_reward_stage2": 0.5414124727249146, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2475 }, { "completion_length": 8.640625, "epoch": 0.43385316278254776, "grad_norm": 16.659823748323067, "kl": 0.0390625, "learning_rate": 5.663220606272999e-07, "loss": 0.0156, "reward": 1.8644938468933105, "reward_std": 0.10486021637916565, "rewards/accuracy_reward_stage2": 0.8644937872886658, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2476 }, { "completion_length": 8.1875, "epoch": 0.4340283861923953, "grad_norm": 13.634458932508217, "kl": 0.19140625, "learning_rate": 5.661468372174522e-07, "loss": -0.0392, "reward": 1.6913225650787354, "reward_std": 0.19711565971374512, "rewards/accuracy_reward_stage2": 0.7381975054740906, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2477 }, { "completion_length": 10.078125, "epoch": 0.43420360960224286, "grad_norm": 20.019412732245556, "kl": 0.087890625, "learning_rate": 5.659716138076047e-07, "loss": 0.0352, "reward": 1.625917911529541, "reward_std": 0.22597362101078033, "rewards/accuracy_reward_stage2": 0.6259177923202515, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2478 }, { "completion_length": 8.421875, "epoch": 0.4343788330120904, "grad_norm": 20.72840852940728, "kl": 0.322265625, "learning_rate": 5.65796390397757e-07, "loss": 0.0199, "reward": 1.2296040058135986, "reward_std": 0.256480872631073, "rewards/accuracy_reward_stage2": 0.40147897601127625, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2479 }, { "completion_length": 15.046875, "epoch": 0.43455405642193795, "grad_norm": 16.062794937831228, "kl": 0.19140625, "learning_rate": 5.656211669879095e-07, "loss": -0.0408, "reward": 1.5773541927337646, "reward_std": 0.19875936210155487, "rewards/accuracy_reward_stage2": 0.6242291927337646, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2480 }, { "completion_length": 9.59375, "epoch": 0.43472927983178555, "grad_norm": 20.873383707835117, "kl": 0.11328125, "learning_rate": 5.65445943578062e-07, "loss": 0.0454, "reward": 1.7643051147460938, "reward_std": 0.12677878141403198, "rewards/accuracy_reward_stage2": 0.7643051147460938, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2481 }, { "completion_length": 8.40625, "epoch": 0.4349045032416331, "grad_norm": 15.811934560487519, "kl": 0.03125, "learning_rate": 5.652707201682144e-07, "loss": 0.0125, "reward": 1.6651852130889893, "reward_std": 0.25956788659095764, "rewards/accuracy_reward_stage2": 0.6651852130889893, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2482 }, { "completion_length": 10.484375, "epoch": 0.43507972665148065, "grad_norm": 18.44281784049491, "kl": 0.2294921875, "learning_rate": 5.650954967583669e-07, "loss": 0.0474, "reward": 1.1519947052001953, "reward_std": 0.1992306411266327, "rewards/accuracy_reward_stage2": 0.30824464559555054, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2483 }, { "completion_length": 9.453125, "epoch": 0.4352549500613282, "grad_norm": 17.669081506895804, "kl": 0.32421875, "learning_rate": 5.649202733485194e-07, "loss": 0.019, "reward": 1.6348530054092407, "reward_std": 0.2544611692428589, "rewards/accuracy_reward_stage2": 0.6817280054092407, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2484 }, { "completion_length": 11.28125, "epoch": 0.43543017347117574, "grad_norm": 18.882120225824227, "kl": 0.1455078125, "learning_rate": 5.647450499386718e-07, "loss": 0.0354, "reward": 1.2779840230941772, "reward_std": 0.32248374819755554, "rewards/accuracy_reward_stage2": 0.41860899329185486, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2485 }, { "completion_length": 7.59375, "epoch": 0.4356053968810233, "grad_norm": 15.706969735056607, "kl": 0.171875, "learning_rate": 5.645698265288243e-07, "loss": -0.0103, "reward": 1.625713586807251, "reward_std": 0.20755544304847717, "rewards/accuracy_reward_stage2": 0.656963586807251, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2486 }, { "completion_length": 9.25, "epoch": 0.43578062029087083, "grad_norm": 16.539471166479995, "kl": 0.09619140625, "learning_rate": 5.643946031189766e-07, "loss": -0.0499, "reward": 1.5612146854400635, "reward_std": 0.24608927965164185, "rewards/accuracy_reward_stage2": 0.5924647450447083, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2487 }, { "completion_length": 7.0, "epoch": 0.43595584370071844, "grad_norm": 20.675547669779483, "kl": 0.265625, "learning_rate": 5.642193797091291e-07, "loss": -0.036, "reward": 1.549863576889038, "reward_std": 0.3346082270145416, "rewards/accuracy_reward_stage2": 0.6123635768890381, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2488 }, { "completion_length": 10.0625, "epoch": 0.436131067110566, "grad_norm": 22.613480108024827, "kl": 0.2353515625, "learning_rate": 5.640441562992816e-07, "loss": -0.0745, "reward": 1.6920068264007568, "reward_std": 0.29510557651519775, "rewards/accuracy_reward_stage2": 0.7545068264007568, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2489 }, { "completion_length": 14.296875, "epoch": 0.43630629052041353, "grad_norm": 19.553744432186637, "kl": 0.07568359375, "learning_rate": 5.63868932889434e-07, "loss": -0.014, "reward": 1.7241294384002686, "reward_std": 0.29626208543777466, "rewards/accuracy_reward_stage2": 0.7397544980049133, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2490 }, { "completion_length": 11.0, "epoch": 0.4364815139302611, "grad_norm": 16.8282524901791, "kl": 0.08837890625, "learning_rate": 5.636937094795865e-07, "loss": 0.0355, "reward": 1.6007554531097412, "reward_std": 0.17432302236557007, "rewards/accuracy_reward_stage2": 0.6007554531097412, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2491 }, { "completion_length": 10.65625, "epoch": 0.4366567373401086, "grad_norm": 21.60463160566603, "kl": 0.2333984375, "learning_rate": 5.635184860697389e-07, "loss": -0.0129, "reward": 1.5278030633926392, "reward_std": 0.3193010687828064, "rewards/accuracy_reward_stage2": 0.5746780633926392, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2492 }, { "completion_length": 12.875, "epoch": 0.43683196074995617, "grad_norm": 21.031349941173335, "kl": 0.103515625, "learning_rate": 5.633432626598913e-07, "loss": 0.0103, "reward": 1.7680280208587646, "reward_std": 0.19699907302856445, "rewards/accuracy_reward_stage2": 0.7836530208587646, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2493 }, { "completion_length": 9.875, "epoch": 0.4370071841598038, "grad_norm": 17.252817207871704, "kl": 0.1865234375, "learning_rate": 5.631680392500438e-07, "loss": 0.0122, "reward": 1.5799731016159058, "reward_std": 0.20395085215568542, "rewards/accuracy_reward_stage2": 0.736223042011261, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2494 }, { "completion_length": 10.046875, "epoch": 0.4371824075696513, "grad_norm": 16.459408311866355, "kl": 0.10595703125, "learning_rate": 5.629928158401962e-07, "loss": 0.0425, "reward": 1.7014847993850708, "reward_std": 0.27053919434547424, "rewards/accuracy_reward_stage2": 0.7014847993850708, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2495 }, { "completion_length": 9.0, "epoch": 0.43735763097949887, "grad_norm": 17.406967530913867, "kl": 0.212890625, "learning_rate": 5.628175924303486e-07, "loss": 0.0406, "reward": 1.4516146183013916, "reward_std": 0.2875097990036011, "rewards/accuracy_reward_stage2": 0.5922396183013916, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2496 }, { "completion_length": 10.90625, "epoch": 0.4375328543893464, "grad_norm": 18.29723073565994, "kl": 0.2138671875, "learning_rate": 5.626423690205011e-07, "loss": 0.0414, "reward": 1.6410624980926514, "reward_std": 0.24239271879196167, "rewards/accuracy_reward_stage2": 0.6566874980926514, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2497 }, { "completion_length": 10.96875, "epoch": 0.43770807779919396, "grad_norm": 13.061474527595225, "kl": 0.08251953125, "learning_rate": 5.624671456106535e-07, "loss": 0.033, "reward": 1.4094147682189941, "reward_std": 0.125525563955307, "rewards/accuracy_reward_stage2": 0.4094148278236389, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2498 }, { "completion_length": 11.171875, "epoch": 0.4378833012090415, "grad_norm": 19.58076173683503, "kl": 0.2197265625, "learning_rate": 5.62291922200806e-07, "loss": 0.0198, "reward": 1.470327377319336, "reward_std": 0.30748701095581055, "rewards/accuracy_reward_stage2": 0.6265773177146912, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2499 }, { "completion_length": 7.59375, "epoch": 0.4380585246188891, "grad_norm": 18.040021273363966, "kl": 0.07861328125, "learning_rate": 5.621166987909585e-07, "loss": -0.0081, "reward": 1.828190803527832, "reward_std": 0.21691644191741943, "rewards/accuracy_reward_stage2": 0.843815803527832, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2500 }, { "completion_length": 8.265625, "epoch": 0.43823374802873666, "grad_norm": 17.384233035604485, "kl": 0.05029296875, "learning_rate": 5.619414753811109e-07, "loss": 0.0201, "reward": 1.7455376386642456, "reward_std": 0.14652395248413086, "rewards/accuracy_reward_stage2": 0.7455376386642456, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2501 }, { "completion_length": 10.203125, "epoch": 0.4384089714385842, "grad_norm": 17.737734790062877, "kl": 0.1767578125, "learning_rate": 5.617662519712634e-07, "loss": 0.0613, "reward": 1.6611220836639404, "reward_std": 0.20425570011138916, "rewards/accuracy_reward_stage2": 0.6767470836639404, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2502 }, { "completion_length": 14.140625, "epoch": 0.43858419484843175, "grad_norm": 12.045441698437324, "kl": 0.10107421875, "learning_rate": 5.615910285614158e-07, "loss": -0.0251, "reward": 1.0958956480026245, "reward_std": 0.14148275554180145, "rewards/accuracy_reward_stage2": 0.12714560329914093, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2503 }, { "completion_length": 10.875, "epoch": 0.4387594182582793, "grad_norm": 15.969767938919299, "kl": 0.04345703125, "learning_rate": 5.614158051515683e-07, "loss": 0.0174, "reward": 1.5387020111083984, "reward_std": 0.125541090965271, "rewards/accuracy_reward_stage2": 0.6637020111083984, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2504 }, { "completion_length": 10.046875, "epoch": 0.43893464166812685, "grad_norm": 16.597279311330908, "kl": 0.0751953125, "learning_rate": 5.612405817417207e-07, "loss": -0.0134, "reward": 1.6844640970230103, "reward_std": 0.338733047246933, "rewards/accuracy_reward_stage2": 0.7000890374183655, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2505 }, { "completion_length": 10.640625, "epoch": 0.4391098650779744, "grad_norm": 14.121928210049473, "kl": 0.03466796875, "learning_rate": 5.61065358331873e-07, "loss": 0.0139, "reward": 1.6286423206329346, "reward_std": 0.1547921895980835, "rewards/accuracy_reward_stage2": 0.6286423206329346, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2506 }, { "completion_length": 6.203125, "epoch": 0.439285088487822, "grad_norm": 35.73116912990075, "kl": 0.058837890625, "learning_rate": 5.608901349220255e-07, "loss": 0.0139, "reward": 1.3072917461395264, "reward_std": 0.16098350286483765, "rewards/accuracy_reward_stage2": 0.3229166865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2507 }, { "completion_length": 11.25, "epoch": 0.43946031189766954, "grad_norm": 17.708270132972558, "kl": 0.02001953125, "learning_rate": 5.60714911512178e-07, "loss": 0.008, "reward": 1.609148621559143, "reward_std": 0.16704700887203217, "rewards/accuracy_reward_stage2": 0.6091486215591431, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2508 }, { "completion_length": 14.078125, "epoch": 0.4396355353075171, "grad_norm": 18.329023276387215, "kl": 0.1064453125, "learning_rate": 5.605396881023304e-07, "loss": 0.0036, "reward": 1.6048202514648438, "reward_std": 0.23036982119083405, "rewards/accuracy_reward_stage2": 0.620445191860199, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2509 }, { "completion_length": 7.984375, "epoch": 0.43981075871736464, "grad_norm": 20.726709280170827, "kl": 0.06591796875, "learning_rate": 5.603644646924829e-07, "loss": 0.0264, "reward": 1.8156670331954956, "reward_std": 0.13929487764835358, "rewards/accuracy_reward_stage2": 0.8156670331954956, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2510 }, { "completion_length": 8.515625, "epoch": 0.4399859821272122, "grad_norm": 27.31374661746669, "kl": 0.2255859375, "learning_rate": 5.601892412826353e-07, "loss": 0.0273, "reward": 1.1579368114471436, "reward_std": 0.18273042142391205, "rewards/accuracy_reward_stage2": 0.4235617518424988, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2511 }, { "completion_length": 10.90625, "epoch": 0.44016120553705973, "grad_norm": 21.205034695123313, "kl": 0.026611328125, "learning_rate": 5.600140178727878e-07, "loss": 0.0107, "reward": 1.3730418682098389, "reward_std": 0.19322770833969116, "rewards/accuracy_reward_stage2": 0.3730418086051941, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2512 }, { "completion_length": 15.515625, "epoch": 0.44033642894690733, "grad_norm": 25.29220340718031, "kl": 0.2392578125, "learning_rate": 5.598387944629403e-07, "loss": 0.064, "reward": 1.3425215482711792, "reward_std": 0.3277851343154907, "rewards/accuracy_reward_stage2": 0.4831465482711792, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2513 }, { "completion_length": 8.359375, "epoch": 0.4405116523567549, "grad_norm": 17.103435466073208, "kl": 0.1328125, "learning_rate": 5.596635710530927e-07, "loss": 0.0529, "reward": 1.6302083730697632, "reward_std": 0.04419417679309845, "rewards/accuracy_reward_stage2": 0.7552083730697632, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2514 }, { "completion_length": 9.578125, "epoch": 0.4406868757666024, "grad_norm": 13.641401001570447, "kl": 0.056640625, "learning_rate": 5.594883476432452e-07, "loss": -0.0191, "reward": 1.6454601287841797, "reward_std": 0.08494816720485687, "rewards/accuracy_reward_stage2": 0.6610851883888245, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2515 }, { "completion_length": 12.875, "epoch": 0.44086209917645, "grad_norm": 18.981704398952896, "kl": 0.1708984375, "learning_rate": 5.593131242333977e-07, "loss": 0.0682, "reward": 1.295891284942627, "reward_std": 0.20116037130355835, "rewards/accuracy_reward_stage2": 0.5458913445472717, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2516 }, { "completion_length": 6.625, "epoch": 0.4410373225862975, "grad_norm": 11.751108110567436, "kl": 0.1015625, "learning_rate": 5.591379008235499e-07, "loss": 0.001, "reward": 1.798114538192749, "reward_std": 0.20511700212955475, "rewards/accuracy_reward_stage2": 0.8137395977973938, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2517 }, { "completion_length": 10.3125, "epoch": 0.44121254599614507, "grad_norm": 13.022570147089745, "kl": 0.03369140625, "learning_rate": 5.589626774137024e-07, "loss": 0.0135, "reward": 1.6906923055648804, "reward_std": 0.08085846900939941, "rewards/accuracy_reward_stage2": 0.6906922459602356, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2518 }, { "completion_length": 12.0625, "epoch": 0.4413877694059926, "grad_norm": 18.866592945670142, "kl": 0.07373046875, "learning_rate": 5.587874540038548e-07, "loss": 0.0294, "reward": 1.6115610599517822, "reward_std": 0.21088054776191711, "rewards/accuracy_reward_stage2": 0.6115610599517822, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2519 }, { "completion_length": 14.078125, "epoch": 0.4415629928158402, "grad_norm": 22.588152551889987, "kl": 0.193359375, "learning_rate": 5.586122305940073e-07, "loss": 0.045, "reward": 1.5260608196258545, "reward_std": 0.2753870487213135, "rewards/accuracy_reward_stage2": 0.7916858196258545, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2520 }, { "completion_length": 10.125, "epoch": 0.44173821622568776, "grad_norm": 19.79433337645499, "kl": 0.158203125, "learning_rate": 5.584370071841598e-07, "loss": 0.063, "reward": 1.4418141841888428, "reward_std": 0.11664269864559174, "rewards/accuracy_reward_stage2": 0.5668141841888428, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2521 }, { "completion_length": 9.75, "epoch": 0.4419134396355353, "grad_norm": 16.81120708295592, "kl": 0.189453125, "learning_rate": 5.582617837743122e-07, "loss": -0.0077, "reward": 1.4906489849090576, "reward_std": 0.25340473651885986, "rewards/accuracy_reward_stage2": 0.5218990445137024, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2522 }, { "completion_length": 9.71875, "epoch": 0.44208866304538286, "grad_norm": 28.142505492164215, "kl": 0.130859375, "learning_rate": 5.580865603644647e-07, "loss": 0.043, "reward": 1.1741572618484497, "reward_std": 0.4262906312942505, "rewards/accuracy_reward_stage2": 0.42415720224380493, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2523 }, { "completion_length": 23.515625, "epoch": 0.4422638864552304, "grad_norm": 14.510789188327777, "kl": 0.0703125, "learning_rate": 5.579113369546172e-07, "loss": -0.0161, "reward": 1.7277365922927856, "reward_std": 0.17168009281158447, "rewards/accuracy_reward_stage2": 0.7433614730834961, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2524 }, { "completion_length": 9.546875, "epoch": 0.44243910986507795, "grad_norm": 29.374054399951252, "kl": 0.10107421875, "learning_rate": 5.577361135447696e-07, "loss": 0.0403, "reward": 1.5435447692871094, "reward_std": 0.29414820671081543, "rewards/accuracy_reward_stage2": 0.5435448288917542, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2525 }, { "completion_length": 8.953125, "epoch": 0.44261433327492555, "grad_norm": 17.304382130864624, "kl": 0.134765625, "learning_rate": 5.57560890134922e-07, "loss": -0.0765, "reward": 1.5247858762741089, "reward_std": 0.20727473497390747, "rewards/accuracy_reward_stage2": 0.5872858762741089, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2526 }, { "completion_length": 8.125, "epoch": 0.4427895566847731, "grad_norm": 17.846323420295455, "kl": 0.1552734375, "learning_rate": 5.573856667250744e-07, "loss": -0.0814, "reward": 1.5099225044250488, "reward_std": 0.3462476134300232, "rewards/accuracy_reward_stage2": 0.5724225044250488, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2527 }, { "completion_length": 14.125, "epoch": 0.44296478009462065, "grad_norm": 17.30740806674459, "kl": 0.12109375, "learning_rate": 5.572104433152269e-07, "loss": 0.0042, "reward": 1.4569900035858154, "reward_std": 0.17400649189949036, "rewards/accuracy_reward_stage2": 0.5976149439811707, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2528 }, { "completion_length": 15.890625, "epoch": 0.4431400035044682, "grad_norm": 15.564204835103096, "kl": 0.1005859375, "learning_rate": 5.570352199053794e-07, "loss": -0.01, "reward": 1.460471510887146, "reward_std": 0.15011385083198547, "rewards/accuracy_reward_stage2": 0.491721510887146, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2529 }, { "completion_length": 9.203125, "epoch": 0.44331522691431574, "grad_norm": 18.140320819227657, "kl": 0.07080078125, "learning_rate": 5.568599964955317e-07, "loss": -0.0288, "reward": 1.5155820846557617, "reward_std": 0.3032262623310089, "rewards/accuracy_reward_stage2": 0.5468320250511169, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2530 }, { "completion_length": 7.453125, "epoch": 0.4434904503241633, "grad_norm": 19.07138088025204, "kl": 0.1220703125, "learning_rate": 5.566847730856842e-07, "loss": -0.0371, "reward": 1.4525704383850098, "reward_std": 0.2989710867404938, "rewards/accuracy_reward_stage2": 0.6088204383850098, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2531 }, { "completion_length": 20.734375, "epoch": 0.4436656737340109, "grad_norm": 18.482553685228076, "kl": 0.1572265625, "learning_rate": 5.565095496758366e-07, "loss": 0.0299, "reward": 1.2634599208831787, "reward_std": 0.2991285026073456, "rewards/accuracy_reward_stage2": 0.40408504009246826, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2532 }, { "completion_length": 11.84375, "epoch": 0.44384089714385844, "grad_norm": 20.09217595030906, "kl": 0.1748046875, "learning_rate": 5.563343262659891e-07, "loss": 0.07, "reward": 1.6363379955291748, "reward_std": 0.18992879986763, "rewards/accuracy_reward_stage2": 0.7613379955291748, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2533 }, { "completion_length": 19.71875, "epoch": 0.444016120553706, "grad_norm": 21.313434613331474, "kl": 0.2001953125, "learning_rate": 5.561591028561416e-07, "loss": -0.0077, "reward": 1.2119635343551636, "reward_std": 0.20474477112293243, "rewards/accuracy_reward_stage2": 0.4932134747505188, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2534 }, { "completion_length": 18.046875, "epoch": 0.44419134396355353, "grad_norm": 22.413468625805006, "kl": 0.310546875, "learning_rate": 5.55983879446294e-07, "loss": 0.0457, "reward": 1.422907829284668, "reward_std": 0.159887433052063, "rewards/accuracy_reward_stage2": 0.594782829284668, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2535 }, { "completion_length": 16.5, "epoch": 0.4443665673734011, "grad_norm": 62.449080252584, "kl": 0.068359375, "learning_rate": 5.558086560364464e-07, "loss": -0.008, "reward": 1.517016887664795, "reward_std": 0.2230542004108429, "rewards/accuracy_reward_stage2": 0.5326418876647949, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2536 }, { "completion_length": 9.421875, "epoch": 0.4445417907832486, "grad_norm": 16.9292753906615, "kl": 0.04150390625, "learning_rate": 5.556334326265989e-07, "loss": 0.0165, "reward": 1.482633113861084, "reward_std": 0.1710350215435028, "rewards/accuracy_reward_stage2": 0.607633113861084, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2537 }, { "completion_length": 10.5625, "epoch": 0.44471701419309617, "grad_norm": 26.703555972127315, "kl": 0.31640625, "learning_rate": 5.554582092167513e-07, "loss": -0.0319, "reward": 1.3041858673095703, "reward_std": 0.2861517071723938, "rewards/accuracy_reward_stage2": 0.5073109269142151, "rewards/format_reward_stage1_pointerpad": 0.796875, "scores/accuracy_reward_stage2": 0.796875, "step": 2538 }, { "completion_length": 11.4375, "epoch": 0.4448922376029438, "grad_norm": 7.063004974438899, "kl": 0.044677734375, "learning_rate": 5.552829858069038e-07, "loss": -0.0263, "reward": 1.78125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward_stage2": 0.796875, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2539 }, { "completion_length": 7.328125, "epoch": 0.4450674610127913, "grad_norm": 14.845076825348983, "kl": 0.15234375, "learning_rate": 5.551077623970562e-07, "loss": -0.0562, "reward": 1.6265857219696045, "reward_std": 0.2581363916397095, "rewards/accuracy_reward_stage2": 0.6734606623649597, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2540 }, { "completion_length": 8.453125, "epoch": 0.44524268442263887, "grad_norm": 25.135986150588074, "kl": 0.1669921875, "learning_rate": 5.549325389872087e-07, "loss": 0.0133, "reward": 1.6385996341705322, "reward_std": 0.3358069062232971, "rewards/accuracy_reward_stage2": 0.6698496341705322, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2541 }, { "completion_length": 12.46875, "epoch": 0.4454179078324864, "grad_norm": 43.303538715189156, "kl": 0.490234375, "learning_rate": 5.547573155773612e-07, "loss": 0.1526, "reward": 1.3292334079742432, "reward_std": 0.25464826822280884, "rewards/accuracy_reward_stage2": 0.5948582887649536, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2542 }, { "completion_length": 14.09375, "epoch": 0.44559313124233396, "grad_norm": 16.19933901607236, "kl": 0.044189453125, "learning_rate": 5.545820921675135e-07, "loss": 0.0177, "reward": 1.329080581665039, "reward_std": 0.20220564305782318, "rewards/accuracy_reward_stage2": 0.3290805220603943, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2543 }, { "completion_length": 9.953125, "epoch": 0.4457683546521815, "grad_norm": 21.785405769856855, "kl": 0.062255859375, "learning_rate": 5.54406868757666e-07, "loss": 0.0249, "reward": 1.7413477897644043, "reward_std": 0.24454760551452637, "rewards/accuracy_reward_stage2": 0.7413477301597595, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2544 }, { "completion_length": 8.390625, "epoch": 0.4459435780620291, "grad_norm": 19.600424294478405, "kl": 0.1162109375, "learning_rate": 5.542316453478185e-07, "loss": 0.0464, "reward": 1.4191194772720337, "reward_std": 0.27458277344703674, "rewards/accuracy_reward_stage2": 0.5441195368766785, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2545 }, { "completion_length": 9.3125, "epoch": 0.44611880147187666, "grad_norm": 19.44479956550256, "kl": 0.15234375, "learning_rate": 5.540564219379708e-07, "loss": 0.0606, "reward": 0.9888094663619995, "reward_std": 0.180924654006958, "rewards/accuracy_reward_stage2": 0.2388094961643219, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2546 }, { "completion_length": 10.828125, "epoch": 0.4462940248817242, "grad_norm": 19.593423184652256, "kl": 0.1572265625, "learning_rate": 5.538811985281233e-07, "loss": 0.0239, "reward": 1.71272873878479, "reward_std": 0.3122587203979492, "rewards/accuracy_reward_stage2": 0.7283537983894348, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2547 }, { "completion_length": 12.53125, "epoch": 0.44646924829157175, "grad_norm": 17.261805816189508, "kl": 0.10009765625, "learning_rate": 5.537059751182757e-07, "loss": -0.0192, "reward": 1.4522144794464111, "reward_std": 0.22447730600833893, "rewards/accuracy_reward_stage2": 0.48346447944641113, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2548 }, { "completion_length": 8.734375, "epoch": 0.4466444717014193, "grad_norm": 14.751921509927948, "kl": 0.0174560546875, "learning_rate": 5.535307517084282e-07, "loss": 0.007, "reward": 1.827867031097412, "reward_std": 0.1435975730419159, "rewards/accuracy_reward_stage2": 0.8278670907020569, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2549 }, { "completion_length": 13.875, "epoch": 0.44681969511126685, "grad_norm": 21.018803559215076, "kl": 0.240234375, "learning_rate": 5.533555282985807e-07, "loss": 0.0675, "reward": 1.5392227172851562, "reward_std": 0.20909258723258972, "rewards/accuracy_reward_stage2": 0.6798477172851562, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2550 }, { "completion_length": 15.71875, "epoch": 0.44699491852111445, "grad_norm": 18.16554039242802, "kl": 0.10009765625, "learning_rate": 5.531803048887331e-07, "loss": -0.0031, "reward": 1.5369999408721924, "reward_std": 0.21140316128730774, "rewards/accuracy_reward_stage2": 0.5526249408721924, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2551 }, { "completion_length": 10.859375, "epoch": 0.447170141930962, "grad_norm": 21.735984742936957, "kl": 0.0810546875, "learning_rate": 5.530050814788856e-07, "loss": 0.0324, "reward": 1.5851173400878906, "reward_std": 0.2794300317764282, "rewards/accuracy_reward_stage2": 0.5851173400878906, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2552 }, { "completion_length": 7.96875, "epoch": 0.44734536534080954, "grad_norm": 17.867846182029798, "kl": 0.12890625, "learning_rate": 5.528298580690381e-07, "loss": 0.0085, "reward": 1.6685552597045898, "reward_std": 0.20725038647651672, "rewards/accuracy_reward_stage2": 0.6841802597045898, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2553 }, { "completion_length": 11.953125, "epoch": 0.4475205887506571, "grad_norm": 34.389850460739886, "kl": 0.3203125, "learning_rate": 5.526546346591905e-07, "loss": 0.0838, "reward": 1.4835262298583984, "reward_std": 0.2388857752084732, "rewards/accuracy_reward_stage2": 0.6241511106491089, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2554 }, { "completion_length": 8.40625, "epoch": 0.44769581216050464, "grad_norm": 27.431755120229397, "kl": 0.19140625, "learning_rate": 5.52479411249343e-07, "loss": 0.0482, "reward": 1.5072015523910522, "reward_std": 0.28641408681869507, "rewards/accuracy_reward_stage2": 0.5228264927864075, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2555 }, { "completion_length": 8.890625, "epoch": 0.4478710355703522, "grad_norm": 16.111416884167895, "kl": 0.03515625, "learning_rate": 5.523041878394952e-07, "loss": -0.0301, "reward": 1.78125, "reward_std": 0.23356688022613525, "rewards/accuracy_reward_stage2": 0.796875, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2556 }, { "completion_length": 9.453125, "epoch": 0.44804625898019973, "grad_norm": 14.573244310740995, "kl": 0.10546875, "learning_rate": 5.521289644296477e-07, "loss": 0.0421, "reward": 1.6069750785827637, "reward_std": 0.20344749093055725, "rewards/accuracy_reward_stage2": 0.6069749593734741, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2557 }, { "completion_length": 8.40625, "epoch": 0.44822148239004733, "grad_norm": 14.188444483313175, "kl": 0.06982421875, "learning_rate": 5.519537410198002e-07, "loss": -0.0322, "reward": 1.6511962413787842, "reward_std": 0.12565788626670837, "rewards/accuracy_reward_stage2": 0.6824462413787842, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2558 }, { "completion_length": 8.28125, "epoch": 0.4483967057998949, "grad_norm": 20.127520754602703, "kl": 0.09033203125, "learning_rate": 5.517785176099526e-07, "loss": -0.0522, "reward": 1.349897861480713, "reward_std": 0.26652413606643677, "rewards/accuracy_reward_stage2": 0.3811478316783905, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2559 }, { "completion_length": 13.34375, "epoch": 0.4485719292097424, "grad_norm": 20.592003423289654, "kl": 0.0810546875, "learning_rate": 5.516032942001051e-07, "loss": -0.0117, "reward": 1.5917490720748901, "reward_std": 0.18286369740962982, "rewards/accuracy_reward_stage2": 0.6073740720748901, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2560 }, { "completion_length": 8.5625, "epoch": 0.44874715261959, "grad_norm": 17.541758776799224, "kl": 0.12255859375, "learning_rate": 5.514280707902576e-07, "loss": 0.0119, "reward": 1.6510417461395264, "reward_std": 0.2547297775745392, "rewards/accuracy_reward_stage2": 0.6666666269302368, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2561 }, { "completion_length": 11.921875, "epoch": 0.4489223760294375, "grad_norm": 13.059778536734967, "kl": 0.1455078125, "learning_rate": 5.5125284738041e-07, "loss": 0.0583, "reward": 1.515625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2562 }, { "completion_length": 14.734375, "epoch": 0.44909759943928507, "grad_norm": 17.78107440024237, "kl": 0.06005859375, "learning_rate": 5.510776239705625e-07, "loss": 0.024, "reward": 1.5911427736282349, "reward_std": 0.23703868687152863, "rewards/accuracy_reward_stage2": 0.5911428332328796, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2563 }, { "completion_length": 15.09375, "epoch": 0.44927282284913267, "grad_norm": 21.778189739327654, "kl": 0.080078125, "learning_rate": 5.509024005607149e-07, "loss": 0.0319, "reward": 1.6581635475158691, "reward_std": 0.24475786089897156, "rewards/accuracy_reward_stage2": 0.6581635475158691, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2564 }, { "completion_length": 12.15625, "epoch": 0.4494480462589802, "grad_norm": 17.34065307299987, "kl": 0.11328125, "learning_rate": 5.507271771508674e-07, "loss": 0.0453, "reward": 1.4566253423690796, "reward_std": 0.17970743775367737, "rewards/accuracy_reward_stage2": 0.4566253423690796, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2565 }, { "completion_length": 11.09375, "epoch": 0.44962326966882776, "grad_norm": 26.66239812379797, "kl": 0.21875, "learning_rate": 5.505519537410198e-07, "loss": 0.0198, "reward": 1.3697327375411987, "reward_std": 0.2223256528377533, "rewards/accuracy_reward_stage2": 0.5259827971458435, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2566 }, { "completion_length": 15.34375, "epoch": 0.4497984930786753, "grad_norm": 17.23732687057513, "kl": 0.09033203125, "learning_rate": 5.503767303311722e-07, "loss": -0.0081, "reward": 1.6454896926879883, "reward_std": 0.23397132754325867, "rewards/accuracy_reward_stage2": 0.6611147522926331, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2567 }, { "completion_length": 18.5, "epoch": 0.44997371648852286, "grad_norm": 20.48343016070601, "kl": 0.091796875, "learning_rate": 5.502015069213246e-07, "loss": 0.0366, "reward": 1.6218650341033936, "reward_std": 0.15606439113616943, "rewards/accuracy_reward_stage2": 0.6218649744987488, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2568 }, { "completion_length": 13.25, "epoch": 0.4501489398983704, "grad_norm": 20.279168042824203, "kl": 0.1435546875, "learning_rate": 5.500262835114771e-07, "loss": 0.0575, "reward": 1.488884449005127, "reward_std": 0.24019384384155273, "rewards/accuracy_reward_stage2": 0.4888843894004822, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2569 }, { "completion_length": 7.75, "epoch": 0.45032416330821795, "grad_norm": 16.791397881227955, "kl": 0.0576171875, "learning_rate": 5.498510601016295e-07, "loss": 0.0231, "reward": 1.7581019401550293, "reward_std": 0.2148759514093399, "rewards/accuracy_reward_stage2": 0.7581018209457397, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2570 }, { "completion_length": 18.171875, "epoch": 0.45049938671806555, "grad_norm": 15.321118233041698, "kl": 0.07666015625, "learning_rate": 5.49675836691782e-07, "loss": 0.0018, "reward": 1.543554425239563, "reward_std": 0.16047553718090057, "rewards/accuracy_reward_stage2": 0.559179425239563, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2571 }, { "completion_length": 13.046875, "epoch": 0.4506746101279131, "grad_norm": 17.57267956385793, "kl": 0.2099609375, "learning_rate": 5.495006132819344e-07, "loss": 0.0399, "reward": 1.5052083730697632, "reward_std": 0.23056091368198395, "rewards/accuracy_reward_stage2": 0.6458333134651184, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2572 }, { "completion_length": 11.421875, "epoch": 0.45084983353776065, "grad_norm": 19.669061029863865, "kl": 0.0693359375, "learning_rate": 5.493253898720869e-07, "loss": 0.0278, "reward": 1.5255483388900757, "reward_std": 0.22976790368556976, "rewards/accuracy_reward_stage2": 0.5255483984947205, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2573 }, { "completion_length": 10.125, "epoch": 0.4510250569476082, "grad_norm": 13.546947500812909, "kl": 0.056396484375, "learning_rate": 5.491501664622394e-07, "loss": 0.0225, "reward": 1.3796207904815674, "reward_std": 0.10605320334434509, "rewards/accuracy_reward_stage2": 0.5046207904815674, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2574 }, { "completion_length": 11.0, "epoch": 0.45120028035745574, "grad_norm": 22.4200432270858, "kl": 0.251953125, "learning_rate": 5.489749430523917e-07, "loss": 0.0591, "reward": 1.4289274215698242, "reward_std": 0.19315001368522644, "rewards/accuracy_reward_stage2": 0.569552481174469, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2575 }, { "completion_length": 10.453125, "epoch": 0.4513755037673033, "grad_norm": 17.1520528815772, "kl": 0.08740234375, "learning_rate": 5.487997196425442e-07, "loss": -0.0092, "reward": 1.49286687374115, "reward_std": 0.20367136597633362, "rewards/accuracy_reward_stage2": 0.5084918737411499, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2576 }, { "completion_length": 12.125, "epoch": 0.4515507271771509, "grad_norm": 16.746090519646597, "kl": 0.06494140625, "learning_rate": 5.486244962326967e-07, "loss": 0.0261, "reward": 1.6583220958709717, "reward_std": 0.18301549553871155, "rewards/accuracy_reward_stage2": 0.7833219766616821, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2577 }, { "completion_length": 9.484375, "epoch": 0.45172595058699844, "grad_norm": 18.860574402696923, "kl": 0.1630859375, "learning_rate": 5.484492728228491e-07, "loss": 0.032, "reward": 1.6444649696350098, "reward_std": 0.22633656859397888, "rewards/accuracy_reward_stage2": 0.6600899696350098, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2578 }, { "completion_length": 15.453125, "epoch": 0.451901173996846, "grad_norm": 18.134086534099705, "kl": 0.06982421875, "learning_rate": 5.482740494130016e-07, "loss": 0.028, "reward": 1.4077608585357666, "reward_std": 0.12077020853757858, "rewards/accuracy_reward_stage2": 0.407760888338089, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2579 }, { "completion_length": 10.625, "epoch": 0.45207639740669353, "grad_norm": 18.336216197197302, "kl": 0.0250244140625, "learning_rate": 5.48098826003154e-07, "loss": 0.01, "reward": 1.5207476615905762, "reward_std": 0.16083654761314392, "rewards/accuracy_reward_stage2": 0.5207476615905762, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2580 }, { "completion_length": 18.53125, "epoch": 0.4522516208165411, "grad_norm": 18.966009229323692, "kl": 0.224609375, "learning_rate": 5.479236025933064e-07, "loss": 0.0455, "reward": 1.5232280492782593, "reward_std": 0.17001014947891235, "rewards/accuracy_reward_stage2": 0.663853108882904, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2581 }, { "completion_length": 10.03125, "epoch": 0.4524268442263886, "grad_norm": 19.36451871278005, "kl": 0.1611328125, "learning_rate": 5.477483791834589e-07, "loss": 0.0194, "reward": 1.6093440055847168, "reward_std": 0.31875163316726685, "rewards/accuracy_reward_stage2": 0.6405940055847168, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2582 }, { "completion_length": 11.640625, "epoch": 0.4526020676362362, "grad_norm": 18.495981870617012, "kl": 0.1884765625, "learning_rate": 5.475731557736113e-07, "loss": 0.0753, "reward": 1.604927897453308, "reward_std": 0.17703410983085632, "rewards/accuracy_reward_stage2": 0.7299278378486633, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2583 }, { "completion_length": 8.4375, "epoch": 0.4527772910460838, "grad_norm": 13.749480254627533, "kl": 0.1875, "learning_rate": 5.473979323637638e-07, "loss": 0.0306, "reward": 1.4577176570892334, "reward_std": 0.20654311776161194, "rewards/accuracy_reward_stage2": 0.4733426570892334, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2584 }, { "completion_length": 8.390625, "epoch": 0.4529525144559313, "grad_norm": 25.879623238170172, "kl": 0.1728515625, "learning_rate": 5.472227089539163e-07, "loss": 0.0479, "reward": 1.7104345560073853, "reward_std": 0.2433592975139618, "rewards/accuracy_reward_stage2": 0.7260594964027405, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2585 }, { "completion_length": 9.546875, "epoch": 0.45312773786577887, "grad_norm": 18.655052078316817, "kl": 0.271484375, "learning_rate": 5.470474855440686e-07, "loss": -0.0886, "reward": 1.6952078342437744, "reward_std": 0.2790555953979492, "rewards/accuracy_reward_stage2": 0.7733327746391296, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 2586 }, { "completion_length": 8.75, "epoch": 0.4533029612756264, "grad_norm": 17.11928499202011, "kl": 0.16796875, "learning_rate": 5.468722621342211e-07, "loss": 0.0673, "reward": 1.462017297744751, "reward_std": 0.3036963939666748, "rewards/accuracy_reward_stage2": 0.587017297744751, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2587 }, { "completion_length": 8.28125, "epoch": 0.45347818468547396, "grad_norm": 17.212160066694576, "kl": 0.054443359375, "learning_rate": 5.466970387243735e-07, "loss": 0.0217, "reward": 1.844616413116455, "reward_std": 0.07760395854711533, "rewards/accuracy_reward_stage2": 0.8446164727210999, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2588 }, { "completion_length": 7.734375, "epoch": 0.4536534080953215, "grad_norm": 27.47549068286514, "kl": 0.298828125, "learning_rate": 5.46521815314526e-07, "loss": 0.0384, "reward": 1.6829417943954468, "reward_std": 0.24853834509849548, "rewards/accuracy_reward_stage2": 0.714191734790802, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2589 }, { "completion_length": 6.890625, "epoch": 0.4538286315051691, "grad_norm": 17.530731905229523, "kl": 0.1484375, "learning_rate": 5.463465919046785e-07, "loss": -0.0719, "reward": 1.6600399017333984, "reward_std": 0.2958260476589203, "rewards/accuracy_reward_stage2": 0.7069148421287537, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2590 }, { "completion_length": 11.546875, "epoch": 0.45400385491501666, "grad_norm": 19.476134745790667, "kl": 0.12890625, "learning_rate": 5.461713684948309e-07, "loss": 0.0514, "reward": 1.619103193283081, "reward_std": 0.23953868448734283, "rewards/accuracy_reward_stage2": 0.6191032528877258, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2591 }, { "completion_length": 10.59375, "epoch": 0.4541790783248642, "grad_norm": 16.844721399664373, "kl": 0.125, "learning_rate": 5.459961450849834e-07, "loss": 0.0059, "reward": 1.814000129699707, "reward_std": 0.18107157945632935, "rewards/accuracy_reward_stage2": 0.8296250700950623, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2592 }, { "completion_length": 19.125, "epoch": 0.45435430173471175, "grad_norm": 15.050078423215997, "kl": 0.09912109375, "learning_rate": 5.458209216751359e-07, "loss": -0.0045, "reward": 1.384594202041626, "reward_std": 0.1534789651632309, "rewards/accuracy_reward_stage2": 0.40021926164627075, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2593 }, { "completion_length": 12.578125, "epoch": 0.4545295251445593, "grad_norm": 16.303858847797237, "kl": 0.125, "learning_rate": 5.456456982652882e-07, "loss": 0.0059, "reward": 1.482431411743164, "reward_std": 0.2253151684999466, "rewards/accuracy_reward_stage2": 0.4980563521385193, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2594 }, { "completion_length": 11.203125, "epoch": 0.45470474855440685, "grad_norm": 16.395967548946256, "kl": 0.169921875, "learning_rate": 5.454704748554406e-07, "loss": 0.0237, "reward": 1.3767869472503662, "reward_std": 0.22264955937862396, "rewards/accuracy_reward_stage2": 0.5174120664596558, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2595 }, { "completion_length": 10.578125, "epoch": 0.45487997196425445, "grad_norm": 18.742319177335087, "kl": 0.12060546875, "learning_rate": 5.45295251445593e-07, "loss": 0.0086, "reward": 1.7379519939422607, "reward_std": 0.2344542145729065, "rewards/accuracy_reward_stage2": 0.7535768747329712, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2596 }, { "completion_length": 11.515625, "epoch": 0.455055195374102, "grad_norm": 20.001536055255976, "kl": 0.23046875, "learning_rate": 5.451200280357455e-07, "loss": -0.0223, "reward": 1.3056724071502686, "reward_std": 0.3078497648239136, "rewards/accuracy_reward_stage2": 0.35254743695259094, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2597 }, { "completion_length": 15.28125, "epoch": 0.45523041878394954, "grad_norm": 23.3106633392216, "kl": 0.140625, "learning_rate": 5.44944804625898e-07, "loss": 0.0286, "reward": 1.4137800931930542, "reward_std": 0.29428166151046753, "rewards/accuracy_reward_stage2": 0.4294050335884094, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2598 }, { "completion_length": 11.515625, "epoch": 0.4554056421937971, "grad_norm": 17.17033481514972, "kl": 0.08837890625, "learning_rate": 5.447695812160504e-07, "loss": 0.0054, "reward": 1.3880894184112549, "reward_std": 0.25328290462493896, "rewards/accuracy_reward_stage2": 0.5287142992019653, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2599 }, { "completion_length": 9.171875, "epoch": 0.45558086560364464, "grad_norm": 21.993536038979904, "kl": 0.142578125, "learning_rate": 5.445943578062029e-07, "loss": 0.0572, "reward": 1.56821870803833, "reward_std": 0.24604782462120056, "rewards/accuracy_reward_stage2": 0.6932187676429749, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2600 }, { "completion_length": 9.03125, "epoch": 0.4557560890134922, "grad_norm": 19.09370863576669, "kl": 0.076171875, "learning_rate": 5.444191343963554e-07, "loss": -0.0137, "reward": 1.6924455165863037, "reward_std": 0.19052401185035706, "rewards/accuracy_reward_stage2": 0.7080705165863037, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2601 }, { "completion_length": 9.96875, "epoch": 0.4559313124233398, "grad_norm": 20.253305860465996, "kl": 0.1416015625, "learning_rate": 5.442439109865078e-07, "loss": 0.0236, "reward": 1.4954473972320557, "reward_std": 0.2541544735431671, "rewards/accuracy_reward_stage2": 0.6360723972320557, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2602 }, { "completion_length": 12.765625, "epoch": 0.45610653583318733, "grad_norm": 15.689281208872066, "kl": 0.1240234375, "learning_rate": 5.440686875766603e-07, "loss": 0.0056, "reward": 1.3455251455307007, "reward_std": 0.13567785918712616, "rewards/accuracy_reward_stage2": 0.4861501157283783, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2603 }, { "completion_length": 11.90625, "epoch": 0.4562817592430349, "grad_norm": 18.400359885372882, "kl": 0.150390625, "learning_rate": 5.438934641668127e-07, "loss": 0.0386, "reward": 1.3754087686538696, "reward_std": 0.26961177587509155, "rewards/accuracy_reward_stage2": 0.6410337686538696, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2604 }, { "completion_length": 6.015625, "epoch": 0.4564569826528824, "grad_norm": 18.298089152870244, "kl": 0.038330078125, "learning_rate": 5.437182407569652e-07, "loss": 0.0154, "reward": 1.7267228364944458, "reward_std": 0.15918239951133728, "rewards/accuracy_reward_stage2": 0.8517228960990906, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2605 }, { "completion_length": 11.609375, "epoch": 0.45663220606273, "grad_norm": 14.534911045435273, "kl": 0.1318359375, "learning_rate": 5.435430173471176e-07, "loss": -0.0358, "reward": 1.7274062633514404, "reward_std": 0.11909748613834381, "rewards/accuracy_reward_stage2": 0.7586562633514404, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2606 }, { "completion_length": 12.1875, "epoch": 0.4568074294725775, "grad_norm": 15.895748749987368, "kl": 0.08544921875, "learning_rate": 5.433677939372699e-07, "loss": 0.0341, "reward": 1.7754074335098267, "reward_std": 0.17999312281608582, "rewards/accuracy_reward_stage2": 0.7754074335098267, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2607 }, { "completion_length": 8.453125, "epoch": 0.45698265288242507, "grad_norm": 15.343178933225655, "kl": 0.08154296875, "learning_rate": 5.431925705274224e-07, "loss": -0.0116, "reward": 1.7874504327774048, "reward_std": 0.17973880469799042, "rewards/accuracy_reward_stage2": 0.8030754327774048, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2608 }, { "completion_length": 9.96875, "epoch": 0.45715787629227267, "grad_norm": 16.31743124314278, "kl": 0.2197265625, "learning_rate": 5.430173471175748e-07, "loss": 0.088, "reward": 1.2728722095489502, "reward_std": 0.06837272644042969, "rewards/accuracy_reward_stage2": 0.39787212014198303, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2609 }, { "completion_length": 12.921875, "epoch": 0.4573330997021202, "grad_norm": 22.3630972019856, "kl": 0.3046875, "learning_rate": 5.428421237077273e-07, "loss": 0.0188, "reward": 1.1350054740905762, "reward_std": 0.45940613746643066, "rewards/accuracy_reward_stage2": 0.3068804144859314, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2610 }, { "completion_length": 10.03125, "epoch": 0.45750832311196776, "grad_norm": 18.02245735574043, "kl": 0.11669921875, "learning_rate": 5.426669002978798e-07, "loss": 0.01, "reward": 1.3457145690917969, "reward_std": 0.18275277316570282, "rewards/accuracy_reward_stage2": 0.4863395094871521, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2611 }, { "completion_length": 11.625, "epoch": 0.4576835465218153, "grad_norm": 27.94120506568601, "kl": 0.224609375, "learning_rate": 5.424916768880322e-07, "loss": 0.0393, "reward": 1.4052271842956543, "reward_std": 0.3087007403373718, "rewards/accuracy_reward_stage2": 0.5458522439002991, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2612 }, { "completion_length": 13.28125, "epoch": 0.45785876993166286, "grad_norm": 18.883931756205346, "kl": 0.1494140625, "learning_rate": 5.423164534781847e-07, "loss": 0.0286, "reward": 1.4941810369491577, "reward_std": 0.17316259443759918, "rewards/accuracy_reward_stage2": 0.5098060369491577, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2613 }, { "completion_length": 15.90625, "epoch": 0.4580339933415104, "grad_norm": 19.042972465701816, "kl": 0.0771484375, "learning_rate": 5.421412300683372e-07, "loss": -0.0133, "reward": 1.240898609161377, "reward_std": 0.19125321507453918, "rewards/accuracy_reward_stage2": 0.3815236985683441, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2614 }, { "completion_length": 15.765625, "epoch": 0.458209216751358, "grad_norm": 60.96562398209649, "kl": 0.380859375, "learning_rate": 5.419660066584895e-07, "loss": 0.0644, "reward": 1.300868034362793, "reward_std": 0.1779276430606842, "rewards/accuracy_reward_stage2": 0.33211806416511536, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2615 }, { "completion_length": 7.171875, "epoch": 0.45838444016120555, "grad_norm": 16.982198405407203, "kl": 0.06494140625, "learning_rate": 5.41790783248642e-07, "loss": 0.0259, "reward": 1.6714116334915161, "reward_std": 0.10098038613796234, "rewards/accuracy_reward_stage2": 0.6714116334915161, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2616 }, { "completion_length": 8.40625, "epoch": 0.4585596635710531, "grad_norm": 14.23052773010471, "kl": 0.1220703125, "learning_rate": 5.416155598387944e-07, "loss": 0.0047, "reward": 1.5308772325515747, "reward_std": 0.17923077940940857, "rewards/accuracy_reward_stage2": 0.6715022325515747, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2617 }, { "completion_length": 9.015625, "epoch": 0.45873488698090065, "grad_norm": 16.07430260605479, "kl": 0.08544921875, "learning_rate": 5.414403364289469e-07, "loss": -0.0068, "reward": 1.5297343730926514, "reward_std": 0.19875817000865936, "rewards/accuracy_reward_stage2": 0.5453594326972961, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2618 }, { "completion_length": 6.75, "epoch": 0.4589101103907482, "grad_norm": 10.922705313788358, "kl": 0.1181640625, "learning_rate": 5.412651130190993e-07, "loss": 0.003, "reward": 1.6112689971923828, "reward_std": 0.12722565233707428, "rewards/accuracy_reward_stage2": 0.626893937587738, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2619 }, { "completion_length": 11.375, "epoch": 0.45908533380059574, "grad_norm": 16.884670927133442, "kl": 0.099609375, "learning_rate": 5.410898896092517e-07, "loss": -0.0378, "reward": 1.529841661453247, "reward_std": 0.28377997875213623, "rewards/accuracy_reward_stage2": 0.5610915422439575, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2620 }, { "completion_length": 8.515625, "epoch": 0.4592605572104433, "grad_norm": 19.32312477942592, "kl": 0.1376953125, "learning_rate": 5.409146661994042e-07, "loss": 0.055, "reward": 1.831881046295166, "reward_std": 0.24564093351364136, "rewards/accuracy_reward_stage2": 0.831881046295166, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2621 }, { "completion_length": 13.1875, "epoch": 0.4594357806202909, "grad_norm": 50.20033941318913, "kl": 0.059814453125, "learning_rate": 5.407394427895567e-07, "loss": -0.0075, "reward": 1.488027811050415, "reward_std": 0.25944140553474426, "rewards/accuracy_reward_stage2": 0.503652811050415, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2622 }, { "completion_length": 9.53125, "epoch": 0.45961100403013844, "grad_norm": 20.21375048286647, "kl": 0.1640625, "learning_rate": 5.405642193797091e-07, "loss": -0.0056, "reward": 1.4095327854156494, "reward_std": 0.1797855645418167, "rewards/accuracy_reward_stage2": 0.6907828450202942, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2623 }, { "completion_length": 9.84375, "epoch": 0.459786227439986, "grad_norm": 16.485504483917524, "kl": 0.12060546875, "learning_rate": 5.403889959698616e-07, "loss": 0.004, "reward": 1.4300655126571655, "reward_std": 0.24736399948596954, "rewards/accuracy_reward_stage2": 0.4456905424594879, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2624 }, { "completion_length": 11.71875, "epoch": 0.45996145084983353, "grad_norm": 24.078458947001245, "kl": 0.11181640625, "learning_rate": 5.402137725600139e-07, "loss": 0.0212, "reward": 1.616410493850708, "reward_std": 0.3158206343650818, "rewards/accuracy_reward_stage2": 0.6320353746414185, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2625 }, { "completion_length": 11.734375, "epoch": 0.4601366742596811, "grad_norm": 20.69459400538519, "kl": 0.10498046875, "learning_rate": 5.400385491501664e-07, "loss": -0.0401, "reward": 1.7175240516662598, "reward_std": 0.2727046310901642, "rewards/accuracy_reward_stage2": 0.7487740516662598, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2626 }, { "completion_length": 12.984375, "epoch": 0.4603118976695286, "grad_norm": 27.47238303244588, "kl": 0.1318359375, "learning_rate": 5.398633257403189e-07, "loss": 0.0526, "reward": 1.394019365310669, "reward_std": 0.2805423140525818, "rewards/accuracy_reward_stage2": 0.39401930570602417, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2627 }, { "completion_length": 7.921875, "epoch": 0.4604871210793762, "grad_norm": 23.372854969361217, "kl": 0.08251953125, "learning_rate": 5.396881023304713e-07, "loss": -0.0553, "reward": 1.5483975410461426, "reward_std": 0.2884424328804016, "rewards/accuracy_reward_stage2": 0.5796475410461426, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2628 }, { "completion_length": 8.109375, "epoch": 0.4606623444892238, "grad_norm": 21.654997769290077, "kl": 0.234375, "learning_rate": 5.395128789206238e-07, "loss": -0.0155, "reward": 1.647642731666565, "reward_std": 0.29530832171440125, "rewards/accuracy_reward_stage2": 0.6945177316665649, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2629 }, { "completion_length": 7.1875, "epoch": 0.4608375678990713, "grad_norm": 20.993264285496924, "kl": 0.0498046875, "learning_rate": 5.393376555107763e-07, "loss": 0.0199, "reward": 1.6677969694137573, "reward_std": 0.250203400850296, "rewards/accuracy_reward_stage2": 0.6677969694137573, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2630 }, { "completion_length": 12.375, "epoch": 0.46101279130891887, "grad_norm": 22.242581404062868, "kl": 0.1572265625, "learning_rate": 5.391624321009287e-07, "loss": -0.0134, "reward": 1.2994139194488525, "reward_std": 0.27216124534606934, "rewards/accuracy_reward_stage2": 0.45566391944885254, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2631 }, { "completion_length": 9.0, "epoch": 0.4611880147187664, "grad_norm": 16.18477678764415, "kl": 0.1669921875, "learning_rate": 5.389872086910811e-07, "loss": -0.0109, "reward": 1.5696120262145996, "reward_std": 0.22004368901252747, "rewards/accuracy_reward_stage2": 0.7258619666099548, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2632 }, { "completion_length": 10.84375, "epoch": 0.46136323812861396, "grad_norm": 18.029162444228128, "kl": 0.2265625, "learning_rate": 5.388119852812335e-07, "loss": -0.0061, "reward": 1.670124888420105, "reward_std": 0.34664466977119446, "rewards/accuracy_reward_stage2": 0.7169998288154602, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2633 }, { "completion_length": 11.203125, "epoch": 0.46153846153846156, "grad_norm": 20.919917738046767, "kl": 0.2021484375, "learning_rate": 5.38636761871386e-07, "loss": 0.0468, "reward": 1.5588070154190063, "reward_std": 0.20780065655708313, "rewards/accuracy_reward_stage2": 0.6994320154190063, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2634 }, { "completion_length": 10.734375, "epoch": 0.4617136849483091, "grad_norm": 13.869513449660197, "kl": 0.048583984375, "learning_rate": 5.384615384615384e-07, "loss": 0.0194, "reward": 1.3759428262710571, "reward_std": 0.11085714399814606, "rewards/accuracy_reward_stage2": 0.5009427666664124, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2635 }, { "completion_length": 10.484375, "epoch": 0.46188890835815666, "grad_norm": 38.51280865681154, "kl": 0.345703125, "learning_rate": 5.382863150516908e-07, "loss": 0.0005, "reward": 1.303555965423584, "reward_std": 0.3703456521034241, "rewards/accuracy_reward_stage2": 0.4910559356212616, "rewards/format_reward_stage1_pointerpad": 0.8125, "scores/accuracy_reward_stage2": 0.8125, "step": 2636 }, { "completion_length": 10.234375, "epoch": 0.4620641317680042, "grad_norm": 20.342024074638832, "kl": 0.09716796875, "learning_rate": 5.381110916418433e-07, "loss": 0.0049, "reward": 1.2301913499832153, "reward_std": 0.29700982570648193, "rewards/accuracy_reward_stage2": 0.37081634998321533, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2637 }, { "completion_length": 11.03125, "epoch": 0.46223935517785175, "grad_norm": 23.111731352596458, "kl": 0.103515625, "learning_rate": 5.379358682319958e-07, "loss": -0.0029, "reward": 1.7350542545318604, "reward_std": 0.3148344159126282, "rewards/accuracy_reward_stage2": 0.7506792545318604, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2638 }, { "completion_length": 9.828125, "epoch": 0.4624145785876993, "grad_norm": 13.631803606055552, "kl": 0.10546875, "learning_rate": 5.377606448221482e-07, "loss": -0.0008, "reward": 1.5906519889831543, "reward_std": 0.1260703206062317, "rewards/accuracy_reward_stage2": 0.6062769293785095, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2639 }, { "completion_length": 9.4375, "epoch": 0.46258980199754685, "grad_norm": 19.392433475402512, "kl": 0.1806640625, "learning_rate": 5.375854214123007e-07, "loss": 0.0201, "reward": 1.4371408224105835, "reward_std": 0.2100004106760025, "rewards/accuracy_reward_stage2": 0.4683907926082611, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2640 }, { "completion_length": 23.40625, "epoch": 0.46276502540739445, "grad_norm": 17.634424260006245, "kl": 0.1904296875, "learning_rate": 5.374101980024531e-07, "loss": 0.0321, "reward": 1.568893551826477, "reward_std": 0.24052214622497559, "rewards/accuracy_reward_stage2": 0.709518551826477, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2641 }, { "completion_length": 14.765625, "epoch": 0.462940248817242, "grad_norm": 47.522481370357156, "kl": 0.42578125, "learning_rate": 5.372349745926056e-07, "loss": 0.1699, "reward": 1.3623605966567993, "reward_std": 0.22106300294399261, "rewards/accuracy_reward_stage2": 0.4873605966567993, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2642 }, { "completion_length": 8.5, "epoch": 0.46311547222708954, "grad_norm": 24.112626776811034, "kl": 0.1923828125, "learning_rate": 5.370597511827581e-07, "loss": 0.0769, "reward": 1.6660329103469849, "reward_std": 0.2319527268409729, "rewards/accuracy_reward_stage2": 0.6660328507423401, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2643 }, { "completion_length": 8.859375, "epoch": 0.4632906956369371, "grad_norm": 21.399072455362678, "kl": 0.040283203125, "learning_rate": 5.368845277729105e-07, "loss": 0.0161, "reward": 1.5974323749542236, "reward_std": 0.17615637183189392, "rewards/accuracy_reward_stage2": 0.5974323749542236, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2644 }, { "completion_length": 12.625, "epoch": 0.46346591904678464, "grad_norm": 22.64683277462908, "kl": 0.294921875, "learning_rate": 5.367093043630628e-07, "loss": 0.1177, "reward": 1.2825133800506592, "reward_std": 0.27305760979652405, "rewards/accuracy_reward_stage2": 0.532513439655304, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2645 }, { "completion_length": 9.59375, "epoch": 0.4636411424566322, "grad_norm": 18.268335926025458, "kl": 0.0966796875, "learning_rate": 5.365340809532153e-07, "loss": -0.0055, "reward": 1.5037181377410889, "reward_std": 0.28322064876556396, "rewards/accuracy_reward_stage2": 0.5193430185317993, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2646 }, { "completion_length": 7.234375, "epoch": 0.4638163658664798, "grad_norm": 19.29371554660658, "kl": 0.056396484375, "learning_rate": 5.363588575433677e-07, "loss": -0.0108, "reward": 1.640625, "reward_std": 0.30721205472946167, "rewards/accuracy_reward_stage2": 0.65625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2647 }, { "completion_length": 8.671875, "epoch": 0.46399158927632733, "grad_norm": 12.55328298580008, "kl": 0.1484375, "learning_rate": 5.361836341335202e-07, "loss": 0.0187, "reward": 1.6145833730697632, "reward_std": 0.14359083771705627, "rewards/accuracy_reward_stage2": 0.7708333730697632, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2648 }, { "completion_length": 13.703125, "epoch": 0.4641668126861749, "grad_norm": 28.805720632436707, "kl": 0.341796875, "learning_rate": 5.360084107236726e-07, "loss": 0.0532, "reward": 1.2130721807479858, "reward_std": 0.22352425754070282, "rewards/accuracy_reward_stage2": 0.36932215094566345, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2649 }, { "completion_length": 9.296875, "epoch": 0.4643420360960224, "grad_norm": 16.244027746694883, "kl": 0.1435546875, "learning_rate": 5.358331873138251e-07, "loss": 0.0446, "reward": 1.689650058746338, "reward_std": 0.1842573583126068, "rewards/accuracy_reward_stage2": 0.7052749991416931, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2650 }, { "completion_length": 10.390625, "epoch": 0.46451725950587, "grad_norm": 14.736478953994629, "kl": 0.142578125, "learning_rate": 5.356579639039776e-07, "loss": 0.0281, "reward": 1.7371633052825928, "reward_std": 0.16478168964385986, "rewards/accuracy_reward_stage2": 0.752788245677948, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2651 }, { "completion_length": 28.796875, "epoch": 0.4646924829157175, "grad_norm": 18.855857379978904, "kl": 0.1201171875, "learning_rate": 5.3548274049413e-07, "loss": 0.0142, "reward": 1.503377914428711, "reward_std": 0.19588232040405273, "rewards/accuracy_reward_stage2": 0.6440029740333557, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2652 }, { "completion_length": 11.984375, "epoch": 0.4648677063255651, "grad_norm": 18.497622511685186, "kl": 0.1748046875, "learning_rate": 5.353075170842825e-07, "loss": -0.0081, "reward": 1.4114044904708862, "reward_std": 0.21059125661849976, "rewards/accuracy_reward_stage2": 0.45827943086624146, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2653 }, { "completion_length": 7.671875, "epoch": 0.46504292973541267, "grad_norm": 15.292450968559894, "kl": 0.1767578125, "learning_rate": 5.35132293674435e-07, "loss": -0.0177, "reward": 1.621706247329712, "reward_std": 0.2714681327342987, "rewards/accuracy_reward_stage2": 0.6529563665390015, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2654 }, { "completion_length": 7.5625, "epoch": 0.4652181531452602, "grad_norm": 18.294816417859785, "kl": 0.08154296875, "learning_rate": 5.349570702645873e-07, "loss": 0.0326, "reward": 1.434023141860962, "reward_std": 0.22572475671768188, "rewards/accuracy_reward_stage2": 0.43402308225631714, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2655 }, { "completion_length": 11.78125, "epoch": 0.46539337655510776, "grad_norm": 20.490285425162387, "kl": 0.0380859375, "learning_rate": 5.347818468547398e-07, "loss": 0.0153, "reward": 1.6418755054473877, "reward_std": 0.273904949426651, "rewards/accuracy_reward_stage2": 0.6418755054473877, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2656 }, { "completion_length": 6.984375, "epoch": 0.4655685999649553, "grad_norm": 16.005220815998094, "kl": 0.359375, "learning_rate": 5.346066234448922e-07, "loss": 0.0993, "reward": 1.5720269680023193, "reward_std": 0.21252962946891785, "rewards/accuracy_reward_stage2": 0.8376519083976746, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2657 }, { "completion_length": 11.59375, "epoch": 0.46574382337480286, "grad_norm": 18.98961650790962, "kl": 0.05224609375, "learning_rate": 5.344314000350446e-07, "loss": 0.021, "reward": 1.7668068408966064, "reward_std": 0.23004046082496643, "rewards/accuracy_reward_stage2": 0.7668067812919617, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2658 }, { "completion_length": 8.140625, "epoch": 0.4659190467846504, "grad_norm": 15.850391773857606, "kl": 0.177734375, "learning_rate": 5.342561766251971e-07, "loss": -0.0365, "reward": 1.7493340969085693, "reward_std": 0.26293256878852844, "rewards/accuracy_reward_stage2": 0.7962090373039246, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2659 }, { "completion_length": 7.515625, "epoch": 0.466094270194498, "grad_norm": 16.976524700254885, "kl": 0.1396484375, "learning_rate": 5.340809532153495e-07, "loss": -0.017, "reward": 1.4632868766784668, "reward_std": 0.3231024742126465, "rewards/accuracy_reward_stage2": 0.494536817073822, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2660 }, { "completion_length": 10.125, "epoch": 0.46626949360434555, "grad_norm": 14.97834778598425, "kl": 0.10888671875, "learning_rate": 5.33905729805502e-07, "loss": -0.0117, "reward": 1.3670856952667236, "reward_std": 0.11902426183223724, "rewards/accuracy_reward_stage2": 0.5233356952667236, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2661 }, { "completion_length": 12.609375, "epoch": 0.4664447170141931, "grad_norm": 24.159374464410256, "kl": 0.392578125, "learning_rate": 5.337305063956545e-07, "loss": 0.1385, "reward": 1.3967511653900146, "reward_std": 0.274562805891037, "rewards/accuracy_reward_stage2": 0.6780011653900146, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2662 }, { "completion_length": 12.78125, "epoch": 0.46661994042404065, "grad_norm": 17.379055498809137, "kl": 0.1689453125, "learning_rate": 5.335552829858069e-07, "loss": -0.0268, "reward": 1.5766353607177734, "reward_std": 0.13020777702331543, "rewards/accuracy_reward_stage2": 0.6235103011131287, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2663 }, { "completion_length": 9.4375, "epoch": 0.4667951638338882, "grad_norm": 19.81851006216102, "kl": 0.09521484375, "learning_rate": 5.333800595759594e-07, "loss": 0.0381, "reward": 1.2878808975219727, "reward_std": 0.16528277099132538, "rewards/accuracy_reward_stage2": 0.28788089752197266, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2664 }, { "completion_length": 9.796875, "epoch": 0.46697038724373574, "grad_norm": 22.755169857106104, "kl": 0.26171875, "learning_rate": 5.332048361661117e-07, "loss": 0.0705, "reward": 1.0916426181793213, "reward_std": 0.2884361147880554, "rewards/accuracy_reward_stage2": 0.35726767778396606, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2665 }, { "completion_length": 8.078125, "epoch": 0.46714561065358334, "grad_norm": 14.03332775424425, "kl": 0.10791015625, "learning_rate": 5.330296127562642e-07, "loss": 0.0075, "reward": 1.8164703845977783, "reward_std": 0.17855873703956604, "rewards/accuracy_reward_stage2": 0.8320953845977783, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2666 }, { "completion_length": 9.265625, "epoch": 0.4673208340634309, "grad_norm": 21.478285987226695, "kl": 0.047607421875, "learning_rate": 5.328543893464167e-07, "loss": 0.0191, "reward": 1.6493043899536133, "reward_std": 0.20907220244407654, "rewards/accuracy_reward_stage2": 0.6493044495582581, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2667 }, { "completion_length": 17.453125, "epoch": 0.46749605747327844, "grad_norm": 18.982904311577798, "kl": 0.04248046875, "learning_rate": 5.326791659365691e-07, "loss": 0.0171, "reward": 1.5038068294525146, "reward_std": 0.24129807949066162, "rewards/accuracy_reward_stage2": 0.5038068294525146, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2668 }, { "completion_length": 10.21875, "epoch": 0.467671280883126, "grad_norm": 18.579744416225406, "kl": 0.08837890625, "learning_rate": 5.325039425267216e-07, "loss": -0.0088, "reward": 1.5844638347625732, "reward_std": 0.32090964913368225, "rewards/accuracy_reward_stage2": 0.6000887751579285, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2669 }, { "completion_length": 8.625, "epoch": 0.46784650429297353, "grad_norm": 18.892868541506303, "kl": 0.2080078125, "learning_rate": 5.323287191168739e-07, "loss": 0.0829, "reward": 1.334208369255066, "reward_std": 0.2512897253036499, "rewards/accuracy_reward_stage2": 0.45920833945274353, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2670 }, { "completion_length": 7.5, "epoch": 0.4680217277028211, "grad_norm": 19.860807173605743, "kl": 0.11181640625, "learning_rate": 5.321534957070264e-07, "loss": 0.0006, "reward": 1.6716079711914062, "reward_std": 0.2117011994123459, "rewards/accuracy_reward_stage2": 0.6872329115867615, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2671 }, { "completion_length": 7.65625, "epoch": 0.4681969511126687, "grad_norm": 12.98836942773075, "kl": 0.10546875, "learning_rate": 5.319782722971789e-07, "loss": -0.0019, "reward": 1.8471788167953491, "reward_std": 0.20064638555049896, "rewards/accuracy_reward_stage2": 0.8628038167953491, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2672 }, { "completion_length": 9.890625, "epoch": 0.4683721745225162, "grad_norm": 16.79316864521017, "kl": 0.11572265625, "learning_rate": 5.318030488873313e-07, "loss": 0.0179, "reward": 1.1738388538360596, "reward_std": 0.22742962837219238, "rewards/accuracy_reward_stage2": 0.33008885383605957, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2673 }, { "completion_length": 10.71875, "epoch": 0.4685473979323638, "grad_norm": 15.028835319480741, "kl": 0.09521484375, "learning_rate": 5.316278254774837e-07, "loss": 0.0445, "reward": 1.4867298603057861, "reward_std": 0.1454581469297409, "rewards/accuracy_reward_stage2": 0.6117299199104309, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2674 }, { "completion_length": 12.671875, "epoch": 0.4687226213422113, "grad_norm": 18.414379794358428, "kl": 0.1591796875, "learning_rate": 5.314526020676362e-07, "loss": -0.0015, "reward": 1.24593985080719, "reward_std": 0.25791746377944946, "rewards/accuracy_reward_stage2": 0.2771899104118347, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2675 }, { "completion_length": 9.078125, "epoch": 0.46889784475205887, "grad_norm": 17.428660061327285, "kl": 0.1435546875, "learning_rate": 5.312773786577886e-07, "loss": -0.0857, "reward": 1.5682522058486938, "reward_std": 0.3257772922515869, "rewards/accuracy_reward_stage2": 0.6307522058486938, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2676 }, { "completion_length": 7.984375, "epoch": 0.4690730681619064, "grad_norm": 15.516788823611714, "kl": 0.2060546875, "learning_rate": 5.311021552479411e-07, "loss": 0.041, "reward": 1.3916726112365723, "reward_std": 0.23419056832790375, "rewards/accuracy_reward_stage2": 0.5322976112365723, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2677 }, { "completion_length": 14.671875, "epoch": 0.46924829157175396, "grad_norm": 18.35877218792254, "kl": 0.173828125, "learning_rate": 5.309269318380935e-07, "loss": 0.0084, "reward": 1.2911961078643799, "reward_std": 0.1765722930431366, "rewards/accuracy_reward_stage2": 0.4474460780620575, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2678 }, { "completion_length": 9.5, "epoch": 0.46942351498160156, "grad_norm": 8.312887165929723, "kl": 0.1572265625, "learning_rate": 5.30751708428246e-07, "loss": 0.0626, "reward": 1.453125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward_stage2": 0.578125, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2679 }, { "completion_length": 8.375, "epoch": 0.4695987383914491, "grad_norm": 16.46782716326156, "kl": 0.08447265625, "learning_rate": 5.305764850183985e-07, "loss": -0.0258, "reward": 1.4971678256988525, "reward_std": 0.23257961869239807, "rewards/accuracy_reward_stage2": 0.5284177660942078, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2680 }, { "completion_length": 9.078125, "epoch": 0.46977396180129666, "grad_norm": 19.04409731102832, "kl": 0.11962890625, "learning_rate": 5.304012616085509e-07, "loss": -0.0398, "reward": 1.68888258934021, "reward_std": 0.34067055583000183, "rewards/accuracy_reward_stage2": 0.72013258934021, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2681 }, { "completion_length": 6.578125, "epoch": 0.4699491852111442, "grad_norm": 20.867960532935875, "kl": 0.28515625, "learning_rate": 5.302260381987034e-07, "loss": 0.0963, "reward": 1.5492311716079712, "reward_std": 0.19412344694137573, "rewards/accuracy_reward_stage2": 0.6898561716079712, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2682 }, { "completion_length": 11.78125, "epoch": 0.47012440862099175, "grad_norm": 16.12013757010351, "kl": 0.091796875, "learning_rate": 5.300508147888558e-07, "loss": 0.0368, "reward": 1.4084248542785645, "reward_std": 0.1533108353614807, "rewards/accuracy_reward_stage2": 0.5334248542785645, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2683 }, { "completion_length": 6.703125, "epoch": 0.4702996320308393, "grad_norm": 19.017046076157825, "kl": 0.140625, "learning_rate": 5.298755913790081e-07, "loss": -0.0103, "reward": 1.6729154586791992, "reward_std": 0.2343599647283554, "rewards/accuracy_reward_stage2": 0.7041655778884888, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2684 }, { "completion_length": 10.15625, "epoch": 0.4704748554406869, "grad_norm": 20.280683165766458, "kl": 0.1396484375, "learning_rate": 5.297003679691606e-07, "loss": 0.0559, "reward": 1.545297384262085, "reward_std": 0.14967718720436096, "rewards/accuracy_reward_stage2": 0.5452974438667297, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2685 }, { "completion_length": 7.140625, "epoch": 0.47065007885053445, "grad_norm": 18.72214225563852, "kl": 0.189453125, "learning_rate": 5.29525144559313e-07, "loss": -0.0698, "reward": 1.581083059310913, "reward_std": 0.326572060585022, "rewards/accuracy_reward_stage2": 0.6435831189155579, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2686 }, { "completion_length": 11.5625, "epoch": 0.470825302260382, "grad_norm": 18.248174147174804, "kl": 0.2275390625, "learning_rate": 5.293499211494655e-07, "loss": 0.0168, "reward": 1.6100983619689941, "reward_std": 0.376659095287323, "rewards/accuracy_reward_stage2": 0.6413483619689941, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2687 }, { "completion_length": 7.390625, "epoch": 0.47100052567022954, "grad_norm": 17.62123757873847, "kl": 0.12451171875, "learning_rate": 5.29174697739618e-07, "loss": -0.1241, "reward": 1.5080466270446777, "reward_std": 0.36043059825897217, "rewards/accuracy_reward_stage2": 0.5705466866493225, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2688 }, { "completion_length": 10.34375, "epoch": 0.4711757490800771, "grad_norm": 19.985883149588634, "kl": 0.2470703125, "learning_rate": 5.289994743297704e-07, "loss": 0.0313, "reward": 1.5201334953308105, "reward_std": 0.16913798451423645, "rewards/accuracy_reward_stage2": 0.6763834357261658, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2689 }, { "completion_length": 10.375, "epoch": 0.47135097248992464, "grad_norm": 14.34850276613726, "kl": 0.25390625, "learning_rate": 5.288242509199229e-07, "loss": 0.0128, "reward": 1.7002893686294556, "reward_std": 0.21416278183460236, "rewards/accuracy_reward_stage2": 0.8565393090248108, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2690 }, { "completion_length": 12.828125, "epoch": 0.4715261958997722, "grad_norm": 25.408458323983105, "kl": 0.279296875, "learning_rate": 5.286490275100754e-07, "loss": 0.0334, "reward": 1.489842414855957, "reward_std": 0.1563968062400818, "rewards/accuracy_reward_stage2": 0.6460922956466675, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2691 }, { "completion_length": 7.015625, "epoch": 0.4717014193096198, "grad_norm": 11.780555474494543, "kl": 0.19140625, "learning_rate": 5.284738041002278e-07, "loss": -0.0559, "reward": 1.4947917461395264, "reward_std": 0.2120075523853302, "rewards/accuracy_reward_stage2": 0.5416666865348816, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2692 }, { "completion_length": 12.078125, "epoch": 0.47187664271946733, "grad_norm": 15.257207088001591, "kl": 0.1552734375, "learning_rate": 5.282985806903803e-07, "loss": 0.0179, "reward": 1.0771777629852295, "reward_std": 0.24159622192382812, "rewards/accuracy_reward_stage2": 0.21780279278755188, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2693 }, { "completion_length": 9.0, "epoch": 0.4720518661293149, "grad_norm": 21.52324411905624, "kl": 0.0869140625, "learning_rate": 5.281233572805326e-07, "loss": 0.0251, "reward": 1.5122637748718262, "reward_std": 0.2604824900627136, "rewards/accuracy_reward_stage2": 0.5278887152671814, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2694 }, { "completion_length": 9.71875, "epoch": 0.4722270895391624, "grad_norm": 16.36077000355877, "kl": 0.06982421875, "learning_rate": 5.279481338706851e-07, "loss": -0.0162, "reward": 1.5569807291030884, "reward_std": 0.21449267864227295, "rewards/accuracy_reward_stage2": 0.5726057291030884, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2695 }, { "completion_length": 10.421875, "epoch": 0.47240231294901, "grad_norm": 17.87484625185122, "kl": 0.236328125, "learning_rate": 5.277729104608375e-07, "loss": -0.0391, "reward": 1.5181585550308228, "reward_std": 0.32765793800354004, "rewards/accuracy_reward_stage2": 0.5806585550308228, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2696 }, { "completion_length": 10.109375, "epoch": 0.4725775363588575, "grad_norm": 18.244962917000976, "kl": 0.1875, "learning_rate": 5.275976870509899e-07, "loss": -0.0004, "reward": 1.5278222560882568, "reward_std": 0.2655717730522156, "rewards/accuracy_reward_stage2": 0.5590722560882568, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2697 }, { "completion_length": 9.84375, "epoch": 0.4727527597687051, "grad_norm": 23.181774765042697, "kl": 0.283203125, "learning_rate": 5.274224636411424e-07, "loss": 0.0402, "reward": 1.4854509830474854, "reward_std": 0.2893902063369751, "rewards/accuracy_reward_stage2": 0.6417009830474854, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2698 }, { "completion_length": 9.640625, "epoch": 0.47292798317855267, "grad_norm": 13.396986252061001, "kl": 0.083984375, "learning_rate": 5.272472402312949e-07, "loss": 0.0336, "reward": 1.7557774782180786, "reward_std": 0.19559994339942932, "rewards/accuracy_reward_stage2": 0.7557775378227234, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2699 }, { "completion_length": 6.828125, "epoch": 0.4731032065884002, "grad_norm": 35.2728621141681, "kl": 0.15625, "learning_rate": 5.270720168214473e-07, "loss": 0.0181, "reward": 1.7280571460723877, "reward_std": 0.1603502482175827, "rewards/accuracy_reward_stage2": 0.7436821460723877, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2700 }, { "completion_length": 10.5, "epoch": 0.47327842999824776, "grad_norm": 16.561588817945058, "kl": 0.025634765625, "learning_rate": 5.268967934115998e-07, "loss": 0.0102, "reward": 1.59375, "reward_std": 0.22461533546447754, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2701 }, { "completion_length": 9.71875, "epoch": 0.4734536534080953, "grad_norm": 18.49027431198656, "kl": 0.18359375, "learning_rate": 5.267215700017522e-07, "loss": 0.0291, "reward": 1.696709156036377, "reward_std": 0.3068729639053345, "rewards/accuracy_reward_stage2": 0.712334156036377, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2702 }, { "completion_length": 10.375, "epoch": 0.47362887681794286, "grad_norm": 17.52710806280741, "kl": 0.32421875, "learning_rate": 5.265463465919047e-07, "loss": -0.0549, "reward": 1.6311153173446655, "reward_std": 0.28500238060951233, "rewards/accuracy_reward_stage2": 0.7248653173446655, "rewards/format_reward_stage1_pointerpad": 0.90625, "scores/accuracy_reward_stage2": 0.90625, "step": 2703 }, { "completion_length": 13.265625, "epoch": 0.47380410022779046, "grad_norm": 14.301506601984673, "kl": 0.08251953125, "learning_rate": 5.263711231820572e-07, "loss": -0.0444, "reward": 1.4702627658843994, "reward_std": 0.14659681916236877, "rewards/accuracy_reward_stage2": 0.5015127658843994, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2704 }, { "completion_length": 10.0, "epoch": 0.473979323637638, "grad_norm": 14.95292873090198, "kl": 0.07958984375, "learning_rate": 5.261958997722095e-07, "loss": -0.0122, "reward": 1.3216766119003296, "reward_std": 0.19845643639564514, "rewards/accuracy_reward_stage2": 0.3373016119003296, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2705 }, { "completion_length": 12.875, "epoch": 0.47415454704748555, "grad_norm": 17.955246882117542, "kl": 0.1787109375, "learning_rate": 5.26020676362362e-07, "loss": 0.0042, "reward": 1.5646817684173584, "reward_std": 0.1885027289390564, "rewards/accuracy_reward_stage2": 0.7209317684173584, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2706 }, { "completion_length": 8.328125, "epoch": 0.4743297704573331, "grad_norm": 19.838909273934828, "kl": 0.09375, "learning_rate": 5.258454529525145e-07, "loss": -0.051, "reward": 1.4685415029525757, "reward_std": 0.23180589079856873, "rewards/accuracy_reward_stage2": 0.4997915029525757, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2707 }, { "completion_length": 7.28125, "epoch": 0.47450499386718065, "grad_norm": 19.38312558603043, "kl": 0.142578125, "learning_rate": 5.256702295426669e-07, "loss": 0.0572, "reward": 1.6531894207000732, "reward_std": 0.3191227316856384, "rewards/accuracy_reward_stage2": 0.6531893610954285, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2708 }, { "completion_length": 5.28125, "epoch": 0.4746802172770282, "grad_norm": 15.183205801849777, "kl": 0.109375, "learning_rate": 5.254950061328193e-07, "loss": 0.0121, "reward": 1.630523681640625, "reward_std": 0.2849048376083374, "rewards/accuracy_reward_stage2": 0.6461487412452698, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2709 }, { "completion_length": 10.671875, "epoch": 0.47485544068687574, "grad_norm": 17.944530022570852, "kl": 0.2099609375, "learning_rate": 5.253197827229717e-07, "loss": -0.0046, "reward": 1.6410465240478516, "reward_std": 0.3454177677631378, "rewards/accuracy_reward_stage2": 0.7972965836524963, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2710 }, { "completion_length": 10.03125, "epoch": 0.47503066409672334, "grad_norm": 14.387584231770411, "kl": 0.0537109375, "learning_rate": 5.251445593131242e-07, "loss": 0.0215, "reward": 1.7751755714416504, "reward_std": 0.06710825860500336, "rewards/accuracy_reward_stage2": 0.7751755118370056, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2711 }, { "completion_length": 8.5, "epoch": 0.4752058875065709, "grad_norm": 18.35149562175783, "kl": 0.107421875, "learning_rate": 5.249693359032767e-07, "loss": -0.0011, "reward": 1.4777603149414062, "reward_std": 0.2660313844680786, "rewards/accuracy_reward_stage2": 0.509010374546051, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2712 }, { "completion_length": 9.109375, "epoch": 0.47538111091641844, "grad_norm": 22.14344971109736, "kl": 0.1328125, "learning_rate": 5.247941124934291e-07, "loss": 0.0091, "reward": 1.499030590057373, "reward_std": 0.26058027148246765, "rewards/accuracy_reward_stage2": 0.514655590057373, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2713 }, { "completion_length": 8.09375, "epoch": 0.475556334326266, "grad_norm": 18.117297272818654, "kl": 0.08203125, "learning_rate": 5.246188890835815e-07, "loss": -0.0113, "reward": 1.8141202926635742, "reward_std": 0.22963713109493256, "rewards/accuracy_reward_stage2": 0.8297452926635742, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2714 }, { "completion_length": 7.09375, "epoch": 0.47573155773611353, "grad_norm": 21.12299950414213, "kl": 0.0556640625, "learning_rate": 5.24443665673734e-07, "loss": 0.0223, "reward": 1.625364899635315, "reward_std": 0.26791059970855713, "rewards/accuracy_reward_stage2": 0.6253648996353149, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2715 }, { "completion_length": 9.46875, "epoch": 0.4759067811459611, "grad_norm": 18.622730234407964, "kl": 0.15234375, "learning_rate": 5.242684422638864e-07, "loss": 0.0202, "reward": 1.7383769750595093, "reward_std": 0.2080550491809845, "rewards/accuracy_reward_stage2": 0.754002034664154, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2716 }, { "completion_length": 17.484375, "epoch": 0.4760820045558087, "grad_norm": 19.294998296148034, "kl": 0.0257568359375, "learning_rate": 5.240932188540389e-07, "loss": 0.0103, "reward": 1.5087703466415405, "reward_std": 0.2145693451166153, "rewards/accuracy_reward_stage2": 0.5087703466415405, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2717 }, { "completion_length": 14.46875, "epoch": 0.4762572279656562, "grad_norm": 16.976148575535277, "kl": 0.1328125, "learning_rate": 5.239179954441913e-07, "loss": 0.0185, "reward": 1.5493242740631104, "reward_std": 0.19351491332054138, "rewards/accuracy_reward_stage2": 0.5649492144584656, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2718 }, { "completion_length": 13.140625, "epoch": 0.4764324513755038, "grad_norm": 455.72001856064566, "kl": 2.078125, "learning_rate": 5.237427720343438e-07, "loss": 0.838, "reward": 1.220144510269165, "reward_std": 0.18339301645755768, "rewards/accuracy_reward_stage2": 0.34514448046684265, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2719 }, { "completion_length": 9.359375, "epoch": 0.4766076747853513, "grad_norm": 15.895693514019884, "kl": 0.039794921875, "learning_rate": 5.235675486244963e-07, "loss": 0.016, "reward": 1.7980906963348389, "reward_std": 0.17123383283615112, "rewards/accuracy_reward_stage2": 0.7980905771255493, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2720 }, { "completion_length": 26.078125, "epoch": 0.47678289819519887, "grad_norm": 20.814016048937567, "kl": 0.056640625, "learning_rate": 5.233923252146486e-07, "loss": 0.0227, "reward": 1.6870832443237305, "reward_std": 0.14134840667247772, "rewards/accuracy_reward_stage2": 0.6870833039283752, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2721 }, { "completion_length": 10.125, "epoch": 0.4769581216050464, "grad_norm": 19.542106632980662, "kl": 0.158203125, "learning_rate": 5.232171018048011e-07, "loss": 0.0333, "reward": 1.585869312286377, "reward_std": 0.27999138832092285, "rewards/accuracy_reward_stage2": 0.601494312286377, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2722 }, { "completion_length": 14.953125, "epoch": 0.477133345014894, "grad_norm": 18.92187880218421, "kl": 0.049560546875, "learning_rate": 5.230418783949536e-07, "loss": -0.0244, "reward": 1.6283730268478394, "reward_std": 0.20701447129249573, "rewards/accuracy_reward_stage2": 0.6439980268478394, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2723 }, { "completion_length": 9.421875, "epoch": 0.47730856842474156, "grad_norm": 17.871778532303395, "kl": 0.12890625, "learning_rate": 5.228666549851059e-07, "loss": -0.0161, "reward": 1.4331648349761963, "reward_std": 0.21878674626350403, "rewards/accuracy_reward_stage2": 0.7144148945808411, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2724 }, { "completion_length": 14.25, "epoch": 0.4774837918345891, "grad_norm": 23.580290074841233, "kl": 0.18359375, "learning_rate": 5.226914315752584e-07, "loss": 0.0391, "reward": 1.5170743465423584, "reward_std": 0.3067265748977661, "rewards/accuracy_reward_stage2": 0.5326994061470032, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2725 }, { "completion_length": 9.375, "epoch": 0.47765901524443666, "grad_norm": 12.153889116582954, "kl": 0.142578125, "learning_rate": 5.225162081654108e-07, "loss": -0.0313, "reward": 1.4144365787506104, "reward_std": 0.24257135391235352, "rewards/accuracy_reward_stage2": 0.5706866383552551, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2726 }, { "completion_length": 25.015625, "epoch": 0.4778342386542842, "grad_norm": 24.166691606219224, "kl": 0.1279296875, "learning_rate": 5.223409847555633e-07, "loss": -0.022, "reward": 1.6062610149383545, "reward_std": 0.19640934467315674, "rewards/accuracy_reward_stage2": 0.6375110149383545, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2727 }, { "completion_length": 12.734375, "epoch": 0.47800946206413175, "grad_norm": 15.49223977916494, "kl": 0.109375, "learning_rate": 5.221657613457158e-07, "loss": 0.0146, "reward": 1.351271390914917, "reward_std": 0.15009453892707825, "rewards/accuracy_reward_stage2": 0.4918963313102722, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2728 }, { "completion_length": 10.28125, "epoch": 0.4781846854739793, "grad_norm": 18.615076012997733, "kl": 0.1552734375, "learning_rate": 5.219905379358682e-07, "loss": 0.0306, "reward": 1.407578706741333, "reward_std": 0.19074060022830963, "rewards/accuracy_reward_stage2": 0.4232037663459778, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2729 }, { "completion_length": 14.703125, "epoch": 0.4783599088838269, "grad_norm": 18.12338121605169, "kl": 0.275390625, "learning_rate": 5.218153145260207e-07, "loss": -0.0129, "reward": 1.5154175758361816, "reward_std": 0.34471186995506287, "rewards/accuracy_reward_stage2": 0.5622925758361816, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2730 }, { "completion_length": 8.5625, "epoch": 0.47853513229367445, "grad_norm": 16.168990286816936, "kl": 0.21875, "learning_rate": 5.216400911161732e-07, "loss": 0.0064, "reward": 1.820338249206543, "reward_std": 0.19135203957557678, "rewards/accuracy_reward_stage2": 0.8515881896018982, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2731 }, { "completion_length": 6.75, "epoch": 0.478710355703522, "grad_norm": 18.149000957856924, "kl": 0.134765625, "learning_rate": 5.214648677063256e-07, "loss": 0.0191, "reward": 1.583438515663147, "reward_std": 0.2895791232585907, "rewards/accuracy_reward_stage2": 0.5990635752677917, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2732 }, { "completion_length": 6.875, "epoch": 0.47888557911336954, "grad_norm": 16.452446078960264, "kl": 0.06689453125, "learning_rate": 5.212896442964781e-07, "loss": 0.0267, "reward": 1.7636473178863525, "reward_std": 0.22638459503650665, "rewards/accuracy_reward_stage2": 0.7636473178863525, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2733 }, { "completion_length": 14.546875, "epoch": 0.4790608025232171, "grad_norm": 19.049273000089443, "kl": 0.14453125, "learning_rate": 5.211144208866303e-07, "loss": -0.0298, "reward": 1.4528450965881348, "reward_std": 0.2527550458908081, "rewards/accuracy_reward_stage2": 0.6090949773788452, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2734 }, { "completion_length": 10.90625, "epoch": 0.47923602593306464, "grad_norm": 13.796239244227346, "kl": 0.1455078125, "learning_rate": 5.209391974767828e-07, "loss": 0.0142, "reward": 1.4563570022583008, "reward_std": 0.16542761027812958, "rewards/accuracy_reward_stage2": 0.596981942653656, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2735 }, { "completion_length": 12.46875, "epoch": 0.47941124934291224, "grad_norm": 16.062767527991255, "kl": 0.028564453125, "learning_rate": 5.207639740669353e-07, "loss": 0.0114, "reward": 1.559941053390503, "reward_std": 0.18492963910102844, "rewards/accuracy_reward_stage2": 0.5599411129951477, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2736 }, { "completion_length": 9.21875, "epoch": 0.4795864727527598, "grad_norm": 12.052747871748826, "kl": 0.10400390625, "learning_rate": 5.205887506570877e-07, "loss": -0.0026, "reward": 1.2457869052886963, "reward_std": 0.1598706841468811, "rewards/accuracy_reward_stage2": 0.26141196489334106, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2737 }, { "completion_length": 9.4375, "epoch": 0.47976169616260733, "grad_norm": 17.36749522465019, "kl": 0.2392578125, "learning_rate": 5.204135272472402e-07, "loss": 0.052, "reward": 1.4359642267227173, "reward_std": 0.2153088003396988, "rewards/accuracy_reward_stage2": 0.5765892267227173, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2738 }, { "completion_length": 7.8125, "epoch": 0.4799369195724549, "grad_norm": 18.251925809945238, "kl": 0.1376953125, "learning_rate": 5.202383038373927e-07, "loss": 0.0109, "reward": 1.615727186203003, "reward_std": 0.2166653424501419, "rewards/accuracy_reward_stage2": 0.7563521265983582, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2739 }, { "completion_length": 15.109375, "epoch": 0.4801121429823024, "grad_norm": 19.749273852695445, "kl": 0.0849609375, "learning_rate": 5.200630804275451e-07, "loss": 0.0339, "reward": 1.3953063488006592, "reward_std": 0.2997656464576721, "rewards/accuracy_reward_stage2": 0.3953062891960144, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2740 }, { "completion_length": 7.046875, "epoch": 0.48028736639215, "grad_norm": 17.631041499607417, "kl": 0.185546875, "learning_rate": 5.198878570176976e-07, "loss": 0.0303, "reward": 1.7178881168365479, "reward_std": 0.12682121992111206, "rewards/accuracy_reward_stage2": 0.8585132360458374, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2741 }, { "completion_length": 9.953125, "epoch": 0.4804625898019975, "grad_norm": 12.619249957281948, "kl": 0.1328125, "learning_rate": 5.1971263360785e-07, "loss": 0.0532, "reward": 1.65625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward_stage2": 0.78125, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2742 }, { "completion_length": 14.015625, "epoch": 0.4806378132118451, "grad_norm": 19.162755775127746, "kl": 0.1103515625, "learning_rate": 5.195374101980025e-07, "loss": 0.0018, "reward": 1.5159637928009033, "reward_std": 0.2675696015357971, "rewards/accuracy_reward_stage2": 0.5315887928009033, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2743 }, { "completion_length": 6.6875, "epoch": 0.48081303662169267, "grad_norm": 9.318900647205547, "kl": 0.1181640625, "learning_rate": 5.19362186788155e-07, "loss": 0.0182, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.71875, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2744 }, { "completion_length": 9.390625, "epoch": 0.4809882600315402, "grad_norm": 18.835359657896657, "kl": 0.2021484375, "learning_rate": 5.191869633783073e-07, "loss": 0.052, "reward": 1.319472074508667, "reward_std": 0.24716606736183167, "rewards/accuracy_reward_stage2": 0.4600970447063446, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2745 }, { "completion_length": 8.390625, "epoch": 0.48116348344138776, "grad_norm": 23.08480354751511, "kl": 0.2060546875, "learning_rate": 5.190117399684598e-07, "loss": 0.0439, "reward": 1.4809374809265137, "reward_std": 0.27444180846214294, "rewards/accuracy_reward_stage2": 0.6371875405311584, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2746 }, { "completion_length": 11.34375, "epoch": 0.4813387068512353, "grad_norm": 20.294096164081342, "kl": 0.150390625, "learning_rate": 5.188365165586121e-07, "loss": 0.0319, "reward": 1.4782813787460327, "reward_std": 0.14485354721546173, "rewards/accuracy_reward_stage2": 0.6189062595367432, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2747 }, { "completion_length": 7.3125, "epoch": 0.48151393026108286, "grad_norm": 21.102713733518698, "kl": 0.10791015625, "learning_rate": 5.186612931487646e-07, "loss": 0.0432, "reward": 1.6589291095733643, "reward_std": 0.172014981508255, "rewards/accuracy_reward_stage2": 0.6589291095733643, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2748 }, { "completion_length": 8.765625, "epoch": 0.48168915367093046, "grad_norm": 18.910051182893728, "kl": 0.1259765625, "learning_rate": 5.184860697389171e-07, "loss": 0.0569, "reward": 1.5960662364959717, "reward_std": 0.12384433299303055, "rewards/accuracy_reward_stage2": 0.7210662364959717, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2749 }, { "completion_length": 14.671875, "epoch": 0.481864377080778, "grad_norm": 17.660883704124444, "kl": 0.146484375, "learning_rate": 5.183108463290695e-07, "loss": -0.0074, "reward": 1.6546704769134521, "reward_std": 0.31043297052383423, "rewards/accuracy_reward_stage2": 0.6859204769134521, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2750 }, { "completion_length": 10.546875, "epoch": 0.48203960049062555, "grad_norm": 27.94560857989135, "kl": 0.2578125, "learning_rate": 5.18135622919222e-07, "loss": 0.0145, "reward": 1.5631669759750366, "reward_std": 0.3198142647743225, "rewards/accuracy_reward_stage2": 0.5944169759750366, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2751 }, { "completion_length": 10.15625, "epoch": 0.4822148239004731, "grad_norm": 22.23540615294016, "kl": 0.1611328125, "learning_rate": 5.179603995093745e-07, "loss": 0.0311, "reward": 1.7261333465576172, "reward_std": 0.28550904989242554, "rewards/accuracy_reward_stage2": 0.7417583465576172, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2752 }, { "completion_length": 8.40625, "epoch": 0.48239004731032065, "grad_norm": 18.09184589875181, "kl": 0.2353515625, "learning_rate": 5.177851760995269e-07, "loss": 0.0502, "reward": 1.505954623222351, "reward_std": 0.26220905780792236, "rewards/accuracy_reward_stage2": 0.6465796232223511, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2753 }, { "completion_length": 22.03125, "epoch": 0.4825652707201682, "grad_norm": 22.332245761929883, "kl": 0.10302734375, "learning_rate": 5.176099526896793e-07, "loss": 0.0127, "reward": 1.3112815618515015, "reward_std": 0.19790546596050262, "rewards/accuracy_reward_stage2": 0.4362815022468567, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2754 }, { "completion_length": 10.34375, "epoch": 0.4827404941300158, "grad_norm": 20.220182184480546, "kl": 0.06591796875, "learning_rate": 5.174347292798317e-07, "loss": 0.0265, "reward": 1.569887399673462, "reward_std": 0.1608072817325592, "rewards/accuracy_reward_stage2": 0.5698874592781067, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2755 }, { "completion_length": 8.234375, "epoch": 0.48291571753986334, "grad_norm": 21.54860264069522, "kl": 0.039794921875, "learning_rate": 5.172595058699842e-07, "loss": 0.0159, "reward": 1.6516201496124268, "reward_std": 0.24412159621715546, "rewards/accuracy_reward_stage2": 0.776620090007782, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2756 }, { "completion_length": 10.515625, "epoch": 0.4830909409497109, "grad_norm": 16.106284578983384, "kl": 0.2236328125, "learning_rate": 5.170842824601367e-07, "loss": -0.0316, "reward": 1.4267677068710327, "reward_std": 0.24894431233406067, "rewards/accuracy_reward_stage2": 0.4892677068710327, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2757 }, { "completion_length": 13.125, "epoch": 0.48326616435955844, "grad_norm": 16.33349841945425, "kl": 0.064453125, "learning_rate": 5.169090590502891e-07, "loss": -0.0398, "reward": 1.5895304679870605, "reward_std": 0.216194748878479, "rewards/accuracy_reward_stage2": 0.6207805275917053, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2758 }, { "completion_length": 9.390625, "epoch": 0.483441387769406, "grad_norm": 16.146027542899503, "kl": 0.1474609375, "learning_rate": 5.167338356404415e-07, "loss": 0.0476, "reward": 1.5741090774536133, "reward_std": 0.11098361015319824, "rewards/accuracy_reward_stage2": 0.8241090178489685, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2759 }, { "completion_length": 12.21875, "epoch": 0.48361661117925353, "grad_norm": 20.163523124388767, "kl": 0.1259765625, "learning_rate": 5.16558612230594e-07, "loss": -0.0227, "reward": 1.4342626333236694, "reward_std": 0.21140506863594055, "rewards/accuracy_reward_stage2": 0.5905126333236694, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2760 }, { "completion_length": 10.921875, "epoch": 0.4837918345891011, "grad_norm": 13.666142555657446, "kl": 0.04052734375, "learning_rate": 5.163833888207464e-07, "loss": 0.0163, "reward": 1.6616389751434326, "reward_std": 0.09498253464698792, "rewards/accuracy_reward_stage2": 0.7866389155387878, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2761 }, { "completion_length": 11.171875, "epoch": 0.4839670579989487, "grad_norm": 19.951044847589454, "kl": 0.1416015625, "learning_rate": 5.162081654108989e-07, "loss": -0.0053, "reward": 1.637669324874878, "reward_std": 0.27671176195144653, "rewards/accuracy_reward_stage2": 0.6689193248748779, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2762 }, { "completion_length": 9.53125, "epoch": 0.4841422814087962, "grad_norm": 20.847489567446733, "kl": 0.10107421875, "learning_rate": 5.160329420010512e-07, "loss": 0.0406, "reward": 1.5803248882293701, "reward_std": 0.20699653029441833, "rewards/accuracy_reward_stage2": 0.7053249478340149, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2763 }, { "completion_length": 8.265625, "epoch": 0.4843175048186438, "grad_norm": 18.2142109782604, "kl": 0.115234375, "learning_rate": 5.158577185912037e-07, "loss": 0.0461, "reward": 1.4986896514892578, "reward_std": 0.255196750164032, "rewards/accuracy_reward_stage2": 0.623689591884613, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2764 }, { "completion_length": 9.4375, "epoch": 0.4844927282284913, "grad_norm": 8.677932747497366, "kl": 0.1376953125, "learning_rate": 5.156824951813562e-07, "loss": 0.055, "reward": 1.34375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward_stage2": 0.46875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2765 }, { "completion_length": 10.40625, "epoch": 0.48466795163833887, "grad_norm": 30.869785143898685, "kl": 0.1259765625, "learning_rate": 5.155072717715086e-07, "loss": -0.0505, "reward": 1.3989291191101074, "reward_std": 0.253897100687027, "rewards/accuracy_reward_stage2": 0.5708041787147522, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2766 }, { "completion_length": 9.59375, "epoch": 0.4848431750481864, "grad_norm": 12.300031714411467, "kl": 0.0693359375, "learning_rate": 5.153320483616611e-07, "loss": 0.0276, "reward": 1.6179606914520264, "reward_std": 0.09867061674594879, "rewards/accuracy_reward_stage2": 0.6179606914520264, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2767 }, { "completion_length": 7.328125, "epoch": 0.485018398458034, "grad_norm": 12.288995085247821, "kl": 0.1787109375, "learning_rate": 5.151568249518136e-07, "loss": -0.0257, "reward": 1.5489342212677002, "reward_std": 0.16947272419929504, "rewards/accuracy_reward_stage2": 0.5958091616630554, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2768 }, { "completion_length": 10.703125, "epoch": 0.48519362186788156, "grad_norm": 16.628816013262394, "kl": 0.09033203125, "learning_rate": 5.14981601541966e-07, "loss": -0.0414, "reward": 1.580203890800476, "reward_std": 0.21251340210437775, "rewards/accuracy_reward_stage2": 0.6114539504051208, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2769 }, { "completion_length": 9.8125, "epoch": 0.4853688452777291, "grad_norm": 19.879967908938465, "kl": 0.16796875, "learning_rate": 5.148063781321185e-07, "loss": 0.0068, "reward": 1.3905811309814453, "reward_std": 0.2664790153503418, "rewards/accuracy_reward_stage2": 0.5468310117721558, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2770 }, { "completion_length": 9.5, "epoch": 0.48554406868757666, "grad_norm": 21.409098111522415, "kl": 0.177734375, "learning_rate": 5.146311547222709e-07, "loss": 0.04, "reward": 1.4670194387435913, "reward_std": 0.3582940399646759, "rewards/accuracy_reward_stage2": 0.6076444387435913, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2771 }, { "completion_length": 10.125, "epoch": 0.4857192920974242, "grad_norm": 17.443727373495843, "kl": 0.0771484375, "learning_rate": 5.144559313124233e-07, "loss": -0.0068, "reward": 1.7224445343017578, "reward_std": 0.1532498300075531, "rewards/accuracy_reward_stage2": 0.7380695939064026, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2772 }, { "completion_length": 9.375, "epoch": 0.48589451550727175, "grad_norm": 18.096878447544068, "kl": 0.1962890625, "learning_rate": 5.142807079025758e-07, "loss": 0.0119, "reward": 1.708147406578064, "reward_std": 0.2741885185241699, "rewards/accuracy_reward_stage2": 0.7393973469734192, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2773 }, { "completion_length": 11.8125, "epoch": 0.48606973891711935, "grad_norm": 23.912948047153062, "kl": 0.259765625, "learning_rate": 5.141054844927281e-07, "loss": 0.0469, "reward": 1.370512843132019, "reward_std": 0.3569799065589905, "rewards/accuracy_reward_stage2": 0.5267627835273743, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2774 }, { "completion_length": 15.046875, "epoch": 0.4862449623269669, "grad_norm": 20.040363660948454, "kl": 0.2275390625, "learning_rate": 5.139302610828806e-07, "loss": 0.0851, "reward": 1.113850712776184, "reward_std": 0.21950891613960266, "rewards/accuracy_reward_stage2": 0.3638507127761841, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2775 }, { "completion_length": 7.0625, "epoch": 0.48642018573681445, "grad_norm": 17.11712035620011, "kl": 0.04052734375, "learning_rate": 5.137550376730331e-07, "loss": 0.0162, "reward": 1.7094886302947998, "reward_std": 0.18887673318386078, "rewards/accuracy_reward_stage2": 0.709488570690155, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2776 }, { "completion_length": 11.046875, "epoch": 0.486595409146662, "grad_norm": 14.330378866431904, "kl": 0.054443359375, "learning_rate": 5.135798142631855e-07, "loss": -0.0223, "reward": 1.7937867641448975, "reward_std": 0.1975262314081192, "rewards/accuracy_reward_stage2": 0.8094117641448975, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2777 }, { "completion_length": 8.875, "epoch": 0.48677063255650954, "grad_norm": 14.410387426620687, "kl": 0.09423828125, "learning_rate": 5.13404590853338e-07, "loss": 0.0377, "reward": 1.3828563690185547, "reward_std": 0.20289446413516998, "rewards/accuracy_reward_stage2": 0.3828563094139099, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2778 }, { "completion_length": 10.875, "epoch": 0.4869458559663571, "grad_norm": 20.288824451679844, "kl": 0.2578125, "learning_rate": 5.132293674434904e-07, "loss": 0.1027, "reward": 1.5620172023773193, "reward_std": 0.21671007573604584, "rewards/accuracy_reward_stage2": 0.6870173215866089, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2779 }, { "completion_length": 9.96875, "epoch": 0.48712107937620464, "grad_norm": 20.68604980109284, "kl": 0.169921875, "learning_rate": 5.130541440336429e-07, "loss": 0.0677, "reward": 1.5459976196289062, "reward_std": 0.30775919556617737, "rewards/accuracy_reward_stage2": 0.6709976196289062, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2780 }, { "completion_length": 11.015625, "epoch": 0.48729630278605224, "grad_norm": 17.98928611390982, "kl": 0.05078125, "learning_rate": 5.128789206237954e-07, "loss": -0.0015, "reward": 1.6310027837753296, "reward_std": 0.28292495012283325, "rewards/accuracy_reward_stage2": 0.6466277837753296, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2781 }, { "completion_length": 8.15625, "epoch": 0.4874715261958998, "grad_norm": 19.448061377008912, "kl": 0.2412109375, "learning_rate": 5.127036972139478e-07, "loss": 0.0522, "reward": 1.4846069812774658, "reward_std": 0.21984370052814484, "rewards/accuracy_reward_stage2": 0.6252319812774658, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2782 }, { "completion_length": 10.046875, "epoch": 0.48764674960574733, "grad_norm": 16.371705096836756, "kl": 0.076171875, "learning_rate": 5.125284738041003e-07, "loss": -0.009, "reward": 1.4768791198730469, "reward_std": 0.21295681595802307, "rewards/accuracy_reward_stage2": 0.4925040900707245, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2783 }, { "completion_length": 15.4375, "epoch": 0.4878219730155949, "grad_norm": 16.31041929935746, "kl": 0.07861328125, "learning_rate": 5.123532503942527e-07, "loss": 0.0, "reward": 1.5155887603759766, "reward_std": 0.2410578578710556, "rewards/accuracy_reward_stage2": 0.5312137007713318, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2784 }, { "completion_length": 6.984375, "epoch": 0.4879971964254424, "grad_norm": 20.353214126347986, "kl": 0.1748046875, "learning_rate": 5.12178026984405e-07, "loss": -0.0075, "reward": 1.3905422687530518, "reward_std": 0.25600743293762207, "rewards/accuracy_reward_stage2": 0.5467923283576965, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2785 }, { "completion_length": 8.265625, "epoch": 0.48817241983529, "grad_norm": 21.969311129374322, "kl": 0.09423828125, "learning_rate": 5.120028035745575e-07, "loss": 0.0171, "reward": 1.532669186592102, "reward_std": 0.25590693950653076, "rewards/accuracy_reward_stage2": 0.548294186592102, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2786 }, { "completion_length": 12.5625, "epoch": 0.4883476432451376, "grad_norm": 20.324773293148578, "kl": 0.0294189453125, "learning_rate": 5.118275801647099e-07, "loss": 0.0117, "reward": 1.6665804386138916, "reward_std": 0.2079043984413147, "rewards/accuracy_reward_stage2": 0.666580319404602, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2787 }, { "completion_length": 8.484375, "epoch": 0.4885228666549851, "grad_norm": 28.316353788852847, "kl": 0.1669921875, "learning_rate": 5.116523567548624e-07, "loss": 0.0384, "reward": 1.57529616355896, "reward_std": 0.41628655791282654, "rewards/accuracy_reward_stage2": 0.59092116355896, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2788 }, { "completion_length": 9.265625, "epoch": 0.48869809006483267, "grad_norm": 23.53590174525446, "kl": 0.2080078125, "learning_rate": 5.114771333450149e-07, "loss": -0.012, "reward": 1.5387616157531738, "reward_std": 0.22167927026748657, "rewards/accuracy_reward_stage2": 0.7106366157531738, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2789 }, { "completion_length": 9.3125, "epoch": 0.4888733134746802, "grad_norm": 24.876890170809475, "kl": 0.201171875, "learning_rate": 5.113019099351673e-07, "loss": 0.0098, "reward": 1.638625144958496, "reward_std": 0.35675370693206787, "rewards/accuracy_reward_stage2": 0.6698752641677856, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2790 }, { "completion_length": 14.4375, "epoch": 0.48904853688452776, "grad_norm": 15.22026672089996, "kl": 0.1015625, "learning_rate": 5.111266865253198e-07, "loss": -0.0435, "reward": 1.7338073253631592, "reward_std": 0.19504126906394958, "rewards/accuracy_reward_stage2": 0.765057384967804, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2791 }, { "completion_length": 8.578125, "epoch": 0.4892237602943753, "grad_norm": 24.471302996369715, "kl": 0.09814453125, "learning_rate": 5.109514631154723e-07, "loss": -0.005, "reward": 1.4096736907958984, "reward_std": 0.3426571488380432, "rewards/accuracy_reward_stage2": 0.42529866099357605, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2792 }, { "completion_length": 12.609375, "epoch": 0.4893989837042229, "grad_norm": 19.71721072991048, "kl": 0.1650390625, "learning_rate": 5.107762397056246e-07, "loss": 0.0344, "reward": 1.5647720098495483, "reward_std": 0.2316807359457016, "rewards/accuracy_reward_stage2": 0.5803970098495483, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2793 }, { "completion_length": 8.4375, "epoch": 0.48957420711407046, "grad_norm": 29.986452757482443, "kl": 0.2353515625, "learning_rate": 5.106010162957771e-07, "loss": 0.0639, "reward": 1.599445104598999, "reward_std": 0.4075753092765808, "rewards/accuracy_reward_stage2": 0.615070104598999, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2794 }, { "completion_length": 16.359375, "epoch": 0.489749430523918, "grad_norm": 16.643489660943185, "kl": 0.119140625, "learning_rate": 5.104257928859295e-07, "loss": -0.0184, "reward": 1.2576736211776733, "reward_std": 0.22650864720344543, "rewards/accuracy_reward_stage2": 0.2889236509799957, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2795 }, { "completion_length": 15.671875, "epoch": 0.48992465393376555, "grad_norm": 15.229093807517911, "kl": 0.1982421875, "learning_rate": 5.10250569476082e-07, "loss": -0.0527, "reward": 1.4054017066955566, "reward_std": 0.18351644277572632, "rewards/accuracy_reward_stage2": 0.5772767663002014, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2796 }, { "completion_length": 8.109375, "epoch": 0.4900998773436131, "grad_norm": 19.304101051588315, "kl": 0.1962890625, "learning_rate": 5.100753460662345e-07, "loss": -0.0152, "reward": 1.6429263353347778, "reward_std": 0.30045899748802185, "rewards/accuracy_reward_stage2": 0.6898013949394226, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2797 }, { "completion_length": 9.84375, "epoch": 0.49027510075346065, "grad_norm": 14.141398792127648, "kl": 0.158203125, "learning_rate": 5.099001226563868e-07, "loss": 0.0192, "reward": 1.5334928035736084, "reward_std": 0.2365182787179947, "rewards/accuracy_reward_stage2": 0.549117922782898, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2798 }, { "completion_length": 5.828125, "epoch": 0.4904503241633082, "grad_norm": 16.55445750105496, "kl": 0.0673828125, "learning_rate": 5.097248992465393e-07, "loss": 0.027, "reward": 1.758355975151062, "reward_std": 0.20726990699768066, "rewards/accuracy_reward_stage2": 0.758355975151062, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2799 }, { "completion_length": 13.375, "epoch": 0.4906255475731558, "grad_norm": 26.371272784505244, "kl": 0.248046875, "learning_rate": 5.095496758366918e-07, "loss": 0.0994, "reward": 1.5160192251205444, "reward_std": 0.11810261011123657, "rewards/accuracy_reward_stage2": 0.6410192251205444, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2800 }, { "completion_length": 13.515625, "epoch": 0.49080077098300334, "grad_norm": 14.68377419112434, "kl": 0.06396484375, "learning_rate": 5.093744524268442e-07, "loss": 0.0255, "reward": 1.655139684677124, "reward_std": 0.08718352019786835, "rewards/accuracy_reward_stage2": 0.6551397442817688, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2801 }, { "completion_length": 14.921875, "epoch": 0.4909759943928509, "grad_norm": 20.048786580469528, "kl": 0.2314453125, "learning_rate": 5.091992290169967e-07, "loss": -0.0057, "reward": 1.4953479766845703, "reward_std": 0.29281944036483765, "rewards/accuracy_reward_stage2": 0.5422229170799255, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2802 }, { "completion_length": 9.21875, "epoch": 0.49115121780269844, "grad_norm": 16.685234932334513, "kl": 0.08984375, "learning_rate": 5.09024005607149e-07, "loss": -0.0077, "reward": 1.5765955448150635, "reward_std": 0.2887122929096222, "rewards/accuracy_reward_stage2": 0.5922205448150635, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2803 }, { "completion_length": 8.453125, "epoch": 0.491326441212546, "grad_norm": 19.963744178716368, "kl": 0.255859375, "learning_rate": 5.088487821973015e-07, "loss": -0.0377, "reward": 1.568144679069519, "reward_std": 0.2530245780944824, "rewards/accuracy_reward_stage2": 0.6306445598602295, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2804 }, { "completion_length": 12.703125, "epoch": 0.49150166462239353, "grad_norm": 17.01684463077485, "kl": 0.1748046875, "learning_rate": 5.08673558787454e-07, "loss": 0.0697, "reward": 1.2630748748779297, "reward_std": 0.22368191182613373, "rewards/accuracy_reward_stage2": 0.5130749344825745, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2805 }, { "completion_length": 13.1875, "epoch": 0.49167688803224113, "grad_norm": 23.596113627385748, "kl": 0.1279296875, "learning_rate": 5.084983353776064e-07, "loss": 0.051, "reward": 1.5360572338104248, "reward_std": 0.3088216781616211, "rewards/accuracy_reward_stage2": 0.6610572338104248, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2806 }, { "completion_length": 10.84375, "epoch": 0.4918521114420887, "grad_norm": 14.753660421038694, "kl": 0.1357421875, "learning_rate": 5.083231119677589e-07, "loss": 0.01, "reward": 1.46462881565094, "reward_std": 0.18203243613243103, "rewards/accuracy_reward_stage2": 0.6052538156509399, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2807 }, { "completion_length": 15.875, "epoch": 0.4920273348519362, "grad_norm": 20.038379015214403, "kl": 0.060791015625, "learning_rate": 5.081478885579114e-07, "loss": 0.0244, "reward": 1.5692226886749268, "reward_std": 0.07769454270601273, "rewards/accuracy_reward_stage2": 0.569222629070282, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2808 }, { "completion_length": 15.8125, "epoch": 0.4922025582617838, "grad_norm": 17.85182485742762, "kl": 0.09326171875, "learning_rate": 5.079726651480638e-07, "loss": -0.0049, "reward": 1.5670794248580933, "reward_std": 0.17647606134414673, "rewards/accuracy_reward_stage2": 0.5827044248580933, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2809 }, { "completion_length": 12.015625, "epoch": 0.4923777816716313, "grad_norm": 20.31571255444851, "kl": 0.279296875, "learning_rate": 5.077974417382162e-07, "loss": 0.1007, "reward": 1.331108808517456, "reward_std": 0.18084828555583954, "rewards/accuracy_reward_stage2": 0.4717338979244232, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2810 }, { "completion_length": 12.109375, "epoch": 0.49255300508147887, "grad_norm": 14.528217315109835, "kl": 0.146484375, "learning_rate": 5.076222183283686e-07, "loss": -0.0516, "reward": 1.3952136039733887, "reward_std": 0.20071014761924744, "rewards/accuracy_reward_stage2": 0.44208866357803345, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2811 }, { "completion_length": 12.71875, "epoch": 0.4927282284913264, "grad_norm": 15.939315200282937, "kl": 0.1630859375, "learning_rate": 5.074469949185211e-07, "loss": -0.0107, "reward": 1.4925997257232666, "reward_std": 0.23163168132305145, "rewards/accuracy_reward_stage2": 0.6488497257232666, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2812 }, { "completion_length": 9.8125, "epoch": 0.492903451901174, "grad_norm": 14.135382545836258, "kl": 0.1171875, "learning_rate": 5.072717715086735e-07, "loss": -0.0269, "reward": 1.4884263277053833, "reward_std": 0.2511558532714844, "rewards/accuracy_reward_stage2": 0.5196763277053833, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2813 }, { "completion_length": 20.140625, "epoch": 0.49307867531102156, "grad_norm": 18.67766438645424, "kl": 0.068359375, "learning_rate": 5.070965480988259e-07, "loss": 0.0273, "reward": 1.5892879962921143, "reward_std": 0.16745012998580933, "rewards/accuracy_reward_stage2": 0.5892879962921143, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2814 }, { "completion_length": 7.984375, "epoch": 0.4932538987208691, "grad_norm": 27.37512850007755, "kl": 0.2001953125, "learning_rate": 5.069213246889784e-07, "loss": 0.0801, "reward": 1.6561710834503174, "reward_std": 0.16692574322223663, "rewards/accuracy_reward_stage2": 0.6561710834503174, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2815 }, { "completion_length": 10.28125, "epoch": 0.49342912213071666, "grad_norm": 17.381150811786746, "kl": 0.1103515625, "learning_rate": 5.067461012791309e-07, "loss": 0.0081, "reward": 1.5132172107696533, "reward_std": 0.25957632064819336, "rewards/accuracy_reward_stage2": 0.5288421511650085, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2816 }, { "completion_length": 16.078125, "epoch": 0.4936043455405642, "grad_norm": 15.233822447838989, "kl": 0.037109375, "learning_rate": 5.065708778692833e-07, "loss": 0.0149, "reward": 1.5294086933135986, "reward_std": 0.2323162704706192, "rewards/accuracy_reward_stage2": 0.5294086337089539, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2817 }, { "completion_length": 10.578125, "epoch": 0.49377956895041175, "grad_norm": 17.999832607856707, "kl": 0.166015625, "learning_rate": 5.063956544594358e-07, "loss": -0.0649, "reward": 1.5089285373687744, "reward_std": 0.3682054281234741, "rewards/accuracy_reward_stage2": 0.5714285969734192, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2818 }, { "completion_length": 13.1875, "epoch": 0.49395479236025935, "grad_norm": 16.343371924459856, "kl": 0.08837890625, "learning_rate": 5.062204310495882e-07, "loss": -0.0088, "reward": 1.4112939834594727, "reward_std": 0.21046367287635803, "rewards/accuracy_reward_stage2": 0.42691895365715027, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2819 }, { "completion_length": 16.453125, "epoch": 0.4941300157701069, "grad_norm": 20.203369112743434, "kl": 0.10595703125, "learning_rate": 5.060452076397407e-07, "loss": 0.009, "reward": 1.5565242767333984, "reward_std": 0.2382836639881134, "rewards/accuracy_reward_stage2": 0.5721493363380432, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2820 }, { "completion_length": 6.1875, "epoch": 0.49430523917995445, "grad_norm": 15.530217389099516, "kl": 0.1015625, "learning_rate": 5.058699842298932e-07, "loss": -0.0477, "reward": 1.446732997894287, "reward_std": 0.22207242250442505, "rewards/accuracy_reward_stage2": 0.47798293828964233, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2821 }, { "completion_length": 8.703125, "epoch": 0.494480462589802, "grad_norm": 20.575110208490617, "kl": 0.134765625, "learning_rate": 5.056947608200456e-07, "loss": -0.0344, "reward": 1.4416877031326294, "reward_std": 0.29794514179229736, "rewards/accuracy_reward_stage2": 0.4729377031326294, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2822 }, { "completion_length": 7.578125, "epoch": 0.49465568599964954, "grad_norm": 13.743752274823677, "kl": 0.25, "learning_rate": 5.055195374101979e-07, "loss": 0.0217, "reward": 1.6781415939331055, "reward_std": 0.2623763084411621, "rewards/accuracy_reward_stage2": 0.7093915939331055, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2823 }, { "completion_length": 10.75, "epoch": 0.4948309094094971, "grad_norm": 16.738230361892004, "kl": 0.06787109375, "learning_rate": 5.053443140003503e-07, "loss": 0.0062, "reward": 1.3226996660232544, "reward_std": 0.19135063886642456, "rewards/accuracy_reward_stage2": 0.4633246660232544, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2824 }, { "completion_length": 8.015625, "epoch": 0.4950061328193447, "grad_norm": 13.641295648230404, "kl": 0.16796875, "learning_rate": 5.051690905905028e-07, "loss": -0.021, "reward": 1.5862393379211426, "reward_std": 0.2807191014289856, "rewards/accuracy_reward_stage2": 0.6174893975257874, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2825 }, { "completion_length": 9.609375, "epoch": 0.49518135622919224, "grad_norm": 19.495140221339017, "kl": 0.0302734375, "learning_rate": 5.049938671806553e-07, "loss": 0.0121, "reward": 1.5694129467010498, "reward_std": 0.2659677267074585, "rewards/accuracy_reward_stage2": 0.569412887096405, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2826 }, { "completion_length": 8.90625, "epoch": 0.4953565796390398, "grad_norm": 19.298477917356017, "kl": 0.11962890625, "learning_rate": 5.048186437708077e-07, "loss": 0.0036, "reward": 1.730294942855835, "reward_std": 0.17712485790252686, "rewards/accuracy_reward_stage2": 0.745919942855835, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2827 }, { "completion_length": 13.28125, "epoch": 0.49553180304888733, "grad_norm": 16.150852865470974, "kl": 0.08251953125, "learning_rate": 5.046434203609602e-07, "loss": -0.0552, "reward": 1.5572917461395264, "reward_std": 0.2696126699447632, "rewards/accuracy_reward_stage2": 0.5885416269302368, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2828 }, { "completion_length": 12.265625, "epoch": 0.4957070264587349, "grad_norm": 24.21432717723203, "kl": 0.375, "learning_rate": 5.044681969511127e-07, "loss": 0.0217, "reward": 1.6967790126800537, "reward_std": 0.31078892946243286, "rewards/accuracy_reward_stage2": 0.7592791318893433, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2829 }, { "completion_length": 10.671875, "epoch": 0.4958822498685824, "grad_norm": 16.31810279334247, "kl": 0.142578125, "learning_rate": 5.042929735412651e-07, "loss": -0.0225, "reward": 1.5832899808883667, "reward_std": 0.17362064123153687, "rewards/accuracy_reward_stage2": 0.6145399808883667, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2830 }, { "completion_length": 9.921875, "epoch": 0.49605747327843, "grad_norm": 12.727870552143301, "kl": 0.0791015625, "learning_rate": 5.041177501314176e-07, "loss": 0.0316, "reward": 1.6768295764923096, "reward_std": 0.13108648359775543, "rewards/accuracy_reward_stage2": 0.6768295764923096, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2831 }, { "completion_length": 26.1875, "epoch": 0.4962326966882776, "grad_norm": 19.66793729672711, "kl": 0.458984375, "learning_rate": 5.0394252672157e-07, "loss": 0.1178, "reward": 1.4868590831756592, "reward_std": 0.2769075632095337, "rewards/accuracy_reward_stage2": 0.6431090831756592, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2832 }, { "completion_length": 13.5, "epoch": 0.4964079200981251, "grad_norm": 18.268128966158116, "kl": 0.140625, "learning_rate": 5.037673033117224e-07, "loss": -0.042, "reward": 1.4005197286605835, "reward_std": 0.31960806250572205, "rewards/accuracy_reward_stage2": 0.4473947286605835, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2833 }, { "completion_length": 8.65625, "epoch": 0.49658314350797267, "grad_norm": 20.947171506470056, "kl": 0.05712890625, "learning_rate": 5.035920799018749e-07, "loss": 0.0229, "reward": 1.4880142211914062, "reward_std": 0.3103662133216858, "rewards/accuracy_reward_stage2": 0.48801422119140625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2834 }, { "completion_length": 7.78125, "epoch": 0.4967583669178202, "grad_norm": 11.906433925116346, "kl": 0.1171875, "learning_rate": 5.034168564920273e-07, "loss": -0.0209, "reward": 1.453125, "reward_std": 0.1804211586713791, "rewards/accuracy_reward_stage2": 0.609375, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2835 }, { "completion_length": 14.6875, "epoch": 0.49693359032766776, "grad_norm": 20.864580568318974, "kl": 0.1328125, "learning_rate": 5.032416330821797e-07, "loss": -0.0142, "reward": 1.266761302947998, "reward_std": 0.23117133975028992, "rewards/accuracy_reward_stage2": 0.29801127314567566, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2836 }, { "completion_length": 7.59375, "epoch": 0.4971088137375153, "grad_norm": 17.392723074485005, "kl": 0.1728515625, "learning_rate": 5.030664096723322e-07, "loss": -0.004, "reward": 1.7567867040634155, "reward_std": 0.3129429817199707, "rewards/accuracy_reward_stage2": 0.7880366444587708, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2837 }, { "completion_length": 7.390625, "epoch": 0.4972840371473629, "grad_norm": 22.961837058645735, "kl": 0.216796875, "learning_rate": 5.028911862624846e-07, "loss": -0.0142, "reward": 1.252758502960205, "reward_std": 0.1922820806503296, "rewards/accuracy_reward_stage2": 0.2996334433555603, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2838 }, { "completion_length": 8.984375, "epoch": 0.49745926055721046, "grad_norm": 12.840102408536092, "kl": 0.171875, "learning_rate": 5.027159628526371e-07, "loss": -0.0292, "reward": 1.4514057636260986, "reward_std": 0.15803693234920502, "rewards/accuracy_reward_stage2": 0.4982808530330658, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2839 }, { "completion_length": 4.90625, "epoch": 0.497634483967058, "grad_norm": 16.67158345880587, "kl": 0.119140625, "learning_rate": 5.025407394427895e-07, "loss": -0.0408, "reward": 1.6197917461395264, "reward_std": 0.2089996337890625, "rewards/accuracy_reward_stage2": 0.6510416269302368, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2840 }, { "completion_length": 8.5625, "epoch": 0.49780970737690555, "grad_norm": 16.771018106748706, "kl": 0.052001953125, "learning_rate": 5.02365516032942e-07, "loss": 0.0209, "reward": 1.415387511253357, "reward_std": 0.2690582871437073, "rewards/accuracy_reward_stage2": 0.5403875112533569, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2841 }, { "completion_length": 9.984375, "epoch": 0.4979849307867531, "grad_norm": 17.667681684297957, "kl": 0.10546875, "learning_rate": 5.021902926230945e-07, "loss": 0.0421, "reward": 1.6814024448394775, "reward_std": 0.2051592618227005, "rewards/accuracy_reward_stage2": 0.6814025640487671, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2842 }, { "completion_length": 9.53125, "epoch": 0.49816015419660065, "grad_norm": 17.135267251551593, "kl": 0.11181640625, "learning_rate": 5.020150692132468e-07, "loss": 0.0005, "reward": 1.5520660877227783, "reward_std": 0.16430020332336426, "rewards/accuracy_reward_stage2": 0.6926910877227783, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2843 }, { "completion_length": 11.34375, "epoch": 0.49833537760644825, "grad_norm": 23.174818431076805, "kl": 0.27734375, "learning_rate": 5.018398458033993e-07, "loss": -0.081, "reward": 1.4596951007843018, "reward_std": 0.29588693380355835, "rewards/accuracy_reward_stage2": 0.5378201007843018, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 2844 }, { "completion_length": 14.53125, "epoch": 0.4985106010162958, "grad_norm": 41.97659001913079, "kl": 0.453125, "learning_rate": 5.016646223935518e-07, "loss": 0.1008, "reward": 1.3339383602142334, "reward_std": 0.2974085211753845, "rewards/accuracy_reward_stage2": 0.4901883602142334, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2845 }, { "completion_length": 8.484375, "epoch": 0.49868582442614334, "grad_norm": 19.85600785091683, "kl": 0.150390625, "learning_rate": 5.014893989837042e-07, "loss": 0.0314, "reward": 1.6979167461395264, "reward_std": 0.35343262553215027, "rewards/accuracy_reward_stage2": 0.7135416865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2846 }, { "completion_length": 15.671875, "epoch": 0.4988610478359909, "grad_norm": 18.853279175915038, "kl": 0.162109375, "learning_rate": 5.013141755738567e-07, "loss": -0.025, "reward": 1.6923900842666626, "reward_std": 0.23192203044891357, "rewards/accuracy_reward_stage2": 0.7392650246620178, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2847 }, { "completion_length": 6.65625, "epoch": 0.49903627124583844, "grad_norm": 22.894855403366556, "kl": 0.0751953125, "learning_rate": 5.011389521640091e-07, "loss": 0.03, "reward": 1.691043734550476, "reward_std": 0.20182660222053528, "rewards/accuracy_reward_stage2": 0.6910437345504761, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2848 }, { "completion_length": 17.234375, "epoch": 0.499211494655686, "grad_norm": 18.017055889535754, "kl": 0.10498046875, "learning_rate": 5.009637287541615e-07, "loss": -0.0363, "reward": 1.4788634777069092, "reward_std": 0.2652289867401123, "rewards/accuracy_reward_stage2": 0.6351134181022644, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2849 }, { "completion_length": 10.84375, "epoch": 0.49938671806553353, "grad_norm": 18.858962789288007, "kl": 0.1591796875, "learning_rate": 5.00788505344314e-07, "loss": 0.0292, "reward": 1.5011869668960571, "reward_std": 0.1766315996646881, "rewards/accuracy_reward_stage2": 0.6418120265007019, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2850 }, { "completion_length": 8.65625, "epoch": 0.49956194147538113, "grad_norm": 18.731961150873207, "kl": 0.2470703125, "learning_rate": 5.006132819344664e-07, "loss": 0.0211, "reward": 1.4539709091186523, "reward_std": 0.3791165053844452, "rewards/accuracy_reward_stage2": 0.4852209687232971, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2851 }, { "completion_length": 9.703125, "epoch": 0.4997371648852287, "grad_norm": 16.94499504943766, "kl": 0.06640625, "learning_rate": 5.004380585246189e-07, "loss": -0.0176, "reward": 1.596168041229248, "reward_std": 0.15915852785110474, "rewards/accuracy_reward_stage2": 0.736793041229248, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2852 }, { "completion_length": 12.203125, "epoch": 0.4999123882950762, "grad_norm": 21.043689187254092, "kl": 0.1611328125, "learning_rate": 5.002628351147713e-07, "loss": 0.0644, "reward": 1.3016023635864258, "reward_std": 0.22436624765396118, "rewards/accuracy_reward_stage2": 0.426602303981781, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2853 }, { "completion_length": 8.21875, "epoch": 0.5000876117049238, "grad_norm": 14.048458594879166, "kl": 0.03662109375, "learning_rate": 5.000876117049237e-07, "loss": 0.0147, "reward": 1.833432912826538, "reward_std": 0.153619185090065, "rewards/accuracy_reward_stage2": 0.8334329128265381, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2854 }, { "completion_length": 10.421875, "epoch": 0.5002628351147713, "grad_norm": 21.185086913316407, "kl": 0.05810546875, "learning_rate": 4.999123882950762e-07, "loss": 0.0232, "reward": 1.7559123039245605, "reward_std": 0.22805647552013397, "rewards/accuracy_reward_stage2": 0.7559122443199158, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2855 }, { "completion_length": 14.0, "epoch": 0.5004380585246189, "grad_norm": 18.58990052940012, "kl": 0.04248046875, "learning_rate": 4.997371648852286e-07, "loss": 0.017, "reward": 1.4115819931030273, "reward_std": 0.21287208795547485, "rewards/accuracy_reward_stage2": 0.4115820527076721, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2856 }, { "completion_length": 9.78125, "epoch": 0.5006132819344664, "grad_norm": 27.798694752355708, "kl": 0.10302734375, "learning_rate": 4.995619414753811e-07, "loss": -0.0028, "reward": 1.7112268209457397, "reward_std": 0.28396135568618774, "rewards/accuracy_reward_stage2": 0.7268518805503845, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2857 }, { "completion_length": 12.65625, "epoch": 0.500788505344314, "grad_norm": 18.676237589883304, "kl": 0.142578125, "learning_rate": 4.993867180655335e-07, "loss": -0.0206, "reward": 1.7398256063461304, "reward_std": 0.24794884026050568, "rewards/accuracy_reward_stage2": 0.7710756063461304, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2858 }, { "completion_length": 16.875, "epoch": 0.5009637287541615, "grad_norm": 21.658396617934685, "kl": 0.1142578125, "learning_rate": 4.99211494655686e-07, "loss": 0.0457, "reward": 1.3541054725646973, "reward_std": 0.24178023636341095, "rewards/accuracy_reward_stage2": 0.47910550236701965, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2859 }, { "completion_length": 7.6875, "epoch": 0.5011389521640092, "grad_norm": 20.286041713919595, "kl": 0.095703125, "learning_rate": 4.990362712458384e-07, "loss": 0.0383, "reward": 1.5827951431274414, "reward_std": 0.26222479343414307, "rewards/accuracy_reward_stage2": 0.7077950835227966, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2860 }, { "completion_length": 10.71875, "epoch": 0.5013141755738567, "grad_norm": 18.808117184993183, "kl": 0.1259765625, "learning_rate": 4.988610478359909e-07, "loss": 0.0503, "reward": 1.3651602268218994, "reward_std": 0.29143521189689636, "rewards/accuracy_reward_stage2": 0.6151602268218994, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2861 }, { "completion_length": 9.484375, "epoch": 0.5014893989837043, "grad_norm": 19.878420281823956, "kl": 0.061279296875, "learning_rate": 4.986858244261434e-07, "loss": -0.0196, "reward": 1.3310251235961914, "reward_std": 0.34876665472984314, "rewards/accuracy_reward_stage2": 0.3466501832008362, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2862 }, { "completion_length": 11.40625, "epoch": 0.5016646223935518, "grad_norm": 14.843342929453732, "kl": 0.03173828125, "learning_rate": 4.985106010162957e-07, "loss": 0.0127, "reward": 1.3046542406082153, "reward_std": 0.15470543503761292, "rewards/accuracy_reward_stage2": 0.30465424060821533, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2863 }, { "completion_length": 8.59375, "epoch": 0.5018398458033994, "grad_norm": 16.630839287099747, "kl": 0.16796875, "learning_rate": 4.983353776064482e-07, "loss": -0.0189, "reward": 1.6525473594665527, "reward_std": 0.2849022150039673, "rewards/accuracy_reward_stage2": 0.6837973594665527, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2864 }, { "completion_length": 6.984375, "epoch": 0.5020150692132469, "grad_norm": 13.160546414311293, "kl": 0.2490234375, "learning_rate": 4.981601541966006e-07, "loss": 0.0708, "reward": 1.4527487754821777, "reward_std": 0.09580159932374954, "rewards/accuracy_reward_stage2": 0.5933738350868225, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2865 }, { "completion_length": 10.5, "epoch": 0.5021902926230944, "grad_norm": 27.994825259025895, "kl": 0.251953125, "learning_rate": 4.979849307867531e-07, "loss": 0.057, "reward": 1.606134295463562, "reward_std": 0.30084317922592163, "rewards/accuracy_reward_stage2": 0.746759295463562, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2866 }, { "completion_length": 10.03125, "epoch": 0.502365516032942, "grad_norm": 16.449446402433466, "kl": 0.435546875, "learning_rate": 4.978097073769055e-07, "loss": 0.0975, "reward": 1.328658938407898, "reward_std": 0.23123815655708313, "rewards/accuracy_reward_stage2": 0.609908938407898, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2867 }, { "completion_length": 9.21875, "epoch": 0.5025407394427895, "grad_norm": 16.88690992041481, "kl": 0.06494140625, "learning_rate": 4.97634483967058e-07, "loss": -0.0183, "reward": 1.48995041847229, "reward_std": 0.12207160145044327, "rewards/accuracy_reward_stage2": 0.5055753588676453, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2868 }, { "completion_length": 14.171875, "epoch": 0.5027159628526371, "grad_norm": 21.897199972009407, "kl": 0.12353515625, "learning_rate": 4.974592605572105e-07, "loss": 0.0114, "reward": 1.1664299964904785, "reward_std": 0.267032653093338, "rewards/accuracy_reward_stage2": 0.43205493688583374, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2869 }, { "completion_length": 10.953125, "epoch": 0.5028911862624846, "grad_norm": 29.64452204880094, "kl": 0.1640625, "learning_rate": 4.972840371473629e-07, "loss": 0.0411, "reward": 1.656315803527832, "reward_std": 0.14924439787864685, "rewards/accuracy_reward_stage2": 0.7969407439231873, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2870 }, { "completion_length": 12.234375, "epoch": 0.5030664096723322, "grad_norm": 15.783355354362861, "kl": 0.060791015625, "learning_rate": 4.971088137375153e-07, "loss": 0.0243, "reward": 1.633192539215088, "reward_std": 0.07455827295780182, "rewards/accuracy_reward_stage2": 0.6331925988197327, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2871 }, { "completion_length": 17.078125, "epoch": 0.5032416330821797, "grad_norm": 20.14334209062457, "kl": 0.140625, "learning_rate": 4.969335903276678e-07, "loss": 0.0122, "reward": 1.4405739307403564, "reward_std": 0.2437485158443451, "rewards/accuracy_reward_stage2": 0.5811989903450012, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2872 }, { "completion_length": 13.125, "epoch": 0.5034168564920274, "grad_norm": 19.491096865220893, "kl": 0.0289306640625, "learning_rate": 4.967583669178202e-07, "loss": 0.0115, "reward": 1.4652515649795532, "reward_std": 0.14238084852695465, "rewards/accuracy_reward_stage2": 0.465251624584198, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2873 }, { "completion_length": 11.40625, "epoch": 0.5035920799018749, "grad_norm": 21.63362229573334, "kl": 0.1259765625, "learning_rate": 4.965831435079726e-07, "loss": 0.0111, "reward": 1.641181468963623, "reward_std": 0.260013610124588, "rewards/accuracy_reward_stage2": 0.656806468963623, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2874 }, { "completion_length": 9.25, "epoch": 0.5037673033117225, "grad_norm": 161.09061090228093, "kl": 0.83203125, "learning_rate": 4.964079200981251e-07, "loss": 0.2849, "reward": 1.5068423748016357, "reward_std": 0.2722551226615906, "rewards/accuracy_reward_stage2": 0.538092315196991, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2875 }, { "completion_length": 11.75, "epoch": 0.50394252672157, "grad_norm": 20.61916644423281, "kl": 0.1064453125, "learning_rate": 4.962326966882775e-07, "loss": 0.0426, "reward": 1.5803353786468506, "reward_std": 0.17665207386016846, "rewards/accuracy_reward_stage2": 0.5803354382514954, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2876 }, { "completion_length": 13.4375, "epoch": 0.5041177501314176, "grad_norm": 20.31103660975051, "kl": 0.07421875, "learning_rate": 4.9605747327843e-07, "loss": 0.0298, "reward": 1.6658729314804077, "reward_std": 0.22268161177635193, "rewards/accuracy_reward_stage2": 0.6658729314804077, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2877 }, { "completion_length": 9.28125, "epoch": 0.5042929735412651, "grad_norm": 22.3694362107489, "kl": 0.26953125, "learning_rate": 4.958822498685824e-07, "loss": 0.0265, "reward": 1.6044328212738037, "reward_std": 0.31371209025382996, "rewards/accuracy_reward_stage2": 0.6513078212738037, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2878 }, { "completion_length": 11.234375, "epoch": 0.5044681969511127, "grad_norm": 20.70169028531261, "kl": 0.0732421875, "learning_rate": 4.957070264587349e-07, "loss": 0.0294, "reward": 1.6221519708633423, "reward_std": 0.20389091968536377, "rewards/accuracy_reward_stage2": 0.6221520900726318, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2879 }, { "completion_length": 10.515625, "epoch": 0.5046434203609602, "grad_norm": 23.05165408478004, "kl": 0.11962890625, "learning_rate": 4.955318030488873e-07, "loss": -0.0199, "reward": 1.5865809917449951, "reward_std": 0.2581280469894409, "rewards/accuracy_reward_stage2": 0.6178310513496399, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2880 }, { "completion_length": 10.09375, "epoch": 0.5048186437708078, "grad_norm": 19.21254274225465, "kl": 0.054931640625, "learning_rate": 4.953565796390398e-07, "loss": 0.022, "reward": 1.615790605545044, "reward_std": 0.19559542834758759, "rewards/accuracy_reward_stage2": 0.6157907247543335, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2881 }, { "completion_length": 12.4375, "epoch": 0.5049938671806553, "grad_norm": 18.617819254545374, "kl": 0.263671875, "learning_rate": 4.951813562291923e-07, "loss": 0.0959, "reward": 1.452277421951294, "reward_std": 0.1409415304660797, "rewards/accuracy_reward_stage2": 0.702277421951294, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2882 }, { "completion_length": 6.21875, "epoch": 0.5051690905905029, "grad_norm": 16.133214593703105, "kl": 0.15234375, "learning_rate": 4.950061328193446e-07, "loss": -0.0037, "reward": 1.5024305582046509, "reward_std": 0.315075159072876, "rewards/accuracy_reward_stage2": 0.5336805582046509, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2883 }, { "completion_length": 10.359375, "epoch": 0.5053443140003504, "grad_norm": 17.226185966153473, "kl": 0.166015625, "learning_rate": 4.94830909409497e-07, "loss": -0.0167, "reward": 1.3070234060287476, "reward_std": 0.3721379041671753, "rewards/accuracy_reward_stage2": 0.47889846563339233, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2884 }, { "completion_length": 11.421875, "epoch": 0.505519537410198, "grad_norm": 14.702899239685644, "kl": 0.169921875, "learning_rate": 4.946556859996495e-07, "loss": 0.0237, "reward": 1.5311923027038574, "reward_std": 0.2328895628452301, "rewards/accuracy_reward_stage2": 0.671817421913147, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2885 }, { "completion_length": 11.46875, "epoch": 0.5056947608200456, "grad_norm": 23.022843906262594, "kl": 0.1181640625, "learning_rate": 4.94480462589802e-07, "loss": 0.0205, "reward": 1.471540927886963, "reward_std": 0.23446643352508545, "rewards/accuracy_reward_stage2": 0.4871658682823181, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2886 }, { "completion_length": 6.40625, "epoch": 0.5058699842298932, "grad_norm": 14.457415335603917, "kl": 0.0634765625, "learning_rate": 4.943052391799544e-07, "loss": 0.0255, "reward": 1.6342511177062988, "reward_std": 0.19844907522201538, "rewards/accuracy_reward_stage2": 0.7592511773109436, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2887 }, { "completion_length": 8.28125, "epoch": 0.5060452076397407, "grad_norm": 15.509418136970604, "kl": 0.06396484375, "learning_rate": 4.941300157701069e-07, "loss": -0.0186, "reward": 1.65625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward_stage2": 0.671875, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2888 }, { "completion_length": 12.484375, "epoch": 0.5062204310495882, "grad_norm": 15.822741273420396, "kl": 0.095703125, "learning_rate": 4.939547923602594e-07, "loss": -0.0059, "reward": 1.6550612449645996, "reward_std": 0.20866025984287262, "rewards/accuracy_reward_stage2": 0.6706862449645996, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2889 }, { "completion_length": 10.59375, "epoch": 0.5063956544594358, "grad_norm": 24.030731546327335, "kl": 0.0810546875, "learning_rate": 4.937795689504118e-07, "loss": -0.056, "reward": 1.701578140258789, "reward_std": 0.13695769011974335, "rewards/accuracy_reward_stage2": 0.7328281402587891, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2890 }, { "completion_length": 11.65625, "epoch": 0.5065708778692833, "grad_norm": 11.550308632255781, "kl": 0.1123046875, "learning_rate": 4.936043455405642e-07, "loss": 0.0448, "reward": 1.6819041967391968, "reward_std": 0.08682706952095032, "rewards/accuracy_reward_stage2": 0.8069040775299072, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2891 }, { "completion_length": 8.84375, "epoch": 0.5067461012791309, "grad_norm": 18.67077224667281, "kl": 0.146484375, "learning_rate": 4.934291221307166e-07, "loss": 0.0152, "reward": 1.401998519897461, "reward_std": 0.17456203699111938, "rewards/accuracy_reward_stage2": 0.6676234006881714, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2892 }, { "completion_length": 9.71875, "epoch": 0.5069213246889784, "grad_norm": 19.36151135012173, "kl": 0.142578125, "learning_rate": 4.93253898720869e-07, "loss": 0.057, "reward": 1.6398862600326538, "reward_std": 0.09990248829126358, "rewards/accuracy_reward_stage2": 0.8898862600326538, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2893 }, { "completion_length": 11.203125, "epoch": 0.507096548098826, "grad_norm": 17.91391947313902, "kl": 0.1328125, "learning_rate": 4.930786753110215e-07, "loss": 0.0201, "reward": 1.5040777921676636, "reward_std": 0.2481655478477478, "rewards/accuracy_reward_stage2": 0.6447028517723083, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2894 }, { "completion_length": 6.171875, "epoch": 0.5072717715086735, "grad_norm": 19.930073948592227, "kl": 0.057861328125, "learning_rate": 4.92903451901174e-07, "loss": 0.0231, "reward": 1.7874478101730347, "reward_std": 0.16011011600494385, "rewards/accuracy_reward_stage2": 0.7874478697776794, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2895 }, { "completion_length": 9.46875, "epoch": 0.5074469949185211, "grad_norm": 19.36729865534549, "kl": 0.1552734375, "learning_rate": 4.927282284913264e-07, "loss": 0.0332, "reward": 1.514788031578064, "reward_std": 0.2984526455402374, "rewards/accuracy_reward_stage2": 0.655413031578064, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2896 }, { "completion_length": 9.203125, "epoch": 0.5076222183283686, "grad_norm": 16.005919521985984, "kl": 0.1318359375, "learning_rate": 4.925530050814788e-07, "loss": -0.0286, "reward": 1.7343182563781738, "reward_std": 0.271266907453537, "rewards/accuracy_reward_stage2": 0.7655682563781738, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2897 }, { "completion_length": 17.8125, "epoch": 0.5077974417382162, "grad_norm": 25.43410976233458, "kl": 0.07373046875, "learning_rate": 4.923777816716313e-07, "loss": -0.0147, "reward": 1.6448153257369995, "reward_std": 0.22978892922401428, "rewards/accuracy_reward_stage2": 0.6604403257369995, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2898 }, { "completion_length": 8.953125, "epoch": 0.5079726651480638, "grad_norm": 21.780308873263436, "kl": 0.1904296875, "learning_rate": 4.922025582617838e-07, "loss": 0.0228, "reward": 1.4488801956176758, "reward_std": 0.2935516834259033, "rewards/accuracy_reward_stage2": 0.6051301956176758, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2899 }, { "completion_length": 12.234375, "epoch": 0.5081478885579114, "grad_norm": 25.732968985688775, "kl": 0.0625, "learning_rate": 4.920273348519362e-07, "loss": -0.0162, "reward": 1.666982650756836, "reward_std": 0.25394535064697266, "rewards/accuracy_reward_stage2": 0.6826076507568359, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2900 }, { "completion_length": 12.421875, "epoch": 0.5083231119677589, "grad_norm": 15.150632376423873, "kl": 0.1376953125, "learning_rate": 4.918521114420887e-07, "loss": 0.0114, "reward": 1.6130900382995605, "reward_std": 0.16419199109077454, "rewards/accuracy_reward_stage2": 0.6287149786949158, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2901 }, { "completion_length": 11.921875, "epoch": 0.5084983353776065, "grad_norm": 16.95905149443227, "kl": 0.12451171875, "learning_rate": 4.916768880322412e-07, "loss": -0.0114, "reward": 1.4094171524047852, "reward_std": 0.2724335193634033, "rewards/accuracy_reward_stage2": 0.4406670928001404, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2902 }, { "completion_length": 10.25, "epoch": 0.508673558787454, "grad_norm": 18.371489893566416, "kl": 0.21875, "learning_rate": 4.915016646223935e-07, "loss": 0.0198, "reward": 1.40625, "reward_std": 0.24511480331420898, "rewards/accuracy_reward_stage2": 0.5625, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2903 }, { "completion_length": 9.5, "epoch": 0.5088487821973016, "grad_norm": 14.49199015434854, "kl": 0.056884765625, "learning_rate": 4.913264412125459e-07, "loss": 0.0228, "reward": 1.630352258682251, "reward_std": 0.1146092489361763, "rewards/accuracy_reward_stage2": 0.630352258682251, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2904 }, { "completion_length": 4.453125, "epoch": 0.5090240056071491, "grad_norm": 14.870448324908958, "kl": 0.21484375, "learning_rate": 4.911512178026984e-07, "loss": -0.0382, "reward": 1.5221229791641235, "reward_std": 0.2549004852771759, "rewards/accuracy_reward_stage2": 0.5689980387687683, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2905 }, { "completion_length": 8.78125, "epoch": 0.5091992290169967, "grad_norm": 13.835765742239484, "kl": 0.1279296875, "learning_rate": 4.909759943928509e-07, "loss": 0.0069, "reward": 1.5392454862594604, "reward_std": 0.1716037094593048, "rewards/accuracy_reward_stage2": 0.6798704862594604, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2906 }, { "completion_length": 7.875, "epoch": 0.5093744524268442, "grad_norm": 16.148864286076268, "kl": 0.1787109375, "learning_rate": 4.908007709830033e-07, "loss": -0.0166, "reward": 1.460571527481079, "reward_std": 0.22273962199687958, "rewards/accuracy_reward_stage2": 0.49182161688804626, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2907 }, { "completion_length": 23.84375, "epoch": 0.5095496758366918, "grad_norm": 20.654437519822974, "kl": 0.07470703125, "learning_rate": 4.906255475731558e-07, "loss": 0.03, "reward": 1.386010766029358, "reward_std": 0.17949114739894867, "rewards/accuracy_reward_stage2": 0.3860107660293579, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2908 }, { "completion_length": 10.40625, "epoch": 0.5097248992465393, "grad_norm": 18.81732343688654, "kl": 0.2490234375, "learning_rate": 4.904503241633082e-07, "loss": -0.0386, "reward": 1.593791127204895, "reward_std": 0.3051733076572418, "rewards/accuracy_reward_stage2": 0.6562911868095398, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2909 }, { "completion_length": 7.65625, "epoch": 0.5099001226563868, "grad_norm": 13.786113372966806, "kl": 0.1787109375, "learning_rate": 4.902751007534607e-07, "loss": -0.0119, "reward": 1.5133922100067139, "reward_std": 0.23366865515708923, "rewards/accuracy_reward_stage2": 0.6696423292160034, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2910 }, { "completion_length": 10.46875, "epoch": 0.5100753460662345, "grad_norm": 16.89240087623471, "kl": 0.1005859375, "learning_rate": 4.900998773436131e-07, "loss": -0.004, "reward": 1.601118803024292, "reward_std": 0.18279781937599182, "rewards/accuracy_reward_stage2": 0.616743803024292, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2911 }, { "completion_length": 11.875, "epoch": 0.510250569476082, "grad_norm": 16.514867762528304, "kl": 0.20703125, "learning_rate": 4.899246539337655e-07, "loss": 0.0151, "reward": 1.6914869546890259, "reward_std": 0.22304078936576843, "rewards/accuracy_reward_stage2": 0.7227370142936707, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2912 }, { "completion_length": 11.765625, "epoch": 0.5104257928859296, "grad_norm": 16.041953386428794, "kl": 0.11767578125, "learning_rate": 4.897494305239179e-07, "loss": 0.0029, "reward": 1.6377475261688232, "reward_std": 0.11044105887413025, "rewards/accuracy_reward_stage2": 0.6533724665641785, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2913 }, { "completion_length": 12.59375, "epoch": 0.5106010162957771, "grad_norm": 85.41909589603668, "kl": 0.474609375, "learning_rate": 4.895742071140704e-07, "loss": 0.1459, "reward": 1.59375, "reward_std": 0.2756393849849701, "rewards/accuracy_reward_stage2": 0.734375, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2914 }, { "completion_length": 5.875, "epoch": 0.5107762397056247, "grad_norm": 18.587063592887425, "kl": 0.2109375, "learning_rate": 4.893989837042229e-07, "loss": 0.0038, "reward": 1.5096237659454346, "reward_std": 0.17004084587097168, "rewards/accuracy_reward_stage2": 0.6658737063407898, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2915 }, { "completion_length": 7.0, "epoch": 0.5109514631154722, "grad_norm": 8.717009220923853, "kl": 0.1357421875, "learning_rate": 4.892237602943753e-07, "loss": -0.034, "reward": 1.571064829826355, "reward_std": 0.15255174040794373, "rewards/accuracy_reward_stage2": 0.602314829826355, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2916 }, { "completion_length": 11.34375, "epoch": 0.5111266865253198, "grad_norm": 16.412978357934826, "kl": 0.039794921875, "learning_rate": 4.890485368845277e-07, "loss": 0.0159, "reward": 1.459886074066162, "reward_std": 0.23499058187007904, "rewards/accuracy_reward_stage2": 0.5848859548568726, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2917 }, { "completion_length": 9.125, "epoch": 0.5113019099351673, "grad_norm": 16.269261166994866, "kl": 0.109375, "learning_rate": 4.888733134746802e-07, "loss": 0.0438, "reward": 1.6181046962738037, "reward_std": 0.12335985898971558, "rewards/accuracy_reward_stage2": 0.7431047558784485, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2918 }, { "completion_length": 12.296875, "epoch": 0.5114771333450149, "grad_norm": 22.491302820648134, "kl": 0.0849609375, "learning_rate": 4.886980900648327e-07, "loss": 0.0339, "reward": 1.5203516483306885, "reward_std": 0.29869550466537476, "rewards/accuracy_reward_stage2": 0.5203516483306885, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2919 }, { "completion_length": 8.234375, "epoch": 0.5116523567548624, "grad_norm": 17.17007086433005, "kl": 0.07958984375, "learning_rate": 4.885228666549851e-07, "loss": -0.0412, "reward": 1.6270606517791748, "reward_std": 0.23870226740837097, "rewards/accuracy_reward_stage2": 0.6583105325698853, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2920 }, { "completion_length": 11.828125, "epoch": 0.51182758016471, "grad_norm": 21.541425158813357, "kl": 0.061279296875, "learning_rate": 4.883476432451376e-07, "loss": 0.0246, "reward": 1.8930007219314575, "reward_std": 0.1855742335319519, "rewards/accuracy_reward_stage2": 0.8930006623268127, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2921 }, { "completion_length": 10.84375, "epoch": 0.5120028035745575, "grad_norm": 15.84245159309022, "kl": 0.0732421875, "learning_rate": 4.881724198352899e-07, "loss": -0.0142, "reward": 1.7038071155548096, "reward_std": 0.19769446551799774, "rewards/accuracy_reward_stage2": 0.7194320559501648, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2922 }, { "completion_length": 9.265625, "epoch": 0.5121780269844051, "grad_norm": 19.064149989693348, "kl": 0.07666015625, "learning_rate": 4.879971964254424e-07, "loss": 0.0306, "reward": 1.669129729270935, "reward_std": 0.21358340978622437, "rewards/accuracy_reward_stage2": 0.6691297292709351, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2923 }, { "completion_length": 11.671875, "epoch": 0.5123532503942527, "grad_norm": 20.056699028346667, "kl": 0.181640625, "learning_rate": 4.878219730155948e-07, "loss": 0.0364, "reward": 1.5925509929656982, "reward_std": 0.3178279399871826, "rewards/accuracy_reward_stage2": 0.6081759333610535, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2924 }, { "completion_length": 12.03125, "epoch": 0.5125284738041003, "grad_norm": 19.022822203075908, "kl": 0.212890625, "learning_rate": 4.876467496057473e-07, "loss": -0.0034, "reward": 1.4258291721343994, "reward_std": 0.2871686816215515, "rewards/accuracy_reward_stage2": 0.45707911252975464, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2925 }, { "completion_length": 6.828125, "epoch": 0.5127036972139478, "grad_norm": 17.1522057355123, "kl": 0.1513671875, "learning_rate": 4.874715261958998e-07, "loss": 0.0162, "reward": 1.7785899639129639, "reward_std": 0.2176314890384674, "rewards/accuracy_reward_stage2": 0.7942148447036743, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2926 }, { "completion_length": 9.40625, "epoch": 0.5128789206237954, "grad_norm": 20.05783296001098, "kl": 0.0712890625, "learning_rate": 4.872963027860522e-07, "loss": 0.0285, "reward": 1.6176671981811523, "reward_std": 0.1611122190952301, "rewards/accuracy_reward_stage2": 0.6176671981811523, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2927 }, { "completion_length": 10.15625, "epoch": 0.5130541440336429, "grad_norm": 20.272883626727722, "kl": 0.1015625, "learning_rate": 4.871210793762046e-07, "loss": 0.024, "reward": 1.423703908920288, "reward_std": 0.24741846323013306, "rewards/accuracy_reward_stage2": 0.4393288493156433, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2928 }, { "completion_length": 8.640625, "epoch": 0.5132293674434905, "grad_norm": 15.959972536091819, "kl": 0.09912109375, "learning_rate": 4.869458559663571e-07, "loss": 0.0395, "reward": 1.4770452976226807, "reward_std": 0.11607255786657333, "rewards/accuracy_reward_stage2": 0.6020451784133911, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2929 }, { "completion_length": 10.890625, "epoch": 0.513404590853338, "grad_norm": 17.518959100791523, "kl": 0.09375, "learning_rate": 4.867706325565096e-07, "loss": 0.0247, "reward": 1.5568358898162842, "reward_std": 0.19567272067070007, "rewards/accuracy_reward_stage2": 0.572460949420929, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2930 }, { "completion_length": 12.71875, "epoch": 0.5135798142631856, "grad_norm": 20.847540218934004, "kl": 0.12158203125, "learning_rate": 4.86595409146662e-07, "loss": 0.0486, "reward": 1.689253330230713, "reward_std": 0.16902679204940796, "rewards/accuracy_reward_stage2": 0.6892533898353577, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2931 }, { "completion_length": 9.25, "epoch": 0.5137550376730331, "grad_norm": 15.486698154520228, "kl": 0.07861328125, "learning_rate": 4.864201857368144e-07, "loss": 0.0314, "reward": 1.7223076820373535, "reward_std": 0.08451945334672928, "rewards/accuracy_reward_stage2": 0.722307562828064, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2932 }, { "completion_length": 9.96875, "epoch": 0.5139302610828806, "grad_norm": 19.769809054977316, "kl": 0.37890625, "learning_rate": 4.862449623269668e-07, "loss": 0.1053, "reward": 1.2136902809143066, "reward_std": 0.11439789831638336, "rewards/accuracy_reward_stage2": 0.5886902809143066, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 2933 }, { "completion_length": 9.5625, "epoch": 0.5141054844927282, "grad_norm": 18.595144713621774, "kl": 0.1806640625, "learning_rate": 4.860697389171193e-07, "loss": -0.0367, "reward": 1.689985990524292, "reward_std": 0.26583412289619446, "rewards/accuracy_reward_stage2": 0.861860990524292, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2934 }, { "completion_length": 6.828125, "epoch": 0.5142807079025757, "grad_norm": 12.149659373020459, "kl": 0.0751953125, "learning_rate": 4.858945155072717e-07, "loss": -0.0141, "reward": 1.6463022232055664, "reward_std": 0.07076901197433472, "rewards/accuracy_reward_stage2": 0.6619271636009216, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2935 }, { "completion_length": 12.203125, "epoch": 0.5144559313124233, "grad_norm": 15.91180690198129, "kl": 0.263671875, "learning_rate": 4.857192920974242e-07, "loss": -0.0187, "reward": 1.3695735931396484, "reward_std": 0.2769041061401367, "rewards/accuracy_reward_stage2": 0.4164485037326813, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2936 }, { "completion_length": 7.78125, "epoch": 0.514631154722271, "grad_norm": 14.274518704484965, "kl": 0.07861328125, "learning_rate": 4.855440686875766e-07, "loss": 0.0034, "reward": 1.3541667461395264, "reward_std": 0.18801738321781158, "rewards/accuracy_reward_stage2": 0.4947916865348816, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2937 }, { "completion_length": 20.28125, "epoch": 0.5148063781321185, "grad_norm": 18.40581395743968, "kl": 0.09521484375, "learning_rate": 4.853688452777291e-07, "loss": 0.0046, "reward": 1.336214542388916, "reward_std": 0.22887608408927917, "rewards/accuracy_reward_stage2": 0.47683948278427124, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2938 }, { "completion_length": 7.96875, "epoch": 0.514981601541966, "grad_norm": 23.80466503473419, "kl": 0.2333984375, "learning_rate": 4.851936218678816e-07, "loss": 0.0057, "reward": 1.5006499290466309, "reward_std": 0.2008485645055771, "rewards/accuracy_reward_stage2": 0.5318998098373413, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2939 }, { "completion_length": 8.359375, "epoch": 0.5151568249518136, "grad_norm": 17.946604121179828, "kl": 0.1064453125, "learning_rate": 4.85018398458034e-07, "loss": 0.0425, "reward": 1.7962557077407837, "reward_std": 0.19889307022094727, "rewards/accuracy_reward_stage2": 0.7962556481361389, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2940 }, { "completion_length": 13.25, "epoch": 0.5153320483616611, "grad_norm": 21.0453056279407, "kl": 0.2216796875, "learning_rate": 4.848431750481863e-07, "loss": -0.0437, "reward": 1.3767890930175781, "reward_std": 0.2692345082759857, "rewards/accuracy_reward_stage2": 0.4236640930175781, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2941 }, { "completion_length": 11.1875, "epoch": 0.5155072717715087, "grad_norm": 14.213308225508744, "kl": 0.1337890625, "learning_rate": 4.846679516383388e-07, "loss": 0.0537, "reward": 1.5524367094039917, "reward_std": 0.16399207711219788, "rewards/accuracy_reward_stage2": 0.6774367094039917, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2942 }, { "completion_length": 7.578125, "epoch": 0.5156824951813562, "grad_norm": 14.856360108084575, "kl": 0.16796875, "learning_rate": 4.844927282284913e-07, "loss": 0.0173, "reward": 1.4467616081237793, "reward_std": 0.15504275262355804, "rewards/accuracy_reward_stage2": 0.4780115783214569, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2943 }, { "completion_length": 10.953125, "epoch": 0.5158577185912038, "grad_norm": 11.990319904854841, "kl": 0.162109375, "learning_rate": 4.843175048186437e-07, "loss": -0.0109, "reward": 1.2604167461395264, "reward_std": 0.1473139077425003, "rewards/accuracy_reward_stage2": 0.4166666865348816, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2944 }, { "completion_length": 14.734375, "epoch": 0.5160329420010513, "grad_norm": 19.809881898967866, "kl": 0.08154296875, "learning_rate": 4.841422814087962e-07, "loss": -0.0065, "reward": 1.6608145236968994, "reward_std": 0.30760252475738525, "rewards/accuracy_reward_stage2": 0.8014395236968994, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2945 }, { "completion_length": 9.484375, "epoch": 0.5162081654108989, "grad_norm": 13.553033797905996, "kl": 0.09521484375, "learning_rate": 4.839670579989487e-07, "loss": -0.006, "reward": 1.5949015617370605, "reward_std": 0.19428260624408722, "rewards/accuracy_reward_stage2": 0.6105265617370605, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2946 }, { "completion_length": 9.34375, "epoch": 0.5163833888207464, "grad_norm": 25.443446242256226, "kl": 0.10888671875, "learning_rate": 4.837918345891011e-07, "loss": -0.0042, "reward": 1.438382625579834, "reward_std": 0.4678412675857544, "rewards/accuracy_reward_stage2": 0.4696325361728668, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2947 }, { "completion_length": 15.1875, "epoch": 0.516558612230594, "grad_norm": 21.525238194596895, "kl": 0.11572265625, "learning_rate": 4.836166111792535e-07, "loss": 0.002, "reward": 1.4573842287063599, "reward_std": 0.3356036841869354, "rewards/accuracy_reward_stage2": 0.4730091989040375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2948 }, { "completion_length": 14.5, "epoch": 0.5167338356404415, "grad_norm": 19.532247302186448, "kl": 0.1796875, "learning_rate": 4.83441387769406e-07, "loss": 0.0462, "reward": 1.7068278789520264, "reward_std": 0.15236984193325043, "rewards/accuracy_reward_stage2": 0.7224528193473816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2949 }, { "completion_length": 10.234375, "epoch": 0.5169090590502892, "grad_norm": 17.956263791790803, "kl": 0.275390625, "learning_rate": 4.832661643595585e-07, "loss": -0.018, "reward": 1.6681300401687622, "reward_std": 0.2555171251296997, "rewards/accuracy_reward_stage2": 0.7150050401687622, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2950 }, { "completion_length": 13.546875, "epoch": 0.5170842824601367, "grad_norm": 20.608177429703655, "kl": 0.203125, "learning_rate": 4.830909409497109e-07, "loss": 0.0028, "reward": 1.7684814929962158, "reward_std": 0.3044443726539612, "rewards/accuracy_reward_stage2": 0.7997313737869263, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2951 }, { "completion_length": 11.703125, "epoch": 0.5172595058699843, "grad_norm": 16.37850263732007, "kl": 0.09033203125, "learning_rate": 4.829157175398633e-07, "loss": -0.0023, "reward": 1.7461036443710327, "reward_std": 0.20768775045871735, "rewards/accuracy_reward_stage2": 0.7617285847663879, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2952 }, { "completion_length": 7.828125, "epoch": 0.5174347292798318, "grad_norm": 15.786937308181098, "kl": 0.09375, "learning_rate": 4.827404941300157e-07, "loss": -0.0057, "reward": 1.6817526817321777, "reward_std": 0.24270977079868317, "rewards/accuracy_reward_stage2": 0.697377622127533, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2953 }, { "completion_length": 9.046875, "epoch": 0.5176099526896794, "grad_norm": 12.787982745803061, "kl": 0.05810546875, "learning_rate": 4.825652707201682e-07, "loss": -0.0204, "reward": 1.5325255393981934, "reward_std": 0.1728992760181427, "rewards/accuracy_reward_stage2": 0.5481504797935486, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2954 }, { "completion_length": 12.25, "epoch": 0.5177851760995269, "grad_norm": 18.344373606655147, "kl": 0.058349609375, "learning_rate": 4.823900473103206e-07, "loss": -0.0209, "reward": 1.4726002216339111, "reward_std": 0.22146786749362946, "rewards/accuracy_reward_stage2": 0.6132252216339111, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2955 }, { "completion_length": 25.1875, "epoch": 0.5179603995093744, "grad_norm": 26.56599596321791, "kl": 0.185546875, "learning_rate": 4.822148239004731e-07, "loss": 0.0741, "reward": 1.4056503772735596, "reward_std": 0.18882179260253906, "rewards/accuracy_reward_stage2": 0.6556503176689148, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2956 }, { "completion_length": 20.296875, "epoch": 0.518135622919222, "grad_norm": 16.152254574834682, "kl": 0.0302734375, "learning_rate": 4.820396004906255e-07, "loss": 0.0121, "reward": 1.3913934230804443, "reward_std": 0.17937754094600677, "rewards/accuracy_reward_stage2": 0.5163935422897339, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2957 }, { "completion_length": 11.109375, "epoch": 0.5183108463290695, "grad_norm": 23.663345562826837, "kl": 0.2734375, "learning_rate": 4.81864377080778e-07, "loss": 0.0676, "reward": 1.383396029472351, "reward_std": 0.33867397904396057, "rewards/accuracy_reward_stage2": 0.5240209698677063, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2958 }, { "completion_length": 7.671875, "epoch": 0.5184860697389171, "grad_norm": 15.080706050120135, "kl": 0.0927734375, "learning_rate": 4.816891536709305e-07, "loss": -0.0061, "reward": 1.7183098793029785, "reward_std": 0.1505095362663269, "rewards/accuracy_reward_stage2": 0.7339348793029785, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2959 }, { "completion_length": 9.921875, "epoch": 0.5186612931487646, "grad_norm": 20.0616679391178, "kl": 0.052978515625, "learning_rate": 4.815139302610829e-07, "loss": 0.0212, "reward": 1.5980708599090576, "reward_std": 0.19968780875205994, "rewards/accuracy_reward_stage2": 0.5980708003044128, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2960 }, { "completion_length": 12.09375, "epoch": 0.5188365165586122, "grad_norm": 19.9463560619159, "kl": 0.11376953125, "learning_rate": 4.813387068512352e-07, "loss": 0.0166, "reward": 1.589599609375, "reward_std": 0.2128470540046692, "rewards/accuracy_reward_stage2": 0.605224609375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2961 }, { "completion_length": 12.53125, "epoch": 0.5190117399684598, "grad_norm": 18.95845814797562, "kl": 0.1376953125, "learning_rate": 4.811634834413877e-07, "loss": 0.0203, "reward": 1.2567017078399658, "reward_std": 0.2735491394996643, "rewards/accuracy_reward_stage2": 0.522326648235321, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2962 }, { "completion_length": 8.359375, "epoch": 0.5191869633783074, "grad_norm": 18.40982447445592, "kl": 0.07861328125, "learning_rate": 4.809882600315402e-07, "loss": 0.0315, "reward": 1.7283145189285278, "reward_std": 0.29172807931900024, "rewards/accuracy_reward_stage2": 0.7283145189285278, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2963 }, { "completion_length": 8.078125, "epoch": 0.5193621867881549, "grad_norm": 16.950258641470114, "kl": 0.13671875, "learning_rate": 4.808130366216926e-07, "loss": 0.0547, "reward": 1.5028691291809082, "reward_std": 0.1037883311510086, "rewards/accuracy_reward_stage2": 0.6278691291809082, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2964 }, { "completion_length": 14.140625, "epoch": 0.5195374101980025, "grad_norm": 22.45684219909601, "kl": 0.15625, "learning_rate": 4.806378132118451e-07, "loss": 0.0626, "reward": 1.3568233251571655, "reward_std": 0.2835046648979187, "rewards/accuracy_reward_stage2": 0.48182329535484314, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2965 }, { "completion_length": 12.921875, "epoch": 0.51971263360785, "grad_norm": 13.37469094535078, "kl": 0.07666015625, "learning_rate": 4.804625898019975e-07, "loss": -0.0082, "reward": 1.5271795988082886, "reward_std": 0.16873487830162048, "rewards/accuracy_reward_stage2": 0.5428045988082886, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2966 }, { "completion_length": 8.53125, "epoch": 0.5198878570176976, "grad_norm": 16.56289414630539, "kl": 0.1904296875, "learning_rate": 4.8028736639215e-07, "loss": 0.017, "reward": 1.506408452987671, "reward_std": 0.18759910762310028, "rewards/accuracy_reward_stage2": 0.6626585721969604, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2967 }, { "completion_length": 6.421875, "epoch": 0.5200630804275451, "grad_norm": 16.23170422508012, "kl": 0.328125, "learning_rate": 4.801121429823024e-07, "loss": 0.0439, "reward": 1.4529410600662231, "reward_std": 0.29004305601119995, "rewards/accuracy_reward_stage2": 0.6091910004615784, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2968 }, { "completion_length": 7.421875, "epoch": 0.5202383038373927, "grad_norm": 17.728963477828742, "kl": 0.23046875, "learning_rate": 4.799369195724549e-07, "loss": 0.0565, "reward": 1.7198197841644287, "reward_std": 0.21634772419929504, "rewards/accuracy_reward_stage2": 0.8604447841644287, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2969 }, { "completion_length": 8.28125, "epoch": 0.5204135272472402, "grad_norm": 17.077717905726193, "kl": 0.1904296875, "learning_rate": 4.797616961626073e-07, "loss": 0.0433, "reward": 1.4588346481323242, "reward_std": 0.3143489360809326, "rewards/accuracy_reward_stage2": 0.5994596481323242, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2970 }, { "completion_length": 12.28125, "epoch": 0.5205887506570878, "grad_norm": 20.111701036269782, "kl": 0.408203125, "learning_rate": 4.795864727527598e-07, "loss": 0.0546, "reward": 1.3059935569763184, "reward_std": 0.24852296710014343, "rewards/accuracy_reward_stage2": 0.47786855697631836, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2971 }, { "completion_length": 9.75, "epoch": 0.5207639740669353, "grad_norm": 51.80114420123987, "kl": 0.369140625, "learning_rate": 4.794112493429122e-07, "loss": 0.1473, "reward": 1.6538548469543457, "reward_std": 0.268999308347702, "rewards/accuracy_reward_stage2": 0.6538547873497009, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2972 }, { "completion_length": 6.453125, "epoch": 0.5209391974767829, "grad_norm": 402.704170101784, "kl": 2.21875, "learning_rate": 4.792360259330646e-07, "loss": 0.7794, "reward": 1.4323503971099854, "reward_std": 0.26757895946502686, "rewards/accuracy_reward_stage2": 0.6042253971099854, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2973 }, { "completion_length": 28.65625, "epoch": 0.5211144208866304, "grad_norm": 23.106544974841064, "kl": 0.1630859375, "learning_rate": 4.79060802523217e-07, "loss": -0.0048, "reward": 1.370764136314392, "reward_std": 0.33060193061828613, "rewards/accuracy_reward_stage2": 0.4020141363143921, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2974 }, { "completion_length": 9.0625, "epoch": 0.5212896442964781, "grad_norm": 18.194894455569557, "kl": 0.0498046875, "learning_rate": 4.788855791133695e-07, "loss": -0.0131, "reward": 1.5386393070220947, "reward_std": 0.36468303203582764, "rewards/accuracy_reward_stage2": 0.5542643070220947, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2975 }, { "completion_length": 7.515625, "epoch": 0.5214648677063256, "grad_norm": 22.506162643097728, "kl": 0.208984375, "learning_rate": 4.78710355703522e-07, "loss": 0.0394, "reward": 1.7304048538208008, "reward_std": 0.17070415616035461, "rewards/accuracy_reward_stage2": 0.8710298538208008, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2976 }, { "completion_length": 10.28125, "epoch": 0.5216400911161732, "grad_norm": 272.80223531430477, "kl": 1.5703125, "learning_rate": 4.785351322936744e-07, "loss": 0.4964, "reward": 1.740801453590393, "reward_std": 0.23649471998214722, "rewards/accuracy_reward_stage2": 0.9126764535903931, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2977 }, { "completion_length": 10.15625, "epoch": 0.5218153145260207, "grad_norm": 15.03608408563602, "kl": 0.1552734375, "learning_rate": 4.783599088838269e-07, "loss": 0.0229, "reward": 1.4743565320968628, "reward_std": 0.1263759732246399, "rewards/accuracy_reward_stage2": 0.614981472492218, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2978 }, { "completion_length": 7.734375, "epoch": 0.5219905379358682, "grad_norm": 12.453380459386771, "kl": 0.07958984375, "learning_rate": 4.781846854739793e-07, "loss": -0.01, "reward": 1.7916667461395264, "reward_std": 0.1455363929271698, "rewards/accuracy_reward_stage2": 0.8072916269302368, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2979 }, { "completion_length": 12.0, "epoch": 0.5221657613457158, "grad_norm": 16.390112633942664, "kl": 0.1875, "learning_rate": 4.780094620641318e-07, "loss": -0.0685, "reward": 1.3277325630187988, "reward_std": 0.2840336561203003, "rewards/accuracy_reward_stage2": 0.39023256301879883, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 2980 }, { "completion_length": 10.234375, "epoch": 0.5223409847555633, "grad_norm": 17.47778157040926, "kl": 0.146484375, "learning_rate": 4.778342386542841e-07, "loss": -0.0413, "reward": 1.536125659942627, "reward_std": 0.23933261632919312, "rewards/accuracy_reward_stage2": 0.708000659942627, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2981 }, { "completion_length": 9.34375, "epoch": 0.5225162081654109, "grad_norm": 19.88076286176023, "kl": 0.08935546875, "learning_rate": 4.776590152444366e-07, "loss": 0.0034, "reward": 1.3844552040100098, "reward_std": 0.2918417453765869, "rewards/accuracy_reward_stage2": 0.525080144405365, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2982 }, { "completion_length": 9.71875, "epoch": 0.5226914315752584, "grad_norm": 28.094404649595983, "kl": 0.189453125, "learning_rate": 4.774837918345891e-07, "loss": 0.0445, "reward": 1.632422924041748, "reward_std": 0.3590032756328583, "rewards/accuracy_reward_stage2": 0.6480480432510376, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2983 }, { "completion_length": 9.84375, "epoch": 0.522866654985106, "grad_norm": 24.415029498851087, "kl": 0.31640625, "learning_rate": 4.773085684247415e-07, "loss": 0.0669, "reward": 1.613487720489502, "reward_std": 0.25573980808258057, "rewards/accuracy_reward_stage2": 0.7697376608848572, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2984 }, { "completion_length": 8.828125, "epoch": 0.5230418783949535, "grad_norm": 24.517554180624, "kl": 0.2177734375, "learning_rate": 4.77133345014894e-07, "loss": 0.0001, "reward": 1.3647611141204834, "reward_std": 0.4705469608306885, "rewards/accuracy_reward_stage2": 0.4116361737251282, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2985 }, { "completion_length": 11.203125, "epoch": 0.5232171018048011, "grad_norm": 15.677994477134238, "kl": 0.126953125, "learning_rate": 4.769581216050464e-07, "loss": 0.0064, "reward": 1.60444176197052, "reward_std": 0.31227320432662964, "rewards/accuracy_reward_stage2": 0.6200668215751648, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2986 }, { "completion_length": 9.53125, "epoch": 0.5233923252146486, "grad_norm": 18.468711010471285, "kl": 0.2294921875, "learning_rate": 4.7678289819519884e-07, "loss": -0.015, "reward": 1.2599871158599854, "reward_std": 0.18454702198505402, "rewards/accuracy_reward_stage2": 0.43186211585998535, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2987 }, { "completion_length": 7.84375, "epoch": 0.5235675486244963, "grad_norm": 17.931504296092047, "kl": 0.162109375, "learning_rate": 4.766076747853513e-07, "loss": -0.0675, "reward": 1.6289443969726562, "reward_std": 0.29932835698127747, "rewards/accuracy_reward_stage2": 0.6758193969726562, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2988 }, { "completion_length": 9.859375, "epoch": 0.5237427720343438, "grad_norm": 17.4591338186714, "kl": 0.0810546875, "learning_rate": 4.7643245137550377e-07, "loss": -0.0059, "reward": 1.6931253671646118, "reward_std": 0.21513822674751282, "rewards/accuracy_reward_stage2": 0.7087503671646118, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2989 }, { "completion_length": 11.5625, "epoch": 0.5239179954441914, "grad_norm": 19.233603189651088, "kl": 0.17578125, "learning_rate": 4.762572279656562e-07, "loss": -0.0478, "reward": 1.3866586685180664, "reward_std": 0.22251750528812408, "rewards/accuracy_reward_stage2": 0.5585336685180664, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2990 }, { "completion_length": 11.890625, "epoch": 0.5240932188540389, "grad_norm": 16.066061189933652, "kl": 0.083984375, "learning_rate": 4.7608200455580865e-07, "loss": 0.0335, "reward": 1.5238542556762695, "reward_std": 0.22788162529468536, "rewards/accuracy_reward_stage2": 0.5238542556762695, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2991 }, { "completion_length": 9.9375, "epoch": 0.5242684422638865, "grad_norm": 18.861312346598808, "kl": 0.1728515625, "learning_rate": 4.7590678114596104e-07, "loss": 0.0032, "reward": 1.3086049556732178, "reward_std": 0.33977043628692627, "rewards/accuracy_reward_stage2": 0.3398548364639282, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2992 }, { "completion_length": 9.125, "epoch": 0.524443665673734, "grad_norm": 17.42928363573299, "kl": 0.287109375, "learning_rate": 4.7573155773611353e-07, "loss": 0.0014, "reward": 1.328125, "reward_std": 0.25688543915748596, "rewards/accuracy_reward_stage2": 0.5, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2993 }, { "completion_length": 7.40625, "epoch": 0.5246188890835816, "grad_norm": 27.018808557675843, "kl": 0.197265625, "learning_rate": 4.7555633432626597e-07, "loss": 0.0057, "reward": 1.4005868434906006, "reward_std": 0.3192916214466095, "rewards/accuracy_reward_stage2": 0.5568368434906006, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2994 }, { "completion_length": 9.46875, "epoch": 0.5247941124934291, "grad_norm": 11.750319884502725, "kl": 0.1630859375, "learning_rate": 4.753811109164184e-07, "loss": 0.0002, "reward": 1.4829835891723633, "reward_std": 0.14890187978744507, "rewards/accuracy_reward_stage2": 0.5142335295677185, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2995 }, { "completion_length": 14.453125, "epoch": 0.5249693359032767, "grad_norm": 15.66119969286439, "kl": 0.1611328125, "learning_rate": 4.7520588750657085e-07, "loss": -0.066, "reward": 1.5467138290405273, "reward_std": 0.2760947048664093, "rewards/accuracy_reward_stage2": 0.5935889482498169, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2996 }, { "completion_length": 11.859375, "epoch": 0.5251445593131242, "grad_norm": 9.586425716781799, "kl": 0.07470703125, "learning_rate": 4.7503066409672334e-07, "loss": -0.0144, "reward": 1.40625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2997 }, { "completion_length": 11.140625, "epoch": 0.5253197827229718, "grad_norm": 18.620989828713803, "kl": 0.09423828125, "learning_rate": 4.7485544068687573e-07, "loss": 0.0023, "reward": 1.5689308643341064, "reward_std": 0.26974961161613464, "rewards/accuracy_reward_stage2": 0.5845559239387512, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2998 }, { "completion_length": 6.46875, "epoch": 0.5254950061328193, "grad_norm": 16.402643244855273, "kl": 0.0849609375, "learning_rate": 4.7468021727702817e-07, "loss": -0.0099, "reward": 1.3922981023788452, "reward_std": 0.1058889701962471, "rewards/accuracy_reward_stage2": 0.5329231023788452, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2999 }, { "completion_length": 12.15625, "epoch": 0.525670229542667, "grad_norm": 19.33626292725619, "kl": 0.03564453125, "learning_rate": 4.745049938671806e-07, "loss": 0.0143, "reward": 1.5242962837219238, "reward_std": 0.12696348130702972, "rewards/accuracy_reward_stage2": 0.5242962837219238, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3000 }, { "completion_length": 10.125, "epoch": 0.5258454529525145, "grad_norm": 19.847748663580717, "kl": 0.314453125, "learning_rate": 4.743297704573331e-07, "loss": 0.0834, "reward": 1.1907129287719727, "reward_std": 0.23560258746147156, "rewards/accuracy_reward_stage2": 0.4563378691673279, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3001 }, { "completion_length": 17.15625, "epoch": 0.526020676362362, "grad_norm": 20.586289372649574, "kl": 0.171875, "learning_rate": 4.7415454704748554e-07, "loss": -0.0648, "reward": 1.4292160272598267, "reward_std": 0.31085318326950073, "rewards/accuracy_reward_stage2": 0.6167160272598267, "rewards/format_reward_stage1_pointerpad": 0.8125, "scores/accuracy_reward_stage2": 0.8125, "step": 3002 }, { "completion_length": 12.4375, "epoch": 0.5261958997722096, "grad_norm": 17.106269980841894, "kl": 0.119140625, "learning_rate": 4.73979323637638e-07, "loss": -0.0152, "reward": 1.213785171508789, "reward_std": 0.33467233180999756, "rewards/accuracy_reward_stage2": 0.37003517150878906, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3003 }, { "completion_length": 11.234375, "epoch": 0.5263711231820571, "grad_norm": 17.56565559200859, "kl": 0.062255859375, "learning_rate": 4.738041002277904e-07, "loss": -0.0091, "reward": 1.3451817035675049, "reward_std": 0.22466593980789185, "rewards/accuracy_reward_stage2": 0.4858066439628601, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3004 }, { "completion_length": 7.21875, "epoch": 0.5265463465919047, "grad_norm": 12.920353465243306, "kl": 0.1044921875, "learning_rate": 4.7362887681794286e-07, "loss": 0.0178, "reward": 1.4469187259674072, "reward_std": 0.21581503748893738, "rewards/accuracy_reward_stage2": 0.46254366636276245, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3005 }, { "completion_length": 8.453125, "epoch": 0.5267215700017522, "grad_norm": 14.367077784056798, "kl": 0.2216796875, "learning_rate": 4.734536534080953e-07, "loss": -0.0588, "reward": 1.2829350233078003, "reward_std": 0.2190045267343521, "rewards/accuracy_reward_stage2": 0.4704349935054779, "rewards/format_reward_stage1_pointerpad": 0.8125, "scores/accuracy_reward_stage2": 0.8125, "step": 3006 }, { "completion_length": 10.375, "epoch": 0.5268967934115998, "grad_norm": 19.523796969621742, "kl": 0.302734375, "learning_rate": 4.7327842999824774e-07, "loss": 0.033, "reward": 1.7166911363601685, "reward_std": 0.27443280816078186, "rewards/accuracy_reward_stage2": 0.7479411363601685, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3007 }, { "completion_length": 11.671875, "epoch": 0.5270720168214473, "grad_norm": 28.468412075135017, "kl": 0.2734375, "learning_rate": 4.731032065884002e-07, "loss": 0.0847, "reward": 1.6269659996032715, "reward_std": 0.17851251363754272, "rewards/accuracy_reward_stage2": 0.767591118812561, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3008 }, { "completion_length": 12.6875, "epoch": 0.5272472402312949, "grad_norm": 15.867000520209041, "kl": 0.1923828125, "learning_rate": 4.7292798317855267e-07, "loss": 0.0393, "reward": 1.3693108558654785, "reward_std": 0.18513169884681702, "rewards/accuracy_reward_stage2": 0.6349357962608337, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3009 }, { "completion_length": 6.171875, "epoch": 0.5274224636411424, "grad_norm": 12.409857153191926, "kl": 0.041015625, "learning_rate": 4.727527597687051e-07, "loss": 0.0165, "reward": 1.7259947061538696, "reward_std": 0.11311056464910507, "rewards/accuracy_reward_stage2": 0.7259947061538696, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3010 }, { "completion_length": 10.984375, "epoch": 0.52759768705099, "grad_norm": 20.049126613210415, "kl": 0.103515625, "learning_rate": 4.725775363588575e-07, "loss": -0.0027, "reward": 1.412109136581421, "reward_std": 0.29860153794288635, "rewards/accuracy_reward_stage2": 0.4277341663837433, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3011 }, { "completion_length": 10.703125, "epoch": 0.5277729104608375, "grad_norm": 16.696484961038596, "kl": 0.2158203125, "learning_rate": 4.7240231294900993e-07, "loss": -0.0326, "reward": 1.3450486660003662, "reward_std": 0.2671396732330322, "rewards/accuracy_reward_stage2": 0.5169236660003662, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3012 }, { "completion_length": 8.875, "epoch": 0.5279481338706852, "grad_norm": 14.689130441389223, "kl": 0.2265625, "learning_rate": 4.7222708953916243e-07, "loss": 0.0248, "reward": 1.6728073358535767, "reward_std": 0.25592559576034546, "rewards/accuracy_reward_stage2": 0.7040572762489319, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3013 }, { "completion_length": 11.609375, "epoch": 0.5281233572805327, "grad_norm": 15.071699015429848, "kl": 0.1845703125, "learning_rate": 4.7205186612931487e-07, "loss": -0.0164, "reward": 1.5697612762451172, "reward_std": 0.19837264716625214, "rewards/accuracy_reward_stage2": 0.6166362762451172, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3014 }, { "completion_length": 6.59375, "epoch": 0.5282985806903803, "grad_norm": 15.538520030910789, "kl": 0.103515625, "learning_rate": 4.718766427194673e-07, "loss": -0.0027, "reward": 1.6166914701461792, "reward_std": 0.2432194948196411, "rewards/accuracy_reward_stage2": 0.6323164701461792, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3015 }, { "completion_length": 17.875, "epoch": 0.5284738041002278, "grad_norm": 17.517113919901284, "kl": 0.11376953125, "learning_rate": 4.7170141930961975e-07, "loss": -0.0641, "reward": 1.3405852317810059, "reward_std": 0.17777365446090698, "rewards/accuracy_reward_stage2": 0.5124603509902954, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3016 }, { "completion_length": 13.578125, "epoch": 0.5286490275100754, "grad_norm": 16.46059554496943, "kl": 0.265625, "learning_rate": 4.715261958997722e-07, "loss": 0.1058, "reward": 1.5327612161636353, "reward_std": 0.16156886518001556, "rewards/accuracy_reward_stage2": 0.6577612161636353, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3017 }, { "completion_length": 10.03125, "epoch": 0.5288242509199229, "grad_norm": 19.988363017500177, "kl": 0.091796875, "learning_rate": 4.713509724899246e-07, "loss": 0.0367, "reward": 1.6378380060195923, "reward_std": 0.21499964594841003, "rewards/accuracy_reward_stage2": 0.7628380060195923, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3018 }, { "completion_length": 10.78125, "epoch": 0.5289994743297705, "grad_norm": 18.54168827454856, "kl": 0.15234375, "learning_rate": 4.7117574908007706e-07, "loss": -0.0272, "reward": 1.715771198272705, "reward_std": 0.3070124387741089, "rewards/accuracy_reward_stage2": 0.7470211982727051, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3019 }, { "completion_length": 8.40625, "epoch": 0.529174697739618, "grad_norm": 16.69030502399357, "kl": 0.2236328125, "learning_rate": 4.710005256702295e-07, "loss": -0.0819, "reward": 1.4025707244873047, "reward_std": 0.2415388822555542, "rewards/accuracy_reward_stage2": 0.48069584369659424, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3020 }, { "completion_length": 6.234375, "epoch": 0.5293499211494656, "grad_norm": 6.533390983035657, "kl": 0.02685546875, "learning_rate": 4.70825302260382e-07, "loss": 0.0107, "reward": 1.5693737268447876, "reward_std": 0.03983701765537262, "rewards/accuracy_reward_stage2": 0.5693736672401428, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3021 }, { "completion_length": 10.171875, "epoch": 0.5295251445593131, "grad_norm": 17.603432652423468, "kl": 0.09814453125, "learning_rate": 4.7065007885053444e-07, "loss": -0.0011, "reward": 1.7323403358459473, "reward_std": 0.3000224530696869, "rewards/accuracy_reward_stage2": 0.7479652762413025, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3022 }, { "completion_length": 12.703125, "epoch": 0.5297003679691606, "grad_norm": 20.196880943398675, "kl": 0.1376953125, "learning_rate": 4.704748554406869e-07, "loss": 0.0178, "reward": 1.5424933433532715, "reward_std": 0.24900223314762115, "rewards/accuracy_reward_stage2": 0.5581183433532715, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3023 }, { "completion_length": 10.0625, "epoch": 0.5298755913790082, "grad_norm": 25.70022775673109, "kl": 0.158203125, "learning_rate": 4.7029963203083926e-07, "loss": -0.0026, "reward": 1.3462114334106445, "reward_std": 0.270582914352417, "rewards/accuracy_reward_stage2": 0.6274613738059998, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3024 }, { "completion_length": 6.4375, "epoch": 0.5300508147888557, "grad_norm": 31.517862165800544, "kl": 0.162109375, "learning_rate": 4.7012440862099176e-07, "loss": -0.0235, "reward": 1.5729167461395264, "reward_std": 0.31406548619270325, "rewards/accuracy_reward_stage2": 0.6041666269302368, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3025 }, { "completion_length": 12.375, "epoch": 0.5302260381987034, "grad_norm": 19.99666059416602, "kl": 0.2421875, "learning_rate": 4.699491852111442e-07, "loss": -0.0321, "reward": 1.5244319438934326, "reward_std": 0.30928748846054077, "rewards/accuracy_reward_stage2": 0.6963070034980774, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3026 }, { "completion_length": 11.28125, "epoch": 0.530401261608551, "grad_norm": 20.72342344452126, "kl": 0.1943359375, "learning_rate": 4.6977396180129663e-07, "loss": -0.0103, "reward": 1.847987413406372, "reward_std": 0.2692791819572449, "rewards/accuracy_reward_stage2": 0.8792373538017273, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3027 }, { "completion_length": 9.171875, "epoch": 0.5305764850183985, "grad_norm": 18.269406103081074, "kl": 0.06640625, "learning_rate": 4.695987383914491e-07, "loss": 0.0265, "reward": 1.4410545825958252, "reward_std": 0.12092936038970947, "rewards/accuracy_reward_stage2": 0.4410546123981476, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3028 }, { "completion_length": 16.828125, "epoch": 0.530751708428246, "grad_norm": 24.097913134980068, "kl": 0.251953125, "learning_rate": 4.6942351498160157e-07, "loss": 0.0468, "reward": 1.2747653722763062, "reward_std": 0.22782419621944427, "rewards/accuracy_reward_stage2": 0.30601537227630615, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3029 }, { "completion_length": 9.8125, "epoch": 0.5309269318380936, "grad_norm": 15.977143548134446, "kl": 0.05810546875, "learning_rate": 4.6924829157175395e-07, "loss": 0.0232, "reward": 1.7499685287475586, "reward_std": 0.2890286445617676, "rewards/accuracy_reward_stage2": 0.7499684691429138, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3030 }, { "completion_length": 10.9375, "epoch": 0.5311021552479411, "grad_norm": 17.89330528689539, "kl": 0.18359375, "learning_rate": 4.690730681619064e-07, "loss": 0.0112, "reward": 1.570418357849121, "reward_std": 0.1976543366909027, "rewards/accuracy_reward_stage2": 0.6016684770584106, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3031 }, { "completion_length": 9.09375, "epoch": 0.5312773786577887, "grad_norm": 18.17287625511143, "kl": 0.0196533203125, "learning_rate": 4.6889784475205883e-07, "loss": 0.0079, "reward": 1.1605315208435059, "reward_std": 0.210140198469162, "rewards/accuracy_reward_stage2": 0.28553152084350586, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3032 }, { "completion_length": 10.421875, "epoch": 0.5314526020676362, "grad_norm": 20.40117868533523, "kl": 0.125, "learning_rate": 4.687226213422113e-07, "loss": 0.0501, "reward": 1.4888815879821777, "reward_std": 0.17244793474674225, "rewards/accuracy_reward_stage2": 0.4888816177845001, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3033 }, { "completion_length": 10.609375, "epoch": 0.5316278254774838, "grad_norm": 25.24115334011376, "kl": 0.2216796875, "learning_rate": 4.6854739793236377e-07, "loss": 0.0879, "reward": 1.611750841140747, "reward_std": 0.2850770652294159, "rewards/accuracy_reward_stage2": 0.7367508411407471, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3034 }, { "completion_length": 11.203125, "epoch": 0.5318030488873313, "grad_norm": 16.12155695115901, "kl": 0.0419921875, "learning_rate": 4.683721745225162e-07, "loss": 0.0168, "reward": 1.5541150569915771, "reward_std": 0.16996827721595764, "rewards/accuracy_reward_stage2": 0.5541150569915771, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3035 }, { "completion_length": 7.328125, "epoch": 0.5319782722971789, "grad_norm": 12.940864496539184, "kl": 0.06591796875, "learning_rate": 4.6819695111266864e-07, "loss": 0.002, "reward": 1.6956827640533447, "reward_std": 0.14037351310253143, "rewards/accuracy_reward_stage2": 0.7113077640533447, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3036 }, { "completion_length": 11.65625, "epoch": 0.5321534957070264, "grad_norm": 19.100056712000704, "kl": 0.208984375, "learning_rate": 4.6802172770282103e-07, "loss": 0.0835, "reward": 1.3029202222824097, "reward_std": 0.17705127596855164, "rewards/accuracy_reward_stage2": 0.5529202222824097, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3037 }, { "completion_length": 9.53125, "epoch": 0.532328719116874, "grad_norm": 25.0008873405925, "kl": 0.1201171875, "learning_rate": 4.678465042929735e-07, "loss": 0.0109, "reward": 1.6256303787231445, "reward_std": 0.26126065850257874, "rewards/accuracy_reward_stage2": 0.6412553787231445, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3038 }, { "completion_length": 16.171875, "epoch": 0.5325039425267216, "grad_norm": 15.480860684235363, "kl": 0.1044921875, "learning_rate": 4.6767128088312596e-07, "loss": -0.0026, "reward": 1.7870838642120361, "reward_std": 0.15268449485301971, "rewards/accuracy_reward_stage2": 0.8027088642120361, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3039 }, { "completion_length": 13.625, "epoch": 0.5326791659365692, "grad_norm": 17.521574298852897, "kl": 0.203125, "learning_rate": 4.674960574732784e-07, "loss": 0.0426, "reward": 1.1582777500152588, "reward_std": 0.19577035307884216, "rewards/accuracy_reward_stage2": 0.4395277798175812, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3040 }, { "completion_length": 10.421875, "epoch": 0.5328543893464167, "grad_norm": 15.826143042438844, "kl": 0.11767578125, "learning_rate": 4.6732083406343084e-07, "loss": -0.0206, "reward": 1.6439828872680664, "reward_std": 0.2372094690799713, "rewards/accuracy_reward_stage2": 0.6752328872680664, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3041 }, { "completion_length": 7.28125, "epoch": 0.5330296127562643, "grad_norm": 14.504095571093211, "kl": 0.0693359375, "learning_rate": 4.6714561065358334e-07, "loss": 0.0276, "reward": 1.3369925022125244, "reward_std": 0.16244575381278992, "rewards/accuracy_reward_stage2": 0.4619925022125244, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3042 }, { "completion_length": 12.234375, "epoch": 0.5332048361661118, "grad_norm": 18.798109631342566, "kl": 0.0927734375, "learning_rate": 4.669703872437357e-07, "loss": -0.0071, "reward": 1.4485533237457275, "reward_std": 0.22703000903129578, "rewards/accuracy_reward_stage2": 0.4641782343387604, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3043 }, { "completion_length": 11.875, "epoch": 0.5333800595759594, "grad_norm": 16.32852779302221, "kl": 0.09912109375, "learning_rate": 4.6679516383388816e-07, "loss": 0.0186, "reward": 1.5693191289901733, "reward_std": 0.1421196162700653, "rewards/accuracy_reward_stage2": 0.5849441289901733, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3044 }, { "completion_length": 5.9375, "epoch": 0.5335552829858069, "grad_norm": 16.792635822140806, "kl": 0.15625, "learning_rate": 4.666199404240406e-07, "loss": 0.0624, "reward": 1.726854681968689, "reward_std": 0.17972619831562042, "rewards/accuracy_reward_stage2": 0.726854681968689, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3045 }, { "completion_length": 27.21875, "epoch": 0.5337305063956544, "grad_norm": 18.453746951075754, "kl": 0.103515625, "learning_rate": 4.664447170141931e-07, "loss": 0.0045, "reward": 1.5340856313705444, "reward_std": 0.17024879157543182, "rewards/accuracy_reward_stage2": 0.5497106313705444, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3046 }, { "completion_length": 15.609375, "epoch": 0.533905729805502, "grad_norm": 13.499514206736817, "kl": 0.01263427734375, "learning_rate": 4.6626949360434553e-07, "loss": 0.0051, "reward": 1.714925765991211, "reward_std": 0.09920510649681091, "rewards/accuracy_reward_stage2": 0.7149257063865662, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3047 }, { "completion_length": 8.5625, "epoch": 0.5340809532153495, "grad_norm": 25.70082296105864, "kl": 0.060791015625, "learning_rate": 4.6609427019449797e-07, "loss": 0.0243, "reward": 1.793139934539795, "reward_std": 0.22477105259895325, "rewards/accuracy_reward_stage2": 0.7931399345397949, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3048 }, { "completion_length": 8.609375, "epoch": 0.5342561766251971, "grad_norm": 17.500307037883275, "kl": 0.43359375, "learning_rate": 4.6591904678465036e-07, "loss": 0.1737, "reward": 1.3654680252075195, "reward_std": 0.1928379386663437, "rewards/accuracy_reward_stage2": 0.6154680252075195, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3049 }, { "completion_length": 17.328125, "epoch": 0.5344314000350446, "grad_norm": 17.814760335668577, "kl": 0.1748046875, "learning_rate": 4.6574382337480285e-07, "loss": 0.0375, "reward": 1.3082983493804932, "reward_std": 0.2186213731765747, "rewards/accuracy_reward_stage2": 0.4489234387874603, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3050 }, { "completion_length": 9.3125, "epoch": 0.5346066234448923, "grad_norm": 21.43077166996965, "kl": 0.091796875, "learning_rate": 4.655685999649553e-07, "loss": 0.0367, "reward": 1.7873187065124512, "reward_std": 0.14868119359016418, "rewards/accuracy_reward_stage2": 0.7873187065124512, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3051 }, { "completion_length": 10.734375, "epoch": 0.5347818468547398, "grad_norm": 20.775333986536143, "kl": 0.1474609375, "learning_rate": 4.6539337655510773e-07, "loss": 0.0148, "reward": 1.6244642734527588, "reward_std": 0.2634267210960388, "rewards/accuracy_reward_stage2": 0.640089213848114, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3052 }, { "completion_length": 10.5, "epoch": 0.5349570702645874, "grad_norm": 22.625466011199897, "kl": 0.0810546875, "learning_rate": 4.6521815314526017e-07, "loss": 0.0324, "reward": 1.4285982847213745, "reward_std": 0.2603178024291992, "rewards/accuracy_reward_stage2": 0.5535982251167297, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3053 }, { "completion_length": 13.234375, "epoch": 0.5351322936744349, "grad_norm": 19.153618135380867, "kl": 0.330078125, "learning_rate": 4.6504292973541266e-07, "loss": -0.0254, "reward": 1.3712303638458252, "reward_std": 0.3846546709537506, "rewards/accuracy_reward_stage2": 0.5743553638458252, "rewards/format_reward_stage1_pointerpad": 0.796875, "scores/accuracy_reward_stage2": 0.796875, "step": 3054 }, { "completion_length": 9.15625, "epoch": 0.5353075170842825, "grad_norm": 15.593493994700411, "kl": 0.07421875, "learning_rate": 4.648677063255651e-07, "loss": -0.0146, "reward": 1.8773478269577026, "reward_std": 0.14944106340408325, "rewards/accuracy_reward_stage2": 0.8929727673530579, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3055 }, { "completion_length": 6.3125, "epoch": 0.53548274049413, "grad_norm": 19.66066039095581, "kl": 0.1513671875, "learning_rate": 4.646924829157175e-07, "loss": 0.0363, "reward": 1.5343431234359741, "reward_std": 0.19723013043403625, "rewards/accuracy_reward_stage2": 0.6593431234359741, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3056 }, { "completion_length": 6.78125, "epoch": 0.5356579639039776, "grad_norm": 21.526714673787414, "kl": 0.158203125, "learning_rate": 4.6451725950586993e-07, "loss": 0.0342, "reward": 1.617117166519165, "reward_std": 0.26397401094436646, "rewards/accuracy_reward_stage2": 0.7577422261238098, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3057 }, { "completion_length": 15.265625, "epoch": 0.5358331873138251, "grad_norm": 12.15510432850708, "kl": 0.17578125, "learning_rate": 4.643420360960224e-07, "loss": -0.0135, "reward": 1.327011227607727, "reward_std": 0.17525216937065125, "rewards/accuracy_reward_stage2": 0.4832611382007599, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3058 }, { "completion_length": 19.828125, "epoch": 0.5360084107236727, "grad_norm": 18.80841738380306, "kl": 0.06787109375, "learning_rate": 4.6416681268617486e-07, "loss": -0.017, "reward": 1.5321886539459229, "reward_std": 0.249672532081604, "rewards/accuracy_reward_stage2": 0.5478136539459229, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3059 }, { "completion_length": 10.125, "epoch": 0.5361836341335202, "grad_norm": 20.60053039795327, "kl": 0.09228515625, "learning_rate": 4.639915892763273e-07, "loss": 0.0369, "reward": 1.5177383422851562, "reward_std": 0.1550036072731018, "rewards/accuracy_reward_stage2": 0.5177382826805115, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3060 }, { "completion_length": 7.609375, "epoch": 0.5363588575433678, "grad_norm": 15.038847005498408, "kl": 0.016845703125, "learning_rate": 4.6381636586647974e-07, "loss": 0.0068, "reward": 1.6498210430145264, "reward_std": 0.13509923219680786, "rewards/accuracy_reward_stage2": 0.8998209834098816, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3061 }, { "completion_length": 14.015625, "epoch": 0.5365340809532153, "grad_norm": 9.436819961252333, "kl": 0.072265625, "learning_rate": 4.636411424566322e-07, "loss": -0.0054, "reward": 1.6160824298858643, "reward_std": 0.11158134788274765, "rewards/accuracy_reward_stage2": 0.6317073702812195, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3062 }, { "completion_length": 13.4375, "epoch": 0.5367093043630629, "grad_norm": 17.744512185558786, "kl": 0.0908203125, "learning_rate": 4.634659190467846e-07, "loss": -0.0078, "reward": 1.62375807762146, "reward_std": 0.17992845177650452, "rewards/accuracy_reward_stage2": 0.6393829584121704, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3063 }, { "completion_length": 12.109375, "epoch": 0.5368845277729105, "grad_norm": 19.430148054219906, "kl": 0.087890625, "learning_rate": 4.6329069563693706e-07, "loss": 0.0351, "reward": 1.6683220863342285, "reward_std": 0.2081729918718338, "rewards/accuracy_reward_stage2": 0.6683220863342285, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3064 }, { "completion_length": 14.953125, "epoch": 0.5370597511827581, "grad_norm": 14.130279849007, "kl": 0.08251953125, "learning_rate": 4.631154722270895e-07, "loss": -0.0112, "reward": 1.2821969985961914, "reward_std": 0.17817632853984833, "rewards/accuracy_reward_stage2": 0.4228219985961914, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3065 }, { "completion_length": 11.921875, "epoch": 0.5372349745926056, "grad_norm": 16.998675878434767, "kl": 0.119140625, "learning_rate": 4.62940248817242e-07, "loss": -0.0279, "reward": 1.482663631439209, "reward_std": 0.3255109190940857, "rewards/accuracy_reward_stage2": 0.5139136910438538, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3066 }, { "completion_length": 8.71875, "epoch": 0.5374101980024532, "grad_norm": 16.177190918806446, "kl": 0.109375, "learning_rate": 4.6276502540739443e-07, "loss": 0.0048, "reward": 1.3549516201019287, "reward_std": 0.2306400090456009, "rewards/accuracy_reward_stage2": 0.3705766797065735, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3067 }, { "completion_length": 8.9375, "epoch": 0.5375854214123007, "grad_norm": 27.309356310930397, "kl": 0.271484375, "learning_rate": 4.625898019975468e-07, "loss": -0.0098, "reward": 1.3389118909835815, "reward_std": 0.4090281128883362, "rewards/accuracy_reward_stage2": 0.5107868313789368, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3068 }, { "completion_length": 8.8125, "epoch": 0.5377606448221482, "grad_norm": 19.312543083938817, "kl": 0.0771484375, "learning_rate": 4.6241457858769926e-07, "loss": 0.0309, "reward": 1.7708431482315063, "reward_std": 0.20213577151298523, "rewards/accuracy_reward_stage2": 0.7708431482315063, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3069 }, { "completion_length": 8.84375, "epoch": 0.5379358682319958, "grad_norm": 19.962249196207807, "kl": 0.1640625, "learning_rate": 4.6223935517785175e-07, "loss": 0.0215, "reward": 1.5813571214675903, "reward_std": 0.25529611110687256, "rewards/accuracy_reward_stage2": 0.5969820618629456, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3070 }, { "completion_length": 8.515625, "epoch": 0.5381110916418433, "grad_norm": 15.90380238115599, "kl": 0.0517578125, "learning_rate": 4.620641317680042e-07, "loss": 0.0207, "reward": 1.7377853393554688, "reward_std": 0.07723461091518402, "rewards/accuracy_reward_stage2": 0.7377853393554688, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3071 }, { "completion_length": 12.609375, "epoch": 0.5382863150516909, "grad_norm": 16.115969346139185, "kl": 0.08056640625, "learning_rate": 4.6188890835815663e-07, "loss": -0.0343, "reward": 1.4710911512374878, "reward_std": 0.17701643705368042, "rewards/accuracy_reward_stage2": 0.5023411512374878, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3072 }, { "completion_length": 12.359375, "epoch": 0.5384615384615384, "grad_norm": 17.9484435026702, "kl": 0.115234375, "learning_rate": 4.6171368494830907e-07, "loss": 0.0171, "reward": 1.3882776498794556, "reward_std": 0.29357674717903137, "rewards/accuracy_reward_stage2": 0.40390264987945557, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3073 }, { "completion_length": 17.203125, "epoch": 0.538636761871386, "grad_norm": 16.662349153141072, "kl": 0.040771484375, "learning_rate": 4.6153846153846156e-07, "loss": 0.0164, "reward": 1.5608493089675903, "reward_std": 0.09860756993293762, "rewards/accuracy_reward_stage2": 0.5608493685722351, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3074 }, { "completion_length": 9.59375, "epoch": 0.5388119852812335, "grad_norm": 18.106733037371338, "kl": 0.0703125, "learning_rate": 4.6136323812861395e-07, "loss": -0.0106, "reward": 1.6378639936447144, "reward_std": 0.27184975147247314, "rewards/accuracy_reward_stage2": 0.6534889936447144, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3075 }, { "completion_length": 8.3125, "epoch": 0.5389872086910811, "grad_norm": 21.56281782764618, "kl": 0.267578125, "learning_rate": 4.611880147187664e-07, "loss": 0.0521, "reward": 1.5611025094985962, "reward_std": 0.35041025280952454, "rewards/accuracy_reward_stage2": 0.717352569103241, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3076 }, { "completion_length": 12.359375, "epoch": 0.5391624321009287, "grad_norm": 32.14585568367615, "kl": 0.05517578125, "learning_rate": 4.6101279130891883e-07, "loss": 0.022, "reward": 1.4294856786727905, "reward_std": 0.29953643679618835, "rewards/accuracy_reward_stage2": 0.4294856786727905, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3077 }, { "completion_length": 16.875, "epoch": 0.5393376555107763, "grad_norm": 20.091125573660317, "kl": 0.138671875, "learning_rate": 4.608375678990713e-07, "loss": 0.0112, "reward": 1.5007598400115967, "reward_std": 0.18287548422813416, "rewards/accuracy_reward_stage2": 0.5163848996162415, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3078 }, { "completion_length": 10.59375, "epoch": 0.5395128789206238, "grad_norm": 16.831158308135656, "kl": 0.03369140625, "learning_rate": 4.6066234448922376e-07, "loss": -0.0307, "reward": 1.6649608612060547, "reward_std": 0.13924731314182281, "rewards/accuracy_reward_stage2": 0.6805858612060547, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3079 }, { "completion_length": 9.109375, "epoch": 0.5396881023304714, "grad_norm": 16.293936210247495, "kl": 0.06884765625, "learning_rate": 4.604871210793762e-07, "loss": 0.0275, "reward": 1.6581212282180786, "reward_std": 0.10700437426567078, "rewards/accuracy_reward_stage2": 0.6581212282180786, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3080 }, { "completion_length": 12.046875, "epoch": 0.5398633257403189, "grad_norm": 16.85665873852233, "kl": 0.1640625, "learning_rate": 4.603118976695286e-07, "loss": -0.0539, "reward": 1.603559136390686, "reward_std": 0.2619991898536682, "rewards/accuracy_reward_stage2": 0.6504341959953308, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3081 }, { "completion_length": 16.71875, "epoch": 0.5400385491501665, "grad_norm": 18.200123926002398, "kl": 0.07666015625, "learning_rate": 4.601366742596811e-07, "loss": 0.0022, "reward": 1.6544448137283325, "reward_std": 0.13603055477142334, "rewards/accuracy_reward_stage2": 0.6700698733329773, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3082 }, { "completion_length": 8.578125, "epoch": 0.540213772560014, "grad_norm": 19.496810478238686, "kl": 0.134765625, "learning_rate": 4.599614508498335e-07, "loss": 0.0537, "reward": 1.5489877462387085, "reward_std": 0.1919880509376526, "rewards/accuracy_reward_stage2": 0.5489877462387085, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3083 }, { "completion_length": 10.015625, "epoch": 0.5403889959698616, "grad_norm": 17.58848253241211, "kl": 0.234375, "learning_rate": 4.5978622743998596e-07, "loss": -0.0312, "reward": 1.385695457458496, "reward_std": 0.14936351776123047, "rewards/accuracy_reward_stage2": 0.4481954276561737, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3084 }, { "completion_length": 13.8125, "epoch": 0.5405642193797091, "grad_norm": 21.304883569980227, "kl": 0.25, "learning_rate": 4.596110040301384e-07, "loss": -0.0186, "reward": 1.660407543182373, "reward_std": 0.3340178430080414, "rewards/accuracy_reward_stage2": 0.707282543182373, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3085 }, { "completion_length": 10.9375, "epoch": 0.5407394427895567, "grad_norm": 24.291576382554858, "kl": 0.08447265625, "learning_rate": 4.594357806202909e-07, "loss": 0.0338, "reward": 1.649717092514038, "reward_std": 0.26840633153915405, "rewards/accuracy_reward_stage2": 0.6497172117233276, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3086 }, { "completion_length": 11.46875, "epoch": 0.5409146661994042, "grad_norm": 16.023316583348308, "kl": 0.181640625, "learning_rate": 4.592605572104433e-07, "loss": -0.0577, "reward": 1.7659709453582764, "reward_std": 0.2128397822380066, "rewards/accuracy_reward_stage2": 0.8128460645675659, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3087 }, { "completion_length": 9.796875, "epoch": 0.5410898896092518, "grad_norm": 19.777055191335933, "kl": 0.09228515625, "learning_rate": 4.590853338005957e-07, "loss": 0.037, "reward": 1.6594014167785645, "reward_std": 0.2414485216140747, "rewards/accuracy_reward_stage2": 0.6594013571739197, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3088 }, { "completion_length": 10.125, "epoch": 0.5412651130190993, "grad_norm": 20.59924841280276, "kl": 0.169921875, "learning_rate": 4.5891011039074816e-07, "loss": 0.0291, "reward": 1.6514942646026611, "reward_std": 0.36602866649627686, "rewards/accuracy_reward_stage2": 0.6671192646026611, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3089 }, { "completion_length": 14.1875, "epoch": 0.541440336428947, "grad_norm": 18.931291043266487, "kl": 0.1044921875, "learning_rate": 4.5873488698090065e-07, "loss": -0.0038, "reward": 1.771875023841858, "reward_std": 0.2878369688987732, "rewards/accuracy_reward_stage2": 0.8031250238418579, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3090 }, { "completion_length": 9.71875, "epoch": 0.5416155598387945, "grad_norm": 18.664500855954135, "kl": 0.041748046875, "learning_rate": 4.585596635710531e-07, "loss": 0.0167, "reward": 1.3326388597488403, "reward_std": 0.19817854464054108, "rewards/accuracy_reward_stage2": 0.4576388895511627, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3091 }, { "completion_length": 11.5, "epoch": 0.541790783248642, "grad_norm": 22.570017573761078, "kl": 0.1376953125, "learning_rate": 4.5838444016120553e-07, "loss": 0.0025, "reward": 1.433104395866394, "reward_std": 0.2778227627277374, "rewards/accuracy_reward_stage2": 0.573729395866394, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3092 }, { "completion_length": 9.53125, "epoch": 0.5419660066584896, "grad_norm": 14.12496196439735, "kl": 0.1748046875, "learning_rate": 4.5820921675135797e-07, "loss": -0.0091, "reward": 1.6820327043533325, "reward_std": 0.170151025056839, "rewards/accuracy_reward_stage2": 0.7132827043533325, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3093 }, { "completion_length": 8.359375, "epoch": 0.5421412300683371, "grad_norm": 15.999409442411078, "kl": 0.197265625, "learning_rate": 4.580339933415104e-07, "loss": -0.0095, "reward": 1.5700740814208984, "reward_std": 0.25437554717063904, "rewards/accuracy_reward_stage2": 0.6013240814208984, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3094 }, { "completion_length": 9.0625, "epoch": 0.5423164534781847, "grad_norm": 51.15226086456877, "kl": 0.2041015625, "learning_rate": 4.5785876993166285e-07, "loss": -0.0015, "reward": 1.4735863208770752, "reward_std": 0.2881781756877899, "rewards/accuracy_reward_stage2": 0.6454613208770752, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3095 }, { "completion_length": 9.65625, "epoch": 0.5424916768880322, "grad_norm": 22.2332921253652, "kl": 0.220703125, "learning_rate": 4.576835465218153e-07, "loss": 0.0026, "reward": 1.5193158388137817, "reward_std": 0.25890079140663147, "rewards/accuracy_reward_stage2": 0.5505658388137817, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3096 }, { "completion_length": 26.015625, "epoch": 0.5426669002978798, "grad_norm": 22.528868259966305, "kl": 0.11474609375, "learning_rate": 4.5750832311196773e-07, "loss": 0.0017, "reward": 1.4433059692382812, "reward_std": 0.19515696167945862, "rewards/accuracy_reward_stage2": 0.458931028842926, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3097 }, { "completion_length": 12.328125, "epoch": 0.5428421237077273, "grad_norm": 17.685266609921687, "kl": 0.146484375, "learning_rate": 4.573330997021202e-07, "loss": 0.0095, "reward": 1.5171079635620117, "reward_std": 0.20841091871261597, "rewards/accuracy_reward_stage2": 0.5483580231666565, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3098 }, { "completion_length": 9.828125, "epoch": 0.5430173471175749, "grad_norm": 20.5074041806249, "kl": 0.06640625, "learning_rate": 4.5715787629227266e-07, "loss": 0.0267, "reward": 1.6943392753601074, "reward_std": 0.2411803901195526, "rewards/accuracy_reward_stage2": 0.8193392753601074, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3099 }, { "completion_length": 11.09375, "epoch": 0.5431925705274224, "grad_norm": 17.269969614582326, "kl": 0.2021484375, "learning_rate": 4.5698265288242505e-07, "loss": 0.0451, "reward": 1.4439659118652344, "reward_std": 0.21747536957263947, "rewards/accuracy_reward_stage2": 0.5845909118652344, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3100 }, { "completion_length": 27.078125, "epoch": 0.54336779393727, "grad_norm": 14.248365215207672, "kl": 0.08935546875, "learning_rate": 4.568074294725775e-07, "loss": -0.0084, "reward": 1.3167316913604736, "reward_std": 0.1012648195028305, "rewards/accuracy_reward_stage2": 0.3323565721511841, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3101 }, { "completion_length": 16.828125, "epoch": 0.5435430173471176, "grad_norm": 14.166141616292743, "kl": 0.05224609375, "learning_rate": 4.5663220606273e-07, "loss": -0.0204, "reward": 1.2051842212677002, "reward_std": 0.13409823179244995, "rewards/accuracy_reward_stage2": 0.34580928087234497, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3102 }, { "completion_length": 14.671875, "epoch": 0.5437182407569652, "grad_norm": 14.638131377176025, "kl": 0.09521484375, "learning_rate": 4.564569826528824e-07, "loss": 0.0023, "reward": 1.6929941177368164, "reward_std": 0.1856401562690735, "rewards/accuracy_reward_stage2": 0.7086191177368164, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3103 }, { "completion_length": 10.515625, "epoch": 0.5438934641668127, "grad_norm": 21.71622363695326, "kl": 0.1728515625, "learning_rate": 4.5628175924303486e-07, "loss": -0.0184, "reward": 1.7654647827148438, "reward_std": 0.24956831336021423, "rewards/accuracy_reward_stage2": 0.7967147827148438, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3104 }, { "completion_length": 11.390625, "epoch": 0.5440686875766603, "grad_norm": 20.51723862003228, "kl": 0.09521484375, "learning_rate": 4.561065358331873e-07, "loss": 0.0379, "reward": 1.5989583730697632, "reward_std": 0.3667879104614258, "rewards/accuracy_reward_stage2": 0.5989583730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3105 }, { "completion_length": 10.9375, "epoch": 0.5442439109865078, "grad_norm": 23.295959325200524, "kl": 0.16796875, "learning_rate": 4.559313124233398e-07, "loss": -0.0211, "reward": 1.414116382598877, "reward_std": 0.3159290850162506, "rewards/accuracy_reward_stage2": 0.4453664720058441, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3106 }, { "completion_length": 6.703125, "epoch": 0.5444191343963554, "grad_norm": 14.653730888336844, "kl": 0.1142578125, "learning_rate": 4.557560890134922e-07, "loss": 0.0458, "reward": 1.6861011981964111, "reward_std": 0.1108192503452301, "rewards/accuracy_reward_stage2": 0.6861011385917664, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3107 }, { "completion_length": 11.484375, "epoch": 0.5445943578062029, "grad_norm": 12.71283407481726, "kl": 0.07568359375, "learning_rate": 4.555808656036446e-07, "loss": -0.0138, "reward": 1.5843769311904907, "reward_std": 0.11037540435791016, "rewards/accuracy_reward_stage2": 0.6000019311904907, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3108 }, { "completion_length": 7.15625, "epoch": 0.5447695812160505, "grad_norm": 9.632664097158752, "kl": 0.1015625, "learning_rate": 4.5540564219379706e-07, "loss": -0.0036, "reward": 1.7535353899002075, "reward_std": 0.11451417207717896, "rewards/accuracy_reward_stage2": 0.7691603899002075, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3109 }, { "completion_length": 11.484375, "epoch": 0.544944804625898, "grad_norm": 18.26844472901338, "kl": 0.2177734375, "learning_rate": 4.552304187839495e-07, "loss": 0.0136, "reward": 1.50527024269104, "reward_std": 0.23741815984249115, "rewards/accuracy_reward_stage2": 0.5365201830863953, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3110 }, { "completion_length": 12.109375, "epoch": 0.5451200280357456, "grad_norm": 22.22406598511341, "kl": 0.126953125, "learning_rate": 4.55055195374102e-07, "loss": 0.0069, "reward": 1.6192907094955444, "reward_std": 0.22934451699256897, "rewards/accuracy_reward_stage2": 0.6349157691001892, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3111 }, { "completion_length": 10.40625, "epoch": 0.5452952514455931, "grad_norm": 19.839667418496607, "kl": 0.1494140625, "learning_rate": 4.5487997196425443e-07, "loss": 0.0197, "reward": 1.7109891176223755, "reward_std": 0.15221619606018066, "rewards/accuracy_reward_stage2": 0.7266141772270203, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3112 }, { "completion_length": 11.21875, "epoch": 0.5454704748554406, "grad_norm": 30.394901531165882, "kl": 0.2236328125, "learning_rate": 4.547047485544068e-07, "loss": 0.0018, "reward": 1.4713342189788818, "reward_std": 0.26562297344207764, "rewards/accuracy_reward_stage2": 0.5182092785835266, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3113 }, { "completion_length": 37.921875, "epoch": 0.5456456982652882, "grad_norm": 18.317318007212805, "kl": 0.1513671875, "learning_rate": 4.5452952514455925e-07, "loss": 0.0266, "reward": 1.3703083992004395, "reward_std": 0.23182103037834167, "rewards/accuracy_reward_stage2": 0.3859333097934723, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3114 }, { "completion_length": 9.609375, "epoch": 0.5458209216751359, "grad_norm": 18.277950300554696, "kl": 0.3203125, "learning_rate": 4.5435430173471175e-07, "loss": 0.0185, "reward": 1.4545676708221436, "reward_std": 0.3653530478477478, "rewards/accuracy_reward_stage2": 0.7514426708221436, "rewards/format_reward_stage1_pointerpad": 0.703125, "scores/accuracy_reward_stage2": 0.703125, "step": 3115 }, { "completion_length": 10.34375, "epoch": 0.5459961450849834, "grad_norm": 19.3652231026983, "kl": 0.232421875, "learning_rate": 4.541790783248642e-07, "loss": 0.0545, "reward": 1.4572173357009888, "reward_std": 0.3412941098213196, "rewards/accuracy_reward_stage2": 0.48846733570098877, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3116 }, { "completion_length": 12.28125, "epoch": 0.5461713684948309, "grad_norm": 18.07080948296114, "kl": 0.07958984375, "learning_rate": 4.540038549150166e-07, "loss": 0.0318, "reward": 1.0901241302490234, "reward_std": 0.22192618250846863, "rewards/accuracy_reward_stage2": 0.3401240110397339, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3117 }, { "completion_length": 10.8125, "epoch": 0.5463465919046785, "grad_norm": 20.13719059887797, "kl": 0.142578125, "learning_rate": 4.5382863150516907e-07, "loss": 0.0568, "reward": 1.6008598804473877, "reward_std": 0.19498933851718903, "rewards/accuracy_reward_stage2": 0.7258598208427429, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3118 }, { "completion_length": 8.140625, "epoch": 0.546521815314526, "grad_norm": 19.86921510263278, "kl": 0.052734375, "learning_rate": 4.536534080953215e-07, "loss": 0.021, "reward": 1.415239691734314, "reward_std": 0.22165584564208984, "rewards/accuracy_reward_stage2": 0.41523975133895874, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3119 }, { "completion_length": 11.734375, "epoch": 0.5466970387243736, "grad_norm": 17.748671242725997, "kl": 0.014404296875, "learning_rate": 4.5347818468547394e-07, "loss": 0.0058, "reward": 1.7900738716125488, "reward_std": 0.18254978954792023, "rewards/accuracy_reward_stage2": 0.7900738716125488, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3120 }, { "completion_length": 8.84375, "epoch": 0.5468722621342211, "grad_norm": 14.524084056879108, "kl": 0.1982421875, "learning_rate": 4.533029612756264e-07, "loss": 0.0353, "reward": 1.5100059509277344, "reward_std": 0.16531233489513397, "rewards/accuracy_reward_stage2": 0.6506309509277344, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3121 }, { "completion_length": 11.734375, "epoch": 0.5470474855440687, "grad_norm": 22.884109798469716, "kl": 0.119140625, "learning_rate": 4.531277378657788e-07, "loss": 0.0045, "reward": 1.4116387367248535, "reward_std": 0.35598820447921753, "rewards/accuracy_reward_stage2": 0.5522637367248535, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3122 }, { "completion_length": 10.03125, "epoch": 0.5472227089539162, "grad_norm": 20.57263042186901, "kl": 0.1298828125, "learning_rate": 4.529525144559313e-07, "loss": 0.0078, "reward": 1.525716781616211, "reward_std": 0.2811649739742279, "rewards/accuracy_reward_stage2": 0.5413416624069214, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3123 }, { "completion_length": 14.171875, "epoch": 0.5473979323637638, "grad_norm": 18.40555272183468, "kl": 0.1123046875, "learning_rate": 4.5277729104608376e-07, "loss": -0.0313, "reward": 1.426608920097351, "reward_std": 0.3710615336894989, "rewards/accuracy_reward_stage2": 0.4578589200973511, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3124 }, { "completion_length": 12.84375, "epoch": 0.5475731557736113, "grad_norm": 27.199703543281533, "kl": 0.01708984375, "learning_rate": 4.526020676362362e-07, "loss": 0.0068, "reward": 1.59226655960083, "reward_std": 0.19375893473625183, "rewards/accuracy_reward_stage2": 0.5922665596008301, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3125 }, { "completion_length": 21.703125, "epoch": 0.5477483791834589, "grad_norm": 22.707419937102834, "kl": 0.1513671875, "learning_rate": 4.524268442263886e-07, "loss": 0.0219, "reward": 1.4392061233520508, "reward_std": 0.1956329196691513, "rewards/accuracy_reward_stage2": 0.5798312425613403, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3126 }, { "completion_length": 17.0, "epoch": 0.5479236025933064, "grad_norm": 25.956664968558947, "kl": 0.06884765625, "learning_rate": 4.522516208165411e-07, "loss": 0.0275, "reward": 1.4441642761230469, "reward_std": 0.3204311430454254, "rewards/accuracy_reward_stage2": 0.4441642761230469, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3127 }, { "completion_length": 10.015625, "epoch": 0.5480988260031541, "grad_norm": 18.86980089407399, "kl": 0.130859375, "learning_rate": 4.520763974066935e-07, "loss": -0.0339, "reward": 1.5849673748016357, "reward_std": 0.21915815770626068, "rewards/accuracy_reward_stage2": 0.6318423748016357, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3128 }, { "completion_length": 17.4375, "epoch": 0.5482740494130016, "grad_norm": 17.85624212350077, "kl": 0.0166015625, "learning_rate": 4.5190117399684595e-07, "loss": 0.0067, "reward": 1.5974417924880981, "reward_std": 0.13922935724258423, "rewards/accuracy_reward_stage2": 0.7224418520927429, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3129 }, { "completion_length": 12.390625, "epoch": 0.5484492728228492, "grad_norm": 16.83195774838754, "kl": 0.010498046875, "learning_rate": 4.517259505869984e-07, "loss": 0.0042, "reward": 1.7349507808685303, "reward_std": 0.12773281335830688, "rewards/accuracy_reward_stage2": 0.8599507212638855, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3130 }, { "completion_length": 11.03125, "epoch": 0.5486244962326967, "grad_norm": 20.906730392221608, "kl": 0.208984375, "learning_rate": 4.515507271771509e-07, "loss": 0.0837, "reward": 1.2577917575836182, "reward_std": 0.12434166669845581, "rewards/accuracy_reward_stage2": 0.5077918767929077, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3131 }, { "completion_length": 15.125, "epoch": 0.5487997196425443, "grad_norm": 18.725137651408236, "kl": 0.1064453125, "learning_rate": 4.5137550376730327e-07, "loss": -0.041, "reward": 1.314073085784912, "reward_std": 0.1571890413761139, "rewards/accuracy_reward_stage2": 0.3453230857849121, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3132 }, { "completion_length": 7.640625, "epoch": 0.5489749430523918, "grad_norm": 20.149659224345918, "kl": 0.203125, "learning_rate": 4.512002803574557e-07, "loss": -0.0268, "reward": 1.5538485050201416, "reward_std": 0.3465612530708313, "rewards/accuracy_reward_stage2": 0.6007235646247864, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3133 }, { "completion_length": 7.703125, "epoch": 0.5491501664622394, "grad_norm": 18.960739714891563, "kl": 0.2041015625, "learning_rate": 4.5102505694760815e-07, "loss": -0.0285, "reward": 1.3470832109451294, "reward_std": 0.20021668076515198, "rewards/accuracy_reward_stage2": 0.4095832407474518, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3134 }, { "completion_length": 9.890625, "epoch": 0.5493253898720869, "grad_norm": 24.688239604005318, "kl": 0.13671875, "learning_rate": 4.5084983353776064e-07, "loss": -0.0169, "reward": 1.6175473928451538, "reward_std": 0.3005647659301758, "rewards/accuracy_reward_stage2": 0.6487972736358643, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3135 }, { "completion_length": 10.71875, "epoch": 0.5495006132819344, "grad_norm": 13.618621416554019, "kl": 0.059326171875, "learning_rate": 4.506746101279131e-07, "loss": 0.0237, "reward": 1.6671037673950195, "reward_std": 0.06964041292667389, "rewards/accuracy_reward_stage2": 0.7921037077903748, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3136 }, { "completion_length": 9.421875, "epoch": 0.549675836691782, "grad_norm": 14.399033573800883, "kl": 0.1015625, "learning_rate": 4.504993867180655e-07, "loss": 0.0117, "reward": 1.5416667461395264, "reward_std": 0.19621436297893524, "rewards/accuracy_reward_stage2": 0.5729166865348816, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3137 }, { "completion_length": 11.546875, "epoch": 0.5498510601016295, "grad_norm": 11.203890840175257, "kl": 0.1083984375, "learning_rate": 4.503241633082179e-07, "loss": -0.0349, "reward": 1.8177083730697632, "reward_std": 0.12734557688236237, "rewards/accuracy_reward_stage2": 0.8489583730697632, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3138 }, { "completion_length": 9.046875, "epoch": 0.5500262835114771, "grad_norm": 15.282853134979353, "kl": 0.0595703125, "learning_rate": 4.501489398983704e-07, "loss": -0.0204, "reward": 1.546875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward_stage2": 0.5625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3139 }, { "completion_length": 11.640625, "epoch": 0.5502015069213246, "grad_norm": 22.126106131184848, "kl": 0.2333984375, "learning_rate": 4.4997371648852284e-07, "loss": -0.0416, "reward": 1.7610794305801392, "reward_std": 0.23321378231048584, "rewards/accuracy_reward_stage2": 0.8235794305801392, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3140 }, { "completion_length": 21.234375, "epoch": 0.5503767303311723, "grad_norm": 16.174597906243484, "kl": 0.2275390625, "learning_rate": 4.497984930786753e-07, "loss": 0.0187, "reward": 1.4087018966674805, "reward_std": 0.23245249688625336, "rewards/accuracy_reward_stage2": 0.5649518966674805, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3141 }, { "completion_length": 10.265625, "epoch": 0.5505519537410198, "grad_norm": 15.144513817072603, "kl": 0.09765625, "learning_rate": 4.496232696688277e-07, "loss": -0.0051, "reward": 1.6008020639419556, "reward_std": 0.15015725791454315, "rewards/accuracy_reward_stage2": 0.6164271235466003, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3142 }, { "completion_length": 13.28125, "epoch": 0.5507271771508674, "grad_norm": 15.840311728508933, "kl": 0.1806640625, "learning_rate": 4.494480462589802e-07, "loss": 0.0153, "reward": 1.4902280569076538, "reward_std": 0.2487790584564209, "rewards/accuracy_reward_stage2": 0.5214781165122986, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3143 }, { "completion_length": 10.75, "epoch": 0.5509024005607149, "grad_norm": 14.29696411929444, "kl": 0.107421875, "learning_rate": 4.4927282284913265e-07, "loss": 0.0152, "reward": 1.8940612077713013, "reward_std": 0.11344745010137558, "rewards/accuracy_reward_stage2": 0.9096862077713013, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3144 }, { "completion_length": 9.71875, "epoch": 0.5510776239705625, "grad_norm": 18.577526022076846, "kl": 0.11474609375, "learning_rate": 4.4909759943928504e-07, "loss": -0.0343, "reward": 1.6097900867462158, "reward_std": 0.3654620349407196, "rewards/accuracy_reward_stage2": 0.7504150867462158, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3145 }, { "completion_length": 9.15625, "epoch": 0.55125284738041, "grad_norm": 22.368273469070417, "kl": 0.447265625, "learning_rate": 4.489223760294375e-07, "loss": 0.0436, "reward": 1.6789296865463257, "reward_std": 0.3216952681541443, "rewards/accuracy_reward_stage2": 0.7414296865463257, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3146 }, { "completion_length": 9.453125, "epoch": 0.5514280707902576, "grad_norm": 19.66384748677594, "kl": 0.1240234375, "learning_rate": 4.4874715261958997e-07, "loss": 0.0104, "reward": 1.4158316850662231, "reward_std": 0.335426926612854, "rewards/accuracy_reward_stage2": 0.43145668506622314, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3147 }, { "completion_length": 8.03125, "epoch": 0.5516032942001051, "grad_norm": 16.66474206130544, "kl": 0.212890625, "learning_rate": 4.485719292097424e-07, "loss": 0.0074, "reward": 1.716360330581665, "reward_std": 0.2283344864845276, "rewards/accuracy_reward_stage2": 0.747610330581665, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3148 }, { "completion_length": 11.125, "epoch": 0.5517785176099527, "grad_norm": 13.826401124368541, "kl": 0.040771484375, "learning_rate": 4.4839670579989485e-07, "loss": 0.0163, "reward": 1.4942870140075684, "reward_std": 0.1060253232717514, "rewards/accuracy_reward_stage2": 0.49428704380989075, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3149 }, { "completion_length": 10.125, "epoch": 0.5519537410198002, "grad_norm": 16.087893374405123, "kl": 0.08642578125, "learning_rate": 4.482214823900473e-07, "loss": -0.0095, "reward": 1.328662395477295, "reward_std": 0.20100915431976318, "rewards/accuracy_reward_stage2": 0.3442873954772949, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3150 }, { "completion_length": 9.828125, "epoch": 0.5521289644296478, "grad_norm": 15.359195592826731, "kl": 0.06787109375, "learning_rate": 4.4804625898019973e-07, "loss": -0.0172, "reward": 1.4719253778457642, "reward_std": 0.18595364689826965, "rewards/accuracy_reward_stage2": 0.4875503182411194, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3151 }, { "completion_length": 14.3125, "epoch": 0.5523041878394953, "grad_norm": 16.956441792127453, "kl": 0.1767578125, "learning_rate": 4.4787103557035217e-07, "loss": 0.0493, "reward": 1.5719506740570068, "reward_std": 0.21863603591918945, "rewards/accuracy_reward_stage2": 0.7125757336616516, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3152 }, { "completion_length": 7.84375, "epoch": 0.552479411249343, "grad_norm": 14.321242912915837, "kl": 0.050537109375, "learning_rate": 4.476958121605046e-07, "loss": -0.0075, "reward": 1.6056922674179077, "reward_std": 0.23552852869033813, "rewards/accuracy_reward_stage2": 0.6213172674179077, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3153 }, { "completion_length": 11.203125, "epoch": 0.5526546346591905, "grad_norm": 19.342404513274424, "kl": 0.23828125, "learning_rate": 4.4752058875065705e-07, "loss": 0.0174, "reward": 1.4421981573104858, "reward_std": 0.27634021639823914, "rewards/accuracy_reward_stage2": 0.6140731573104858, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3154 }, { "completion_length": 10.0, "epoch": 0.5528298580690381, "grad_norm": 18.155035009617933, "kl": 0.1611328125, "learning_rate": 4.4734536534080954e-07, "loss": 0.0355, "reward": 1.4873788356781006, "reward_std": 0.23260822892189026, "rewards/accuracy_reward_stage2": 0.503003716468811, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3155 }, { "completion_length": 15.671875, "epoch": 0.5530050814788856, "grad_norm": 18.013443759182252, "kl": 0.1748046875, "learning_rate": 4.47170141930962e-07, "loss": 0.0035, "reward": 1.5860655307769775, "reward_std": 0.1955593377351761, "rewards/accuracy_reward_stage2": 0.6173155307769775, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3156 }, { "completion_length": 19.03125, "epoch": 0.5531803048887332, "grad_norm": 15.498322945185874, "kl": 0.1162109375, "learning_rate": 4.469949185211144e-07, "loss": 0.0131, "reward": 1.1478610038757324, "reward_std": 0.16122110188007355, "rewards/accuracy_reward_stage2": 0.28848594427108765, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3157 }, { "completion_length": 14.234375, "epoch": 0.5533555282985807, "grad_norm": 17.205042099112543, "kl": 0.1982421875, "learning_rate": 4.468196951112668e-07, "loss": 0.0086, "reward": 1.4277459383010864, "reward_std": 0.3595418632030487, "rewards/accuracy_reward_stage2": 0.4589959383010864, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3158 }, { "completion_length": 8.90625, "epoch": 0.5535307517084282, "grad_norm": 11.244925743336225, "kl": 0.052490234375, "learning_rate": 4.466444717014193e-07, "loss": -0.0232, "reward": 1.59375, "reward_std": 0.2177756428718567, "rewards/accuracy_reward_stage2": 0.734375, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3159 }, { "completion_length": 17.546875, "epoch": 0.5537059751182758, "grad_norm": 21.06181035329859, "kl": 0.171875, "learning_rate": 4.4646924829157174e-07, "loss": 0.0687, "reward": 1.4755263328552246, "reward_std": 0.19179841876029968, "rewards/accuracy_reward_stage2": 0.6005264520645142, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3160 }, { "completion_length": 7.96875, "epoch": 0.5538811985281233, "grad_norm": 19.710182016252123, "kl": 0.130859375, "learning_rate": 4.462940248817242e-07, "loss": 0.0205, "reward": 1.6828030347824097, "reward_std": 0.30897510051727295, "rewards/accuracy_reward_stage2": 0.6984280347824097, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3161 }, { "completion_length": 11.65625, "epoch": 0.5540564219379709, "grad_norm": 27.181609967617536, "kl": 0.31640625, "learning_rate": 4.461188014718766e-07, "loss": -0.0273, "reward": 1.2866575717926025, "reward_std": 0.32743221521377563, "rewards/accuracy_reward_stage2": 0.3491576611995697, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3162 }, { "completion_length": 9.0625, "epoch": 0.5542316453478184, "grad_norm": 17.65054472095913, "kl": 0.1064453125, "learning_rate": 4.459435780620291e-07, "loss": 0.0212, "reward": 1.4841200113296509, "reward_std": 0.24171821773052216, "rewards/accuracy_reward_stage2": 0.4997449815273285, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3163 }, { "completion_length": 8.140625, "epoch": 0.554406868757666, "grad_norm": 22.888330804031177, "kl": 0.1982421875, "learning_rate": 4.457683546521815e-07, "loss": -0.0206, "reward": 1.6924538612365723, "reward_std": 0.3295304477214813, "rewards/accuracy_reward_stage2": 0.739328920841217, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3164 }, { "completion_length": 10.546875, "epoch": 0.5545820921675135, "grad_norm": 20.007332005698075, "kl": 0.18359375, "learning_rate": 4.4559313124233394e-07, "loss": -0.0148, "reward": 1.4425759315490723, "reward_std": 0.29871058464050293, "rewards/accuracy_reward_stage2": 0.4738258719444275, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3165 }, { "completion_length": 7.640625, "epoch": 0.5547573155773612, "grad_norm": 17.282894759042968, "kl": 0.0322265625, "learning_rate": 4.454179078324864e-07, "loss": 0.0129, "reward": 1.3645219802856445, "reward_std": 0.20641304552555084, "rewards/accuracy_reward_stage2": 0.36452198028564453, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3166 }, { "completion_length": 9.109375, "epoch": 0.5549325389872087, "grad_norm": 24.610090218699664, "kl": 0.06982421875, "learning_rate": 4.4524268442263887e-07, "loss": 0.028, "reward": 1.5005807876586914, "reward_std": 0.32463937997817993, "rewards/accuracy_reward_stage2": 0.6255807876586914, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3167 }, { "completion_length": 7.171875, "epoch": 0.5551077623970563, "grad_norm": 21.88279887889821, "kl": 0.078125, "learning_rate": 4.450674610127913e-07, "loss": 0.0312, "reward": 1.7056117057800293, "reward_std": 0.3208841383457184, "rewards/accuracy_reward_stage2": 0.7056115865707397, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3168 }, { "completion_length": 13.328125, "epoch": 0.5552829858069038, "grad_norm": 16.74587152982527, "kl": 0.11669921875, "learning_rate": 4.4489223760294375e-07, "loss": 0.0467, "reward": 1.2985787391662598, "reward_std": 0.12602615356445312, "rewards/accuracy_reward_stage2": 0.5485787391662598, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3169 }, { "completion_length": 13.84375, "epoch": 0.5554582092167514, "grad_norm": 21.40490424954572, "kl": 0.031982421875, "learning_rate": 4.4471701419309614e-07, "loss": 0.0128, "reward": 1.5608049631118774, "reward_std": 0.15516994893550873, "rewards/accuracy_reward_stage2": 0.5608049631118774, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3170 }, { "completion_length": 13.21875, "epoch": 0.5556334326265989, "grad_norm": 19.239122601412628, "kl": 0.10791015625, "learning_rate": 4.4454179078324863e-07, "loss": 0.004, "reward": 1.6224453449249268, "reward_std": 0.2906746566295624, "rewards/accuracy_reward_stage2": 0.7630704641342163, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3171 }, { "completion_length": 13.09375, "epoch": 0.5558086560364465, "grad_norm": 19.094748291275916, "kl": 0.06396484375, "learning_rate": 4.4436656737340107e-07, "loss": 0.0255, "reward": 1.7014517784118652, "reward_std": 0.25967666506767273, "rewards/accuracy_reward_stage2": 0.70145183801651, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3172 }, { "completion_length": 8.625, "epoch": 0.555983879446294, "grad_norm": 19.966829369849936, "kl": 0.275390625, "learning_rate": 4.441913439635535e-07, "loss": 0.0069, "reward": 1.5300395488739014, "reward_std": 0.35093286633491516, "rewards/accuracy_reward_stage2": 0.5925396680831909, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3173 }, { "completion_length": 10.859375, "epoch": 0.5561591028561416, "grad_norm": 22.17268398172219, "kl": 0.26953125, "learning_rate": 4.4401612055370595e-07, "loss": -0.0581, "reward": 1.4169352054595947, "reward_std": 0.2545914649963379, "rewards/accuracy_reward_stage2": 0.4950602054595947, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3174 }, { "completion_length": 8.578125, "epoch": 0.5563343262659891, "grad_norm": 24.07055012064237, "kl": 0.2578125, "learning_rate": 4.4384089714385844e-07, "loss": -0.0169, "reward": 1.4825234413146973, "reward_std": 0.3476669192314148, "rewards/accuracy_reward_stage2": 0.5450234413146973, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3175 }, { "completion_length": 7.109375, "epoch": 0.5565095496758367, "grad_norm": 13.07784200629125, "kl": 0.265625, "learning_rate": 4.436656737340109e-07, "loss": -0.0712, "reward": 1.646902084350586, "reward_std": 0.23995369672775269, "rewards/accuracy_reward_stage2": 0.7250271439552307, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3176 }, { "completion_length": 8.4375, "epoch": 0.5566847730856842, "grad_norm": 19.977964033841957, "kl": 0.08447265625, "learning_rate": 4.4349045032416327e-07, "loss": -0.0102, "reward": 1.647351861000061, "reward_std": 0.29686886072158813, "rewards/accuracy_reward_stage2": 0.662976861000061, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3177 }, { "completion_length": 10.765625, "epoch": 0.5568599964955318, "grad_norm": 21.420053415746505, "kl": 0.12451171875, "learning_rate": 4.433152269143157e-07, "loss": -0.0353, "reward": 1.5312858819961548, "reward_std": 0.3580089509487152, "rewards/accuracy_reward_stage2": 0.5625358819961548, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3178 }, { "completion_length": 10.421875, "epoch": 0.5570352199053794, "grad_norm": 17.138495113898916, "kl": 0.1025390625, "learning_rate": 4.4314000350446815e-07, "loss": -0.0031, "reward": 1.59375, "reward_std": 0.2845909595489502, "rewards/accuracy_reward_stage2": 0.609375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3179 }, { "completion_length": 11.625, "epoch": 0.557210443315227, "grad_norm": 11.558748314691904, "kl": 0.0927734375, "learning_rate": 4.4296478009462064e-07, "loss": -0.049, "reward": 1.595902442932129, "reward_std": 0.1572880744934082, "rewards/accuracy_reward_stage2": 0.6271524429321289, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3180 }, { "completion_length": 8.390625, "epoch": 0.5573856667250745, "grad_norm": 11.752197546197085, "kl": 0.06005859375, "learning_rate": 4.427895566847731e-07, "loss": -0.0202, "reward": 1.3125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward_stage2": 0.328125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3181 }, { "completion_length": 12.796875, "epoch": 0.557560890134922, "grad_norm": 15.82381166113847, "kl": 0.201171875, "learning_rate": 4.426143332749255e-07, "loss": -0.0258, "reward": 1.6532623767852783, "reward_std": 0.2777688503265381, "rewards/accuracy_reward_stage2": 0.7001373767852783, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3182 }, { "completion_length": 8.21875, "epoch": 0.5577361135447696, "grad_norm": 20.513792709599088, "kl": 0.2099609375, "learning_rate": 4.424391098650779e-07, "loss": -0.0252, "reward": 1.5753761529922485, "reward_std": 0.3551340401172638, "rewards/accuracy_reward_stage2": 0.6222511529922485, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3183 }, { "completion_length": 14.953125, "epoch": 0.5579113369546171, "grad_norm": 22.883421913878937, "kl": 0.1328125, "learning_rate": 4.422638864552304e-07, "loss": -0.0242, "reward": 1.434954047203064, "reward_std": 0.3454228639602661, "rewards/accuracy_reward_stage2": 0.46620407700538635, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3184 }, { "completion_length": 6.90625, "epoch": 0.5580865603644647, "grad_norm": 19.627035429188577, "kl": 0.048828125, "learning_rate": 4.4208866304538284e-07, "loss": -0.0095, "reward": 1.5062909126281738, "reward_std": 0.2934325933456421, "rewards/accuracy_reward_stage2": 0.521915853023529, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3185 }, { "completion_length": 11.3125, "epoch": 0.5582617837743122, "grad_norm": 19.709667689059085, "kl": 0.23046875, "learning_rate": 4.419134396355353e-07, "loss": 0.0353, "reward": 1.686478853225708, "reward_std": 0.2373899221420288, "rewards/accuracy_reward_stage2": 0.8427289128303528, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3186 }, { "completion_length": 13.734375, "epoch": 0.5584370071841598, "grad_norm": 22.747635084082575, "kl": 0.1591796875, "learning_rate": 4.417382162256877e-07, "loss": 0.0637, "reward": 1.4030470848083496, "reward_std": 0.24681012332439423, "rewards/accuracy_reward_stage2": 0.5280469655990601, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3187 }, { "completion_length": 10.8125, "epoch": 0.5586122305940073, "grad_norm": 16.6917204221685, "kl": 0.130859375, "learning_rate": 4.415629928158402e-07, "loss": 0.0081, "reward": 1.6642496585845947, "reward_std": 0.23828163743019104, "rewards/accuracy_reward_stage2": 0.6798745393753052, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3188 }, { "completion_length": 9.03125, "epoch": 0.5587874540038549, "grad_norm": 16.825558367044195, "kl": 0.1015625, "learning_rate": 4.413877694059926e-07, "loss": 0.0018, "reward": 1.5643525123596191, "reward_std": 0.2210099995136261, "rewards/accuracy_reward_stage2": 0.7049775123596191, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3189 }, { "completion_length": 9.703125, "epoch": 0.5589626774137024, "grad_norm": 20.641601437340125, "kl": 0.051513671875, "learning_rate": 4.4121254599614504e-07, "loss": 0.0206, "reward": 1.7616381645202637, "reward_std": 0.19968253374099731, "rewards/accuracy_reward_stage2": 0.7616380453109741, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3190 }, { "completion_length": 10.546875, "epoch": 0.55913790082355, "grad_norm": 20.21219529381267, "kl": 0.08349609375, "learning_rate": 4.410373225862975e-07, "loss": 0.0332, "reward": 1.7175178527832031, "reward_std": 0.15806759893894196, "rewards/accuracy_reward_stage2": 0.8425179123878479, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3191 }, { "completion_length": 19.6875, "epoch": 0.5593131242333976, "grad_norm": 18.111853588114563, "kl": 0.1240234375, "learning_rate": 4.4086209917644997e-07, "loss": -0.0375, "reward": 1.7544395923614502, "reward_std": 0.24202269315719604, "rewards/accuracy_reward_stage2": 0.7856895923614502, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3192 }, { "completion_length": 9.84375, "epoch": 0.5594883476432452, "grad_norm": 20.112560254113447, "kl": 0.1767578125, "learning_rate": 4.406868757666024e-07, "loss": -0.0124, "reward": 1.3385369777679443, "reward_std": 0.36554330587387085, "rewards/accuracy_reward_stage2": 0.49478694796562195, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3193 }, { "completion_length": 6.84375, "epoch": 0.5596635710530927, "grad_norm": 19.564347720470963, "kl": 0.283203125, "learning_rate": 4.4051165235675485e-07, "loss": -0.0118, "reward": 1.7647864818572998, "reward_std": 0.2844822108745575, "rewards/accuracy_reward_stage2": 0.8272864818572998, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3194 }, { "completion_length": 13.40625, "epoch": 0.5598387944629403, "grad_norm": 17.476056355454983, "kl": 0.0703125, "learning_rate": 4.403364289469073e-07, "loss": 0.0281, "reward": 1.5582443475723267, "reward_std": 0.26723265647888184, "rewards/accuracy_reward_stage2": 0.6832443475723267, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3195 }, { "completion_length": 9.625, "epoch": 0.5600140178727878, "grad_norm": 20.52858424291766, "kl": 0.26171875, "learning_rate": 4.4016120553705973e-07, "loss": 0.0215, "reward": 1.647862195968628, "reward_std": 0.20478901267051697, "rewards/accuracy_reward_stage2": 0.6947371363639832, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3196 }, { "completion_length": 9.984375, "epoch": 0.5601892412826354, "grad_norm": 11.87473457027194, "kl": 0.049072265625, "learning_rate": 4.3998598212721217e-07, "loss": 0.0196, "reward": 1.3483126163482666, "reward_std": 0.12753789126873016, "rewards/accuracy_reward_stage2": 0.4733126163482666, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3197 }, { "completion_length": 11.765625, "epoch": 0.5603644646924829, "grad_norm": 16.538140753582518, "kl": 0.2470703125, "learning_rate": 4.398107587173646e-07, "loss": 0.0256, "reward": 1.498239517211914, "reward_std": 0.23242546617984772, "rewards/accuracy_reward_stage2": 0.7638646364212036, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3198 }, { "completion_length": 11.109375, "epoch": 0.5605396881023305, "grad_norm": 23.394955904871278, "kl": 0.2890625, "learning_rate": 4.3963553530751705e-07, "loss": 0.0801, "reward": 1.5308568477630615, "reward_std": 0.2671172618865967, "rewards/accuracy_reward_stage2": 0.671481728553772, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3199 }, { "completion_length": 9.875, "epoch": 0.560714911512178, "grad_norm": 15.441451965197013, "kl": 0.109375, "learning_rate": 4.3946031189766954e-07, "loss": 0.0436, "reward": 1.3800475597381592, "reward_std": 0.08413176238536835, "rewards/accuracy_reward_stage2": 0.5050475597381592, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3200 }, { "completion_length": 6.890625, "epoch": 0.5608901349220256, "grad_norm": 21.575708325957677, "kl": 0.09228515625, "learning_rate": 4.39285088487822e-07, "loss": 0.0369, "reward": 1.7437188625335693, "reward_std": 0.21275295317173004, "rewards/accuracy_reward_stage2": 0.7437188625335693, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3201 }, { "completion_length": 11.671875, "epoch": 0.5610653583318731, "grad_norm": 162.16611459049957, "kl": 1.1640625, "learning_rate": 4.3910986507797436e-07, "loss": 0.3752, "reward": 1.500571846961975, "reward_std": 0.17933861911296844, "rewards/accuracy_reward_stage2": 0.6568217873573303, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3202 }, { "completion_length": 7.109375, "epoch": 0.5612405817417206, "grad_norm": 12.19803769706041, "kl": 0.0146484375, "learning_rate": 4.389346416681268e-07, "loss": 0.0059, "reward": 1.796875, "reward_std": 0.189372718334198, "rewards/accuracy_reward_stage2": 0.796875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3203 }, { "completion_length": 14.3125, "epoch": 0.5614158051515683, "grad_norm": 18.535694073248635, "kl": 0.034423828125, "learning_rate": 4.387594182582793e-07, "loss": 0.0138, "reward": 1.563488245010376, "reward_std": 0.1409141570329666, "rewards/accuracy_reward_stage2": 0.563488245010376, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3204 }, { "completion_length": 11.453125, "epoch": 0.5615910285614159, "grad_norm": 17.69222335890946, "kl": 0.28125, "learning_rate": 4.3858419484843174e-07, "loss": -0.0359, "reward": 1.5310032367706299, "reward_std": 0.16320835053920746, "rewards/accuracy_reward_stage2": 0.5935031175613403, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3205 }, { "completion_length": 13.0625, "epoch": 0.5617662519712634, "grad_norm": 14.769363797772218, "kl": 0.1240234375, "learning_rate": 4.384089714385842e-07, "loss": 0.0082, "reward": 1.7722058296203613, "reward_std": 0.1137927919626236, "rewards/accuracy_reward_stage2": 0.7878307700157166, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3206 }, { "completion_length": 17.765625, "epoch": 0.5619414753811109, "grad_norm": 18.148990937885742, "kl": 0.078125, "learning_rate": 4.382337480287366e-07, "loss": 0.0312, "reward": 1.401296615600586, "reward_std": 0.15238088369369507, "rewards/accuracy_reward_stage2": 0.5262964963912964, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3207 }, { "completion_length": 11.578125, "epoch": 0.5621166987909585, "grad_norm": 18.39241430040369, "kl": 0.19140625, "learning_rate": 4.380585246188891e-07, "loss": -0.0008, "reward": 1.5690919160842896, "reward_std": 0.36027538776397705, "rewards/accuracy_reward_stage2": 0.6003419160842896, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3208 }, { "completion_length": 10.71875, "epoch": 0.562291922200806, "grad_norm": 19.185613600966136, "kl": 0.107421875, "learning_rate": 4.378833012090415e-07, "loss": 0.022, "reward": 1.5785123109817505, "reward_std": 0.32527726888656616, "rewards/accuracy_reward_stage2": 0.5941373109817505, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3209 }, { "completion_length": 12.421875, "epoch": 0.5624671456106536, "grad_norm": 18.097225530702673, "kl": 0.11328125, "learning_rate": 4.3770807779919393e-07, "loss": -0.0333, "reward": 1.4982638359069824, "reward_std": 0.25345471501350403, "rewards/accuracy_reward_stage2": 0.5295138955116272, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3210 }, { "completion_length": 12.3125, "epoch": 0.5626423690205011, "grad_norm": 17.231740141514813, "kl": 0.0673828125, "learning_rate": 4.375328543893464e-07, "loss": 0.0269, "reward": 1.429835319519043, "reward_std": 0.1284599006175995, "rewards/accuracy_reward_stage2": 0.5548353791236877, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3211 }, { "completion_length": 19.421875, "epoch": 0.5628175924303487, "grad_norm": 20.159795366197336, "kl": 0.01055908203125, "learning_rate": 4.3735763097949887e-07, "loss": 0.0042, "reward": 1.4341033697128296, "reward_std": 0.15376178920269012, "rewards/accuracy_reward_stage2": 0.4341033101081848, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3212 }, { "completion_length": 9.421875, "epoch": 0.5629928158401962, "grad_norm": 21.621095116440348, "kl": 0.07080078125, "learning_rate": 4.371824075696513e-07, "loss": -0.016, "reward": 1.5738193988800049, "reward_std": 0.2998353838920593, "rewards/accuracy_reward_stage2": 0.5894443392753601, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3213 }, { "completion_length": 8.5625, "epoch": 0.5631680392500438, "grad_norm": 19.44838191969088, "kl": 0.275390625, "learning_rate": 4.3700718415980375e-07, "loss": -0.0095, "reward": 1.4690642356872559, "reward_std": 0.39073270559310913, "rewards/accuracy_reward_stage2": 0.5159392952919006, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3214 }, { "completion_length": 8.828125, "epoch": 0.5633432626598913, "grad_norm": 16.7926305447221, "kl": 0.022216796875, "learning_rate": 4.3683196074995613e-07, "loss": 0.0089, "reward": 1.6463541984558105, "reward_std": 0.11637798696756363, "rewards/accuracy_reward_stage2": 0.6463541984558105, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3215 }, { "completion_length": 7.921875, "epoch": 0.5635184860697389, "grad_norm": 16.575150696218373, "kl": 0.1103515625, "learning_rate": 4.366567373401086e-07, "loss": 0.0072, "reward": 1.7634837627410889, "reward_std": 0.24160407483577728, "rewards/accuracy_reward_stage2": 0.7791087627410889, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3216 }, { "completion_length": 12.75, "epoch": 0.5636937094795865, "grad_norm": 17.25097670858031, "kl": 0.08837890625, "learning_rate": 4.3648151393026107e-07, "loss": 0.0354, "reward": 1.5628836154937744, "reward_std": 0.27376827597618103, "rewards/accuracy_reward_stage2": 0.5628836154937744, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3217 }, { "completion_length": 10.8125, "epoch": 0.5638689328894341, "grad_norm": 16.21171001000319, "kl": 0.16015625, "learning_rate": 4.363062905204135e-07, "loss": -0.0196, "reward": 1.4027849435806274, "reward_std": 0.2692621946334839, "rewards/accuracy_reward_stage2": 0.4496598541736603, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3218 }, { "completion_length": 9.828125, "epoch": 0.5640441562992816, "grad_norm": 14.414151768836764, "kl": 0.025146484375, "learning_rate": 4.3613106711056594e-07, "loss": 0.0101, "reward": 1.7850942611694336, "reward_std": 0.09801465272903442, "rewards/accuracy_reward_stage2": 0.7850942611694336, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3219 }, { "completion_length": 13.453125, "epoch": 0.5642193797091292, "grad_norm": 24.6320558666092, "kl": 0.25, "learning_rate": 4.3595584370071844e-07, "loss": 0.0128, "reward": 1.4080870151519775, "reward_std": 0.2708454430103302, "rewards/accuracy_reward_stage2": 0.43933701515197754, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3220 }, { "completion_length": 8.171875, "epoch": 0.5643946031189767, "grad_norm": 28.597504490295133, "kl": 0.173828125, "learning_rate": 4.357806202908708e-07, "loss": 0.0429, "reward": 1.449331521987915, "reward_std": 0.32936540246009827, "rewards/accuracy_reward_stage2": 0.46495649218559265, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3221 }, { "completion_length": 8.234375, "epoch": 0.5645698265288243, "grad_norm": 16.205125598210785, "kl": 0.10595703125, "learning_rate": 4.3560539688102326e-07, "loss": -0.0019, "reward": 1.4258536100387573, "reward_std": 0.1293002814054489, "rewards/accuracy_reward_stage2": 0.5664785504341125, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3222 }, { "completion_length": 11.328125, "epoch": 0.5647450499386718, "grad_norm": 10.876812503612529, "kl": 0.07421875, "learning_rate": 4.354301734711757e-07, "loss": 0.0297, "reward": 1.643942952156067, "reward_std": 0.07415582239627838, "rewards/accuracy_reward_stage2": 0.7689428925514221, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3223 }, { "completion_length": 9.828125, "epoch": 0.5649202733485194, "grad_norm": 27.2544035722533, "kl": 0.0458984375, "learning_rate": 4.352549500613282e-07, "loss": 0.0183, "reward": 1.785082459449768, "reward_std": 0.24246467649936676, "rewards/accuracy_reward_stage2": 0.7850823402404785, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3224 }, { "completion_length": 7.09375, "epoch": 0.5650954967583669, "grad_norm": 14.083083155362187, "kl": 0.07861328125, "learning_rate": 4.3507972665148064e-07, "loss": 0.0314, "reward": 1.7869019508361816, "reward_std": 0.07399497926235199, "rewards/accuracy_reward_stage2": 0.7869018316268921, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3225 }, { "completion_length": 10.9375, "epoch": 0.5652707201682144, "grad_norm": 15.09002638533396, "kl": 0.07470703125, "learning_rate": 4.349045032416331e-07, "loss": -0.0014, "reward": 1.5789334774017334, "reward_std": 0.22475594282150269, "rewards/accuracy_reward_stage2": 0.5945584774017334, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3226 }, { "completion_length": 9.21875, "epoch": 0.565445943578062, "grad_norm": 22.039352896693494, "kl": 0.205078125, "learning_rate": 4.347292798317855e-07, "loss": 0.0593, "reward": 1.3339595794677734, "reward_std": 0.2276839017868042, "rewards/accuracy_reward_stage2": 0.6152095198631287, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3227 }, { "completion_length": 7.640625, "epoch": 0.5656211669879095, "grad_norm": 19.61843143593303, "kl": 0.04150390625, "learning_rate": 4.3455405642193795e-07, "loss": 0.0166, "reward": 1.7379176616668701, "reward_std": 0.18134930729866028, "rewards/accuracy_reward_stage2": 0.7379177808761597, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3228 }, { "completion_length": 10.953125, "epoch": 0.5657963903977571, "grad_norm": 21.3555429487247, "kl": 0.2578125, "learning_rate": 4.343788330120904e-07, "loss": 0.0654, "reward": 1.4483630657196045, "reward_std": 0.38449281454086304, "rewards/accuracy_reward_stage2": 0.5889881253242493, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3229 }, { "completion_length": 14.984375, "epoch": 0.5659716138076047, "grad_norm": 16.71334086382943, "kl": 0.06396484375, "learning_rate": 4.3420360960224283e-07, "loss": 0.0256, "reward": 1.512737512588501, "reward_std": 0.15234015882015228, "rewards/accuracy_reward_stage2": 0.512737512588501, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3230 }, { "completion_length": 8.125, "epoch": 0.5661468372174523, "grad_norm": 22.757742414149945, "kl": 0.162109375, "learning_rate": 4.3402838619239527e-07, "loss": 0.0647, "reward": 1.5299479961395264, "reward_std": 0.23831304907798767, "rewards/accuracy_reward_stage2": 0.5299479365348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3231 }, { "completion_length": 12.828125, "epoch": 0.5663220606272998, "grad_norm": 19.429740807099193, "kl": 0.08251953125, "learning_rate": 4.3385316278254777e-07, "loss": -0.0057, "reward": 1.5748710632324219, "reward_std": 0.21447551250457764, "rewards/accuracy_reward_stage2": 0.5904961824417114, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3232 }, { "completion_length": 9.625, "epoch": 0.5664972840371474, "grad_norm": 17.21733104459262, "kl": 0.09130859375, "learning_rate": 4.336779393727002e-07, "loss": 0.0363, "reward": 1.4882630109786987, "reward_std": 0.2861184775829315, "rewards/accuracy_reward_stage2": 0.6132630109786987, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3233 }, { "completion_length": 8.5625, "epoch": 0.5666725074469949, "grad_norm": 17.967560787175596, "kl": 0.197265625, "learning_rate": 4.335027159628526e-07, "loss": -0.007, "reward": 1.676719307899475, "reward_std": 0.2228265404701233, "rewards/accuracy_reward_stage2": 0.7079692482948303, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3234 }, { "completion_length": 11.28125, "epoch": 0.5668477308568425, "grad_norm": 16.682095638705565, "kl": 0.21875, "learning_rate": 4.3332749255300503e-07, "loss": -0.0576, "reward": 1.720862865447998, "reward_std": 0.27902647852897644, "rewards/accuracy_reward_stage2": 0.7833628058433533, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3235 }, { "completion_length": 13.171875, "epoch": 0.56702295426669, "grad_norm": 22.178935418309496, "kl": 0.11328125, "learning_rate": 4.331522691431575e-07, "loss": -0.0209, "reward": 1.4929907321929932, "reward_std": 0.2933758497238159, "rewards/accuracy_reward_stage2": 0.5242406129837036, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3236 }, { "completion_length": 7.875, "epoch": 0.5671981776765376, "grad_norm": 18.121307300594328, "kl": 0.0888671875, "learning_rate": 4.3297704573330996e-07, "loss": 0.0354, "reward": 1.462658405303955, "reward_std": 0.20405489206314087, "rewards/accuracy_reward_stage2": 0.5876583456993103, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3237 }, { "completion_length": 11.8125, "epoch": 0.5673734010863851, "grad_norm": 13.614637902537446, "kl": 0.189453125, "learning_rate": 4.328018223234624e-07, "loss": -0.0101, "reward": 1.1695168018341064, "reward_std": 0.15401369333267212, "rewards/accuracy_reward_stage2": 0.4507666826248169, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3238 }, { "completion_length": 12.296875, "epoch": 0.5675486244962327, "grad_norm": 19.40824248666292, "kl": 0.126953125, "learning_rate": 4.3262659891361484e-07, "loss": 0.0153, "reward": 1.4984833002090454, "reward_std": 0.2742450535297394, "rewards/accuracy_reward_stage2": 0.5141082406044006, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3239 }, { "completion_length": 12.03125, "epoch": 0.5677238479060802, "grad_norm": 18.540362816237884, "kl": 0.03857421875, "learning_rate": 4.324513755037673e-07, "loss": 0.0154, "reward": 1.5752973556518555, "reward_std": 0.202922523021698, "rewards/accuracy_reward_stage2": 0.5752974152565002, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3240 }, { "completion_length": 11.09375, "epoch": 0.5678990713159278, "grad_norm": 14.610240774086812, "kl": 0.2734375, "learning_rate": 4.322761520939197e-07, "loss": -0.0418, "reward": 1.4417760372161865, "reward_std": 0.21922755241394043, "rewards/accuracy_reward_stage2": 0.5042760372161865, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3241 }, { "completion_length": 5.53125, "epoch": 0.5680742947257753, "grad_norm": 21.848077026052675, "kl": 0.10693359375, "learning_rate": 4.3210092868407216e-07, "loss": -0.0012, "reward": 1.519402265548706, "reward_std": 0.22549188137054443, "rewards/accuracy_reward_stage2": 0.5350273251533508, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3242 }, { "completion_length": 10.984375, "epoch": 0.568249518135623, "grad_norm": 18.597432240492093, "kl": 0.2080078125, "learning_rate": 4.319257052742246e-07, "loss": -0.0013, "reward": 1.4672976732254028, "reward_std": 0.2723372280597687, "rewards/accuracy_reward_stage2": 0.49854767322540283, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3243 }, { "completion_length": 8.015625, "epoch": 0.5684247415454705, "grad_norm": 15.128065729385757, "kl": 0.1796875, "learning_rate": 4.317504818643771e-07, "loss": 0.0051, "reward": 1.5128765106201172, "reward_std": 0.23403945565223694, "rewards/accuracy_reward_stage2": 0.6535014510154724, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3244 }, { "completion_length": 11.734375, "epoch": 0.5685999649553181, "grad_norm": 14.460722192729529, "kl": 0.2021484375, "learning_rate": 4.3157525845452953e-07, "loss": -0.0005, "reward": 1.4392145872116089, "reward_std": 0.16994988918304443, "rewards/accuracy_reward_stage2": 0.5954645276069641, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3245 }, { "completion_length": 7.359375, "epoch": 0.5687751883651656, "grad_norm": 28.45521949425175, "kl": 0.2734375, "learning_rate": 4.31400035044682e-07, "loss": 0.0649, "reward": 1.4389383792877197, "reward_std": 0.3385145366191864, "rewards/accuracy_reward_stage2": 0.4545634388923645, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3246 }, { "completion_length": 8.421875, "epoch": 0.5689504117750132, "grad_norm": 54.59620973864808, "kl": 0.5546875, "learning_rate": 4.3122481163483436e-07, "loss": 0.003, "reward": 1.2950458526611328, "reward_std": 0.1936914175748825, "rewards/accuracy_reward_stage2": 0.6544209122657776, "rewards/format_reward_stage1_pointerpad": 0.640625, "scores/accuracy_reward_stage2": 0.640625, "step": 3247 }, { "completion_length": 9.9375, "epoch": 0.5691256351848607, "grad_norm": 21.733520858543173, "kl": 0.287109375, "learning_rate": 4.3104958822498685e-07, "loss": -0.0068, "reward": 1.4032741785049438, "reward_std": 0.3143426775932312, "rewards/accuracy_reward_stage2": 0.45014917850494385, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3248 }, { "completion_length": 11.03125, "epoch": 0.5693008585947082, "grad_norm": 28.34829740951872, "kl": 0.08642578125, "learning_rate": 4.308743648151393e-07, "loss": -0.0097, "reward": 1.5421795845031738, "reward_std": 0.2444736212491989, "rewards/accuracy_reward_stage2": 0.557804524898529, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3249 }, { "completion_length": 12.578125, "epoch": 0.5694760820045558, "grad_norm": 19.836738683639865, "kl": 0.15234375, "learning_rate": 4.3069914140529173e-07, "loss": -0.0679, "reward": 1.429978847503662, "reward_std": 0.3472153842449188, "rewards/accuracy_reward_stage2": 0.4768539369106293, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3250 }, { "completion_length": 10.46875, "epoch": 0.5696513054144033, "grad_norm": 21.483227411450418, "kl": 0.0625, "learning_rate": 4.3052391799544417e-07, "loss": -0.0191, "reward": 1.6257601976394653, "reward_std": 0.26404887437820435, "rewards/accuracy_reward_stage2": 0.6413851976394653, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3251 }, { "completion_length": 7.140625, "epoch": 0.5698265288242509, "grad_norm": 19.57299909193561, "kl": 0.12158203125, "learning_rate": 4.3034869458559666e-07, "loss": 0.0206, "reward": 1.6792659759521484, "reward_std": 0.2793692350387573, "rewards/accuracy_reward_stage2": 0.6948908567428589, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3252 }, { "completion_length": 9.1875, "epoch": 0.5700017522340984, "grad_norm": 19.498853474073666, "kl": 0.05810546875, "learning_rate": 4.3017347117574905e-07, "loss": 0.0232, "reward": 1.5725898742675781, "reward_std": 0.1827564537525177, "rewards/accuracy_reward_stage2": 0.6975898742675781, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3253 }, { "completion_length": 9.0625, "epoch": 0.570176975643946, "grad_norm": 12.837473511274942, "kl": 0.11083984375, "learning_rate": 4.299982477659015e-07, "loss": 0.0253, "reward": 1.653282880783081, "reward_std": 0.1653340756893158, "rewards/accuracy_reward_stage2": 0.6689077615737915, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3254 }, { "completion_length": 9.640625, "epoch": 0.5703521990537936, "grad_norm": 25.24313558707273, "kl": 0.09912109375, "learning_rate": 4.2982302435605393e-07, "loss": 0.0396, "reward": 1.8017048835754395, "reward_std": 0.19715824723243713, "rewards/accuracy_reward_stage2": 0.8017048835754395, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3255 }, { "completion_length": 8.796875, "epoch": 0.5705274224636412, "grad_norm": 18.519638123786716, "kl": 0.06591796875, "learning_rate": 4.2964780094620637e-07, "loss": 0.0263, "reward": 1.553621768951416, "reward_std": 0.22481247782707214, "rewards/accuracy_reward_stage2": 0.5536218881607056, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3256 }, { "completion_length": 9.71875, "epoch": 0.5707026458734887, "grad_norm": 19.563549003512524, "kl": 0.1748046875, "learning_rate": 4.2947257753635886e-07, "loss": 0.0263, "reward": 1.7168099880218506, "reward_std": 0.3916775584220886, "rewards/accuracy_reward_stage2": 0.7324349880218506, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3257 }, { "completion_length": 15.375, "epoch": 0.5708778692833363, "grad_norm": 17.002630755032023, "kl": 0.0888671875, "learning_rate": 4.292973541265113e-07, "loss": -0.0063, "reward": 1.3024250268936157, "reward_std": 0.2217259705066681, "rewards/accuracy_reward_stage2": 0.44304996728897095, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3258 }, { "completion_length": 8.5625, "epoch": 0.5710530926931838, "grad_norm": 18.175895476432554, "kl": 0.158203125, "learning_rate": 4.291221307166637e-07, "loss": 0.0192, "reward": 1.3714404106140137, "reward_std": 0.17032676935195923, "rewards/accuracy_reward_stage2": 0.6370653510093689, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3259 }, { "completion_length": 12.046875, "epoch": 0.5712283161030314, "grad_norm": 19.595931739238125, "kl": 0.072265625, "learning_rate": 4.2894690730681613e-07, "loss": 0.0289, "reward": 1.6014660596847534, "reward_std": 0.24930503964424133, "rewards/accuracy_reward_stage2": 0.6014660000801086, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3260 }, { "completion_length": 9.359375, "epoch": 0.5714035395128789, "grad_norm": 22.378946777705135, "kl": 0.205078125, "learning_rate": 4.287716838969686e-07, "loss": 0.0454, "reward": 1.6707240343093872, "reward_std": 0.2596997022628784, "rewards/accuracy_reward_stage2": 0.6863489747047424, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3261 }, { "completion_length": 8.875, "epoch": 0.5715787629227265, "grad_norm": 13.881901021806375, "kl": 0.099609375, "learning_rate": 4.2859646048712106e-07, "loss": -0.002, "reward": 1.7320775985717773, "reward_std": 0.10933174937963486, "rewards/accuracy_reward_stage2": 0.7477025985717773, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3262 }, { "completion_length": 6.3125, "epoch": 0.571753986332574, "grad_norm": 17.567583797864298, "kl": 0.0732421875, "learning_rate": 4.284212370772735e-07, "loss": -0.0124, "reward": 1.6161859035491943, "reward_std": 0.2978987991809845, "rewards/accuracy_reward_stage2": 0.6318109035491943, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3263 }, { "completion_length": 11.5625, "epoch": 0.5719292097424216, "grad_norm": 22.225003221781638, "kl": 0.15625, "learning_rate": 4.2824601366742594e-07, "loss": 0.0625, "reward": 1.4586200714111328, "reward_std": 0.3100087344646454, "rewards/accuracy_reward_stage2": 0.45862019062042236, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3264 }, { "completion_length": 10.140625, "epoch": 0.5721044331522691, "grad_norm": 17.636927566313457, "kl": 0.0859375, "learning_rate": 4.2807079025757843e-07, "loss": -0.0056, "reward": 1.7124474048614502, "reward_std": 0.19638592004776, "rewards/accuracy_reward_stage2": 0.7280724048614502, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3265 }, { "completion_length": 10.859375, "epoch": 0.5722796565621167, "grad_norm": 15.538390882765091, "kl": 0.1376953125, "learning_rate": 4.278955668477308e-07, "loss": 0.0109, "reward": 1.5744549036026, "reward_std": 0.17407383024692535, "rewards/accuracy_reward_stage2": 0.5900799036026001, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3266 }, { "completion_length": 6.171875, "epoch": 0.5724548799719642, "grad_norm": 8.95489831645316, "kl": 0.072265625, "learning_rate": 4.2772034343788326e-07, "loss": 0.0289, "reward": 1.5, "reward_std": 0.1157275140285492, "rewards/accuracy_reward_stage2": 0.625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3267 }, { "completion_length": 9.671875, "epoch": 0.5726301033818119, "grad_norm": 23.077475062509656, "kl": 0.1298828125, "learning_rate": 4.275451200280357e-07, "loss": 0.0518, "reward": 1.4160887002944946, "reward_std": 0.25131142139434814, "rewards/accuracy_reward_stage2": 0.5410886406898499, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3268 }, { "completion_length": 9.1875, "epoch": 0.5728053267916594, "grad_norm": 64.54471420516117, "kl": 0.50390625, "learning_rate": 4.273698966181882e-07, "loss": 0.2022, "reward": 1.4653161764144897, "reward_std": 0.1672295182943344, "rewards/accuracy_reward_stage2": 0.715316116809845, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3269 }, { "completion_length": 8.71875, "epoch": 0.572980550201507, "grad_norm": 20.224070237612608, "kl": 0.09521484375, "learning_rate": 4.2719467320834063e-07, "loss": 0.038, "reward": 1.7414612770080566, "reward_std": 0.14439380168914795, "rewards/accuracy_reward_stage2": 0.7414612174034119, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3270 }, { "completion_length": 11.921875, "epoch": 0.5731557736113545, "grad_norm": 11.893100562966017, "kl": 0.10595703125, "learning_rate": 4.2701944979849307e-07, "loss": -0.0901, "reward": 1.6197917461395264, "reward_std": 0.2911488711833954, "rewards/accuracy_reward_stage2": 0.6666666269302368, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3271 }, { "completion_length": 13.0, "epoch": 0.573330997021202, "grad_norm": 18.123729535165488, "kl": 0.1103515625, "learning_rate": 4.2684422638864546e-07, "loss": 0.0443, "reward": 1.8838486671447754, "reward_std": 0.19200080633163452, "rewards/accuracy_reward_stage2": 0.8838486671447754, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3272 }, { "completion_length": 12.796875, "epoch": 0.5735062204310496, "grad_norm": 16.187098739667356, "kl": 0.166015625, "learning_rate": 4.2666900297879795e-07, "loss": 0.0375, "reward": 1.6712990999221802, "reward_std": 0.21961762011051178, "rewards/accuracy_reward_stage2": 0.6869240999221802, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3273 }, { "completion_length": 11.578125, "epoch": 0.5736814438408971, "grad_norm": 17.906472432934265, "kl": 0.07177734375, "learning_rate": 4.264937795689504e-07, "loss": 0.0288, "reward": 1.4957730770111084, "reward_std": 0.1836545616388321, "rewards/accuracy_reward_stage2": 0.4957730174064636, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3274 }, { "completion_length": 10.875, "epoch": 0.5738566672507447, "grad_norm": 15.113223087609942, "kl": 0.126953125, "learning_rate": 4.2631855615910283e-07, "loss": 0.0066, "reward": 1.6197917461395264, "reward_std": 0.21341678500175476, "rewards/accuracy_reward_stage2": 0.8697916269302368, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3275 }, { "completion_length": 14.546875, "epoch": 0.5740318906605922, "grad_norm": 25.882139996220964, "kl": 0.1396484375, "learning_rate": 4.2614333274925527e-07, "loss": 0.056, "reward": 1.4892785549163818, "reward_std": 0.3860580623149872, "rewards/accuracy_reward_stage2": 0.6142784357070923, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3276 }, { "completion_length": 9.484375, "epoch": 0.5742071140704398, "grad_norm": 19.216446272569474, "kl": 0.036865234375, "learning_rate": 4.2596810933940776e-07, "loss": 0.0148, "reward": 1.5715553760528564, "reward_std": 0.15116415917873383, "rewards/accuracy_reward_stage2": 0.6965553760528564, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3277 }, { "completion_length": 12.375, "epoch": 0.5743823374802873, "grad_norm": 21.357406798195083, "kl": 0.130859375, "learning_rate": 4.257928859295602e-07, "loss": 0.0244, "reward": 1.8721911907196045, "reward_std": 0.24186889827251434, "rewards/accuracy_reward_stage2": 0.8878160715103149, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3278 }, { "completion_length": 7.1875, "epoch": 0.5745575608901349, "grad_norm": 19.448846480307616, "kl": 0.13671875, "learning_rate": 4.256176625197126e-07, "loss": 0.0546, "reward": 1.5123186111450195, "reward_std": 0.18857964873313904, "rewards/accuracy_reward_stage2": 0.5123186111450195, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3279 }, { "completion_length": 15.390625, "epoch": 0.5747327842999824, "grad_norm": 21.288459897921552, "kl": 0.1552734375, "learning_rate": 4.2544243910986503e-07, "loss": 0.0333, "reward": 1.317323923110962, "reward_std": 0.1799907237291336, "rewards/accuracy_reward_stage2": 0.4579489529132843, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3280 }, { "completion_length": 15.078125, "epoch": 0.5749080077098301, "grad_norm": 21.07082925093428, "kl": 0.1865234375, "learning_rate": 4.252672157000175e-07, "loss": 0.0395, "reward": 1.291269063949585, "reward_std": 0.18487222492694855, "rewards/accuracy_reward_stage2": 0.5568939447402954, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3281 }, { "completion_length": 18.15625, "epoch": 0.5750832311196776, "grad_norm": 19.913425559679723, "kl": 0.138671875, "learning_rate": 4.2509199229016996e-07, "loss": 0.0557, "reward": 1.2240808010101318, "reward_std": 0.22274786233901978, "rewards/accuracy_reward_stage2": 0.47408074140548706, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3282 }, { "completion_length": 11.765625, "epoch": 0.5752584545295252, "grad_norm": 21.420756353044013, "kl": 0.05078125, "learning_rate": 4.249167688803224e-07, "loss": 0.0203, "reward": 1.5513889789581299, "reward_std": 0.2648440897464752, "rewards/accuracy_reward_stage2": 0.6763889193534851, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3283 }, { "completion_length": 8.15625, "epoch": 0.5754336779393727, "grad_norm": 17.8684394818383, "kl": 0.054931640625, "learning_rate": 4.2474154547047484e-07, "loss": 0.0219, "reward": 1.5660185813903809, "reward_std": 0.2910422384738922, "rewards/accuracy_reward_stage2": 0.5660187005996704, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3284 }, { "completion_length": 7.75, "epoch": 0.5756089013492203, "grad_norm": 4.057894997719182, "kl": 0.04150390625, "learning_rate": 4.245663220606273e-07, "loss": -0.0276, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward_stage2": 0.875, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3285 }, { "completion_length": 8.953125, "epoch": 0.5757841247590678, "grad_norm": 22.31527931446079, "kl": 0.1435546875, "learning_rate": 4.243910986507797e-07, "loss": 0.0478, "reward": 1.3714749813079834, "reward_std": 0.2784489393234253, "rewards/accuracy_reward_stage2": 0.5120998620986938, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3286 }, { "completion_length": 7.671875, "epoch": 0.5759593481689154, "grad_norm": 17.750280283958453, "kl": 0.1279296875, "learning_rate": 4.2421587524093216e-07, "loss": 0.008, "reward": 1.673346996307373, "reward_std": 0.20908769965171814, "rewards/accuracy_reward_stage2": 0.8139719367027283, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3287 }, { "completion_length": 8.515625, "epoch": 0.5761345715787629, "grad_norm": 28.452469508830507, "kl": 0.201171875, "learning_rate": 4.240406518310846e-07, "loss": 0.0459, "reward": 1.407305121421814, "reward_std": 0.3242839276790619, "rewards/accuracy_reward_stage2": 0.547930121421814, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3288 }, { "completion_length": 9.625, "epoch": 0.5763097949886105, "grad_norm": 20.764420247606466, "kl": 0.158203125, "learning_rate": 4.238654284212371e-07, "loss": 0.0632, "reward": 1.3577320575714111, "reward_std": 0.3342527151107788, "rewards/accuracy_reward_stage2": 0.6077320575714111, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3289 }, { "completion_length": 7.265625, "epoch": 0.576485018398458, "grad_norm": 13.598394732574489, "kl": 0.1103515625, "learning_rate": 4.2369020501138953e-07, "loss": -0.0002, "reward": 1.894986629486084, "reward_std": 0.11542447656393051, "rewards/accuracy_reward_stage2": 0.910611629486084, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3290 }, { "completion_length": 7.875, "epoch": 0.5766602418083056, "grad_norm": 14.29332734624939, "kl": 0.0252685546875, "learning_rate": 4.235149816015419e-07, "loss": 0.0101, "reward": 1.828125, "reward_std": 0.10311973094940186, "rewards/accuracy_reward_stage2": 0.828125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3291 }, { "completion_length": 10.6875, "epoch": 0.5768354652181531, "grad_norm": 18.227115369491855, "kl": 0.03564453125, "learning_rate": 4.2333975819169436e-07, "loss": 0.0143, "reward": 1.329542875289917, "reward_std": 0.17716088891029358, "rewards/accuracy_reward_stage2": 0.5795428156852722, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3292 }, { "completion_length": 9.984375, "epoch": 0.5770106886280008, "grad_norm": 17.617103800867277, "kl": 0.064453125, "learning_rate": 4.2316453478184685e-07, "loss": 0.0258, "reward": 1.662729024887085, "reward_std": 0.15877869725227356, "rewards/accuracy_reward_stage2": 0.662729024887085, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3293 }, { "completion_length": 8.28125, "epoch": 0.5771859120378483, "grad_norm": 31.01227260578467, "kl": 0.047607421875, "learning_rate": 4.229893113719993e-07, "loss": 0.0191, "reward": 1.683934211730957, "reward_std": 0.22675946354866028, "rewards/accuracy_reward_stage2": 0.683934211730957, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3294 }, { "completion_length": 7.515625, "epoch": 0.5773611354476959, "grad_norm": 12.294745744586768, "kl": 0.1796875, "learning_rate": 4.2281408796215173e-07, "loss": -0.0054, "reward": 1.859375, "reward_std": 0.20189079642295837, "rewards/accuracy_reward_stage2": 0.890625, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3295 }, { "completion_length": 8.515625, "epoch": 0.5775363588575434, "grad_norm": 12.746122951348797, "kl": 0.09033203125, "learning_rate": 4.2263886455230417e-07, "loss": -0.0081, "reward": 1.6589066982269287, "reward_std": 0.1634528785943985, "rewards/accuracy_reward_stage2": 0.6745317578315735, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3296 }, { "completion_length": 11.21875, "epoch": 0.5777115822673909, "grad_norm": 18.010685879221356, "kl": 0.04736328125, "learning_rate": 4.2246364114245666e-07, "loss": 0.0189, "reward": 1.5710866451263428, "reward_std": 0.16242891550064087, "rewards/accuracy_reward_stage2": 0.6960866451263428, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3297 }, { "completion_length": 17.09375, "epoch": 0.5778868056772385, "grad_norm": 16.5643178297475, "kl": 0.034423828125, "learning_rate": 4.2228841773260905e-07, "loss": 0.0138, "reward": 1.6158677339553833, "reward_std": 0.1544964611530304, "rewards/accuracy_reward_stage2": 0.6158677339553833, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3298 }, { "completion_length": 7.578125, "epoch": 0.578062029087086, "grad_norm": 21.732843806909315, "kl": 0.060302734375, "learning_rate": 4.221131943227615e-07, "loss": 0.0241, "reward": 1.7844496965408325, "reward_std": 0.1314665973186493, "rewards/accuracy_reward_stage2": 0.7844497561454773, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3299 }, { "completion_length": 8.671875, "epoch": 0.5782372524969336, "grad_norm": 18.278951218401343, "kl": 0.033447265625, "learning_rate": 4.219379709129139e-07, "loss": 0.0134, "reward": 1.2312500476837158, "reward_std": 0.1976664662361145, "rewards/accuracy_reward_stage2": 0.23125001788139343, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3300 }, { "completion_length": 10.265625, "epoch": 0.5784124759067811, "grad_norm": 85.56994101305463, "kl": 0.466796875, "learning_rate": 4.217627475030664e-07, "loss": 0.1873, "reward": 1.397374153137207, "reward_std": 0.17502638697624207, "rewards/accuracy_reward_stage2": 0.5223740935325623, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3301 }, { "completion_length": 17.734375, "epoch": 0.5785876993166287, "grad_norm": 13.14692014606543, "kl": 0.10400390625, "learning_rate": 4.2158752409321886e-07, "loss": 0.0026, "reward": 1.8054771423339844, "reward_std": 0.22878023982048035, "rewards/accuracy_reward_stage2": 0.8211021423339844, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3302 }, { "completion_length": 12.46875, "epoch": 0.5787629227264762, "grad_norm": 26.435353069087956, "kl": 0.1005859375, "learning_rate": 4.214123006833713e-07, "loss": 0.0402, "reward": 1.5523478984832764, "reward_std": 0.25822532176971436, "rewards/accuracy_reward_stage2": 0.5523478388786316, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3303 }, { "completion_length": 7.5625, "epoch": 0.5789381461363238, "grad_norm": 17.94646914207544, "kl": 0.1337890625, "learning_rate": 4.212370772735237e-07, "loss": 0.0168, "reward": 1.4311730861663818, "reward_std": 0.22365109622478485, "rewards/accuracy_reward_stage2": 0.44679805636405945, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3304 }, { "completion_length": 8.125, "epoch": 0.5791133695461713, "grad_norm": 20.73112396046394, "kl": 0.1484375, "learning_rate": 4.210618538636762e-07, "loss": 0.0046, "reward": 1.682603120803833, "reward_std": 0.2785916328430176, "rewards/accuracy_reward_stage2": 0.7138531804084778, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3305 }, { "completion_length": 12.78125, "epoch": 0.579288592956019, "grad_norm": 17.497448798545236, "kl": 0.150390625, "learning_rate": 4.208866304538286e-07, "loss": 0.0432, "reward": 1.5918843746185303, "reward_std": 0.16210268437862396, "rewards/accuracy_reward_stage2": 0.7325093746185303, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3306 }, { "completion_length": 8.953125, "epoch": 0.5794638163658665, "grad_norm": 18.037814272784697, "kl": 0.10009765625, "learning_rate": 4.2071140704398106e-07, "loss": 0.0401, "reward": 1.8454476594924927, "reward_std": 0.18337611854076385, "rewards/accuracy_reward_stage2": 0.8454477190971375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3307 }, { "completion_length": 12.078125, "epoch": 0.5796390397757141, "grad_norm": 18.95994598789497, "kl": 0.08837890625, "learning_rate": 4.205361836341335e-07, "loss": 0.0354, "reward": 1.4737579822540283, "reward_std": 0.13294795155525208, "rewards/accuracy_reward_stage2": 0.4737580418586731, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3308 }, { "completion_length": 9.515625, "epoch": 0.5798142631855616, "grad_norm": 10.774608652834829, "kl": 0.12890625, "learning_rate": 4.20360960224286e-07, "loss": 0.0075, "reward": 1.5416667461395264, "reward_std": 0.1473139077425003, "rewards/accuracy_reward_stage2": 0.6822916865348816, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3309 }, { "completion_length": 9.1875, "epoch": 0.5799894865954092, "grad_norm": 17.368178364877604, "kl": 0.035400390625, "learning_rate": 4.201857368144384e-07, "loss": 0.0141, "reward": 1.4706447124481201, "reward_std": 0.23802317678928375, "rewards/accuracy_reward_stage2": 0.47064468264579773, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3310 }, { "completion_length": 9.328125, "epoch": 0.5801647100052567, "grad_norm": 20.77313014791262, "kl": 0.08203125, "learning_rate": 4.200105134045908e-07, "loss": 0.0328, "reward": 1.747942566871643, "reward_std": 0.24145202338695526, "rewards/accuracy_reward_stage2": 0.7479425668716431, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3311 }, { "completion_length": 9.625, "epoch": 0.5803399334151043, "grad_norm": 18.866424210680346, "kl": 0.06591796875, "learning_rate": 4.1983528999474325e-07, "loss": 0.0263, "reward": 1.5332136154174805, "reward_std": 0.08188802003860474, "rewards/accuracy_reward_stage2": 0.7832136750221252, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3312 }, { "completion_length": 13.578125, "epoch": 0.5805151568249518, "grad_norm": 21.807267192943407, "kl": 0.07568359375, "learning_rate": 4.1966006658489575e-07, "loss": 0.0303, "reward": 1.578660011291504, "reward_std": 0.20007401704788208, "rewards/accuracy_reward_stage2": 0.5786599516868591, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3313 }, { "completion_length": 25.265625, "epoch": 0.5806903802347994, "grad_norm": 20.275197723166627, "kl": 0.046142578125, "learning_rate": 4.194848431750482e-07, "loss": 0.0185, "reward": 1.633068561553955, "reward_std": 0.09468790143728256, "rewards/accuracy_reward_stage2": 0.6330685615539551, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3314 }, { "completion_length": 7.609375, "epoch": 0.5808656036446469, "grad_norm": 16.043358849152995, "kl": 0.1015625, "learning_rate": 4.193096197652006e-07, "loss": -0.0475, "reward": 1.3541667461395264, "reward_std": 0.1178511381149292, "rewards/accuracy_reward_stage2": 0.5104166865348816, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3315 }, { "completion_length": 13.671875, "epoch": 0.5810408270544944, "grad_norm": 23.18812295368805, "kl": 0.1455078125, "learning_rate": 4.1913439635535307e-07, "loss": 0.0208, "reward": 1.467007040977478, "reward_std": 0.25777289271354675, "rewards/accuracy_reward_stage2": 0.4826321005821228, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3316 }, { "completion_length": 9.0, "epoch": 0.581216050464342, "grad_norm": 16.94807832750533, "kl": 0.12060546875, "learning_rate": 4.189591729455055e-07, "loss": 0.0483, "reward": 1.5463520288467407, "reward_std": 0.20520761609077454, "rewards/accuracy_reward_stage2": 0.6713520288467407, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3317 }, { "completion_length": 10.484375, "epoch": 0.5813912738741895, "grad_norm": 15.590820843421067, "kl": 0.2158203125, "learning_rate": 4.1878394953565794e-07, "loss": 0.0861, "reward": 1.3699464797973633, "reward_std": 0.10645575821399689, "rewards/accuracy_reward_stage2": 0.4949463903903961, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3318 }, { "completion_length": 7.625, "epoch": 0.5815664972840372, "grad_norm": 18.720185814431826, "kl": 0.11083984375, "learning_rate": 4.186087261258104e-07, "loss": 0.0002, "reward": 1.5887812376022339, "reward_std": 0.19031430780887604, "rewards/accuracy_reward_stage2": 0.6044062376022339, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3319 }, { "completion_length": 10.375, "epoch": 0.5817417206938847, "grad_norm": 18.988882850470866, "kl": 0.07373046875, "learning_rate": 4.184335027159628e-07, "loss": 0.0296, "reward": 1.48932945728302, "reward_std": 0.2587651014328003, "rewards/accuracy_reward_stage2": 0.48932945728302, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3320 }, { "completion_length": 13.140625, "epoch": 0.5819169441037323, "grad_norm": 15.464298948409237, "kl": 0.1171875, "learning_rate": 4.182582793061153e-07, "loss": 0.0468, "reward": 1.3898260593414307, "reward_std": 0.2174256443977356, "rewards/accuracy_reward_stage2": 0.5148261189460754, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3321 }, { "completion_length": 5.78125, "epoch": 0.5820921675135798, "grad_norm": 14.489600309623796, "kl": 0.18359375, "learning_rate": 4.1808305589626776e-07, "loss": 0.0732, "reward": 1.6758959293365479, "reward_std": 0.21336817741394043, "rewards/accuracy_reward_stage2": 0.8008958101272583, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3322 }, { "completion_length": 8.015625, "epoch": 0.5822673909234274, "grad_norm": 18.704860838352552, "kl": 0.185546875, "learning_rate": 4.1790783248642014e-07, "loss": 0.0743, "reward": 1.3735308647155762, "reward_std": 0.21026110649108887, "rewards/accuracy_reward_stage2": 0.4985308051109314, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3323 }, { "completion_length": 9.984375, "epoch": 0.5824426143332749, "grad_norm": 20.995400462058033, "kl": 0.1689453125, "learning_rate": 4.177326090765726e-07, "loss": 0.0461, "reward": 1.6412947177886963, "reward_std": 0.24216331541538239, "rewards/accuracy_reward_stage2": 0.6569197177886963, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3324 }, { "completion_length": 5.859375, "epoch": 0.5826178377431225, "grad_norm": 18.18017332400372, "kl": 0.09228515625, "learning_rate": 4.17557385666725e-07, "loss": -0.0048, "reward": 1.6639931201934814, "reward_std": 0.19962234795093536, "rewards/accuracy_reward_stage2": 0.6796180605888367, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3325 }, { "completion_length": 11.421875, "epoch": 0.58279306115297, "grad_norm": 18.2333607957152, "kl": 0.1572265625, "learning_rate": 4.173821622568775e-07, "loss": -0.0032, "reward": 1.7223223447799683, "reward_std": 0.2541934847831726, "rewards/accuracy_reward_stage2": 0.7535722255706787, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3326 }, { "completion_length": 14.15625, "epoch": 0.5829682845628176, "grad_norm": 22.09752305955603, "kl": 0.09814453125, "learning_rate": 4.1720693884702995e-07, "loss": 0.0044, "reward": 1.6956291198730469, "reward_std": 0.3532159924507141, "rewards/accuracy_reward_stage2": 0.7112541198730469, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3327 }, { "completion_length": 10.734375, "epoch": 0.5831435079726651, "grad_norm": 21.51765639958224, "kl": 0.11767578125, "learning_rate": 4.170317154371824e-07, "loss": -0.0271, "reward": 1.7271101474761963, "reward_std": 0.2501765787601471, "rewards/accuracy_reward_stage2": 0.7583601474761963, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3328 }, { "completion_length": 17.984375, "epoch": 0.5833187313825127, "grad_norm": 18.08290871320803, "kl": 0.04833984375, "learning_rate": 4.168564920273349e-07, "loss": 0.0193, "reward": 1.63839852809906, "reward_std": 0.08872491121292114, "rewards/accuracy_reward_stage2": 0.6383985877037048, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3329 }, { "completion_length": 8.421875, "epoch": 0.5834939547923602, "grad_norm": 20.120055979618456, "kl": 0.1708984375, "learning_rate": 4.1668126861748727e-07, "loss": -0.0333, "reward": 1.407860517501831, "reward_std": 0.34330761432647705, "rewards/accuracy_reward_stage2": 0.45473557710647583, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3330 }, { "completion_length": 14.484375, "epoch": 0.5836691782022078, "grad_norm": 18.739297757537408, "kl": 0.07080078125, "learning_rate": 4.165060452076397e-07, "loss": 0.0283, "reward": 1.487917423248291, "reward_std": 0.20292870700359344, "rewards/accuracy_reward_stage2": 0.6129173040390015, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3331 }, { "completion_length": 11.3125, "epoch": 0.5838444016120554, "grad_norm": 16.014175859750328, "kl": 0.05029296875, "learning_rate": 4.1633082179779215e-07, "loss": 0.0202, "reward": 1.8262572288513184, "reward_std": 0.1389392912387848, "rewards/accuracy_reward_stage2": 0.8262572288513184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3332 }, { "completion_length": 10.40625, "epoch": 0.584019625021903, "grad_norm": 22.951480118000163, "kl": 0.10205078125, "learning_rate": 4.161555983879446e-07, "loss": 0.0119, "reward": 1.6964985132217407, "reward_std": 0.14089880883693695, "rewards/accuracy_reward_stage2": 0.7121233940124512, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3333 }, { "completion_length": 8.96875, "epoch": 0.5841948484317505, "grad_norm": 19.04509047433968, "kl": 0.1025390625, "learning_rate": 4.159803749780971e-07, "loss": 0.041, "reward": 1.8162028789520264, "reward_std": 0.12637542188167572, "rewards/accuracy_reward_stage2": 0.8162027597427368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3334 }, { "completion_length": 8.359375, "epoch": 0.5843700718415981, "grad_norm": 16.61353049886891, "kl": 0.1181640625, "learning_rate": 4.158051515682495e-07, "loss": 0.0474, "reward": 1.424905776977539, "reward_std": 0.14749082922935486, "rewards/accuracy_reward_stage2": 0.5499057769775391, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3335 }, { "completion_length": 12.421875, "epoch": 0.5845452952514456, "grad_norm": 15.680900749988403, "kl": 0.0172119140625, "learning_rate": 4.156299281584019e-07, "loss": 0.0069, "reward": 1.6614583730697632, "reward_std": 0.12134584784507751, "rewards/accuracy_reward_stage2": 0.6614583134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3336 }, { "completion_length": 8.53125, "epoch": 0.5847205186612932, "grad_norm": 17.680179150240633, "kl": 0.045166015625, "learning_rate": 4.1545470474855435e-07, "loss": 0.0181, "reward": 1.447710394859314, "reward_std": 0.159596785902977, "rewards/accuracy_reward_stage2": 0.572710394859314, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3337 }, { "completion_length": 15.8125, "epoch": 0.5848957420711407, "grad_norm": 16.446810362956278, "kl": 0.10498046875, "learning_rate": 4.1527948133870684e-07, "loss": -0.0023, "reward": 1.7206721305847168, "reward_std": 0.27663755416870117, "rewards/accuracy_reward_stage2": 0.736297070980072, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3338 }, { "completion_length": 9.359375, "epoch": 0.5850709654809882, "grad_norm": 21.36734395553362, "kl": 0.1552734375, "learning_rate": 4.151042579288593e-07, "loss": 0.0623, "reward": 1.713259220123291, "reward_std": 0.25472259521484375, "rewards/accuracy_reward_stage2": 0.8382592797279358, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3339 }, { "completion_length": 9.859375, "epoch": 0.5852461888908358, "grad_norm": 31.907468674326463, "kl": 0.10400390625, "learning_rate": 4.149290345190117e-07, "loss": 0.0103, "reward": 1.605589747428894, "reward_std": 0.32869046926498413, "rewards/accuracy_reward_stage2": 0.6212146878242493, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3340 }, { "completion_length": 10.671875, "epoch": 0.5854214123006833, "grad_norm": 19.527174473286316, "kl": 0.2373046875, "learning_rate": 4.1475381110916416e-07, "loss": -0.0196, "reward": 1.3388493061065674, "reward_std": 0.3259095251560211, "rewards/accuracy_reward_stage2": 0.5107242465019226, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3341 }, { "completion_length": 12.9375, "epoch": 0.5855966357105309, "grad_norm": 16.606210595175256, "kl": 0.091796875, "learning_rate": 4.145785876993166e-07, "loss": 0.0366, "reward": 1.2201833724975586, "reward_std": 0.21854467689990997, "rewards/accuracy_reward_stage2": 0.34518343210220337, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3342 }, { "completion_length": 11.296875, "epoch": 0.5857718591203784, "grad_norm": 24.992915564062983, "kl": 21.625, "learning_rate": 4.1440336428946904e-07, "loss": 8.678, "reward": 1.3538477420806885, "reward_std": 0.17989769577980042, "rewards/accuracy_reward_stage2": 0.4788476824760437, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3343 }, { "completion_length": 7.5, "epoch": 0.5859470825302261, "grad_norm": 20.341540212078797, "kl": 0.17578125, "learning_rate": 4.142281408796215e-07, "loss": 0.0704, "reward": 1.651244044303894, "reward_std": 0.2022555023431778, "rewards/accuracy_reward_stage2": 0.6512439846992493, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3344 }, { "completion_length": 7.21875, "epoch": 0.5861223059400736, "grad_norm": 23.357858185375914, "kl": 0.06884765625, "learning_rate": 4.140529174697739e-07, "loss": 0.0276, "reward": 1.7915301322937012, "reward_std": 0.26167210936546326, "rewards/accuracy_reward_stage2": 0.7915301322937012, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3345 }, { "completion_length": 12.4375, "epoch": 0.5862975293499212, "grad_norm": 7.846556815175532, "kl": 0.056884765625, "learning_rate": 4.138776940599264e-07, "loss": 0.0227, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward_stage2": 0.859375, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3346 }, { "completion_length": 11.84375, "epoch": 0.5864727527597687, "grad_norm": 18.95217996727311, "kl": 0.1484375, "learning_rate": 4.1370247065007885e-07, "loss": 0.0594, "reward": 1.5207836627960205, "reward_std": 0.2098308801651001, "rewards/accuracy_reward_stage2": 0.6457836627960205, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3347 }, { "completion_length": 9.671875, "epoch": 0.5866479761696163, "grad_norm": 27.59838884419405, "kl": 0.259765625, "learning_rate": 4.135272472402313e-07, "loss": 0.0232, "reward": 1.4346290826797485, "reward_std": 0.3767169117927551, "rewards/accuracy_reward_stage2": 0.4658791422843933, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3348 }, { "completion_length": 7.0, "epoch": 0.5868231995794638, "grad_norm": 9.903665941983448, "kl": 0.12060546875, "learning_rate": 4.133520238303837e-07, "loss": 0.004, "reward": 1.5, "reward_std": 0.0883883461356163, "rewards/accuracy_reward_stage2": 0.515625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3349 }, { "completion_length": 6.234375, "epoch": 0.5869984229893114, "grad_norm": 18.693980258556312, "kl": 0.119140625, "learning_rate": 4.1317680042053617e-07, "loss": 0.0034, "reward": 1.6041667461395264, "reward_std": 0.2158295214176178, "rewards/accuracy_reward_stage2": 0.6197916865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3350 }, { "completion_length": 10.703125, "epoch": 0.5871736463991589, "grad_norm": 22.560774928412627, "kl": 0.169921875, "learning_rate": 4.130015770106886e-07, "loss": 0.068, "reward": 1.4317594766616821, "reward_std": 0.2960038483142853, "rewards/accuracy_reward_stage2": 0.5567594766616821, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3351 }, { "completion_length": 13.90625, "epoch": 0.5873488698090065, "grad_norm": 26.03373808729502, "kl": 0.3046875, "learning_rate": 4.1282635360084105e-07, "loss": 0.0931, "reward": 1.2304809093475342, "reward_std": 0.17618829011917114, "rewards/accuracy_reward_stage2": 0.4961059093475342, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3352 }, { "completion_length": 9.171875, "epoch": 0.587524093218854, "grad_norm": 15.91817488703617, "kl": 0.1259765625, "learning_rate": 4.126511301909935e-07, "loss": 0.0062, "reward": 1.702805757522583, "reward_std": 0.28147825598716736, "rewards/accuracy_reward_stage2": 0.718430757522583, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3353 }, { "completion_length": 10.328125, "epoch": 0.5876993166287016, "grad_norm": 18.887598806244405, "kl": 0.2294921875, "learning_rate": 4.12475906781146e-07, "loss": 0.0067, "reward": 1.3210906982421875, "reward_std": 0.26162979006767273, "rewards/accuracy_reward_stage2": 0.4773406982421875, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3354 }, { "completion_length": 12.0625, "epoch": 0.5878745400385491, "grad_norm": 19.96975439666355, "kl": 0.072265625, "learning_rate": 4.1230068337129837e-07, "loss": 0.029, "reward": 1.5819294452667236, "reward_std": 0.3019178509712219, "rewards/accuracy_reward_stage2": 0.7069293260574341, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3355 }, { "completion_length": 10.046875, "epoch": 0.5880497634483967, "grad_norm": 19.25697606660207, "kl": 0.1376953125, "learning_rate": 4.121254599614508e-07, "loss": -0.0123, "reward": 1.608581304550171, "reward_std": 0.2221478521823883, "rewards/accuracy_reward_stage2": 0.6398313641548157, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3356 }, { "completion_length": 9.671875, "epoch": 0.5882249868582443, "grad_norm": 21.210323483932303, "kl": 0.11767578125, "learning_rate": 4.1195023655160325e-07, "loss": 0.0471, "reward": 1.3723440170288086, "reward_std": 0.18884873390197754, "rewards/accuracy_reward_stage2": 0.4973440170288086, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3357 }, { "completion_length": 8.859375, "epoch": 0.5884002102680919, "grad_norm": 26.110479544755435, "kl": 0.16015625, "learning_rate": 4.1177501314175574e-07, "loss": 0.0199, "reward": 1.52445387840271, "reward_std": 0.2836562991142273, "rewards/accuracy_reward_stage2": 0.6650788187980652, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3358 }, { "completion_length": 7.421875, "epoch": 0.5885754336779394, "grad_norm": 14.95239738422591, "kl": 0.0859375, "learning_rate": 4.115997897319082e-07, "loss": -0.0099, "reward": 1.7336453199386597, "reward_std": 0.18917325139045715, "rewards/accuracy_reward_stage2": 0.7492703199386597, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3359 }, { "completion_length": 10.0, "epoch": 0.588750657087787, "grad_norm": 10.26649766816332, "kl": 0.06201171875, "learning_rate": 4.114245663220606e-07, "loss": 0.0249, "reward": 1.59375, "reward_std": 0.16675157845020294, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3360 }, { "completion_length": 19.609375, "epoch": 0.5889258804976345, "grad_norm": 19.542471997914472, "kl": 0.0301513671875, "learning_rate": 4.11249342912213e-07, "loss": 0.0121, "reward": 1.4741116762161255, "reward_std": 0.1889735460281372, "rewards/accuracy_reward_stage2": 0.4741116762161255, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3361 }, { "completion_length": 9.859375, "epoch": 0.589101103907482, "grad_norm": 15.580185778605106, "kl": 0.1396484375, "learning_rate": 4.110741195023655e-07, "loss": -0.006, "reward": 1.5328525304794312, "reward_std": 0.24839192628860474, "rewards/accuracy_reward_stage2": 0.5641025304794312, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3362 }, { "completion_length": 10.8125, "epoch": 0.5892763273173296, "grad_norm": 27.01172403275803, "kl": 0.0654296875, "learning_rate": 4.1089889609251794e-07, "loss": 0.0262, "reward": 1.5872222185134888, "reward_std": 0.27706378698349, "rewards/accuracy_reward_stage2": 0.7122222185134888, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3363 }, { "completion_length": 19.6875, "epoch": 0.5894515507271771, "grad_norm": 18.45051149058737, "kl": 0.109375, "learning_rate": 4.107236726826704e-07, "loss": 0.0437, "reward": 1.0697689056396484, "reward_std": 0.15089532732963562, "rewards/accuracy_reward_stage2": 0.31976890563964844, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3364 }, { "completion_length": 10.375, "epoch": 0.5896267741370247, "grad_norm": 21.58470674598541, "kl": 0.1376953125, "learning_rate": 4.105484492728228e-07, "loss": -0.0107, "reward": 1.2813940048217773, "reward_std": 0.26054540276527405, "rewards/accuracy_reward_stage2": 0.43764400482177734, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3365 }, { "completion_length": 4.265625, "epoch": 0.5898019975468722, "grad_norm": 11.649879587538361, "kl": 0.1630859375, "learning_rate": 4.103732258629753e-07, "loss": -0.0623, "reward": 1.796875, "reward_std": 0.18139132857322693, "rewards/accuracy_reward_stage2": 0.84375, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3366 }, { "completion_length": 8.046875, "epoch": 0.5899772209567198, "grad_norm": 23.81306888391832, "kl": 0.1279296875, "learning_rate": 4.1019800245312775e-07, "loss": 0.0511, "reward": 1.5414574146270752, "reward_std": 0.25763386487960815, "rewards/accuracy_reward_stage2": 0.5414573550224304, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3367 }, { "completion_length": 7.0625, "epoch": 0.5901524443665673, "grad_norm": 19.94454738981174, "kl": 0.08837890625, "learning_rate": 4.1002277904328014e-07, "loss": 0.0354, "reward": 1.7906076908111572, "reward_std": 0.20431801676750183, "rewards/accuracy_reward_stage2": 0.7906076908111572, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3368 }, { "completion_length": 13.78125, "epoch": 0.5903276677764149, "grad_norm": 14.935642544111568, "kl": 0.2294921875, "learning_rate": 4.098475556334326e-07, "loss": 0.0476, "reward": 1.0843511819839478, "reward_std": 0.10877098143100739, "rewards/accuracy_reward_stage2": 0.34997621178627014, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3369 }, { "completion_length": 8.484375, "epoch": 0.5905028911862625, "grad_norm": 16.50548252361066, "kl": 0.142578125, "learning_rate": 4.0967233222358507e-07, "loss": 0.0571, "reward": 1.4899513721466064, "reward_std": 0.09331192076206207, "rewards/accuracy_reward_stage2": 0.6149513721466064, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3370 }, { "completion_length": 8.4375, "epoch": 0.5906781145961101, "grad_norm": 23.60007625752545, "kl": 0.310546875, "learning_rate": 4.094971088137375e-07, "loss": 0.0226, "reward": 1.373981237411499, "reward_std": 0.37158650159835815, "rewards/accuracy_reward_stage2": 0.545856237411499, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3371 }, { "completion_length": 7.046875, "epoch": 0.5908533380059576, "grad_norm": 17.13091303333977, "kl": 0.16015625, "learning_rate": 4.0932188540388995e-07, "loss": 0.0052, "reward": 1.7232661247253418, "reward_std": 0.2743530869483948, "rewards/accuracy_reward_stage2": 0.7545161843299866, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3372 }, { "completion_length": 7.6875, "epoch": 0.5910285614158052, "grad_norm": 12.522108130668892, "kl": 0.126953125, "learning_rate": 4.091466619940424e-07, "loss": 0.011, "reward": 1.3854740858078003, "reward_std": 0.2172052413225174, "rewards/accuracy_reward_stage2": 0.5260991454124451, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3373 }, { "completion_length": 15.59375, "epoch": 0.5912037848256527, "grad_norm": 23.58891641696791, "kl": 0.09375, "learning_rate": 4.0897143858419483e-07, "loss": 0.0375, "reward": 1.3126802444458008, "reward_std": 0.27276840806007385, "rewards/accuracy_reward_stage2": 0.43768009543418884, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3374 }, { "completion_length": 13.34375, "epoch": 0.5913790082355003, "grad_norm": 16.012524695728487, "kl": 0.1005859375, "learning_rate": 4.0879621517434727e-07, "loss": 0.0026, "reward": 1.7399613857269287, "reward_std": 0.14063388109207153, "rewards/accuracy_reward_stage2": 0.7712113261222839, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3375 }, { "completion_length": 8.53125, "epoch": 0.5915542316453478, "grad_norm": 19.110171898355755, "kl": 0.06201171875, "learning_rate": 4.086209917644997e-07, "loss": 0.0249, "reward": 1.3935902118682861, "reward_std": 0.20389771461486816, "rewards/accuracy_reward_stage2": 0.39359015226364136, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3376 }, { "completion_length": 9.015625, "epoch": 0.5917294550551954, "grad_norm": 18.917191334580906, "kl": 0.3125, "learning_rate": 4.0844576835465215e-07, "loss": 0.0808, "reward": 1.6681314706802368, "reward_std": 0.2374303936958313, "rewards/accuracy_reward_stage2": 0.6837565898895264, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3377 }, { "completion_length": 37.875, "epoch": 0.5919046784650429, "grad_norm": 20.295157453686457, "kl": 0.140625, "learning_rate": 4.0827054494480464e-07, "loss": -0.0315, "reward": 1.2237355709075928, "reward_std": 0.28009819984436035, "rewards/accuracy_reward_stage2": 0.3799855411052704, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3378 }, { "completion_length": 21.140625, "epoch": 0.5920799018748905, "grad_norm": 18.80728202640302, "kl": 0.08837890625, "learning_rate": 4.080953215349571e-07, "loss": -0.0089, "reward": 1.4754610061645508, "reward_std": 0.16696128249168396, "rewards/accuracy_reward_stage2": 0.4910860061645508, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3379 }, { "completion_length": 9.828125, "epoch": 0.592255125284738, "grad_norm": 16.59325732582251, "kl": 0.2470703125, "learning_rate": 4.0792009812510947e-07, "loss": 0.0359, "reward": 1.3921682834625244, "reward_std": 0.28522390127182007, "rewards/accuracy_reward_stage2": 0.548418402671814, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3380 }, { "completion_length": 6.8125, "epoch": 0.5924303486945856, "grad_norm": 13.811814446211153, "kl": 0.2158203125, "learning_rate": 4.077448747152619e-07, "loss": 0.0424, "reward": 1.3314099311828613, "reward_std": 0.16977867484092712, "rewards/accuracy_reward_stage2": 0.47203493118286133, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3381 }, { "completion_length": 8.203125, "epoch": 0.5926055721044331, "grad_norm": 19.216188862686714, "kl": 0.1435546875, "learning_rate": 4.075696513054144e-07, "loss": 0.0161, "reward": 1.4336891174316406, "reward_std": 0.3022252917289734, "rewards/accuracy_reward_stage2": 0.5743141174316406, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3382 }, { "completion_length": 9.890625, "epoch": 0.5927807955142808, "grad_norm": 17.67517194094775, "kl": 0.1416015625, "learning_rate": 4.0739442789556684e-07, "loss": 0.0568, "reward": 1.6875131130218506, "reward_std": 0.202718585729599, "rewards/accuracy_reward_stage2": 0.6875130534172058, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3383 }, { "completion_length": 11.28125, "epoch": 0.5929560189241283, "grad_norm": 19.006467821007, "kl": 0.2431640625, "learning_rate": 4.072192044857193e-07, "loss": 0.0185, "reward": 1.2602907419204712, "reward_std": 0.26582372188568115, "rewards/accuracy_reward_stage2": 0.541540801525116, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3384 }, { "completion_length": 19.546875, "epoch": 0.5931312423339758, "grad_norm": 17.659396295554313, "kl": 0.16015625, "learning_rate": 4.070439810758717e-07, "loss": -0.0393, "reward": 1.3948569297790527, "reward_std": 0.3265957236289978, "rewards/accuracy_reward_stage2": 0.44173192977905273, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3385 }, { "completion_length": 11.25, "epoch": 0.5933064657438234, "grad_norm": 16.915472791830478, "kl": 0.050048828125, "learning_rate": 4.068687576660242e-07, "loss": 0.02, "reward": 1.7250510454177856, "reward_std": 0.19554750621318817, "rewards/accuracy_reward_stage2": 0.7250510454177856, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3386 }, { "completion_length": 12.046875, "epoch": 0.5934816891536709, "grad_norm": 24.37913620423697, "kl": 0.267578125, "learning_rate": 4.066935342561766e-07, "loss": -0.0634, "reward": 1.6817896366119385, "reward_std": 0.34547320008277893, "rewards/accuracy_reward_stage2": 0.744289755821228, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3387 }, { "completion_length": 7.4375, "epoch": 0.5936569125635185, "grad_norm": 14.314208272923713, "kl": 0.10546875, "learning_rate": 4.0651831084632904e-07, "loss": -0.0021, "reward": 1.3413195610046387, "reward_std": 0.2445610910654068, "rewards/accuracy_reward_stage2": 0.4819444417953491, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3388 }, { "completion_length": 8.9375, "epoch": 0.593832135973366, "grad_norm": 12.188262249976322, "kl": 0.06787109375, "learning_rate": 4.063430874364815e-07, "loss": -0.017, "reward": 1.6598129272460938, "reward_std": 0.13758717477321625, "rewards/accuracy_reward_stage2": 0.6754379868507385, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3389 }, { "completion_length": 9.09375, "epoch": 0.5940073593832136, "grad_norm": 20.565537812672964, "kl": 0.189453125, "learning_rate": 4.0616786402663397e-07, "loss": 0.0332, "reward": 1.424170732498169, "reward_std": 0.27712106704711914, "rewards/accuracy_reward_stage2": 0.4397958219051361, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3390 }, { "completion_length": 10.5625, "epoch": 0.5941825827930611, "grad_norm": 18.537166743695696, "kl": 0.2080078125, "learning_rate": 4.059926406167864e-07, "loss": -0.0532, "reward": 1.3873106241226196, "reward_std": 0.18024834990501404, "rewards/accuracy_reward_stage2": 0.44981059432029724, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3391 }, { "completion_length": 13.046875, "epoch": 0.5943578062029087, "grad_norm": 17.6139555607113, "kl": 0.05029296875, "learning_rate": 4.0581741720693885e-07, "loss": 0.0201, "reward": 1.756882905960083, "reward_std": 0.1282881498336792, "rewards/accuracy_reward_stage2": 0.7568830251693726, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3392 }, { "completion_length": 9.609375, "epoch": 0.5945330296127562, "grad_norm": 22.452716228857593, "kl": 0.1572265625, "learning_rate": 4.0564219379709124e-07, "loss": -0.0039, "reward": 1.4564459323883057, "reward_std": 0.3075176477432251, "rewards/accuracy_reward_stage2": 0.4876958727836609, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3393 }, { "completion_length": 8.78125, "epoch": 0.5947082530226038, "grad_norm": 28.37169774199766, "kl": 0.2177734375, "learning_rate": 4.054669703872437e-07, "loss": 0.0546, "reward": 1.4617120027542114, "reward_std": 0.26479586958885193, "rewards/accuracy_reward_stage2": 0.6023369431495667, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3394 }, { "completion_length": 6.921875, "epoch": 0.5948834764324514, "grad_norm": 13.656636345638992, "kl": 0.1591796875, "learning_rate": 4.0529174697739617e-07, "loss": 0.0197, "reward": 1.4355125427246094, "reward_std": 0.15464989840984344, "rewards/accuracy_reward_stage2": 0.45113757252693176, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3395 }, { "completion_length": 10.921875, "epoch": 0.595058699842299, "grad_norm": 21.538672452109356, "kl": 0.0673828125, "learning_rate": 4.051165235675486e-07, "loss": 0.0271, "reward": 1.7184603214263916, "reward_std": 0.15963563323020935, "rewards/accuracy_reward_stage2": 0.7184603214263916, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3396 }, { "completion_length": 8.359375, "epoch": 0.5952339232521465, "grad_norm": 12.307626427997151, "kl": 0.111328125, "learning_rate": 4.0494130015770105e-07, "loss": 0.0103, "reward": 1.8679943084716797, "reward_std": 0.11993659287691116, "rewards/accuracy_reward_stage2": 0.8836191892623901, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3397 }, { "completion_length": 7.5625, "epoch": 0.5954091466619941, "grad_norm": 13.598249244566306, "kl": 0.10546875, "learning_rate": 4.0476607674785354e-07, "loss": 0.0207, "reward": 1.5885417461395264, "reward_std": 0.1236192062497139, "rewards/accuracy_reward_stage2": 0.6041666865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3398 }, { "completion_length": 11.296875, "epoch": 0.5955843700718416, "grad_norm": 19.143868313764987, "kl": 0.1611328125, "learning_rate": 4.04590853338006e-07, "loss": 0.0646, "reward": 1.5587762594223022, "reward_std": 0.16119365394115448, "rewards/accuracy_reward_stage2": 0.5587762594223022, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3399 }, { "completion_length": 11.578125, "epoch": 0.5957595934816892, "grad_norm": 19.578673146528867, "kl": 0.59765625, "learning_rate": 4.0441562992815837e-07, "loss": 0.0771, "reward": 1.5015919208526611, "reward_std": 0.26523348689079285, "rewards/accuracy_reward_stage2": 0.6890919804573059, "rewards/format_reward_stage1_pointerpad": 0.8125, "scores/accuracy_reward_stage2": 0.8125, "step": 3400 }, { "completion_length": 8.625, "epoch": 0.5959348168915367, "grad_norm": 19.872808171518628, "kl": 0.2314453125, "learning_rate": 4.042404065183108e-07, "loss": 0.0123, "reward": 1.413339614868164, "reward_std": 0.29223260283470154, "rewards/accuracy_reward_stage2": 0.5695896148681641, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3401 }, { "completion_length": 10.859375, "epoch": 0.5961100403013843, "grad_norm": 20.536292079434425, "kl": 0.1328125, "learning_rate": 4.0406518310846324e-07, "loss": 0.0242, "reward": 1.3782663345336914, "reward_std": 0.283265620470047, "rewards/accuracy_reward_stage2": 0.5188913345336914, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3402 }, { "completion_length": 11.59375, "epoch": 0.5962852637112318, "grad_norm": 14.543339866793787, "kl": 0.138671875, "learning_rate": 4.0388995969861574e-07, "loss": 0.0138, "reward": 1.5714879035949707, "reward_std": 0.12914830446243286, "rewards/accuracy_reward_stage2": 0.7121127843856812, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3403 }, { "completion_length": 13.34375, "epoch": 0.5964604871210794, "grad_norm": 17.874908256904774, "kl": 0.06787109375, "learning_rate": 4.037147362887682e-07, "loss": 0.0271, "reward": 1.4587076902389526, "reward_std": 0.17502731084823608, "rewards/accuracy_reward_stage2": 0.45870766043663025, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3404 }, { "completion_length": 7.3125, "epoch": 0.5966357105309269, "grad_norm": 22.047570139658138, "kl": 0.0673828125, "learning_rate": 4.035395128789206e-07, "loss": -0.0172, "reward": 1.726590871810913, "reward_std": 0.14708679914474487, "rewards/accuracy_reward_stage2": 0.7422158122062683, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3405 }, { "completion_length": 10.234375, "epoch": 0.5968109339407744, "grad_norm": 17.266053412726244, "kl": 0.08203125, "learning_rate": 4.03364289469073e-07, "loss": -0.0113, "reward": 1.6297082901000977, "reward_std": 0.13242757320404053, "rewards/accuracy_reward_stage2": 0.7703334093093872, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3406 }, { "completion_length": 9.59375, "epoch": 0.596986157350622, "grad_norm": 16.885928043424666, "kl": 0.078125, "learning_rate": 4.031890660592255e-07, "loss": -0.013, "reward": 1.5568372011184692, "reward_std": 0.15844221413135529, "rewards/accuracy_reward_stage2": 0.572462260723114, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3407 }, { "completion_length": 12.84375, "epoch": 0.5971613807604697, "grad_norm": 20.659760081222462, "kl": 0.16015625, "learning_rate": 4.0301384264937794e-07, "loss": 0.032, "reward": 1.6425063610076904, "reward_std": 0.22482311725616455, "rewards/accuracy_reward_stage2": 0.6581313610076904, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3408 }, { "completion_length": 12.28125, "epoch": 0.5973366041703172, "grad_norm": 10.630078981944184, "kl": 0.076171875, "learning_rate": 4.028386192395304e-07, "loss": -0.0079, "reward": 1.7965457439422607, "reward_std": 0.07948299497365952, "rewards/accuracy_reward_stage2": 0.9371707439422607, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3409 }, { "completion_length": 10.546875, "epoch": 0.5975118275801647, "grad_norm": 26.564388093945244, "kl": 0.1611328125, "learning_rate": 4.026633958296828e-07, "loss": 0.0267, "reward": 1.467179775238037, "reward_std": 0.24725881218910217, "rewards/accuracy_reward_stage2": 0.48280471563339233, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3410 }, { "completion_length": 9.015625, "epoch": 0.5976870509900123, "grad_norm": 18.764011624529996, "kl": 0.1337890625, "learning_rate": 4.024881724198353e-07, "loss": -0.0347, "reward": 1.767315149307251, "reward_std": 0.1556333750486374, "rewards/accuracy_reward_stage2": 0.7985650897026062, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3411 }, { "completion_length": 8.5625, "epoch": 0.5978622743998598, "grad_norm": 12.278633061947433, "kl": 0.007537841796875, "learning_rate": 4.023129490099877e-07, "loss": 0.003, "reward": 1.734375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.734375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3412 }, { "completion_length": 10.078125, "epoch": 0.5980374978097074, "grad_norm": 17.42639047551563, "kl": 0.061767578125, "learning_rate": 4.0213772560014013e-07, "loss": 0.0247, "reward": 1.8013005256652832, "reward_std": 0.18322604894638062, "rewards/accuracy_reward_stage2": 0.8013004660606384, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3413 }, { "completion_length": 10.28125, "epoch": 0.5982127212195549, "grad_norm": 20.745494626089844, "kl": 0.1240234375, "learning_rate": 4.0196250219029257e-07, "loss": 0.0079, "reward": 1.5774965286254883, "reward_std": 0.30587121844291687, "rewards/accuracy_reward_stage2": 0.5931214094161987, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3414 }, { "completion_length": 11.828125, "epoch": 0.5983879446294025, "grad_norm": 22.787095634560245, "kl": 0.1875, "learning_rate": 4.0178727878044507e-07, "loss": 0.0394, "reward": 1.4983285665512085, "reward_std": 0.19265775382518768, "rewards/accuracy_reward_stage2": 0.5139535665512085, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3415 }, { "completion_length": 8.390625, "epoch": 0.59856316803925, "grad_norm": 20.242710381195522, "kl": 0.1318359375, "learning_rate": 4.016120553705975e-07, "loss": 0.0362, "reward": 1.6618952751159668, "reward_std": 0.21545787155628204, "rewards/accuracy_reward_stage2": 0.6775202751159668, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3416 }, { "completion_length": 13.03125, "epoch": 0.5987383914490976, "grad_norm": 15.263042608528119, "kl": 0.09228515625, "learning_rate": 4.0143683196074995e-07, "loss": -0.0072, "reward": 1.6668956279754639, "reward_std": 0.192345529794693, "rewards/accuracy_reward_stage2": 0.6825206279754639, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3417 }, { "completion_length": 12.59375, "epoch": 0.5989136148589451, "grad_norm": 19.471023785017675, "kl": 0.08154296875, "learning_rate": 4.012616085509024e-07, "loss": -0.0115, "reward": 1.7478539943695068, "reward_std": 0.2811315953731537, "rewards/accuracy_reward_stage2": 0.7634790539741516, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3418 }, { "completion_length": 7.359375, "epoch": 0.5990888382687927, "grad_norm": 19.587244365848665, "kl": 0.146484375, "learning_rate": 4.010863851410548e-07, "loss": 0.0587, "reward": 1.6780874729156494, "reward_std": 0.19293718039989471, "rewards/accuracy_reward_stage2": 0.6780875325202942, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3419 }, { "completion_length": 20.625, "epoch": 0.5992640616786402, "grad_norm": 20.90755133142386, "kl": 0.1015625, "learning_rate": 4.0091116173120726e-07, "loss": 0.0405, "reward": 1.59331214427948, "reward_std": 0.1857033371925354, "rewards/accuracy_reward_stage2": 0.59331214427948, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3420 }, { "completion_length": 9.875, "epoch": 0.5994392850884879, "grad_norm": 13.415275248514398, "kl": 0.027099609375, "learning_rate": 4.007359383213597e-07, "loss": 0.0108, "reward": 1.7339122295379639, "reward_std": 0.15513893961906433, "rewards/accuracy_reward_stage2": 0.7339121699333191, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3421 }, { "completion_length": 10.09375, "epoch": 0.5996145084983354, "grad_norm": 17.46796887441957, "kl": 0.1279296875, "learning_rate": 4.0056071491151214e-07, "loss": 0.0121, "reward": 1.5939295291900635, "reward_std": 0.18742156028747559, "rewards/accuracy_reward_stage2": 0.6251795291900635, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3422 }, { "completion_length": 21.328125, "epoch": 0.599789731908183, "grad_norm": 15.744692972576852, "kl": 0.11669921875, "learning_rate": 4.0038549150166464e-07, "loss": 0.0467, "reward": 1.5629197359085083, "reward_std": 0.21470427513122559, "rewards/accuracy_reward_stage2": 0.5629197955131531, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3423 }, { "completion_length": 13.1875, "epoch": 0.5999649553180305, "grad_norm": 16.44038857892878, "kl": 0.345703125, "learning_rate": 4.002102680918171e-07, "loss": 0.0582, "reward": 1.5216660499572754, "reward_std": 0.2286083847284317, "rewards/accuracy_reward_stage2": 0.8029160499572754, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3424 }, { "completion_length": 9.5, "epoch": 0.6001401787278781, "grad_norm": 10.97712282230342, "kl": 0.0283203125, "learning_rate": 4.0003504468196946e-07, "loss": 0.0113, "reward": 1.6370192766189575, "reward_std": 0.08915039896965027, "rewards/accuracy_reward_stage2": 0.6370192170143127, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3425 }, { "completion_length": 9.546875, "epoch": 0.6003154021377256, "grad_norm": 17.1651898145542, "kl": 0.1328125, "learning_rate": 3.998598212721219e-07, "loss": 0.0121, "reward": 1.5052459239959717, "reward_std": 0.2685433626174927, "rewards/accuracy_reward_stage2": 0.6458709239959717, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3426 }, { "completion_length": 11.71875, "epoch": 0.6004906255475732, "grad_norm": 18.557896383066335, "kl": 0.169921875, "learning_rate": 3.996845978622744e-07, "loss": 0.0263, "reward": 1.2808881998062134, "reward_std": 0.19097641110420227, "rewards/accuracy_reward_stage2": 0.6558881998062134, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 3427 }, { "completion_length": 9.453125, "epoch": 0.6006658489574207, "grad_norm": 25.268672223120202, "kl": 0.154296875, "learning_rate": 3.9950937445242683e-07, "loss": 0.0175, "reward": 1.5014958381652832, "reward_std": 0.2355181872844696, "rewards/accuracy_reward_stage2": 0.5171208381652832, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3428 }, { "completion_length": 17.28125, "epoch": 0.6008410723672682, "grad_norm": 27.698931415889906, "kl": 0.34765625, "learning_rate": 3.993341510425793e-07, "loss": 0.0096, "reward": 1.4189836978912354, "reward_std": 0.3754510283470154, "rewards/accuracy_reward_stage2": 0.48148372769355774, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3429 }, { "completion_length": 13.078125, "epoch": 0.6010162957771158, "grad_norm": 14.964534700377712, "kl": 0.025634765625, "learning_rate": 3.991589276327317e-07, "loss": 0.0102, "reward": 1.8493903875350952, "reward_std": 0.08784636110067368, "rewards/accuracy_reward_stage2": 0.8493903875350952, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3430 }, { "completion_length": 8.671875, "epoch": 0.6011915191869633, "grad_norm": 18.686887863480006, "kl": 0.1904296875, "learning_rate": 3.9898370422288415e-07, "loss": -0.0013, "reward": 1.4473824501037598, "reward_std": 0.30469417572021484, "rewards/accuracy_reward_stage2": 0.7286325097084045, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3431 }, { "completion_length": 6.84375, "epoch": 0.6013667425968109, "grad_norm": 23.05137679558983, "kl": 0.08642578125, "learning_rate": 3.988084808130366e-07, "loss": 0.0345, "reward": 1.5047707557678223, "reward_std": 0.23167124390602112, "rewards/accuracy_reward_stage2": 0.5047707557678223, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3432 }, { "completion_length": 24.25, "epoch": 0.6015419660066584, "grad_norm": 23.424671897950013, "kl": 0.140625, "learning_rate": 3.9863325740318903e-07, "loss": -0.0319, "reward": 1.5699257850646973, "reward_std": 0.35112351179122925, "rewards/accuracy_reward_stage2": 0.601175844669342, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3433 }, { "completion_length": 8.9375, "epoch": 0.6017171894165061, "grad_norm": 20.79507527827949, "kl": 0.06689453125, "learning_rate": 3.9845803399334147e-07, "loss": -0.0022, "reward": 1.6292564868927002, "reward_std": 0.1802724003791809, "rewards/accuracy_reward_stage2": 0.6448814868927002, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3434 }, { "completion_length": 7.578125, "epoch": 0.6018924128263536, "grad_norm": 15.589011966550148, "kl": 0.0927734375, "learning_rate": 3.9828281058349396e-07, "loss": -0.0071, "reward": 1.65333092212677, "reward_std": 0.16445782780647278, "rewards/accuracy_reward_stage2": 0.66895592212677, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3435 }, { "completion_length": 8.953125, "epoch": 0.6020676362362012, "grad_norm": 34.07184555789485, "kl": 0.11767578125, "learning_rate": 3.981075871736464e-07, "loss": 0.003, "reward": 1.6030738353729248, "reward_std": 0.29539844393730164, "rewards/accuracy_reward_stage2": 0.6186988353729248, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3436 }, { "completion_length": 7.953125, "epoch": 0.6022428596460487, "grad_norm": 16.218031005827555, "kl": 0.16015625, "learning_rate": 3.9793236376379884e-07, "loss": 0.0638, "reward": 0.9500302076339722, "reward_std": 0.1618354320526123, "rewards/accuracy_reward_stage2": 0.45003020763397217, "rewards/format_reward_stage1_pointerpad": 0.5, "scores/accuracy_reward_stage2": 0.5, "step": 3437 }, { "completion_length": 8.453125, "epoch": 0.6024180830558963, "grad_norm": 16.409700138315014, "kl": 0.08984375, "learning_rate": 3.9775714035395123e-07, "loss": -0.0084, "reward": 1.54030442237854, "reward_std": 0.1720879077911377, "rewards/accuracy_reward_stage2": 0.5559294819831848, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3438 }, { "completion_length": 8.234375, "epoch": 0.6025933064657438, "grad_norm": 32.155513926011665, "kl": 0.380859375, "learning_rate": 3.975819169441037e-07, "loss": 0.0541, "reward": 1.3794606924057007, "reward_std": 0.2892468273639679, "rewards/accuracy_reward_stage2": 0.6763357520103455, "rewards/format_reward_stage1_pointerpad": 0.703125, "scores/accuracy_reward_stage2": 0.703125, "step": 3439 }, { "completion_length": 9.75, "epoch": 0.6027685298755914, "grad_norm": 50.064485559731196, "kl": 0.314453125, "learning_rate": 3.9740669353425616e-07, "loss": 0.1261, "reward": 1.6192705631256104, "reward_std": 0.15657562017440796, "rewards/accuracy_reward_stage2": 0.7442706823348999, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3440 }, { "completion_length": 14.25, "epoch": 0.6029437532854389, "grad_norm": 22.465520921403584, "kl": 0.294921875, "learning_rate": 3.972314701244086e-07, "loss": 0.0255, "reward": 1.5082569122314453, "reward_std": 0.3638462722301483, "rewards/accuracy_reward_stage2": 0.5551318526268005, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3441 }, { "completion_length": 9.46875, "epoch": 0.6031189766952865, "grad_norm": 15.780525676682956, "kl": 0.11572265625, "learning_rate": 3.9705624671456104e-07, "loss": 0.0021, "reward": 1.501037359237671, "reward_std": 0.10796771943569183, "rewards/accuracy_reward_stage2": 0.5166622996330261, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3442 }, { "completion_length": 29.609375, "epoch": 0.603294200105134, "grad_norm": 23.820338114426182, "kl": 0.318359375, "learning_rate": 3.9688102330471353e-07, "loss": 0.0451, "reward": 1.6326594352722168, "reward_std": 0.2520653307437897, "rewards/accuracy_reward_stage2": 0.6795344948768616, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3443 }, { "completion_length": 9.53125, "epoch": 0.6034694235149816, "grad_norm": 16.184072771911413, "kl": 0.08349609375, "learning_rate": 3.967057998948659e-07, "loss": -0.0069, "reward": 1.723325252532959, "reward_std": 0.14421439170837402, "rewards/accuracy_reward_stage2": 0.7389503121376038, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3444 }, { "completion_length": 9.1875, "epoch": 0.6036446469248291, "grad_norm": 19.017645766699943, "kl": 0.06591796875, "learning_rate": 3.9653057648501836e-07, "loss": 0.0264, "reward": 1.687445044517517, "reward_std": 0.20073054730892181, "rewards/accuracy_reward_stage2": 0.6874449253082275, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3445 }, { "completion_length": 11.46875, "epoch": 0.6038198703346768, "grad_norm": 16.926725366578847, "kl": 0.06591796875, "learning_rate": 3.963553530751708e-07, "loss": 0.0058, "reward": 1.7377595901489258, "reward_std": 0.13752949237823486, "rewards/accuracy_reward_stage2": 0.7533845901489258, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3446 }, { "completion_length": 11.3125, "epoch": 0.6039950937445243, "grad_norm": 13.770783596052299, "kl": 0.10107421875, "learning_rate": 3.961801296653233e-07, "loss": -0.0004, "reward": 1.466865062713623, "reward_std": 0.2192572057247162, "rewards/accuracy_reward_stage2": 0.48249009251594543, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3447 }, { "completion_length": 8.40625, "epoch": 0.6041703171543719, "grad_norm": 17.04847051503246, "kl": 0.1455078125, "learning_rate": 3.9600490625547573e-07, "loss": -0.0238, "reward": 1.7534458637237549, "reward_std": 0.2810730040073395, "rewards/accuracy_reward_stage2": 0.7846959233283997, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3448 }, { "completion_length": 10.078125, "epoch": 0.6043455405642194, "grad_norm": 17.546032869691548, "kl": 0.10791015625, "learning_rate": 3.9582968284562817e-07, "loss": 0.0431, "reward": 1.533093810081482, "reward_std": 0.11157439649105072, "rewards/accuracy_reward_stage2": 0.6580938100814819, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3449 }, { "completion_length": 11.40625, "epoch": 0.604520763974067, "grad_norm": 19.356403016722467, "kl": 0.17578125, "learning_rate": 3.956544594357806e-07, "loss": 0.0486, "reward": 1.4633104801177979, "reward_std": 0.19928845763206482, "rewards/accuracy_reward_stage2": 0.47893548011779785, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3450 }, { "completion_length": 12.8125, "epoch": 0.6046959873839145, "grad_norm": 16.01905817760592, "kl": 0.134765625, "learning_rate": 3.9547923602593305e-07, "loss": 0.0098, "reward": 1.3782914876937866, "reward_std": 0.20190542936325073, "rewards/accuracy_reward_stage2": 0.5189164876937866, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3451 }, { "completion_length": 10.0, "epoch": 0.604871210793762, "grad_norm": 24.06284766509744, "kl": 0.1533203125, "learning_rate": 3.953040126160855e-07, "loss": -0.0162, "reward": 1.6933026313781738, "reward_std": 0.24311666190624237, "rewards/accuracy_reward_stage2": 0.7245526909828186, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3452 }, { "completion_length": 13.671875, "epoch": 0.6050464342036096, "grad_norm": 16.836031995387494, "kl": 0.10205078125, "learning_rate": 3.9512878920623793e-07, "loss": 0.0194, "reward": 1.3007614612579346, "reward_std": 0.2282509207725525, "rewards/accuracy_reward_stage2": 0.5507614016532898, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3453 }, { "completion_length": 10.9375, "epoch": 0.6052216576134571, "grad_norm": 28.102604670492564, "kl": 0.068359375, "learning_rate": 3.9495356579639037e-07, "loss": 0.0273, "reward": 1.6019675731658936, "reward_std": 0.1741998940706253, "rewards/accuracy_reward_stage2": 0.6019675731658936, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3454 }, { "completion_length": 8.65625, "epoch": 0.6053968810233047, "grad_norm": 18.0939366145864, "kl": 0.0732421875, "learning_rate": 3.9477834238654286e-07, "loss": 0.0293, "reward": 1.3374698162078857, "reward_std": 0.23530396819114685, "rewards/accuracy_reward_stage2": 0.46246981620788574, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3455 }, { "completion_length": 8.03125, "epoch": 0.6055721044331522, "grad_norm": 14.97976553552256, "kl": 0.035400390625, "learning_rate": 3.946031189766953e-07, "loss": 0.0141, "reward": 1.6035091876983643, "reward_std": 0.13654939830303192, "rewards/accuracy_reward_stage2": 0.6035091280937195, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3456 }, { "completion_length": 20.328125, "epoch": 0.6057473278429998, "grad_norm": 19.07249570104564, "kl": 0.24609375, "learning_rate": 3.944278955668477e-07, "loss": 0.0104, "reward": 1.4960546493530273, "reward_std": 0.2673536539077759, "rewards/accuracy_reward_stage2": 0.6523047089576721, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3457 }, { "completion_length": 9.4375, "epoch": 0.6059225512528473, "grad_norm": 12.936590524802542, "kl": 0.06884765625, "learning_rate": 3.9425267215700013e-07, "loss": -0.0119, "reward": 1.7319378852844238, "reward_std": 0.13649073243141174, "rewards/accuracy_reward_stage2": 0.7475628852844238, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3458 }, { "completion_length": 11.71875, "epoch": 0.606097774662695, "grad_norm": 19.371274993588596, "kl": 0.09765625, "learning_rate": 3.940774487471526e-07, "loss": 0.0391, "reward": 1.417302131652832, "reward_std": 0.23798641562461853, "rewards/accuracy_reward_stage2": 0.542302131652832, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3459 }, { "completion_length": 13.5, "epoch": 0.6062729980725425, "grad_norm": 19.240788759913016, "kl": 0.2216796875, "learning_rate": 3.9390222533730506e-07, "loss": 0.0446, "reward": 1.4564974308013916, "reward_std": 0.19151608645915985, "rewards/accuracy_reward_stage2": 0.5971223711967468, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3460 }, { "completion_length": 11.9375, "epoch": 0.6064482214823901, "grad_norm": 24.204213129637967, "kl": 0.232421875, "learning_rate": 3.937270019274575e-07, "loss": 0.022, "reward": 1.5358624458312988, "reward_std": 0.320328950881958, "rewards/accuracy_reward_stage2": 0.6921124458312988, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3461 }, { "completion_length": 9.171875, "epoch": 0.6066234448922376, "grad_norm": 20.175706890970673, "kl": 0.07080078125, "learning_rate": 3.9355177851760994e-07, "loss": 0.0282, "reward": 1.6413211822509766, "reward_std": 0.1602167785167694, "rewards/accuracy_reward_stage2": 0.641321063041687, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3462 }, { "completion_length": 11.71875, "epoch": 0.6067986683020852, "grad_norm": 20.092099705100154, "kl": 0.1689453125, "learning_rate": 3.9337655510776233e-07, "loss": 0.0678, "reward": 1.5993422269821167, "reward_std": 0.22892138361930847, "rewards/accuracy_reward_stage2": 0.5993422269821167, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3463 }, { "completion_length": 8.609375, "epoch": 0.6069738917119327, "grad_norm": 18.65820843756049, "kl": 0.265625, "learning_rate": 3.932013316979148e-07, "loss": -0.0445, "reward": 1.675358772277832, "reward_std": 0.3607245981693268, "rewards/accuracy_reward_stage2": 0.737858772277832, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3464 }, { "completion_length": 9.84375, "epoch": 0.6071491151217803, "grad_norm": 21.585310238332063, "kl": 0.02685546875, "learning_rate": 3.9302610828806726e-07, "loss": 0.0107, "reward": 1.3237862586975098, "reward_std": 0.17188136279582977, "rewards/accuracy_reward_stage2": 0.32378625869750977, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3465 }, { "completion_length": 9.734375, "epoch": 0.6073243385316278, "grad_norm": 17.21550996098564, "kl": 0.4453125, "learning_rate": 3.928508848782197e-07, "loss": 0.0607, "reward": 1.5191543102264404, "reward_std": 0.3503054976463318, "rewards/accuracy_reward_stage2": 0.6910292506217957, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3466 }, { "completion_length": 7.25, "epoch": 0.6074995619414754, "grad_norm": 22.309264652599936, "kl": 0.1884765625, "learning_rate": 3.926756614683722e-07, "loss": 0.0033, "reward": 1.319457769393921, "reward_std": 0.2027011513710022, "rewards/accuracy_reward_stage2": 0.6007077693939209, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3467 }, { "completion_length": 9.171875, "epoch": 0.6076747853513229, "grad_norm": 27.423521988934635, "kl": 0.1669921875, "learning_rate": 3.9250043805852463e-07, "loss": 0.0668, "reward": 1.6609394550323486, "reward_std": 0.23472878336906433, "rewards/accuracy_reward_stage2": 0.7859394550323486, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3468 }, { "completion_length": 10.6875, "epoch": 0.6078500087611705, "grad_norm": 13.57082931790605, "kl": 0.2490234375, "learning_rate": 3.9232521464867707e-07, "loss": 0.0493, "reward": 1.3972986936569214, "reward_std": 0.22089111804962158, "rewards/accuracy_reward_stage2": 0.4285487234592438, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3469 }, { "completion_length": 6.78125, "epoch": 0.608025232171018, "grad_norm": 15.588786781173503, "kl": 0.07177734375, "learning_rate": 3.9214999123882946e-07, "loss": 0.0287, "reward": 1.6718885898590088, "reward_std": 0.18329882621765137, "rewards/accuracy_reward_stage2": 0.6718885898590088, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3470 }, { "completion_length": 7.84375, "epoch": 0.6082004555808656, "grad_norm": 17.22290028822834, "kl": 0.06396484375, "learning_rate": 3.919747678289819e-07, "loss": 0.0256, "reward": 1.6375272274017334, "reward_std": 0.25864362716674805, "rewards/accuracy_reward_stage2": 0.6375272870063782, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3471 }, { "completion_length": 10.203125, "epoch": 0.6083756789907132, "grad_norm": 23.322277944614704, "kl": 0.26953125, "learning_rate": 3.917995444191344e-07, "loss": 0.0542, "reward": 1.6182160377502441, "reward_std": 0.30346912145614624, "rewards/accuracy_reward_stage2": 0.7744660973548889, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3472 }, { "completion_length": 9.359375, "epoch": 0.6085509024005608, "grad_norm": 18.548762768660698, "kl": 0.15234375, "learning_rate": 3.9162432100928683e-07, "loss": 0.019, "reward": 1.5547361373901367, "reward_std": 0.23848986625671387, "rewards/accuracy_reward_stage2": 0.6953611373901367, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3473 }, { "completion_length": 19.109375, "epoch": 0.6087261258104083, "grad_norm": 17.897521594424298, "kl": 0.0927734375, "learning_rate": 3.9144909759943927e-07, "loss": -0.0018, "reward": 1.5199809074401855, "reward_std": 0.22173704206943512, "rewards/accuracy_reward_stage2": 0.6606058478355408, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3474 }, { "completion_length": 9.421875, "epoch": 0.6089013492202558, "grad_norm": 21.38906924678852, "kl": 0.1708984375, "learning_rate": 3.912738741895917e-07, "loss": 0.0242, "reward": 1.5925832986831665, "reward_std": 0.31878191232681274, "rewards/accuracy_reward_stage2": 0.7332083582878113, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3475 }, { "completion_length": 8.15625, "epoch": 0.6090765726301034, "grad_norm": 14.91514219628995, "kl": 0.1494140625, "learning_rate": 3.9109865077974415e-07, "loss": -0.0178, "reward": 1.5367786884307861, "reward_std": 0.22990933060646057, "rewards/accuracy_reward_stage2": 0.5680287480354309, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3476 }, { "completion_length": 6.0, "epoch": 0.6092517960399509, "grad_norm": 17.288656981091965, "kl": 0.150390625, "learning_rate": 3.909234273698966e-07, "loss": 0.0182, "reward": 1.673478364944458, "reward_std": 0.20205412805080414, "rewards/accuracy_reward_stage2": 0.8141033053398132, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3477 }, { "completion_length": 13.78125, "epoch": 0.6094270194497985, "grad_norm": 19.521573383820197, "kl": 0.1376953125, "learning_rate": 3.9074820396004903e-07, "loss": 0.0611, "reward": 1.3979265689849854, "reward_std": 0.1486382782459259, "rewards/accuracy_reward_stage2": 0.5229264497756958, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3478 }, { "completion_length": 10.015625, "epoch": 0.609602242859646, "grad_norm": 19.318238937602164, "kl": 0.1689453125, "learning_rate": 3.9057298055020147e-07, "loss": 0.0254, "reward": 1.523573637008667, "reward_std": 0.18456262350082397, "rewards/accuracy_reward_stage2": 0.5391986966133118, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3479 }, { "completion_length": 9.34375, "epoch": 0.6097774662694936, "grad_norm": 27.932749619056537, "kl": 0.3828125, "learning_rate": 3.9039775714035396e-07, "loss": -0.017, "reward": 1.5458049774169922, "reward_std": 0.387906551361084, "rewards/accuracy_reward_stage2": 0.6239298582077026, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3480 }, { "completion_length": 10.859375, "epoch": 0.6099526896793411, "grad_norm": 26.924752886684438, "kl": 0.271484375, "learning_rate": 3.902225337305064e-07, "loss": 0.0641, "reward": 1.078125, "reward_std": 0.19939783215522766, "rewards/accuracy_reward_stage2": 0.21875, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3481 }, { "completion_length": 10.984375, "epoch": 0.6101279130891887, "grad_norm": 22.92304205333242, "kl": 0.2294921875, "learning_rate": 3.900473103206588e-07, "loss": 0.0088, "reward": 1.5345546007156372, "reward_std": 0.23967134952545166, "rewards/accuracy_reward_stage2": 0.7064296007156372, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3482 }, { "completion_length": 7.625, "epoch": 0.6103031364990362, "grad_norm": 16.23284608235178, "kl": 0.1845703125, "learning_rate": 3.898720869108112e-07, "loss": -0.0396, "reward": 1.6320732831954956, "reward_std": 0.23797453939914703, "rewards/accuracy_reward_stage2": 0.6789483428001404, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3483 }, { "completion_length": 39.6875, "epoch": 0.6104783599088838, "grad_norm": 20.037190503078875, "kl": 0.189453125, "learning_rate": 3.896968635009637e-07, "loss": 0.005, "reward": 1.2335355281829834, "reward_std": 0.31477969884872437, "rewards/accuracy_reward_stage2": 0.3897854685783386, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3484 }, { "completion_length": 13.421875, "epoch": 0.6106535833187314, "grad_norm": 17.271775369446903, "kl": 0.09521484375, "learning_rate": 3.8952164009111616e-07, "loss": -0.0503, "reward": 1.5214704275131226, "reward_std": 0.18515348434448242, "rewards/accuracy_reward_stage2": 0.5527204275131226, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3485 }, { "completion_length": 7.359375, "epoch": 0.610828806728579, "grad_norm": 19.87792405646192, "kl": 0.0615234375, "learning_rate": 3.893464166812686e-07, "loss": 0.0084, "reward": 1.799248218536377, "reward_std": 0.1444234848022461, "rewards/accuracy_reward_stage2": 0.8148731589317322, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3486 }, { "completion_length": 11.0, "epoch": 0.6110040301384265, "grad_norm": 13.932438148758836, "kl": 0.049560546875, "learning_rate": 3.8917119327142104e-07, "loss": 0.0198, "reward": 1.4347407817840576, "reward_std": 0.11921636015176773, "rewards/accuracy_reward_stage2": 0.43474066257476807, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3487 }, { "completion_length": 16.546875, "epoch": 0.6111792535482741, "grad_norm": 20.81131020095072, "kl": 0.154296875, "learning_rate": 3.8899596986157353e-07, "loss": 0.0781, "reward": 1.1274425983428955, "reward_std": 0.18740572035312653, "rewards/accuracy_reward_stage2": 0.3774426579475403, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3488 }, { "completion_length": 10.140625, "epoch": 0.6113544769581216, "grad_norm": 20.707528478298844, "kl": 0.1005859375, "learning_rate": 3.888207464517259e-07, "loss": -0.004, "reward": 1.4854450225830078, "reward_std": 0.1825050711631775, "rewards/accuracy_reward_stage2": 0.626069962978363, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3489 }, { "completion_length": 8.453125, "epoch": 0.6115297003679692, "grad_norm": 21.871090043391153, "kl": 0.1669921875, "learning_rate": 3.8864552304187836e-07, "loss": 0.0042, "reward": 1.433689832687378, "reward_std": 0.22336667776107788, "rewards/accuracy_reward_stage2": 0.5899399518966675, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3490 }, { "completion_length": 9.109375, "epoch": 0.6117049237778167, "grad_norm": 39.55167830738417, "kl": 0.326171875, "learning_rate": 3.884702996320308e-07, "loss": 0.0546, "reward": 1.3851423263549805, "reward_std": 0.32784217596054077, "rewards/accuracy_reward_stage2": 0.5413922071456909, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3491 }, { "completion_length": 9.578125, "epoch": 0.6118801471876643, "grad_norm": 16.106440954832575, "kl": 0.162109375, "learning_rate": 3.882950762221833e-07, "loss": -0.018, "reward": 1.7969837188720703, "reward_std": 0.22703614830970764, "rewards/accuracy_reward_stage2": 0.8282337784767151, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3492 }, { "completion_length": 10.578125, "epoch": 0.6120553705975118, "grad_norm": 16.4256066595653, "kl": 0.1669921875, "learning_rate": 3.8811985281233573e-07, "loss": -0.0015, "reward": 1.6016194820404053, "reward_std": 0.18486538529396057, "rewards/accuracy_reward_stage2": 0.6328696012496948, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3493 }, { "completion_length": 8.4375, "epoch": 0.6122305940073594, "grad_norm": 17.446112304916678, "kl": 0.1494140625, "learning_rate": 3.8794462940248817e-07, "loss": 0.0196, "reward": 1.3984107971191406, "reward_std": 0.2758486866950989, "rewards/accuracy_reward_stage2": 0.5390357971191406, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3494 }, { "completion_length": 6.140625, "epoch": 0.6124058174172069, "grad_norm": 18.34918591608612, "kl": 0.19140625, "learning_rate": 3.8776940599264055e-07, "loss": -0.0011, "reward": 1.5423123836517334, "reward_std": 0.31714749336242676, "rewards/accuracy_reward_stage2": 0.5735623240470886, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3495 }, { "completion_length": 8.78125, "epoch": 0.6125810408270544, "grad_norm": 18.564979588961563, "kl": 0.1279296875, "learning_rate": 3.8759418258279305e-07, "loss": 0.007, "reward": 1.682100534439087, "reward_std": 0.24820661544799805, "rewards/accuracy_reward_stage2": 0.6977255344390869, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3496 }, { "completion_length": 11.265625, "epoch": 0.6127562642369021, "grad_norm": 18.11383222850136, "kl": 0.2392578125, "learning_rate": 3.874189591729455e-07, "loss": -0.0278, "reward": 1.5729455947875977, "reward_std": 0.3092484474182129, "rewards/accuracy_reward_stage2": 0.6354456543922424, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3497 }, { "completion_length": 13.03125, "epoch": 0.6129314876467497, "grad_norm": 21.12746990447389, "kl": 0.119140625, "learning_rate": 3.872437357630979e-07, "loss": 0.0033, "reward": 1.6204090118408203, "reward_std": 0.20224544405937195, "rewards/accuracy_reward_stage2": 0.6360338926315308, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3498 }, { "completion_length": 8.828125, "epoch": 0.6131067110565972, "grad_norm": 15.40110651828435, "kl": 0.078125, "learning_rate": 3.8706851235325037e-07, "loss": 0.0313, "reward": 1.8171948194503784, "reward_std": 0.137511745095253, "rewards/accuracy_reward_stage2": 0.8171948194503784, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3499 }, { "completion_length": 9.640625, "epoch": 0.6132819344664447, "grad_norm": 17.78726650450441, "kl": 0.03955078125, "learning_rate": 3.8689328894340286e-07, "loss": 0.0158, "reward": 1.6292483806610107, "reward_std": 0.2666996121406555, "rewards/accuracy_reward_stage2": 0.7542483806610107, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3500 }, { "completion_length": 15.484375, "epoch": 0.6134571578762923, "grad_norm": 17.83496345339617, "kl": 0.12158203125, "learning_rate": 3.8671806553355524e-07, "loss": -0.0372, "reward": 1.5006381273269653, "reward_std": 0.20013336837291718, "rewards/accuracy_reward_stage2": 0.6568880677223206, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3501 }, { "completion_length": 10.25, "epoch": 0.6136323812861398, "grad_norm": 18.81555321500434, "kl": 0.17578125, "learning_rate": 3.865428421237077e-07, "loss": -0.0392, "reward": 1.5049701929092407, "reward_std": 0.40888711810112, "rewards/accuracy_reward_stage2": 0.551845133304596, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3502 }, { "completion_length": 8.9375, "epoch": 0.6138076046959874, "grad_norm": 15.004712159632119, "kl": 0.07470703125, "learning_rate": 3.863676187138601e-07, "loss": 0.0083, "reward": 1.730872631072998, "reward_std": 0.15501649677753448, "rewards/accuracy_reward_stage2": 0.7464977502822876, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3503 }, { "completion_length": 7.609375, "epoch": 0.6139828281058349, "grad_norm": 17.480040288539744, "kl": 0.267578125, "learning_rate": 3.861923953040126e-07, "loss": -0.0607, "reward": 1.6104066371917725, "reward_std": 0.35269561409950256, "rewards/accuracy_reward_stage2": 0.6729066371917725, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3504 }, { "completion_length": 11.171875, "epoch": 0.6141580515156825, "grad_norm": 21.1540640328661, "kl": 0.345703125, "learning_rate": 3.8601717189416506e-07, "loss": 0.1385, "reward": 1.5312515497207642, "reward_std": 0.2088533341884613, "rewards/accuracy_reward_stage2": 0.7812516093254089, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3505 }, { "completion_length": 7.3125, "epoch": 0.61433327492553, "grad_norm": 19.042647399369816, "kl": 0.185546875, "learning_rate": 3.858419484843175e-07, "loss": 0.0121, "reward": 1.5548827648162842, "reward_std": 0.23560212552547455, "rewards/accuracy_reward_stage2": 0.5861326456069946, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3506 }, { "completion_length": 7.171875, "epoch": 0.6145084983353776, "grad_norm": 18.408079381299036, "kl": 0.193359375, "learning_rate": 3.8566672507446994e-07, "loss": -0.0061, "reward": 1.2742847204208374, "reward_std": 0.34039878845214844, "rewards/accuracy_reward_stage2": 0.4305347204208374, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3507 }, { "completion_length": 11.90625, "epoch": 0.6146837217452251, "grad_norm": 18.173494814858987, "kl": 0.22265625, "learning_rate": 3.854915016646224e-07, "loss": -0.0637, "reward": 1.5908520221710205, "reward_std": 0.35630887746810913, "rewards/accuracy_reward_stage2": 0.6533519625663757, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3508 }, { "completion_length": 9.5625, "epoch": 0.6148589451550727, "grad_norm": 15.490980096963543, "kl": 0.236328125, "learning_rate": 3.853162782547748e-07, "loss": 0.0217, "reward": 1.8262255191802979, "reward_std": 0.23086966574192047, "rewards/accuracy_reward_stage2": 0.8574756383895874, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3509 }, { "completion_length": 15.5, "epoch": 0.6150341685649203, "grad_norm": 15.170598682243257, "kl": 0.1708984375, "learning_rate": 3.8514105484492725e-07, "loss": 0.0009, "reward": 1.4647321701049805, "reward_std": 0.23844221234321594, "rewards/accuracy_reward_stage2": 0.6209821701049805, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3510 }, { "completion_length": 9.859375, "epoch": 0.6152093919747679, "grad_norm": 98.77143590844044, "kl": 0.7109375, "learning_rate": 3.849658314350797e-07, "loss": 0.1333, "reward": 1.40625, "reward_std": 0.1523548662662506, "rewards/accuracy_reward_stage2": 0.609375, "rewards/format_reward_stage1_pointerpad": 0.796875, "scores/accuracy_reward_stage2": 0.796875, "step": 3511 }, { "completion_length": 4.359375, "epoch": 0.6153846153846154, "grad_norm": 28.602906237292615, "kl": 0.1484375, "learning_rate": 3.847906080252322e-07, "loss": 0.0594, "reward": 1.734375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward_stage2": 0.734375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3512 }, { "completion_length": 8.671875, "epoch": 0.615559838794463, "grad_norm": 18.623178861985995, "kl": 0.1669921875, "learning_rate": 3.8461538461538463e-07, "loss": -0.0431, "reward": 1.6280450820922852, "reward_std": 0.33348989486694336, "rewards/accuracy_reward_stage2": 0.6749200224876404, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3513 }, { "completion_length": 4.90625, "epoch": 0.6157350622043105, "grad_norm": 11.466048889383227, "kl": 0.04150390625, "learning_rate": 3.84440161205537e-07, "loss": -0.0276, "reward": 1.5, "reward_std": 0.0883883461356163, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3514 }, { "completion_length": 13.015625, "epoch": 0.6159102856141581, "grad_norm": 22.1078229298361, "kl": 0.04052734375, "learning_rate": 3.8426493779568945e-07, "loss": 0.0162, "reward": 1.3823972940444946, "reward_std": 0.2699623703956604, "rewards/accuracy_reward_stage2": 0.5073972940444946, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3515 }, { "completion_length": 12.078125, "epoch": 0.6160855090240056, "grad_norm": 26.06034416396247, "kl": 0.11181640625, "learning_rate": 3.8408971438584195e-07, "loss": 0.0447, "reward": 1.6742892265319824, "reward_std": 0.2498525083065033, "rewards/accuracy_reward_stage2": 0.6742891073226929, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3516 }, { "completion_length": 13.328125, "epoch": 0.6162607324338532, "grad_norm": 9.834568489716801, "kl": 0.087890625, "learning_rate": 3.839144909759944e-07, "loss": -0.0089, "reward": 1.5580108165740967, "reward_std": 0.09554215520620346, "rewards/accuracy_reward_stage2": 0.5736356973648071, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3517 }, { "completion_length": 6.546875, "epoch": 0.6164359558437007, "grad_norm": 16.192663583818383, "kl": 0.1513671875, "learning_rate": 3.837392675661468e-07, "loss": 0.0391, "reward": 1.5547268390655518, "reward_std": 0.12057439982891083, "rewards/accuracy_reward_stage2": 0.5703518986701965, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3518 }, { "completion_length": 9.40625, "epoch": 0.6166111792535482, "grad_norm": 16.394920671685842, "kl": 0.1796875, "learning_rate": 3.8356404415629926e-07, "loss": 0.0042, "reward": 1.59375, "reward_std": 0.19149437546730042, "rewards/accuracy_reward_stage2": 0.75, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3519 }, { "completion_length": 10.890625, "epoch": 0.6167864026633958, "grad_norm": 17.074565592754322, "kl": 0.103515625, "learning_rate": 3.8338882074645176e-07, "loss": 0.0108, "reward": 1.405001163482666, "reward_std": 0.2863079905509949, "rewards/accuracy_reward_stage2": 0.545626163482666, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3520 }, { "completion_length": 8.75, "epoch": 0.6169616260732433, "grad_norm": 13.41627549720222, "kl": 0.2041015625, "learning_rate": 3.8321359733660414e-07, "loss": -0.0509, "reward": 1.6864609718322754, "reward_std": 0.182004913687706, "rewards/accuracy_reward_stage2": 0.7333359122276306, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3521 }, { "completion_length": 11.15625, "epoch": 0.6171368494830909, "grad_norm": 25.295103218557728, "kl": 0.224609375, "learning_rate": 3.830383739267566e-07, "loss": 0.0146, "reward": 1.5392603874206543, "reward_std": 0.2501576840877533, "rewards/accuracy_reward_stage2": 0.6955103874206543, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3522 }, { "completion_length": 10.515625, "epoch": 0.6173120728929385, "grad_norm": 17.035368352296782, "kl": 0.466796875, "learning_rate": 3.82863150516909e-07, "loss": 0.1139, "reward": 1.4368441104888916, "reward_std": 0.2833743095397949, "rewards/accuracy_reward_stage2": 0.5930941104888916, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3523 }, { "completion_length": 10.25, "epoch": 0.6174872963027861, "grad_norm": 16.55324156650657, "kl": 0.1953125, "learning_rate": 3.826879271070615e-07, "loss": -0.0034, "reward": 1.6485011577606201, "reward_std": 0.3175305128097534, "rewards/accuracy_reward_stage2": 0.6797511577606201, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3524 }, { "completion_length": 7.0625, "epoch": 0.6176625197126336, "grad_norm": 20.007918759937983, "kl": 0.111328125, "learning_rate": 3.8251270369721396e-07, "loss": 0.0445, "reward": 1.440403938293457, "reward_std": 0.22194069623947144, "rewards/accuracy_reward_stage2": 0.5654039978981018, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3525 }, { "completion_length": 11.09375, "epoch": 0.6178377431224812, "grad_norm": 17.33559494877162, "kl": 0.193359375, "learning_rate": 3.823374802873664e-07, "loss": 0.0333, "reward": 1.4666603803634644, "reward_std": 0.137288898229599, "rewards/accuracy_reward_stage2": 0.6072853803634644, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3526 }, { "completion_length": 10.40625, "epoch": 0.6180129665323287, "grad_norm": 10.617484295557372, "kl": 0.1630859375, "learning_rate": 3.821622568775188e-07, "loss": 0.0208, "reward": 1.38825261592865, "reward_std": 0.13479651510715485, "rewards/accuracy_reward_stage2": 0.5288775563240051, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3527 }, { "completion_length": 10.671875, "epoch": 0.6181881899421763, "grad_norm": 17.033987129441545, "kl": 0.21875, "learning_rate": 3.819870334676713e-07, "loss": 0.0434, "reward": 1.446754813194275, "reward_std": 0.23349672555923462, "rewards/accuracy_reward_stage2": 0.5873798131942749, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3528 }, { "completion_length": 8.71875, "epoch": 0.6183634133520238, "grad_norm": 15.973752398998958, "kl": 0.193359375, "learning_rate": 3.818118100578237e-07, "loss": 0.044, "reward": 1.4383138418197632, "reward_std": 0.2546946406364441, "rewards/accuracy_reward_stage2": 0.5789388418197632, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3529 }, { "completion_length": 9.28125, "epoch": 0.6185386367618714, "grad_norm": 17.20861765125097, "kl": 0.2138671875, "learning_rate": 3.8163658664797615e-07, "loss": -0.0312, "reward": 1.8370803594589233, "reward_std": 0.21137744188308716, "rewards/accuracy_reward_stage2": 0.8839553594589233, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3530 }, { "completion_length": 8.046875, "epoch": 0.6187138601717189, "grad_norm": 16.159578362111386, "kl": 0.2138671875, "learning_rate": 3.814613632381286e-07, "loss": -0.0363, "reward": 1.539421796798706, "reward_std": 0.29987043142318726, "rewards/accuracy_reward_stage2": 0.5862968564033508, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3531 }, { "completion_length": 10.03125, "epoch": 0.6188890835815665, "grad_norm": 21.416384338002235, "kl": 0.1396484375, "learning_rate": 3.812861398282811e-07, "loss": 0.0271, "reward": 1.5553689002990723, "reward_std": 0.26087063550949097, "rewards/accuracy_reward_stage2": 0.5709939002990723, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3532 }, { "completion_length": 10.484375, "epoch": 0.619064306991414, "grad_norm": 24.629877674014825, "kl": 0.10498046875, "learning_rate": 3.8111091641843347e-07, "loss": 0.042, "reward": 1.4916770458221436, "reward_std": 0.2873051166534424, "rewards/accuracy_reward_stage2": 0.49167704582214355, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3533 }, { "completion_length": 8.59375, "epoch": 0.6192395304012616, "grad_norm": 17.835054328540952, "kl": 0.1416015625, "learning_rate": 3.809356930085859e-07, "loss": -0.0317, "reward": 1.408469796180725, "reward_std": 0.17870807647705078, "rewards/accuracy_reward_stage2": 0.4397197961807251, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3534 }, { "completion_length": 10.0625, "epoch": 0.6194147538111091, "grad_norm": 18.655373426418993, "kl": 0.1533203125, "learning_rate": 3.8076046959873835e-07, "loss": 0.0171, "reward": 1.6664772033691406, "reward_std": 0.2817244529724121, "rewards/accuracy_reward_stage2": 0.6821021437644958, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3535 }, { "completion_length": 9.859375, "epoch": 0.6195899772209568, "grad_norm": 18.72390316211415, "kl": 0.37109375, "learning_rate": 3.8058524618889084e-07, "loss": -0.0262, "reward": 1.4409722089767456, "reward_std": 0.2888485789299011, "rewards/accuracy_reward_stage2": 0.5190972089767456, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3536 }, { "completion_length": 8.734375, "epoch": 0.6197652006308043, "grad_norm": 21.898155977185976, "kl": 0.1328125, "learning_rate": 3.804100227790433e-07, "loss": 0.0403, "reward": 1.5529836416244507, "reward_std": 0.26357075572013855, "rewards/accuracy_reward_stage2": 0.6936086416244507, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3537 }, { "completion_length": 6.734375, "epoch": 0.6199404240406519, "grad_norm": 21.017998866038578, "kl": 0.26953125, "learning_rate": 3.802347993691957e-07, "loss": -0.0248, "reward": 1.6154170036315918, "reward_std": 0.27425822615623474, "rewards/accuracy_reward_stage2": 0.662291944026947, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3538 }, { "completion_length": 9.65625, "epoch": 0.6201156474504994, "grad_norm": 24.37689125398419, "kl": 0.390625, "learning_rate": 3.8005957595934816e-07, "loss": 0.1059, "reward": 1.3345599174499512, "reward_std": 0.24926936626434326, "rewards/accuracy_reward_stage2": 0.6158099174499512, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3539 }, { "completion_length": 10.078125, "epoch": 0.620290870860347, "grad_norm": 20.6787583059043, "kl": 0.1259765625, "learning_rate": 3.7988435254950055e-07, "loss": 0.0503, "reward": 1.629636526107788, "reward_std": 0.24759991466999054, "rewards/accuracy_reward_stage2": 0.6296364665031433, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3540 }, { "completion_length": 10.65625, "epoch": 0.6204660942701945, "grad_norm": 14.225070278546896, "kl": 0.08544921875, "learning_rate": 3.7970912913965304e-07, "loss": -0.0485, "reward": 1.6456576585769653, "reward_std": 0.1932823807001114, "rewards/accuracy_reward_stage2": 0.6769076585769653, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3541 }, { "completion_length": 8.875, "epoch": 0.620641317680042, "grad_norm": 17.902675872340364, "kl": 0.1611328125, "learning_rate": 3.795339057298055e-07, "loss": 0.0205, "reward": 1.6116595268249512, "reward_std": 0.21022561192512512, "rewards/accuracy_reward_stage2": 0.6272845268249512, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3542 }, { "completion_length": 12.09375, "epoch": 0.6208165410898896, "grad_norm": 20.440861631124083, "kl": 0.291015625, "learning_rate": 3.793586823199579e-07, "loss": -0.0145, "reward": 1.5649584531784058, "reward_std": 0.3023066520690918, "rewards/accuracy_reward_stage2": 0.6274584531784058, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3543 }, { "completion_length": 17.28125, "epoch": 0.6209917644997371, "grad_norm": 14.588766645869493, "kl": 0.1474609375, "learning_rate": 3.791834589101104e-07, "loss": -0.0292, "reward": 1.483196496963501, "reward_std": 0.20009317994117737, "rewards/accuracy_reward_stage2": 0.6394466161727905, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3544 }, { "completion_length": 28.796875, "epoch": 0.6211669879095847, "grad_norm": 21.009877199345457, "kl": 0.1103515625, "learning_rate": 3.7900823550026285e-07, "loss": 0.0, "reward": 1.5804895162582397, "reward_std": 0.1696164608001709, "rewards/accuracy_reward_stage2": 0.5961145162582397, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3545 }, { "completion_length": 11.9375, "epoch": 0.6213422113194322, "grad_norm": 14.908152840058406, "kl": 0.060546875, "learning_rate": 3.7883301209041524e-07, "loss": 0.0242, "reward": 1.5572917461395264, "reward_std": 0.1236191987991333, "rewards/accuracy_reward_stage2": 0.5572916865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3546 }, { "completion_length": 9.21875, "epoch": 0.6215174347292798, "grad_norm": 16.157423263961284, "kl": 0.083984375, "learning_rate": 3.786577886805677e-07, "loss": 0.0239, "reward": 1.5393553972244263, "reward_std": 0.1936211735010147, "rewards/accuracy_reward_stage2": 0.5549803972244263, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3547 }, { "completion_length": 7.578125, "epoch": 0.6216926581391274, "grad_norm": 15.889175938523843, "kl": 0.05908203125, "learning_rate": 3.784825652707201e-07, "loss": -0.0206, "reward": 1.8656994104385376, "reward_std": 0.17255815863609314, "rewards/accuracy_reward_stage2": 0.8813244104385376, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3548 }, { "completion_length": 10.671875, "epoch": 0.621867881548975, "grad_norm": 19.960541953323727, "kl": 0.1220703125, "learning_rate": 3.783073418608726e-07, "loss": 0.0488, "reward": 1.6864854097366333, "reward_std": 0.23788943886756897, "rewards/accuracy_reward_stage2": 0.8114853501319885, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3549 }, { "completion_length": 9.15625, "epoch": 0.6220431049588225, "grad_norm": 16.951175313083127, "kl": 0.11279296875, "learning_rate": 3.7813211845102505e-07, "loss": 0.0008, "reward": 1.6322424411773682, "reward_std": 0.2910325527191162, "rewards/accuracy_reward_stage2": 0.7728673815727234, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3550 }, { "completion_length": 8.59375, "epoch": 0.6222183283686701, "grad_norm": 15.89765695432175, "kl": 0.05078125, "learning_rate": 3.779568950411775e-07, "loss": 0.0204, "reward": 1.7006888389587402, "reward_std": 0.10197651386260986, "rewards/accuracy_reward_stage2": 0.7006887197494507, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3551 }, { "completion_length": 12.203125, "epoch": 0.6223935517785176, "grad_norm": 20.326690153829755, "kl": 0.318359375, "learning_rate": 3.777816716313299e-07, "loss": 0.0028, "reward": 1.4471937417984009, "reward_std": 0.2985219359397888, "rewards/accuracy_reward_stage2": 0.4940687119960785, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3552 }, { "completion_length": 7.390625, "epoch": 0.6225687751883652, "grad_norm": 15.092191451627144, "kl": 0.2373046875, "learning_rate": 3.7760644822148237e-07, "loss": -0.0871, "reward": 1.6521281003952026, "reward_std": 0.2880202829837799, "rewards/accuracy_reward_stage2": 0.7302531003952026, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3553 }, { "completion_length": 8.65625, "epoch": 0.6227439985982127, "grad_norm": 17.99898596971007, "kl": 0.11767578125, "learning_rate": 3.774312248116348e-07, "loss": -0.0413, "reward": 1.7212051153182983, "reward_std": 0.29289162158966064, "rewards/accuracy_reward_stage2": 0.7524551153182983, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3554 }, { "completion_length": 6.859375, "epoch": 0.6229192220080603, "grad_norm": 16.940170418921618, "kl": 0.1708984375, "learning_rate": 3.7725600140178725e-07, "loss": 0.0244, "reward": 1.7089645862579346, "reward_std": 0.11606550216674805, "rewards/accuracy_reward_stage2": 0.8495896458625793, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3555 }, { "completion_length": 5.90625, "epoch": 0.6230944454179078, "grad_norm": 17.429080589459772, "kl": 0.09765625, "learning_rate": 3.770807779919397e-07, "loss": 0.0391, "reward": 1.4759080410003662, "reward_std": 0.18674036860466003, "rewards/accuracy_reward_stage2": 0.600908100605011, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3556 }, { "completion_length": 9.375, "epoch": 0.6232696688277554, "grad_norm": 20.458137533190115, "kl": 0.130859375, "learning_rate": 3.769055545820922e-07, "loss": 0.0194, "reward": 1.657368540763855, "reward_std": 0.31080591678619385, "rewards/accuracy_reward_stage2": 0.672993540763855, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3557 }, { "completion_length": 8.40625, "epoch": 0.6234448922376029, "grad_norm": 13.461876617629324, "kl": 0.10498046875, "learning_rate": 3.767303311722446e-07, "loss": 0.042, "reward": 1.8790143728256226, "reward_std": 0.12620574235916138, "rewards/accuracy_reward_stage2": 0.8790143728256226, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3558 }, { "completion_length": 10.3125, "epoch": 0.6236201156474505, "grad_norm": 21.25344102951621, "kl": 0.2470703125, "learning_rate": 3.76555107762397e-07, "loss": 0.022, "reward": 1.06238853931427, "reward_std": 0.3604205250740051, "rewards/accuracy_reward_stage2": 0.59363853931427, "rewards/format_reward_stage1_pointerpad": 0.46875, "scores/accuracy_reward_stage2": 0.46875, "step": 3559 }, { "completion_length": 13.28125, "epoch": 0.623795339057298, "grad_norm": 17.725282912474576, "kl": 0.095703125, "learning_rate": 3.7637988435254945e-07, "loss": 0.0382, "reward": 1.5480883121490479, "reward_std": 0.2119196355342865, "rewards/accuracy_reward_stage2": 0.5480883121490479, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3560 }, { "completion_length": 13.328125, "epoch": 0.6239705624671457, "grad_norm": 23.053761024253866, "kl": 0.11279296875, "learning_rate": 3.7620466094270194e-07, "loss": 0.045, "reward": 1.4253368377685547, "reward_std": 0.22176837921142578, "rewards/accuracy_reward_stage2": 0.6753367185592651, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3561 }, { "completion_length": 11.25, "epoch": 0.6241457858769932, "grad_norm": 264.2225226480569, "kl": 1.2265625, "learning_rate": 3.760294375328544e-07, "loss": 0.3575, "reward": 1.4801325798034668, "reward_std": 0.20424708724021912, "rewards/accuracy_reward_stage2": 0.5426324605941772, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3562 }, { "completion_length": 9.015625, "epoch": 0.6243210092868408, "grad_norm": 18.901975442316548, "kl": 0.11279296875, "learning_rate": 3.758542141230068e-07, "loss": 0.0453, "reward": 1.456575632095337, "reward_std": 0.15833720564842224, "rewards/accuracy_reward_stage2": 0.4565756320953369, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3563 }, { "completion_length": 9.390625, "epoch": 0.6244962326966883, "grad_norm": 17.899991222909144, "kl": 0.1005859375, "learning_rate": 3.7567899071315926e-07, "loss": 0.0191, "reward": 1.4772353172302246, "reward_std": 0.21017813682556152, "rewards/accuracy_reward_stage2": 0.49286025762557983, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3564 }, { "completion_length": 8.765625, "epoch": 0.6246714561065358, "grad_norm": 11.97891870659925, "kl": 0.020263671875, "learning_rate": 3.755037673033117e-07, "loss": 0.0081, "reward": 1.6755764484405518, "reward_std": 0.14607882499694824, "rewards/accuracy_reward_stage2": 0.6755764484405518, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3565 }, { "completion_length": 15.734375, "epoch": 0.6248466795163834, "grad_norm": 15.662134755135996, "kl": 0.08349609375, "learning_rate": 3.7532854389346414e-07, "loss": -0.0102, "reward": 1.5459709167480469, "reward_std": 0.23149898648262024, "rewards/accuracy_reward_stage2": 0.6865959167480469, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3566 }, { "completion_length": 9.671875, "epoch": 0.6250219029262309, "grad_norm": 17.250786101493915, "kl": 0.12060546875, "learning_rate": 3.751533204836166e-07, "loss": 0.0482, "reward": 1.4936150312423706, "reward_std": 0.16181641817092896, "rewards/accuracy_reward_stage2": 0.6186150312423706, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3567 }, { "completion_length": 8.859375, "epoch": 0.6251971263360785, "grad_norm": 21.18063832472489, "kl": 0.1298828125, "learning_rate": 3.74978097073769e-07, "loss": -0.0081, "reward": 1.6339540481567383, "reward_std": 0.20301076769828796, "rewards/accuracy_reward_stage2": 0.6652040481567383, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3568 }, { "completion_length": 9.171875, "epoch": 0.625372349745926, "grad_norm": 16.632840171437252, "kl": 0.1533203125, "learning_rate": 3.748028736639215e-07, "loss": 0.0169, "reward": 1.4851830005645752, "reward_std": 0.1846974790096283, "rewards/accuracy_reward_stage2": 0.5008080005645752, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3569 }, { "completion_length": 24.921875, "epoch": 0.6255475731557736, "grad_norm": 19.56369931496008, "kl": 0.11376953125, "learning_rate": 3.7462765025407395e-07, "loss": 0.0119, "reward": 1.5117580890655518, "reward_std": 0.24193823337554932, "rewards/accuracy_reward_stage2": 0.5273829698562622, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3570 }, { "completion_length": 17.390625, "epoch": 0.6257227965656211, "grad_norm": 31.59243233511247, "kl": 0.29296875, "learning_rate": 3.744524268442264e-07, "loss": 0.0291, "reward": 1.5098161697387695, "reward_std": 0.29780054092407227, "rewards/accuracy_reward_stage2": 0.5410662889480591, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3571 }, { "completion_length": 9.875, "epoch": 0.6258980199754687, "grad_norm": 16.968677555978577, "kl": 0.1552734375, "learning_rate": 3.742772034343788e-07, "loss": -0.0247, "reward": 1.6412062644958496, "reward_std": 0.2023872286081314, "rewards/accuracy_reward_stage2": 0.7974562048912048, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3572 }, { "completion_length": 13.25, "epoch": 0.6260732433853162, "grad_norm": 23.035582765314654, "kl": 0.09326171875, "learning_rate": 3.7410198002453127e-07, "loss": 0.0374, "reward": 1.68117094039917, "reward_std": 0.23287498950958252, "rewards/accuracy_reward_stage2": 0.6811710000038147, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3573 }, { "completion_length": 8.796875, "epoch": 0.6262484667951639, "grad_norm": 17.890803563885793, "kl": 0.1318359375, "learning_rate": 3.739267566146837e-07, "loss": 0.0155, "reward": 1.4391281604766846, "reward_std": 0.18224698305130005, "rewards/accuracy_reward_stage2": 0.5797532200813293, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3574 }, { "completion_length": 11.78125, "epoch": 0.6264236902050114, "grad_norm": 18.294483074014767, "kl": 0.10791015625, "learning_rate": 3.7375153320483615e-07, "loss": 0.043, "reward": 1.550438642501831, "reward_std": 0.13361681997776031, "rewards/accuracy_reward_stage2": 0.6754387617111206, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3575 }, { "completion_length": 9.46875, "epoch": 0.626598913614859, "grad_norm": 19.645375528005765, "kl": 0.11474609375, "learning_rate": 3.735763097949886e-07, "loss": -0.0072, "reward": 1.514617681503296, "reward_std": 0.25796785950660706, "rewards/accuracy_reward_stage2": 0.5458677411079407, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3576 }, { "completion_length": 11.203125, "epoch": 0.6267741370247065, "grad_norm": 24.349105508571775, "kl": 0.3671875, "learning_rate": 3.734010863851411e-07, "loss": 0.0613, "reward": 1.490251898765564, "reward_std": 0.15121126174926758, "rewards/accuracy_reward_stage2": 0.521501898765564, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3577 }, { "completion_length": 7.453125, "epoch": 0.6269493604345541, "grad_norm": 20.045855738595996, "kl": 0.19140625, "learning_rate": 3.7322586297529347e-07, "loss": 0.0449, "reward": 1.5475776195526123, "reward_std": 0.2927602231502533, "rewards/accuracy_reward_stage2": 0.5632026195526123, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3578 }, { "completion_length": 7.65625, "epoch": 0.6271245838444016, "grad_norm": 18.788063060436127, "kl": 0.14453125, "learning_rate": 3.730506395654459e-07, "loss": 0.0194, "reward": 1.6328332424163818, "reward_std": 0.1912064403295517, "rewards/accuracy_reward_stage2": 0.6484582424163818, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3579 }, { "completion_length": 12.84375, "epoch": 0.6272998072542492, "grad_norm": 18.013975028757724, "kl": 0.1884765625, "learning_rate": 3.7287541615559835e-07, "loss": -0.0024, "reward": 1.3137900829315186, "reward_std": 0.30171117186546326, "rewards/accuracy_reward_stage2": 0.3450400233268738, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3580 }, { "completion_length": 8.84375, "epoch": 0.6274750306640967, "grad_norm": 20.12558351071326, "kl": 0.036376953125, "learning_rate": 3.7270019274575084e-07, "loss": 0.0146, "reward": 1.6001933813095093, "reward_std": 0.14768247306346893, "rewards/accuracy_reward_stage2": 0.600193440914154, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3581 }, { "completion_length": 9.390625, "epoch": 0.6276502540739443, "grad_norm": 20.643454936736447, "kl": 0.1689453125, "learning_rate": 3.725249693359033e-07, "loss": 0.0235, "reward": 1.3290756940841675, "reward_std": 0.14735127985477448, "rewards/accuracy_reward_stage2": 0.46970072388648987, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3582 }, { "completion_length": 9.625, "epoch": 0.6278254774837918, "grad_norm": 22.35769949056351, "kl": 0.2294921875, "learning_rate": 3.723497459260557e-07, "loss": 0.0223, "reward": 1.5896830558776855, "reward_std": 0.32479965686798096, "rewards/accuracy_reward_stage2": 0.620932936668396, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3583 }, { "completion_length": 14.984375, "epoch": 0.6280007008936394, "grad_norm": 14.017040088597355, "kl": 0.05712890625, "learning_rate": 3.721745225162081e-07, "loss": -0.0207, "reward": 1.4739978313446045, "reward_std": 0.16686731576919556, "rewards/accuracy_reward_stage2": 0.48962289094924927, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3584 }, { "completion_length": 7.78125, "epoch": 0.6281759243034869, "grad_norm": 22.618567158406375, "kl": 0.11865234375, "learning_rate": 3.719992991063606e-07, "loss": 0.0474, "reward": 1.9034717082977295, "reward_std": 0.1325203776359558, "rewards/accuracy_reward_stage2": 0.9034717082977295, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3585 }, { "completion_length": 12.0, "epoch": 0.6283511477133346, "grad_norm": 19.571473796459003, "kl": 0.06494140625, "learning_rate": 3.7182407569651304e-07, "loss": 0.0259, "reward": 1.6627893447875977, "reward_std": 0.3550585210323334, "rewards/accuracy_reward_stage2": 0.6627893447875977, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3586 }, { "completion_length": 11.125, "epoch": 0.6285263711231821, "grad_norm": 20.24306501516782, "kl": 0.205078125, "learning_rate": 3.716488522866655e-07, "loss": 0.0378, "reward": 1.3590320348739624, "reward_std": 0.2013445496559143, "rewards/accuracy_reward_stage2": 0.4996569752693176, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3587 }, { "completion_length": 11.1875, "epoch": 0.6287015945330297, "grad_norm": 19.460810861658555, "kl": 0.197265625, "learning_rate": 3.714736288768179e-07, "loss": 0.013, "reward": 1.4457964897155762, "reward_std": 0.19221317768096924, "rewards/accuracy_reward_stage2": 0.602046549320221, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3588 }, { "completion_length": 7.53125, "epoch": 0.6288768179428772, "grad_norm": 23.264413559011846, "kl": 0.10546875, "learning_rate": 3.712984054669704e-07, "loss": 0.042, "reward": 1.7394332885742188, "reward_std": 0.20407696068286896, "rewards/accuracy_reward_stage2": 0.7394333481788635, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3589 }, { "completion_length": 11.015625, "epoch": 0.6290520413527247, "grad_norm": 22.753318894193853, "kl": 0.1005859375, "learning_rate": 3.7112318205712285e-07, "loss": 0.0402, "reward": 1.489595651626587, "reward_std": 0.2987571656703949, "rewards/accuracy_reward_stage2": 0.48959559202194214, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3590 }, { "completion_length": 11.0625, "epoch": 0.6292272647625723, "grad_norm": 19.31772370933449, "kl": 0.02783203125, "learning_rate": 3.7094795864727524e-07, "loss": 0.0111, "reward": 1.5470237731933594, "reward_std": 0.22848649322986603, "rewards/accuracy_reward_stage2": 0.5470237731933594, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3591 }, { "completion_length": 7.984375, "epoch": 0.6294024881724198, "grad_norm": 19.530075409106047, "kl": 0.1259765625, "learning_rate": 3.707727352374277e-07, "loss": -0.036, "reward": 1.4901680946350098, "reward_std": 0.2815321087837219, "rewards/accuracy_reward_stage2": 0.6464180946350098, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3592 }, { "completion_length": 10.90625, "epoch": 0.6295777115822674, "grad_norm": 18.345966770518224, "kl": 0.197265625, "learning_rate": 3.7059751182758017e-07, "loss": -0.0039, "reward": 1.6659647226333618, "reward_std": 0.2305700182914734, "rewards/accuracy_reward_stage2": 0.6972147226333618, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3593 }, { "completion_length": 11.03125, "epoch": 0.6297529349921149, "grad_norm": 21.17499941283475, "kl": 0.22265625, "learning_rate": 3.704222884177326e-07, "loss": -0.0711, "reward": 1.5663416385650635, "reward_std": 0.3491690754890442, "rewards/accuracy_reward_stage2": 0.6444666385650635, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3594 }, { "completion_length": 9.671875, "epoch": 0.6299281584019625, "grad_norm": 14.065463779336932, "kl": 0.146484375, "learning_rate": 3.7024706500788505e-07, "loss": -0.0295, "reward": 1.5848331451416016, "reward_std": 0.18891380727291107, "rewards/accuracy_reward_stage2": 0.6160831451416016, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3595 }, { "completion_length": 7.15625, "epoch": 0.63010338181181, "grad_norm": 21.11957075148225, "kl": 0.1328125, "learning_rate": 3.700718415980375e-07, "loss": 0.0242, "reward": 1.5459332466125488, "reward_std": 0.25915971398353577, "rewards/accuracy_reward_stage2": 0.5615583658218384, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3596 }, { "completion_length": 12.671875, "epoch": 0.6302786052216576, "grad_norm": 19.24932292881549, "kl": 0.263671875, "learning_rate": 3.698966181881899e-07, "loss": 0.0387, "reward": 1.4856727123260498, "reward_std": 0.3313537538051605, "rewards/accuracy_reward_stage2": 0.6419227719306946, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3597 }, { "completion_length": 12.65625, "epoch": 0.6304538286315051, "grad_norm": 15.765998728489336, "kl": 0.1015625, "learning_rate": 3.6972139477834237e-07, "loss": 0.0408, "reward": 1.6163502931594849, "reward_std": 0.09977184236049652, "rewards/accuracy_reward_stage2": 0.6163503527641296, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3598 }, { "completion_length": 9.46875, "epoch": 0.6306290520413528, "grad_norm": 6.389241941568242, "kl": 0.05810546875, "learning_rate": 3.695461713684948e-07, "loss": -0.0102, "reward": 1.5, "reward_std": 0.06681530922651291, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3599 }, { "completion_length": 10.453125, "epoch": 0.6308042754512003, "grad_norm": 18.805567669260828, "kl": 0.150390625, "learning_rate": 3.6937094795864725e-07, "loss": 0.016, "reward": 1.7800894975662231, "reward_std": 0.1909758448600769, "rewards/accuracy_reward_stage2": 0.7957144975662231, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3600 }, { "completion_length": 11.125, "epoch": 0.6309794988610479, "grad_norm": 19.18921620561482, "kl": 0.1376953125, "learning_rate": 3.6919572454879974e-07, "loss": -0.0322, "reward": 1.4479596614837646, "reward_std": 0.2607450485229492, "rewards/accuracy_reward_stage2": 0.4948346018791199, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3601 }, { "completion_length": 11.25, "epoch": 0.6311547222708954, "grad_norm": 16.58693480436364, "kl": 0.04638671875, "learning_rate": 3.690205011389522e-07, "loss": 0.0185, "reward": 1.5900869369506836, "reward_std": 0.13465861976146698, "rewards/accuracy_reward_stage2": 0.5900869369506836, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3602 }, { "completion_length": 12.09375, "epoch": 0.631329945680743, "grad_norm": 14.017275262683858, "kl": 0.1162109375, "learning_rate": 3.6884527772910456e-07, "loss": 0.0109, "reward": 1.3748043775558472, "reward_std": 0.25310125946998596, "rewards/accuracy_reward_stage2": 0.6404293179512024, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3603 }, { "completion_length": 7.203125, "epoch": 0.6315051690905905, "grad_norm": 17.0023380519842, "kl": 0.15625, "learning_rate": 3.68670054319257e-07, "loss": 0.0182, "reward": 1.296875, "reward_std": 0.25726157426834106, "rewards/accuracy_reward_stage2": 0.5625, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3604 }, { "completion_length": 9.078125, "epoch": 0.6316803925004381, "grad_norm": 17.37072153704894, "kl": 0.07421875, "learning_rate": 3.684948309094095e-07, "loss": 0.0298, "reward": 1.6908620595932007, "reward_std": 0.15819929540157318, "rewards/accuracy_reward_stage2": 0.6908620595932007, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3605 }, { "completion_length": 8.4375, "epoch": 0.6318556159102856, "grad_norm": 17.15164377499678, "kl": 0.1572265625, "learning_rate": 3.6831960749956194e-07, "loss": 0.0433, "reward": 1.407257318496704, "reward_std": 0.17434148490428925, "rewards/accuracy_reward_stage2": 0.4385073184967041, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3606 }, { "completion_length": 8.375, "epoch": 0.6320308393201332, "grad_norm": 13.219982651126623, "kl": 0.1044921875, "learning_rate": 3.681443840897144e-07, "loss": 0.0047, "reward": 1.6666977405548096, "reward_std": 0.082484170794487, "rewards/accuracy_reward_stage2": 0.8073228597640991, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3607 }, { "completion_length": 33.09375, "epoch": 0.6322060627299807, "grad_norm": 20.00487017872797, "kl": 0.123046875, "learning_rate": 3.679691606798668e-07, "loss": 0.0052, "reward": 1.6811567544937134, "reward_std": 0.2361474633216858, "rewards/accuracy_reward_stage2": 0.6967816352844238, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3608 }, { "completion_length": 10.28125, "epoch": 0.6323812861398282, "grad_norm": 20.55657196512188, "kl": 0.045166015625, "learning_rate": 3.677939372700193e-07, "loss": 0.0181, "reward": 1.3361544609069824, "reward_std": 0.2871898114681244, "rewards/accuracy_reward_stage2": 0.33615443110466003, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3609 }, { "completion_length": 7.140625, "epoch": 0.6325565095496758, "grad_norm": 16.762029088279338, "kl": 0.1005859375, "learning_rate": 3.676187138601717e-07, "loss": 0.0402, "reward": 1.59446382522583, "reward_std": 0.15326127409934998, "rewards/accuracy_reward_stage2": 0.5944638848304749, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3610 }, { "completion_length": 7.515625, "epoch": 0.6327317329595233, "grad_norm": 23.174955130615718, "kl": 0.1904296875, "learning_rate": 3.6744349045032413e-07, "loss": 0.01, "reward": 1.806498408317566, "reward_std": 0.23249441385269165, "rewards/accuracy_reward_stage2": 0.8377484083175659, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3611 }, { "completion_length": 8.21875, "epoch": 0.632906956369371, "grad_norm": 20.866554282875175, "kl": 0.103515625, "learning_rate": 3.672682670404766e-07, "loss": 0.0144, "reward": 1.630176067352295, "reward_std": 0.23904339969158173, "rewards/accuracy_reward_stage2": 0.6458011269569397, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3612 }, { "completion_length": 11.828125, "epoch": 0.6330821797792185, "grad_norm": 18.307211353396795, "kl": 0.169921875, "learning_rate": 3.6709304363062907e-07, "loss": -0.0082, "reward": 1.4270917177200317, "reward_std": 0.200740247964859, "rewards/accuracy_reward_stage2": 0.5833417177200317, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3613 }, { "completion_length": 10.9375, "epoch": 0.6332574031890661, "grad_norm": 18.065419136464577, "kl": 0.11376953125, "learning_rate": 3.669178202207815e-07, "loss": 0.024, "reward": 1.4780011177062988, "reward_std": 0.2074006348848343, "rewards/accuracy_reward_stage2": 0.4936261773109436, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3614 }, { "completion_length": 7.546875, "epoch": 0.6334326265989136, "grad_norm": 16.572006317014594, "kl": 0.048828125, "learning_rate": 3.6674259681093395e-07, "loss": 0.0196, "reward": 1.697108507156372, "reward_std": 0.10851763188838959, "rewards/accuracy_reward_stage2": 0.6971083879470825, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3615 }, { "completion_length": 9.78125, "epoch": 0.6336078500087612, "grad_norm": 14.499401917716359, "kl": 0.1533203125, "learning_rate": 3.6656737340108633e-07, "loss": -0.0012, "reward": 1.6847833395004272, "reward_std": 0.1449342668056488, "rewards/accuracy_reward_stage2": 0.7160332798957825, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3616 }, { "completion_length": 8.75, "epoch": 0.6337830734186087, "grad_norm": 17.256413793076284, "kl": 0.1787109375, "learning_rate": 3.6639214999123877e-07, "loss": -0.0611, "reward": 1.5829840898513794, "reward_std": 0.28531354665756226, "rewards/accuracy_reward_stage2": 0.6298590302467346, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3617 }, { "completion_length": 9.15625, "epoch": 0.6339582968284563, "grad_norm": 16.62546792326556, "kl": 0.1396484375, "learning_rate": 3.6621692658139126e-07, "loss": 0.056, "reward": 1.4859604835510254, "reward_std": 0.13596408069133759, "rewards/accuracy_reward_stage2": 0.6109604835510254, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3618 }, { "completion_length": 8.171875, "epoch": 0.6341335202383038, "grad_norm": 21.861338130303448, "kl": 0.049072265625, "learning_rate": 3.660417031715437e-07, "loss": 0.0196, "reward": 1.789048671722412, "reward_std": 0.15189215540885925, "rewards/accuracy_reward_stage2": 0.7890486121177673, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3619 }, { "completion_length": 10.609375, "epoch": 0.6343087436481514, "grad_norm": 18.131394365573538, "kl": 0.046875, "learning_rate": 3.6586647976169614e-07, "loss": 0.0188, "reward": 1.332848310470581, "reward_std": 0.19504520297050476, "rewards/accuracy_reward_stage2": 0.33284837007522583, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3620 }, { "completion_length": 6.203125, "epoch": 0.6344839670579989, "grad_norm": 24.954239353297936, "kl": 0.1064453125, "learning_rate": 3.656912563518486e-07, "loss": 0.0425, "reward": 1.6431493759155273, "reward_std": 0.28684133291244507, "rewards/accuracy_reward_stage2": 0.6431494355201721, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3621 }, { "completion_length": 14.90625, "epoch": 0.6346591904678465, "grad_norm": 20.359486450098274, "kl": 0.1220703125, "learning_rate": 3.655160329420011e-07, "loss": 0.0102, "reward": 1.1454540491104126, "reward_std": 0.30089348554611206, "rewards/accuracy_reward_stage2": 0.4110791087150574, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3622 }, { "completion_length": 17.796875, "epoch": 0.634834413877694, "grad_norm": 23.201390222878917, "kl": 0.1455078125, "learning_rate": 3.6534080953215346e-07, "loss": 0.0142, "reward": 1.383394718170166, "reward_std": 0.2742749750614166, "rewards/accuracy_reward_stage2": 0.5240197777748108, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3623 }, { "completion_length": 10.0625, "epoch": 0.6350096372875416, "grad_norm": 14.074983708968212, "kl": 0.11376953125, "learning_rate": 3.651655861223059e-07, "loss": -0.0303, "reward": 1.8645833730697632, "reward_std": 0.1918574422597885, "rewards/accuracy_reward_stage2": 0.8958333730697632, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3624 }, { "completion_length": 13.515625, "epoch": 0.6351848606973892, "grad_norm": 35.487603494256746, "kl": 0.109375, "learning_rate": 3.6499036271245834e-07, "loss": 0.0437, "reward": 1.4434072971343994, "reward_std": 0.3074309527873993, "rewards/accuracy_reward_stage2": 0.5684072971343994, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3625 }, { "completion_length": 10.453125, "epoch": 0.6353600841072368, "grad_norm": 17.94171065619323, "kl": 0.2177734375, "learning_rate": 3.6481513930261083e-07, "loss": 0.0453, "reward": 1.6170685291290283, "reward_std": 0.19411921501159668, "rewards/accuracy_reward_stage2": 0.7576935291290283, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3626 }, { "completion_length": 7.171875, "epoch": 0.6355353075170843, "grad_norm": 18.337787479534004, "kl": 0.1455078125, "learning_rate": 3.646399158927633e-07, "loss": -0.0252, "reward": 1.666426181793213, "reward_std": 0.22929048538208008, "rewards/accuracy_reward_stage2": 0.6976761817932129, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3627 }, { "completion_length": 4.671875, "epoch": 0.6357105309269319, "grad_norm": 11.80166842802175, "kl": 0.041015625, "learning_rate": 3.644646924829157e-07, "loss": 0.0164, "reward": 1.5, "reward_std": 0.1293872892856598, "rewards/accuracy_reward_stage2": 0.5, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3628 }, { "completion_length": 9.6875, "epoch": 0.6358857543367794, "grad_norm": 14.412459917876111, "kl": 0.099609375, "learning_rate": 3.642894690730681e-07, "loss": 0.0399, "reward": 1.7028069496154785, "reward_std": 0.1426396518945694, "rewards/accuracy_reward_stage2": 0.7028070092201233, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3629 }, { "completion_length": 10.75, "epoch": 0.636060977746627, "grad_norm": 21.876442571206635, "kl": 0.064453125, "learning_rate": 3.641142456632206e-07, "loss": 0.0257, "reward": 1.6312143802642822, "reward_std": 0.364946186542511, "rewards/accuracy_reward_stage2": 0.6312142610549927, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3630 }, { "completion_length": 7.453125, "epoch": 0.6362362011564745, "grad_norm": 22.977326990070033, "kl": 0.2119140625, "learning_rate": 3.6393902225337303e-07, "loss": -0.0167, "reward": 1.5634841918945312, "reward_std": 0.19824695587158203, "rewards/accuracy_reward_stage2": 0.610359251499176, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3631 }, { "completion_length": 12.09375, "epoch": 0.636411424566322, "grad_norm": 18.88546650132665, "kl": 0.10009765625, "learning_rate": 3.6376379884352547e-07, "loss": 0.0401, "reward": 1.6160304546356201, "reward_std": 0.26513588428497314, "rewards/accuracy_reward_stage2": 0.6160303354263306, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3632 }, { "completion_length": 9.203125, "epoch": 0.6365866479761696, "grad_norm": 17.150632416002303, "kl": 0.0634765625, "learning_rate": 3.635885754336779e-07, "loss": 0.0254, "reward": 1.4632712602615356, "reward_std": 0.1401752084493637, "rewards/accuracy_reward_stage2": 0.5882712602615356, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3633 }, { "completion_length": 15.359375, "epoch": 0.6367618713860171, "grad_norm": 23.459706095482968, "kl": 0.15234375, "learning_rate": 3.634133520238304e-07, "loss": -0.0093, "reward": 1.4876947402954102, "reward_std": 0.20976431667804718, "rewards/accuracy_reward_stage2": 0.6439447402954102, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3634 }, { "completion_length": 8.765625, "epoch": 0.6369370947958647, "grad_norm": 16.70197187367129, "kl": 0.25, "learning_rate": 3.632381286139828e-07, "loss": 0.0206, "reward": 1.3416086435317993, "reward_std": 0.246791273355484, "rewards/accuracy_reward_stage2": 0.5134836435317993, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3635 }, { "completion_length": 9.3125, "epoch": 0.6371123182057122, "grad_norm": 22.943919776948466, "kl": 0.1201171875, "learning_rate": 3.6306290520413523e-07, "loss": 0.048, "reward": 1.6054027080535889, "reward_std": 0.30763548612594604, "rewards/accuracy_reward_stage2": 0.6054026484489441, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3636 }, { "completion_length": 8.71875, "epoch": 0.6372875416155599, "grad_norm": 34.82726050972871, "kl": 0.1650390625, "learning_rate": 3.6288768179428767e-07, "loss": 0.0659, "reward": 1.390625, "reward_std": 0.30721208453178406, "rewards/accuracy_reward_stage2": 0.515625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3637 }, { "completion_length": 14.46875, "epoch": 0.6374627650254074, "grad_norm": 18.081020575075673, "kl": 0.05078125, "learning_rate": 3.6271245838444016e-07, "loss": 0.0203, "reward": 1.5074986219406128, "reward_std": 0.14961406588554382, "rewards/accuracy_reward_stage2": 0.5074986219406128, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3638 }, { "completion_length": 15.0625, "epoch": 0.637637988435255, "grad_norm": 19.973849226311064, "kl": 0.173828125, "learning_rate": 3.625372349745926e-07, "loss": 0.0565, "reward": 1.2945737838745117, "reward_std": 0.2766297161579132, "rewards/accuracy_reward_stage2": 0.31019875407218933, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3639 }, { "completion_length": 9.65625, "epoch": 0.6378132118451025, "grad_norm": 23.292542316914666, "kl": 0.103515625, "learning_rate": 3.6236201156474504e-07, "loss": 0.0414, "reward": 1.5685745477676392, "reward_std": 0.2816503643989563, "rewards/accuracy_reward_stage2": 0.5685745477676392, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3640 }, { "completion_length": 8.40625, "epoch": 0.6379884352549501, "grad_norm": 13.730848138316576, "kl": 0.09716796875, "learning_rate": 3.621867881548975e-07, "loss": 0.0388, "reward": 1.6940895318984985, "reward_std": 0.16377386450767517, "rewards/accuracy_reward_stage2": 0.8190895318984985, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3641 }, { "completion_length": 13.484375, "epoch": 0.6381636586647976, "grad_norm": 19.131087121818375, "kl": 0.107421875, "learning_rate": 3.620115647450499e-07, "loss": -0.0454, "reward": 1.4821650981903076, "reward_std": 0.2997850179672241, "rewards/accuracy_reward_stage2": 0.5134150981903076, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3642 }, { "completion_length": 10.328125, "epoch": 0.6383388820746452, "grad_norm": 13.516975395098173, "kl": 0.046875, "learning_rate": 3.6183634133520236e-07, "loss": -0.0254, "reward": 1.5661125183105469, "reward_std": 0.21004100143909454, "rewards/accuracy_reward_stage2": 0.7067373991012573, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3643 }, { "completion_length": 10.171875, "epoch": 0.6385141054844927, "grad_norm": 15.89953798492155, "kl": 0.14453125, "learning_rate": 3.616611179253548e-07, "loss": 0.0187, "reward": 1.6142473220825195, "reward_std": 0.1787492334842682, "rewards/accuracy_reward_stage2": 0.7548723220825195, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3644 }, { "completion_length": 12.953125, "epoch": 0.6386893288943403, "grad_norm": 17.22223205879188, "kl": 0.25390625, "learning_rate": 3.6148589451550724e-07, "loss": 0.0136, "reward": 1.6069388389587402, "reward_std": 0.340746134519577, "rewards/accuracy_reward_stage2": 0.6381888389587402, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3645 }, { "completion_length": 13.46875, "epoch": 0.6388645523041878, "grad_norm": 25.223753101461224, "kl": 0.04833984375, "learning_rate": 3.6131067110565973e-07, "loss": 0.0193, "reward": 1.7002465724945068, "reward_std": 0.28130415081977844, "rewards/accuracy_reward_stage2": 0.7002465128898621, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3646 }, { "completion_length": 20.359375, "epoch": 0.6390397757140354, "grad_norm": 21.772746332061597, "kl": 0.11376953125, "learning_rate": 3.6113544769581217e-07, "loss": -0.0353, "reward": 1.5794328451156616, "reward_std": 0.22146174311637878, "rewards/accuracy_reward_stage2": 0.6106828451156616, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3647 }, { "completion_length": 31.984375, "epoch": 0.6392149991238829, "grad_norm": 9.787206170137264, "kl": 0.0284423828125, "learning_rate": 3.6096022428596456e-07, "loss": 0.0114, "reward": 1.4354877471923828, "reward_std": 0.09317904710769653, "rewards/accuracy_reward_stage2": 0.4354877173900604, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3648 }, { "completion_length": 18.84375, "epoch": 0.6393902225337305, "grad_norm": 18.024907451498215, "kl": 0.1708984375, "learning_rate": 3.60785000876117e-07, "loss": 0.0284, "reward": 1.301550269126892, "reward_std": 0.17121167480945587, "rewards/accuracy_reward_stage2": 0.4421752393245697, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3649 }, { "completion_length": 7.515625, "epoch": 0.6395654459435781, "grad_norm": 33.71627384871996, "kl": 0.119140625, "learning_rate": 3.606097774662695e-07, "loss": -0.0067, "reward": 1.144614577293396, "reward_std": 0.19346420466899872, "rewards/accuracy_reward_stage2": 0.300864577293396, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3650 }, { "completion_length": 9.65625, "epoch": 0.6397406693534257, "grad_norm": 16.026084326842717, "kl": 0.099609375, "learning_rate": 3.6043455405642193e-07, "loss": 0.0397, "reward": 1.447823166847229, "reward_std": 0.18850518763065338, "rewards/accuracy_reward_stage2": 0.5728232264518738, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3651 }, { "completion_length": 10.765625, "epoch": 0.6399158927632732, "grad_norm": 19.035674298729102, "kl": 0.1494140625, "learning_rate": 3.6025933064657437e-07, "loss": -0.0211, "reward": 1.4274215698242188, "reward_std": 0.2881200909614563, "rewards/accuracy_reward_stage2": 0.45867156982421875, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3652 }, { "completion_length": 11.515625, "epoch": 0.6400911161731208, "grad_norm": 16.483873807580256, "kl": 0.1640625, "learning_rate": 3.600841072367268e-07, "loss": -0.0587, "reward": 1.6582226753234863, "reward_std": 0.2703021466732025, "rewards/accuracy_reward_stage2": 0.7050977349281311, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3653 }, { "completion_length": 4.8125, "epoch": 0.6402663395829683, "grad_norm": 13.76375064303439, "kl": 0.0196533203125, "learning_rate": 3.5990888382687925e-07, "loss": 0.0078, "reward": 1.9034197330474854, "reward_std": 0.05980297550559044, "rewards/accuracy_reward_stage2": 0.9034197330474854, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3654 }, { "completion_length": 13.21875, "epoch": 0.6404415629928158, "grad_norm": 18.985032534329388, "kl": 0.0810546875, "learning_rate": 3.597336604170317e-07, "loss": 0.0325, "reward": 1.4299408197402954, "reward_std": 0.20411565899848938, "rewards/accuracy_reward_stage2": 0.5549408197402954, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3655 }, { "completion_length": 10.40625, "epoch": 0.6406167864026634, "grad_norm": 13.552495749076439, "kl": 0.0308837890625, "learning_rate": 3.5955843700718413e-07, "loss": 0.0123, "reward": 1.6765105724334717, "reward_std": 0.1349574625492096, "rewards/accuracy_reward_stage2": 0.6765106916427612, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3656 }, { "completion_length": 9.953125, "epoch": 0.6407920098125109, "grad_norm": 23.11657657631506, "kl": 0.1044921875, "learning_rate": 3.5938321359733657e-07, "loss": 0.0418, "reward": 1.3100368976593018, "reward_std": 0.3031858801841736, "rewards/accuracy_reward_stage2": 0.435036838054657, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3657 }, { "completion_length": 8.953125, "epoch": 0.6409672332223585, "grad_norm": 15.631149227925766, "kl": 0.1826171875, "learning_rate": 3.5920799018748906e-07, "loss": -0.0026, "reward": 1.4643263816833496, "reward_std": 0.3079353868961334, "rewards/accuracy_reward_stage2": 0.6205763816833496, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3658 }, { "completion_length": 10.421875, "epoch": 0.641142456632206, "grad_norm": 28.95809032526528, "kl": 0.2314453125, "learning_rate": 3.590327667776415e-07, "loss": 0.061, "reward": 1.4065438508987427, "reward_std": 0.3178490400314331, "rewards/accuracy_reward_stage2": 0.5471689105033875, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3659 }, { "completion_length": 10.1875, "epoch": 0.6413176800420536, "grad_norm": 12.313051803100334, "kl": 0.1728515625, "learning_rate": 3.5885754336779394e-07, "loss": -0.0482, "reward": 1.7476816177368164, "reward_std": 0.24349580705165863, "rewards/accuracy_reward_stage2": 0.7945566177368164, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3660 }, { "completion_length": 9.484375, "epoch": 0.6414929034519011, "grad_norm": 13.941713403383208, "kl": 0.1162109375, "learning_rate": 3.5868231995794633e-07, "loss": -0.0419, "reward": 1.8125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward_stage2": 0.84375, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3661 }, { "completion_length": 14.203125, "epoch": 0.6416681268617487, "grad_norm": 20.011229627655002, "kl": 0.2255859375, "learning_rate": 3.585070965480988e-07, "loss": -0.0426, "reward": 1.6828477382659912, "reward_std": 0.2681369185447693, "rewards/accuracy_reward_stage2": 0.7453478574752808, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3662 }, { "completion_length": 15.46875, "epoch": 0.6418433502715963, "grad_norm": 16.98900105517778, "kl": 0.0238037109375, "learning_rate": 3.5833187313825126e-07, "loss": 0.0095, "reward": 1.6922528743743896, "reward_std": 0.09544496238231659, "rewards/accuracy_reward_stage2": 0.6922527551651001, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3663 }, { "completion_length": 12.71875, "epoch": 0.6420185736814439, "grad_norm": 15.895188337342285, "kl": 0.1064453125, "learning_rate": 3.581566497284037e-07, "loss": 0.0426, "reward": 1.7854351997375488, "reward_std": 0.12646767497062683, "rewards/accuracy_reward_stage2": 0.910435140132904, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3664 }, { "completion_length": 8.5625, "epoch": 0.6421937970912914, "grad_norm": 14.632761302952792, "kl": 0.0986328125, "learning_rate": 3.5798142631855614e-07, "loss": -0.0048, "reward": 1.78125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward_stage2": 0.796875, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3665 }, { "completion_length": 10.109375, "epoch": 0.642369020501139, "grad_norm": 19.81936363509419, "kl": 0.142578125, "learning_rate": 3.5780620290870863e-07, "loss": -0.0106, "reward": 1.6112961769104004, "reward_std": 0.2645382881164551, "rewards/accuracy_reward_stage2": 0.6425461769104004, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3666 }, { "completion_length": 10.28125, "epoch": 0.6425442439109865, "grad_norm": 14.471599410569507, "kl": 0.1826171875, "learning_rate": 3.57630979498861e-07, "loss": -0.0099, "reward": 1.6287720203399658, "reward_std": 0.21368272602558136, "rewards/accuracy_reward_stage2": 0.660021960735321, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3667 }, { "completion_length": 13.078125, "epoch": 0.6427194673208341, "grad_norm": 13.667389974234641, "kl": 0.09765625, "learning_rate": 3.5745575608901346e-07, "loss": -0.0051, "reward": 1.296875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward_stage2": 0.3125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3668 }, { "completion_length": 9.71875, "epoch": 0.6428946907306816, "grad_norm": 18.914206371370327, "kl": 0.10205078125, "learning_rate": 3.572805326791659e-07, "loss": 0.0077, "reward": 1.6069194078445435, "reward_std": 0.2152298241853714, "rewards/accuracy_reward_stage2": 0.6225443482398987, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3669 }, { "completion_length": 19.4375, "epoch": 0.6430699141405292, "grad_norm": 15.692748843568294, "kl": 0.154296875, "learning_rate": 3.571053092693184e-07, "loss": -0.0261, "reward": 1.3200486898422241, "reward_std": 0.2600979506969452, "rewards/accuracy_reward_stage2": 0.35129863023757935, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3670 }, { "completion_length": 10.84375, "epoch": 0.6432451375503767, "grad_norm": 20.713042140015347, "kl": 0.154296875, "learning_rate": 3.5693008585947083e-07, "loss": 0.0615, "reward": 1.6474779844284058, "reward_std": 0.19449205696582794, "rewards/accuracy_reward_stage2": 0.7724780440330505, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3671 }, { "completion_length": 7.90625, "epoch": 0.6434203609602243, "grad_norm": 18.939598476065225, "kl": 0.09423828125, "learning_rate": 3.5675486244962327e-07, "loss": 0.0013, "reward": 1.4366912841796875, "reward_std": 0.11813464760780334, "rewards/accuracy_reward_stage2": 0.45231637358665466, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3672 }, { "completion_length": 8.59375, "epoch": 0.6435955843700718, "grad_norm": 22.560695552345376, "kl": 0.09375, "learning_rate": 3.5657963903977566e-07, "loss": 0.0208, "reward": 1.534196138381958, "reward_std": 0.25116610527038574, "rewards/accuracy_reward_stage2": 0.5498210787773132, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3673 }, { "completion_length": 15.40625, "epoch": 0.6437708077799194, "grad_norm": 22.839026411280702, "kl": 0.091796875, "learning_rate": 3.5640441562992815e-07, "loss": 0.0366, "reward": 1.703125, "reward_std": 0.2633790373802185, "rewards/accuracy_reward_stage2": 0.828125, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3674 }, { "completion_length": 13.484375, "epoch": 0.6439460311897669, "grad_norm": 30.310752237469682, "kl": 0.130859375, "learning_rate": 3.562291922200806e-07, "loss": 0.0081, "reward": 1.406123161315918, "reward_std": 0.16176798939704895, "rewards/accuracy_reward_stage2": 0.546748161315918, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3675 }, { "completion_length": 13.109375, "epoch": 0.6441212545996146, "grad_norm": 17.7024621132698, "kl": 0.15234375, "learning_rate": 3.5605396881023303e-07, "loss": 0.0167, "reward": 1.5824267864227295, "reward_std": 0.2084314227104187, "rewards/accuracy_reward_stage2": 0.5980518460273743, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3676 }, { "completion_length": 13.15625, "epoch": 0.6442964780094621, "grad_norm": 17.017036528647274, "kl": 0.11572265625, "learning_rate": 3.5587874540038547e-07, "loss": 0.0462, "reward": 1.4068691730499268, "reward_std": 0.1746881902217865, "rewards/accuracy_reward_stage2": 0.6568692326545715, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3677 }, { "completion_length": 9.6875, "epoch": 0.6444717014193097, "grad_norm": 19.01532393777833, "kl": 0.25390625, "learning_rate": 3.5570352199053796e-07, "loss": 0.0139, "reward": 1.7672874927520752, "reward_std": 0.3108041286468506, "rewards/accuracy_reward_stage2": 0.8141624927520752, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3678 }, { "completion_length": 12.640625, "epoch": 0.6446469248291572, "grad_norm": 22.816178841710844, "kl": 0.1806640625, "learning_rate": 3.555282985806904e-07, "loss": 0.0281, "reward": 1.6114469766616821, "reward_std": 0.24365541338920593, "rewards/accuracy_reward_stage2": 0.6270719766616821, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3679 }, { "completion_length": 8.921875, "epoch": 0.6448221482390047, "grad_norm": 19.43465114242342, "kl": 0.26171875, "learning_rate": 3.553530751708428e-07, "loss": 0.0547, "reward": 1.5898686647415161, "reward_std": 0.3100131154060364, "rewards/accuracy_reward_stage2": 0.6211186647415161, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3680 }, { "completion_length": 11.578125, "epoch": 0.6449973716488523, "grad_norm": 21.807378310853615, "kl": 0.21484375, "learning_rate": 3.551778517609952e-07, "loss": 0.0047, "reward": 1.3561477661132812, "reward_std": 0.2760908603668213, "rewards/accuracy_reward_stage2": 0.4967726767063141, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3681 }, { "completion_length": 7.71875, "epoch": 0.6451725950586998, "grad_norm": 23.159563899251044, "kl": 0.1298828125, "learning_rate": 3.550026283511477e-07, "loss": 0.0078, "reward": 1.6781278848648071, "reward_std": 0.25165823101997375, "rewards/accuracy_reward_stage2": 0.6937528848648071, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3682 }, { "completion_length": 7.03125, "epoch": 0.6453478184685474, "grad_norm": 15.508065266492686, "kl": 0.10107421875, "learning_rate": 3.5482740494130016e-07, "loss": 0.0403, "reward": 1.6627414226531982, "reward_std": 0.17357571423053741, "rewards/accuracy_reward_stage2": 0.7877414226531982, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3683 }, { "completion_length": 9.953125, "epoch": 0.6455230418783949, "grad_norm": 20.11035846039172, "kl": 0.1669921875, "learning_rate": 3.546521815314526e-07, "loss": 0.0064, "reward": 1.4375, "reward_std": 0.3335031569004059, "rewards/accuracy_reward_stage2": 0.46875, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3684 }, { "completion_length": 35.859375, "epoch": 0.6456982652882425, "grad_norm": 47.35011743804844, "kl": 0.1123046875, "learning_rate": 3.5447695812160504e-07, "loss": 0.0009, "reward": 1.5811420679092407, "reward_std": 0.26867377758026123, "rewards/accuracy_reward_stage2": 0.5967670679092407, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3685 }, { "completion_length": 8.5625, "epoch": 0.64587348869809, "grad_norm": 19.582694344476494, "kl": 0.083984375, "learning_rate": 3.543017347117574e-07, "loss": 0.0337, "reward": 1.487224817276001, "reward_std": 0.2638910114765167, "rewards/accuracy_reward_stage2": 0.48722487688064575, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3686 }, { "completion_length": 10.796875, "epoch": 0.6460487121079376, "grad_norm": 17.312223258225703, "kl": 0.0830078125, "learning_rate": 3.541265113019099e-07, "loss": 0.0333, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward_stage2": 0.5625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3687 }, { "completion_length": 9.15625, "epoch": 0.6462239355177852, "grad_norm": 22.85228117476544, "kl": 0.1728515625, "learning_rate": 3.5395128789206236e-07, "loss": 0.034, "reward": 1.21493399143219, "reward_std": 0.31954365968704224, "rewards/accuracy_reward_stage2": 0.48055899143218994, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3688 }, { "completion_length": 9.5, "epoch": 0.6463991589276328, "grad_norm": 16.64890158145795, "kl": 0.30859375, "learning_rate": 3.537760644822148e-07, "loss": 0.035, "reward": 1.5417678356170654, "reward_std": 0.23739787936210632, "rewards/accuracy_reward_stage2": 0.6980177164077759, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3689 }, { "completion_length": 14.671875, "epoch": 0.6465743823374803, "grad_norm": 18.750404329398975, "kl": 0.150390625, "learning_rate": 3.5360084107236724e-07, "loss": -0.0516, "reward": 1.405958890914917, "reward_std": 0.1552903652191162, "rewards/accuracy_reward_stage2": 0.7028338313102722, "rewards/format_reward_stage1_pointerpad": 0.703125, "scores/accuracy_reward_stage2": 0.703125, "step": 3690 }, { "completion_length": 12.125, "epoch": 0.6467496057473279, "grad_norm": 17.833589733517734, "kl": 0.16796875, "learning_rate": 3.5342561766251973e-07, "loss": 0.0671, "reward": 1.3854167461395264, "reward_std": 0.2868278920650482, "rewards/accuracy_reward_stage2": 0.6354166865348816, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3691 }, { "completion_length": 18.65625, "epoch": 0.6469248291571754, "grad_norm": 30.050526122991396, "kl": 0.322265625, "learning_rate": 3.5325039425267217e-07, "loss": 0.0414, "reward": 1.224075198173523, "reward_std": 0.26675575971603394, "rewards/accuracy_reward_stage2": 0.27095019817352295, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3692 }, { "completion_length": 9.203125, "epoch": 0.647100052567023, "grad_norm": 17.85586035935803, "kl": 0.1611328125, "learning_rate": 3.5307517084282455e-07, "loss": 0.0002, "reward": 1.3594422340393066, "reward_std": 0.22799652814865112, "rewards/accuracy_reward_stage2": 0.39069223403930664, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3693 }, { "completion_length": 15.890625, "epoch": 0.6472752759768705, "grad_norm": 17.995748941269156, "kl": 0.1533203125, "learning_rate": 3.52899947432977e-07, "loss": -0.0501, "reward": 1.5937397480010986, "reward_std": 0.12785354256629944, "rewards/accuracy_reward_stage2": 0.6406147480010986, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3694 }, { "completion_length": 12.734375, "epoch": 0.6474504993867181, "grad_norm": 17.172686057454797, "kl": 0.0751953125, "learning_rate": 3.527247240231295e-07, "loss": 0.03, "reward": 1.4971519708633423, "reward_std": 0.17167343199253082, "rewards/accuracy_reward_stage2": 0.4971519112586975, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3695 }, { "completion_length": 7.9375, "epoch": 0.6476257227965656, "grad_norm": 13.774901205039306, "kl": 0.12109375, "learning_rate": 3.5254950061328193e-07, "loss": 0.0068, "reward": 1.6474002599716187, "reward_std": 0.1538010835647583, "rewards/accuracy_reward_stage2": 0.6630252599716187, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3696 }, { "completion_length": 10.328125, "epoch": 0.6478009462064132, "grad_norm": 17.499256588420277, "kl": 0.125, "learning_rate": 3.5237427720343437e-07, "loss": -0.0003, "reward": 1.6396540403366089, "reward_std": 0.19111916422843933, "rewards/accuracy_reward_stage2": 0.6709039807319641, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3697 }, { "completion_length": 13.6875, "epoch": 0.6479761696162607, "grad_norm": 55.37711546967327, "kl": 0.349609375, "learning_rate": 3.521990537935868e-07, "loss": 0.0996, "reward": 1.0965315103530884, "reward_std": 0.27788692712783813, "rewards/accuracy_reward_stage2": 0.37778154015541077, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3698 }, { "completion_length": 12.09375, "epoch": 0.6481513930261082, "grad_norm": 21.027429870961353, "kl": 0.047607421875, "learning_rate": 3.5202383038373925e-07, "loss": 0.019, "reward": 1.5767583847045898, "reward_std": 0.2807679772377014, "rewards/accuracy_reward_stage2": 0.5767583250999451, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3699 }, { "completion_length": 11.515625, "epoch": 0.6483266164359558, "grad_norm": 20.49667679086188, "kl": 0.1328125, "learning_rate": 3.518486069738917e-07, "loss": 0.0614, "reward": 1.5157562494277954, "reward_std": 0.2441159188747406, "rewards/accuracy_reward_stage2": 0.6407562494277954, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3700 }, { "completion_length": 7.171875, "epoch": 0.6485018398458035, "grad_norm": 18.043544748787948, "kl": 0.0791015625, "learning_rate": 3.516733835640441e-07, "loss": 0.0018, "reward": 1.5832839012145996, "reward_std": 0.2336646169424057, "rewards/accuracy_reward_stage2": 0.7239089012145996, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3701 }, { "completion_length": 7.140625, "epoch": 0.648677063255651, "grad_norm": 16.274809205252975, "kl": 0.06396484375, "learning_rate": 3.5149816015419656e-07, "loss": 0.0255, "reward": 1.7552083730697632, "reward_std": 0.17123916745185852, "rewards/accuracy_reward_stage2": 0.7552083730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3702 }, { "completion_length": 9.265625, "epoch": 0.6488522866654985, "grad_norm": 19.514395198585934, "kl": 0.09033203125, "learning_rate": 3.5132293674434906e-07, "loss": 0.0056, "reward": 1.402266502380371, "reward_std": 0.28506118059158325, "rewards/accuracy_reward_stage2": 0.4178914427757263, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3703 }, { "completion_length": 10.25, "epoch": 0.6490275100753461, "grad_norm": 21.652850702390413, "kl": 0.0947265625, "learning_rate": 3.511477133345015e-07, "loss": -0.0063, "reward": 1.5164008140563965, "reward_std": 0.23606063425540924, "rewards/accuracy_reward_stage2": 0.6570256948471069, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3704 }, { "completion_length": 12.375, "epoch": 0.6492027334851936, "grad_norm": 19.437959017734133, "kl": 0.09130859375, "learning_rate": 3.509724899246539e-07, "loss": 0.0022, "reward": 1.1945466995239258, "reward_std": 0.23989106714725494, "rewards/accuracy_reward_stage2": 0.33517172932624817, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3705 }, { "completion_length": 11.890625, "epoch": 0.6493779568950412, "grad_norm": 27.20928171381491, "kl": 0.1328125, "learning_rate": 3.507972665148063e-07, "loss": 0.0533, "reward": 1.1246411800384521, "reward_std": 0.16782069206237793, "rewards/accuracy_reward_stage2": 0.49964118003845215, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 3706 }, { "completion_length": 7.953125, "epoch": 0.6495531803048887, "grad_norm": 15.36476070824679, "kl": 0.11572265625, "learning_rate": 3.506220431049588e-07, "loss": 0.0147, "reward": 1.591088056564331, "reward_std": 0.14347587525844574, "rewards/accuracy_reward_stage2": 0.6067129969596863, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3707 }, { "completion_length": 12.0, "epoch": 0.6497284037147363, "grad_norm": 21.447431886758366, "kl": 0.2353515625, "learning_rate": 3.5044681969511126e-07, "loss": -0.1017, "reward": 1.5135695934295654, "reward_std": 0.3412063717842102, "rewards/accuracy_reward_stage2": 0.7010695934295654, "rewards/format_reward_stage1_pointerpad": 0.8125, "scores/accuracy_reward_stage2": 0.8125, "step": 3708 }, { "completion_length": 8.890625, "epoch": 0.6499036271245838, "grad_norm": 16.476302891145476, "kl": 0.130859375, "learning_rate": 3.502715962852637e-07, "loss": 0.0523, "reward": 1.5784977674484253, "reward_std": 0.16375833749771118, "rewards/accuracy_reward_stage2": 0.7034977674484253, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3709 }, { "completion_length": 12.0, "epoch": 0.6500788505344314, "grad_norm": 18.689905011866053, "kl": 0.12109375, "learning_rate": 3.5009637287541613e-07, "loss": 0.0075, "reward": 1.5490682125091553, "reward_std": 0.19050264358520508, "rewards/accuracy_reward_stage2": 0.6896932125091553, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3710 }, { "completion_length": 12.484375, "epoch": 0.6502540739442789, "grad_norm": 28.57532795015385, "kl": 0.29296875, "learning_rate": 3.4992114946556863e-07, "loss": 0.0133, "reward": 1.4129630327224731, "reward_std": 0.27624937891960144, "rewards/accuracy_reward_stage2": 0.5848380923271179, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3711 }, { "completion_length": 11.421875, "epoch": 0.6504292973541265, "grad_norm": 18.780457042870765, "kl": 0.1083984375, "learning_rate": 3.49745926055721e-07, "loss": 0.0433, "reward": 1.4363348484039307, "reward_std": 0.20736932754516602, "rewards/accuracy_reward_stage2": 0.6863349676132202, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3712 }, { "completion_length": 8.859375, "epoch": 0.650604520763974, "grad_norm": 16.69906655424036, "kl": 0.06298828125, "learning_rate": 3.4957070264587345e-07, "loss": -0.0178, "reward": 1.658174753189087, "reward_std": 0.19843123853206635, "rewards/accuracy_reward_stage2": 0.6737997531890869, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3713 }, { "completion_length": 7.3125, "epoch": 0.6507797441738217, "grad_norm": 19.927210600002145, "kl": 0.1552734375, "learning_rate": 3.493954792360259e-07, "loss": -0.0635, "reward": 1.625319242477417, "reward_std": 0.23469506204128265, "rewards/accuracy_reward_stage2": 0.6721941828727722, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3714 }, { "completion_length": 10.96875, "epoch": 0.6509549675836692, "grad_norm": 17.435068228575098, "kl": 0.28515625, "learning_rate": 3.492202558261784e-07, "loss": 0.1136, "reward": 1.1564494371414185, "reward_std": 0.11023418605327606, "rewards/accuracy_reward_stage2": 0.40644940733909607, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3715 }, { "completion_length": 8.578125, "epoch": 0.6511301909935168, "grad_norm": 24.593109335212954, "kl": 0.259765625, "learning_rate": 3.490450324163308e-07, "loss": 0.1039, "reward": 1.4121159315109253, "reward_std": 0.2062818557024002, "rewards/accuracy_reward_stage2": 0.6621158719062805, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3716 }, { "completion_length": 10.703125, "epoch": 0.6513054144033643, "grad_norm": 18.10507496549295, "kl": 0.1884765625, "learning_rate": 3.4886980900648326e-07, "loss": 0.0111, "reward": 1.7756869792938232, "reward_std": 0.2656467854976654, "rewards/accuracy_reward_stage2": 0.806937038898468, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3717 }, { "completion_length": 9.46875, "epoch": 0.6514806378132119, "grad_norm": 19.52925181539543, "kl": 0.2236328125, "learning_rate": 3.4869458559663565e-07, "loss": 0.0524, "reward": 1.5218735933303833, "reward_std": 0.19578316807746887, "rewards/accuracy_reward_stage2": 0.5374986529350281, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3718 }, { "completion_length": 8.9375, "epoch": 0.6516558612230594, "grad_norm": 20.85909176443267, "kl": 0.13671875, "learning_rate": 3.4851936218678814e-07, "loss": -0.0004, "reward": 1.602414846420288, "reward_std": 0.20141032338142395, "rewards/accuracy_reward_stage2": 0.6336649656295776, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3719 }, { "completion_length": 7.578125, "epoch": 0.651831084632907, "grad_norm": 15.77366126598909, "kl": 0.1982421875, "learning_rate": 3.483441387769406e-07, "loss": -0.0254, "reward": 1.6306451559066772, "reward_std": 0.20585371553897858, "rewards/accuracy_reward_stage2": 0.677520215511322, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3720 }, { "completion_length": 10.0625, "epoch": 0.6520063080427545, "grad_norm": 34.95961364568875, "kl": 0.0947265625, "learning_rate": 3.48168915367093e-07, "loss": 0.0378, "reward": 1.5415239334106445, "reward_std": 0.2722419202327728, "rewards/accuracy_reward_stage2": 0.5415239930152893, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3721 }, { "completion_length": 7.28125, "epoch": 0.652181531452602, "grad_norm": 21.899136357172818, "kl": 0.11376953125, "learning_rate": 3.4799369195724546e-07, "loss": 0.0028, "reward": 1.6860486268997192, "reward_std": 0.21623259782791138, "rewards/accuracy_reward_stage2": 0.7016735076904297, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3722 }, { "completion_length": 13.6875, "epoch": 0.6523567548624496, "grad_norm": 13.149917081596058, "kl": 0.08935546875, "learning_rate": 3.4781846854739796e-07, "loss": -0.0147, "reward": 1.4375, "reward_std": 0.2041158676147461, "rewards/accuracy_reward_stage2": 0.703125, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3723 }, { "completion_length": 10.9375, "epoch": 0.6525319782722971, "grad_norm": 17.47616705957636, "kl": 0.1826171875, "learning_rate": 3.4764324513755034e-07, "loss": 0.0343, "reward": 1.4866011142730713, "reward_std": 0.19845804572105408, "rewards/accuracy_reward_stage2": 0.5022260546684265, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3724 }, { "completion_length": 9.28125, "epoch": 0.6527072016821447, "grad_norm": 22.11286584635779, "kl": 0.34765625, "learning_rate": 3.474680217277028e-07, "loss": -0.0108, "reward": 1.545560598373413, "reward_std": 0.3116529583930969, "rewards/accuracy_reward_stage2": 0.6080605983734131, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3725 }, { "completion_length": 7.59375, "epoch": 0.6528824250919922, "grad_norm": 18.219007326739145, "kl": 0.32421875, "learning_rate": 3.472927983178552e-07, "loss": -0.0423, "reward": 1.8254756927490234, "reward_std": 0.3147159516811371, "rewards/accuracy_reward_stage2": 0.8879756331443787, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3726 }, { "completion_length": 11.09375, "epoch": 0.6530576485018399, "grad_norm": 39.50198001524787, "kl": 0.384765625, "learning_rate": 3.471175749080077e-07, "loss": 0.1193, "reward": 1.3389873504638672, "reward_std": 0.3174276351928711, "rewards/accuracy_reward_stage2": 0.6046122908592224, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3727 }, { "completion_length": 11.65625, "epoch": 0.6532328719116874, "grad_norm": 28.740440477499103, "kl": 0.2412109375, "learning_rate": 3.4694235149816015e-07, "loss": 0.0521, "reward": 1.3430554866790771, "reward_std": 0.3203160762786865, "rewards/accuracy_reward_stage2": 0.4836805462837219, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3728 }, { "completion_length": 10.125, "epoch": 0.653408095321535, "grad_norm": 13.062985034306717, "kl": 0.1708984375, "learning_rate": 3.467671280883126e-07, "loss": 0.0373, "reward": 1.4132962226867676, "reward_std": 0.11710938811302185, "rewards/accuracy_reward_stage2": 0.5539212822914124, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3729 }, { "completion_length": 12.078125, "epoch": 0.6535833187313825, "grad_norm": 18.584620786462324, "kl": 0.1611328125, "learning_rate": 3.4659190467846503e-07, "loss": -0.0143, "reward": 1.328352451324463, "reward_std": 0.3078764081001282, "rewards/accuracy_reward_stage2": 0.48460254073143005, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3730 }, { "completion_length": 8.625, "epoch": 0.6537585421412301, "grad_norm": 27.202464955173326, "kl": 0.1484375, "learning_rate": 3.4641668126861747e-07, "loss": -0.0044, "reward": 1.463038682937622, "reward_std": 0.2783457338809967, "rewards/accuracy_reward_stage2": 0.49428868293762207, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3731 }, { "completion_length": 7.0625, "epoch": 0.6539337655510776, "grad_norm": 13.408109916174384, "kl": 0.09228515625, "learning_rate": 3.462414578587699e-07, "loss": 0.0369, "reward": 1.9514180421829224, "reward_std": 0.09767099469900131, "rewards/accuracy_reward_stage2": 0.9514180421829224, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3732 }, { "completion_length": 15.65625, "epoch": 0.6541089889609252, "grad_norm": 12.027256239529795, "kl": 0.14453125, "learning_rate": 3.4606623444892235e-07, "loss": 0.014, "reward": 1.1358861923217773, "reward_std": 0.16388946771621704, "rewards/accuracy_reward_stage2": 0.27651113271713257, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3733 }, { "completion_length": 9.265625, "epoch": 0.6542842123707727, "grad_norm": 18.95864959955592, "kl": 0.154296875, "learning_rate": 3.458910110390748e-07, "loss": 0.033, "reward": 1.50836181640625, "reward_std": 0.25778210163116455, "rewards/accuracy_reward_stage2": 0.64898681640625, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3734 }, { "completion_length": 10.59375, "epoch": 0.6544594357806203, "grad_norm": 21.764154397501247, "kl": 0.146484375, "learning_rate": 3.457157876292273e-07, "loss": 0.0274, "reward": 1.6949567794799805, "reward_std": 0.2524658441543579, "rewards/accuracy_reward_stage2": 0.7105817794799805, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3735 }, { "completion_length": 9.375, "epoch": 0.6546346591904678, "grad_norm": 16.90160251412798, "kl": 0.11376953125, "learning_rate": 3.455405642193797e-07, "loss": 0.0097, "reward": 1.4191895723342896, "reward_std": 0.16770751774311066, "rewards/accuracy_reward_stage2": 0.4348146319389343, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3736 }, { "completion_length": 14.53125, "epoch": 0.6548098826003154, "grad_norm": 18.18403251798853, "kl": 0.056884765625, "learning_rate": 3.453653408095321e-07, "loss": 0.0228, "reward": 1.6315476894378662, "reward_std": 0.11001887172460556, "rewards/accuracy_reward_stage2": 0.631547749042511, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3737 }, { "completion_length": 16.34375, "epoch": 0.6549851060101629, "grad_norm": 23.436874690011468, "kl": 0.216796875, "learning_rate": 3.4519011739968455e-07, "loss": 0.0622, "reward": 1.3637266159057617, "reward_std": 0.2299090176820755, "rewards/accuracy_reward_stage2": 0.5043515563011169, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3738 }, { "completion_length": 11.015625, "epoch": 0.6551603294200106, "grad_norm": 26.50739060775904, "kl": 0.240234375, "learning_rate": 3.4501489398983704e-07, "loss": 0.0391, "reward": 1.4403434991836548, "reward_std": 0.3019851744174957, "rewards/accuracy_reward_stage2": 0.61221843957901, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3739 }, { "completion_length": 21.625, "epoch": 0.6553355528298581, "grad_norm": 18.513512395425863, "kl": 0.3046875, "learning_rate": 3.448396705799895e-07, "loss": -0.0277, "reward": 1.639461874961853, "reward_std": 0.25874900817871094, "rewards/accuracy_reward_stage2": 0.7019618153572083, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3740 }, { "completion_length": 9.203125, "epoch": 0.6555107762397057, "grad_norm": 23.17622655740721, "kl": 0.09228515625, "learning_rate": 3.446644471701419e-07, "loss": 0.0369, "reward": 1.3237862586975098, "reward_std": 0.2052987515926361, "rewards/accuracy_reward_stage2": 0.5737862586975098, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3741 }, { "completion_length": 8.890625, "epoch": 0.6556859996495532, "grad_norm": 13.285559996644166, "kl": 0.1259765625, "learning_rate": 3.4448922376029436e-07, "loss": -0.0148, "reward": 1.7263647317886353, "reward_std": 0.15753847360610962, "rewards/accuracy_reward_stage2": 0.7576147317886353, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3742 }, { "completion_length": 11.96875, "epoch": 0.6558612230594008, "grad_norm": 14.989393310438379, "kl": 0.0869140625, "learning_rate": 3.4431400035044685e-07, "loss": 0.0347, "reward": 1.555059552192688, "reward_std": 0.1714351773262024, "rewards/accuracy_reward_stage2": 0.6800594925880432, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3743 }, { "completion_length": 11.84375, "epoch": 0.6560364464692483, "grad_norm": 14.933381466929175, "kl": 0.0908203125, "learning_rate": 3.4413877694059924e-07, "loss": -0.0015, "reward": 1.4837230443954468, "reward_std": 0.14175119996070862, "rewards/accuracy_reward_stage2": 0.499347984790802, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3744 }, { "completion_length": 9.5, "epoch": 0.6562116698790958, "grad_norm": 15.504366600818704, "kl": 0.12158203125, "learning_rate": 3.439635535307517e-07, "loss": -0.0152, "reward": 1.7632322311401367, "reward_std": 0.1770913302898407, "rewards/accuracy_reward_stage2": 0.7944821715354919, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3745 }, { "completion_length": 10.90625, "epoch": 0.6563868932889434, "grad_norm": 19.398149361266665, "kl": 0.2109375, "learning_rate": 3.437883301209041e-07, "loss": 0.032, "reward": 1.597571849822998, "reward_std": 0.2011193186044693, "rewards/accuracy_reward_stage2": 0.753821849822998, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3746 }, { "completion_length": 10.453125, "epoch": 0.6565621166987909, "grad_norm": 20.833322425182583, "kl": 0.1201171875, "learning_rate": 3.436131067110566e-07, "loss": -0.0305, "reward": 1.4291150569915771, "reward_std": 0.2782094180583954, "rewards/accuracy_reward_stage2": 0.4603649973869324, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3747 }, { "completion_length": 17.34375, "epoch": 0.6567373401086385, "grad_norm": 17.334135646348553, "kl": 0.06591796875, "learning_rate": 3.4343788330120905e-07, "loss": 0.0263, "reward": 1.3547989130020142, "reward_std": 0.1030765026807785, "rewards/accuracy_reward_stage2": 0.47979891300201416, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3748 }, { "completion_length": 8.421875, "epoch": 0.656912563518486, "grad_norm": 19.427073166610853, "kl": 0.1748046875, "learning_rate": 3.432626598913615e-07, "loss": -0.006, "reward": 1.499578595161438, "reward_std": 0.27636945247650146, "rewards/accuracy_reward_stage2": 0.530828595161438, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3749 }, { "completion_length": 8.125, "epoch": 0.6570877869283336, "grad_norm": 14.00868203881481, "kl": 0.140625, "learning_rate": 3.430874364815139e-07, "loss": 0.0119, "reward": 1.8125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward_stage2": 0.953125, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3750 }, { "completion_length": 9.09375, "epoch": 0.6572630103381811, "grad_norm": 16.193634064377, "kl": 0.058837890625, "learning_rate": 3.4291221307166637e-07, "loss": 0.0235, "reward": 1.3800715208053589, "reward_std": 0.15051256120204926, "rewards/accuracy_reward_stage2": 0.3800715208053589, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3751 }, { "completion_length": 21.34375, "epoch": 0.6574382337480288, "grad_norm": 22.072825583902596, "kl": 0.0869140625, "learning_rate": 3.427369896618188e-07, "loss": -0.0095, "reward": 1.7406294345855713, "reward_std": 0.2594420909881592, "rewards/accuracy_reward_stage2": 0.7562545537948608, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3752 }, { "completion_length": 6.921875, "epoch": 0.6576134571578763, "grad_norm": 16.058494707079756, "kl": 0.031005859375, "learning_rate": 3.4256176625197125e-07, "loss": 0.0124, "reward": 1.695550560951233, "reward_std": 0.1407569944858551, "rewards/accuracy_reward_stage2": 0.8205506801605225, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3753 }, { "completion_length": 10.65625, "epoch": 0.6577886805677239, "grad_norm": 22.634665908325573, "kl": 0.146484375, "learning_rate": 3.423865428421237e-07, "loss": -0.0046, "reward": 1.516603708267212, "reward_std": 0.3547920286655426, "rewards/accuracy_reward_stage2": 0.5478537082672119, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3754 }, { "completion_length": 8.265625, "epoch": 0.6579639039775714, "grad_norm": 13.65605117039202, "kl": 0.1728515625, "learning_rate": 3.422113194322762e-07, "loss": 0.0112, "reward": 1.6286708116531372, "reward_std": 0.14669831097126007, "rewards/accuracy_reward_stage2": 0.6599206924438477, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3755 }, { "completion_length": 5.40625, "epoch": 0.658139127387419, "grad_norm": 13.280947295786133, "kl": 0.068359375, "learning_rate": 3.4203609602242857e-07, "loss": -0.0168, "reward": 1.9375, "reward_std": 0.1462520956993103, "rewards/accuracy_reward_stage2": 0.953125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3756 }, { "completion_length": 8.6875, "epoch": 0.6583143507972665, "grad_norm": 17.47601377460947, "kl": 0.1171875, "learning_rate": 3.41860872612581e-07, "loss": 0.0153, "reward": 1.4678363800048828, "reward_std": 0.2227524071931839, "rewards/accuracy_reward_stage2": 0.4834613502025604, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3757 }, { "completion_length": 13.046875, "epoch": 0.6584895742071141, "grad_norm": 18.490909376699747, "kl": 0.177734375, "learning_rate": 3.4168564920273345e-07, "loss": -0.0174, "reward": 1.4714080095291138, "reward_std": 0.32452845573425293, "rewards/accuracy_reward_stage2": 0.5026580691337585, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3758 }, { "completion_length": 6.5, "epoch": 0.6586647976169616, "grad_norm": 23.224734562018334, "kl": 0.1962890625, "learning_rate": 3.415104257928859e-07, "loss": -0.026, "reward": 1.6008846759796143, "reward_std": 0.32852935791015625, "rewards/accuracy_reward_stage2": 0.6477595567703247, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3759 }, { "completion_length": 11.125, "epoch": 0.6588400210268092, "grad_norm": 18.760357237054855, "kl": 0.2041015625, "learning_rate": 3.413352023830384e-07, "loss": -0.0059, "reward": 1.5383474826812744, "reward_std": 0.3135913610458374, "rewards/accuracy_reward_stage2": 0.6945973634719849, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3760 }, { "completion_length": 12.59375, "epoch": 0.6590152444366567, "grad_norm": 16.016991487601743, "kl": 0.09326171875, "learning_rate": 3.411599789731908e-07, "loss": -0.0021, "reward": 1.5600254535675049, "reward_std": 0.23971496522426605, "rewards/accuracy_reward_stage2": 0.5756504535675049, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3761 }, { "completion_length": 10.359375, "epoch": 0.6591904678465043, "grad_norm": 19.832552947194255, "kl": 0.267578125, "learning_rate": 3.4098475556334326e-07, "loss": 0.0016, "reward": 1.6911460161209106, "reward_std": 0.3048381209373474, "rewards/accuracy_reward_stage2": 0.7380210161209106, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3762 }, { "completion_length": 9.84375, "epoch": 0.6593656912563518, "grad_norm": 18.808604800352317, "kl": 0.189453125, "learning_rate": 3.4080953215349565e-07, "loss": -0.0427, "reward": 1.629578948020935, "reward_std": 0.28050941228866577, "rewards/accuracy_reward_stage2": 0.8014539480209351, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3763 }, { "completion_length": 8.078125, "epoch": 0.6595409146661994, "grad_norm": 18.145882072947025, "kl": 0.21484375, "learning_rate": 3.4063430874364814e-07, "loss": -0.042, "reward": 1.5573110580444336, "reward_std": 0.20979392528533936, "rewards/accuracy_reward_stage2": 0.6198111176490784, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3764 }, { "completion_length": 7.40625, "epoch": 0.659716138076047, "grad_norm": 15.642888931193294, "kl": 0.1015625, "learning_rate": 3.404590853338006e-07, "loss": -0.0034, "reward": 1.6968038082122803, "reward_std": 0.19613364338874817, "rewards/accuracy_reward_stage2": 0.8374287486076355, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3765 }, { "completion_length": 10.25, "epoch": 0.6598913614858946, "grad_norm": 23.440768685123555, "kl": 0.09912109375, "learning_rate": 3.40283861923953e-07, "loss": -0.0435, "reward": 1.463047981262207, "reward_std": 0.37215834856033325, "rewards/accuracy_reward_stage2": 0.4942980110645294, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3766 }, { "completion_length": 11.171875, "epoch": 0.6600665848957421, "grad_norm": 18.89718950407704, "kl": 0.1845703125, "learning_rate": 3.4010863851410546e-07, "loss": 0.0667, "reward": 1.4453023672103882, "reward_std": 0.29632043838500977, "rewards/accuracy_reward_stage2": 0.6953023672103882, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3767 }, { "completion_length": 9.734375, "epoch": 0.6602418083055897, "grad_norm": 18.261741322305895, "kl": 0.15234375, "learning_rate": 3.3993341510425795e-07, "loss": 0.0607, "reward": 1.595839023590088, "reward_std": 0.18660318851470947, "rewards/accuracy_reward_stage2": 0.7208389639854431, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3768 }, { "completion_length": 20.8125, "epoch": 0.6604170317154372, "grad_norm": 20.39858492028997, "kl": 0.1357421875, "learning_rate": 3.3975819169441034e-07, "loss": -0.0171, "reward": 1.3963425159454346, "reward_std": 0.27449262142181396, "rewards/accuracy_reward_stage2": 0.42759257555007935, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3769 }, { "completion_length": 13.09375, "epoch": 0.6605922551252847, "grad_norm": 17.313030395092323, "kl": 0.1484375, "learning_rate": 3.395829682845628e-07, "loss": 0.0209, "reward": 1.204599142074585, "reward_std": 0.18435396254062653, "rewards/accuracy_reward_stage2": 0.34522414207458496, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3770 }, { "completion_length": 7.796875, "epoch": 0.6607674785351323, "grad_norm": 23.390768820207825, "kl": 0.06494140625, "learning_rate": 3.394077448747152e-07, "loss": 0.026, "reward": 1.7073495388031006, "reward_std": 0.2253035306930542, "rewards/accuracy_reward_stage2": 0.7073495984077454, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3771 }, { "completion_length": 9.046875, "epoch": 0.6609427019449798, "grad_norm": 17.356307836688707, "kl": 0.1259765625, "learning_rate": 3.392325214648677e-07, "loss": 0.0127, "reward": 1.6848533153533936, "reward_std": 0.18406951427459717, "rewards/accuracy_reward_stage2": 0.7004783153533936, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3772 }, { "completion_length": 10.75, "epoch": 0.6611179253548274, "grad_norm": 19.927632990776967, "kl": 0.2431640625, "learning_rate": 3.3905729805502015e-07, "loss": -0.0305, "reward": 1.4557608366012573, "reward_std": 0.36149799823760986, "rewards/accuracy_reward_stage2": 0.5182607769966125, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3773 }, { "completion_length": 24.546875, "epoch": 0.6612931487646749, "grad_norm": 17.501135465012446, "kl": 0.0830078125, "learning_rate": 3.388820746451726e-07, "loss": -0.0332, "reward": 1.3385493755340576, "reward_std": 0.12510152161121368, "rewards/accuracy_reward_stage2": 0.3697994351387024, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3774 }, { "completion_length": 6.609375, "epoch": 0.6614683721745225, "grad_norm": 17.16426784437837, "kl": 0.1767578125, "learning_rate": 3.38706851235325e-07, "loss": 0.0083, "reward": 1.2884865999221802, "reward_std": 0.25939449667930603, "rewards/accuracy_reward_stage2": 0.3197365701198578, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3775 }, { "completion_length": 6.46875, "epoch": 0.66164359558437, "grad_norm": 18.955321642768602, "kl": 0.26953125, "learning_rate": 3.3853162782547747e-07, "loss": 0.0137, "reward": 1.5619122982025146, "reward_std": 0.25723767280578613, "rewards/accuracy_reward_stage2": 0.6087872982025146, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3776 }, { "completion_length": 12.453125, "epoch": 0.6618188189942176, "grad_norm": 18.881526029270855, "kl": 0.11376953125, "learning_rate": 3.383564044156299e-07, "loss": 0.0455, "reward": 1.4165642261505127, "reward_std": 0.30341148376464844, "rewards/accuracy_reward_stage2": 0.4165641665458679, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3777 }, { "completion_length": 13.515625, "epoch": 0.6619940424040652, "grad_norm": 20.948955131923597, "kl": 0.138671875, "learning_rate": 3.3818118100578235e-07, "loss": 0.046, "reward": 1.5264365673065186, "reward_std": 0.30605074763298035, "rewards/accuracy_reward_stage2": 0.5420615077018738, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3778 }, { "completion_length": 8.734375, "epoch": 0.6621692658139128, "grad_norm": 14.31361978247023, "kl": 0.234375, "learning_rate": 3.380059575959348e-07, "loss": 0.0496, "reward": 1.4558091163635254, "reward_std": 0.15434116125106812, "rewards/accuracy_reward_stage2": 0.5964341163635254, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3779 }, { "completion_length": 11.734375, "epoch": 0.6623444892237603, "grad_norm": 15.118890139919, "kl": 0.1123046875, "learning_rate": 3.378307341860873e-07, "loss": 0.0048, "reward": 1.7935502529144287, "reward_std": 0.21152012050151825, "rewards/accuracy_reward_stage2": 0.8091753125190735, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3780 }, { "completion_length": 6.75, "epoch": 0.6625197126336079, "grad_norm": 19.08976605237901, "kl": 0.408203125, "learning_rate": 3.376555107762397e-07, "loss": 0.0432, "reward": 1.4881014823913574, "reward_std": 0.3304804861545563, "rewards/accuracy_reward_stage2": 0.5506014227867126, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3781 }, { "completion_length": 11.5, "epoch": 0.6626949360434554, "grad_norm": 30.88448000309724, "kl": 0.177734375, "learning_rate": 3.374802873663921e-07, "loss": 0.0773, "reward": 1.3979463577270508, "reward_std": 0.21971935033798218, "rewards/accuracy_reward_stage2": 0.6479463577270508, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3782 }, { "completion_length": 8.421875, "epoch": 0.662870159453303, "grad_norm": 14.383039345570571, "kl": 0.1767578125, "learning_rate": 3.3730506395654455e-07, "loss": 0.0097, "reward": 1.3854167461395264, "reward_std": 0.19974718987941742, "rewards/accuracy_reward_stage2": 0.4166666865348816, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3783 }, { "completion_length": 11.6875, "epoch": 0.6630453828631505, "grad_norm": 32.26407359455346, "kl": 0.1572265625, "learning_rate": 3.3712984054669704e-07, "loss": 0.0188, "reward": 1.6351269483566284, "reward_std": 0.12910021841526031, "rewards/accuracy_reward_stage2": 0.9007519483566284, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3784 }, { "completion_length": 9.609375, "epoch": 0.6632206062729981, "grad_norm": 18.12670377480947, "kl": 0.142578125, "learning_rate": 3.369546171368495e-07, "loss": 0.0208, "reward": 1.6540381908416748, "reward_std": 0.2671361565589905, "rewards/accuracy_reward_stage2": 0.66966313123703, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3785 }, { "completion_length": 10.171875, "epoch": 0.6633958296828456, "grad_norm": 19.31629462658784, "kl": 0.119140625, "learning_rate": 3.367793937270019e-07, "loss": 0.0034, "reward": 1.6815602779388428, "reward_std": 0.27131250500679016, "rewards/accuracy_reward_stage2": 0.6971853971481323, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3786 }, { "completion_length": 11.0625, "epoch": 0.6635710530926932, "grad_norm": 19.81498896949635, "kl": 0.154296875, "learning_rate": 3.3660417031715436e-07, "loss": -0.0112, "reward": 1.601413607597351, "reward_std": 0.37887099385261536, "rewards/accuracy_reward_stage2": 0.6326636075973511, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3787 }, { "completion_length": 8.828125, "epoch": 0.6637462765025407, "grad_norm": 21.39361118233043, "kl": 0.267578125, "learning_rate": 3.364289469073068e-07, "loss": 0.0186, "reward": 1.3068628311157227, "reward_std": 0.2723737359046936, "rewards/accuracy_reward_stage2": 0.4631127715110779, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3788 }, { "completion_length": 7.375, "epoch": 0.6639214999123882, "grad_norm": 16.641332895006535, "kl": 0.11767578125, "learning_rate": 3.3625372349745924e-07, "loss": -0.0136, "reward": 1.646165132522583, "reward_std": 0.17670243978500366, "rewards/accuracy_reward_stage2": 0.8024150729179382, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3789 }, { "completion_length": 11.125, "epoch": 0.6640967233222359, "grad_norm": 12.84345404563453, "kl": 0.031494140625, "learning_rate": 3.360785000876117e-07, "loss": 0.0126, "reward": 1.646390438079834, "reward_std": 0.09573078155517578, "rewards/accuracy_reward_stage2": 0.646390438079834, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3790 }, { "completion_length": 10.984375, "epoch": 0.6642719467320835, "grad_norm": 17.804293261151166, "kl": 0.154296875, "learning_rate": 3.359032766777641e-07, "loss": 0.0173, "reward": 1.6868422031402588, "reward_std": 0.26586437225341797, "rewards/accuracy_reward_stage2": 0.7024672031402588, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3791 }, { "completion_length": 11.109375, "epoch": 0.664447170141931, "grad_norm": 22.70462929976115, "kl": 0.11962890625, "learning_rate": 3.357280532679166e-07, "loss": 0.0169, "reward": 1.4475001096725464, "reward_std": 0.358273446559906, "rewards/accuracy_reward_stage2": 0.4631251394748688, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3792 }, { "completion_length": 8.984375, "epoch": 0.6646223935517785, "grad_norm": 13.1589326041502, "kl": 0.19140625, "learning_rate": 3.3555282985806905e-07, "loss": -0.0119, "reward": 1.8759760856628418, "reward_std": 0.19113053381443024, "rewards/accuracy_reward_stage2": 0.9072260856628418, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3793 }, { "completion_length": 9.375, "epoch": 0.6647976169616261, "grad_norm": 16.2208180979175, "kl": 0.1962890625, "learning_rate": 3.3537760644822143e-07, "loss": -0.0445, "reward": 1.6830174922943115, "reward_std": 0.301523357629776, "rewards/accuracy_reward_stage2": 0.7298924326896667, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3794 }, { "completion_length": 10.59375, "epoch": 0.6649728403714736, "grad_norm": 22.2857496134525, "kl": 0.1591796875, "learning_rate": 3.352023830383739e-07, "loss": 0.0197, "reward": 1.7518019676208496, "reward_std": 0.22051015496253967, "rewards/accuracy_reward_stage2": 0.8924268484115601, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3795 }, { "completion_length": 10.78125, "epoch": 0.6651480637813212, "grad_norm": 11.82391521934736, "kl": 0.1083984375, "learning_rate": 3.3502715962852637e-07, "loss": -0.0009, "reward": 1.7660496234893799, "reward_std": 0.17463505268096924, "rewards/accuracy_reward_stage2": 0.7816746234893799, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3796 }, { "completion_length": 11.796875, "epoch": 0.6653232871911687, "grad_norm": 19.721619897430543, "kl": 0.0966796875, "learning_rate": 3.348519362186788e-07, "loss": -0.0012, "reward": 1.1196482181549072, "reward_std": 0.23411786556243896, "rewards/accuracy_reward_stage2": 0.2602732479572296, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3797 }, { "completion_length": 5.296875, "epoch": 0.6654985106010163, "grad_norm": 11.758927680436287, "kl": 0.28125, "learning_rate": 3.3467671280883125e-07, "loss": 0.0241, "reward": 1.6875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward_stage2": 0.84375, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3798 }, { "completion_length": 9.265625, "epoch": 0.6656737340108638, "grad_norm": 16.046421595628004, "kl": 0.345703125, "learning_rate": 3.345014893989837e-07, "loss": -0.043, "reward": 1.5293065309524536, "reward_std": 0.28512483835220337, "rewards/accuracy_reward_stage2": 0.7324315309524536, "rewards/format_reward_stage1_pointerpad": 0.796875, "scores/accuracy_reward_stage2": 0.796875, "step": 3799 }, { "completion_length": 13.265625, "epoch": 0.6658489574207114, "grad_norm": 18.830911554311168, "kl": 0.017333984375, "learning_rate": 3.343262659891362e-07, "loss": 0.0069, "reward": 1.7495684623718262, "reward_std": 0.20599254965782166, "rewards/accuracy_reward_stage2": 0.7495684027671814, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3800 }, { "completion_length": 11.84375, "epoch": 0.6660241808305589, "grad_norm": 12.271733362216043, "kl": 0.05517578125, "learning_rate": 3.3415104257928856e-07, "loss": 0.0221, "reward": 1.5179245471954346, "reward_std": 0.1245698481798172, "rewards/accuracy_reward_stage2": 0.5179246068000793, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3801 }, { "completion_length": 11.203125, "epoch": 0.6661994042404065, "grad_norm": 21.78953904690612, "kl": 0.2412109375, "learning_rate": 3.33975819169441e-07, "loss": 0.0256, "reward": 1.5020623207092285, "reward_std": 0.2805957794189453, "rewards/accuracy_reward_stage2": 0.5333123207092285, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3802 }, { "completion_length": 11.203125, "epoch": 0.6663746276502541, "grad_norm": 20.18266093377564, "kl": 0.326171875, "learning_rate": 3.3380059575959344e-07, "loss": -0.0134, "reward": 1.2390034198760986, "reward_std": 0.2471129447221756, "rewards/accuracy_reward_stage2": 0.4265034794807434, "rewards/format_reward_stage1_pointerpad": 0.8125, "scores/accuracy_reward_stage2": 0.8125, "step": 3803 }, { "completion_length": 14.59375, "epoch": 0.6665498510601017, "grad_norm": 18.078311944493244, "kl": 0.1728515625, "learning_rate": 3.3362537234974594e-07, "loss": -0.0209, "reward": 1.5762239694595337, "reward_std": 0.277314692735672, "rewards/accuracy_reward_stage2": 0.6230989098548889, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3804 }, { "completion_length": 9.703125, "epoch": 0.6667250744699492, "grad_norm": 17.04188443236299, "kl": 0.1328125, "learning_rate": 3.334501489398984e-07, "loss": -0.0306, "reward": 1.7449889183044434, "reward_std": 0.25133174657821655, "rewards/accuracy_reward_stage2": 0.7762388586997986, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3805 }, { "completion_length": 11.203125, "epoch": 0.6669002978797968, "grad_norm": 49.30679087926395, "kl": 0.5625, "learning_rate": 3.332749255300508e-07, "loss": 0.1652, "reward": 1.2607142925262451, "reward_std": 0.21106266975402832, "rewards/accuracy_reward_stage2": 0.41696426272392273, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3806 }, { "completion_length": 9.53125, "epoch": 0.6670755212896443, "grad_norm": 20.956408970849377, "kl": 0.259765625, "learning_rate": 3.330997021202032e-07, "loss": 0.0249, "reward": 1.5182335376739502, "reward_std": 0.2621886730194092, "rewards/accuracy_reward_stage2": 0.549483597278595, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3807 }, { "completion_length": 11.140625, "epoch": 0.6672507446994919, "grad_norm": 28.610511173855173, "kl": 0.1904296875, "learning_rate": 3.329244787103557e-07, "loss": 0.0973, "reward": 1.5078845024108887, "reward_std": 0.20054185390472412, "rewards/accuracy_reward_stage2": 0.6328844428062439, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3808 }, { "completion_length": 5.421875, "epoch": 0.6674259681093394, "grad_norm": 15.317171215419528, "kl": 0.1533203125, "learning_rate": 3.3274925530050813e-07, "loss": -0.0258, "reward": 1.824300765991211, "reward_std": 0.26174771785736084, "rewards/accuracy_reward_stage2": 0.8555507063865662, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3809 }, { "completion_length": 12.40625, "epoch": 0.667601191519187, "grad_norm": 23.217493178402837, "kl": 0.07177734375, "learning_rate": 3.325740318906606e-07, "loss": 0.0323, "reward": 1.3106931447982788, "reward_std": 0.28896403312683105, "rewards/accuracy_reward_stage2": 0.4356931447982788, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3810 }, { "completion_length": 10.28125, "epoch": 0.6677764149290345, "grad_norm": 26.477477101239245, "kl": 0.1142578125, "learning_rate": 3.32398808480813e-07, "loss": 0.0168, "reward": 1.642218828201294, "reward_std": 0.3020542860031128, "rewards/accuracy_reward_stage2": 0.6578439474105835, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3811 }, { "completion_length": 11.296875, "epoch": 0.667951638338882, "grad_norm": 23.32254696210251, "kl": 0.318359375, "learning_rate": 3.322235850709655e-07, "loss": 0.052, "reward": 1.5385891199111938, "reward_std": 0.324771523475647, "rewards/accuracy_reward_stage2": 0.7104641199111938, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3812 }, { "completion_length": 11.34375, "epoch": 0.6681268617487296, "grad_norm": 19.10950592222298, "kl": 0.18359375, "learning_rate": 3.3204836166111795e-07, "loss": 0.0445, "reward": 1.59661066532135, "reward_std": 0.25543177127838135, "rewards/accuracy_reward_stage2": 0.6122356653213501, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3813 }, { "completion_length": 20.6875, "epoch": 0.6683020851585771, "grad_norm": 19.02191313355224, "kl": 0.1259765625, "learning_rate": 3.3187313825127033e-07, "loss": 0.0062, "reward": 1.5268758535385132, "reward_std": 0.18108849227428436, "rewards/accuracy_reward_stage2": 0.5425008535385132, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3814 }, { "completion_length": 16.15625, "epoch": 0.6684773085684247, "grad_norm": 18.578214590711934, "kl": 0.1015625, "learning_rate": 3.3169791484142277e-07, "loss": -0.0034, "reward": 1.443403959274292, "reward_std": 0.22445048391819, "rewards/accuracy_reward_stage2": 0.45902884006500244, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3815 }, { "completion_length": 11.703125, "epoch": 0.6686525319782723, "grad_norm": 22.83780116713671, "kl": 0.1845703125, "learning_rate": 3.3152269143157527e-07, "loss": 0.0297, "reward": 1.686450719833374, "reward_std": 0.24080437421798706, "rewards/accuracy_reward_stage2": 0.827075719833374, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3816 }, { "completion_length": 13.875, "epoch": 0.6688277553881199, "grad_norm": 16.72810642640274, "kl": 0.1328125, "learning_rate": 3.313474680217277e-07, "loss": 0.012, "reward": 1.5293313264846802, "reward_std": 0.1489149034023285, "rewards/accuracy_reward_stage2": 0.5449563264846802, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3817 }, { "completion_length": 10.625, "epoch": 0.6690029787979674, "grad_norm": 19.73170627422111, "kl": 0.2138671875, "learning_rate": 3.3117224461188014e-07, "loss": 0.0133, "reward": 1.5520833730697632, "reward_std": 0.3328608274459839, "rewards/accuracy_reward_stage2": 0.7083333134651184, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3818 }, { "completion_length": 17.078125, "epoch": 0.669178202207815, "grad_norm": 13.387912599799588, "kl": 0.255859375, "learning_rate": 3.309970212020326e-07, "loss": 0.0244, "reward": 1.3956577777862549, "reward_std": 0.20234528183937073, "rewards/accuracy_reward_stage2": 0.4269077777862549, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3819 }, { "completion_length": 11.0625, "epoch": 0.6693534256176625, "grad_norm": 16.666920323225234, "kl": 0.07861328125, "learning_rate": 3.30821797792185e-07, "loss": 0.0313, "reward": 1.6344341039657593, "reward_std": 0.12134034186601639, "rewards/accuracy_reward_stage2": 0.759434163570404, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3820 }, { "completion_length": 8.453125, "epoch": 0.6695286490275101, "grad_norm": 22.67417343158387, "kl": 0.07470703125, "learning_rate": 3.3064657438233746e-07, "loss": 0.03, "reward": 1.5556546449661255, "reward_std": 0.19017130136489868, "rewards/accuracy_reward_stage2": 0.5556546449661255, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3821 }, { "completion_length": 19.84375, "epoch": 0.6697038724373576, "grad_norm": 16.46627356736991, "kl": 0.036376953125, "learning_rate": 3.304713509724899e-07, "loss": 0.0146, "reward": 1.5332281589508057, "reward_std": 0.07025317847728729, "rewards/accuracy_reward_stage2": 0.6582280397415161, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3822 }, { "completion_length": 9.234375, "epoch": 0.6698790958472052, "grad_norm": 26.80407941462116, "kl": 0.291015625, "learning_rate": 3.3029612756264234e-07, "loss": 0.038, "reward": 1.3822216987609863, "reward_std": 0.2539028525352478, "rewards/accuracy_reward_stage2": 0.6634716987609863, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3823 }, { "completion_length": 6.8125, "epoch": 0.6700543192570527, "grad_norm": 8.311960186174872, "kl": 0.045166015625, "learning_rate": 3.3012090415279484e-07, "loss": 0.0181, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward_stage2": 0.71875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3824 }, { "completion_length": 7.15625, "epoch": 0.6702295426669003, "grad_norm": 17.01314817220025, "kl": 0.1689453125, "learning_rate": 3.299456807429473e-07, "loss": -0.0386, "reward": 1.4015306234359741, "reward_std": 0.24766838550567627, "rewards/accuracy_reward_stage2": 0.5734056234359741, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3825 }, { "completion_length": 11.09375, "epoch": 0.6704047660767478, "grad_norm": 14.522585950078515, "kl": 0.1298828125, "learning_rate": 3.2977045733309966e-07, "loss": -0.0286, "reward": 1.3737890720367432, "reward_std": 0.1715371012687683, "rewards/accuracy_reward_stage2": 0.4050390124320984, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3826 }, { "completion_length": 11.515625, "epoch": 0.6705799894865954, "grad_norm": 19.10620552915964, "kl": 0.09375, "learning_rate": 3.295952339232521e-07, "loss": -0.021, "reward": 1.4859750270843506, "reward_std": 0.20169678330421448, "rewards/accuracy_reward_stage2": 0.5172249674797058, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3827 }, { "completion_length": 10.203125, "epoch": 0.6707552128964429, "grad_norm": 26.683339127329393, "kl": 0.1962890625, "learning_rate": 3.2942001051340454e-07, "loss": 0.0337, "reward": 1.3072201013565063, "reward_std": 0.2676845192909241, "rewards/accuracy_reward_stage2": 0.44784507155418396, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3828 }, { "completion_length": 10.953125, "epoch": 0.6709304363062906, "grad_norm": 17.501991848539717, "kl": 0.07275390625, "learning_rate": 3.2924478710355703e-07, "loss": -0.0151, "reward": 1.5807785987854004, "reward_std": 0.1519029289484024, "rewards/accuracy_reward_stage2": 0.5964034795761108, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3829 }, { "completion_length": 10.65625, "epoch": 0.6711056597161381, "grad_norm": 16.364446061626836, "kl": 0.16796875, "learning_rate": 3.2906956369370947e-07, "loss": -0.05, "reward": 1.3926641941070557, "reward_std": 0.30592912435531616, "rewards/accuracy_reward_stage2": 0.4395391345024109, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3830 }, { "completion_length": 8.59375, "epoch": 0.6712808831259857, "grad_norm": 14.233024889294107, "kl": 0.17578125, "learning_rate": 3.288943402838619e-07, "loss": 0.0304, "reward": 1.721284031867981, "reward_std": 0.20616403222084045, "rewards/accuracy_reward_stage2": 0.736909031867981, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3831 }, { "completion_length": 10.84375, "epoch": 0.6714561065358332, "grad_norm": 19.467527727246598, "kl": 0.31640625, "learning_rate": 3.287191168740144e-07, "loss": -0.0196, "reward": 1.2939127683639526, "reward_std": 0.3219314515590668, "rewards/accuracy_reward_stage2": 0.37203776836395264, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3832 }, { "completion_length": 13.296875, "epoch": 0.6716313299456808, "grad_norm": 19.922162130122, "kl": 0.1416015625, "learning_rate": 3.285438934641668e-07, "loss": -0.0715, "reward": 1.633192777633667, "reward_std": 0.23791098594665527, "rewards/accuracy_reward_stage2": 0.6800678372383118, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3833 }, { "completion_length": 9.796875, "epoch": 0.6718065533555283, "grad_norm": 18.082579453339836, "kl": 0.07470703125, "learning_rate": 3.2836867005431923e-07, "loss": -0.0143, "reward": 1.5907870531082153, "reward_std": 0.1954219490289688, "rewards/accuracy_reward_stage2": 0.6064120531082153, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3834 }, { "completion_length": 8.859375, "epoch": 0.6719817767653758, "grad_norm": 16.720129476556014, "kl": 0.031982421875, "learning_rate": 3.2819344664447167e-07, "loss": 0.0128, "reward": 1.455439805984497, "reward_std": 0.23492306470870972, "rewards/accuracy_reward_stage2": 0.45543980598449707, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3835 }, { "completion_length": 8.34375, "epoch": 0.6721570001752234, "grad_norm": 32.88435394651061, "kl": 0.26171875, "learning_rate": 3.280182232346241e-07, "loss": 0.0663, "reward": 1.4930814504623413, "reward_std": 0.18936826288700104, "rewards/accuracy_reward_stage2": 0.6337064504623413, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3836 }, { "completion_length": 12.78125, "epoch": 0.6723322235850709, "grad_norm": 17.80399433349402, "kl": 0.130859375, "learning_rate": 3.278429998247766e-07, "loss": -0.0312, "reward": 1.5512590408325195, "reward_std": 0.29897886514663696, "rewards/accuracy_reward_stage2": 0.5825091004371643, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3837 }, { "completion_length": 9.21875, "epoch": 0.6725074469949185, "grad_norm": 13.305832739926364, "kl": 0.177734375, "learning_rate": 3.2766777641492904e-07, "loss": -0.0165, "reward": 1.3768309354782104, "reward_std": 0.16466762125492096, "rewards/accuracy_reward_stage2": 0.40808090567588806, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3838 }, { "completion_length": 6.578125, "epoch": 0.672682670404766, "grad_norm": 19.53413402892302, "kl": 0.0517578125, "learning_rate": 3.2749255300508143e-07, "loss": 0.0207, "reward": 1.587104082107544, "reward_std": 0.13184921443462372, "rewards/accuracy_reward_stage2": 0.5871041417121887, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3839 }, { "completion_length": 7.0625, "epoch": 0.6728578938146136, "grad_norm": 14.381656777284563, "kl": 0.130859375, "learning_rate": 3.2731732959523387e-07, "loss": -0.021, "reward": 1.6133270263671875, "reward_std": 0.206298828125, "rewards/accuracy_reward_stage2": 0.6445769667625427, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3840 }, { "completion_length": 9.90625, "epoch": 0.6730331172244612, "grad_norm": 18.426939341494425, "kl": 0.140625, "learning_rate": 3.2714210618538636e-07, "loss": 0.0211, "reward": 1.5109727382659912, "reward_std": 0.1667131632566452, "rewards/accuracy_reward_stage2": 0.5265976786613464, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3841 }, { "completion_length": 9.140625, "epoch": 0.6732083406343088, "grad_norm": 17.32055192337897, "kl": 0.1826171875, "learning_rate": 3.269668827755388e-07, "loss": -0.0152, "reward": 1.6525171995162964, "reward_std": 0.357276976108551, "rewards/accuracy_reward_stage2": 0.6837671399116516, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3842 }, { "completion_length": 10.21875, "epoch": 0.6733835640441563, "grad_norm": 23.93115293151798, "kl": 0.07373046875, "learning_rate": 3.2679165936569124e-07, "loss": -0.0037, "reward": 1.4673311710357666, "reward_std": 0.38846614956855774, "rewards/accuracy_reward_stage2": 0.48295605182647705, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3843 }, { "completion_length": 8.78125, "epoch": 0.6735587874540039, "grad_norm": 22.95815368304367, "kl": 0.203125, "learning_rate": 3.266164359558437e-07, "loss": 0.0459, "reward": 1.4174572229385376, "reward_std": 0.27246612310409546, "rewards/accuracy_reward_stage2": 0.5580822229385376, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3844 }, { "completion_length": 9.40625, "epoch": 0.6737340108638514, "grad_norm": 20.06182804507912, "kl": 0.08056640625, "learning_rate": 3.264412125459961e-07, "loss": -0.0119, "reward": 1.3923089504241943, "reward_std": 0.26168495416641235, "rewards/accuracy_reward_stage2": 0.40793395042419434, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3845 }, { "completion_length": 11.703125, "epoch": 0.673909234273699, "grad_norm": 18.47541852117159, "kl": 0.030517578125, "learning_rate": 3.2626598913614856e-07, "loss": 0.0122, "reward": 1.5444600582122803, "reward_std": 0.1621989905834198, "rewards/accuracy_reward_stage2": 0.544460117816925, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3846 }, { "completion_length": 9.25, "epoch": 0.6740844576835465, "grad_norm": 18.265132964504144, "kl": 0.185546875, "learning_rate": 3.26090765726301e-07, "loss": -0.0559, "reward": 1.390181541442871, "reward_std": 0.19728723168373108, "rewards/accuracy_reward_stage2": 0.4526815414428711, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3847 }, { "completion_length": 7.5625, "epoch": 0.6742596810933941, "grad_norm": 23.277133642814135, "kl": 0.0625, "learning_rate": 3.2591554231645344e-07, "loss": 0.0251, "reward": 1.5694992542266846, "reward_std": 0.3129950165748596, "rewards/accuracy_reward_stage2": 0.5694993138313293, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3848 }, { "completion_length": 8.875, "epoch": 0.6744349045032416, "grad_norm": 17.61604651957286, "kl": 0.0252685546875, "learning_rate": 3.2574031890660593e-07, "loss": 0.0101, "reward": 1.8145318031311035, "reward_std": 0.18421462178230286, "rewards/accuracy_reward_stage2": 0.8145317435264587, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3849 }, { "completion_length": 14.28125, "epoch": 0.6746101279130892, "grad_norm": 14.546044174712485, "kl": 0.0291748046875, "learning_rate": 3.2556509549675837e-07, "loss": 0.0117, "reward": 1.296875, "reward_std": 0.23144522309303284, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3850 }, { "completion_length": 8.390625, "epoch": 0.6747853513229367, "grad_norm": 18.886644065465404, "kl": 0.251953125, "learning_rate": 3.253898720869108e-07, "loss": 0.0403, "reward": 1.5611273050308228, "reward_std": 0.24465563893318176, "rewards/accuracy_reward_stage2": 0.5923773050308228, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3851 }, { "completion_length": 11.625, "epoch": 0.6749605747327843, "grad_norm": 15.88017314047495, "kl": 0.09326171875, "learning_rate": 3.252146486770632e-07, "loss": 0.004, "reward": 1.6079230308532715, "reward_std": 0.19661104679107666, "rewards/accuracy_reward_stage2": 0.6235479712486267, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3852 }, { "completion_length": 7.0625, "epoch": 0.6751357981426318, "grad_norm": 14.757801964566244, "kl": 0.142578125, "learning_rate": 3.250394252672157e-07, "loss": 0.0128, "reward": 1.6863281726837158, "reward_std": 0.18864640593528748, "rewards/accuracy_reward_stage2": 0.7019531726837158, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3853 }, { "completion_length": 8.640625, "epoch": 0.6753110215524795, "grad_norm": 16.493447509624776, "kl": 0.271484375, "learning_rate": 3.2486420185736813e-07, "loss": -0.0021, "reward": 1.39857017993927, "reward_std": 0.2185536026954651, "rewards/accuracy_reward_stage2": 0.5704452395439148, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3854 }, { "completion_length": 15.359375, "epoch": 0.675486244962327, "grad_norm": 14.82144590858429, "kl": 0.0712890625, "learning_rate": 3.2468897844752057e-07, "loss": 0.0285, "reward": 1.5174546241760254, "reward_std": 0.10479126870632172, "rewards/accuracy_reward_stage2": 0.6424546837806702, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3855 }, { "completion_length": 12.03125, "epoch": 0.6756614683721746, "grad_norm": 20.812155729239077, "kl": 0.140625, "learning_rate": 3.24513755037673e-07, "loss": 0.0563, "reward": 1.5602467060089111, "reward_std": 0.19248421490192413, "rewards/accuracy_reward_stage2": 0.6852467656135559, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3856 }, { "completion_length": 7.15625, "epoch": 0.6758366917820221, "grad_norm": 11.617742248311606, "kl": 0.07421875, "learning_rate": 3.243385316278255e-07, "loss": 0.0298, "reward": 1.7109254598617554, "reward_std": 0.06991486251354218, "rewards/accuracy_reward_stage2": 0.7109254598617554, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3857 }, { "completion_length": 12.5625, "epoch": 0.6760119151918697, "grad_norm": 27.784956148030506, "kl": 0.25, "learning_rate": 3.241633082179779e-07, "loss": 0.0623, "reward": 1.5572917461395264, "reward_std": 0.3274396061897278, "rewards/accuracy_reward_stage2": 0.6979166865348816, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3858 }, { "completion_length": 8.734375, "epoch": 0.6761871386017172, "grad_norm": 25.093731646428207, "kl": 0.08056640625, "learning_rate": 3.2398808480813033e-07, "loss": 0.0323, "reward": 1.471142053604126, "reward_std": 0.3559247851371765, "rewards/accuracy_reward_stage2": 0.596142053604126, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3859 }, { "completion_length": 14.96875, "epoch": 0.6763623620115647, "grad_norm": 19.326438345530867, "kl": 0.0986328125, "learning_rate": 3.2381286139828277e-07, "loss": 0.0393, "reward": 1.6141562461853027, "reward_std": 0.1951741874217987, "rewards/accuracy_reward_stage2": 0.6141563057899475, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3860 }, { "completion_length": 9.03125, "epoch": 0.6765375854214123, "grad_norm": 23.327624059918623, "kl": 0.212890625, "learning_rate": 3.2363763798843526e-07, "loss": 0.0177, "reward": 1.4712347984313965, "reward_std": 0.34458082914352417, "rewards/accuracy_reward_stage2": 0.5024847984313965, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3861 }, { "completion_length": 19.65625, "epoch": 0.6767128088312598, "grad_norm": 16.849083072806163, "kl": 0.13671875, "learning_rate": 3.234624145785877e-07, "loss": -0.0213, "reward": 1.222360372543335, "reward_std": 0.2840573191642761, "rewards/accuracy_reward_stage2": 0.37861043214797974, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3862 }, { "completion_length": 11.234375, "epoch": 0.6768880322411074, "grad_norm": 17.208273637782654, "kl": 0.2578125, "learning_rate": 3.2328719116874014e-07, "loss": 0.0356, "reward": 1.5516068935394287, "reward_std": 0.2445361167192459, "rewards/accuracy_reward_stage2": 0.7078569531440735, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3863 }, { "completion_length": 15.890625, "epoch": 0.6770632556509549, "grad_norm": 18.015938285221996, "kl": 0.1787109375, "learning_rate": 3.231119677588926e-07, "loss": -0.0118, "reward": 1.4646062850952148, "reward_std": 0.33400917053222656, "rewards/accuracy_reward_stage2": 0.4958563446998596, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3864 }, { "completion_length": 14.921875, "epoch": 0.6772384790608025, "grad_norm": 18.196032916563585, "kl": 0.126953125, "learning_rate": 3.22936744349045e-07, "loss": 0.0115, "reward": 1.5467426776885986, "reward_std": 0.22806578874588013, "rewards/accuracy_reward_stage2": 0.5623677372932434, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3865 }, { "completion_length": 8.765625, "epoch": 0.67741370247065, "grad_norm": 16.52686949186202, "kl": 0.1728515625, "learning_rate": 3.2276152093919746e-07, "loss": -0.0469, "reward": 1.771558403968811, "reward_std": 0.23040996491909027, "rewards/accuracy_reward_stage2": 0.818433403968811, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3866 }, { "completion_length": 11.6875, "epoch": 0.6775889258804977, "grad_norm": 19.275094089668283, "kl": 0.177734375, "learning_rate": 3.225862975293499e-07, "loss": 0.0131, "reward": 1.3561549186706543, "reward_std": 0.2779897451400757, "rewards/accuracy_reward_stage2": 0.4967798888683319, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3867 }, { "completion_length": 8.28125, "epoch": 0.6777641492903452, "grad_norm": 15.92255726519388, "kl": 0.10888671875, "learning_rate": 3.2241107411950234e-07, "loss": -0.0284, "reward": 1.4668264389038086, "reward_std": 0.1967121809720993, "rewards/accuracy_reward_stage2": 0.6074514389038086, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3868 }, { "completion_length": 6.046875, "epoch": 0.6779393727001928, "grad_norm": 11.782657382689344, "kl": 0.107421875, "learning_rate": 3.2223585070965483e-07, "loss": -0.0013, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward_stage2": 0.796875, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3869 }, { "completion_length": 7.09375, "epoch": 0.6781145961100403, "grad_norm": 15.969784495706426, "kl": 0.058837890625, "learning_rate": 3.2206062729980727e-07, "loss": 0.0236, "reward": 1.6320271492004395, "reward_std": 0.2036266028881073, "rewards/accuracy_reward_stage2": 0.6320271492004395, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3870 }, { "completion_length": 10.390625, "epoch": 0.6782898195198879, "grad_norm": 77.97322350800017, "kl": 0.625, "learning_rate": 3.2188540388995966e-07, "loss": 0.1614, "reward": 1.2760417461395264, "reward_std": 0.25043365359306335, "rewards/accuracy_reward_stage2": 0.4322916567325592, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3871 }, { "completion_length": 6.25, "epoch": 0.6784650429297354, "grad_norm": 21.606935492858284, "kl": 0.400390625, "learning_rate": 3.217101804801121e-07, "loss": 0.0334, "reward": 1.6798467636108398, "reward_std": 0.38453322649002075, "rewards/accuracy_reward_stage2": 0.7423468232154846, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3872 }, { "completion_length": 8.3125, "epoch": 0.678640266339583, "grad_norm": 17.220294287482876, "kl": 0.2197265625, "learning_rate": 3.215349570702646e-07, "loss": -0.0175, "reward": 1.521311640739441, "reward_std": 0.19599059224128723, "rewards/accuracy_reward_stage2": 0.5681866407394409, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3873 }, { "completion_length": 9.109375, "epoch": 0.6788154897494305, "grad_norm": 17.94561290455226, "kl": 0.18359375, "learning_rate": 3.2135973366041703e-07, "loss": 0.0733, "reward": 1.521716594696045, "reward_std": 0.17642980813980103, "rewards/accuracy_reward_stage2": 0.5217165946960449, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3874 }, { "completion_length": 7.296875, "epoch": 0.6789907131592781, "grad_norm": 17.68863830864549, "kl": 0.07666015625, "learning_rate": 3.2118451025056947e-07, "loss": 0.0307, "reward": 1.7461333274841309, "reward_std": 0.14099617302417755, "rewards/accuracy_reward_stage2": 0.7461333870887756, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3875 }, { "completion_length": 6.859375, "epoch": 0.6791659365691256, "grad_norm": 20.19153973213064, "kl": 0.1689453125, "learning_rate": 3.210092868407219e-07, "loss": 0.0069, "reward": 1.3125, "reward_std": 0.2845909297466278, "rewards/accuracy_reward_stage2": 0.46875, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3876 }, { "completion_length": 8.828125, "epoch": 0.6793411599789732, "grad_norm": 18.381875937743548, "kl": 0.11328125, "learning_rate": 3.2083406343087435e-07, "loss": 0.0453, "reward": 1.6288864612579346, "reward_std": 0.2961333096027374, "rewards/accuracy_reward_stage2": 0.6288865208625793, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3877 }, { "completion_length": 7.546875, "epoch": 0.6795163833888207, "grad_norm": 20.716311767333977, "kl": 0.2294921875, "learning_rate": 3.206588400210268e-07, "loss": 0.011, "reward": 1.4247196912765503, "reward_std": 0.23277902603149414, "rewards/accuracy_reward_stage2": 0.4559696614742279, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3878 }, { "completion_length": 10.1875, "epoch": 0.6796916067986684, "grad_norm": 16.013344269281035, "kl": 0.08447265625, "learning_rate": 3.2048361661117923e-07, "loss": -0.0045, "reward": 1.6661726236343384, "reward_std": 0.16137085855007172, "rewards/accuracy_reward_stage2": 0.6817976236343384, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3879 }, { "completion_length": 10.765625, "epoch": 0.6798668302085159, "grad_norm": 17.385526364163912, "kl": 0.09228515625, "learning_rate": 3.2030839320133167e-07, "loss": -0.0072, "reward": 1.4617502689361572, "reward_std": 0.17313295602798462, "rewards/accuracy_reward_stage2": 0.47737520933151245, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3880 }, { "completion_length": 11.078125, "epoch": 0.6800420536183635, "grad_norm": 20.93387541319987, "kl": 0.16015625, "learning_rate": 3.2013316979148416e-07, "loss": 0.0201, "reward": 1.5362987518310547, "reward_std": 0.2890404462814331, "rewards/accuracy_reward_stage2": 0.5519237518310547, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3881 }, { "completion_length": 7.625, "epoch": 0.680217277028211, "grad_norm": 29.31166403311481, "kl": 0.41796875, "learning_rate": 3.199579463816366e-07, "loss": 0.0073, "reward": 1.5820767879486084, "reward_std": 0.3647039234638214, "rewards/accuracy_reward_stage2": 0.7695767879486084, "rewards/format_reward_stage1_pointerpad": 0.8125, "scores/accuracy_reward_stage2": 0.8125, "step": 3882 }, { "completion_length": 14.9375, "epoch": 0.6803925004380585, "grad_norm": 21.82332493151507, "kl": 0.08642578125, "learning_rate": 3.1978272297178904e-07, "loss": -0.0042, "reward": 1.495707392692566, "reward_std": 0.266140878200531, "rewards/accuracy_reward_stage2": 0.6363324522972107, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3883 }, { "completion_length": 12.09375, "epoch": 0.6805677238479061, "grad_norm": 18.232970730025155, "kl": 0.2177734375, "learning_rate": 3.196074995619414e-07, "loss": 0.0871, "reward": 1.3191421031951904, "reward_std": 0.26958224177360535, "rewards/accuracy_reward_stage2": 0.44414204359054565, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3884 }, { "completion_length": 12.265625, "epoch": 0.6807429472577536, "grad_norm": 19.870742781236203, "kl": 0.19140625, "learning_rate": 3.194322761520939e-07, "loss": 0.0428, "reward": 1.4023401737213135, "reward_std": 0.19982674717903137, "rewards/accuracy_reward_stage2": 0.5273402333259583, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3885 }, { "completion_length": 9.0625, "epoch": 0.6809181706676012, "grad_norm": 16.40488727080269, "kl": 0.283203125, "learning_rate": 3.1925705274224636e-07, "loss": 0.0197, "reward": 1.160224199295044, "reward_std": 0.1855008602142334, "rewards/accuracy_reward_stage2": 0.4414742588996887, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3886 }, { "completion_length": 21.421875, "epoch": 0.6810933940774487, "grad_norm": 13.552039941001802, "kl": 0.10986328125, "learning_rate": 3.190818293323988e-07, "loss": -0.0005, "reward": 1.639201045036316, "reward_std": 0.21364232897758484, "rewards/accuracy_reward_stage2": 0.6704509854316711, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3887 }, { "completion_length": 12.140625, "epoch": 0.6812686174872963, "grad_norm": 19.831530405238684, "kl": 0.19921875, "learning_rate": 3.1890660592255124e-07, "loss": -0.0619, "reward": 1.6471765041351318, "reward_std": 0.25535136461257935, "rewards/accuracy_reward_stage2": 0.7096765637397766, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3888 }, { "completion_length": 17.640625, "epoch": 0.6814438408971438, "grad_norm": 24.320226089228928, "kl": 0.0306396484375, "learning_rate": 3.1873138251270373e-07, "loss": 0.0122, "reward": 1.5085279941558838, "reward_std": 0.2112589329481125, "rewards/accuracy_reward_stage2": 0.5085281133651733, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3889 }, { "completion_length": 15.625, "epoch": 0.6816190643069914, "grad_norm": 17.509304200429426, "kl": 0.0673828125, "learning_rate": 3.185561591028561e-07, "loss": 0.027, "reward": 1.5866674184799194, "reward_std": 0.13666321337223053, "rewards/accuracy_reward_stage2": 0.5866674184799194, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3890 }, { "completion_length": 8.0, "epoch": 0.6817942877168389, "grad_norm": 17.129707485149034, "kl": 0.076171875, "learning_rate": 3.1838093569300856e-07, "loss": 0.0303, "reward": 1.4643492698669434, "reward_std": 0.195224791765213, "rewards/accuracy_reward_stage2": 0.5893491506576538, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3891 }, { "completion_length": 10.25, "epoch": 0.6819695111266866, "grad_norm": 19.25229769183077, "kl": 0.27734375, "learning_rate": 3.18205712283161e-07, "loss": 0.0266, "reward": 1.320204496383667, "reward_std": 0.3462868928909302, "rewards/accuracy_reward_stage2": 0.6170794367790222, "rewards/format_reward_stage1_pointerpad": 0.703125, "scores/accuracy_reward_stage2": 0.703125, "step": 3892 }, { "completion_length": 9.109375, "epoch": 0.6821447345365341, "grad_norm": 18.117867087783885, "kl": 0.10400390625, "learning_rate": 3.180304888733135e-07, "loss": -0.0014, "reward": 1.7592573165893555, "reward_std": 0.22831569612026215, "rewards/accuracy_reward_stage2": 0.7748823165893555, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3893 }, { "completion_length": 9.328125, "epoch": 0.6823199579463817, "grad_norm": 19.771147903621497, "kl": 0.050537109375, "learning_rate": 3.1785526546346593e-07, "loss": 0.0202, "reward": 1.2551724910736084, "reward_std": 0.30035412311553955, "rewards/accuracy_reward_stage2": 0.3801724910736084, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3894 }, { "completion_length": 12.71875, "epoch": 0.6824951813562292, "grad_norm": 20.155780290967037, "kl": 0.123046875, "learning_rate": 3.1768004205361837e-07, "loss": 0.005, "reward": 1.3723958730697632, "reward_std": 0.3768148124217987, "rewards/accuracy_reward_stage2": 0.3880208730697632, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3895 }, { "completion_length": 7.296875, "epoch": 0.6826704047660768, "grad_norm": 20.313263656604374, "kl": 0.30859375, "learning_rate": 3.1750481864377075e-07, "loss": 0.0369, "reward": 1.384493350982666, "reward_std": 0.3127593398094177, "rewards/accuracy_reward_stage2": 0.5563682317733765, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3896 }, { "completion_length": 12.890625, "epoch": 0.6828456281759243, "grad_norm": 23.104763723478158, "kl": 0.125, "learning_rate": 3.1732959523392325e-07, "loss": 0.0501, "reward": 1.526780128479004, "reward_std": 0.22299879789352417, "rewards/accuracy_reward_stage2": 0.5267801284790039, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3897 }, { "completion_length": 11.625, "epoch": 0.6830208515857719, "grad_norm": 20.52548099394388, "kl": 0.2490234375, "learning_rate": 3.171543718240757e-07, "loss": -0.0513, "reward": 1.517575979232788, "reward_std": 0.3787464499473572, "rewards/accuracy_reward_stage2": 0.5957009792327881, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3898 }, { "completion_length": 7.1875, "epoch": 0.6831960749956194, "grad_norm": 13.175094925157342, "kl": 0.162109375, "learning_rate": 3.169791484142281e-07, "loss": 0.0209, "reward": 1.5124504566192627, "reward_std": 0.14380380511283875, "rewards/accuracy_reward_stage2": 0.5280753970146179, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3899 }, { "completion_length": 7.1875, "epoch": 0.683371298405467, "grad_norm": 16.721112796537913, "kl": 0.055908203125, "learning_rate": 3.1680392500438056e-07, "loss": 0.0223, "reward": 1.6026625633239746, "reward_std": 0.1928408145904541, "rewards/accuracy_reward_stage2": 0.6026625037193298, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3900 }, { "completion_length": 10.59375, "epoch": 0.6835465218153145, "grad_norm": 19.18941387849459, "kl": 0.0947265625, "learning_rate": 3.1662870159453306e-07, "loss": 0.0, "reward": 1.66532564163208, "reward_std": 0.23487676680088043, "rewards/accuracy_reward_stage2": 0.6809506416320801, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3901 }, { "completion_length": 10.296875, "epoch": 0.683721745225162, "grad_norm": 18.117473878505333, "kl": 0.2197265625, "learning_rate": 3.164534781846855e-07, "loss": -0.0005, "reward": 1.5684740543365479, "reward_std": 0.21021094918251038, "rewards/accuracy_reward_stage2": 0.8497240543365479, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3902 }, { "completion_length": 20.421875, "epoch": 0.6838969686350096, "grad_norm": 19.232278453453848, "kl": 0.048583984375, "learning_rate": 3.162782547748379e-07, "loss": 0.0194, "reward": 1.5420706272125244, "reward_std": 0.17480739951133728, "rewards/accuracy_reward_stage2": 0.5420706868171692, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3903 }, { "completion_length": 8.421875, "epoch": 0.6840721920448571, "grad_norm": 16.92202065403525, "kl": 0.2177734375, "learning_rate": 3.161030313649903e-07, "loss": 0.0555, "reward": 1.4832316637039185, "reward_std": 0.22372540831565857, "rewards/accuracy_reward_stage2": 0.6238567233085632, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3904 }, { "completion_length": 8.328125, "epoch": 0.6842474154547048, "grad_norm": 20.351729004974555, "kl": 0.12109375, "learning_rate": 3.1592780795514276e-07, "loss": -0.033, "reward": 1.6287168264389038, "reward_std": 0.20220136642456055, "rewards/accuracy_reward_stage2": 0.6755918860435486, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3905 }, { "completion_length": 12.875, "epoch": 0.6844226388645523, "grad_norm": 17.18872343506988, "kl": 0.043701171875, "learning_rate": 3.1575258454529526e-07, "loss": 0.0175, "reward": 1.587280511856079, "reward_std": 0.20979759097099304, "rewards/accuracy_reward_stage2": 0.5872805714607239, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3906 }, { "completion_length": 9.28125, "epoch": 0.6845978622743999, "grad_norm": 15.510287644855367, "kl": 0.06494140625, "learning_rate": 3.155773611354477e-07, "loss": 0.026, "reward": 1.2447917461395264, "reward_std": 0.19727420806884766, "rewards/accuracy_reward_stage2": 0.4947916567325592, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3907 }, { "completion_length": 11.8125, "epoch": 0.6847730856842474, "grad_norm": 10.721890124991495, "kl": 0.08935546875, "learning_rate": 3.1540213772560013e-07, "loss": -0.0078, "reward": 1.529199481010437, "reward_std": 0.1329018622636795, "rewards/accuracy_reward_stage2": 0.544824481010437, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3908 }, { "completion_length": 10.625, "epoch": 0.684948309094095, "grad_norm": 16.564565836577522, "kl": 0.10400390625, "learning_rate": 3.152269143157525e-07, "loss": 0.0249, "reward": 1.642519474029541, "reward_std": 0.14972180128097534, "rewards/accuracy_reward_stage2": 0.658144474029541, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3909 }, { "completion_length": 7.265625, "epoch": 0.6851235325039425, "grad_norm": 23.065037716747465, "kl": 0.1845703125, "learning_rate": 3.15051690905905e-07, "loss": 0.0301, "reward": 1.551328420639038, "reward_std": 0.3457961678504944, "rewards/accuracy_reward_stage2": 0.5669534206390381, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3910 }, { "completion_length": 8.703125, "epoch": 0.6852987559137901, "grad_norm": 17.467165082127643, "kl": 0.0986328125, "learning_rate": 3.1487646749605745e-07, "loss": -0.0019, "reward": 1.4705981016159058, "reward_std": 0.24642279744148254, "rewards/accuracy_reward_stage2": 0.6112231016159058, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3911 }, { "completion_length": 9.65625, "epoch": 0.6854739793236376, "grad_norm": 23.083623670273536, "kl": 0.1455078125, "learning_rate": 3.147012440862099e-07, "loss": 0.0583, "reward": 1.770646095275879, "reward_std": 0.15404048562049866, "rewards/accuracy_reward_stage2": 0.7706459760665894, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3912 }, { "completion_length": 13.796875, "epoch": 0.6856492027334852, "grad_norm": 18.169266900451227, "kl": 0.08935546875, "learning_rate": 3.1452602067636233e-07, "loss": -0.0083, "reward": 1.6190228462219238, "reward_std": 0.23725715279579163, "rewards/accuracy_reward_stage2": 0.6346479058265686, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3913 }, { "completion_length": 23.46875, "epoch": 0.6858244261433327, "grad_norm": 17.97592684350171, "kl": 0.07666015625, "learning_rate": 3.143507972665148e-07, "loss": 0.0306, "reward": 1.5842111110687256, "reward_std": 0.13952378928661346, "rewards/accuracy_reward_stage2": 0.584210991859436, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3914 }, { "completion_length": 10.15625, "epoch": 0.6859996495531803, "grad_norm": 16.534376820236705, "kl": 0.0732421875, "learning_rate": 3.1417557385666727e-07, "loss": 0.0293, "reward": 1.6091794967651367, "reward_std": 0.19951403141021729, "rewards/accuracy_reward_stage2": 0.6091794371604919, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3915 }, { "completion_length": 12.546875, "epoch": 0.6861748729630278, "grad_norm": 11.26361851638084, "kl": 0.0703125, "learning_rate": 3.1400035044681965e-07, "loss": -0.0161, "reward": 1.7864649295806885, "reward_std": 0.1284366399049759, "rewards/accuracy_reward_stage2": 0.8020899891853333, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3916 }, { "completion_length": 10.4375, "epoch": 0.6863500963728754, "grad_norm": 25.026176432044366, "kl": 0.416015625, "learning_rate": 3.138251270369721e-07, "loss": 0.0294, "reward": 1.6315479278564453, "reward_std": 0.36400657892227173, "rewards/accuracy_reward_stage2": 0.6940478086471558, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3917 }, { "completion_length": 25.125, "epoch": 0.686525319782723, "grad_norm": 12.195645963577283, "kl": 0.21484375, "learning_rate": 3.136499036271246e-07, "loss": -0.0283, "reward": 1.365476131439209, "reward_std": 0.2613718509674072, "rewards/accuracy_reward_stage2": 0.5373511910438538, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 3918 }, { "completion_length": 11.78125, "epoch": 0.6867005431925706, "grad_norm": 23.591685414442438, "kl": 0.283203125, "learning_rate": 3.13474680217277e-07, "loss": -0.0139, "reward": 1.604864478111267, "reward_std": 0.33831846714019775, "rewards/accuracy_reward_stage2": 0.6673645377159119, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3919 }, { "completion_length": 8.765625, "epoch": 0.6868757666024181, "grad_norm": 12.687983316888051, "kl": 0.06494140625, "learning_rate": 3.1329945680742946e-07, "loss": 0.026, "reward": 1.7347902059555054, "reward_std": 0.18083685636520386, "rewards/accuracy_reward_stage2": 0.7347902059555054, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3920 }, { "completion_length": 12.515625, "epoch": 0.6870509900122657, "grad_norm": 35.26355604712914, "kl": 0.2373046875, "learning_rate": 3.131242333975819e-07, "loss": 0.0171, "reward": 1.3750255107879639, "reward_std": 0.27525320649147034, "rewards/accuracy_reward_stage2": 0.5312755703926086, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3921 }, { "completion_length": 9.046875, "epoch": 0.6872262134221132, "grad_norm": 17.562680111645903, "kl": 0.1357421875, "learning_rate": 3.1294900998773434e-07, "loss": 0.0169, "reward": 1.6735283136367798, "reward_std": 0.17841938138008118, "rewards/accuracy_reward_stage2": 0.6891533732414246, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3922 }, { "completion_length": 8.78125, "epoch": 0.6874014368319608, "grad_norm": 11.789574838636355, "kl": 0.080078125, "learning_rate": 3.127737865778868e-07, "loss": -0.0122, "reward": 1.745296597480774, "reward_std": 0.09620551019906998, "rewards/accuracy_reward_stage2": 0.7609216570854187, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3923 }, { "completion_length": 10.5, "epoch": 0.6875766602418083, "grad_norm": 17.03907545161025, "kl": 0.11865234375, "learning_rate": 3.125985631680392e-07, "loss": 0.02, "reward": 1.565126657485962, "reward_std": 0.22917722165584564, "rewards/accuracy_reward_stage2": 0.5807517170906067, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3924 }, { "completion_length": 9.1875, "epoch": 0.6877518836516558, "grad_norm": 19.054671720664587, "kl": 0.2275390625, "learning_rate": 3.1242333975819166e-07, "loss": 0.0302, "reward": 1.586341142654419, "reward_std": 0.25396132469177246, "rewards/accuracy_reward_stage2": 0.617591142654419, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3925 }, { "completion_length": 5.671875, "epoch": 0.6879271070615034, "grad_norm": 21.235964599476986, "kl": 0.08349609375, "learning_rate": 3.1224811634834415e-07, "loss": 0.0333, "reward": 1.5080132484436035, "reward_std": 0.18352213501930237, "rewards/accuracy_reward_stage2": 0.5080131888389587, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3926 }, { "completion_length": 9.21875, "epoch": 0.6881023304713509, "grad_norm": 19.730500202294046, "kl": 0.0791015625, "learning_rate": 3.120728929384966e-07, "loss": 0.0317, "reward": 1.629475712776184, "reward_std": 0.20445698499679565, "rewards/accuracy_reward_stage2": 0.6294757127761841, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3927 }, { "completion_length": 6.9375, "epoch": 0.6882775538811985, "grad_norm": 18.653319443582067, "kl": 0.302734375, "learning_rate": 3.11897669528649e-07, "loss": 0.0434, "reward": 1.310603141784668, "reward_std": 0.3170985281467438, "rewards/accuracy_reward_stage2": 0.5918530821800232, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 3928 }, { "completion_length": 10.609375, "epoch": 0.688452777291046, "grad_norm": 20.47469033235731, "kl": 0.1328125, "learning_rate": 3.117224461188014e-07, "loss": 0.0533, "reward": 1.6717946529388428, "reward_std": 0.16436317563056946, "rewards/accuracy_reward_stage2": 0.6717947721481323, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3929 }, { "completion_length": 20.15625, "epoch": 0.6886280007008937, "grad_norm": 19.376279526164858, "kl": 0.2216796875, "learning_rate": 3.115472227089539e-07, "loss": 0.0228, "reward": 1.332094430923462, "reward_std": 0.16052697598934174, "rewards/accuracy_reward_stage2": 0.48834434151649475, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3930 }, { "completion_length": 7.5625, "epoch": 0.6888032241107412, "grad_norm": 19.79598180371833, "kl": 0.271484375, "learning_rate": 3.1137199929910635e-07, "loss": 0.0384, "reward": 1.54587721824646, "reward_std": 0.3018754720687866, "rewards/accuracy_reward_stage2": 0.7021272778511047, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3931 }, { "completion_length": 16.921875, "epoch": 0.6889784475205888, "grad_norm": 13.835350571440522, "kl": 0.1494140625, "learning_rate": 3.111967758892588e-07, "loss": 0.016, "reward": 1.5475966930389404, "reward_std": 0.15770044922828674, "rewards/accuracy_reward_stage2": 0.8132216930389404, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 3932 }, { "completion_length": 8.734375, "epoch": 0.6891536709304363, "grad_norm": 20.491047232136104, "kl": 0.185546875, "learning_rate": 3.1102155247941123e-07, "loss": 0.074, "reward": 1.6471889019012451, "reward_std": 0.18342958390712738, "rewards/accuracy_reward_stage2": 0.7721887826919556, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3933 }, { "completion_length": 14.265625, "epoch": 0.6893288943402839, "grad_norm": 14.746120338904305, "kl": 0.171875, "learning_rate": 3.108463290695637e-07, "loss": -0.0042, "reward": 1.4174625873565674, "reward_std": 0.30715838074684143, "rewards/accuracy_reward_stage2": 0.5737125873565674, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3934 }, { "completion_length": 13.953125, "epoch": 0.6895041177501314, "grad_norm": 15.878902923650905, "kl": 0.1328125, "learning_rate": 3.106711056597161e-07, "loss": -0.0028, "reward": 1.6495393514633179, "reward_std": 0.2503609359264374, "rewards/accuracy_reward_stage2": 0.6807893514633179, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3935 }, { "completion_length": 10.796875, "epoch": 0.689679341159979, "grad_norm": 18.115861963025267, "kl": 0.09814453125, "learning_rate": 3.1049588224986855e-07, "loss": 0.0394, "reward": 1.5655012130737305, "reward_std": 0.3471730351448059, "rewards/accuracy_reward_stage2": 0.5655011534690857, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3936 }, { "completion_length": 7.40625, "epoch": 0.6898545645698265, "grad_norm": 17.574990338644515, "kl": 0.181640625, "learning_rate": 3.10320658840021e-07, "loss": -0.0157, "reward": 1.5906562805175781, "reward_std": 0.34753066301345825, "rewards/accuracy_reward_stage2": 0.6219062209129333, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3937 }, { "completion_length": 7.0, "epoch": 0.6900297879796741, "grad_norm": 19.48735881575915, "kl": 0.1064453125, "learning_rate": 3.101454354301735e-07, "loss": -0.0014, "reward": 1.5208333730697632, "reward_std": 0.24168574810028076, "rewards/accuracy_reward_stage2": 0.5364583730697632, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3938 }, { "completion_length": 10.65625, "epoch": 0.6902050113895216, "grad_norm": 17.762705769766715, "kl": 0.055908203125, "learning_rate": 3.099702120203259e-07, "loss": 0.0223, "reward": 1.5739235877990723, "reward_std": 0.20269808173179626, "rewards/accuracy_reward_stage2": 0.5739235877990723, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3939 }, { "completion_length": 10.6875, "epoch": 0.6903802347993692, "grad_norm": 20.91996892422379, "kl": 0.25390625, "learning_rate": 3.0979498861047836e-07, "loss": 0.0024, "reward": 1.3960648775100708, "reward_std": 0.30436623096466064, "rewards/accuracy_reward_stage2": 0.4429398775100708, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3940 }, { "completion_length": 12.375, "epoch": 0.6905554582092167, "grad_norm": 26.349980975779463, "kl": 0.08740234375, "learning_rate": 3.0961976520063075e-07, "loss": -0.0004, "reward": 1.816171646118164, "reward_std": 0.23692914843559265, "rewards/accuracy_reward_stage2": 0.8317966461181641, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3941 }, { "completion_length": 9.34375, "epoch": 0.6907306816190643, "grad_norm": 19.432604856174976, "kl": 0.06787109375, "learning_rate": 3.0944454179078324e-07, "loss": 0.0271, "reward": 1.5397982597351074, "reward_std": 0.2259722501039505, "rewards/accuracy_reward_stage2": 0.5397982001304626, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3942 }, { "completion_length": 12.109375, "epoch": 0.6909059050289119, "grad_norm": 22.676757328029613, "kl": 0.1728515625, "learning_rate": 3.092693183809357e-07, "loss": 0.0284, "reward": 1.4862951040267944, "reward_std": 0.28705430030822754, "rewards/accuracy_reward_stage2": 0.6269201040267944, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3943 }, { "completion_length": 28.703125, "epoch": 0.6910811284387595, "grad_norm": 14.315674866282164, "kl": 0.09130859375, "learning_rate": 3.090940949710881e-07, "loss": -0.0077, "reward": 1.6648551225662231, "reward_std": 0.13941612839698792, "rewards/accuracy_reward_stage2": 0.6804801225662231, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3944 }, { "completion_length": 11.734375, "epoch": 0.691256351848607, "grad_norm": 18.54880608586595, "kl": 0.1591796875, "learning_rate": 3.0891887156124056e-07, "loss": 0.0196, "reward": 1.5700504779815674, "reward_std": 0.18866734206676483, "rewards/accuracy_reward_stage2": 0.5856754779815674, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3945 }, { "completion_length": 10.40625, "epoch": 0.6914315752584546, "grad_norm": 20.708798753313303, "kl": 0.255859375, "learning_rate": 3.0874364815139305e-07, "loss": -0.045, "reward": 1.4974335432052612, "reward_std": 0.3221808075904846, "rewards/accuracy_reward_stage2": 0.5599335432052612, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 3946 }, { "completion_length": 11.921875, "epoch": 0.6916067986683021, "grad_norm": 17.309036654287304, "kl": 0.11328125, "learning_rate": 3.0856842474154544e-07, "loss": 0.0454, "reward": 1.5039632320404053, "reward_std": 0.2883744239807129, "rewards/accuracy_reward_stage2": 0.6289632320404053, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3947 }, { "completion_length": 7.96875, "epoch": 0.6917820220781496, "grad_norm": 20.163204338819533, "kl": 0.185546875, "learning_rate": 3.083932013316979e-07, "loss": 0.0742, "reward": 1.5174355506896973, "reward_std": 0.1934971660375595, "rewards/accuracy_reward_stage2": 0.6424355506896973, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3948 }, { "completion_length": 8.484375, "epoch": 0.6919572454879972, "grad_norm": 23.202422932170183, "kl": 0.1689453125, "learning_rate": 3.082179779218503e-07, "loss": -0.0156, "reward": 1.4502408504486084, "reward_std": 0.2823672890663147, "rewards/accuracy_reward_stage2": 0.4971158802509308, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3949 }, { "completion_length": 9.375, "epoch": 0.6921324688978447, "grad_norm": 18.667771850263488, "kl": 0.2158203125, "learning_rate": 3.080427545120028e-07, "loss": -0.0125, "reward": 1.7800071239471436, "reward_std": 0.2526581585407257, "rewards/accuracy_reward_stage2": 0.8268821835517883, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3950 }, { "completion_length": 12.234375, "epoch": 0.6923076923076923, "grad_norm": 21.12215208307982, "kl": 0.1025390625, "learning_rate": 3.0786753110215525e-07, "loss": 0.0409, "reward": 1.3765630722045898, "reward_std": 0.2770848870277405, "rewards/accuracy_reward_stage2": 0.37656310200691223, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3951 }, { "completion_length": 6.953125, "epoch": 0.6924829157175398, "grad_norm": 21.502918541623767, "kl": 0.1767578125, "learning_rate": 3.076923076923077e-07, "loss": 0.0267, "reward": 1.59840989112854, "reward_std": 0.3457435965538025, "rewards/accuracy_reward_stage2": 0.6140349507331848, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3952 }, { "completion_length": 6.59375, "epoch": 0.6926581391273874, "grad_norm": 20.384638028256372, "kl": 0.134765625, "learning_rate": 3.0751708428246013e-07, "loss": 0.0121, "reward": 1.7731083631515503, "reward_std": 0.2765722870826721, "rewards/accuracy_reward_stage2": 0.8043583631515503, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3953 }, { "completion_length": 6.421875, "epoch": 0.6928333625372349, "grad_norm": 12.88207861312547, "kl": 0.07373046875, "learning_rate": 3.0734186087261257e-07, "loss": -0.0027, "reward": 1.7528049945831299, "reward_std": 0.12491665780544281, "rewards/accuracy_reward_stage2": 0.7684298753738403, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3954 }, { "completion_length": 9.921875, "epoch": 0.6930085859470825, "grad_norm": 18.404849601417556, "kl": 0.1630859375, "learning_rate": 3.07166637462765e-07, "loss": 0.008, "reward": 1.631026029586792, "reward_std": 0.31937503814697266, "rewards/accuracy_reward_stage2": 0.662276029586792, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3955 }, { "completion_length": 11.984375, "epoch": 0.6931838093569301, "grad_norm": 18.07342096506507, "kl": 0.166015625, "learning_rate": 3.0699141405291745e-07, "loss": 0.0023, "reward": 1.658907413482666, "reward_std": 0.27549779415130615, "rewards/accuracy_reward_stage2": 0.6901572942733765, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3956 }, { "completion_length": 8.78125, "epoch": 0.6933590327667777, "grad_norm": 15.03471222250481, "kl": 0.1240234375, "learning_rate": 3.068161906430699e-07, "loss": -0.035, "reward": 1.6224809885025024, "reward_std": 0.209599107503891, "rewards/accuracy_reward_stage2": 0.6537309885025024, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3957 }, { "completion_length": 14.84375, "epoch": 0.6935342561766252, "grad_norm": 17.05975382067552, "kl": 0.03662109375, "learning_rate": 3.066409672332224e-07, "loss": -0.0187, "reward": 1.6875, "reward_std": 0.213067427277565, "rewards/accuracy_reward_stage2": 0.703125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3958 }, { "completion_length": 9.53125, "epoch": 0.6937094795864728, "grad_norm": 19.50720645745515, "kl": 0.07275390625, "learning_rate": 3.064657438233748e-07, "loss": 0.0291, "reward": 1.7259900569915771, "reward_std": 0.3334110379219055, "rewards/accuracy_reward_stage2": 0.7259901165962219, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3959 }, { "completion_length": 12.0625, "epoch": 0.6938847029963203, "grad_norm": 15.904502289401634, "kl": 0.09521484375, "learning_rate": 3.062905204135272e-07, "loss": -0.0039, "reward": 1.599704623222351, "reward_std": 0.1589922308921814, "rewards/accuracy_reward_stage2": 0.6153296232223511, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3960 }, { "completion_length": 9.203125, "epoch": 0.6940599264061679, "grad_norm": 21.180187694098507, "kl": 0.2412109375, "learning_rate": 3.0611529700367965e-07, "loss": 0.0627, "reward": 1.4935078620910645, "reward_std": 0.22299349308013916, "rewards/accuracy_reward_stage2": 0.6341328620910645, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3961 }, { "completion_length": 7.71875, "epoch": 0.6942351498160154, "grad_norm": 13.024524663270084, "kl": 0.1484375, "learning_rate": 3.0594007359383214e-07, "loss": -0.0521, "reward": 1.5260417461395264, "reward_std": 0.2162405401468277, "rewards/accuracy_reward_stage2": 0.5729166865348816, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3962 }, { "completion_length": 6.5, "epoch": 0.694410373225863, "grad_norm": 12.515771796910617, "kl": 0.1103515625, "learning_rate": 3.057648501839846e-07, "loss": -0.0185, "reward": 1.783489465713501, "reward_std": 0.09697789698839188, "rewards/accuracy_reward_stage2": 0.8147395849227905, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3963 }, { "completion_length": 9.34375, "epoch": 0.6945855966357105, "grad_norm": 16.101834277694934, "kl": 0.1943359375, "learning_rate": 3.05589626774137e-07, "loss": -0.0135, "reward": 1.6511366367340088, "reward_std": 0.21041421592235565, "rewards/accuracy_reward_stage2": 0.698011577129364, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3964 }, { "completion_length": 11.703125, "epoch": 0.6947608200455581, "grad_norm": 20.639096960569315, "kl": 0.1552734375, "learning_rate": 3.0541440336428946e-07, "loss": 0.0404, "reward": 1.6338541507720947, "reward_std": 0.2505960464477539, "rewards/accuracy_reward_stage2": 0.7588541507720947, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3965 }, { "completion_length": 7.921875, "epoch": 0.6949360434554056, "grad_norm": 25.032954546058857, "kl": 0.2490234375, "learning_rate": 3.052391799544419e-07, "loss": -0.037, "reward": 1.5356470346450806, "reward_std": 0.3018898367881775, "rewards/accuracy_reward_stage2": 0.6137720346450806, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3966 }, { "completion_length": 9.125, "epoch": 0.6951112668652532, "grad_norm": 29.25886863913636, "kl": 0.08154296875, "learning_rate": 3.0506395654459434e-07, "loss": 0.0327, "reward": 1.6015243530273438, "reward_std": 0.2491273283958435, "rewards/accuracy_reward_stage2": 0.6015242338180542, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3967 }, { "completion_length": 7.265625, "epoch": 0.6952864902751007, "grad_norm": 28.75762040222275, "kl": 0.138671875, "learning_rate": 3.048887331347468e-07, "loss": -0.0252, "reward": 1.357391595840454, "reward_std": 0.20943915843963623, "rewards/accuracy_reward_stage2": 0.6542665362358093, "rewards/format_reward_stage1_pointerpad": 0.703125, "scores/accuracy_reward_stage2": 0.703125, "step": 3968 }, { "completion_length": 11.0, "epoch": 0.6954617136849484, "grad_norm": 17.793314482250434, "kl": 0.056396484375, "learning_rate": 3.047135097248992e-07, "loss": 0.0226, "reward": 1.714925765991211, "reward_std": 0.18759344518184662, "rewards/accuracy_reward_stage2": 0.7149257063865662, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3969 }, { "completion_length": 11.890625, "epoch": 0.6956369370947959, "grad_norm": 10.884958540757461, "kl": 0.162109375, "learning_rate": 3.045382863150517e-07, "loss": 0.0361, "reward": 1.59375, "reward_std": 0.1872510462999344, "rewards/accuracy_reward_stage2": 0.734375, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3970 }, { "completion_length": 10.609375, "epoch": 0.6958121605046435, "grad_norm": 19.301987640237314, "kl": 0.1025390625, "learning_rate": 3.0436306290520415e-07, "loss": 0.0115, "reward": 1.6933300495147705, "reward_std": 0.17335617542266846, "rewards/accuracy_reward_stage2": 0.7089550495147705, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3971 }, { "completion_length": 9.375, "epoch": 0.695987383914491, "grad_norm": 17.800448563142357, "kl": 0.1787109375, "learning_rate": 3.041878394953566e-07, "loss": -0.0171, "reward": 1.5350027084350586, "reward_std": 0.30478453636169434, "rewards/accuracy_reward_stage2": 0.5818777084350586, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3972 }, { "completion_length": 8.171875, "epoch": 0.6961626073243385, "grad_norm": 17.08748803743485, "kl": 0.1298828125, "learning_rate": 3.04012616085509e-07, "loss": 0.0278, "reward": 1.4267973899841309, "reward_std": 0.2559486925601959, "rewards/accuracy_reward_stage2": 0.5674223899841309, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3973 }, { "completion_length": 10.5625, "epoch": 0.6963378307341861, "grad_norm": 17.388386742225734, "kl": 0.287109375, "learning_rate": 3.038373926756614e-07, "loss": 0.1151, "reward": 1.4651490449905396, "reward_std": 0.23490217328071594, "rewards/accuracy_reward_stage2": 0.7151491641998291, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 3974 }, { "completion_length": 11.765625, "epoch": 0.6965130541440336, "grad_norm": 14.604925955241491, "kl": 0.027587890625, "learning_rate": 3.036621692658139e-07, "loss": 0.011, "reward": 1.4230644702911377, "reward_std": 0.17332643270492554, "rewards/accuracy_reward_stage2": 0.4230644702911377, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3975 }, { "completion_length": 8.34375, "epoch": 0.6966882775538812, "grad_norm": 19.10335138910953, "kl": 0.06884765625, "learning_rate": 3.0348694585596635e-07, "loss": -0.0135, "reward": 1.392409324645996, "reward_std": 0.24867044389247894, "rewards/accuracy_reward_stage2": 0.40803423523902893, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3976 }, { "completion_length": 8.59375, "epoch": 0.6968635009637287, "grad_norm": 13.149312735503395, "kl": 0.09423828125, "learning_rate": 3.033117224461188e-07, "loss": -0.0378, "reward": 1.6302083730697632, "reward_std": 0.24144160747528076, "rewards/accuracy_reward_stage2": 0.6614583730697632, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3977 }, { "completion_length": 10.9375, "epoch": 0.6970387243735763, "grad_norm": 15.296151278621773, "kl": 0.1728515625, "learning_rate": 3.031364990362713e-07, "loss": -0.0421, "reward": 1.8066771030426025, "reward_std": 0.2580149173736572, "rewards/accuracy_reward_stage2": 0.8535521030426025, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3978 }, { "completion_length": 8.265625, "epoch": 0.6972139477834238, "grad_norm": 14.792175216235439, "kl": 0.130859375, "learning_rate": 3.0296127562642367e-07, "loss": 0.0522, "reward": 1.4834437370300293, "reward_std": 0.093760646879673, "rewards/accuracy_reward_stage2": 0.6084437966346741, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3979 }, { "completion_length": 9.546875, "epoch": 0.6973891711932714, "grad_norm": 15.189126749746107, "kl": 0.09326171875, "learning_rate": 3.027860522165761e-07, "loss": -0.0068, "reward": 1.5334163904190063, "reward_std": 0.1791066825389862, "rewards/accuracy_reward_stage2": 0.5490414500236511, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3980 }, { "completion_length": 11.203125, "epoch": 0.697564394603119, "grad_norm": 16.051310339535767, "kl": 0.16796875, "learning_rate": 3.0261082880672855e-07, "loss": -0.0004, "reward": 1.7741138935089111, "reward_std": 0.11385900527238846, "rewards/accuracy_reward_stage2": 0.8209889531135559, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3981 }, { "completion_length": 10.859375, "epoch": 0.6977396180129666, "grad_norm": 18.411013346263637, "kl": 0.259765625, "learning_rate": 3.02435605396881e-07, "loss": -0.0625, "reward": 1.678377389907837, "reward_std": 0.2788037061691284, "rewards/accuracy_reward_stage2": 0.7565024495124817, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3982 }, { "completion_length": 14.15625, "epoch": 0.6979148414228141, "grad_norm": 18.396572064637514, "kl": 0.06298828125, "learning_rate": 3.022603819870335e-07, "loss": -0.0166, "reward": 1.6920890808105469, "reward_std": 0.2676393389701843, "rewards/accuracy_reward_stage2": 0.7077139616012573, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3983 }, { "completion_length": 9.015625, "epoch": 0.6980900648326617, "grad_norm": 15.892165582034131, "kl": 0.07373046875, "learning_rate": 3.020851585771859e-07, "loss": 0.0296, "reward": 1.5303912162780762, "reward_std": 0.1921355128288269, "rewards/accuracy_reward_stage2": 0.5303913354873657, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3984 }, { "completion_length": 10.0625, "epoch": 0.6982652882425092, "grad_norm": 22.291782886833108, "kl": 0.1728515625, "learning_rate": 3.0190993516733836e-07, "loss": 0.0251, "reward": 1.3862011432647705, "reward_std": 0.3098811209201813, "rewards/accuracy_reward_stage2": 0.5268262028694153, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3985 }, { "completion_length": 9.640625, "epoch": 0.6984405116523568, "grad_norm": 16.805373534048258, "kl": 0.08984375, "learning_rate": 3.0173471175749074e-07, "loss": 0.036, "reward": 1.4264421463012695, "reward_std": 0.14173433184623718, "rewards/accuracy_reward_stage2": 0.4264422655105591, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3986 }, { "completion_length": 11.671875, "epoch": 0.6986157350622043, "grad_norm": 30.59496168350246, "kl": 0.2109375, "learning_rate": 3.0155948834764324e-07, "loss": -0.0151, "reward": 1.6473720073699951, "reward_std": 0.28604885935783386, "rewards/accuracy_reward_stage2": 0.6942470669746399, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 3987 }, { "completion_length": 7.6875, "epoch": 0.6987909584720519, "grad_norm": 20.906417103919768, "kl": 0.09619140625, "learning_rate": 3.013842649377957e-07, "loss": 0.0108, "reward": 1.443253517150879, "reward_std": 0.20594848692417145, "rewards/accuracy_reward_stage2": 0.45887845754623413, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3988 }, { "completion_length": 32.0625, "epoch": 0.6989661818818994, "grad_norm": 19.969257114079817, "kl": 0.0380859375, "learning_rate": 3.012090415279481e-07, "loss": 0.0152, "reward": 1.5706324577331543, "reward_std": 0.09524843841791153, "rewards/accuracy_reward_stage2": 0.5706325769424438, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3989 }, { "completion_length": 13.515625, "epoch": 0.699141405291747, "grad_norm": 16.937478051824815, "kl": 0.0791015625, "learning_rate": 3.0103381811810056e-07, "loss": -0.0126, "reward": 1.5109663009643555, "reward_std": 0.1787455677986145, "rewards/accuracy_reward_stage2": 0.5265913605690002, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3990 }, { "completion_length": 11.8125, "epoch": 0.6993166287015945, "grad_norm": 17.999730551940655, "kl": 0.10302734375, "learning_rate": 3.0085859470825305e-07, "loss": -0.0447, "reward": 1.6739494800567627, "reward_std": 0.30125704407691956, "rewards/accuracy_reward_stage2": 0.8301993608474731, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3991 }, { "completion_length": 8.78125, "epoch": 0.699491852111442, "grad_norm": 20.101459352683804, "kl": 0.251953125, "learning_rate": 3.0068337129840543e-07, "loss": 0.0563, "reward": 1.437328577041626, "reward_std": 0.2033752202987671, "rewards/accuracy_reward_stage2": 0.577953577041626, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3992 }, { "completion_length": 8.015625, "epoch": 0.6996670755212896, "grad_norm": 16.72200093482387, "kl": 0.296875, "learning_rate": 3.005081478885579e-07, "loss": -0.0461, "reward": 1.5274182558059692, "reward_std": 0.2606680989265442, "rewards/accuracy_reward_stage2": 0.6055432558059692, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 3993 }, { "completion_length": 8.140625, "epoch": 0.6998422989311373, "grad_norm": 15.525598036631733, "kl": 0.09423828125, "learning_rate": 3.003329244787103e-07, "loss": 0.0168, "reward": 1.5954310894012451, "reward_std": 0.21913869678974152, "rewards/accuracy_reward_stage2": 0.6110560297966003, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3994 }, { "completion_length": 13.515625, "epoch": 0.7000175223409848, "grad_norm": 24.49869160495708, "kl": 0.1826171875, "learning_rate": 3.001577010688628e-07, "loss": -0.007, "reward": 1.475322961807251, "reward_std": 0.29399049282073975, "rewards/accuracy_reward_stage2": 0.5065730214118958, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 3995 }, { "completion_length": 8.765625, "epoch": 0.7001927457508323, "grad_norm": 21.277096836973257, "kl": 0.16015625, "learning_rate": 2.9998247765901525e-07, "loss": 0.0458, "reward": 1.388684630393982, "reward_std": 0.29958435893058777, "rewards/accuracy_reward_stage2": 0.5136846303939819, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 3996 }, { "completion_length": 11.875, "epoch": 0.7003679691606799, "grad_norm": 17.36410670374809, "kl": 0.171875, "learning_rate": 2.998072542491677e-07, "loss": 0.0297, "reward": 1.2950433492660522, "reward_std": 0.21796312928199768, "rewards/accuracy_reward_stage2": 0.43566828966140747, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 3997 }, { "completion_length": 10.671875, "epoch": 0.7005431925705274, "grad_norm": 19.323163806534783, "kl": 0.21875, "learning_rate": 2.9963203083932007e-07, "loss": 0.0144, "reward": 1.4738521575927734, "reward_std": 0.3393661379814148, "rewards/accuracy_reward_stage2": 0.6301021575927734, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 3998 }, { "completion_length": 8.890625, "epoch": 0.700718415980375, "grad_norm": 20.288072612266532, "kl": 0.11328125, "learning_rate": 2.9945680742947257e-07, "loss": 0.0009, "reward": 1.7728909254074097, "reward_std": 0.1919422149658203, "rewards/accuracy_reward_stage2": 0.7885159254074097, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 3999 }, { "completion_length": 10.140625, "epoch": 0.7008936393902225, "grad_norm": 19.459808071686343, "kl": 0.09619140625, "learning_rate": 2.99281584019625e-07, "loss": -0.0037, "reward": 1.4524281024932861, "reward_std": 0.24864047765731812, "rewards/accuracy_reward_stage2": 0.46805307269096375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 4000 } ], "logging_steps": 1.0, "max_steps": 5707, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }