diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7008936393902225, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 5.71875, + "epoch": 0.00017522340984755565, + "grad_norm": 23.31555964997353, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 1.4438997507095337, + "reward_std": 0.22532765567302704, + "rewards/accuracy_reward_stage2": 0.4438997805118561, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1 + }, + { + "completion_length": 7.21875, + "epoch": 0.0003504468196951113, + "grad_norm": 22.065290449218974, + "kl": -8.940696716308594e-06, + "learning_rate": 9.998247765901524e-07, + "loss": -0.0, + "reward": 1.4763569831848145, + "reward_std": 0.2327914535999298, + "rewards/accuracy_reward_stage2": 0.47635695338249207, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2 + }, + { + "completion_length": 11.1875, + "epoch": 0.0005256702295426669, + "grad_norm": 26.456861704069485, + "kl": 0.0001163482666015625, + "learning_rate": 9.99649553180305e-07, + "loss": 0.0, + "reward": 1.4022423028945923, + "reward_std": 0.2888947129249573, + "rewards/accuracy_reward_stage2": 0.4022422432899475, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3 + }, + { + "completion_length": 17.140625, + "epoch": 0.0007008936393902226, + "grad_norm": 23.624755117010494, + "kl": 0.00738525390625, + "learning_rate": 9.994743297704572e-07, + "loss": 0.0029, + "reward": 1.3010417222976685, + "reward_std": 0.14182603359222412, + "rewards/accuracy_reward_stage2": 0.5510416626930237, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 4 + }, + { + "completion_length": 14.78125, + "epoch": 0.0008761170492377782, + "grad_norm": 22.987286296145836, + "kl": 0.00022792816162109375, + "learning_rate": 9.992991063606097e-07, + "loss": -0.088, + "reward": 1.281743049621582, + "reward_std": 0.25567078590393066, + "rewards/accuracy_reward_stage2": 0.3129930794239044, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 5 + }, + { + "completion_length": 9.703125, + "epoch": 0.0010513404590853338, + "grad_norm": 33.07972699991074, + "kl": 1.3470649719238281e-05, + "learning_rate": 9.991238829507622e-07, + "loss": 0.0, + "reward": 1.5713826417922974, + "reward_std": 0.37181052565574646, + "rewards/accuracy_reward_stage2": 0.5713826417922974, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 6 + }, + { + "completion_length": 10.71875, + "epoch": 0.0012265638689328894, + "grad_norm": 27.974285828488036, + "kl": 7.2479248046875e-05, + "learning_rate": 9.989486595409147e-07, + "loss": 0.0, + "reward": 1.3608198165893555, + "reward_std": 0.2502235174179077, + "rewards/accuracy_reward_stage2": 0.360819935798645, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 7 + }, + { + "completion_length": 8.515625, + "epoch": 0.0014017872787804452, + "grad_norm": 33.13357399660305, + "kl": 0.103515625, + "learning_rate": 9.98773436131067e-07, + "loss": 0.0521, + "reward": 1.3532755374908447, + "reward_std": 0.29816100001335144, + "rewards/accuracy_reward_stage2": 0.4782755672931671, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 8 + }, + { + "completion_length": 8.359375, + "epoch": 0.0015770106886280008, + "grad_norm": 24.905794912365085, + "kl": -9.1552734375e-05, + "learning_rate": 9.985982127212195e-07, + "loss": -0.0, + "reward": 1.4549081325531006, + "reward_std": 0.328605592250824, + "rewards/accuracy_reward_stage2": 0.4549080431461334, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 9 + }, + { + "completion_length": 12.96875, + "epoch": 0.0017522340984755564, + "grad_norm": 67.46304155608689, + "kl": 0.03955078125, + "learning_rate": 9.98422989311372e-07, + "loss": 0.0158, + "reward": 1.197622537612915, + "reward_std": 0.2530099153518677, + "rewards/accuracy_reward_stage2": 0.32262250781059265, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 10 + }, + { + "completion_length": 12.046875, + "epoch": 0.001927457508323112, + "grad_norm": 20.500299418605792, + "kl": 0.000965118408203125, + "learning_rate": 9.982477659015245e-07, + "loss": -0.0418, + "reward": 1.432761311531067, + "reward_std": 0.2917559742927551, + "rewards/accuracy_reward_stage2": 0.4483863115310669, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 11 + }, + { + "completion_length": 14.453125, + "epoch": 0.0021026809181706675, + "grad_norm": 167876.85932408215, + "kl": 70.5, + "learning_rate": 9.980725424916767e-07, + "loss": 28.0895, + "reward": 1.2883098125457764, + "reward_std": 0.1935170292854309, + "rewards/accuracy_reward_stage2": 0.5383098125457764, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 12 + }, + { + "completion_length": 9.359375, + "epoch": 0.002277904328018223, + "grad_norm": 21.528023020071775, + "kl": 0.00066375732421875, + "learning_rate": 9.978973190818292e-07, + "loss": 0.0003, + "reward": 1.469854712486267, + "reward_std": 0.2446746528148651, + "rewards/accuracy_reward_stage2": 0.4698547124862671, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 13 + }, + { + "completion_length": 13.0, + "epoch": 0.0024531277378657787, + "grad_norm": 23.680931373549402, + "kl": 0.002105712890625, + "learning_rate": 9.977220956719817e-07, + "loss": 0.0008, + "reward": 1.2903798818588257, + "reward_std": 0.22621138393878937, + "rewards/accuracy_reward_stage2": 0.2903798520565033, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 14 + }, + { + "completion_length": 13.421875, + "epoch": 0.0026283511477133343, + "grad_norm": 20.564036006232385, + "kl": 0.00128936767578125, + "learning_rate": 9.975468722621342e-07, + "loss": 0.0005, + "reward": 1.6679387092590332, + "reward_std": 0.17764630913734436, + "rewards/accuracy_reward_stage2": 0.6679386496543884, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 15 + }, + { + "completion_length": 9.46875, + "epoch": 0.0028035745575608903, + "grad_norm": 26.089703097200466, + "kl": 0.00125885009765625, + "learning_rate": 9.973716488522867e-07, + "loss": 0.0005, + "reward": 1.4319759607315063, + "reward_std": 0.27263617515563965, + "rewards/accuracy_reward_stage2": 0.43197596073150635, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 16 + }, + { + "completion_length": 7.03125, + "epoch": 0.002978797967408446, + "grad_norm": 16.17891999936704, + "kl": 0.00089263916015625, + "learning_rate": 9.97196425442439e-07, + "loss": -0.0438, + "reward": 1.4551277160644531, + "reward_std": 0.12339088320732117, + "rewards/accuracy_reward_stage2": 0.5957527160644531, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 17 + }, + { + "completion_length": 7.875, + "epoch": 0.0031540213772560015, + "grad_norm": 24.111914676200755, + "kl": 0.00469970703125, + "learning_rate": 9.970212020325915e-07, + "loss": 0.0019, + "reward": 1.3679840564727783, + "reward_std": 0.30560600757598877, + "rewards/accuracy_reward_stage2": 0.4929840862751007, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 18 + }, + { + "completion_length": 14.4375, + "epoch": 0.003329244787103557, + "grad_norm": 20.91886024380496, + "kl": 0.002838134765625, + "learning_rate": 9.96845978622744e-07, + "loss": 0.0011, + "reward": 1.330362319946289, + "reward_std": 0.1731535941362381, + "rewards/accuracy_reward_stage2": 0.3303622603416443, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 19 + }, + { + "completion_length": 15.59375, + "epoch": 0.0035044681969511127, + "grad_norm": 42139.41750782583, + "kl": 428.0, + "learning_rate": 9.966707552128965e-07, + "loss": 171.9276, + "reward": 1.3618611097335815, + "reward_std": 0.15837247669696808, + "rewards/accuracy_reward_stage2": 0.48686110973358154, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 20 + }, + { + "completion_length": 6.1875, + "epoch": 0.0036796916067986683, + "grad_norm": 13.026390779558632, + "kl": 0.0001850128173828125, + "learning_rate": 9.964955318030487e-07, + "loss": 0.0001, + "reward": 1.7447917461395264, + "reward_std": 0.13045889139175415, + "rewards/accuracy_reward_stage2": 0.7447916865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 21 + }, + { + "completion_length": 7.0, + "epoch": 0.003854915016646224, + "grad_norm": 23.78421408951673, + "kl": 0.0029754638671875, + "learning_rate": 9.963203083932012e-07, + "loss": -0.0322, + "reward": 1.2731072902679443, + "reward_std": 0.18740758299827576, + "rewards/accuracy_reward_stage2": 0.28873229026794434, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 22 + }, + { + "completion_length": 11.46875, + "epoch": 0.0040301384264937795, + "grad_norm": 22.753424574491618, + "kl": 0.0037078857421875, + "learning_rate": 9.961450849833537e-07, + "loss": -0.0449, + "reward": 1.2642568349838257, + "reward_std": 0.35765278339385986, + "rewards/accuracy_reward_stage2": 0.29550686478614807, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 23 + }, + { + "completion_length": 8.703125, + "epoch": 0.004205361836341335, + "grad_norm": 24.427356484908216, + "kl": 0.0021514892578125, + "learning_rate": 9.959698615735062e-07, + "loss": 0.0009, + "reward": 1.3861404657363892, + "reward_std": 0.25101006031036377, + "rewards/accuracy_reward_stage2": 0.5111405253410339, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 24 + }, + { + "completion_length": 10.578125, + "epoch": 0.004380585246188891, + "grad_norm": 11105.321622372509, + "kl": 11.875, + "learning_rate": 9.957946381636585e-07, + "loss": 4.7301, + "reward": 1.416548490524292, + "reward_std": 0.22747981548309326, + "rewards/accuracy_reward_stage2": 0.5415483713150024, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 25 + }, + { + "completion_length": 8.78125, + "epoch": 0.004555808656036446, + "grad_norm": 25.76102538605346, + "kl": 0.004913330078125, + "learning_rate": 9.95619414753811e-07, + "loss": 0.0147, + "reward": 1.6482062339782715, + "reward_std": 0.191350519657135, + "rewards/accuracy_reward_stage2": 0.7732061743736267, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 26 + }, + { + "completion_length": 7.140625, + "epoch": 0.004731032065884002, + "grad_norm": 26.572344191821514, + "kl": 0.003387451171875, + "learning_rate": 9.954441913439635e-07, + "loss": 0.0014, + "reward": 1.5845115184783936, + "reward_std": 0.24853208661079407, + "rewards/accuracy_reward_stage2": 0.5845115780830383, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 27 + }, + { + "completion_length": 12.734375, + "epoch": 0.0049062554757315574, + "grad_norm": 24.260368773277218, + "kl": 0.00616455078125, + "learning_rate": 9.95268967934116e-07, + "loss": 0.0025, + "reward": 1.5300661325454712, + "reward_std": 0.3158077895641327, + "rewards/accuracy_reward_stage2": 0.5300660729408264, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 28 + }, + { + "completion_length": 8.765625, + "epoch": 0.005081478885579113, + "grad_norm": 15.78198731506964, + "kl": 0.002166748046875, + "learning_rate": 9.950937445242685e-07, + "loss": 0.0009, + "reward": 1.3041990995407104, + "reward_std": 0.12750419974327087, + "rewards/accuracy_reward_stage2": 0.42919909954071045, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 29 + }, + { + "completion_length": 11.03125, + "epoch": 0.005256702295426669, + "grad_norm": 20.63873452976797, + "kl": 0.003326416015625, + "learning_rate": 9.94918521114421e-07, + "loss": 0.0013, + "reward": 1.094714641571045, + "reward_std": 0.12758338451385498, + "rewards/accuracy_reward_stage2": 0.2197147160768509, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 30 + }, + { + "completion_length": 8.859375, + "epoch": 0.005431925705274224, + "grad_norm": 21.50035066177145, + "kl": 0.0181884765625, + "learning_rate": 9.947432977045732e-07, + "loss": -0.0217, + "reward": 1.540401816368103, + "reward_std": 0.23430338501930237, + "rewards/accuracy_reward_stage2": 0.6810267567634583, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 31 + }, + { + "completion_length": 12.390625, + "epoch": 0.005607149115121781, + "grad_norm": 29.83623754865519, + "kl": 0.0216064453125, + "learning_rate": 9.945680742947257e-07, + "loss": 0.0087, + "reward": 1.25832998752594, + "reward_std": 0.26882484555244446, + "rewards/accuracy_reward_stage2": 0.3833300471305847, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 32 + }, + { + "completion_length": 10.9375, + "epoch": 0.005782372524969336, + "grad_norm": 26.143293218614, + "kl": 0.0198974609375, + "learning_rate": 9.94392850884878e-07, + "loss": 0.008, + "reward": 1.2494080066680908, + "reward_std": 0.2645424008369446, + "rewards/accuracy_reward_stage2": 0.49940791726112366, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 33 + }, + { + "completion_length": 9.53125, + "epoch": 0.005957595934816892, + "grad_norm": 21.480867023451303, + "kl": 0.0059814453125, + "learning_rate": 9.942176274750305e-07, + "loss": 0.0024, + "reward": 1.648768663406372, + "reward_std": 0.16991084814071655, + "rewards/accuracy_reward_stage2": 0.6487685441970825, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 34 + }, + { + "completion_length": 9.6875, + "epoch": 0.0061328193446644474, + "grad_norm": 22.526380129092917, + "kl": 0.00390625, + "learning_rate": 9.94042404065183e-07, + "loss": 0.0016, + "reward": 1.520120620727539, + "reward_std": 0.2722627818584442, + "rewards/accuracy_reward_stage2": 0.5201205015182495, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 35 + }, + { + "completion_length": 12.875, + "epoch": 0.006308042754512003, + "grad_norm": 28.92262587524997, + "kl": 0.234375, + "learning_rate": 9.938671806553355e-07, + "loss": 0.0936, + "reward": 1.1792454719543457, + "reward_std": 0.11877614259719849, + "rewards/accuracy_reward_stage2": 0.3042455315589905, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 36 + }, + { + "completion_length": 5.96875, + "epoch": 0.006483266164359559, + "grad_norm": 21.198214257246605, + "kl": 0.00689697265625, + "learning_rate": 9.93691957245488e-07, + "loss": 0.0028, + "reward": 1.6057288646697998, + "reward_std": 0.12137105315923691, + "rewards/accuracy_reward_stage2": 0.730728805065155, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 37 + }, + { + "completion_length": 7.921875, + "epoch": 0.006658489574207114, + "grad_norm": 22.106502658208328, + "kl": 0.01348876953125, + "learning_rate": 9.935167338356405e-07, + "loss": 0.0054, + "reward": 1.5812971591949463, + "reward_std": 0.2364519238471985, + "rewards/accuracy_reward_stage2": 0.5812971591949463, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 38 + }, + { + "completion_length": 13.046875, + "epoch": 0.00683371298405467, + "grad_norm": 24.54112522332538, + "kl": 0.00958251953125, + "learning_rate": 9.933415104257928e-07, + "loss": 0.0038, + "reward": 1.3349295854568481, + "reward_std": 0.3648528456687927, + "rewards/accuracy_reward_stage2": 0.4599296450614929, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 39 + }, + { + "completion_length": 16.109375, + "epoch": 0.007008936393902225, + "grad_norm": 444.98631596943954, + "kl": 1.3125, + "learning_rate": 9.931662870159453e-07, + "loss": 0.5243, + "reward": 1.2739577293395996, + "reward_std": 0.1354614943265915, + "rewards/accuracy_reward_stage2": 0.5239576697349548, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 40 + }, + { + "completion_length": 15.046875, + "epoch": 0.007184159803749781, + "grad_norm": 24.708295810309572, + "kl": 0.0167236328125, + "learning_rate": 9.929910636060978e-07, + "loss": 0.0067, + "reward": 1.5153954029083252, + "reward_std": 0.20246349275112152, + "rewards/accuracy_reward_stage2": 0.5153952836990356, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 41 + }, + { + "completion_length": 11.671875, + "epoch": 0.007359383213597337, + "grad_norm": 19.119381990147836, + "kl": 0.007598876953125, + "learning_rate": 9.928158401962502e-07, + "loss": -0.0066, + "reward": 1.4843531847000122, + "reward_std": 0.14850273728370667, + "rewards/accuracy_reward_stage2": 0.6093531250953674, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 42 + }, + { + "completion_length": 12.296875, + "epoch": 0.007534606623444892, + "grad_norm": 126.73286516600807, + "kl": 0.59375, + "learning_rate": 9.926406167864027e-07, + "loss": 0.2368, + "reward": 1.2725048065185547, + "reward_std": 0.2956145703792572, + "rewards/accuracy_reward_stage2": 0.3975048065185547, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 43 + }, + { + "completion_length": 7.140625, + "epoch": 0.007709830033292448, + "grad_norm": 19.271630316643456, + "kl": 0.016357421875, + "learning_rate": 9.92465393376555e-07, + "loss": 0.0066, + "reward": 1.5359582901000977, + "reward_std": 0.23351669311523438, + "rewards/accuracy_reward_stage2": 0.5359582304954529, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 44 + }, + { + "completion_length": 11.546875, + "epoch": 0.007885053443140003, + "grad_norm": 32.85579727788969, + "kl": 0.455078125, + "learning_rate": 9.922901699667075e-07, + "loss": 0.1818, + "reward": 1.4999957084655762, + "reward_std": 0.28870946168899536, + "rewards/accuracy_reward_stage2": 0.6249956488609314, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 45 + }, + { + "completion_length": 6.046875, + "epoch": 0.008060276852987559, + "grad_norm": 24.0607272358331, + "kl": 0.033203125, + "learning_rate": 9.9211494655686e-07, + "loss": 0.0132, + "reward": 1.4540456533432007, + "reward_std": 0.35870805382728577, + "rewards/accuracy_reward_stage2": 0.4540456533432007, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 46 + }, + { + "completion_length": 8.8125, + "epoch": 0.008235500262835115, + "grad_norm": 20.58473435321408, + "kl": 0.0022125244140625, + "learning_rate": 9.919397231470123e-07, + "loss": 0.0009, + "reward": 1.4864552021026611, + "reward_std": 0.22649237513542175, + "rewards/accuracy_reward_stage2": 0.4864552319049835, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 47 + }, + { + "completion_length": 7.9375, + "epoch": 0.00841072367268267, + "grad_norm": 23.996696185252787, + "kl": 0.01416015625, + "learning_rate": 9.917644997371648e-07, + "loss": 0.0057, + "reward": 1.5774058103561401, + "reward_std": 0.2729129493236542, + "rewards/accuracy_reward_stage2": 0.5774057507514954, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 48 + }, + { + "completion_length": 13.75, + "epoch": 0.008585947082530226, + "grad_norm": 9198.783068890769, + "kl": 12.625, + "learning_rate": 9.915892763273173e-07, + "loss": 5.0487, + "reward": 1.5072916746139526, + "reward_std": 0.10488568246364594, + "rewards/accuracy_reward_stage2": 0.6322916746139526, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 49 + }, + { + "completion_length": 7.578125, + "epoch": 0.008761170492377781, + "grad_norm": 21.17089484476723, + "kl": 0.028076171875, + "learning_rate": 9.914140529174698e-07, + "loss": -0.033, + "reward": 1.561553716659546, + "reward_std": 0.20458553731441498, + "rewards/accuracy_reward_stage2": 0.5771787762641907, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 50 + }, + { + "completion_length": 14.546875, + "epoch": 0.008936393902225337, + "grad_norm": 22.676324556567717, + "kl": 0.0269775390625, + "learning_rate": 9.912388295076223e-07, + "loss": 0.0108, + "reward": 1.5551083087921143, + "reward_std": 0.24086907505989075, + "rewards/accuracy_reward_stage2": 0.5551083087921143, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 51 + }, + { + "completion_length": 19.734375, + "epoch": 0.009111617312072893, + "grad_norm": 24.090157582750408, + "kl": 0.0301513671875, + "learning_rate": 9.910636060977745e-07, + "loss": 0.012, + "reward": 1.5436656475067139, + "reward_std": 0.1551145315170288, + "rewards/accuracy_reward_stage2": 0.5436656475067139, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 52 + }, + { + "completion_length": 18.359375, + "epoch": 0.009286840721920448, + "grad_norm": 117.2815317858313, + "kl": 0.1787109375, + "learning_rate": 9.90888382687927e-07, + "loss": 0.0715, + "reward": 1.26711106300354, + "reward_std": 0.17756909132003784, + "rewards/accuracy_reward_stage2": 0.39211103320121765, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 53 + }, + { + "completion_length": 19.640625, + "epoch": 0.009462064131768004, + "grad_norm": 2670.243259955133, + "kl": 4.25, + "learning_rate": 9.907131592780795e-07, + "loss": 1.6969, + "reward": 1.3709712028503418, + "reward_std": 0.26806020736694336, + "rewards/accuracy_reward_stage2": 0.4959712028503418, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 54 + }, + { + "completion_length": 10.78125, + "epoch": 0.00963728754161556, + "grad_norm": 24.422978097429848, + "kl": 0.031494140625, + "learning_rate": 9.90537935868232e-07, + "loss": 0.0127, + "reward": 1.635439157485962, + "reward_std": 0.15717683732509613, + "rewards/accuracy_reward_stage2": 0.6354391574859619, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 55 + }, + { + "completion_length": 21.9375, + "epoch": 0.009812510951463115, + "grad_norm": 58.54729288417099, + "kl": 0.94140625, + "learning_rate": 9.903627124583845e-07, + "loss": 0.3767, + "reward": 1.2334133386611938, + "reward_std": 0.3513503968715668, + "rewards/accuracy_reward_stage2": 0.4834132790565491, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 56 + }, + { + "completion_length": 10.890625, + "epoch": 0.00998773436131067, + "grad_norm": 24.764202425478974, + "kl": 0.01953125, + "learning_rate": 9.901874890485368e-07, + "loss": 0.0078, + "reward": 1.542431116104126, + "reward_std": 0.18826258182525635, + "rewards/accuracy_reward_stage2": 0.5424311757087708, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 57 + }, + { + "completion_length": 11.078125, + "epoch": 0.010162957771158226, + "grad_norm": 23.433657092831172, + "kl": 0.06005859375, + "learning_rate": 9.900122656386893e-07, + "loss": 0.024, + "reward": 1.4939064979553223, + "reward_std": 0.20271292328834534, + "rewards/accuracy_reward_stage2": 0.4939064383506775, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 58 + }, + { + "completion_length": 9.875, + "epoch": 0.010338181181005782, + "grad_norm": 37.05527638038634, + "kl": 0.271484375, + "learning_rate": 9.898370422288418e-07, + "loss": 0.0885, + "reward": 1.395346999168396, + "reward_std": 0.2753984332084656, + "rewards/accuracy_reward_stage2": 0.520346999168396, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 59 + }, + { + "completion_length": 13.921875, + "epoch": 0.010513404590853337, + "grad_norm": 22.533338631681016, + "kl": 0.039794921875, + "learning_rate": 9.89661818818994e-07, + "loss": 0.0159, + "reward": 1.3350149393081665, + "reward_std": 0.18372395634651184, + "rewards/accuracy_reward_stage2": 0.3350149095058441, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 60 + }, + { + "completion_length": 7.359375, + "epoch": 0.010688628000700893, + "grad_norm": 21.392543766016747, + "kl": 0.0162353515625, + "learning_rate": 9.894865954091465e-07, + "loss": 0.0065, + "reward": 1.627392292022705, + "reward_std": 0.27495235204696655, + "rewards/accuracy_reward_stage2": 0.6273922324180603, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 61 + }, + { + "completion_length": 16.3125, + "epoch": 0.010863851410548448, + "grad_norm": 20.580479234767235, + "kl": 0.02294921875, + "learning_rate": 9.89311371999299e-07, + "loss": 0.0092, + "reward": 1.7086659669876099, + "reward_std": 0.26947683095932007, + "rewards/accuracy_reward_stage2": 0.7086660265922546, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 62 + }, + { + "completion_length": 23.265625, + "epoch": 0.011039074820396006, + "grad_norm": 21260.430375982574, + "kl": 25.625, + "learning_rate": 9.891361485894515e-07, + "loss": 10.2778, + "reward": 1.194105863571167, + "reward_std": 0.2070472538471222, + "rewards/accuracy_reward_stage2": 0.31910592317581177, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 63 + }, + { + "completion_length": 17.859375, + "epoch": 0.011214298230243561, + "grad_norm": 16.365434222292564, + "kl": 0.021728515625, + "learning_rate": 9.88960925179604e-07, + "loss": 0.0087, + "reward": 1.5770893096923828, + "reward_std": 0.06625860929489136, + "rewards/accuracy_reward_stage2": 0.5770893096923828, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 64 + }, + { + "completion_length": 12.21875, + "epoch": 0.011389521640091117, + "grad_norm": 21.895183855546207, + "kl": 0.1611328125, + "learning_rate": 9.887857017697563e-07, + "loss": 0.0645, + "reward": 1.4588305950164795, + "reward_std": 0.199259951710701, + "rewards/accuracy_reward_stage2": 0.5838305950164795, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 65 + }, + { + "completion_length": 8.359375, + "epoch": 0.011564745049938673, + "grad_norm": 914.427024535711, + "kl": 1.8125, + "learning_rate": 9.886104783599088e-07, + "loss": 0.7264, + "reward": 1.6872773170471191, + "reward_std": 0.19540375471115112, + "rewards/accuracy_reward_stage2": 0.8122772574424744, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 66 + }, + { + "completion_length": 11.1875, + "epoch": 0.011739968459786228, + "grad_norm": 57.98909231313472, + "kl": 0.1767578125, + "learning_rate": 9.884352549500613e-07, + "loss": 0.0707, + "reward": 0.883919894695282, + "reward_std": 0.12535862624645233, + "rewards/accuracy_reward_stage2": 0.1339198797941208, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 67 + }, + { + "completion_length": 14.5, + "epoch": 0.011915191869633784, + "grad_norm": 21.20461032961743, + "kl": 0.0277099609375, + "learning_rate": 9.882600315402138e-07, + "loss": 0.0111, + "reward": 1.3983817100524902, + "reward_std": 0.2320813536643982, + "rewards/accuracy_reward_stage2": 0.5233815908432007, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 68 + }, + { + "completion_length": 12.296875, + "epoch": 0.01209041527948134, + "grad_norm": 16.882678958865416, + "kl": 0.029296875, + "learning_rate": 9.880848081303663e-07, + "loss": 0.0117, + "reward": 1.76310396194458, + "reward_std": 0.1585851013660431, + "rewards/accuracy_reward_stage2": 0.7631039619445801, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 69 + }, + { + "completion_length": 14.6875, + "epoch": 0.012265638689328895, + "grad_norm": 486.1965247861804, + "kl": 1.2109375, + "learning_rate": 9.879095847205188e-07, + "loss": 0.4841, + "reward": 1.665351390838623, + "reward_std": 0.18253958225250244, + "rewards/accuracy_reward_stage2": 0.790351390838623, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 70 + }, + { + "completion_length": 11.265625, + "epoch": 0.01244086209917645, + "grad_norm": 21.42386015892597, + "kl": 0.050537109375, + "learning_rate": 9.87734361310671e-07, + "loss": 0.0202, + "reward": 1.214674472808838, + "reward_std": 0.15539926290512085, + "rewards/accuracy_reward_stage2": 0.3396745026111603, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 71 + }, + { + "completion_length": 6.109375, + "epoch": 0.012616085509024006, + "grad_norm": 23.54396927449601, + "kl": 0.026123046875, + "learning_rate": 9.875591379008235e-07, + "loss": 0.0105, + "reward": 1.7581977844238281, + "reward_std": 0.2139248102903366, + "rewards/accuracy_reward_stage2": 0.7581977248191833, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 72 + }, + { + "completion_length": 14.40625, + "epoch": 0.012791308918871562, + "grad_norm": 19.476130544371653, + "kl": 0.035400390625, + "learning_rate": 9.873839144909758e-07, + "loss": 0.0141, + "reward": 1.8426779508590698, + "reward_std": 0.16201923787593842, + "rewards/accuracy_reward_stage2": 0.842677891254425, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 73 + }, + { + "completion_length": 10.5625, + "epoch": 0.012966532328719117, + "grad_norm": 19.29894733530793, + "kl": 0.08251953125, + "learning_rate": 9.872086910811283e-07, + "loss": -0.0113, + "reward": 1.628914475440979, + "reward_std": 0.17053398489952087, + "rewards/accuracy_reward_stage2": 0.6445394158363342, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 74 + }, + { + "completion_length": 9.484375, + "epoch": 0.013141755738566673, + "grad_norm": 26.149212215656444, + "kl": 0.205078125, + "learning_rate": 9.870334676712808e-07, + "loss": 0.0817, + "reward": 1.3059927225112915, + "reward_std": 0.22208189964294434, + "rewards/accuracy_reward_stage2": 0.4309927225112915, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 75 + }, + { + "completion_length": 10.875, + "epoch": 0.013316979148414228, + "grad_norm": 26.023761326712144, + "kl": 0.10791015625, + "learning_rate": 9.868582442614333e-07, + "loss": 0.0432, + "reward": 1.583097219467163, + "reward_std": 0.2495778203010559, + "rewards/accuracy_reward_stage2": 0.5830972790718079, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 76 + }, + { + "completion_length": 9.453125, + "epoch": 0.013492202558261784, + "grad_norm": 18.353935777981253, + "kl": 0.01904296875, + "learning_rate": 9.866830208515858e-07, + "loss": 0.0076, + "reward": 1.3065390586853027, + "reward_std": 0.12740254402160645, + "rewards/accuracy_reward_stage2": 0.3065391182899475, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 77 + }, + { + "completion_length": 19.09375, + "epoch": 0.01366742596810934, + "grad_norm": 43.89053956356925, + "kl": 0.65625, + "learning_rate": 9.86507797441738e-07, + "loss": 0.22, + "reward": 1.2945109605789185, + "reward_std": 0.13152040541172028, + "rewards/accuracy_reward_stage2": 0.45076102018356323, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 78 + }, + { + "completion_length": 14.765625, + "epoch": 0.013842649377956895, + "grad_norm": 22.686536085830813, + "kl": 0.050537109375, + "learning_rate": 9.863325740318906e-07, + "loss": 0.0202, + "reward": 1.2739322185516357, + "reward_std": 0.22220373153686523, + "rewards/accuracy_reward_stage2": 0.27393215894699097, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 79 + }, + { + "completion_length": 6.734375, + "epoch": 0.01401787278780445, + "grad_norm": 18.610904618490743, + "kl": 0.028076171875, + "learning_rate": 9.86157350622043e-07, + "loss": 0.0113, + "reward": 1.7154624462127686, + "reward_std": 0.12328290939331055, + "rewards/accuracy_reward_stage2": 0.715462327003479, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 80 + }, + { + "completion_length": 13.640625, + "epoch": 0.014193096197652006, + "grad_norm": 21.58844190934058, + "kl": 0.0125732421875, + "learning_rate": 9.859821272121955e-07, + "loss": 0.005, + "reward": 1.775895118713379, + "reward_std": 0.1399209350347519, + "rewards/accuracy_reward_stage2": 0.7758949995040894, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 81 + }, + { + "completion_length": 19.65625, + "epoch": 0.014368319607499562, + "grad_norm": 24.391365722105085, + "kl": 0.013671875, + "learning_rate": 9.85806903802348e-07, + "loss": 0.0055, + "reward": 1.445000171661377, + "reward_std": 0.2040981948375702, + "rewards/accuracy_reward_stage2": 0.44500014185905457, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 82 + }, + { + "completion_length": 8.109375, + "epoch": 0.014543543017347118, + "grad_norm": 14.35169151732421, + "kl": 0.01123046875, + "learning_rate": 9.856316803925005e-07, + "loss": 0.0045, + "reward": 1.4154889583587646, + "reward_std": 0.17590636014938354, + "rewards/accuracy_reward_stage2": 0.41548892855644226, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 83 + }, + { + "completion_length": 8.9375, + "epoch": 0.014718766427194673, + "grad_norm": 23.391660041096344, + "kl": 0.10693359375, + "learning_rate": 9.854564569826528e-07, + "loss": 0.0428, + "reward": 1.4479758739471436, + "reward_std": 0.26807376742362976, + "rewards/accuracy_reward_stage2": 0.4479758143424988, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 84 + }, + { + "completion_length": 9.890625, + "epoch": 0.014893989837042229, + "grad_norm": 25.9165649133999, + "kl": 0.2138671875, + "learning_rate": 9.852812335728053e-07, + "loss": 0.0855, + "reward": 1.46875, + "reward_std": 0.2619796395301819, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 85 + }, + { + "completion_length": 16.375, + "epoch": 0.015069213246889784, + "grad_norm": 32.11956895366405, + "kl": 0.478515625, + "learning_rate": 9.851060101629576e-07, + "loss": 0.1917, + "reward": 1.3244003057479858, + "reward_std": 0.2183195948600769, + "rewards/accuracy_reward_stage2": 0.44940024614334106, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 86 + }, + { + "completion_length": 17.109375, + "epoch": 0.01524443665673734, + "grad_norm": 16.465646866338727, + "kl": 0.029052734375, + "learning_rate": 9.8493078675311e-07, + "loss": 0.0116, + "reward": 1.5186080932617188, + "reward_std": 0.11561406403779984, + "rewards/accuracy_reward_stage2": 0.5186082124710083, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 87 + }, + { + "completion_length": 16.484375, + "epoch": 0.015419660066584896, + "grad_norm": 26.79288856366434, + "kl": 0.0177001953125, + "learning_rate": 9.847555633432626e-07, + "loss": 0.0232, + "reward": 1.3938446044921875, + "reward_std": 0.3033265471458435, + "rewards/accuracy_reward_stage2": 0.5188446640968323, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 88 + }, + { + "completion_length": 9.96875, + "epoch": 0.015594883476432451, + "grad_norm": 23.16098970301936, + "kl": 0.18359375, + "learning_rate": 9.84580339933415e-07, + "loss": 0.0734, + "reward": 1.452669620513916, + "reward_std": 0.1748245358467102, + "rewards/accuracy_reward_stage2": 0.5776697397232056, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 89 + }, + { + "completion_length": 6.71875, + "epoch": 0.015770106886280007, + "grad_norm": 14.171409825663957, + "kl": 0.0218505859375, + "learning_rate": 9.844051165235676e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward_stage2": 0.5625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 90 + }, + { + "completion_length": 13.765625, + "epoch": 0.015945330296127564, + "grad_norm": 14.704475254365626, + "kl": 0.062255859375, + "learning_rate": 9.8422989311372e-07, + "loss": 0.0249, + "reward": 1.3633265495300293, + "reward_std": 0.11541568487882614, + "rewards/accuracy_reward_stage2": 0.4883265197277069, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 91 + }, + { + "completion_length": 8.125, + "epoch": 0.016120553705975118, + "grad_norm": 20.49408337271478, + "kl": 0.05029296875, + "learning_rate": 9.840546697038723e-07, + "loss": 0.0202, + "reward": 1.5179017782211304, + "reward_std": 0.07997994124889374, + "rewards/accuracy_reward_stage2": 0.5179017782211304, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 92 + }, + { + "completion_length": 24.875, + "epoch": 0.016295777115822675, + "grad_norm": 17.20056666951234, + "kl": 0.0191650390625, + "learning_rate": 9.838794462940248e-07, + "loss": 0.0076, + "reward": 1.5605134963989258, + "reward_std": 0.15738126635551453, + "rewards/accuracy_reward_stage2": 0.5605135560035706, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 93 + }, + { + "completion_length": 8.984375, + "epoch": 0.01647100052567023, + "grad_norm": 14.05047417845059, + "kl": 0.0242919921875, + "learning_rate": 9.837042228841773e-07, + "loss": 0.0098, + "reward": 1.765625, + "reward_std": 0.15992169082164764, + "rewards/accuracy_reward_stage2": 0.765625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 94 + }, + { + "completion_length": 13.453125, + "epoch": 0.016646223935517786, + "grad_norm": 16.066438711048843, + "kl": 0.00848388671875, + "learning_rate": 9.835289994743298e-07, + "loss": 0.0034, + "reward": 1.3854167461395264, + "reward_std": 0.2431686818599701, + "rewards/accuracy_reward_stage2": 0.5104166269302368, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 95 + }, + { + "completion_length": 13.9375, + "epoch": 0.01682144734536534, + "grad_norm": 20.498967796024022, + "kl": 0.0223388671875, + "learning_rate": 9.833537760644823e-07, + "loss": 0.009, + "reward": 1.32749605178833, + "reward_std": 0.24064147472381592, + "rewards/accuracy_reward_stage2": 0.3274959325790405, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 96 + }, + { + "completion_length": 19.640625, + "epoch": 0.016996670755212898, + "grad_norm": 23.351279022856694, + "kl": 0.048583984375, + "learning_rate": 9.831785526546346e-07, + "loss": 0.0194, + "reward": 1.341355562210083, + "reward_std": 0.17758896946907043, + "rewards/accuracy_reward_stage2": 0.3413556218147278, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 97 + }, + { + "completion_length": 11.0, + "epoch": 0.01717189416506045, + "grad_norm": 2424.142283949507, + "kl": 5.28125, + "learning_rate": 9.83003329244787e-07, + "loss": 2.1083, + "reward": 1.6076582670211792, + "reward_std": 0.2769812345504761, + "rewards/accuracy_reward_stage2": 0.732658326625824, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 98 + }, + { + "completion_length": 15.5, + "epoch": 0.01734711757490801, + "grad_norm": 18.888112369485217, + "kl": 0.0106201171875, + "learning_rate": 9.828281058349396e-07, + "loss": 0.0043, + "reward": 1.5364583730697632, + "reward_std": 0.13152071833610535, + "rewards/accuracy_reward_stage2": 0.6614583730697632, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 99 + }, + { + "completion_length": 9.484375, + "epoch": 0.017522340984755563, + "grad_norm": 18.63597928328685, + "kl": 0.0245361328125, + "learning_rate": 9.826528824250918e-07, + "loss": 0.0098, + "reward": 1.5691524744033813, + "reward_std": 0.17309194803237915, + "rewards/accuracy_reward_stage2": 0.5691524744033813, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 100 + }, + { + "completion_length": 8.25, + "epoch": 0.01769756439460312, + "grad_norm": 32.53836968752601, + "kl": 0.0244140625, + "learning_rate": 9.824776590152443e-07, + "loss": 0.0098, + "reward": 1.399068832397461, + "reward_std": 0.3313966393470764, + "rewards/accuracy_reward_stage2": 0.39906883239746094, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 101 + }, + { + "completion_length": 9.90625, + "epoch": 0.017872787804450674, + "grad_norm": 32.8854408605841, + "kl": 0.0155029296875, + "learning_rate": 9.823024356053968e-07, + "loss": 0.0062, + "reward": 1.3821427822113037, + "reward_std": 0.19832119345664978, + "rewards/accuracy_reward_stage2": 0.5071427226066589, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 102 + }, + { + "completion_length": 10.671875, + "epoch": 0.01804801121429823, + "grad_norm": 21.58148592368162, + "kl": 0.0218505859375, + "learning_rate": 9.821272121955493e-07, + "loss": 0.0087, + "reward": 1.7327183485031128, + "reward_std": 0.12665671110153198, + "rewards/accuracy_reward_stage2": 0.7327184081077576, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 103 + }, + { + "completion_length": 11.53125, + "epoch": 0.018223234624145785, + "grad_norm": 18.949315673363365, + "kl": 0.03955078125, + "learning_rate": 9.819519887857018e-07, + "loss": -0.0283, + "reward": 1.6874890327453613, + "reward_std": 0.1615470051765442, + "rewards/accuracy_reward_stage2": 0.7031140923500061, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 104 + }, + { + "completion_length": 11.828125, + "epoch": 0.018398458033993342, + "grad_norm": 26.76789168361859, + "kl": 0.1787109375, + "learning_rate": 9.81776765375854e-07, + "loss": 0.0716, + "reward": 1.625, + "reward_std": 0.12910360097885132, + "rewards/accuracy_reward_stage2": 0.75, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 105 + }, + { + "completion_length": 11.875, + "epoch": 0.018573681443840896, + "grad_norm": 23.148900056019155, + "kl": 0.0859375, + "learning_rate": 9.816015419660066e-07, + "loss": 0.0343, + "reward": 1.6884129047393799, + "reward_std": 0.2642272114753723, + "rewards/accuracy_reward_stage2": 0.6884129047393799, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 106 + }, + { + "completion_length": 7.0625, + "epoch": 0.018748904853688454, + "grad_norm": 26.976366492756323, + "kl": 0.056884765625, + "learning_rate": 9.81426318556159e-07, + "loss": 0.0228, + "reward": 1.615952968597412, + "reward_std": 0.3104846775531769, + "rewards/accuracy_reward_stage2": 0.6159528493881226, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 107 + }, + { + "completion_length": 8.953125, + "epoch": 0.018924128263536007, + "grad_norm": 16.921272039404386, + "kl": 0.0419921875, + "learning_rate": 9.812510951463116e-07, + "loss": 0.0168, + "reward": 1.621179461479187, + "reward_std": 0.14682598412036896, + "rewards/accuracy_reward_stage2": 0.621179461479187, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 108 + }, + { + "completion_length": 13.15625, + "epoch": 0.019099351673383565, + "grad_norm": 22.94264744441606, + "kl": 0.031494140625, + "learning_rate": 9.81075871736464e-07, + "loss": 0.0126, + "reward": 1.3379876613616943, + "reward_std": 0.23553958535194397, + "rewards/accuracy_reward_stage2": 0.46298760175704956, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 109 + }, + { + "completion_length": 7.78125, + "epoch": 0.01927457508323112, + "grad_norm": 18.640754349362595, + "kl": 0.03125, + "learning_rate": 9.809006483266164e-07, + "loss": 0.0126, + "reward": 1.6614583730697632, + "reward_std": 0.18261326849460602, + "rewards/accuracy_reward_stage2": 0.7864583730697632, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 110 + }, + { + "completion_length": 9.515625, + "epoch": 0.019449798493078676, + "grad_norm": 25.22440686754448, + "kl": 0.1572265625, + "learning_rate": 9.807254249167688e-07, + "loss": 0.0629, + "reward": 1.5890928506851196, + "reward_std": 0.3712921142578125, + "rewards/accuracy_reward_stage2": 0.7140928506851196, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 111 + }, + { + "completion_length": 14.3125, + "epoch": 0.01962502190292623, + "grad_norm": 18.83358143724199, + "kl": 0.038818359375, + "learning_rate": 9.805502015069213e-07, + "loss": 0.0156, + "reward": 1.8106896877288818, + "reward_std": 0.10225945711135864, + "rewards/accuracy_reward_stage2": 0.8106895685195923, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 112 + }, + { + "completion_length": 7.40625, + "epoch": 0.019800245312773787, + "grad_norm": 22.13546775107038, + "kl": 0.0272216796875, + "learning_rate": 9.803749780970736e-07, + "loss": 0.0109, + "reward": 1.6545759439468384, + "reward_std": 0.30723053216934204, + "rewards/accuracy_reward_stage2": 0.6545758843421936, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 113 + }, + { + "completion_length": 10.109375, + "epoch": 0.01997546872262134, + "grad_norm": 23.29532824323836, + "kl": 0.039306640625, + "learning_rate": 9.801997546872261e-07, + "loss": 0.0157, + "reward": 1.511056661605835, + "reward_std": 0.24191156029701233, + "rewards/accuracy_reward_stage2": 0.636056661605835, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 114 + }, + { + "completion_length": 14.5, + "epoch": 0.020150692132468898, + "grad_norm": 20.883116213967234, + "kl": 0.06396484375, + "learning_rate": 9.800245312773786e-07, + "loss": 0.0256, + "reward": 1.700423002243042, + "reward_std": 0.11160765588283539, + "rewards/accuracy_reward_stage2": 0.7004230618476868, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 115 + }, + { + "completion_length": 13.65625, + "epoch": 0.020325915542316452, + "grad_norm": 38.41318094469385, + "kl": 0.58984375, + "learning_rate": 9.79849307867531e-07, + "loss": 0.2199, + "reward": 1.4143095016479492, + "reward_std": 0.21462732553482056, + "rewards/accuracy_reward_stage2": 0.5549345016479492, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 116 + }, + { + "completion_length": 7.578125, + "epoch": 0.02050113895216401, + "grad_norm": 19.121183391928234, + "kl": 0.0118408203125, + "learning_rate": 9.796740844576836e-07, + "loss": 0.0047, + "reward": 1.3231756687164307, + "reward_std": 0.14878198504447937, + "rewards/accuracy_reward_stage2": 0.44817566871643066, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 117 + }, + { + "completion_length": 15.90625, + "epoch": 0.020676362362011563, + "grad_norm": 21.061263351136542, + "kl": 0.038818359375, + "learning_rate": 9.794988610478359e-07, + "loss": 0.0155, + "reward": 1.3059378862380981, + "reward_std": 0.14674827456474304, + "rewards/accuracy_reward_stage2": 0.4309379458427429, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 118 + }, + { + "completion_length": 9.71875, + "epoch": 0.02085158577185912, + "grad_norm": 25.236293976052632, + "kl": 0.055908203125, + "learning_rate": 9.793236376379884e-07, + "loss": 0.0223, + "reward": 1.4603816270828247, + "reward_std": 0.23183618485927582, + "rewards/accuracy_reward_stage2": 0.4603816270828247, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 119 + }, + { + "completion_length": 9.265625, + "epoch": 0.021026809181706674, + "grad_norm": 17.647031631462447, + "kl": 0.029541015625, + "learning_rate": 9.791484142281409e-07, + "loss": 0.0118, + "reward": 1.4820407629013062, + "reward_std": 0.18687711656093597, + "rewards/accuracy_reward_stage2": 0.48204079270362854, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 120 + }, + { + "completion_length": 12.640625, + "epoch": 0.021202032591554232, + "grad_norm": 17.193468877661324, + "kl": 0.08935546875, + "learning_rate": 9.789731908182933e-07, + "loss": 0.0357, + "reward": 1.6092438697814941, + "reward_std": 0.1579185426235199, + "rewards/accuracy_reward_stage2": 0.7342438697814941, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 121 + }, + { + "completion_length": 11.171875, + "epoch": 0.021377256001401786, + "grad_norm": 26.77965880367834, + "kl": 0.19140625, + "learning_rate": 9.787979674084458e-07, + "loss": 0.0222, + "reward": 1.4083753824234009, + "reward_std": 0.4149293303489685, + "rewards/accuracy_reward_stage2": 0.5646253824234009, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 122 + }, + { + "completion_length": 8.46875, + "epoch": 0.021552479411249343, + "grad_norm": 16.66820112097354, + "kl": 0.034912109375, + "learning_rate": 9.786227439985981e-07, + "loss": 0.014, + "reward": 1.321736216545105, + "reward_std": 0.18148699402809143, + "rewards/accuracy_reward_stage2": 0.321736216545105, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 123 + }, + { + "completion_length": 10.453125, + "epoch": 0.021727702821096897, + "grad_norm": 13.039644804969383, + "kl": 0.0274658203125, + "learning_rate": 9.784475205887506e-07, + "loss": 0.011, + "reward": 1.6614583730697632, + "reward_std": 0.1530819833278656, + "rewards/accuracy_reward_stage2": 0.6614583134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 124 + }, + { + "completion_length": 9.234375, + "epoch": 0.021902926230944454, + "grad_norm": 11.330738899206423, + "kl": 0.01251220703125, + "learning_rate": 9.78272297178903e-07, + "loss": -0.0239, + "reward": 1.53125, + "reward_std": 0.1246790662407875, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 125 + }, + { + "completion_length": 13.828125, + "epoch": 0.02207814964079201, + "grad_norm": 22.438552672241048, + "kl": 0.038818359375, + "learning_rate": 9.780970737690554e-07, + "loss": 0.0155, + "reward": 1.3687918186187744, + "reward_std": 0.25998741388320923, + "rewards/accuracy_reward_stage2": 0.368791788816452, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 126 + }, + { + "completion_length": 7.390625, + "epoch": 0.022253373050639565, + "grad_norm": 14.108403183062768, + "kl": 0.02294921875, + "learning_rate": 9.779218503592079e-07, + "loss": 0.0092, + "reward": 1.7554993629455566, + "reward_std": 0.12310698628425598, + "rewards/accuracy_reward_stage2": 0.7554993629455566, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 127 + }, + { + "completion_length": 11.9375, + "epoch": 0.022428596460487123, + "grad_norm": 32.650359349639814, + "kl": 0.03857421875, + "learning_rate": 9.777466269493604e-07, + "loss": 0.0154, + "reward": 1.693576455116272, + "reward_std": 0.2803168296813965, + "rewards/accuracy_reward_stage2": 0.6935763955116272, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 128 + }, + { + "completion_length": 11.3125, + "epoch": 0.022603819870334677, + "grad_norm": 21.3970061518697, + "kl": 0.12109375, + "learning_rate": 9.775714035395129e-07, + "loss": 0.0484, + "reward": 1.5294928550720215, + "reward_std": 0.22037862241268158, + "rewards/accuracy_reward_stage2": 0.5294929146766663, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 129 + }, + { + "completion_length": 9.1875, + "epoch": 0.022779043280182234, + "grad_norm": 17.20296674101713, + "kl": 0.0439453125, + "learning_rate": 9.773961801296654e-07, + "loss": 0.0176, + "reward": 1.671449065208435, + "reward_std": 0.15607139468193054, + "rewards/accuracy_reward_stage2": 0.6714490056037903, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 130 + }, + { + "completion_length": 11.109375, + "epoch": 0.022954266690029788, + "grad_norm": 23.924016993147553, + "kl": 0.16796875, + "learning_rate": 9.772209567198178e-07, + "loss": 0.067, + "reward": 1.515625, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 131 + }, + { + "completion_length": 8.109375, + "epoch": 0.023129490099877345, + "grad_norm": 17.871281939724312, + "kl": 0.060546875, + "learning_rate": 9.770457333099701e-07, + "loss": 0.0242, + "reward": 1.5422821044921875, + "reward_std": 0.2110249251127243, + "rewards/accuracy_reward_stage2": 0.5422821640968323, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 132 + }, + { + "completion_length": 8.984375, + "epoch": 0.0233047135097249, + "grad_norm": 16.237834293008472, + "kl": 0.05126953125, + "learning_rate": 9.768705099001226e-07, + "loss": 0.0205, + "reward": 1.4114583730697632, + "reward_std": 0.22298547625541687, + "rewards/accuracy_reward_stage2": 0.5364583134651184, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 133 + }, + { + "completion_length": 8.234375, + "epoch": 0.023479936919572456, + "grad_norm": 22.095308233281447, + "kl": 0.02685546875, + "learning_rate": 9.766952864902751e-07, + "loss": -0.031, + "reward": 1.8829572200775146, + "reward_std": 0.22935430705547333, + "rewards/accuracy_reward_stage2": 0.8985822200775146, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 134 + }, + { + "completion_length": 11.203125, + "epoch": 0.02365516032942001, + "grad_norm": 19.883543612210254, + "kl": 0.0498046875, + "learning_rate": 9.765200630804274e-07, + "loss": 0.0199, + "reward": 1.4533112049102783, + "reward_std": 0.24766197800636292, + "rewards/accuracy_reward_stage2": 0.45331108570098877, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 135 + }, + { + "completion_length": 12.734375, + "epoch": 0.023830383739267567, + "grad_norm": 22.33848788451206, + "kl": 0.26953125, + "learning_rate": 9.763448396705799e-07, + "loss": 0.0983, + "reward": 1.0532407760620117, + "reward_std": 0.28661733865737915, + "rewards/accuracy_reward_stage2": 0.31886574625968933, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 136 + }, + { + "completion_length": 10.921875, + "epoch": 0.02400560714911512, + "grad_norm": 22.501605896319216, + "kl": 0.1728515625, + "learning_rate": 9.761696162607324e-07, + "loss": 0.0402, + "reward": 1.0, + "reward_std": 0.22461533546447754, + "rewards/accuracy_reward_stage2": 0.265625, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 137 + }, + { + "completion_length": 11.03125, + "epoch": 0.02418083055896268, + "grad_norm": 21.43356694594981, + "kl": 0.042724609375, + "learning_rate": 9.759943928508849e-07, + "loss": 0.0171, + "reward": 1.4605212211608887, + "reward_std": 0.3050932288169861, + "rewards/accuracy_reward_stage2": 0.4605211615562439, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 138 + }, + { + "completion_length": 10.484375, + "epoch": 0.024356053968810232, + "grad_norm": 4150.486203164245, + "kl": 9.3125, + "learning_rate": 9.758191694410374e-07, + "loss": 3.6919, + "reward": 1.3730590343475342, + "reward_std": 0.19215653836727142, + "rewards/accuracy_reward_stage2": 0.5136840343475342, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 139 + }, + { + "completion_length": 10.25, + "epoch": 0.02453127737865779, + "grad_norm": 19.10514066890567, + "kl": 0.07421875, + "learning_rate": 9.756439460311896e-07, + "loss": 0.0297, + "reward": 1.4467504024505615, + "reward_std": 0.213166743516922, + "rewards/accuracy_reward_stage2": 0.4467504024505615, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 140 + }, + { + "completion_length": 12.265625, + "epoch": 0.024706500788505344, + "grad_norm": 27.796189272405012, + "kl": 0.06591796875, + "learning_rate": 9.754687226213421e-07, + "loss": 0.0264, + "reward": 1.2996835708618164, + "reward_std": 0.2103997766971588, + "rewards/accuracy_reward_stage2": 0.2996836304664612, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 141 + }, + { + "completion_length": 11.65625, + "epoch": 0.0248817241983529, + "grad_norm": 15.512147123884196, + "kl": 0.0206298828125, + "learning_rate": 9.752934992114946e-07, + "loss": -0.0359, + "reward": 1.52015221118927, + "reward_std": 0.09403587877750397, + "rewards/accuracy_reward_stage2": 0.5357772707939148, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 142 + }, + { + "completion_length": 10.8125, + "epoch": 0.025056947608200455, + "grad_norm": 14.26488216251468, + "kl": 0.050537109375, + "learning_rate": 9.751182758016471e-07, + "loss": 0.0202, + "reward": 1.4782986640930176, + "reward_std": 0.15773266553878784, + "rewards/accuracy_reward_stage2": 0.4782986044883728, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 143 + }, + { + "completion_length": 10.3125, + "epoch": 0.025232171018048012, + "grad_norm": 581.3548172640157, + "kl": 2.09375, + "learning_rate": 9.749430523917996e-07, + "loss": 0.833, + "reward": 1.47330904006958, + "reward_std": 0.2243586778640747, + "rewards/accuracy_reward_stage2": 0.5983090996742249, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 144 + }, + { + "completion_length": 6.640625, + "epoch": 0.025407394427895566, + "grad_norm": 16.632026818964334, + "kl": 0.013427734375, + "learning_rate": 9.74767828981952e-07, + "loss": 0.0054, + "reward": 1.6461806297302246, + "reward_std": 0.13736851513385773, + "rewards/accuracy_reward_stage2": 0.6461805701255798, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 145 + }, + { + "completion_length": 8.125, + "epoch": 0.025582617837743123, + "grad_norm": 20.712311357432764, + "kl": 0.051025390625, + "learning_rate": 9.745926055721044e-07, + "loss": 0.0204, + "reward": 1.4257996082305908, + "reward_std": 0.12755097448825836, + "rewards/accuracy_reward_stage2": 0.4257996082305908, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 146 + }, + { + "completion_length": 8.203125, + "epoch": 0.025757841247590677, + "grad_norm": 20.951278916894317, + "kl": 0.03125, + "learning_rate": 9.744173821622569e-07, + "loss": -0.0316, + "reward": 1.5287258625030518, + "reward_std": 0.2641984224319458, + "rewards/accuracy_reward_stage2": 0.6693509817123413, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 147 + }, + { + "completion_length": 11.953125, + "epoch": 0.025933064657438235, + "grad_norm": 8.761776615452003, + "kl": 0.0230712890625, + "learning_rate": 9.742421587524092e-07, + "loss": 0.0093, + "reward": 1.399897813796997, + "reward_std": 0.04742930084466934, + "rewards/accuracy_reward_stage2": 0.39989787340164185, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 148 + }, + { + "completion_length": 7.359375, + "epoch": 0.02610828806728579, + "grad_norm": 15.121517342131725, + "kl": 0.027099609375, + "learning_rate": 9.740669353425617e-07, + "loss": 0.0108, + "reward": 1.4454432725906372, + "reward_std": 0.04337773099541664, + "rewards/accuracy_reward_stage2": 0.4454432725906372, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 149 + }, + { + "completion_length": 15.90625, + "epoch": 0.026283511477133346, + "grad_norm": 24.934994595131617, + "kl": 0.3203125, + "learning_rate": 9.738917119327141e-07, + "loss": 0.0873, + "reward": 1.3222427368164062, + "reward_std": 0.29903650283813477, + "rewards/accuracy_reward_stage2": 0.4628676772117615, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 150 + }, + { + "completion_length": 13.6875, + "epoch": 0.0264587348869809, + "grad_norm": 20.45511798008456, + "kl": 0.1708984375, + "learning_rate": 9.737164885228666e-07, + "loss": 0.0684, + "reward": 1.582951545715332, + "reward_std": 0.07972659170627594, + "rewards/accuracy_reward_stage2": 0.7079516053199768, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 151 + }, + { + "completion_length": 17.828125, + "epoch": 0.026633958296828457, + "grad_norm": 22.159400129993305, + "kl": 0.046875, + "learning_rate": 9.735412651130191e-07, + "loss": 0.0188, + "reward": 1.5045561790466309, + "reward_std": 0.15542970597743988, + "rewards/accuracy_reward_stage2": 0.5045561790466309, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 152 + }, + { + "completion_length": 8.171875, + "epoch": 0.02680918170667601, + "grad_norm": 16.54412742369763, + "kl": 0.1396484375, + "learning_rate": 9.733660417031714e-07, + "loss": 0.0464, + "reward": 1.2883906364440918, + "reward_std": 0.06080695986747742, + "rewards/accuracy_reward_stage2": 0.5383907556533813, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 153 + }, + { + "completion_length": 10.625, + "epoch": 0.026984405116523568, + "grad_norm": 21.8872943160035, + "kl": 0.054931640625, + "learning_rate": 9.73190818293324e-07, + "loss": 0.0219, + "reward": 1.4988808631896973, + "reward_std": 0.23085248470306396, + "rewards/accuracy_reward_stage2": 0.4988808333873749, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 154 + }, + { + "completion_length": 17.765625, + "epoch": 0.027159628526371122, + "grad_norm": 20.512091387289935, + "kl": 0.03466796875, + "learning_rate": 9.730155948834764e-07, + "loss": 0.0139, + "reward": 1.479767084121704, + "reward_std": 0.23238566517829895, + "rewards/accuracy_reward_stage2": 0.6047670841217041, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 155 + }, + { + "completion_length": 8.515625, + "epoch": 0.02733485193621868, + "grad_norm": 16.26675377437708, + "kl": 0.01153564453125, + "learning_rate": 9.728403714736289e-07, + "loss": 0.0046, + "reward": 1.7636384963989258, + "reward_std": 0.174021378159523, + "rewards/accuracy_reward_stage2": 0.7636384963989258, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 156 + }, + { + "completion_length": 8.71875, + "epoch": 0.027510075346066233, + "grad_norm": 13.252046634278365, + "kl": 0.01556396484375, + "learning_rate": 9.726651480637814e-07, + "loss": 0.0062, + "reward": 1.5384259223937988, + "reward_std": 0.13512171804904938, + "rewards/accuracy_reward_stage2": 0.5384259223937988, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 157 + }, + { + "completion_length": 9.4375, + "epoch": 0.02768529875591379, + "grad_norm": 17.87652537935883, + "kl": 0.0284423828125, + "learning_rate": 9.724899246539337e-07, + "loss": 0.0114, + "reward": 1.6046476364135742, + "reward_std": 0.1472102403640747, + "rewards/accuracy_reward_stage2": 0.6046475172042847, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 158 + }, + { + "completion_length": 9.640625, + "epoch": 0.027860522165761344, + "grad_norm": 255.3729136272485, + "kl": 1.1171875, + "learning_rate": 9.723147012440862e-07, + "loss": 0.4472, + "reward": 1.3028572797775269, + "reward_std": 0.10992234945297241, + "rewards/accuracy_reward_stage2": 0.42785730957984924, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 159 + }, + { + "completion_length": 9.03125, + "epoch": 0.0280357455756089, + "grad_norm": 24.540647784148998, + "kl": 0.07958984375, + "learning_rate": 9.721394778342387e-07, + "loss": -0.0124, + "reward": 1.6068737506866455, + "reward_std": 0.2786521315574646, + "rewards/accuracy_reward_stage2": 0.6224986910820007, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 160 + }, + { + "completion_length": 9.828125, + "epoch": 0.028210968985456455, + "grad_norm": 31.622324802367434, + "kl": 0.1650390625, + "learning_rate": 9.71964254424391e-07, + "loss": 0.0659, + "reward": 1.5447970628738403, + "reward_std": 0.282809317111969, + "rewards/accuracy_reward_stage2": 0.5447970628738403, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 161 + }, + { + "completion_length": 5.875, + "epoch": 0.028386192395304013, + "grad_norm": 18.034072051457613, + "kl": 0.037841796875, + "learning_rate": 9.717890310145434e-07, + "loss": 0.0151, + "reward": 1.5959933996200562, + "reward_std": 0.17372407019138336, + "rewards/accuracy_reward_stage2": 0.7209933400154114, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 162 + }, + { + "completion_length": 14.6875, + "epoch": 0.028561415805151567, + "grad_norm": 25.02410090499007, + "kl": 0.030517578125, + "learning_rate": 9.71613807604696e-07, + "loss": 0.0122, + "reward": 1.4665104150772095, + "reward_std": 0.23067545890808105, + "rewards/accuracy_reward_stage2": 0.4665104150772095, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 163 + }, + { + "completion_length": 10.578125, + "epoch": 0.028736639214999124, + "grad_norm": 17.3468175548762, + "kl": 0.25, + "learning_rate": 9.714385841948484e-07, + "loss": 0.0995, + "reward": 1.3711848258972168, + "reward_std": 0.13964632153511047, + "rewards/accuracy_reward_stage2": 0.49618473649024963, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 164 + }, + { + "completion_length": 9.15625, + "epoch": 0.028911862624846678, + "grad_norm": 23.561247389295385, + "kl": 0.08740234375, + "learning_rate": 9.71263360785001e-07, + "loss": 0.035, + "reward": 1.550048589706421, + "reward_std": 0.23237371444702148, + "rewards/accuracy_reward_stage2": 0.5500486493110657, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 165 + }, + { + "completion_length": 16.734375, + "epoch": 0.029087086034694235, + "grad_norm": 19.429360315487067, + "kl": 0.020263671875, + "learning_rate": 9.710881373751532e-07, + "loss": 0.0081, + "reward": 1.6858422756195068, + "reward_std": 0.26720598340034485, + "rewards/accuracy_reward_stage2": 0.6858422756195068, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 166 + }, + { + "completion_length": 9.109375, + "epoch": 0.029262309444541793, + "grad_norm": 20.70039845969837, + "kl": 0.051025390625, + "learning_rate": 9.709129139653057e-07, + "loss": 0.0204, + "reward": 1.6000640392303467, + "reward_std": 0.34583085775375366, + "rewards/accuracy_reward_stage2": 0.6000640392303467, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 167 + }, + { + "completion_length": 12.03125, + "epoch": 0.029437532854389346, + "grad_norm": 15.176393141742965, + "kl": 0.0224609375, + "learning_rate": 9.707376905554582e-07, + "loss": 0.009, + "reward": 1.3509178161621094, + "reward_std": 0.0907922238111496, + "rewards/accuracy_reward_stage2": 0.4759177565574646, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 168 + }, + { + "completion_length": 7.6875, + "epoch": 0.029612756264236904, + "grad_norm": 20.90349632158209, + "kl": 0.0184326171875, + "learning_rate": 9.705624671456107e-07, + "loss": 0.0074, + "reward": 1.622603416442871, + "reward_std": 0.17725098133087158, + "rewards/accuracy_reward_stage2": 0.6226034760475159, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 169 + }, + { + "completion_length": 10.609375, + "epoch": 0.029787979674084458, + "grad_norm": 13.705177811281553, + "kl": 0.265625, + "learning_rate": 9.703872437357632e-07, + "loss": 0.1066, + "reward": 1.229966163635254, + "reward_std": 0.03953730687499046, + "rewards/accuracy_reward_stage2": 0.35496610403060913, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 170 + }, + { + "completion_length": 11.546875, + "epoch": 0.029963203083932015, + "grad_norm": 19.105963673665826, + "kl": 0.04443359375, + "learning_rate": 9.702120203259154e-07, + "loss": -0.0251, + "reward": 1.7006630897521973, + "reward_std": 0.10390988737344742, + "rewards/accuracy_reward_stage2": 0.7162880301475525, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 171 + }, + { + "completion_length": 7.484375, + "epoch": 0.03013842649377957, + "grad_norm": 23.78920063928513, + "kl": 0.0419921875, + "learning_rate": 9.70036796916068e-07, + "loss": -0.0274, + "reward": 1.514814853668213, + "reward_std": 0.35105100274086, + "rewards/accuracy_reward_stage2": 0.6554398536682129, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 172 + }, + { + "completion_length": 9.828125, + "epoch": 0.030313649903627126, + "grad_norm": 17.5747043233872, + "kl": 0.1630859375, + "learning_rate": 9.698615735062204e-07, + "loss": 0.0656, + "reward": 1.46875, + "reward_std": 0.2619796395301819, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 173 + }, + { + "completion_length": 11.453125, + "epoch": 0.03048887331347468, + "grad_norm": 18.708571980167633, + "kl": 0.033203125, + "learning_rate": 9.696863500963727e-07, + "loss": 0.0132, + "reward": 1.652700424194336, + "reward_std": 0.16118405759334564, + "rewards/accuracy_reward_stage2": 0.6527003049850464, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 174 + }, + { + "completion_length": 11.0, + "epoch": 0.030664096723322237, + "grad_norm": 23.207996060963765, + "kl": 0.0186767578125, + "learning_rate": 9.695111266865252e-07, + "loss": 0.0075, + "reward": 1.5543544292449951, + "reward_std": 0.21840667724609375, + "rewards/accuracy_reward_stage2": 0.5543544292449951, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 175 + }, + { + "completion_length": 6.0625, + "epoch": 0.03083932013316979, + "grad_norm": 17.239276511923176, + "kl": 0.05029296875, + "learning_rate": 9.693359032766777e-07, + "loss": 0.0201, + "reward": 1.7345705032348633, + "reward_std": 0.13885539770126343, + "rewards/accuracy_reward_stage2": 0.7345705032348633, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 176 + }, + { + "completion_length": 14.875, + "epoch": 0.03101454354301735, + "grad_norm": 17.215969325620332, + "kl": 0.064453125, + "learning_rate": 9.691606798668302e-07, + "loss": 0.0258, + "reward": 1.4116889238357544, + "reward_std": 0.11823472380638123, + "rewards/accuracy_reward_stage2": 0.6616888642311096, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 177 + }, + { + "completion_length": 7.28125, + "epoch": 0.031189766952864902, + "grad_norm": 12.798355258269762, + "kl": 0.02734375, + "learning_rate": 9.689854564569827e-07, + "loss": 0.0109, + "reward": 1.30573308467865, + "reward_std": 0.08645682036876678, + "rewards/accuracy_reward_stage2": 0.4307331144809723, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 178 + }, + { + "completion_length": 6.765625, + "epoch": 0.03136499036271246, + "grad_norm": 25.05776951577626, + "kl": 0.23828125, + "learning_rate": 9.68810233047135e-07, + "loss": 0.0953, + "reward": 1.3320447206497192, + "reward_std": 0.1577865034341812, + "rewards/accuracy_reward_stage2": 0.4570447504520416, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 179 + }, + { + "completion_length": 7.96875, + "epoch": 0.03154021377256001, + "grad_norm": 22.093198419377405, + "kl": 0.0634765625, + "learning_rate": 9.686350096372874e-07, + "loss": 0.0254, + "reward": 1.6561169624328613, + "reward_std": 0.26356494426727295, + "rewards/accuracy_reward_stage2": 0.6561169624328613, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 180 + }, + { + "completion_length": 16.953125, + "epoch": 0.03171543718240757, + "grad_norm": 30.753363540818007, + "kl": 0.26953125, + "learning_rate": 9.6845978622744e-07, + "loss": 0.1079, + "reward": 1.3598719835281372, + "reward_std": 0.22879821062088013, + "rewards/accuracy_reward_stage2": 0.4848719835281372, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 181 + }, + { + "completion_length": 9.359375, + "epoch": 0.03189066059225513, + "grad_norm": 21.41239232341035, + "kl": 0.0279541015625, + "learning_rate": 9.682845628175924e-07, + "loss": 0.0112, + "reward": 1.6863123178482056, + "reward_std": 0.22437290847301483, + "rewards/accuracy_reward_stage2": 0.6863122582435608, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 182 + }, + { + "completion_length": 6.484375, + "epoch": 0.03206588400210268, + "grad_norm": 8.467142001911718, + "kl": 0.0150146484375, + "learning_rate": 9.68109339407745e-07, + "loss": 0.006, + "reward": 1.546875, + "reward_std": 0.11100947856903076, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 183 + }, + { + "completion_length": 6.1875, + "epoch": 0.032241107411950236, + "grad_norm": 18.643982166866053, + "kl": 0.0179443359375, + "learning_rate": 9.679341159978974e-07, + "loss": 0.0072, + "reward": 1.4908428192138672, + "reward_std": 0.11428863555192947, + "rewards/accuracy_reward_stage2": 0.49084287881851196, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 184 + }, + { + "completion_length": 10.3125, + "epoch": 0.03241633082179779, + "grad_norm": 33.78920896731426, + "kl": 0.1728515625, + "learning_rate": 9.677588925880497e-07, + "loss": 0.0687, + "reward": 1.46875, + "reward_std": 0.1246790662407875, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 185 + }, + { + "completion_length": 7.203125, + "epoch": 0.03259155423164535, + "grad_norm": 21.094244218777078, + "kl": 0.140625, + "learning_rate": 9.675836691782022e-07, + "loss": 0.056, + "reward": 1.171875, + "reward_std": 0.2867125868797302, + "rewards/accuracy_reward_stage2": 0.421875, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 186 + }, + { + "completion_length": 9.609375, + "epoch": 0.032766777641492904, + "grad_norm": 20.530642144368848, + "kl": 0.080078125, + "learning_rate": 9.674084457683545e-07, + "loss": 0.0321, + "reward": 1.6582694053649902, + "reward_std": 0.22948169708251953, + "rewards/accuracy_reward_stage2": 0.658269464969635, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 187 + }, + { + "completion_length": 9.46875, + "epoch": 0.03294200105134046, + "grad_norm": 18.975229254487072, + "kl": 0.056396484375, + "learning_rate": 9.67233222358507e-07, + "loss": 0.0226, + "reward": 1.6822609901428223, + "reward_std": 0.09512491524219513, + "rewards/accuracy_reward_stage2": 0.682261049747467, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 188 + }, + { + "completion_length": 13.8125, + "epoch": 0.03311722446118801, + "grad_norm": 20.47557098090093, + "kl": 0.236328125, + "learning_rate": 9.670579989486595e-07, + "loss": 0.0948, + "reward": 1.3839223384857178, + "reward_std": 0.19412410259246826, + "rewards/accuracy_reward_stage2": 0.6339223384857178, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 189 + }, + { + "completion_length": 11.515625, + "epoch": 0.03329244787103557, + "grad_norm": 21.453820677237367, + "kl": 0.0294189453125, + "learning_rate": 9.66882775538812e-07, + "loss": 0.0118, + "reward": 1.6715006828308105, + "reward_std": 0.1585531234741211, + "rewards/accuracy_reward_stage2": 0.6715006828308105, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 190 + }, + { + "completion_length": 14.421875, + "epoch": 0.03346767128088313, + "grad_norm": 29.194800757782893, + "kl": 0.08544921875, + "learning_rate": 9.667075521289644e-07, + "loss": 0.0341, + "reward": 1.3433187007904053, + "reward_std": 0.17038963735103607, + "rewards/accuracy_reward_stage2": 0.4683186411857605, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 191 + }, + { + "completion_length": 7.296875, + "epoch": 0.03364289469073068, + "grad_norm": 20.725013074544748, + "kl": 0.0289306640625, + "learning_rate": 9.66532328719117e-07, + "loss": 0.0115, + "reward": 1.4519970417022705, + "reward_std": 0.17661163210868835, + "rewards/accuracy_reward_stage2": 0.4519970118999481, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 192 + }, + { + "completion_length": 7.984375, + "epoch": 0.033818118100578234, + "grad_norm": 18.52882236150989, + "kl": 0.0810546875, + "learning_rate": 9.663571053092692e-07, + "loss": -0.0118, + "reward": 1.6647088527679443, + "reward_std": 0.12946046888828278, + "rewards/accuracy_reward_stage2": 0.6803338527679443, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 193 + }, + { + "completion_length": 8.8125, + "epoch": 0.033993341510425795, + "grad_norm": 26.484601737109703, + "kl": 0.06201171875, + "learning_rate": 9.661818818994217e-07, + "loss": -0.004, + "reward": 1.4455125331878662, + "reward_std": 0.1841823160648346, + "rewards/accuracy_reward_stage2": 0.5861374735832214, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 194 + }, + { + "completion_length": 12.96875, + "epoch": 0.03416856492027335, + "grad_norm": 15.203542718715298, + "kl": 0.0791015625, + "learning_rate": 9.660066584895742e-07, + "loss": 0.0317, + "reward": 1.557002305984497, + "reward_std": 0.16922709345817566, + "rewards/accuracy_reward_stage2": 0.6820023059844971, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 195 + }, + { + "completion_length": 12.9375, + "epoch": 0.0343437883301209, + "grad_norm": 15.59409127943019, + "kl": 0.037353515625, + "learning_rate": 9.658314350797267e-07, + "loss": 0.0149, + "reward": 1.5619020462036133, + "reward_std": 0.07156114280223846, + "rewards/accuracy_reward_stage2": 0.5619020462036133, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 196 + }, + { + "completion_length": 7.578125, + "epoch": 0.03451901173996846, + "grad_norm": 10.595565784687796, + "kl": 0.021240234375, + "learning_rate": 9.656562116698792e-07, + "loss": 0.0085, + "reward": 1.5104167461395264, + "reward_std": 0.0294627882540226, + "rewards/accuracy_reward_stage2": 0.5104166269302368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 197 + }, + { + "completion_length": 12.9375, + "epoch": 0.03469423514981602, + "grad_norm": 13.055675079262832, + "kl": 0.007781982421875, + "learning_rate": 9.654809882600315e-07, + "loss": 0.0031, + "reward": 1.84375, + "reward_std": 0.1462521106004715, + "rewards/accuracy_reward_stage2": 0.84375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 198 + }, + { + "completion_length": 9.65625, + "epoch": 0.03486945855966357, + "grad_norm": 1.139007136515326, + "kl": 0.007476806640625, + "learning_rate": 9.65305764850184e-07, + "loss": 0.003, + "reward": 1.625, + "reward_std": 0.0, + "rewards/accuracy_reward_stage2": 0.625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 199 + }, + { + "completion_length": 7.328125, + "epoch": 0.035044681969511125, + "grad_norm": 16.784491224117573, + "kl": 0.0074462890625, + "learning_rate": 9.651305414403364e-07, + "loss": 0.003, + "reward": 1.6219052076339722, + "reward_std": 0.155005544424057, + "rewards/accuracy_reward_stage2": 0.6219052076339722, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 200 + }, + { + "completion_length": 10.34375, + "epoch": 0.03521990537935868, + "grad_norm": 20.19629650538392, + "kl": 0.0277099609375, + "learning_rate": 9.649553180304887e-07, + "loss": 0.0111, + "reward": 1.394465684890747, + "reward_std": 0.08756385743618011, + "rewards/accuracy_reward_stage2": 0.39446574449539185, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 201 + }, + { + "completion_length": 8.53125, + "epoch": 0.03539512878920624, + "grad_norm": 23.718846731751, + "kl": 0.041015625, + "learning_rate": 9.647800946206412e-07, + "loss": 0.0164, + "reward": 1.6174988746643066, + "reward_std": 0.29005441069602966, + "rewards/accuracy_reward_stage2": 0.6174987554550171, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 202 + }, + { + "completion_length": 11.515625, + "epoch": 0.035570352199053794, + "grad_norm": 17.105844468751872, + "kl": 0.046142578125, + "learning_rate": 9.646048712107937e-07, + "loss": -0.0238, + "reward": 1.3539774417877197, + "reward_std": 0.20320993661880493, + "rewards/accuracy_reward_stage2": 0.36960241198539734, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 203 + }, + { + "completion_length": 11.125, + "epoch": 0.03574557560890135, + "grad_norm": 24.028616996160352, + "kl": 0.0771484375, + "learning_rate": 9.644296478009462e-07, + "loss": 0.0309, + "reward": 1.3534865379333496, + "reward_std": 0.17606234550476074, + "rewards/accuracy_reward_stage2": 0.3534865975379944, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 204 + }, + { + "completion_length": 26.40625, + "epoch": 0.0359207990187489, + "grad_norm": 18.689605260335664, + "kl": 0.01806640625, + "learning_rate": 9.642544243910987e-07, + "loss": 0.0072, + "reward": 1.4637235403060913, + "reward_std": 0.17509686946868896, + "rewards/accuracy_reward_stage2": 0.4637235105037689, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 205 + }, + { + "completion_length": 8.421875, + "epoch": 0.03609602242859646, + "grad_norm": 21.290975767768856, + "kl": 0.0205078125, + "learning_rate": 9.64079200981251e-07, + "loss": 0.0082, + "reward": 1.7789130210876465, + "reward_std": 0.18055710196495056, + "rewards/accuracy_reward_stage2": 0.7789130210876465, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 206 + }, + { + "completion_length": 9.71875, + "epoch": 0.036271245838444016, + "grad_norm": 25.960035529614007, + "kl": 0.0537109375, + "learning_rate": 9.639039775714035e-07, + "loss": 0.0215, + "reward": 1.552076816558838, + "reward_std": 0.28908300399780273, + "rewards/accuracy_reward_stage2": 0.5520768165588379, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 207 + }, + { + "completion_length": 11.21875, + "epoch": 0.03644646924829157, + "grad_norm": 16.88448721696107, + "kl": 0.036376953125, + "learning_rate": 9.63728754161556e-07, + "loss": 0.0145, + "reward": 1.1257497072219849, + "reward_std": 0.1288609355688095, + "rewards/accuracy_reward_stage2": 0.2507496774196625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 208 + }, + { + "completion_length": 7.609375, + "epoch": 0.03662169265813913, + "grad_norm": 16.2274690040401, + "kl": 0.0272216796875, + "learning_rate": 9.635535307517085e-07, + "loss": 0.0109, + "reward": 1.4792509078979492, + "reward_std": 0.19155427813529968, + "rewards/accuracy_reward_stage2": 0.479250967502594, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 209 + }, + { + "completion_length": 13.46875, + "epoch": 0.036796916067986685, + "grad_norm": 33.784177051978844, + "kl": 0.5078125, + "learning_rate": 9.63378307341861e-07, + "loss": 0.2018, + "reward": 1.434826135635376, + "reward_std": 0.26847773790359497, + "rewards/accuracy_reward_stage2": 0.5598262548446655, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 210 + }, + { + "completion_length": 9.09375, + "epoch": 0.03697213947783424, + "grad_norm": 20.179490922240536, + "kl": 0.048828125, + "learning_rate": 9.632030839320132e-07, + "loss": 0.0196, + "reward": 1.5264551639556885, + "reward_std": 0.1224151998758316, + "rewards/accuracy_reward_stage2": 0.5264551639556885, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 211 + }, + { + "completion_length": 8.390625, + "epoch": 0.03714736288768179, + "grad_norm": 20.438576677424336, + "kl": 0.02783203125, + "learning_rate": 9.630278605221657e-07, + "loss": 0.0111, + "reward": 1.697406530380249, + "reward_std": 0.18822979927062988, + "rewards/accuracy_reward_stage2": 0.6974066495895386, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 212 + }, + { + "completion_length": 16.0, + "epoch": 0.03732258629752935, + "grad_norm": 24.199250122428335, + "kl": 0.1708984375, + "learning_rate": 9.628526371123182e-07, + "loss": 0.0685, + "reward": 1.1678051948547363, + "reward_std": 0.17951638996601105, + "rewards/accuracy_reward_stage2": 0.2928052544593811, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 213 + }, + { + "completion_length": 12.546875, + "epoch": 0.03749780970737691, + "grad_norm": 13.500767042240465, + "kl": 0.00909423828125, + "learning_rate": 9.626774137024705e-07, + "loss": -0.0253, + "reward": 1.5083041191101074, + "reward_std": 0.14326375722885132, + "rewards/accuracy_reward_stage2": 0.5239291787147522, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 214 + }, + { + "completion_length": 7.265625, + "epoch": 0.03767303311722446, + "grad_norm": 21.444658879022974, + "kl": 0.014404296875, + "learning_rate": 9.62502190292623e-07, + "loss": 0.0058, + "reward": 1.606555461883545, + "reward_std": 0.2108723670244217, + "rewards/accuracy_reward_stage2": 0.6065554618835449, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 215 + }, + { + "completion_length": 9.359375, + "epoch": 0.037848256527072015, + "grad_norm": 22.512872765127288, + "kl": 0.45703125, + "learning_rate": 9.623269668827755e-07, + "loss": 0.1825, + "reward": 1.3978098630905151, + "reward_std": 0.15367010235786438, + "rewards/accuracy_reward_stage2": 0.5228098630905151, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 216 + }, + { + "completion_length": 12.921875, + "epoch": 0.038023479936919576, + "grad_norm": 21.09355853469942, + "kl": 0.31640625, + "learning_rate": 9.62151743472928e-07, + "loss": 0.1267, + "reward": 1.358708143234253, + "reward_std": 0.23795433342456818, + "rewards/accuracy_reward_stage2": 0.6087081432342529, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 217 + }, + { + "completion_length": 9.25, + "epoch": 0.03819870334676713, + "grad_norm": 19.748781154588258, + "kl": 0.294921875, + "learning_rate": 9.619765200630805e-07, + "loss": 0.1177, + "reward": 1.5509915351867676, + "reward_std": 0.12086137384176254, + "rewards/accuracy_reward_stage2": 0.8009915351867676, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 218 + }, + { + "completion_length": 8.09375, + "epoch": 0.03837392675661468, + "grad_norm": 35.49436359896782, + "kl": 0.205078125, + "learning_rate": 9.618012966532327e-07, + "loss": 0.0577, + "reward": 1.719040870666504, + "reward_std": 0.19431626796722412, + "rewards/accuracy_reward_stage2": 0.7346658706665039, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 219 + }, + { + "completion_length": 8.953125, + "epoch": 0.03854915016646224, + "grad_norm": 18.25037389606022, + "kl": 0.010498046875, + "learning_rate": 9.616260732433852e-07, + "loss": -0.04, + "reward": 1.566141128540039, + "reward_std": 0.20485994219779968, + "rewards/accuracy_reward_stage2": 0.5817661881446838, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 220 + }, + { + "completion_length": 9.15625, + "epoch": 0.0387243735763098, + "grad_norm": 15.805557927333963, + "kl": 0.05615234375, + "learning_rate": 9.614508498335377e-07, + "loss": -0.0205, + "reward": 1.4933180809020996, + "reward_std": 0.17127634584903717, + "rewards/accuracy_reward_stage2": 0.5089430212974548, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 221 + }, + { + "completion_length": 12.390625, + "epoch": 0.03889959698615735, + "grad_norm": 13.08398417869419, + "kl": 0.08154296875, + "learning_rate": 9.612756264236902e-07, + "loss": -0.0074, + "reward": 1.6429219245910645, + "reward_std": 0.09408406913280487, + "rewards/accuracy_reward_stage2": 0.658547043800354, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 222 + }, + { + "completion_length": 11.0, + "epoch": 0.039074820396004906, + "grad_norm": 17.417074560822787, + "kl": 0.0167236328125, + "learning_rate": 9.611004030138427e-07, + "loss": 0.0067, + "reward": 1.492321252822876, + "reward_std": 0.10672628879547119, + "rewards/accuracy_reward_stage2": 0.617321252822876, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 223 + }, + { + "completion_length": 8.109375, + "epoch": 0.03925004380585246, + "grad_norm": 18.883396983473418, + "kl": 0.019775390625, + "learning_rate": 9.60925179603995e-07, + "loss": -0.0363, + "reward": 1.439524531364441, + "reward_std": 0.20576725900173187, + "rewards/accuracy_reward_stage2": 0.4551495909690857, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 224 + }, + { + "completion_length": 11.1875, + "epoch": 0.03942526721570002, + "grad_norm": 21.665625000254725, + "kl": 0.0185546875, + "learning_rate": 9.607499561941475e-07, + "loss": 0.0074, + "reward": 1.6839239597320557, + "reward_std": 0.21414509415626526, + "rewards/accuracy_reward_stage2": 0.6839240193367004, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 225 + }, + { + "completion_length": 9.28125, + "epoch": 0.039600490625547574, + "grad_norm": 14.634515141275852, + "kl": 0.01007080078125, + "learning_rate": 9.605747327843e-07, + "loss": 0.004, + "reward": 1.6269270181655884, + "reward_std": 0.12803037464618683, + "rewards/accuracy_reward_stage2": 0.6269270181655884, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 226 + }, + { + "completion_length": 28.96875, + "epoch": 0.03977571403539513, + "grad_norm": 21.338816841964235, + "kl": 0.018310546875, + "learning_rate": 9.603995093744523e-07, + "loss": 0.0073, + "reward": 1.555863618850708, + "reward_std": 0.12815237045288086, + "rewards/accuracy_reward_stage2": 0.555863618850708, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 227 + }, + { + "completion_length": 9.390625, + "epoch": 0.03995093744524268, + "grad_norm": 21.852815893340804, + "kl": 0.0498046875, + "learning_rate": 9.602242859646048e-07, + "loss": 0.0199, + "reward": 1.7307069301605225, + "reward_std": 0.11314516514539719, + "rewards/accuracy_reward_stage2": 0.7307069301605225, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 228 + }, + { + "completion_length": 9.40625, + "epoch": 0.04012616085509024, + "grad_norm": 30.771365408147698, + "kl": 0.0225830078125, + "learning_rate": 9.600490625547573e-07, + "loss": 0.009, + "reward": 1.560467004776001, + "reward_std": 0.22677713632583618, + "rewards/accuracy_reward_stage2": 0.5604670643806458, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 229 + }, + { + "completion_length": 7.4375, + "epoch": 0.040301384264937797, + "grad_norm": 24.481460538698858, + "kl": 0.0966796875, + "learning_rate": 9.598738391449097e-07, + "loss": 0.0387, + "reward": 1.6300032138824463, + "reward_std": 0.1686421036720276, + "rewards/accuracy_reward_stage2": 0.6300033330917358, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 230 + }, + { + "completion_length": 9.15625, + "epoch": 0.04047660767478535, + "grad_norm": 28.843071660319026, + "kl": 0.045654296875, + "learning_rate": 9.596986157350622e-07, + "loss": -0.0151, + "reward": 1.376204252243042, + "reward_std": 0.3445381224155426, + "rewards/accuracy_reward_stage2": 0.39182931184768677, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 231 + }, + { + "completion_length": 12.109375, + "epoch": 0.040651831084632904, + "grad_norm": 23.224141525934847, + "kl": 0.0301513671875, + "learning_rate": 9.595233923252145e-07, + "loss": 0.012, + "reward": 1.3188610076904297, + "reward_std": 0.17884111404418945, + "rewards/accuracy_reward_stage2": 0.3188610076904297, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 232 + }, + { + "completion_length": 19.609375, + "epoch": 0.040827054494480465, + "grad_norm": 16.86419016940912, + "kl": 0.01214599609375, + "learning_rate": 9.59348168915367e-07, + "loss": 0.0049, + "reward": 1.3646348714828491, + "reward_std": 0.13505005836486816, + "rewards/accuracy_reward_stage2": 0.36463481187820435, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 233 + }, + { + "completion_length": 8.5, + "epoch": 0.04100227790432802, + "grad_norm": 24.073123884487078, + "kl": 0.01123046875, + "learning_rate": 9.591729455055195e-07, + "loss": 0.0045, + "reward": 1.40625, + "reward_std": 0.2041158676147461, + "rewards/accuracy_reward_stage2": 0.40625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 234 + }, + { + "completion_length": 13.4375, + "epoch": 0.04117750131417557, + "grad_norm": 4901.515461483319, + "kl": 18.375, + "learning_rate": 9.58997722095672e-07, + "loss": 7.3251, + "reward": 1.438122272491455, + "reward_std": 0.15549173951148987, + "rewards/accuracy_reward_stage2": 0.5631222724914551, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 235 + }, + { + "completion_length": 9.34375, + "epoch": 0.04135272472402313, + "grad_norm": 14.789976756404938, + "kl": 0.009521484375, + "learning_rate": 9.588224986858245e-07, + "loss": 0.0038, + "reward": 1.650240421295166, + "reward_std": 0.1713310033082962, + "rewards/accuracy_reward_stage2": 0.650240421295166, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 236 + }, + { + "completion_length": 13.609375, + "epoch": 0.04152794813387069, + "grad_norm": 584.9939223178902, + "kl": 1.90625, + "learning_rate": 9.586472752759768e-07, + "loss": 0.7597, + "reward": 1.3631982803344727, + "reward_std": 0.2880287170410156, + "rewards/accuracy_reward_stage2": 0.48819833993911743, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 237 + }, + { + "completion_length": 10.40625, + "epoch": 0.04170317154371824, + "grad_norm": 20.92452543043772, + "kl": 0.0235595703125, + "learning_rate": 9.584720518661293e-07, + "loss": 0.0094, + "reward": 1.336254358291626, + "reward_std": 0.19408045709133148, + "rewards/accuracy_reward_stage2": 0.3362542986869812, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 238 + }, + { + "completion_length": 11.390625, + "epoch": 0.041878394953565795, + "grad_norm": 15.628353567644007, + "kl": 0.0079345703125, + "learning_rate": 9.582968284562818e-07, + "loss": 0.0032, + "reward": 1.4190398454666138, + "reward_std": 0.12067941576242447, + "rewards/accuracy_reward_stage2": 0.41903984546661377, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 239 + }, + { + "completion_length": 9.875, + "epoch": 0.04205361836341335, + "grad_norm": 23.718856680377566, + "kl": 0.018798828125, + "learning_rate": 9.58121605046434e-07, + "loss": 0.0075, + "reward": 1.7033743858337402, + "reward_std": 0.2691870629787445, + "rewards/accuracy_reward_stage2": 0.7033743858337402, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 240 + }, + { + "completion_length": 7.96875, + "epoch": 0.04222884177326091, + "grad_norm": 16.06620576115406, + "kl": 0.032958984375, + "learning_rate": 9.579463816365865e-07, + "loss": 0.0132, + "reward": 1.472252368927002, + "reward_std": 0.21186134219169617, + "rewards/accuracy_reward_stage2": 0.5972523093223572, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 241 + }, + { + "completion_length": 12.5625, + "epoch": 0.042404065183108464, + "grad_norm": 23.444470839653174, + "kl": 0.039306640625, + "learning_rate": 9.57771158226739e-07, + "loss": 0.0157, + "reward": 1.5591293573379517, + "reward_std": 0.33478352427482605, + "rewards/accuracy_reward_stage2": 0.5591292977333069, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 242 + }, + { + "completion_length": 13.71875, + "epoch": 0.04257928859295602, + "grad_norm": 25.381272213819486, + "kl": 0.0235595703125, + "learning_rate": 9.575959348168915e-07, + "loss": -0.0236, + "reward": 1.4567725658416748, + "reward_std": 0.3310222923755646, + "rewards/accuracy_reward_stage2": 0.4723976254463196, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 243 + }, + { + "completion_length": 14.03125, + "epoch": 0.04275451200280357, + "grad_norm": 24.936302441450792, + "kl": 0.62109375, + "learning_rate": 9.57420711407044e-07, + "loss": 0.2483, + "reward": 1.6432292461395264, + "reward_std": 0.3550029397010803, + "rewards/accuracy_reward_stage2": 0.7682291865348816, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 244 + }, + { + "completion_length": 9.734375, + "epoch": 0.04292973541265113, + "grad_norm": 20.441411725972664, + "kl": 0.0283203125, + "learning_rate": 9.572454879971965e-07, + "loss": -0.0229, + "reward": 1.5584733486175537, + "reward_std": 0.37590664625167847, + "rewards/accuracy_reward_stage2": 0.5740982890129089, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 245 + }, + { + "completion_length": 8.3125, + "epoch": 0.043104958822498686, + "grad_norm": 20.72964347938417, + "kl": 0.0201416015625, + "learning_rate": 9.570702645873488e-07, + "loss": 0.0081, + "reward": 1.5848780870437622, + "reward_std": 0.2728351056575775, + "rewards/accuracy_reward_stage2": 0.584878146648407, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 246 + }, + { + "completion_length": 12.5625, + "epoch": 0.04328018223234624, + "grad_norm": 15.52199378970145, + "kl": 0.007476806640625, + "learning_rate": 9.568950411775013e-07, + "loss": 0.003, + "reward": 1.575636386871338, + "reward_std": 0.13075202703475952, + "rewards/accuracy_reward_stage2": 0.5756364464759827, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 247 + }, + { + "completion_length": 12.0625, + "epoch": 0.043455405642193794, + "grad_norm": 19.395159139210442, + "kl": 0.1533203125, + "learning_rate": 9.567198177676538e-07, + "loss": 0.0279, + "reward": 1.4184027910232544, + "reward_std": 0.3044259250164032, + "rewards/accuracy_reward_stage2": 0.6840277910232544, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 248 + }, + { + "completion_length": 5.765625, + "epoch": 0.043630629052041354, + "grad_norm": 14.48011797195782, + "kl": 0.0079345703125, + "learning_rate": 9.565445943578063e-07, + "loss": 0.0032, + "reward": 1.34375, + "reward_std": 0.10888782143592834, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 249 + }, + { + "completion_length": 11.265625, + "epoch": 0.04380585246188891, + "grad_norm": 17.31147879550402, + "kl": 0.0228271484375, + "learning_rate": 9.563693709479585e-07, + "loss": -0.0238, + "reward": 1.598874568939209, + "reward_std": 0.233104407787323, + "rewards/accuracy_reward_stage2": 0.6144995093345642, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 250 + }, + { + "completion_length": 12.0625, + "epoch": 0.04398107587173646, + "grad_norm": 16.454194341141637, + "kl": 0.0361328125, + "learning_rate": 9.56194147538111e-07, + "loss": 0.0145, + "reward": 1.520371675491333, + "reward_std": 0.15112952888011932, + "rewards/accuracy_reward_stage2": 0.5203717350959778, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 251 + }, + { + "completion_length": 11.4375, + "epoch": 0.04415629928158402, + "grad_norm": 17.35487988167322, + "kl": 0.0361328125, + "learning_rate": 9.560189241282635e-07, + "loss": 0.0145, + "reward": 1.3840597867965698, + "reward_std": 0.06660275906324387, + "rewards/accuracy_reward_stage2": 0.3840597867965698, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 252 + }, + { + "completion_length": 8.421875, + "epoch": 0.04433152269143158, + "grad_norm": 17.687468327124677, + "kl": 0.029296875, + "learning_rate": 9.55843700718416e-07, + "loss": 0.0117, + "reward": 1.7530488967895508, + "reward_std": 0.05738438665866852, + "rewards/accuracy_reward_stage2": 0.7530487775802612, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 253 + }, + { + "completion_length": 12.28125, + "epoch": 0.04450674610127913, + "grad_norm": 19.913907277473474, + "kl": 0.0693359375, + "learning_rate": 9.556684773085683e-07, + "loss": -0.0166, + "reward": 1.416152000427246, + "reward_std": 0.25723960995674133, + "rewards/accuracy_reward_stage2": 0.5567771196365356, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 254 + }, + { + "completion_length": 9.5625, + "epoch": 0.044681969511126685, + "grad_norm": 23.441539432847623, + "kl": 0.08740234375, + "learning_rate": 9.554932538987208e-07, + "loss": 0.0349, + "reward": 1.4265499114990234, + "reward_std": 0.2730960547924042, + "rewards/accuracy_reward_stage2": 0.42654991149902344, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 255 + }, + { + "completion_length": 8.796875, + "epoch": 0.044857192920974245, + "grad_norm": 20.805671934114525, + "kl": 0.04541015625, + "learning_rate": 9.553180304888733e-07, + "loss": 0.0182, + "reward": 1.392343282699585, + "reward_std": 0.18272897601127625, + "rewards/accuracy_reward_stage2": 0.39234328269958496, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 256 + }, + { + "completion_length": 11.71875, + "epoch": 0.0450324163308218, + "grad_norm": 23.194431431497243, + "kl": 0.062255859375, + "learning_rate": 9.551428070790258e-07, + "loss": 0.0249, + "reward": 1.399277687072754, + "reward_std": 0.19788572192192078, + "rewards/accuracy_reward_stage2": 0.3992777466773987, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 257 + }, + { + "completion_length": 7.0625, + "epoch": 0.04520763974066935, + "grad_norm": 25.40844788041373, + "kl": 0.0712890625, + "learning_rate": 9.549675836691783e-07, + "loss": 0.0285, + "reward": 1.5427207946777344, + "reward_std": 0.17167173326015472, + "rewards/accuracy_reward_stage2": 0.6677207350730896, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 258 + }, + { + "completion_length": 6.875, + "epoch": 0.04538286315051691, + "grad_norm": 19.327847215838386, + "kl": 0.059326171875, + "learning_rate": 9.547923602593305e-07, + "loss": 0.0238, + "reward": 1.63570237159729, + "reward_std": 0.17358574271202087, + "rewards/accuracy_reward_stage2": 0.63570237159729, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 259 + }, + { + "completion_length": 10.59375, + "epoch": 0.04555808656036447, + "grad_norm": 12.462430172624714, + "kl": 0.046142578125, + "learning_rate": 9.54617136849483e-07, + "loss": 0.0184, + "reward": 1.4343960285186768, + "reward_std": 0.0724828690290451, + "rewards/accuracy_reward_stage2": 0.43439605832099915, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 260 + }, + { + "completion_length": 18.265625, + "epoch": 0.04573330997021202, + "grad_norm": 18.39096279891861, + "kl": 0.043212890625, + "learning_rate": 9.544419134396355e-07, + "loss": 0.0173, + "reward": 1.4796587228775024, + "reward_std": 0.2298499345779419, + "rewards/accuracy_reward_stage2": 0.47965875267982483, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 261 + }, + { + "completion_length": 11.5625, + "epoch": 0.045908533380059575, + "grad_norm": 18.064444413910984, + "kl": 0.051513671875, + "learning_rate": 9.54266690029788e-07, + "loss": 0.0206, + "reward": 1.385161280632019, + "reward_std": 0.12799863517284393, + "rewards/accuracy_reward_stage2": 0.510161280632019, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 262 + }, + { + "completion_length": 8.75, + "epoch": 0.04608375678990713, + "grad_norm": 16.59071665301892, + "kl": 0.10009765625, + "learning_rate": 9.540914666199403e-07, + "loss": 0.0401, + "reward": 1.6114752292633057, + "reward_std": 0.07131287455558777, + "rewards/accuracy_reward_stage2": 0.6114752292633057, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 263 + }, + { + "completion_length": 11.390625, + "epoch": 0.04625898019975469, + "grad_norm": 27.809570183113333, + "kl": 0.0888671875, + "learning_rate": 9.539162432100928e-07, + "loss": 0.0355, + "reward": 1.5748344659805298, + "reward_std": 0.2848876714706421, + "rewards/accuracy_reward_stage2": 0.5748344659805298, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 264 + }, + { + "completion_length": 9.59375, + "epoch": 0.046434203609602244, + "grad_norm": 24.14804135776577, + "kl": 0.04931640625, + "learning_rate": 9.537410198002453e-07, + "loss": 0.0197, + "reward": 1.8005200624465942, + "reward_std": 0.22077980637550354, + "rewards/accuracy_reward_stage2": 0.8005200624465942, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 265 + }, + { + "completion_length": 6.8125, + "epoch": 0.0466094270194498, + "grad_norm": 20.759906763884782, + "kl": 0.0299072265625, + "learning_rate": 9.535657963903977e-07, + "loss": 0.0023, + "reward": 1.408979058265686, + "reward_std": 0.11087541282176971, + "rewards/accuracy_reward_stage2": 0.42460405826568604, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 266 + }, + { + "completion_length": 9.265625, + "epoch": 0.04678465042929735, + "grad_norm": 21.32710865447075, + "kl": 0.10791015625, + "learning_rate": 9.533905729805502e-07, + "loss": 0.0433, + "reward": 1.3225247859954834, + "reward_std": 0.20090004801750183, + "rewards/accuracy_reward_stage2": 0.5725248456001282, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 267 + }, + { + "completion_length": 10.671875, + "epoch": 0.04695987383914491, + "grad_norm": 24.455639169887064, + "kl": 0.345703125, + "learning_rate": 9.532153495707026e-07, + "loss": 0.1386, + "reward": 1.4042267799377441, + "reward_std": 0.15356406569480896, + "rewards/accuracy_reward_stage2": 0.5292267799377441, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 268 + }, + { + "completion_length": 15.328125, + "epoch": 0.047135097248992466, + "grad_norm": 21.541350855308803, + "kl": 0.035400390625, + "learning_rate": 9.53040126160855e-07, + "loss": 0.0141, + "reward": 1.6411187648773193, + "reward_std": 0.166485995054245, + "rewards/accuracy_reward_stage2": 0.6411186456680298, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 269 + }, + { + "completion_length": 9.3125, + "epoch": 0.04731032065884002, + "grad_norm": 22.15774609889451, + "kl": 0.0625, + "learning_rate": 9.528649027510075e-07, + "loss": 0.025, + "reward": 1.4429640769958496, + "reward_std": 0.19489288330078125, + "rewards/accuracy_reward_stage2": 0.5679640173912048, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 270 + }, + { + "completion_length": 11.421875, + "epoch": 0.047485544068687574, + "grad_norm": 16.491138324168993, + "kl": 0.023193359375, + "learning_rate": 9.526896793411599e-07, + "loss": -0.0349, + "reward": 1.5896495580673218, + "reward_std": 0.20764687657356262, + "rewards/accuracy_reward_stage2": 0.6052745580673218, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 271 + }, + { + "completion_length": 5.953125, + "epoch": 0.047660767478535135, + "grad_norm": 21.88275639215845, + "kl": 0.09716796875, + "learning_rate": 9.525144559313124e-07, + "loss": 0.0388, + "reward": 1.6785914897918701, + "reward_std": 0.16737723350524902, + "rewards/accuracy_reward_stage2": 0.6785914897918701, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 272 + }, + { + "completion_length": 11.9375, + "epoch": 0.04783599088838269, + "grad_norm": 21.467567337800396, + "kl": 0.62890625, + "learning_rate": 9.523392325214649e-07, + "loss": 0.2498, + "reward": 1.6683162450790405, + "reward_std": 0.2646476924419403, + "rewards/accuracy_reward_stage2": 0.7933162450790405, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 273 + }, + { + "completion_length": 12.921875, + "epoch": 0.04801121429823024, + "grad_norm": 25.858447993597725, + "kl": 0.56640625, + "learning_rate": 9.521640091116173e-07, + "loss": 0.2256, + "reward": 1.4236572980880737, + "reward_std": 0.17377673089504242, + "rewards/accuracy_reward_stage2": 0.5486572980880737, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 274 + }, + { + "completion_length": 14.15625, + "epoch": 0.048186437708077796, + "grad_norm": 46.2527752830865, + "kl": 0.03857421875, + "learning_rate": 9.519887857017697e-07, + "loss": 0.0154, + "reward": 1.678868055343628, + "reward_std": 0.2035331428050995, + "rewards/accuracy_reward_stage2": 0.6788681745529175, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 275 + }, + { + "completion_length": 7.59375, + "epoch": 0.04836166111792536, + "grad_norm": 32.720924431721514, + "kl": 0.16796875, + "learning_rate": 9.518135622919221e-07, + "loss": 0.0383, + "reward": 1.8637468814849854, + "reward_std": 0.2025931477546692, + "rewards/accuracy_reward_stage2": 0.8793718814849854, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 276 + }, + { + "completion_length": 7.53125, + "epoch": 0.04853688452777291, + "grad_norm": 21.119273458560386, + "kl": 0.076171875, + "learning_rate": 9.516383388820746e-07, + "loss": -0.0137, + "reward": 1.4132182598114014, + "reward_std": 0.24847757816314697, + "rewards/accuracy_reward_stage2": 0.5538431406021118, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 277 + }, + { + "completion_length": 12.390625, + "epoch": 0.048712107937620465, + "grad_norm": 14.369302454959714, + "kl": 0.08154296875, + "learning_rate": 9.514631154722271e-07, + "loss": 0.0327, + "reward": 1.5831317901611328, + "reward_std": 0.12140820920467377, + "rewards/accuracy_reward_stage2": 0.583131730556488, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 278 + }, + { + "completion_length": 6.421875, + "epoch": 0.04888733134746802, + "grad_norm": 12.107063676651114, + "kl": 0.01446533203125, + "learning_rate": 9.512878920623794e-07, + "loss": 0.0058, + "reward": 1.71875, + "reward_std": 0.10888782143592834, + "rewards/accuracy_reward_stage2": 0.71875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 279 + }, + { + "completion_length": 7.96875, + "epoch": 0.04906255475731558, + "grad_norm": 20.12750865493871, + "kl": 0.06884765625, + "learning_rate": 9.511126686525319e-07, + "loss": 0.0275, + "reward": 1.6211915016174316, + "reward_std": 0.1551232933998108, + "rewards/accuracy_reward_stage2": 0.6211915612220764, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 280 + }, + { + "completion_length": 22.015625, + "epoch": 0.04923777816716313, + "grad_norm": 118700.45485301006, + "kl": 500.0, + "learning_rate": 9.509374452426844e-07, + "loss": 200.9689, + "reward": 1.578223705291748, + "reward_std": 0.22198337316513062, + "rewards/accuracy_reward_stage2": 0.718848705291748, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 281 + }, + { + "completion_length": 9.75, + "epoch": 0.04941300157701069, + "grad_norm": 18.937226993599705, + "kl": 0.06494140625, + "learning_rate": 9.507622218328368e-07, + "loss": 0.026, + "reward": 1.6166949272155762, + "reward_std": 0.2150428295135498, + "rewards/accuracy_reward_stage2": 0.6166949272155762, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 282 + }, + { + "completion_length": 16.859375, + "epoch": 0.04958822498685824, + "grad_norm": 35.92465554139422, + "kl": 0.283203125, + "learning_rate": 9.505869984229893e-07, + "loss": 0.1138, + "reward": 1.3042311668395996, + "reward_std": 0.24792616069316864, + "rewards/accuracy_reward_stage2": 0.5542311072349548, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 283 + }, + { + "completion_length": 9.109375, + "epoch": 0.0497634483967058, + "grad_norm": 14.33229856110137, + "kl": 0.146484375, + "learning_rate": 9.504117750131417e-07, + "loss": 0.0588, + "reward": 1.455843210220337, + "reward_std": 0.07982275635004044, + "rewards/accuracy_reward_stage2": 0.5808432102203369, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 284 + }, + { + "completion_length": 11.046875, + "epoch": 0.049938671806553356, + "grad_norm": 16.522511082795745, + "kl": 0.06787109375, + "learning_rate": 9.502365516032942e-07, + "loss": 0.0272, + "reward": 1.5573397874832153, + "reward_std": 0.21037398278713226, + "rewards/accuracy_reward_stage2": 0.5573397874832153, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 285 + }, + { + "completion_length": 7.65625, + "epoch": 0.05011389521640091, + "grad_norm": 18.612997661361707, + "kl": 0.059814453125, + "learning_rate": 9.500613281934467e-07, + "loss": -0.0106, + "reward": 1.5225942134857178, + "reward_std": 0.1994466781616211, + "rewards/accuracy_reward_stage2": 0.5382192134857178, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 286 + }, + { + "completion_length": 8.34375, + "epoch": 0.050289118626248464, + "grad_norm": 21.052614034383808, + "kl": 0.0654296875, + "learning_rate": 9.498861047835991e-07, + "loss": 0.0261, + "reward": 1.7560055255889893, + "reward_std": 0.1533891260623932, + "rewards/accuracy_reward_stage2": 0.756005585193634, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 287 + }, + { + "completion_length": 7.671875, + "epoch": 0.050464342036096024, + "grad_norm": 14.416852702309425, + "kl": 0.033935546875, + "learning_rate": 9.497108813737515e-07, + "loss": 0.0136, + "reward": 1.5063834190368652, + "reward_std": 0.1799892634153366, + "rewards/accuracy_reward_stage2": 0.5063834190368652, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 288 + }, + { + "completion_length": 16.546875, + "epoch": 0.05063956544594358, + "grad_norm": 23.690892862827162, + "kl": 0.197265625, + "learning_rate": 9.495356579639038e-07, + "loss": 0.0789, + "reward": 1.4050755500793457, + "reward_std": 0.13149887323379517, + "rewards/accuracy_reward_stage2": 0.5300755500793457, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 289 + }, + { + "completion_length": 13.421875, + "epoch": 0.05081478885579113, + "grad_norm": 24.095865194664135, + "kl": 0.0849609375, + "learning_rate": 9.493604345540563e-07, + "loss": -0.0103, + "reward": 1.5336406230926514, + "reward_std": 0.23935247957706451, + "rewards/accuracy_reward_stage2": 0.5492656826972961, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 290 + }, + { + "completion_length": 9.03125, + "epoch": 0.050990012265638686, + "grad_norm": 20.282179837458845, + "kl": 0.23046875, + "learning_rate": 9.491852111442088e-07, + "loss": 0.092, + "reward": 1.5, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward_stage2": 0.625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 291 + }, + { + "completion_length": 7.8125, + "epoch": 0.05116523567548625, + "grad_norm": 23.860637551007596, + "kl": 0.193359375, + "learning_rate": 9.490099877343612e-07, + "loss": 0.0772, + "reward": 1.546875, + "reward_std": 0.16887323558330536, + "rewards/accuracy_reward_stage2": 0.671875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 292 + }, + { + "completion_length": 11.703125, + "epoch": 0.0513404590853338, + "grad_norm": 24.13121591753904, + "kl": 0.0537109375, + "learning_rate": 9.488347643245137e-07, + "loss": -0.0227, + "reward": 1.5658124685287476, + "reward_std": 0.2879348397254944, + "rewards/accuracy_reward_stage2": 0.5814374685287476, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 293 + }, + { + "completion_length": 22.875, + "epoch": 0.051515682495181354, + "grad_norm": 18.675519791516496, + "kl": 0.0263671875, + "learning_rate": 9.486595409146662e-07, + "loss": 0.0106, + "reward": 1.2921215295791626, + "reward_std": 0.17938996851444244, + "rewards/accuracy_reward_stage2": 0.4171214997768402, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 294 + }, + { + "completion_length": 10.9375, + "epoch": 0.051690905905028915, + "grad_norm": 20.26634765781072, + "kl": 0.00799560546875, + "learning_rate": 9.484843175048186e-07, + "loss": 0.0032, + "reward": 1.6875, + "reward_std": 0.2041158676147461, + "rewards/accuracy_reward_stage2": 0.6875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 295 + }, + { + "completion_length": 8.53125, + "epoch": 0.05186612931487647, + "grad_norm": 19.78494567228972, + "kl": 0.0208740234375, + "learning_rate": 9.483090940949711e-07, + "loss": 0.0084, + "reward": 1.633901834487915, + "reward_std": 0.2172580063343048, + "rewards/accuracy_reward_stage2": 0.633901834487915, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 296 + }, + { + "completion_length": 21.625, + "epoch": 0.05204135272472402, + "grad_norm": 17.083860605215293, + "kl": 0.0120849609375, + "learning_rate": 9.481338706851235e-07, + "loss": 0.0048, + "reward": 1.3703351020812988, + "reward_std": 0.09173109382390976, + "rewards/accuracy_reward_stage2": 0.3703351616859436, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 297 + }, + { + "completion_length": 10.234375, + "epoch": 0.05221657613457158, + "grad_norm": 20.168621744741493, + "kl": 0.06640625, + "learning_rate": 9.47958647275276e-07, + "loss": 0.0265, + "reward": 1.5391501188278198, + "reward_std": 0.17844170331954956, + "rewards/accuracy_reward_stage2": 0.6641501188278198, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 298 + }, + { + "completion_length": 10.703125, + "epoch": 0.05239179954441914, + "grad_norm": 21.384155893839793, + "kl": 0.02685546875, + "learning_rate": 9.477834238654284e-07, + "loss": 0.0107, + "reward": 1.5403645038604736, + "reward_std": 0.3316608667373657, + "rewards/accuracy_reward_stage2": 0.5403645634651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 299 + }, + { + "completion_length": 15.5, + "epoch": 0.05256702295426669, + "grad_norm": 67.3784361561861, + "kl": 0.478515625, + "learning_rate": 9.476082004555808e-07, + "loss": 0.1914, + "reward": 1.2157280445098877, + "reward_std": 0.050552383065223694, + "rewards/accuracy_reward_stage2": 0.4657280445098877, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 300 + }, + { + "completion_length": 13.5, + "epoch": 0.052742246364114245, + "grad_norm": 17.64358996401573, + "kl": 0.053466796875, + "learning_rate": 9.474329770457332e-07, + "loss": 0.0213, + "reward": 1.3356982469558716, + "reward_std": 0.1435163915157318, + "rewards/accuracy_reward_stage2": 0.3356982469558716, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 301 + }, + { + "completion_length": 14.359375, + "epoch": 0.0529174697739618, + "grad_norm": 22.56834868655897, + "kl": 0.03173828125, + "learning_rate": 9.472577536358857e-07, + "loss": 0.0127, + "reward": 1.435058355331421, + "reward_std": 0.17073744535446167, + "rewards/accuracy_reward_stage2": 0.4350583851337433, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 302 + }, + { + "completion_length": 11.578125, + "epoch": 0.05309269318380936, + "grad_norm": 25.73115764646642, + "kl": 0.072265625, + "learning_rate": 9.470825302260381e-07, + "loss": 0.0289, + "reward": 1.601118564605713, + "reward_std": 0.28823572397232056, + "rewards/accuracy_reward_stage2": 0.6011185646057129, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 303 + }, + { + "completion_length": 7.828125, + "epoch": 0.053267916593656914, + "grad_norm": 21.950921128147304, + "kl": 0.046142578125, + "learning_rate": 9.469073068161906e-07, + "loss": -0.0387, + "reward": 1.4147183895111084, + "reward_std": 0.2501143217086792, + "rewards/accuracy_reward_stage2": 0.5709684491157532, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 304 + }, + { + "completion_length": 8.578125, + "epoch": 0.05344314000350447, + "grad_norm": 77.45405609155354, + "kl": 0.376953125, + "learning_rate": 9.46732083406343e-07, + "loss": 0.106, + "reward": 1.5750467777252197, + "reward_std": 0.15764901041984558, + "rewards/accuracy_reward_stage2": 0.5906718373298645, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 305 + }, + { + "completion_length": 11.578125, + "epoch": 0.05361836341335202, + "grad_norm": 17.972984505078564, + "kl": 0.09375, + "learning_rate": 9.465568599964955e-07, + "loss": 0.0374, + "reward": 1.3125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward_stage2": 0.4375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 306 + }, + { + "completion_length": 9.234375, + "epoch": 0.05379358682319958, + "grad_norm": 21.731586710719984, + "kl": 0.07763671875, + "learning_rate": 9.46381636586648e-07, + "loss": -0.0044, + "reward": 1.406597375869751, + "reward_std": 0.2542756199836731, + "rewards/accuracy_reward_stage2": 0.42222240567207336, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 307 + }, + { + "completion_length": 10.625, + "epoch": 0.053968810233047136, + "grad_norm": 49.507632531802265, + "kl": 0.349609375, + "learning_rate": 9.462064131768004e-07, + "loss": 0.1399, + "reward": 1.4937288761138916, + "reward_std": 0.19214007258415222, + "rewards/accuracy_reward_stage2": 0.6187288761138916, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 308 + }, + { + "completion_length": 9.125, + "epoch": 0.05414403364289469, + "grad_norm": 16.914177137657322, + "kl": 0.0255126953125, + "learning_rate": 9.460311897669528e-07, + "loss": 0.0102, + "reward": 1.5506947040557861, + "reward_std": 0.1157640889286995, + "rewards/accuracy_reward_stage2": 0.5506946444511414, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 309 + }, + { + "completion_length": 15.046875, + "epoch": 0.054319257052742244, + "grad_norm": 14.960025362961645, + "kl": 0.022216796875, + "learning_rate": 9.458559663571053e-07, + "loss": 0.0089, + "reward": 1.4643514156341553, + "reward_std": 0.16417983174324036, + "rewards/accuracy_reward_stage2": 0.4643513560295105, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 310 + }, + { + "completion_length": 18.390625, + "epoch": 0.054494480462589805, + "grad_norm": 21.473871880094755, + "kl": 0.0419921875, + "learning_rate": 9.456807429472577e-07, + "loss": 0.0168, + "reward": 1.3595951795578003, + "reward_std": 0.24870413541793823, + "rewards/accuracy_reward_stage2": 0.4845951795578003, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 311 + }, + { + "completion_length": 13.734375, + "epoch": 0.05466970387243736, + "grad_norm": 14.675544367354817, + "kl": 0.0269775390625, + "learning_rate": 9.455055195374102e-07, + "loss": -0.0334, + "reward": 1.4303240776062012, + "reward_std": 0.1907956451177597, + "rewards/accuracy_reward_stage2": 0.5709490180015564, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 312 + }, + { + "completion_length": 8.640625, + "epoch": 0.05484492728228491, + "grad_norm": 21.425310572049685, + "kl": 0.03125, + "learning_rate": 9.453302961275626e-07, + "loss": 0.0125, + "reward": 1.7457122802734375, + "reward_std": 0.2760339379310608, + "rewards/accuracy_reward_stage2": 0.745712399482727, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 313 + }, + { + "completion_length": 10.1875, + "epoch": 0.055020150692132466, + "grad_norm": 23.740271575825194, + "kl": 0.1279296875, + "learning_rate": 9.45155072717715e-07, + "loss": 0.051, + "reward": 1.4499235153198242, + "reward_std": 0.1825544536113739, + "rewards/accuracy_reward_stage2": 0.44992342591285706, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 314 + }, + { + "completion_length": 11.546875, + "epoch": 0.05519537410198003, + "grad_norm": 24.347783431235495, + "kl": 0.046630859375, + "learning_rate": 9.449798493078675e-07, + "loss": -0.0167, + "reward": 1.5818061828613281, + "reward_std": 0.36251744627952576, + "rewards/accuracy_reward_stage2": 0.5974311828613281, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 315 + }, + { + "completion_length": 15.59375, + "epoch": 0.05537059751182758, + "grad_norm": 26.675134620684368, + "kl": 0.25, + "learning_rate": 9.448046258980199e-07, + "loss": 0.1, + "reward": 1.3757495880126953, + "reward_std": 0.25632089376449585, + "rewards/accuracy_reward_stage2": 0.6257495284080505, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 316 + }, + { + "completion_length": 14.234375, + "epoch": 0.055545820921675135, + "grad_norm": 19.637641539670906, + "kl": 0.57421875, + "learning_rate": 9.446294024881724e-07, + "loss": 0.2295, + "reward": 1.5519661903381348, + "reward_std": 0.14337322115898132, + "rewards/accuracy_reward_stage2": 0.6769663095474243, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 317 + }, + { + "completion_length": 10.84375, + "epoch": 0.05572104433152269, + "grad_norm": 21.498652862245095, + "kl": 0.0211181640625, + "learning_rate": 9.444541790783249e-07, + "loss": -0.0338, + "reward": 1.237978219985962, + "reward_std": 0.14729207754135132, + "rewards/accuracy_reward_stage2": 0.2536032199859619, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 318 + }, + { + "completion_length": 7.671875, + "epoch": 0.05589626774137025, + "grad_norm": 77.66592293586824, + "kl": 0.0712890625, + "learning_rate": 9.442789556684772e-07, + "loss": -0.0049, + "reward": 1.20796537399292, + "reward_std": 0.19719843566417694, + "rewards/accuracy_reward_stage2": 0.22359028458595276, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 319 + }, + { + "completion_length": 17.5625, + "epoch": 0.0560714911512178, + "grad_norm": 24.167033254443226, + "kl": 0.0576171875, + "learning_rate": 9.441037322586297e-07, + "loss": 0.023, + "reward": 1.460500717163086, + "reward_std": 0.17372924089431763, + "rewards/accuracy_reward_stage2": 0.46050071716308594, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 320 + }, + { + "completion_length": 7.8125, + "epoch": 0.05624671456106536, + "grad_norm": 23.422114499103813, + "kl": 0.0284423828125, + "learning_rate": 9.439285088487821e-07, + "loss": 0.0114, + "reward": 1.719941258430481, + "reward_std": 0.22414857149124146, + "rewards/accuracy_reward_stage2": 0.7199413180351257, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 321 + }, + { + "completion_length": 14.03125, + "epoch": 0.05642193797091291, + "grad_norm": 21.216710069826014, + "kl": 0.04443359375, + "learning_rate": 9.437532854389346e-07, + "loss": 0.0177, + "reward": 1.4304391145706177, + "reward_std": 0.2377379685640335, + "rewards/accuracy_reward_stage2": 0.43043917417526245, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 322 + }, + { + "completion_length": 8.375, + "epoch": 0.05659716138076047, + "grad_norm": 15.917513479644082, + "kl": 0.048583984375, + "learning_rate": 9.435780620290871e-07, + "loss": 0.0194, + "reward": 1.3735301494598389, + "reward_std": 0.16220563650131226, + "rewards/accuracy_reward_stage2": 0.3735300600528717, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 323 + }, + { + "completion_length": 6.609375, + "epoch": 0.056772384790608026, + "grad_norm": 15.550109521299955, + "kl": 0.0230712890625, + "learning_rate": 9.434028386192395e-07, + "loss": 0.0092, + "reward": 1.5212457180023193, + "reward_std": 0.06378524005413055, + "rewards/accuracy_reward_stage2": 0.6462457180023193, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 324 + }, + { + "completion_length": 17.5, + "epoch": 0.05694760820045558, + "grad_norm": 24.026423483219208, + "kl": 0.09423828125, + "learning_rate": 9.43227615209392e-07, + "loss": 0.0088, + "reward": 1.6016652584075928, + "reward_std": 0.17152510583400726, + "rewards/accuracy_reward_stage2": 0.6172903776168823, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 325 + }, + { + "completion_length": 10.5, + "epoch": 0.05712283161030313, + "grad_norm": 22.942754962486735, + "kl": 0.0478515625, + "learning_rate": 9.430523917995444e-07, + "loss": 0.0192, + "reward": 1.5287861824035645, + "reward_std": 0.2127273827791214, + "rewards/accuracy_reward_stage2": 0.5287861227989197, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 326 + }, + { + "completion_length": 14.0625, + "epoch": 0.057298055020150694, + "grad_norm": 19.828393084324876, + "kl": 0.0859375, + "learning_rate": 9.428771683896968e-07, + "loss": 0.0055, + "reward": 1.4723576307296753, + "reward_std": 0.22869396209716797, + "rewards/accuracy_reward_stage2": 0.6129826307296753, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 327 + }, + { + "completion_length": 22.0, + "epoch": 0.05747327842999825, + "grad_norm": 19.45292668071304, + "kl": 0.03955078125, + "learning_rate": 9.427019449798493e-07, + "loss": 0.0158, + "reward": 1.2495118379592896, + "reward_std": 0.16045579314231873, + "rewards/accuracy_reward_stage2": 0.3745118975639343, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 328 + }, + { + "completion_length": 7.421875, + "epoch": 0.0576485018398458, + "grad_norm": 19.920681409272653, + "kl": 0.03564453125, + "learning_rate": 9.425267215700016e-07, + "loss": 0.0142, + "reward": 1.5295330286026, + "reward_std": 0.25502684712409973, + "rewards/accuracy_reward_stage2": 0.6545330286026001, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 329 + }, + { + "completion_length": 12.46875, + "epoch": 0.057823725249693356, + "grad_norm": 21.62160329452179, + "kl": 0.44921875, + "learning_rate": 9.423514981601541e-07, + "loss": 0.1801, + "reward": 1.4313299655914307, + "reward_std": 0.2229369878768921, + "rewards/accuracy_reward_stage2": 0.5563299655914307, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 330 + }, + { + "completion_length": 9.734375, + "epoch": 0.057998948659540916, + "grad_norm": 17.07241858301237, + "kl": 0.038818359375, + "learning_rate": 9.421762747503066e-07, + "loss": -0.0576, + "reward": 1.6263515949249268, + "reward_std": 0.1974027454853058, + "rewards/accuracy_reward_stage2": 0.6576014757156372, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 331 + }, + { + "completion_length": 9.40625, + "epoch": 0.05817417206938847, + "grad_norm": 20.23984568805099, + "kl": 0.1005859375, + "learning_rate": 9.42001051340459e-07, + "loss": 0.0019, + "reward": 1.738767147064209, + "reward_std": 0.19900760054588318, + "rewards/accuracy_reward_stage2": 0.7543920278549194, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 332 + }, + { + "completion_length": 7.765625, + "epoch": 0.058349395479236024, + "grad_norm": 25.418557978515302, + "kl": 0.060546875, + "learning_rate": 9.418258279306115e-07, + "loss": -0.0145, + "reward": 1.5975984334945679, + "reward_std": 0.2777034044265747, + "rewards/accuracy_reward_stage2": 0.6132233738899231, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 333 + }, + { + "completion_length": 8.984375, + "epoch": 0.058524618889083585, + "grad_norm": 39.31755014042999, + "kl": 0.2216796875, + "learning_rate": 9.41650604520764e-07, + "loss": 0.0369, + "reward": 1.4411017894744873, + "reward_std": 0.2632755935192108, + "rewards/accuracy_reward_stage2": 0.47235187888145447, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 334 + }, + { + "completion_length": 6.015625, + "epoch": 0.05869984229893114, + "grad_norm": 13.464327717734951, + "kl": 0.0186767578125, + "learning_rate": 9.414753811109164e-07, + "loss": -0.0141, + "reward": 1.824305534362793, + "reward_std": 0.07549665868282318, + "rewards/accuracy_reward_stage2": 0.839930534362793, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 335 + }, + { + "completion_length": 9.671875, + "epoch": 0.05887506570877869, + "grad_norm": 19.3182318322218, + "kl": 0.103515625, + "learning_rate": 9.413001577010689e-07, + "loss": 0.0215, + "reward": 1.6234338283538818, + "reward_std": 0.20951224863529205, + "rewards/accuracy_reward_stage2": 0.7640588283538818, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 336 + }, + { + "completion_length": 14.65625, + "epoch": 0.05905028911862625, + "grad_norm": 17.7357517309841, + "kl": 0.01904296875, + "learning_rate": 9.411249342912213e-07, + "loss": 0.0076, + "reward": 1.3733090162277222, + "reward_std": 0.11433231830596924, + "rewards/accuracy_reward_stage2": 0.6233089566230774, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 337 + }, + { + "completion_length": 11.671875, + "epoch": 0.05922551252847381, + "grad_norm": 16.671151395451187, + "kl": 0.06982421875, + "learning_rate": 9.409497108813738e-07, + "loss": -0.0352, + "reward": 1.4322497844696045, + "reward_std": 0.2082475870847702, + "rewards/accuracy_reward_stage2": 0.4634997248649597, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 338 + }, + { + "completion_length": 11.59375, + "epoch": 0.05940073593832136, + "grad_norm": 19.83687104514724, + "kl": 0.060546875, + "learning_rate": 9.407744874715261e-07, + "loss": -0.02, + "reward": 1.4493414163589478, + "reward_std": 0.2299998253583908, + "rewards/accuracy_reward_stage2": 0.5899664163589478, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 339 + }, + { + "completion_length": 15.78125, + "epoch": 0.059575959348168915, + "grad_norm": 21.61938135228256, + "kl": 0.056884765625, + "learning_rate": 9.405992640616785e-07, + "loss": 0.0228, + "reward": 1.6063339710235596, + "reward_std": 0.23589658737182617, + "rewards/accuracy_reward_stage2": 0.6063340306282043, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 340 + }, + { + "completion_length": 7.671875, + "epoch": 0.05975118275801647, + "grad_norm": 16.099045116611332, + "kl": 0.033203125, + "learning_rate": 9.40424040651831e-07, + "loss": -0.0984, + "reward": 1.203125, + "reward_std": 0.19044627249240875, + "rewards/accuracy_reward_stage2": 0.265625, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 341 + }, + { + "completion_length": 10.09375, + "epoch": 0.05992640616786403, + "grad_norm": 17.02533664481703, + "kl": 0.039306640625, + "learning_rate": 9.402488172419835e-07, + "loss": 0.0157, + "reward": 1.7135874032974243, + "reward_std": 0.18373428285121918, + "rewards/accuracy_reward_stage2": 0.7135874032974243, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 342 + }, + { + "completion_length": 8.0625, + "epoch": 0.060101629577711584, + "grad_norm": 16.60695937815947, + "kl": 0.01458740234375, + "learning_rate": 9.400735938321359e-07, + "loss": -0.0231, + "reward": 1.6499578952789307, + "reward_std": 0.1063762977719307, + "rewards/accuracy_reward_stage2": 0.6655828952789307, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 343 + }, + { + "completion_length": 12.5, + "epoch": 0.06027685298755914, + "grad_norm": 21.365233496072975, + "kl": 0.56640625, + "learning_rate": 9.398983704222884e-07, + "loss": 0.2267, + "reward": 1.409088134765625, + "reward_std": 0.11065279692411423, + "rewards/accuracy_reward_stage2": 0.5340880751609802, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 344 + }, + { + "completion_length": 6.984375, + "epoch": 0.06045207639740669, + "grad_norm": 20.21324822494437, + "kl": 0.0289306640625, + "learning_rate": 9.397231470124408e-07, + "loss": 0.0116, + "reward": 1.7402604818344116, + "reward_std": 0.20953799784183502, + "rewards/accuracy_reward_stage2": 0.7402604818344116, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 345 + }, + { + "completion_length": 12.8125, + "epoch": 0.06062729980725425, + "grad_norm": 30.088987680319942, + "kl": 0.4296875, + "learning_rate": 9.395479236025933e-07, + "loss": 0.1713, + "reward": 1.4402340650558472, + "reward_std": 0.3298301696777344, + "rewards/accuracy_reward_stage2": 0.5652340650558472, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 346 + }, + { + "completion_length": 17.25, + "epoch": 0.060802523217101806, + "grad_norm": 16.63132010533935, + "kl": 0.056396484375, + "learning_rate": 9.393727001927458e-07, + "loss": 0.0226, + "reward": 1.2534388303756714, + "reward_std": 0.12893691658973694, + "rewards/accuracy_reward_stage2": 0.37843888998031616, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 347 + }, + { + "completion_length": 12.859375, + "epoch": 0.06097774662694936, + "grad_norm": 19.911215792220517, + "kl": 0.057861328125, + "learning_rate": 9.391974767828981e-07, + "loss": 0.0231, + "reward": 1.7468256950378418, + "reward_std": 0.16623491048812866, + "rewards/accuracy_reward_stage2": 0.7468256950378418, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 348 + }, + { + "completion_length": 13.375, + "epoch": 0.061152970036796914, + "grad_norm": 16.62225918229597, + "kl": 0.04052734375, + "learning_rate": 9.390222533730506e-07, + "loss": 0.0162, + "reward": 1.4750198125839233, + "reward_std": 0.08836042135953903, + "rewards/accuracy_reward_stage2": 0.4750198423862457, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 349 + }, + { + "completion_length": 9.453125, + "epoch": 0.061328193446644474, + "grad_norm": 17.918736470818693, + "kl": 0.0264892578125, + "learning_rate": 9.388470299632031e-07, + "loss": -0.0622, + "reward": 1.5052083730697632, + "reward_std": 0.2088155895471573, + "rewards/accuracy_reward_stage2": 0.5677083134651184, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 350 + }, + { + "completion_length": 16.828125, + "epoch": 0.06150341685649203, + "grad_norm": 18.93734858541338, + "kl": 0.029052734375, + "learning_rate": 9.386718065533555e-07, + "loss": 0.0116, + "reward": 1.6041667461395264, + "reward_std": 0.1329318881034851, + "rewards/accuracy_reward_stage2": 0.6041666269302368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 351 + }, + { + "completion_length": 14.65625, + "epoch": 0.06167864026633958, + "grad_norm": 13.115570119136578, + "kl": 0.007659912109375, + "learning_rate": 9.384965831435079e-07, + "loss": -0.0411, + "reward": 1.5149922370910645, + "reward_std": 0.10126683115959167, + "rewards/accuracy_reward_stage2": 0.5306171178817749, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 352 + }, + { + "completion_length": 10.484375, + "epoch": 0.061853863676187136, + "grad_norm": 23.67866454130065, + "kl": 0.0673828125, + "learning_rate": 9.383213597336603e-07, + "loss": 0.0268, + "reward": 1.462594747543335, + "reward_std": 0.2419978380203247, + "rewards/accuracy_reward_stage2": 0.4625946879386902, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 353 + }, + { + "completion_length": 10.203125, + "epoch": 0.0620290870860347, + "grad_norm": 28.223532476857716, + "kl": 0.054443359375, + "learning_rate": 9.381461363238128e-07, + "loss": 0.0217, + "reward": 1.5003230571746826, + "reward_std": 0.18812508881092072, + "rewards/accuracy_reward_stage2": 0.6253230571746826, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 354 + }, + { + "completion_length": 12.34375, + "epoch": 0.06220431049588225, + "grad_norm": 17.401607076165597, + "kl": 0.0191650390625, + "learning_rate": 9.379709129139653e-07, + "loss": 0.0077, + "reward": 1.508453369140625, + "reward_std": 0.10828704386949539, + "rewards/accuracy_reward_stage2": 0.508453369140625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 355 + }, + { + "completion_length": 12.484375, + "epoch": 0.062379533905729805, + "grad_norm": 23.054852828053292, + "kl": 0.142578125, + "learning_rate": 9.377956895041177e-07, + "loss": 0.0571, + "reward": 1.3993406295776367, + "reward_std": 0.12222443521022797, + "rewards/accuracy_reward_stage2": 0.6493405103683472, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 356 + }, + { + "completion_length": 9.296875, + "epoch": 0.06255475731557736, + "grad_norm": 19.277548157506466, + "kl": 0.125, + "learning_rate": 9.376204660942702e-07, + "loss": 0.046, + "reward": 1.6770656108856201, + "reward_std": 0.14195884764194489, + "rewards/accuracy_reward_stage2": 0.8020656704902649, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 357 + }, + { + "completion_length": 8.9375, + "epoch": 0.06272998072542492, + "grad_norm": 14.01438455619914, + "kl": 0.0203857421875, + "learning_rate": 9.374452426844227e-07, + "loss": 0.0082, + "reward": 1.296875, + "reward_std": 0.1804211586713791, + "rewards/accuracy_reward_stage2": 0.328125, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 358 + }, + { + "completion_length": 9.578125, + "epoch": 0.06290520413527247, + "grad_norm": 18.87541757343168, + "kl": 0.06591796875, + "learning_rate": 9.37270019274575e-07, + "loss": 0.0265, + "reward": 1.4920721054077148, + "reward_std": 0.13774442672729492, + "rewards/accuracy_reward_stage2": 0.49207204580307007, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 359 + }, + { + "completion_length": 10.625, + "epoch": 0.06308042754512003, + "grad_norm": 19.89411336926994, + "kl": 0.0615234375, + "learning_rate": 9.370947958647275e-07, + "loss": -0.0196, + "reward": 1.503807544708252, + "reward_std": 0.26093435287475586, + "rewards/accuracy_reward_stage2": 0.6444324851036072, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 360 + }, + { + "completion_length": 15.46875, + "epoch": 0.06325565095496759, + "grad_norm": 23.077935193360588, + "kl": 0.130859375, + "learning_rate": 9.369195724548799e-07, + "loss": 0.0129, + "reward": 1.599015235900879, + "reward_std": 0.2068370282649994, + "rewards/accuracy_reward_stage2": 0.6146402359008789, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 361 + }, + { + "completion_length": 7.875, + "epoch": 0.06343087436481513, + "grad_norm": 21.35498118118063, + "kl": 0.0303955078125, + "learning_rate": 9.367443490450324e-07, + "loss": 0.0122, + "reward": 1.7239583730697632, + "reward_std": 0.28599968552589417, + "rewards/accuracy_reward_stage2": 0.7239583730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 362 + }, + { + "completion_length": 12.40625, + "epoch": 0.0636060977746627, + "grad_norm": 1170.2714969237206, + "kl": 3.21875, + "learning_rate": 9.365691256351849e-07, + "loss": 1.2427, + "reward": 1.412689208984375, + "reward_std": 0.1700247824192047, + "rewards/accuracy_reward_stage2": 0.6783140897750854, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 363 + }, + { + "completion_length": 8.703125, + "epoch": 0.06378132118451026, + "grad_norm": 21.473732277517065, + "kl": 0.024658203125, + "learning_rate": 9.363939022253373e-07, + "loss": 0.0099, + "reward": 1.6391968727111816, + "reward_std": 0.14921677112579346, + "rewards/accuracy_reward_stage2": 0.6391969323158264, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 364 + }, + { + "completion_length": 5.953125, + "epoch": 0.0639565445943578, + "grad_norm": 24.9623619651563, + "kl": 0.08837890625, + "learning_rate": 9.362186788154897e-07, + "loss": 0.0144, + "reward": 1.559525489807129, + "reward_std": 0.3174114227294922, + "rewards/accuracy_reward_stage2": 0.5751504898071289, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 365 + }, + { + "completion_length": 8.703125, + "epoch": 0.06413176800420536, + "grad_norm": 25.036012178534264, + "kl": 0.12451171875, + "learning_rate": 9.360434554056421e-07, + "loss": 0.0209, + "reward": 1.32099449634552, + "reward_std": 0.31950968503952026, + "rewards/accuracy_reward_stage2": 0.4616195559501648, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 366 + }, + { + "completion_length": 11.296875, + "epoch": 0.06430699141405291, + "grad_norm": 21.858208443169964, + "kl": 0.119140625, + "learning_rate": 9.358682319957946e-07, + "loss": -0.0174, + "reward": 1.502138614654541, + "reward_std": 0.2199799120426178, + "rewards/accuracy_reward_stage2": 0.5333885550498962, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 367 + }, + { + "completion_length": 11.859375, + "epoch": 0.06448221482390047, + "grad_norm": 40.60899905178031, + "kl": 0.06689453125, + "learning_rate": 9.35693008585947e-07, + "loss": 0.0267, + "reward": 1.6392221450805664, + "reward_std": 0.27685898542404175, + "rewards/accuracy_reward_stage2": 0.6392222046852112, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 368 + }, + { + "completion_length": 11.3125, + "epoch": 0.06465743823374803, + "grad_norm": 18.414448641824837, + "kl": 0.10302734375, + "learning_rate": 9.355177851760994e-07, + "loss": 0.0411, + "reward": 1.4526405334472656, + "reward_std": 0.155037060379982, + "rewards/accuracy_reward_stage2": 0.4526405334472656, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 369 + }, + { + "completion_length": 7.71875, + "epoch": 0.06483266164359558, + "grad_norm": 24.29576979333205, + "kl": 0.0703125, + "learning_rate": 9.353425617662519e-07, + "loss": -0.1017, + "reward": 1.6652096509933472, + "reward_std": 0.4130101203918457, + "rewards/accuracy_reward_stage2": 0.7277096509933472, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 370 + }, + { + "completion_length": 8.078125, + "epoch": 0.06500788505344314, + "grad_norm": 17.81540268602905, + "kl": 0.061767578125, + "learning_rate": 9.351673383564044e-07, + "loss": 0.0246, + "reward": 1.6285955905914307, + "reward_std": 0.16550035774707794, + "rewards/accuracy_reward_stage2": 0.6285956501960754, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 371 + }, + { + "completion_length": 10.15625, + "epoch": 0.0651831084632907, + "grad_norm": 45.18264689694991, + "kl": 0.265625, + "learning_rate": 9.349921149465568e-07, + "loss": 0.0619, + "reward": 1.517066478729248, + "reward_std": 0.13450536131858826, + "rewards/accuracy_reward_stage2": 0.5326914191246033, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 372 + }, + { + "completion_length": 9.140625, + "epoch": 0.06535833187313825, + "grad_norm": 17.577834154040076, + "kl": 0.046875, + "learning_rate": 9.348168915367093e-07, + "loss": 0.0187, + "reward": 1.4530048370361328, + "reward_std": 0.07536228746175766, + "rewards/accuracy_reward_stage2": 0.45300477743148804, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 373 + }, + { + "completion_length": 12.71875, + "epoch": 0.06553355528298581, + "grad_norm": 23.31006461032496, + "kl": 0.1767578125, + "learning_rate": 9.346416681268617e-07, + "loss": 0.0032, + "reward": 1.6315686702728271, + "reward_std": 0.14052413403987885, + "rewards/accuracy_reward_stage2": 0.7878186106681824, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 374 + }, + { + "completion_length": 9.421875, + "epoch": 0.06570877869283337, + "grad_norm": 18.109201679127302, + "kl": 0.068359375, + "learning_rate": 9.344664447170142e-07, + "loss": 0.0273, + "reward": 1.3579471111297607, + "reward_std": 0.136207714676857, + "rewards/accuracy_reward_stage2": 0.3579471707344055, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 375 + }, + { + "completion_length": 7.921875, + "epoch": 0.06588400210268092, + "grad_norm": 13.861200917194214, + "kl": 0.031494140625, + "learning_rate": 9.342912213071667e-07, + "loss": 0.0126, + "reward": 1.6228134632110596, + "reward_std": 0.1539314091205597, + "rewards/accuracy_reward_stage2": 0.6228134632110596, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 376 + }, + { + "completion_length": 7.234375, + "epoch": 0.06605922551252848, + "grad_norm": 18.323722906868795, + "kl": 0.0107421875, + "learning_rate": 9.34115997897319e-07, + "loss": 0.0043, + "reward": 1.5229077339172363, + "reward_std": 0.08434540033340454, + "rewards/accuracy_reward_stage2": 0.5229077339172363, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 377 + }, + { + "completion_length": 10.015625, + "epoch": 0.06623444892237602, + "grad_norm": 21.175979440293844, + "kl": 0.045166015625, + "learning_rate": 9.339407744874714e-07, + "loss": 0.0181, + "reward": 1.550042748451233, + "reward_std": 0.2471158653497696, + "rewards/accuracy_reward_stage2": 0.5500428080558777, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 378 + }, + { + "completion_length": 10.890625, + "epoch": 0.06640967233222358, + "grad_norm": 16.39289635513477, + "kl": 0.04541015625, + "learning_rate": 9.337655510776239e-07, + "loss": 0.0181, + "reward": 1.7313203811645508, + "reward_std": 0.1756003499031067, + "rewards/accuracy_reward_stage2": 0.7313204407691956, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 379 + }, + { + "completion_length": 8.6875, + "epoch": 0.06658489574207115, + "grad_norm": 19.84227908797742, + "kl": 0.0289306640625, + "learning_rate": 9.335903276677763e-07, + "loss": 0.0116, + "reward": 1.377845048904419, + "reward_std": 0.18648402392864227, + "rewards/accuracy_reward_stage2": 0.5028449892997742, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 380 + }, + { + "completion_length": 7.9375, + "epoch": 0.06676011915191869, + "grad_norm": 25.930310165133555, + "kl": 0.0693359375, + "learning_rate": 9.334151042579288e-07, + "loss": 0.0278, + "reward": 1.5856380462646484, + "reward_std": 0.13614915311336517, + "rewards/accuracy_reward_stage2": 0.7106380462646484, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 381 + }, + { + "completion_length": 11.734375, + "epoch": 0.06693534256176625, + "grad_norm": 20.751357067789666, + "kl": 0.10302734375, + "learning_rate": 9.332398808480812e-07, + "loss": -0.0029, + "reward": 1.272355318069458, + "reward_std": 0.25995975732803345, + "rewards/accuracy_reward_stage2": 0.28798040747642517, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 382 + }, + { + "completion_length": 11.015625, + "epoch": 0.06711056597161381, + "grad_norm": 17.974724434571502, + "kl": 0.07470703125, + "learning_rate": 9.330646574382337e-07, + "loss": -0.0144, + "reward": 1.5834121704101562, + "reward_std": 0.19877059757709503, + "rewards/accuracy_reward_stage2": 0.5990370512008667, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 383 + }, + { + "completion_length": 8.265625, + "epoch": 0.06728578938146136, + "grad_norm": 32.14407547624172, + "kl": 0.068359375, + "learning_rate": 9.328894340283862e-07, + "loss": -0.0057, + "reward": 1.5889458656311035, + "reward_std": 0.2500606179237366, + "rewards/accuracy_reward_stage2": 0.6045708656311035, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 384 + }, + { + "completion_length": 11.734375, + "epoch": 0.06746101279130892, + "grad_norm": 15420.533479337024, + "kl": 35.75, + "learning_rate": 9.327142106185386e-07, + "loss": 14.2103, + "reward": 1.4057738780975342, + "reward_std": 0.1744045913219452, + "rewards/accuracy_reward_stage2": 0.5463988184928894, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 385 + }, + { + "completion_length": 9.234375, + "epoch": 0.06763623620115647, + "grad_norm": 31.137419894167664, + "kl": 0.0439453125, + "learning_rate": 9.325389872086911e-07, + "loss": -0.02, + "reward": 1.6810356378555298, + "reward_std": 0.24055354297161102, + "rewards/accuracy_reward_stage2": 0.8216606378555298, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 386 + }, + { + "completion_length": 11.359375, + "epoch": 0.06781145961100403, + "grad_norm": 21.441134645043906, + "kl": 0.048583984375, + "learning_rate": 9.323637637988436e-07, + "loss": -0.0248, + "reward": 1.5098655223846436, + "reward_std": 0.19568142294883728, + "rewards/accuracy_reward_stage2": 0.5254905223846436, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 387 + }, + { + "completion_length": 9.515625, + "epoch": 0.06798668302085159, + "grad_norm": 18.926669594347928, + "kl": 0.0296630859375, + "learning_rate": 9.321885403889959e-07, + "loss": -0.0195, + "reward": 1.563733458518982, + "reward_std": 0.26144492626190186, + "rewards/accuracy_reward_stage2": 0.5793584585189819, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 388 + }, + { + "completion_length": 17.140625, + "epoch": 0.06816190643069914, + "grad_norm": 21.75447769013038, + "kl": 0.0419921875, + "learning_rate": 9.320133169791484e-07, + "loss": -0.0271, + "reward": 1.5931763648986816, + "reward_std": 0.1997213512659073, + "rewards/accuracy_reward_stage2": 0.6088013648986816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 389 + }, + { + "completion_length": 8.5625, + "epoch": 0.0683371298405467, + "grad_norm": 20.67454053924187, + "kl": 0.0888671875, + "learning_rate": 9.318380935693007e-07, + "loss": -0.0701, + "reward": 1.4614261388778687, + "reward_std": 0.2886839509010315, + "rewards/accuracy_reward_stage2": 0.5083011984825134, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 390 + }, + { + "completion_length": 10.359375, + "epoch": 0.06851235325039426, + "grad_norm": 21.8958041612382, + "kl": 0.039306640625, + "learning_rate": 9.316628701594532e-07, + "loss": 0.0157, + "reward": 1.316678524017334, + "reward_std": 0.14866000413894653, + "rewards/accuracy_reward_stage2": 0.44167858362197876, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 391 + }, + { + "completion_length": 12.953125, + "epoch": 0.0686875766602418, + "grad_norm": 17.742755062439734, + "kl": 0.017822265625, + "learning_rate": 9.314876467496057e-07, + "loss": 0.0071, + "reward": 1.4223427772521973, + "reward_std": 0.15768727660179138, + "rewards/accuracy_reward_stage2": 0.4223426580429077, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 392 + }, + { + "completion_length": 10.84375, + "epoch": 0.06886280007008937, + "grad_norm": 18.22403562891549, + "kl": 0.0625, + "learning_rate": 9.313124233397581e-07, + "loss": 0.025, + "reward": 1.4049084186553955, + "reward_std": 0.2654655873775482, + "rewards/accuracy_reward_stage2": 0.5299084186553955, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 393 + }, + { + "completion_length": 9.546875, + "epoch": 0.06903802347993691, + "grad_norm": 20.273210311381384, + "kl": 0.0810546875, + "learning_rate": 9.311371999299106e-07, + "loss": -0.0119, + "reward": 1.4684481620788574, + "reward_std": 0.23478779196739197, + "rewards/accuracy_reward_stage2": 0.48407310247421265, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 394 + }, + { + "completion_length": 9.21875, + "epoch": 0.06921324688978447, + "grad_norm": 21.644046040667575, + "kl": 0.056884765625, + "learning_rate": 9.309619765200631e-07, + "loss": 0.0227, + "reward": 1.632354736328125, + "reward_std": 0.1535060554742813, + "rewards/accuracy_reward_stage2": 0.6323546767234802, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 395 + }, + { + "completion_length": 11.6875, + "epoch": 0.06938847029963204, + "grad_norm": 16.892620916232747, + "kl": 0.034912109375, + "learning_rate": 9.307867531102155e-07, + "loss": 0.0139, + "reward": 1.739698886871338, + "reward_std": 0.21730005741119385, + "rewards/accuracy_reward_stage2": 0.7396988868713379, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 396 + }, + { + "completion_length": 18.265625, + "epoch": 0.06956369370947958, + "grad_norm": 18.328691362249153, + "kl": 54.5, + "learning_rate": 9.30611529700368e-07, + "loss": 21.8455, + "reward": 1.3181016445159912, + "reward_std": 0.09811335802078247, + "rewards/accuracy_reward_stage2": 0.4587266445159912, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 397 + }, + { + "completion_length": 9.15625, + "epoch": 0.06973891711932714, + "grad_norm": 25.60445091883008, + "kl": 0.130859375, + "learning_rate": 9.304363062905203e-07, + "loss": 0.0082, + "reward": 1.54931640625, + "reward_std": 0.23682433366775513, + "rewards/accuracy_reward_stage2": 0.56494140625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 398 + }, + { + "completion_length": 27.078125, + "epoch": 0.0699141405291747, + "grad_norm": 20.710243539061945, + "kl": 0.06201171875, + "learning_rate": 9.302610828806728e-07, + "loss": 0.0036, + "reward": 1.312551498413086, + "reward_std": 0.16797444224357605, + "rewards/accuracy_reward_stage2": 0.3281765580177307, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 399 + }, + { + "completion_length": 9.46875, + "epoch": 0.07008936393902225, + "grad_norm": 19.818484838510113, + "kl": 0.04296875, + "learning_rate": 9.300858594708253e-07, + "loss": 0.0172, + "reward": 1.495539903640747, + "reward_std": 0.1228901818394661, + "rewards/accuracy_reward_stage2": 0.49553996324539185, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 400 + }, + { + "completion_length": 8.84375, + "epoch": 0.07026458734886981, + "grad_norm": 59.341330787930666, + "kl": 0.0869140625, + "learning_rate": 9.299106360609777e-07, + "loss": 0.0347, + "reward": 1.8055976629257202, + "reward_std": 0.2616202235221863, + "rewards/accuracy_reward_stage2": 0.8055975437164307, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 401 + }, + { + "completion_length": 14.328125, + "epoch": 0.07043981075871736, + "grad_norm": 18.6793268588412, + "kl": 0.07666015625, + "learning_rate": 9.297354126511302e-07, + "loss": -0.0076, + "reward": 1.4062790870666504, + "reward_std": 0.1280801147222519, + "rewards/accuracy_reward_stage2": 0.4219040870666504, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 402 + }, + { + "completion_length": 9.515625, + "epoch": 0.07061503416856492, + "grad_norm": 17.36545255201424, + "kl": 0.0859375, + "learning_rate": 9.295601892412826e-07, + "loss": 0.0343, + "reward": 1.599717378616333, + "reward_std": 0.23323816061019897, + "rewards/accuracy_reward_stage2": 0.724717378616333, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 403 + }, + { + "completion_length": 14.421875, + "epoch": 0.07079025757841248, + "grad_norm": 22.000875240676198, + "kl": 0.072265625, + "learning_rate": 9.29384965831435e-07, + "loss": -0.036, + "reward": 1.3782211542129517, + "reward_std": 0.2272408902645111, + "rewards/accuracy_reward_stage2": 0.40947121381759644, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 404 + }, + { + "completion_length": 16.359375, + "epoch": 0.07096548098826003, + "grad_norm": 25.677116196214612, + "kl": 63.0, + "learning_rate": 9.292097424215875e-07, + "loss": 25.2293, + "reward": 1.4082450866699219, + "reward_std": 0.2820996642112732, + "rewards/accuracy_reward_stage2": 0.5488699674606323, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 405 + }, + { + "completion_length": 10.546875, + "epoch": 0.07114070439810759, + "grad_norm": 17.342161817612745, + "kl": 0.02880859375, + "learning_rate": 9.290345190117399e-07, + "loss": -0.0326, + "reward": 1.8020833730697632, + "reward_std": 0.19606460630893707, + "rewards/accuracy_reward_stage2": 0.8177083730697632, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 406 + }, + { + "completion_length": 13.21875, + "epoch": 0.07131592780795515, + "grad_norm": 22.355008720403898, + "kl": 0.02294921875, + "learning_rate": 9.288592956018924e-07, + "loss": 0.0092, + "reward": 1.617870569229126, + "reward_std": 0.2922362983226776, + "rewards/accuracy_reward_stage2": 0.6178706288337708, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 407 + }, + { + "completion_length": 8.578125, + "epoch": 0.0714911512178027, + "grad_norm": 23.973990386827367, + "kl": 0.064453125, + "learning_rate": 9.286840721920448e-07, + "loss": 0.0258, + "reward": 1.624790072441101, + "reward_std": 0.1845066249370575, + "rewards/accuracy_reward_stage2": 0.6247899532318115, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 408 + }, + { + "completion_length": 7.5, + "epoch": 0.07166637462765026, + "grad_norm": 21.002992389593107, + "kl": 0.068359375, + "learning_rate": 9.285088487821972e-07, + "loss": -0.0159, + "reward": 1.509068489074707, + "reward_std": 0.25469109416007996, + "rewards/accuracy_reward_stage2": 0.524693489074707, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 409 + }, + { + "completion_length": 8.265625, + "epoch": 0.0718415980374978, + "grad_norm": 24.076299669631023, + "kl": 0.0703125, + "learning_rate": 9.283336253723497e-07, + "loss": 0.0281, + "reward": 1.7792377471923828, + "reward_std": 0.15023738145828247, + "rewards/accuracy_reward_stage2": 0.7792376279830933, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 410 + }, + { + "completion_length": 16.296875, + "epoch": 0.07201682144734536, + "grad_norm": 68.66799831706935, + "kl": 55.25, + "learning_rate": 9.281584019625022e-07, + "loss": 22.1182, + "reward": 1.3476190567016602, + "reward_std": 0.3558818995952606, + "rewards/accuracy_reward_stage2": 0.48824408650398254, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 411 + }, + { + "completion_length": 13.9375, + "epoch": 0.07219204485719292, + "grad_norm": 25.391220966759903, + "kl": 0.15625, + "learning_rate": 9.279831785526546e-07, + "loss": 0.0061, + "reward": 1.4034576416015625, + "reward_std": 0.30644452571868896, + "rewards/accuracy_reward_stage2": 0.5597076416015625, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 412 + }, + { + "completion_length": 18.84375, + "epoch": 0.07236726826704047, + "grad_norm": 23223.505120208472, + "kl": 342.0, + "learning_rate": 9.278079551428071e-07, + "loss": 137.4967, + "reward": 1.2346117496490479, + "reward_std": 0.1464797854423523, + "rewards/accuracy_reward_stage2": 0.35961174964904785, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 413 + }, + { + "completion_length": 8.640625, + "epoch": 0.07254249167688803, + "grad_norm": 15.935590483914812, + "kl": 0.047119140625, + "learning_rate": 9.276327317329595e-07, + "loss": -0.0254, + "reward": 1.4736135005950928, + "reward_std": 0.18977496027946472, + "rewards/accuracy_reward_stage2": 0.6142385601997375, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 414 + }, + { + "completion_length": 7.765625, + "epoch": 0.0727177150867356, + "grad_norm": 20.58056946058028, + "kl": 0.11328125, + "learning_rate": 9.27457508323112e-07, + "loss": 0.0453, + "reward": 1.2243397235870361, + "reward_std": 0.20016539096832275, + "rewards/accuracy_reward_stage2": 0.47433966398239136, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 415 + }, + { + "completion_length": 8.671875, + "epoch": 0.07289293849658314, + "grad_norm": 20.599294957099612, + "kl": 0.1865234375, + "learning_rate": 9.272822849132644e-07, + "loss": 0.0746, + "reward": 1.811370849609375, + "reward_std": 0.10164359956979752, + "rewards/accuracy_reward_stage2": 0.936370849609375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 416 + }, + { + "completion_length": 23.25, + "epoch": 0.0730681619064307, + "grad_norm": 14.076138054590698, + "kl": 0.03369140625, + "learning_rate": 9.271070615034167e-07, + "loss": 0.0135, + "reward": 1.4144988059997559, + "reward_std": 0.07342597842216492, + "rewards/accuracy_reward_stage2": 0.41449886560440063, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 417 + }, + { + "completion_length": 11.59375, + "epoch": 0.07324338531627826, + "grad_norm": 20.929553261051325, + "kl": 0.0830078125, + "learning_rate": 9.269318380935692e-07, + "loss": 0.0333, + "reward": 1.3908796310424805, + "reward_std": 0.14465433359146118, + "rewards/accuracy_reward_stage2": 0.5158795714378357, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 418 + }, + { + "completion_length": 11.640625, + "epoch": 0.07341860872612581, + "grad_norm": 15.50809565451471, + "kl": 0.01287841796875, + "learning_rate": 9.267566146837217e-07, + "loss": 0.0052, + "reward": 1.3149559497833252, + "reward_std": 0.10769060254096985, + "rewards/accuracy_reward_stage2": 0.4399559497833252, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 419 + }, + { + "completion_length": 12.578125, + "epoch": 0.07359383213597337, + "grad_norm": 19.864710108306696, + "kl": 0.0791015625, + "learning_rate": 9.265813912738741e-07, + "loss": 0.0316, + "reward": 1.5394093990325928, + "reward_std": 0.13540780544281006, + "rewards/accuracy_reward_stage2": 0.539409339427948, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 420 + }, + { + "completion_length": 9.375, + "epoch": 0.07376905554582092, + "grad_norm": 17.71086059432014, + "kl": 0.07080078125, + "learning_rate": 9.264061678640266e-07, + "loss": -0.0489, + "reward": 1.6943539381027222, + "reward_std": 0.20985379815101624, + "rewards/accuracy_reward_stage2": 0.7256039977073669, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 421 + }, + { + "completion_length": 11.59375, + "epoch": 0.07394427895566848, + "grad_norm": 20.681192592863816, + "kl": 0.126953125, + "learning_rate": 9.26230944454179e-07, + "loss": -0.0402, + "reward": 1.3249560594558716, + "reward_std": 0.3091086447238922, + "rewards/accuracy_reward_stage2": 0.4812060594558716, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 422 + }, + { + "completion_length": 6.34375, + "epoch": 0.07411950236551604, + "grad_norm": 12.04230880142454, + "kl": 0.040283203125, + "learning_rate": 9.260557210443315e-07, + "loss": -0.0205, + "reward": 1.740767002105713, + "reward_std": 0.15345188975334167, + "rewards/accuracy_reward_stage2": 0.7563920617103577, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 423 + }, + { + "completion_length": 11.921875, + "epoch": 0.07429472577536358, + "grad_norm": 20.38539880893731, + "kl": 0.09130859375, + "learning_rate": 9.25880497634484e-07, + "loss": 0.0365, + "reward": 1.3158583641052246, + "reward_std": 0.2701030969619751, + "rewards/accuracy_reward_stage2": 0.31585830450057983, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 424 + }, + { + "completion_length": 7.609375, + "epoch": 0.07446994918521115, + "grad_norm": 21.084952418921368, + "kl": 0.0152587890625, + "learning_rate": 9.257052742246364e-07, + "loss": -0.0294, + "reward": 1.4352679252624512, + "reward_std": 0.20226189494132996, + "rewards/accuracy_reward_stage2": 0.450892835855484, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 425 + }, + { + "completion_length": 5.6875, + "epoch": 0.0746451725950587, + "grad_norm": 20.96660867060781, + "kl": 0.060546875, + "learning_rate": 9.255300508147889e-07, + "loss": -0.0199, + "reward": 1.4383260011672974, + "reward_std": 0.16609863936901093, + "rewards/accuracy_reward_stage2": 0.46957600116729736, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 426 + }, + { + "completion_length": 12.34375, + "epoch": 0.07482039600490625, + "grad_norm": 26.736215847509886, + "kl": 0.2099609375, + "learning_rate": 9.253548274049414e-07, + "loss": 0.0399, + "reward": 1.5008306503295898, + "reward_std": 0.2757319509983063, + "rewards/accuracy_reward_stage2": 0.6414556503295898, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 427 + }, + { + "completion_length": 8.578125, + "epoch": 0.07499561941475381, + "grad_norm": 20.00071764470865, + "kl": 0.0732421875, + "learning_rate": 9.251796039950936e-07, + "loss": -0.0211, + "reward": 1.5251744985580444, + "reward_std": 0.15365660190582275, + "rewards/accuracy_reward_stage2": 0.5564244985580444, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 428 + }, + { + "completion_length": 16.578125, + "epoch": 0.07517084282460136, + "grad_norm": 332.3153287615184, + "kl": 41.0, + "learning_rate": 9.250043805852461e-07, + "loss": 16.39, + "reward": 1.3449559211730957, + "reward_std": 0.4346715807914734, + "rewards/accuracy_reward_stage2": 0.5949559211730957, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 429 + }, + { + "completion_length": 8.515625, + "epoch": 0.07534606623444892, + "grad_norm": 26.579188374251853, + "kl": 0.04248046875, + "learning_rate": 9.248291571753985e-07, + "loss": -0.0163, + "reward": 1.3712437152862549, + "reward_std": 0.38221314549446106, + "rewards/accuracy_reward_stage2": 0.3868686556816101, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 430 + }, + { + "completion_length": 12.109375, + "epoch": 0.07552128964429648, + "grad_norm": 19.678076223219964, + "kl": 0.04736328125, + "learning_rate": 9.24653933765551e-07, + "loss": -0.0485, + "reward": 1.583035945892334, + "reward_std": 0.16018790006637573, + "rewards/accuracy_reward_stage2": 0.614285945892334, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 431 + }, + { + "completion_length": 12.890625, + "epoch": 0.07569651305414403, + "grad_norm": 20.088676161617673, + "kl": 75.5, + "learning_rate": 9.244787103557035e-07, + "loss": 30.3652, + "reward": 1.371154546737671, + "reward_std": 0.17230086028575897, + "rewards/accuracy_reward_stage2": 0.5117795467376709, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 432 + }, + { + "completion_length": 14.734375, + "epoch": 0.07587173646399159, + "grad_norm": 18.981420019902902, + "kl": 0.048583984375, + "learning_rate": 9.243034869458559e-07, + "loss": 0.0193, + "reward": 1.3368223905563354, + "reward_std": 0.21399948000907898, + "rewards/accuracy_reward_stage2": 0.46182242035865784, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 433 + }, + { + "completion_length": 20.46875, + "epoch": 0.07604695987383915, + "grad_norm": 19.334220522199253, + "kl": 53.0, + "learning_rate": 9.241282635360084e-07, + "loss": 21.2419, + "reward": 1.4832494258880615, + "reward_std": 0.12704303860664368, + "rewards/accuracy_reward_stage2": 0.6082494854927063, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 434 + }, + { + "completion_length": 22.46875, + "epoch": 0.0762221832836867, + "grad_norm": 23.000925090159036, + "kl": 59.0, + "learning_rate": 9.239530401261609e-07, + "loss": 23.7356, + "reward": 1.5550317764282227, + "reward_std": 0.22015462815761566, + "rewards/accuracy_reward_stage2": 0.6800317168235779, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 435 + }, + { + "completion_length": 9.046875, + "epoch": 0.07639740669353426, + "grad_norm": 28.476420892876988, + "kl": 0.1484375, + "learning_rate": 9.237778167163133e-07, + "loss": 0.0256, + "reward": 1.6102795600891113, + "reward_std": 0.20625394582748413, + "rewards/accuracy_reward_stage2": 0.6259044408798218, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 436 + }, + { + "completion_length": 12.53125, + "epoch": 0.0765726301033818, + "grad_norm": 24.19592301194606, + "kl": 0.0419921875, + "learning_rate": 9.236025933064658e-07, + "loss": -0.0274, + "reward": 1.4183125495910645, + "reward_std": 0.24271854758262634, + "rewards/accuracy_reward_stage2": 0.5589376091957092, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 437 + }, + { + "completion_length": 11.46875, + "epoch": 0.07674785351322937, + "grad_norm": 35.20563518798691, + "kl": 0.17578125, + "learning_rate": 9.234273698966181e-07, + "loss": 0.015, + "reward": 1.4013981819152832, + "reward_std": 0.27046674489974976, + "rewards/accuracy_reward_stage2": 0.4326481819152832, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 438 + }, + { + "completion_length": 10.640625, + "epoch": 0.07692307692307693, + "grad_norm": 21.04907556045925, + "kl": 0.0546875, + "learning_rate": 9.232521464867706e-07, + "loss": 0.009, + "reward": 1.5379629135131836, + "reward_std": 0.2769656777381897, + "rewards/accuracy_reward_stage2": 0.6785879135131836, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 439 + }, + { + "completion_length": 9.21875, + "epoch": 0.07709830033292447, + "grad_norm": 18.911621023379887, + "kl": 0.024169921875, + "learning_rate": 9.230769230769231e-07, + "loss": 0.0097, + "reward": 1.5322370529174805, + "reward_std": 0.18402306735515594, + "rewards/accuracy_reward_stage2": 0.5322371125221252, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 440 + }, + { + "completion_length": 10.4375, + "epoch": 0.07727352374277204, + "grad_norm": 15.798461479357341, + "kl": 0.0849609375, + "learning_rate": 9.229016996670754e-07, + "loss": 0.0341, + "reward": 1.6544257402420044, + "reward_std": 0.16453927755355835, + "rewards/accuracy_reward_stage2": 0.6544257998466492, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 441 + }, + { + "completion_length": 10.0, + "epoch": 0.0774487471526196, + "grad_norm": 14.276807876532036, + "kl": 0.0224609375, + "learning_rate": 9.227264762572279e-07, + "loss": -0.0352, + "reward": 1.59375, + "reward_std": 0.1778542846441269, + "rewards/accuracy_reward_stage2": 0.609375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 442 + }, + { + "completion_length": 7.03125, + "epoch": 0.07762397056246714, + "grad_norm": 19.745379817242817, + "kl": 0.022705078125, + "learning_rate": 9.225512528473803e-07, + "loss": 0.0091, + "reward": 1.530820608139038, + "reward_std": 0.22802403569221497, + "rewards/accuracy_reward_stage2": 0.5308204889297485, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 443 + }, + { + "completion_length": 12.546875, + "epoch": 0.0777991939723147, + "grad_norm": 19.52967803023215, + "kl": 0.01397705078125, + "learning_rate": 9.223760294375328e-07, + "loss": 0.0056, + "reward": 1.6203205585479736, + "reward_std": 0.10274563729763031, + "rewards/accuracy_reward_stage2": 0.6203205585479736, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 444 + }, + { + "completion_length": 15.015625, + "epoch": 0.07797441738216225, + "grad_norm": 30.33681041142242, + "kl": 0.1533203125, + "learning_rate": 9.222008060276853e-07, + "loss": 0.0174, + "reward": 1.2619487047195435, + "reward_std": 0.3217325508594513, + "rewards/accuracy_reward_stage2": 0.40257370471954346, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 445 + }, + { + "completion_length": 9.765625, + "epoch": 0.07814964079200981, + "grad_norm": 18.15244970923255, + "kl": 0.049072265625, + "learning_rate": 9.220255826178377e-07, + "loss": 0.0197, + "reward": 1.5675026178359985, + "reward_std": 0.2025008201599121, + "rewards/accuracy_reward_stage2": 0.5675026774406433, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 446 + }, + { + "completion_length": 7.734375, + "epoch": 0.07832486420185737, + "grad_norm": 22.894618014689563, + "kl": 0.08642578125, + "learning_rate": 9.218503592079901e-07, + "loss": 0.0059, + "reward": 1.6023638248443604, + "reward_std": 0.27222031354904175, + "rewards/accuracy_reward_stage2": 0.6179888248443604, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 447 + }, + { + "completion_length": 11.765625, + "epoch": 0.07850008761170492, + "grad_norm": 48.019137053035074, + "kl": 84.5, + "learning_rate": 9.216751357981426e-07, + "loss": 33.936, + "reward": 1.5308412313461304, + "reward_std": 0.21382224559783936, + "rewards/accuracy_reward_stage2": 0.6558412313461304, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 448 + }, + { + "completion_length": 10.296875, + "epoch": 0.07867531102155248, + "grad_norm": 16.88177879480028, + "kl": 0.0159912109375, + "learning_rate": 9.21499912388295e-07, + "loss": 0.0064, + "reward": 1.6435894966125488, + "reward_std": 0.20435433089733124, + "rewards/accuracy_reward_stage2": 0.6435894966125488, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 449 + }, + { + "completion_length": 9.1875, + "epoch": 0.07885053443140004, + "grad_norm": 19.188572270884954, + "kl": 0.06396484375, + "learning_rate": 9.213246889784475e-07, + "loss": 0.0256, + "reward": 1.636287808418274, + "reward_std": 0.18115490674972534, + "rewards/accuracy_reward_stage2": 0.6362878084182739, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 450 + }, + { + "completion_length": 8.5, + "epoch": 0.07902575784124759, + "grad_norm": 20.17073402441959, + "kl": 0.06884765625, + "learning_rate": 9.211494655685999e-07, + "loss": 0.0275, + "reward": 1.7681330442428589, + "reward_std": 0.18841080367565155, + "rewards/accuracy_reward_stage2": 0.8931329846382141, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 451 + }, + { + "completion_length": 10.25, + "epoch": 0.07920098125109515, + "grad_norm": 17.815446994203846, + "kl": 0.0218505859375, + "learning_rate": 9.209742421587524e-07, + "loss": 0.0088, + "reward": 1.5982638597488403, + "reward_std": 0.18450896441936493, + "rewards/accuracy_reward_stage2": 0.5982638597488403, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 452 + }, + { + "completion_length": 9.796875, + "epoch": 0.0793762046609427, + "grad_norm": 18.242203342119833, + "kl": 0.01806640625, + "learning_rate": 9.207990187489049e-07, + "loss": -0.0232, + "reward": 1.6876232624053955, + "reward_std": 0.14777256548404694, + "rewards/accuracy_reward_stage2": 0.703248143196106, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 453 + }, + { + "completion_length": 8.546875, + "epoch": 0.07955142807079026, + "grad_norm": 18.606917192384177, + "kl": 81.0, + "learning_rate": 9.206237953390572e-07, + "loss": 32.4161, + "reward": 1.496006965637207, + "reward_std": 0.09968242049217224, + "rewards/accuracy_reward_stage2": 0.621006965637207, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 454 + }, + { + "completion_length": 12.28125, + "epoch": 0.07972665148063782, + "grad_norm": 21.9000430991336, + "kl": 0.06884765625, + "learning_rate": 9.204485719292097e-07, + "loss": 0.0274, + "reward": 1.250139594078064, + "reward_std": 0.12326813489198685, + "rewards/accuracy_reward_stage2": 0.25013962388038635, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 455 + }, + { + "completion_length": 14.109375, + "epoch": 0.07990187489048536, + "grad_norm": 19.960024521289824, + "kl": 0.0849609375, + "learning_rate": 9.202733485193622e-07, + "loss": -0.0495, + "reward": 1.5685629844665527, + "reward_std": 0.1990964710712433, + "rewards/accuracy_reward_stage2": 0.5998129844665527, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 456 + }, + { + "completion_length": 9.921875, + "epoch": 0.08007709830033292, + "grad_norm": 15.223912178548778, + "kl": 0.0537109375, + "learning_rate": 9.200981251095145e-07, + "loss": 0.0215, + "reward": 1.7704863548278809, + "reward_std": 0.1370040774345398, + "rewards/accuracy_reward_stage2": 0.7704862952232361, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 457 + }, + { + "completion_length": 11.484375, + "epoch": 0.08025232171018049, + "grad_norm": 21.93213543217981, + "kl": 0.30078125, + "learning_rate": 9.19922901699667e-07, + "loss": 0.0903, + "reward": 1.4456546306610107, + "reward_std": 0.13230293989181519, + "rewards/accuracy_reward_stage2": 0.586279571056366, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 458 + }, + { + "completion_length": 9.484375, + "epoch": 0.08042754512002803, + "grad_norm": 14.798954968368728, + "kl": 0.08056640625, + "learning_rate": 9.197476782898194e-07, + "loss": 0.0323, + "reward": 1.6742162704467773, + "reward_std": 0.23457954823970795, + "rewards/accuracy_reward_stage2": 0.6742162704467773, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 459 + }, + { + "completion_length": 10.1875, + "epoch": 0.08060276852987559, + "grad_norm": 19.995984059508032, + "kl": 0.033203125, + "learning_rate": 9.195724548799719e-07, + "loss": -0.031, + "reward": 1.419159173965454, + "reward_std": 0.2401229292154312, + "rewards/accuracy_reward_stage2": 0.5597842335700989, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 460 + }, + { + "completion_length": 8.828125, + "epoch": 0.08077799193972315, + "grad_norm": 27.59647788465417, + "kl": 0.068359375, + "learning_rate": 9.193972314701244e-07, + "loss": 0.0049, + "reward": 1.57512366771698, + "reward_std": 0.2905910611152649, + "rewards/accuracy_reward_stage2": 0.60637366771698, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 461 + }, + { + "completion_length": 12.9375, + "epoch": 0.0809532153495707, + "grad_norm": 23.438390815015925, + "kl": 0.1201171875, + "learning_rate": 9.192220080602768e-07, + "loss": 0.048, + "reward": 1.3983908891677856, + "reward_std": 0.19916199147701263, + "rewards/accuracy_reward_stage2": 0.39839091897010803, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 462 + }, + { + "completion_length": 10.203125, + "epoch": 0.08112843875941826, + "grad_norm": 25.905344729385636, + "kl": 0.07763671875, + "learning_rate": 9.190467846504293e-07, + "loss": -0.0377, + "reward": 1.351801872253418, + "reward_std": 0.1887568235397339, + "rewards/accuracy_reward_stage2": 0.5080518126487732, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 463 + }, + { + "completion_length": 8.5625, + "epoch": 0.08130366216926581, + "grad_norm": 21.852140692838674, + "kl": 0.057861328125, + "learning_rate": 9.188715612405818e-07, + "loss": 0.0231, + "reward": 1.613126277923584, + "reward_std": 0.19007891416549683, + "rewards/accuracy_reward_stage2": 0.613126277923584, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 464 + }, + { + "completion_length": 9.171875, + "epoch": 0.08147888557911337, + "grad_norm": 16.677989585298285, + "kl": 0.12158203125, + "learning_rate": 9.186963378307342e-07, + "loss": 0.0488, + "reward": 1.3942195177078247, + "reward_std": 0.13556255400180817, + "rewards/accuracy_reward_stage2": 0.5348445177078247, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 465 + }, + { + "completion_length": 13.1875, + "epoch": 0.08165410898896093, + "grad_norm": 7.684489611981729, + "kl": 0.03076171875, + "learning_rate": 9.185211144208866e-07, + "loss": 0.0123, + "reward": 1.4885270595550537, + "reward_std": 0.032450269907712936, + "rewards/accuracy_reward_stage2": 0.6135270595550537, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 466 + }, + { + "completion_length": 25.828125, + "epoch": 0.08182933239880848, + "grad_norm": 19.304238267341717, + "kl": 0.03662109375, + "learning_rate": 9.183458910110389e-07, + "loss": 0.0147, + "reward": 1.5734906196594238, + "reward_std": 0.24000124633312225, + "rewards/accuracy_reward_stage2": 0.5734906792640686, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 467 + }, + { + "completion_length": 22.0, + "epoch": 0.08200455580865604, + "grad_norm": 17.66367688216898, + "kl": 41.25, + "learning_rate": 9.181706676011914e-07, + "loss": 16.6104, + "reward": 1.2004015445709229, + "reward_std": 0.14673781394958496, + "rewards/accuracy_reward_stage2": 0.32540154457092285, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 468 + }, + { + "completion_length": 10.28125, + "epoch": 0.0821797792185036, + "grad_norm": 16.750027229018684, + "kl": 0.01092529296875, + "learning_rate": 9.179954441913439e-07, + "loss": 0.0044, + "reward": 1.825685977935791, + "reward_std": 0.18837112188339233, + "rewards/accuracy_reward_stage2": 0.825685977935791, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 469 + }, + { + "completion_length": 8.96875, + "epoch": 0.08235500262835115, + "grad_norm": 19.670475644991413, + "kl": 0.062255859375, + "learning_rate": 9.178202207814963e-07, + "loss": -0.0194, + "reward": 1.5610289573669434, + "reward_std": 0.2965965270996094, + "rewards/accuracy_reward_stage2": 0.5766539573669434, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 470 + }, + { + "completion_length": 14.125, + "epoch": 0.0825302260381987, + "grad_norm": 22.687628136347087, + "kl": 0.1142578125, + "learning_rate": 9.176449973716488e-07, + "loss": 0.0458, + "reward": 1.6705833673477173, + "reward_std": 0.21611103415489197, + "rewards/accuracy_reward_stage2": 0.6705833673477173, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 471 + }, + { + "completion_length": 9.265625, + "epoch": 0.08270544944804625, + "grad_norm": 18.9690436720185, + "kl": 0.058837890625, + "learning_rate": 9.174697739618013e-07, + "loss": 0.0236, + "reward": 1.789116621017456, + "reward_std": 0.07733018696308136, + "rewards/accuracy_reward_stage2": 0.789116621017456, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 472 + }, + { + "completion_length": 9.28125, + "epoch": 0.08288067285789381, + "grad_norm": 18.03798220139904, + "kl": 0.062255859375, + "learning_rate": 9.172945505519537e-07, + "loss": 0.0249, + "reward": 1.5140492916107178, + "reward_std": 0.24741026759147644, + "rewards/accuracy_reward_stage2": 0.514049232006073, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 473 + }, + { + "completion_length": 26.09375, + "epoch": 0.08305589626774137, + "grad_norm": 5082.5494113753975, + "kl": 73.0, + "learning_rate": 9.171193271421062e-07, + "loss": 29.2608, + "reward": 1.3094170093536377, + "reward_std": 0.12926608324050903, + "rewards/accuracy_reward_stage2": 0.5594170093536377, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 474 + }, + { + "completion_length": 10.515625, + "epoch": 0.08323111967758892, + "grad_norm": 22.58619197290474, + "kl": 0.0986328125, + "learning_rate": 9.169441037322586e-07, + "loss": 0.0395, + "reward": 1.661272406578064, + "reward_std": 0.3087402582168579, + "rewards/accuracy_reward_stage2": 0.661272406578064, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 475 + }, + { + "completion_length": 9.6875, + "epoch": 0.08340634308743648, + "grad_norm": 24.06168327767966, + "kl": 0.087890625, + "learning_rate": 9.167688803224111e-07, + "loss": 0.0352, + "reward": 1.495906949043274, + "reward_std": 0.1737568974494934, + "rewards/accuracy_reward_stage2": 0.4959069490432739, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 476 + }, + { + "completion_length": 15.265625, + "epoch": 0.08358156649728404, + "grad_norm": 16.528081126500286, + "kl": 0.5078125, + "learning_rate": 9.165936569125636e-07, + "loss": 0.1584, + "reward": 1.4166667461395264, + "reward_std": 0.1257408708333969, + "rewards/accuracy_reward_stage2": 0.5572916269302368, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 477 + }, + { + "completion_length": 11.921875, + "epoch": 0.08375678990713159, + "grad_norm": 19.16101481092754, + "kl": 0.03125, + "learning_rate": 9.164184335027159e-07, + "loss": -0.0205, + "reward": 1.5078704357147217, + "reward_std": 0.26106423139572144, + "rewards/accuracy_reward_stage2": 0.5234953761100769, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 478 + }, + { + "completion_length": 10.90625, + "epoch": 0.08393201331697915, + "grad_norm": 20.65729096807624, + "kl": 0.0291748046875, + "learning_rate": 9.162432100928683e-07, + "loss": -0.032, + "reward": 1.5712745189666748, + "reward_std": 0.1589372754096985, + "rewards/accuracy_reward_stage2": 0.5868995785713196, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 479 + }, + { + "completion_length": 14.03125, + "epoch": 0.0841072367268267, + "grad_norm": 23.009481020265948, + "kl": 0.08984375, + "learning_rate": 9.160679866830208e-07, + "loss": -0.063, + "reward": 1.1516456604003906, + "reward_std": 0.4139564633369446, + "rewards/accuracy_reward_stage2": 0.323520690202713, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 480 + }, + { + "completion_length": 10.921875, + "epoch": 0.08428246013667426, + "grad_norm": 22.887241467709128, + "kl": 0.041015625, + "learning_rate": 9.158927632731732e-07, + "loss": 0.0164, + "reward": 1.658280611038208, + "reward_std": 0.19431188702583313, + "rewards/accuracy_reward_stage2": 0.783280611038208, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 481 + }, + { + "completion_length": 9.046875, + "epoch": 0.08445768354652182, + "grad_norm": 19.33248398430543, + "kl": 0.078125, + "learning_rate": 9.157175398633257e-07, + "loss": -0.0129, + "reward": 1.4629442691802979, + "reward_std": 0.2794113755226135, + "rewards/accuracy_reward_stage2": 0.4785691797733307, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 482 + }, + { + "completion_length": 7.6875, + "epoch": 0.08463290695636937, + "grad_norm": 24.134444137143362, + "kl": 0.033447265625, + "learning_rate": 9.155423164534781e-07, + "loss": 0.0134, + "reward": 1.601351022720337, + "reward_std": 0.28532567620277405, + "rewards/accuracy_reward_stage2": 0.6013510227203369, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 483 + }, + { + "completion_length": 6.171875, + "epoch": 0.08480813036621693, + "grad_norm": 16.767962021551625, + "kl": 0.018798828125, + "learning_rate": 9.153670930436306e-07, + "loss": -0.0463, + "reward": 1.4350864887237549, + "reward_std": 0.16850194334983826, + "rewards/accuracy_reward_stage2": 0.46633651852607727, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 484 + }, + { + "completion_length": 14.046875, + "epoch": 0.08498335377606449, + "grad_norm": 22.146728503845583, + "kl": 0.58203125, + "learning_rate": 9.151918696337831e-07, + "loss": 0.2322, + "reward": 1.4404876232147217, + "reward_std": 0.26357513666152954, + "rewards/accuracy_reward_stage2": 0.5654876232147217, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 485 + }, + { + "completion_length": 12.8125, + "epoch": 0.08515857718591203, + "grad_norm": 24.475383994637514, + "kl": 0.09619140625, + "learning_rate": 9.150166462239355e-07, + "loss": 0.0384, + "reward": 1.639747977256775, + "reward_std": 0.2860134541988373, + "rewards/accuracy_reward_stage2": 0.6397479772567749, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 486 + }, + { + "completion_length": 10.453125, + "epoch": 0.0853338005957596, + "grad_norm": 17.108695939756277, + "kl": 0.068359375, + "learning_rate": 9.148414228140879e-07, + "loss": -0.0169, + "reward": 1.7072330713272095, + "reward_std": 0.1748734563589096, + "rewards/accuracy_reward_stage2": 0.7228580713272095, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 487 + }, + { + "completion_length": 8.265625, + "epoch": 0.08550902400560714, + "grad_norm": 14.48557849493761, + "kl": 0.0252685546875, + "learning_rate": 9.146661994042404e-07, + "loss": -0.0341, + "reward": 1.7925978899002075, + "reward_std": 0.11917868256568909, + "rewards/accuracy_reward_stage2": 0.8082229495048523, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 488 + }, + { + "completion_length": 11.703125, + "epoch": 0.0856842474154547, + "grad_norm": 19.639142356206534, + "kl": 0.095703125, + "learning_rate": 9.144909759943928e-07, + "loss": 0.0385, + "reward": 1.58922278881073, + "reward_std": 0.2474866509437561, + "rewards/accuracy_reward_stage2": 0.71422278881073, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 489 + }, + { + "completion_length": 20.53125, + "epoch": 0.08585947082530226, + "grad_norm": 30.107818801594092, + "kl": 0.3515625, + "learning_rate": 9.143157525845453e-07, + "loss": 0.1409, + "reward": 1.3591396808624268, + "reward_std": 0.15374769270420074, + "rewards/accuracy_reward_stage2": 0.48413965106010437, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 490 + }, + { + "completion_length": 14.578125, + "epoch": 0.08603469423514981, + "grad_norm": 47.30563364220158, + "kl": 0.181640625, + "learning_rate": 9.141405291746977e-07, + "loss": 0.0283, + "reward": 1.171875, + "reward_std": 0.19939783215522766, + "rewards/accuracy_reward_stage2": 0.328125, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 491 + }, + { + "completion_length": 10.78125, + "epoch": 0.08620991764499737, + "grad_norm": 13.510331003385526, + "kl": 0.06689453125, + "learning_rate": 9.139653057648501e-07, + "loss": -0.0175, + "reward": 1.61344313621521, + "reward_std": 0.17493629455566406, + "rewards/accuracy_reward_stage2": 0.6290681958198547, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 492 + }, + { + "completion_length": 12.015625, + "epoch": 0.08638514105484493, + "grad_norm": 15.117382465909863, + "kl": 0.14453125, + "learning_rate": 9.137900823550026e-07, + "loss": 0.0139, + "reward": 1.3842592239379883, + "reward_std": 0.22201895713806152, + "rewards/accuracy_reward_stage2": 0.5248842239379883, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 493 + }, + { + "completion_length": 6.65625, + "epoch": 0.08656036446469248, + "grad_norm": 17.29696625736238, + "kl": 0.02783203125, + "learning_rate": 9.13614858945155e-07, + "loss": -0.033, + "reward": 1.8380773067474365, + "reward_std": 0.08552451431751251, + "rewards/accuracy_reward_stage2": 0.8537023067474365, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 494 + }, + { + "completion_length": 10.53125, + "epoch": 0.08673558787454004, + "grad_norm": 18.126590875549933, + "kl": 0.025390625, + "learning_rate": 9.134396355353075e-07, + "loss": -0.034, + "reward": 1.5160049200057983, + "reward_std": 0.17203427851200104, + "rewards/accuracy_reward_stage2": 0.5316299200057983, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 495 + }, + { + "completion_length": 10.3125, + "epoch": 0.08691081128438759, + "grad_norm": 22.92055570142462, + "kl": 0.061767578125, + "learning_rate": 9.1326441212546e-07, + "loss": 0.0247, + "reward": 1.6086578369140625, + "reward_std": 0.18672674894332886, + "rewards/accuracy_reward_stage2": 0.7336578369140625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 496 + }, + { + "completion_length": 19.0625, + "epoch": 0.08708603469423515, + "grad_norm": 13.63320237888586, + "kl": 0.05615234375, + "learning_rate": 9.130891887156123e-07, + "loss": -0.0648, + "reward": 1.4610896110534668, + "reward_std": 0.17531195282936096, + "rewards/accuracy_reward_stage2": 0.49233970046043396, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 497 + }, + { + "completion_length": 8.671875, + "epoch": 0.08726125810408271, + "grad_norm": 20.53693666680177, + "kl": 0.10595703125, + "learning_rate": 9.129139653057648e-07, + "loss": 0.0422, + "reward": 1.6972384452819824, + "reward_std": 0.20393230020999908, + "rewards/accuracy_reward_stage2": 0.6972383856773376, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 498 + }, + { + "completion_length": 16.515625, + "epoch": 0.08743648151393026, + "grad_norm": 18.100392259301536, + "kl": 0.03662109375, + "learning_rate": 9.127387418959172e-07, + "loss": 0.0051, + "reward": 1.575078010559082, + "reward_std": 0.12993305921554565, + "rewards/accuracy_reward_stage2": 0.590703010559082, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 499 + }, + { + "completion_length": 14.6875, + "epoch": 0.08761170492377782, + "grad_norm": 21.14943372944215, + "kl": 0.024658203125, + "learning_rate": 9.125635184860697e-07, + "loss": 0.0099, + "reward": 1.285620927810669, + "reward_std": 0.15270642936229706, + "rewards/accuracy_reward_stage2": 0.41062092781066895, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 500 + }, + { + "completion_length": 7.5, + "epoch": 0.08778692833362538, + "grad_norm": 17.769614304448513, + "kl": 0.0198974609375, + "learning_rate": 9.123882950762222e-07, + "loss": 0.008, + "reward": 1.6949687004089355, + "reward_std": 0.09590702503919601, + "rewards/accuracy_reward_stage2": 0.694968581199646, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 501 + }, + { + "completion_length": 11.265625, + "epoch": 0.08796215174347292, + "grad_norm": 24.687050847645843, + "kl": 0.032958984375, + "learning_rate": 9.122130716663746e-07, + "loss": -0.0555, + "reward": 1.694044589996338, + "reward_std": 0.24584685266017914, + "rewards/accuracy_reward_stage2": 0.7252947092056274, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 502 + }, + { + "completion_length": 12.28125, + "epoch": 0.08813737515332049, + "grad_norm": 23.468740520971043, + "kl": 0.384765625, + "learning_rate": 9.120378482565271e-07, + "loss": 0.0656, + "reward": 1.4185097217559814, + "reward_std": 0.2678026556968689, + "rewards/accuracy_reward_stage2": 0.5747597217559814, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 503 + }, + { + "completion_length": 10.421875, + "epoch": 0.08831259856316805, + "grad_norm": 30.292195286556733, + "kl": 0.056396484375, + "learning_rate": 9.118626248466796e-07, + "loss": 0.0225, + "reward": 1.5536742210388184, + "reward_std": 0.19536569714546204, + "rewards/accuracy_reward_stage2": 0.6786742210388184, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 504 + }, + { + "completion_length": 10.765625, + "epoch": 0.08848782197301559, + "grad_norm": 22.056329493793715, + "kl": 0.55859375, + "learning_rate": 9.116874014368319e-07, + "loss": 0.2231, + "reward": 1.4932494163513184, + "reward_std": 0.165345698595047, + "rewards/accuracy_reward_stage2": 0.7432493567466736, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 505 + }, + { + "completion_length": 7.78125, + "epoch": 0.08866304538286315, + "grad_norm": 23.179529678402307, + "kl": 0.1494140625, + "learning_rate": 9.115121780269844e-07, + "loss": 0.0599, + "reward": 1.523409128189087, + "reward_std": 0.24513620138168335, + "rewards/accuracy_reward_stage2": 0.5234091281890869, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 506 + }, + { + "completion_length": 8.671875, + "epoch": 0.0888382687927107, + "grad_norm": 40.15466178291841, + "kl": 0.08447265625, + "learning_rate": 9.113369546171367e-07, + "loss": 0.0338, + "reward": 1.4454511404037476, + "reward_std": 0.16291175782680511, + "rewards/accuracy_reward_stage2": 0.44545111060142517, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 507 + }, + { + "completion_length": 7.96875, + "epoch": 0.08901349220255826, + "grad_norm": 16.655840698240784, + "kl": 0.052978515625, + "learning_rate": 9.111617312072892e-07, + "loss": 0.0212, + "reward": 1.5639185905456543, + "reward_std": 0.10686256736516953, + "rewards/accuracy_reward_stage2": 0.5639185905456543, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 508 + }, + { + "completion_length": 12.46875, + "epoch": 0.08918871561240582, + "grad_norm": 15.858290783911817, + "kl": 0.330078125, + "learning_rate": 9.109865077974417e-07, + "loss": 0.0441, + "reward": 1.3085144758224487, + "reward_std": 0.2654265761375427, + "rewards/accuracy_reward_stage2": 0.46476447582244873, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 509 + }, + { + "completion_length": 8.0, + "epoch": 0.08936393902225337, + "grad_norm": 16.55163129247764, + "kl": 0.042724609375, + "learning_rate": 9.108112843875941e-07, + "loss": 0.017, + "reward": 1.5995845794677734, + "reward_std": 0.18738877773284912, + "rewards/accuracy_reward_stage2": 0.5995846390724182, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 510 + }, + { + "completion_length": 9.8125, + "epoch": 0.08953916243210093, + "grad_norm": 17.536117228564414, + "kl": 0.087890625, + "learning_rate": 9.106360609777466e-07, + "loss": 0.0018, + "reward": 1.3366703987121582, + "reward_std": 0.14628343284130096, + "rewards/accuracy_reward_stage2": 0.3522953391075134, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 511 + }, + { + "completion_length": 16.34375, + "epoch": 0.08971438584194849, + "grad_norm": 26.290486758726278, + "kl": 0.1953125, + "learning_rate": 9.10460837567899e-07, + "loss": 0.034, + "reward": 1.3810583353042603, + "reward_std": 0.2410065084695816, + "rewards/accuracy_reward_stage2": 0.39668336510658264, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 512 + }, + { + "completion_length": 8.40625, + "epoch": 0.08988960925179604, + "grad_norm": 15.909152671710622, + "kl": 0.06201171875, + "learning_rate": 9.102856141580515e-07, + "loss": 0.0247, + "reward": 1.629618763923645, + "reward_std": 0.18738989531993866, + "rewards/accuracy_reward_stage2": 0.629618763923645, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 513 + }, + { + "completion_length": 11.5, + "epoch": 0.0900648326616436, + "grad_norm": 15.863809367993417, + "kl": 0.0380859375, + "learning_rate": 9.10110390748204e-07, + "loss": 0.0152, + "reward": 1.478208065032959, + "reward_std": 0.10647543519735336, + "rewards/accuracy_reward_stage2": 0.4782080352306366, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 514 + }, + { + "completion_length": 7.90625, + "epoch": 0.09024005607149115, + "grad_norm": 21.755133696217552, + "kl": 0.1044921875, + "learning_rate": 9.099351673383564e-07, + "loss": 0.0417, + "reward": 1.5345044136047363, + "reward_std": 0.3030553460121155, + "rewards/accuracy_reward_stage2": 0.5345043540000916, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 515 + }, + { + "completion_length": 18.546875, + "epoch": 0.0904152794813387, + "grad_norm": 18.702052472544235, + "kl": 0.5078125, + "learning_rate": 9.097599439285089e-07, + "loss": 0.1589, + "reward": 1.3089147806167603, + "reward_std": 0.16796602308750153, + "rewards/accuracy_reward_stage2": 0.44953978061676025, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 516 + }, + { + "completion_length": 8.015625, + "epoch": 0.09059050289118627, + "grad_norm": 20.65997615387692, + "kl": 0.04296875, + "learning_rate": 9.095847205186612e-07, + "loss": 0.0172, + "reward": 1.6913013458251953, + "reward_std": 0.2100488543510437, + "rewards/accuracy_reward_stage2": 0.6913013458251953, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 517 + }, + { + "completion_length": 8.359375, + "epoch": 0.09076572630103381, + "grad_norm": 20.015545431572207, + "kl": 0.025146484375, + "learning_rate": 9.094094971088136e-07, + "loss": 0.0101, + "reward": 1.6237950325012207, + "reward_std": 0.2327117621898651, + "rewards/accuracy_reward_stage2": 0.6237950325012207, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 518 + }, + { + "completion_length": 7.8125, + "epoch": 0.09094094971088137, + "grad_norm": 17.673422161995706, + "kl": 0.0299072265625, + "learning_rate": 9.092342736989661e-07, + "loss": 0.012, + "reward": 1.6694855690002441, + "reward_std": 0.23738789558410645, + "rewards/accuracy_reward_stage2": 0.6694855690002441, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 519 + }, + { + "completion_length": 7.796875, + "epoch": 0.09111617312072894, + "grad_norm": 20.678175792830373, + "kl": 0.076171875, + "learning_rate": 9.090590502891185e-07, + "loss": -0.0581, + "reward": 1.4174991846084595, + "reward_std": 0.18197058141231537, + "rewards/accuracy_reward_stage2": 0.4487491548061371, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 520 + }, + { + "completion_length": 9.5625, + "epoch": 0.09129139653057648, + "grad_norm": 16.014094198427838, + "kl": 0.1435546875, + "learning_rate": 9.08883826879271e-07, + "loss": 0.0575, + "reward": 1.4942562580108643, + "reward_std": 0.18194083869457245, + "rewards/accuracy_reward_stage2": 0.6192562580108643, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 521 + }, + { + "completion_length": 14.890625, + "epoch": 0.09146661994042404, + "grad_norm": 24.857365439555846, + "kl": 0.42578125, + "learning_rate": 9.087086034694235e-07, + "loss": 0.126, + "reward": 1.3850722312927246, + "reward_std": 0.21721617877483368, + "rewards/accuracy_reward_stage2": 0.5256972908973694, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 522 + }, + { + "completion_length": 22.875, + "epoch": 0.09164184335027159, + "grad_norm": 22.317024704003305, + "kl": 0.1328125, + "learning_rate": 9.085333800595759e-07, + "loss": 0.0091, + "reward": 1.3440500497817993, + "reward_std": 0.20350778102874756, + "rewards/accuracy_reward_stage2": 0.3596750795841217, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 523 + }, + { + "completion_length": 7.859375, + "epoch": 0.09181706676011915, + "grad_norm": 12.267929545023042, + "kl": 0.04931640625, + "learning_rate": 9.083581566497284e-07, + "loss": 0.0198, + "reward": 1.3105697631835938, + "reward_std": 0.038502879440784454, + "rewards/accuracy_reward_stage2": 0.31056979298591614, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 524 + }, + { + "completion_length": 8.078125, + "epoch": 0.09199229016996671, + "grad_norm": 16.011879952345478, + "kl": 0.06787109375, + "learning_rate": 9.081829332398809e-07, + "loss": -0.0484, + "reward": 1.4427083730697632, + "reward_std": 0.303839772939682, + "rewards/accuracy_reward_stage2": 0.4739583432674408, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 525 + }, + { + "completion_length": 10.6875, + "epoch": 0.09216751357981426, + "grad_norm": 21.68307168867549, + "kl": 0.06787109375, + "learning_rate": 9.080077098300333e-07, + "loss": 0.027, + "reward": 1.6258351802825928, + "reward_std": 0.27005815505981445, + "rewards/accuracy_reward_stage2": 0.6258351802825928, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 526 + }, + { + "completion_length": 8.5, + "epoch": 0.09234273698966182, + "grad_norm": 11.823473795220892, + "kl": 0.0419921875, + "learning_rate": 9.078324864201857e-07, + "loss": -0.0215, + "reward": 1.3020833730697632, + "reward_std": 0.1627970188856125, + "rewards/accuracy_reward_stage2": 0.3177083134651184, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 527 + }, + { + "completion_length": 10.71875, + "epoch": 0.09251796039950938, + "grad_norm": 23.896112782111985, + "kl": 0.0252685546875, + "learning_rate": 9.076572630103381e-07, + "loss": 0.0101, + "reward": 1.6162935495376587, + "reward_std": 0.22328680753707886, + "rewards/accuracy_reward_stage2": 0.6162935495376587, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 528 + }, + { + "completion_length": 11.203125, + "epoch": 0.09269318380935693, + "grad_norm": 76.31274697932342, + "kl": 0.291015625, + "learning_rate": 9.074820396004906e-07, + "loss": 0.0999, + "reward": 1.5997403860092163, + "reward_std": 0.28809764981269836, + "rewards/accuracy_reward_stage2": 0.6153653264045715, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 529 + }, + { + "completion_length": 10.890625, + "epoch": 0.09286840721920449, + "grad_norm": 1213.4011074759428, + "kl": 3.171875, + "learning_rate": 9.07306816190643e-07, + "loss": 1.2073, + "reward": 1.527639389038086, + "reward_std": 0.2979646921157837, + "rewards/accuracy_reward_stage2": 0.6838893890380859, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 530 + }, + { + "completion_length": 12.9375, + "epoch": 0.09304363062905203, + "grad_norm": 20.3193019482723, + "kl": 0.04150390625, + "learning_rate": 9.071315927807954e-07, + "loss": -0.0619, + "reward": 1.526425838470459, + "reward_std": 0.2860848307609558, + "rewards/accuracy_reward_stage2": 0.5576759576797485, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 531 + }, + { + "completion_length": 8.78125, + "epoch": 0.0932188540388996, + "grad_norm": 16.39573830704058, + "kl": 0.09033203125, + "learning_rate": 9.069563693709479e-07, + "loss": 0.0, + "reward": 1.499305009841919, + "reward_std": 0.25076764822006226, + "rewards/accuracy_reward_stage2": 0.5149299502372742, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 532 + }, + { + "completion_length": 10.6875, + "epoch": 0.09339407744874716, + "grad_norm": 18.708267724995316, + "kl": 0.0277099609375, + "learning_rate": 9.067811459611004e-07, + "loss": 0.0111, + "reward": 1.4556643962860107, + "reward_std": 0.16554230451583862, + "rewards/accuracy_reward_stage2": 0.4556644558906555, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 533 + }, + { + "completion_length": 12.296875, + "epoch": 0.0935693008585947, + "grad_norm": 52816.97086739961, + "kl": 704.0, + "learning_rate": 9.066059225512528e-07, + "loss": 282.7971, + "reward": 1.1979167461395264, + "reward_std": 0.2623191773891449, + "rewards/accuracy_reward_stage2": 0.3541666865348816, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 534 + }, + { + "completion_length": 8.296875, + "epoch": 0.09374452426844226, + "grad_norm": 24.82478545154244, + "kl": 0.08740234375, + "learning_rate": 9.064306991414053e-07, + "loss": 0.035, + "reward": 1.4436891078948975, + "reward_std": 0.26873427629470825, + "rewards/accuracy_reward_stage2": 0.5686891078948975, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 535 + }, + { + "completion_length": 6.125, + "epoch": 0.09391974767828982, + "grad_norm": 18.23616939760275, + "kl": 0.0167236328125, + "learning_rate": 9.062554757315576e-07, + "loss": 0.0067, + "reward": 1.7154107093811035, + "reward_std": 0.19368675351142883, + "rewards/accuracy_reward_stage2": 0.7154107093811035, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 536 + }, + { + "completion_length": 15.515625, + "epoch": 0.09409497108813737, + "grad_norm": 12.11677875558571, + "kl": 0.035888671875, + "learning_rate": 9.060802523217101e-07, + "loss": 0.0144, + "reward": 1.44085693359375, + "reward_std": 0.06919336318969727, + "rewards/accuracy_reward_stage2": 0.4408569931983948, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 537 + }, + { + "completion_length": 7.640625, + "epoch": 0.09427019449798493, + "grad_norm": 16.886219674799325, + "kl": 0.0830078125, + "learning_rate": 9.059050289118626e-07, + "loss": -0.0274, + "reward": 1.2922900915145874, + "reward_std": 0.16166669130325317, + "rewards/accuracy_reward_stage2": 0.4641650915145874, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 538 + }, + { + "completion_length": 32.59375, + "epoch": 0.09444541790783248, + "grad_norm": 19.878285295210695, + "kl": 0.037109375, + "learning_rate": 9.05729805502015e-07, + "loss": -0.0293, + "reward": 1.4270589351654053, + "reward_std": 0.17999550700187683, + "rewards/accuracy_reward_stage2": 0.44268399477005005, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 539 + }, + { + "completion_length": 9.0625, + "epoch": 0.09462064131768004, + "grad_norm": 22.418001970985898, + "kl": 0.049560546875, + "learning_rate": 9.055545820921675e-07, + "loss": -0.0931, + "reward": 1.335763931274414, + "reward_std": 0.24215462803840637, + "rewards/accuracy_reward_stage2": 0.3826389014720917, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 540 + }, + { + "completion_length": 15.3125, + "epoch": 0.0947958647275276, + "grad_norm": 22.252296229336675, + "kl": 0.0986328125, + "learning_rate": 9.0537935868232e-07, + "loss": 0.0395, + "reward": 1.3899197578430176, + "reward_std": 0.16463521122932434, + "rewards/accuracy_reward_stage2": 0.3899197280406952, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 541 + }, + { + "completion_length": 12.375, + "epoch": 0.09497108813737515, + "grad_norm": 16.457184450985498, + "kl": 0.06396484375, + "learning_rate": 9.052041352724724e-07, + "loss": 0.0256, + "reward": 1.3750066757202148, + "reward_std": 0.1883399784564972, + "rewards/accuracy_reward_stage2": 0.3750067353248596, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 542 + }, + { + "completion_length": 11.625, + "epoch": 0.09514631154722271, + "grad_norm": 33.68922560485876, + "kl": 0.29296875, + "learning_rate": 9.050289118626248e-07, + "loss": 0.0647, + "reward": 1.2392685413360596, + "reward_std": 0.1982816904783249, + "rewards/accuracy_reward_stage2": 0.5205184817314148, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 543 + }, + { + "completion_length": 12.71875, + "epoch": 0.09532153495707027, + "grad_norm": 33.730678368774534, + "kl": 0.35546875, + "learning_rate": 9.048536884527772e-07, + "loss": 0.0475, + "reward": 1.0741642713546753, + "reward_std": 0.22530388832092285, + "rewards/accuracy_reward_stage2": 0.3554142713546753, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 544 + }, + { + "completion_length": 7.859375, + "epoch": 0.09549675836691782, + "grad_norm": 22.80060588847184, + "kl": 0.142578125, + "learning_rate": 9.046784650429297e-07, + "loss": 0.0569, + "reward": 1.6529418230056763, + "reward_std": 0.2742685079574585, + "rewards/accuracy_reward_stage2": 0.6529418230056763, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 545 + }, + { + "completion_length": 10.65625, + "epoch": 0.09567198177676538, + "grad_norm": 19.18030606110292, + "kl": 0.169921875, + "learning_rate": 9.045032416330821e-07, + "loss": 0.0303, + "reward": 1.33396577835083, + "reward_std": 0.19901514053344727, + "rewards/accuracy_reward_stage2": 0.47459083795547485, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 546 + }, + { + "completion_length": 24.609375, + "epoch": 0.09584720518661294, + "grad_norm": 39.36257012429911, + "kl": 0.5625, + "learning_rate": 9.043280182232345e-07, + "loss": 0.2261, + "reward": 1.3327341079711914, + "reward_std": 0.1748802214860916, + "rewards/accuracy_reward_stage2": 0.457734078168869, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 547 + }, + { + "completion_length": 11.921875, + "epoch": 0.09602242859646049, + "grad_norm": 15.899625187857596, + "kl": 0.04345703125, + "learning_rate": 9.04152794813387e-07, + "loss": -0.0268, + "reward": 1.4270386695861816, + "reward_std": 0.15212519466876984, + "rewards/accuracy_reward_stage2": 0.5676637291908264, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 548 + }, + { + "completion_length": 7.921875, + "epoch": 0.09619765200630805, + "grad_norm": 20.104482583734367, + "kl": 0.06982421875, + "learning_rate": 9.039775714035395e-07, + "loss": -0.0009, + "reward": 1.757695198059082, + "reward_std": 0.25138598680496216, + "rewards/accuracy_reward_stage2": 0.773320198059082, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 549 + }, + { + "completion_length": 12.4375, + "epoch": 0.09637287541615559, + "grad_norm": 17.080432216977275, + "kl": 0.56640625, + "learning_rate": 9.038023479936919e-07, + "loss": 0.2259, + "reward": 1.4803493022918701, + "reward_std": 0.18915359675884247, + "rewards/accuracy_reward_stage2": 0.6053494215011597, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 550 + }, + { + "completion_length": 11.1875, + "epoch": 0.09654809882600315, + "grad_norm": 20.191666928026432, + "kl": 0.05859375, + "learning_rate": 9.036271245838444e-07, + "loss": 0.0105, + "reward": 1.8123853206634521, + "reward_std": 0.20700375735759735, + "rewards/accuracy_reward_stage2": 0.8280103206634521, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 551 + }, + { + "completion_length": 11.421875, + "epoch": 0.09672332223585071, + "grad_norm": 20.378539040102037, + "kl": 0.1962890625, + "learning_rate": 9.034519011739968e-07, + "loss": 0.0787, + "reward": 1.409591555595398, + "reward_std": 0.23674771189689636, + "rewards/accuracy_reward_stage2": 0.659591555595398, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 552 + }, + { + "completion_length": 11.75, + "epoch": 0.09689854564569826, + "grad_norm": 18.890258636089598, + "kl": 0.051513671875, + "learning_rate": 9.032766777641493e-07, + "loss": 0.0206, + "reward": 1.4163398742675781, + "reward_std": 0.23046299815177917, + "rewards/accuracy_reward_stage2": 0.6663398742675781, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 553 + }, + { + "completion_length": 13.5, + "epoch": 0.09707376905554582, + "grad_norm": 18.286306275322975, + "kl": 0.0869140625, + "learning_rate": 9.031014543543018e-07, + "loss": 0.0348, + "reward": 1.3902003765106201, + "reward_std": 0.15996113419532776, + "rewards/accuracy_reward_stage2": 0.3902003765106201, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 554 + }, + { + "completion_length": 21.921875, + "epoch": 0.09724899246539338, + "grad_norm": 40.26435687211343, + "kl": 0.05517578125, + "learning_rate": 9.029262309444542e-07, + "loss": 0.0221, + "reward": 1.505936622619629, + "reward_std": 0.17588528990745544, + "rewards/accuracy_reward_stage2": 0.5059365034103394, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 555 + }, + { + "completion_length": 9.90625, + "epoch": 0.09742421587524093, + "grad_norm": 15.957309347414999, + "kl": 0.046875, + "learning_rate": 9.027510075346065e-07, + "loss": -0.0588, + "reward": 1.508584976196289, + "reward_std": 0.23708796501159668, + "rewards/accuracy_reward_stage2": 0.5398349165916443, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 556 + }, + { + "completion_length": 16.015625, + "epoch": 0.09759943928508849, + "grad_norm": 14.467619029018568, + "kl": 0.6953125, + "learning_rate": 9.02575784124759e-07, + "loss": 0.2386, + "reward": 1.5341227054595947, + "reward_std": 0.2018691599369049, + "rewards/accuracy_reward_stage2": 0.7997477650642395, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 557 + }, + { + "completion_length": 12.59375, + "epoch": 0.09777466269493604, + "grad_norm": 22.63678621876035, + "kl": 0.050537109375, + "learning_rate": 9.024005607149114e-07, + "loss": -0.1099, + "reward": 1.3984155654907227, + "reward_std": 0.3438401520252228, + "rewards/accuracy_reward_stage2": 0.4609155058860779, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 558 + }, + { + "completion_length": 11.875, + "epoch": 0.0979498861047836, + "grad_norm": 22.6560160835456, + "kl": 0.0751953125, + "learning_rate": 9.022253373050639e-07, + "loss": -0.0033, + "reward": 1.3677244186401367, + "reward_std": 0.3097214102745056, + "rewards/accuracy_reward_stage2": 0.3833494782447815, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 559 + }, + { + "completion_length": 11.8125, + "epoch": 0.09812510951463116, + "grad_norm": 22.911396622943528, + "kl": 0.73046875, + "learning_rate": 9.020501138952163e-07, + "loss": 0.249, + "reward": 1.5243258476257324, + "reward_std": 0.285469114780426, + "rewards/accuracy_reward_stage2": 0.6649507284164429, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 560 + }, + { + "completion_length": 12.046875, + "epoch": 0.0983003329244787, + "grad_norm": 14.34568733429109, + "kl": 0.051025390625, + "learning_rate": 9.018748904853688e-07, + "loss": 0.0205, + "reward": 1.4658381938934326, + "reward_std": 0.12557768821716309, + "rewards/accuracy_reward_stage2": 0.4658382534980774, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 561 + }, + { + "completion_length": 10.0625, + "epoch": 0.09847555633432627, + "grad_norm": 37.32668085474892, + "kl": 0.232421875, + "learning_rate": 9.016996670755213e-07, + "loss": 0.093, + "reward": 1.4536373615264893, + "reward_std": 0.22130584716796875, + "rewards/accuracy_reward_stage2": 0.703637421131134, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 562 + }, + { + "completion_length": 12.75, + "epoch": 0.09865077974417383, + "grad_norm": 15.319698390846803, + "kl": 0.0546875, + "learning_rate": 9.015244436656737e-07, + "loss": 0.0219, + "reward": 1.364243984222412, + "reward_std": 0.14342659711837769, + "rewards/accuracy_reward_stage2": 0.36424392461776733, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 563 + }, + { + "completion_length": 11.0625, + "epoch": 0.09882600315402137, + "grad_norm": 20.899740069261195, + "kl": 0.08935546875, + "learning_rate": 9.013492202558262e-07, + "loss": 0.0357, + "reward": 1.557667851448059, + "reward_std": 0.31662172079086304, + "rewards/accuracy_reward_stage2": 0.8076679110527039, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 564 + }, + { + "completion_length": 12.0, + "epoch": 0.09900122656386894, + "grad_norm": 21.39614803409284, + "kl": 0.1123046875, + "learning_rate": 9.011739968459787e-07, + "loss": 0.0008, + "reward": 1.7577136754989624, + "reward_std": 0.20902200043201447, + "rewards/accuracy_reward_stage2": 0.7733386754989624, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 565 + }, + { + "completion_length": 11.203125, + "epoch": 0.09917644997371648, + "grad_norm": 16.244712679238543, + "kl": 0.07275390625, + "learning_rate": 9.00998773436131e-07, + "loss": -0.0574, + "reward": 1.4084053039550781, + "reward_std": 0.12980613112449646, + "rewards/accuracy_reward_stage2": 0.4396553635597229, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 566 + }, + { + "completion_length": 8.796875, + "epoch": 0.09935167338356404, + "grad_norm": 12.42118208336979, + "kl": 0.029541015625, + "learning_rate": 9.008235500262835e-07, + "loss": 0.0118, + "reward": 1.6280958652496338, + "reward_std": 0.12311365455389023, + "rewards/accuracy_reward_stage2": 0.7530958652496338, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 567 + }, + { + "completion_length": 8.34375, + "epoch": 0.0995268967934116, + "grad_norm": 19.86622380429182, + "kl": 0.07421875, + "learning_rate": 9.006483266164358e-07, + "loss": 0.0296, + "reward": 1.6223640441894531, + "reward_std": 0.2801703214645386, + "rewards/accuracy_reward_stage2": 0.6223639249801636, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 568 + }, + { + "completion_length": 9.15625, + "epoch": 0.09970212020325915, + "grad_norm": 23.225550681225148, + "kl": 0.052734375, + "learning_rate": 9.004731032065883e-07, + "loss": 0.0069, + "reward": 1.6514757871627808, + "reward_std": 0.242259219288826, + "rewards/accuracy_reward_stage2": 0.6671008467674255, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 569 + }, + { + "completion_length": 9.234375, + "epoch": 0.09987734361310671, + "grad_norm": 19.919949196266696, + "kl": 0.203125, + "learning_rate": 9.002978797967408e-07, + "loss": 0.037, + "reward": 1.4913837909698486, + "reward_std": 0.23644289374351501, + "rewards/accuracy_reward_stage2": 0.6320087909698486, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 570 + }, + { + "completion_length": 13.0, + "epoch": 0.10005256702295427, + "grad_norm": 13.91521551284311, + "kl": 0.0673828125, + "learning_rate": 9.001226563868932e-07, + "loss": -0.0614, + "reward": 1.6299842596054077, + "reward_std": 0.18330855667591095, + "rewards/accuracy_reward_stage2": 0.6612342596054077, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 571 + }, + { + "completion_length": 8.03125, + "epoch": 0.10022779043280182, + "grad_norm": 19.58232929067739, + "kl": 0.0322265625, + "learning_rate": 8.999474329770457e-07, + "loss": 0.0129, + "reward": 1.6614583730697632, + "reward_std": 0.19485904276371002, + "rewards/accuracy_reward_stage2": 0.6614583134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 572 + }, + { + "completion_length": 9.25, + "epoch": 0.10040301384264938, + "grad_norm": 19.636258204988145, + "kl": 0.0478515625, + "learning_rate": 8.997722095671982e-07, + "loss": 0.0191, + "reward": 1.5884075164794922, + "reward_std": 0.31157463788986206, + "rewards/accuracy_reward_stage2": 0.5884075164794922, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 573 + }, + { + "completion_length": 8.4375, + "epoch": 0.10057823725249693, + "grad_norm": 11.776864581244816, + "kl": 0.03173828125, + "learning_rate": 8.995969861573506e-07, + "loss": 0.0126, + "reward": 1.8850898742675781, + "reward_std": 0.08686178922653198, + "rewards/accuracy_reward_stage2": 0.8850897550582886, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 574 + }, + { + "completion_length": 5.484375, + "epoch": 0.10075346066234449, + "grad_norm": 16.28159485758816, + "kl": 0.0203857421875, + "learning_rate": 8.994217627475031e-07, + "loss": -0.036, + "reward": 1.349395513534546, + "reward_std": 0.13381871581077576, + "rewards/accuracy_reward_stage2": 0.6150203943252563, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 575 + }, + { + "completion_length": 11.203125, + "epoch": 0.10092868407219205, + "grad_norm": 14.42868534069688, + "kl": 0.034423828125, + "learning_rate": 8.992465393376554e-07, + "loss": 0.0138, + "reward": 1.4621574878692627, + "reward_std": 0.13635873794555664, + "rewards/accuracy_reward_stage2": 0.7121575474739075, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 576 + }, + { + "completion_length": 15.0625, + "epoch": 0.1011039074820396, + "grad_norm": 26.063053167282682, + "kl": 0.08154296875, + "learning_rate": 8.990713159278079e-07, + "loss": -0.0032, + "reward": 1.4650766849517822, + "reward_std": 0.3235799968242645, + "rewards/accuracy_reward_stage2": 0.4807017147541046, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 577 + }, + { + "completion_length": 8.046875, + "epoch": 0.10127913089188716, + "grad_norm": 25.109409400455146, + "kl": 0.0595703125, + "learning_rate": 8.988960925179604e-07, + "loss": -0.0436, + "reward": 1.6310796737670898, + "reward_std": 0.2766677737236023, + "rewards/accuracy_reward_stage2": 0.6623297333717346, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 578 + }, + { + "completion_length": 10.265625, + "epoch": 0.10145435430173472, + "grad_norm": 17.661150618728758, + "kl": 0.2021484375, + "learning_rate": 8.987208691081128e-07, + "loss": 0.0807, + "reward": 1.1396028995513916, + "reward_std": 0.19701595604419708, + "rewards/accuracy_reward_stage2": 0.5146028995513916, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 579 + }, + { + "completion_length": 8.359375, + "epoch": 0.10162957771158226, + "grad_norm": 18.734631769812072, + "kl": 0.058837890625, + "learning_rate": 8.985456456982653e-07, + "loss": 0.0068, + "reward": 1.6147187948226929, + "reward_std": 0.2279905080795288, + "rewards/accuracy_reward_stage2": 0.6303437948226929, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 580 + }, + { + "completion_length": 9.65625, + "epoch": 0.10180480112142982, + "grad_norm": 19.790683776159472, + "kl": 0.1416015625, + "learning_rate": 8.983704222884176e-07, + "loss": -0.0755, + "reward": 1.4265341758728027, + "reward_std": 0.23208576440811157, + "rewards/accuracy_reward_stage2": 0.4734092354774475, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 581 + }, + { + "completion_length": 15.9375, + "epoch": 0.10198002453127737, + "grad_norm": 22.49576694273184, + "kl": 0.052001953125, + "learning_rate": 8.981951988785701e-07, + "loss": 0.0208, + "reward": 1.3234437704086304, + "reward_std": 0.17618276178836823, + "rewards/accuracy_reward_stage2": 0.323443740606308, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 582 + }, + { + "completion_length": 9.5625, + "epoch": 0.10215524794112493, + "grad_norm": 9.57883485832979, + "kl": 0.03076171875, + "learning_rate": 8.980199754687226e-07, + "loss": -0.0761, + "reward": 1.2532668113708496, + "reward_std": 0.10138334333896637, + "rewards/accuracy_reward_stage2": 0.28451675176620483, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 583 + }, + { + "completion_length": 9.515625, + "epoch": 0.1023304713509725, + "grad_norm": 13.225909758271314, + "kl": 0.0257568359375, + "learning_rate": 8.97844752058875e-07, + "loss": -0.0339, + "reward": 1.6815369129180908, + "reward_std": 0.1408383846282959, + "rewards/accuracy_reward_stage2": 0.6971619129180908, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 584 + }, + { + "completion_length": 9.90625, + "epoch": 0.10250569476082004, + "grad_norm": 23.03665624058185, + "kl": 0.07666015625, + "learning_rate": 8.976695286490275e-07, + "loss": -0.0451, + "reward": 1.4693918228149414, + "reward_std": 0.2618715465068817, + "rewards/accuracy_reward_stage2": 0.5006418228149414, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 585 + }, + { + "completion_length": 10.390625, + "epoch": 0.1026809181706676, + "grad_norm": 21.82393188697414, + "kl": 0.06494140625, + "learning_rate": 8.974943052391799e-07, + "loss": -0.0054, + "reward": 1.4567054510116577, + "reward_std": 0.28674811124801636, + "rewards/accuracy_reward_stage2": 0.47233039140701294, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 586 + }, + { + "completion_length": 9.0625, + "epoch": 0.10285614158051516, + "grad_norm": 18.317296690283246, + "kl": 0.28515625, + "learning_rate": 8.973190818293323e-07, + "loss": 0.0716, + "reward": 1.2668479681015015, + "reward_std": 0.20858728885650635, + "rewards/accuracy_reward_stage2": 0.4074729084968567, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 587 + }, + { + "completion_length": 16.46875, + "epoch": 0.10303136499036271, + "grad_norm": 17.62410201653559, + "kl": 0.0869140625, + "learning_rate": 8.971438584194848e-07, + "loss": -0.0094, + "reward": 1.4687905311584473, + "reward_std": 0.2499588280916214, + "rewards/accuracy_reward_stage2": 0.6094154715538025, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 588 + }, + { + "completion_length": 11.140625, + "epoch": 0.10320658840021027, + "grad_norm": 15.522166737484554, + "kl": 0.05859375, + "learning_rate": 8.969686350096372e-07, + "loss": 0.0234, + "reward": 1.6265251636505127, + "reward_std": 0.18599528074264526, + "rewards/accuracy_reward_stage2": 0.7515252232551575, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 589 + }, + { + "completion_length": 10.84375, + "epoch": 0.10338181181005783, + "grad_norm": 21.197974304069106, + "kl": 0.046142578125, + "learning_rate": 8.967934115997897e-07, + "loss": 0.0184, + "reward": 1.5723726749420166, + "reward_std": 0.2601754069328308, + "rewards/accuracy_reward_stage2": 0.5723727345466614, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 590 + }, + { + "completion_length": 14.578125, + "epoch": 0.10355703521990538, + "grad_norm": 24.743867916068385, + "kl": 0.0615234375, + "learning_rate": 8.966181881899422e-07, + "loss": -0.0044, + "reward": 1.618594765663147, + "reward_std": 0.1704864203929901, + "rewards/accuracy_reward_stage2": 0.6342197060585022, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 591 + }, + { + "completion_length": 8.515625, + "epoch": 0.10373225862975294, + "grad_norm": 22.397575395216897, + "kl": 0.0245361328125, + "learning_rate": 8.964429647800946e-07, + "loss": 0.0098, + "reward": 1.7014509439468384, + "reward_std": 0.2518218159675598, + "rewards/accuracy_reward_stage2": 0.7014508247375488, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 592 + }, + { + "completion_length": 8.609375, + "epoch": 0.10390748203960049, + "grad_norm": 17.820707192919905, + "kl": 0.031982421875, + "learning_rate": 8.962677413702471e-07, + "loss": 0.0128, + "reward": 1.5031335353851318, + "reward_std": 0.1464834064245224, + "rewards/accuracy_reward_stage2": 0.5031336545944214, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 593 + }, + { + "completion_length": 11.984375, + "epoch": 0.10408270544944805, + "grad_norm": 21.839142949786588, + "kl": 0.06396484375, + "learning_rate": 8.960925179603995e-07, + "loss": 0.0255, + "reward": 1.6463342905044556, + "reward_std": 0.25005415081977844, + "rewards/accuracy_reward_stage2": 0.6463342308998108, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 594 + }, + { + "completion_length": 12.78125, + "epoch": 0.1042579288592956, + "grad_norm": 12.309352868315235, + "kl": 0.01416015625, + "learning_rate": 8.959172945505519e-07, + "loss": 0.0057, + "reward": 1.71875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward_stage2": 0.71875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 595 + }, + { + "completion_length": 24.0, + "epoch": 0.10443315226914315, + "grad_norm": 22.16452522559772, + "kl": 0.0250244140625, + "learning_rate": 8.957420711407043e-07, + "loss": 0.01, + "reward": 1.3530032634735107, + "reward_std": 0.281308650970459, + "rewards/accuracy_reward_stage2": 0.3530033528804779, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 596 + }, + { + "completion_length": 6.65625, + "epoch": 0.10460837567899071, + "grad_norm": 18.573770659631396, + "kl": 0.0849609375, + "learning_rate": 8.955668477308567e-07, + "loss": 0.0006, + "reward": 1.5902413129806519, + "reward_std": 0.19903446733951569, + "rewards/accuracy_reward_stage2": 0.6058663725852966, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 597 + }, + { + "completion_length": 16.921875, + "epoch": 0.10478359908883828, + "grad_norm": 16.665416052004712, + "kl": 0.060302734375, + "learning_rate": 8.953916243210092e-07, + "loss": -0.0144, + "reward": 1.294586181640625, + "reward_std": 0.12051106244325638, + "rewards/accuracy_reward_stage2": 0.3102111220359802, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 598 + }, + { + "completion_length": 10.5, + "epoch": 0.10495882249868582, + "grad_norm": 19.600816799377586, + "kl": 0.12109375, + "learning_rate": 8.952164009111617e-07, + "loss": 0.0485, + "reward": 1.7329591512680054, + "reward_std": 0.24576660990715027, + "rewards/accuracy_reward_stage2": 0.7329592108726501, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 599 + }, + { + "completion_length": 13.03125, + "epoch": 0.10513404590853338, + "grad_norm": 17.31547786408126, + "kl": 0.05517578125, + "learning_rate": 8.950411775013141e-07, + "loss": 0.0221, + "reward": 1.1188859939575195, + "reward_std": 0.13924580812454224, + "rewards/accuracy_reward_stage2": 0.24388596415519714, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 600 + }, + { + "completion_length": 12.59375, + "epoch": 0.10530926931838093, + "grad_norm": 19.322536755534735, + "kl": 0.130859375, + "learning_rate": 8.948659540914666e-07, + "loss": 0.0525, + "reward": 1.4634090662002563, + "reward_std": 0.17742908000946045, + "rewards/accuracy_reward_stage2": 0.5884091258049011, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 601 + }, + { + "completion_length": 13.96875, + "epoch": 0.10548449272822849, + "grad_norm": 15.587417098129867, + "kl": 0.09326171875, + "learning_rate": 8.946907306816191e-07, + "loss": 0.0374, + "reward": 1.5436468124389648, + "reward_std": 0.12240086495876312, + "rewards/accuracy_reward_stage2": 0.6686468124389648, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 602 + }, + { + "completion_length": 8.359375, + "epoch": 0.10565971613807605, + "grad_norm": 20.599422159485304, + "kl": 0.045654296875, + "learning_rate": 8.945155072717715e-07, + "loss": 0.0183, + "reward": 1.7965465784072876, + "reward_std": 0.24006301164627075, + "rewards/accuracy_reward_stage2": 0.7965465188026428, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 603 + }, + { + "completion_length": 8.84375, + "epoch": 0.1058349395479236, + "grad_norm": 18.210715993732016, + "kl": 0.033447265625, + "learning_rate": 8.94340283861924e-07, + "loss": 0.0134, + "reward": 1.558894157409668, + "reward_std": 0.14036405086517334, + "rewards/accuracy_reward_stage2": 0.5745192170143127, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 604 + }, + { + "completion_length": 13.421875, + "epoch": 0.10601016295777116, + "grad_norm": 27.94464562579728, + "kl": 0.017333984375, + "learning_rate": 8.941650604520764e-07, + "loss": -0.0264, + "reward": 1.4473905563354492, + "reward_std": 0.21036802232265472, + "rewards/accuracy_reward_stage2": 0.5880155563354492, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 605 + }, + { + "completion_length": 11.34375, + "epoch": 0.10618538636761872, + "grad_norm": 12.57504645024913, + "kl": 0.01263427734375, + "learning_rate": 8.939898370422288e-07, + "loss": -0.1122, + "reward": 1.539158821105957, + "reward_std": 0.2022034078836441, + "rewards/accuracy_reward_stage2": 0.586033821105957, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 606 + }, + { + "completion_length": 8.15625, + "epoch": 0.10636060977746627, + "grad_norm": 15.638105890819281, + "kl": 0.10546875, + "learning_rate": 8.938146136323812e-07, + "loss": 0.0422, + "reward": 1.3107225894927979, + "reward_std": 0.18837109208106995, + "rewards/accuracy_reward_stage2": 0.4357225298881531, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 607 + }, + { + "completion_length": 9.515625, + "epoch": 0.10653583318731383, + "grad_norm": 16.783161632058864, + "kl": 0.045654296875, + "learning_rate": 8.936393902225336e-07, + "loss": -0.0033, + "reward": 1.728787899017334, + "reward_std": 0.1572287678718567, + "rewards/accuracy_reward_stage2": 0.7444128394126892, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 608 + }, + { + "completion_length": 7.296875, + "epoch": 0.10671105659716137, + "grad_norm": 19.953434160420613, + "kl": 0.023681640625, + "learning_rate": 8.934641668126861e-07, + "loss": 0.0095, + "reward": 1.3125, + "reward_std": 0.25513991713523865, + "rewards/accuracy_reward_stage2": 0.4375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 609 + }, + { + "completion_length": 12.375, + "epoch": 0.10688628000700894, + "grad_norm": 23.33191958488592, + "kl": 0.037353515625, + "learning_rate": 8.932889434028386e-07, + "loss": 0.0053, + "reward": 1.6041978597640991, + "reward_std": 0.33646097779273987, + "rewards/accuracy_reward_stage2": 0.6198228597640991, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 610 + }, + { + "completion_length": 8.546875, + "epoch": 0.1070615034168565, + "grad_norm": 14.652227746966123, + "kl": 0.01519775390625, + "learning_rate": 8.93113719992991e-07, + "loss": 0.0061, + "reward": 1.7631537914276123, + "reward_std": 0.10471543669700623, + "rewards/accuracy_reward_stage2": 0.7631537318229675, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 611 + }, + { + "completion_length": 8.390625, + "epoch": 0.10723672682670404, + "grad_norm": 19.372365428544043, + "kl": 0.0311279296875, + "learning_rate": 8.929384965831435e-07, + "loss": 0.0124, + "reward": 1.6221894025802612, + "reward_std": 0.14791935682296753, + "rewards/accuracy_reward_stage2": 0.622189462184906, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 612 + }, + { + "completion_length": 7.296875, + "epoch": 0.1074119502365516, + "grad_norm": 12.757778654176331, + "kl": 0.02880859375, + "learning_rate": 8.927632731732959e-07, + "loss": -0.0327, + "reward": 1.203125, + "reward_std": 0.16887323558330536, + "rewards/accuracy_reward_stage2": 0.21875, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 613 + }, + { + "completion_length": 9.234375, + "epoch": 0.10758717364639916, + "grad_norm": 15.82738195868456, + "kl": 0.0301513671875, + "learning_rate": 8.925880497634484e-07, + "loss": 0.0121, + "reward": 1.7813446521759033, + "reward_std": 0.16648373007774353, + "rewards/accuracy_reward_stage2": 0.9063446521759033, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 614 + }, + { + "completion_length": 9.71875, + "epoch": 0.10776239705624671, + "grad_norm": 21.46197875618223, + "kl": 0.030517578125, + "learning_rate": 8.924128263536009e-07, + "loss": -0.1204, + "reward": 1.430158019065857, + "reward_std": 0.289289653301239, + "rewards/accuracy_reward_stage2": 0.47703301906585693, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 615 + }, + { + "completion_length": 9.828125, + "epoch": 0.10793762046609427, + "grad_norm": 24.27795896535574, + "kl": 0.0458984375, + "learning_rate": 8.922376029437532e-07, + "loss": -0.0258, + "reward": 1.624479055404663, + "reward_std": 0.21070247888565063, + "rewards/accuracy_reward_stage2": 0.6401039958000183, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 616 + }, + { + "completion_length": 9.109375, + "epoch": 0.10811284387594182, + "grad_norm": 16.643265414166162, + "kl": 0.1318359375, + "learning_rate": 8.920623795339057e-07, + "loss": -0.0343, + "reward": 1.634692668914795, + "reward_std": 0.15342967212200165, + "rewards/accuracy_reward_stage2": 0.7909427285194397, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 617 + }, + { + "completion_length": 14.703125, + "epoch": 0.10828806728578938, + "grad_norm": 21.05809926024803, + "kl": 0.061279296875, + "learning_rate": 8.918871561240582e-07, + "loss": 0.0245, + "reward": 1.3503414392471313, + "reward_std": 0.15436102449893951, + "rewards/accuracy_reward_stage2": 0.35034140944480896, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 618 + }, + { + "completion_length": 6.453125, + "epoch": 0.10846329069563694, + "grad_norm": 18.701299422305127, + "kl": 0.03173828125, + "learning_rate": 8.917119327142105e-07, + "loss": 0.0127, + "reward": 1.5458629131317139, + "reward_std": 0.19350674748420715, + "rewards/accuracy_reward_stage2": 0.5458628535270691, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 619 + }, + { + "completion_length": 9.9375, + "epoch": 0.10863851410548449, + "grad_norm": 19.58448716432124, + "kl": 0.0159912109375, + "learning_rate": 8.91536709304363e-07, + "loss": -0.0378, + "reward": 1.4166667461395264, + "reward_std": 0.2630414366722107, + "rewards/accuracy_reward_stage2": 0.5572916865348816, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 620 + }, + { + "completion_length": 10.640625, + "epoch": 0.10881373751533205, + "grad_norm": 20.55948510580831, + "kl": 0.162109375, + "learning_rate": 8.913614858945154e-07, + "loss": 0.0649, + "reward": 1.4507501125335693, + "reward_std": 0.12407205998897552, + "rewards/accuracy_reward_stage2": 0.5757502317428589, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 621 + }, + { + "completion_length": 9.578125, + "epoch": 0.10898896092517961, + "grad_norm": 22.69764250164246, + "kl": 0.1083984375, + "learning_rate": 8.911862624846679e-07, + "loss": 0.0432, + "reward": 1.4806079864501953, + "reward_std": 0.16957543790340424, + "rewards/accuracy_reward_stage2": 0.6056080460548401, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 622 + }, + { + "completion_length": 20.53125, + "epoch": 0.10916418433502716, + "grad_norm": 18.42436068010624, + "kl": 0.034423828125, + "learning_rate": 8.910110390748204e-07, + "loss": 0.0138, + "reward": 1.778219223022461, + "reward_std": 0.19961267709732056, + "rewards/accuracy_reward_stage2": 0.7782192230224609, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 623 + }, + { + "completion_length": 12.84375, + "epoch": 0.10933940774487472, + "grad_norm": 29.281130200071576, + "kl": 0.263671875, + "learning_rate": 8.908358156649728e-07, + "loss": 0.0613, + "reward": 1.4601194858551025, + "reward_std": 0.11638569831848145, + "rewards/accuracy_reward_stage2": 0.6007444858551025, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 624 + }, + { + "completion_length": 9.890625, + "epoch": 0.10951463115472228, + "grad_norm": 24.147239619968612, + "kl": 0.0712890625, + "learning_rate": 8.906605922551253e-07, + "loss": 0.0285, + "reward": 1.625582218170166, + "reward_std": 0.15535606443881989, + "rewards/accuracy_reward_stage2": 0.6255822777748108, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 625 + }, + { + "completion_length": 9.84375, + "epoch": 0.10968985456456982, + "grad_norm": 16.839339018289216, + "kl": 0.091796875, + "learning_rate": 8.904853688452777e-07, + "loss": -0.0074, + "reward": 1.8911480903625488, + "reward_std": 0.1472635567188263, + "rewards/accuracy_reward_stage2": 0.9067729711532593, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 626 + }, + { + "completion_length": 12.75, + "epoch": 0.10986507797441739, + "grad_norm": 16.885588058948453, + "kl": 0.06689453125, + "learning_rate": 8.903101454354301e-07, + "loss": 0.0269, + "reward": 1.6324387788772583, + "reward_std": 0.11915861815214157, + "rewards/accuracy_reward_stage2": 0.6324387788772583, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 627 + }, + { + "completion_length": 11.296875, + "epoch": 0.11004030138426493, + "grad_norm": 19.57572697577605, + "kl": 0.05419921875, + "learning_rate": 8.901349220255826e-07, + "loss": 0.0217, + "reward": 1.3621962070465088, + "reward_std": 0.26940637826919556, + "rewards/accuracy_reward_stage2": 0.3621961772441864, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 628 + }, + { + "completion_length": 7.296875, + "epoch": 0.1102155247941125, + "grad_norm": 15.528399734860603, + "kl": 0.03955078125, + "learning_rate": 8.89959698615735e-07, + "loss": 0.0158, + "reward": 1.6868137121200562, + "reward_std": 0.10869477689266205, + "rewards/accuracy_reward_stage2": 0.6868136525154114, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 629 + }, + { + "completion_length": 9.765625, + "epoch": 0.11039074820396005, + "grad_norm": 19.224322095125764, + "kl": 0.048583984375, + "learning_rate": 8.897844752058875e-07, + "loss": 0.0194, + "reward": 1.6562113761901855, + "reward_std": 0.18549120426177979, + "rewards/accuracy_reward_stage2": 0.6562113761901855, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 630 + }, + { + "completion_length": 6.1875, + "epoch": 0.1105659716138076, + "grad_norm": 20.116365737073824, + "kl": 0.060791015625, + "learning_rate": 8.8960925179604e-07, + "loss": 0.0243, + "reward": 1.512540578842163, + "reward_std": 0.16334936022758484, + "rewards/accuracy_reward_stage2": 0.5125405788421631, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 631 + }, + { + "completion_length": 8.84375, + "epoch": 0.11074119502365516, + "grad_norm": 16.682655445127725, + "kl": 0.1279296875, + "learning_rate": 8.894340283861923e-07, + "loss": 0.0112, + "reward": 1.474839210510254, + "reward_std": 0.2113642543554306, + "rewards/accuracy_reward_stage2": 0.6154641509056091, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 632 + }, + { + "completion_length": 14.390625, + "epoch": 0.11091641843350272, + "grad_norm": 17.490175729379096, + "kl": 0.0233154296875, + "learning_rate": 8.892588049763448e-07, + "loss": 0.0094, + "reward": 1.9157812595367432, + "reward_std": 0.12778238952159882, + "rewards/accuracy_reward_stage2": 0.9157813191413879, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 633 + }, + { + "completion_length": 16.5625, + "epoch": 0.11109164184335027, + "grad_norm": 16.11217323057676, + "kl": 0.06396484375, + "learning_rate": 8.890835815664973e-07, + "loss": 0.0255, + "reward": 1.3033857345581055, + "reward_std": 0.15057168900966644, + "rewards/accuracy_reward_stage2": 0.42838579416275024, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 634 + }, + { + "completion_length": 8.015625, + "epoch": 0.11126686525319783, + "grad_norm": 27.436005651707575, + "kl": 0.1728515625, + "learning_rate": 8.889083581566496e-07, + "loss": -0.0039, + "reward": 1.6807503700256348, + "reward_std": 0.30542638897895813, + "rewards/accuracy_reward_stage2": 0.8370004296302795, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 635 + }, + { + "completion_length": 9.703125, + "epoch": 0.11144208866304538, + "grad_norm": 26.94592284911264, + "kl": 0.07421875, + "learning_rate": 8.887331347468021e-07, + "loss": 0.0297, + "reward": 1.3588056564331055, + "reward_std": 0.3892172873020172, + "rewards/accuracy_reward_stage2": 0.6088056564331055, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 636 + }, + { + "completion_length": 9.21875, + "epoch": 0.11161731207289294, + "grad_norm": 18.19530983729694, + "kl": 0.036376953125, + "learning_rate": 8.885579113369545e-07, + "loss": -0.0021, + "reward": 1.6655867099761963, + "reward_std": 0.25258275866508484, + "rewards/accuracy_reward_stage2": 0.6812116503715515, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 637 + }, + { + "completion_length": 12.0625, + "epoch": 0.1117925354827405, + "grad_norm": 20.711330390595382, + "kl": 0.0712890625, + "learning_rate": 8.88382687927107e-07, + "loss": -0.0048, + "reward": 1.4815778732299805, + "reward_std": 0.24052470922470093, + "rewards/accuracy_reward_stage2": 0.4972028136253357, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 638 + }, + { + "completion_length": 10.1875, + "epoch": 0.11196775889258805, + "grad_norm": 20.415117056750283, + "kl": 0.043212890625, + "learning_rate": 8.882074645172595e-07, + "loss": -0.0143, + "reward": 1.4225728511810303, + "reward_std": 0.20394927263259888, + "rewards/accuracy_reward_stage2": 0.4381977319717407, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 639 + }, + { + "completion_length": 7.5, + "epoch": 0.1121429823024356, + "grad_norm": 18.065810871206835, + "kl": 0.0400390625, + "learning_rate": 8.880322411074119e-07, + "loss": 0.016, + "reward": 1.529618501663208, + "reward_std": 0.19352030754089355, + "rewards/accuracy_reward_stage2": 0.5296184420585632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 640 + }, + { + "completion_length": 7.65625, + "epoch": 0.11231820571228317, + "grad_norm": 17.331584185274014, + "kl": 0.0203857421875, + "learning_rate": 8.878570176975644e-07, + "loss": 0.0082, + "reward": 1.4235129356384277, + "reward_std": 0.2608225345611572, + "rewards/accuracy_reward_stage2": 0.42351287603378296, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 641 + }, + { + "completion_length": 7.953125, + "epoch": 0.11249342912213071, + "grad_norm": 17.852702136003707, + "kl": 0.30078125, + "learning_rate": 8.876817942877169e-07, + "loss": 0.12, + "reward": 1.2848070859909058, + "reward_std": 0.1477287858724594, + "rewards/accuracy_reward_stage2": 0.534807026386261, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 642 + }, + { + "completion_length": 12.03125, + "epoch": 0.11266865253197828, + "grad_norm": 15.507208538142622, + "kl": 0.015869140625, + "learning_rate": 8.875065708778693e-07, + "loss": 0.0063, + "reward": 1.7227667570114136, + "reward_std": 0.2567778527736664, + "rewards/accuracy_reward_stage2": 0.7227667570114136, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 643 + }, + { + "completion_length": 16.921875, + "epoch": 0.11284387594182582, + "grad_norm": 23.081114868964416, + "kl": 0.2216796875, + "learning_rate": 8.873313474680218e-07, + "loss": 0.0887, + "reward": 1.27731454372406, + "reward_std": 0.20373868942260742, + "rewards/accuracy_reward_stage2": 0.5273144841194153, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 644 + }, + { + "completion_length": 5.828125, + "epoch": 0.11301909935167338, + "grad_norm": 16.313428020485347, + "kl": 0.06103515625, + "learning_rate": 8.87156124058174e-07, + "loss": -0.064, + "reward": 1.6435449123382568, + "reward_std": 0.19214250147342682, + "rewards/accuracy_reward_stage2": 0.6747948527336121, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 645 + }, + { + "completion_length": 29.6875, + "epoch": 0.11319432276152094, + "grad_norm": 19.13977602308894, + "kl": 0.031494140625, + "learning_rate": 8.869809006483265e-07, + "loss": 0.0126, + "reward": 1.8208966255187988, + "reward_std": 0.14088435471057892, + "rewards/accuracy_reward_stage2": 0.8208966255187988, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 646 + }, + { + "completion_length": 8.078125, + "epoch": 0.11336954617136849, + "grad_norm": 16.336114992014164, + "kl": 0.1513671875, + "learning_rate": 8.86805677238479e-07, + "loss": 0.0608, + "reward": 1.5519046783447266, + "reward_std": 0.1638176441192627, + "rewards/accuracy_reward_stage2": 0.5519046187400818, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 647 + }, + { + "completion_length": 10.328125, + "epoch": 0.11354476958121605, + "grad_norm": 19.192940552680728, + "kl": 0.11328125, + "learning_rate": 8.866304538286314e-07, + "loss": 0.0453, + "reward": 1.5758169889450073, + "reward_std": 0.19315147399902344, + "rewards/accuracy_reward_stage2": 0.5758169293403625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 648 + }, + { + "completion_length": 10.109375, + "epoch": 0.11371999299106361, + "grad_norm": 24.65646749415431, + "kl": 0.1298828125, + "learning_rate": 8.864552304187839e-07, + "loss": 0.0521, + "reward": 1.5815494060516357, + "reward_std": 0.2213352620601654, + "rewards/accuracy_reward_stage2": 0.5815494060516357, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 649 + }, + { + "completion_length": 7.46875, + "epoch": 0.11389521640091116, + "grad_norm": 17.447375675037648, + "kl": 0.0751953125, + "learning_rate": 8.862800070089363e-07, + "loss": -0.0142, + "reward": 1.478639841079712, + "reward_std": 0.19840413331985474, + "rewards/accuracy_reward_stage2": 0.4942649006843567, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 650 + }, + { + "completion_length": 9.953125, + "epoch": 0.11407043981075872, + "grad_norm": 18.03776475428187, + "kl": 0.1435546875, + "learning_rate": 8.861047835990888e-07, + "loss": 0.0575, + "reward": 1.3064175844192505, + "reward_std": 0.19495750963687897, + "rewards/accuracy_reward_stage2": 0.4314176142215729, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 651 + }, + { + "completion_length": 10.125, + "epoch": 0.11424566322060627, + "grad_norm": 14.52019538206722, + "kl": 0.0244140625, + "learning_rate": 8.859295601892413e-07, + "loss": -0.032, + "reward": 1.5017303228378296, + "reward_std": 0.08835500478744507, + "rewards/accuracy_reward_stage2": 0.5173553228378296, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 652 + }, + { + "completion_length": 9.234375, + "epoch": 0.11442088663045383, + "grad_norm": 20.38124668947182, + "kl": 0.05224609375, + "learning_rate": 8.857543367793937e-07, + "loss": 0.0209, + "reward": 1.5765533447265625, + "reward_std": 0.22117437422275543, + "rewards/accuracy_reward_stage2": 0.5765534043312073, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 653 + }, + { + "completion_length": 8.390625, + "epoch": 0.11459611004030139, + "grad_norm": 16.921443968716325, + "kl": 0.12451171875, + "learning_rate": 8.855791133695462e-07, + "loss": 0.0497, + "reward": 1.603499174118042, + "reward_std": 0.20851662755012512, + "rewards/accuracy_reward_stage2": 0.603499174118042, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 654 + }, + { + "completion_length": 9.859375, + "epoch": 0.11477133345014894, + "grad_norm": 18.83286142598205, + "kl": 0.0693359375, + "learning_rate": 8.854038899596987e-07, + "loss": 0.0278, + "reward": 1.5485821962356567, + "reward_std": 0.2668875753879547, + "rewards/accuracy_reward_stage2": 0.5485821962356567, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 655 + }, + { + "completion_length": 10.75, + "epoch": 0.1149465568599965, + "grad_norm": 20.846478318101642, + "kl": 0.06005859375, + "learning_rate": 8.85228666549851e-07, + "loss": 0.024, + "reward": 1.5447120666503906, + "reward_std": 0.1853538602590561, + "rewards/accuracy_reward_stage2": 0.5447121262550354, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 656 + }, + { + "completion_length": 18.71875, + "epoch": 0.11512178026984406, + "grad_norm": 17.393770278359163, + "kl": 0.01513671875, + "learning_rate": 8.850534431400035e-07, + "loss": 0.0061, + "reward": 1.5744693279266357, + "reward_std": 0.1881437748670578, + "rewards/accuracy_reward_stage2": 0.5744693279266357, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 657 + }, + { + "completion_length": 9.953125, + "epoch": 0.1152970036796916, + "grad_norm": 14.001795451142137, + "kl": 0.11083984375, + "learning_rate": 8.848782197301558e-07, + "loss": 0.0444, + "reward": 1.5067996978759766, + "reward_std": 0.08500517159700394, + "rewards/accuracy_reward_stage2": 0.6317996978759766, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 658 + }, + { + "completion_length": 11.25, + "epoch": 0.11547222708953916, + "grad_norm": 20.467399329823657, + "kl": 0.06005859375, + "learning_rate": 8.847029963203083e-07, + "loss": -0.0189, + "reward": 1.3823635578155518, + "reward_std": 0.21266907453536987, + "rewards/accuracy_reward_stage2": 0.39798852801322937, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 659 + }, + { + "completion_length": 8.96875, + "epoch": 0.11564745049938671, + "grad_norm": 21.6979069287139, + "kl": 0.05224609375, + "learning_rate": 8.845277729104608e-07, + "loss": 0.0209, + "reward": 1.4927724599838257, + "reward_std": 0.2009587436914444, + "rewards/accuracy_reward_stage2": 0.4927724003791809, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 660 + }, + { + "completion_length": 11.03125, + "epoch": 0.11582267390923427, + "grad_norm": 20.26879174400301, + "kl": 0.10107421875, + "learning_rate": 8.843525495006132e-07, + "loss": 0.0405, + "reward": 1.4508693218231201, + "reward_std": 0.18329568207263947, + "rewards/accuracy_reward_stage2": 0.5758693218231201, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 661 + }, + { + "completion_length": 11.859375, + "epoch": 0.11599789731908183, + "grad_norm": 20.977372896175574, + "kl": 0.06640625, + "learning_rate": 8.841773260907657e-07, + "loss": -0.0176, + "reward": 1.5426456928253174, + "reward_std": 0.2179446965456009, + "rewards/accuracy_reward_stage2": 0.5582706928253174, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 662 + }, + { + "completion_length": 9.359375, + "epoch": 0.11617312072892938, + "grad_norm": 32.37126869962416, + "kl": 0.1962890625, + "learning_rate": 8.840021026809182e-07, + "loss": 0.0789, + "reward": 1.342227578163147, + "reward_std": 0.20740927755832672, + "rewards/accuracy_reward_stage2": 0.4672274887561798, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 663 + }, + { + "completion_length": 10.921875, + "epoch": 0.11634834413877694, + "grad_norm": 20.284582129799464, + "kl": 0.059814453125, + "learning_rate": 8.838268792710706e-07, + "loss": 0.0239, + "reward": 1.6112689971923828, + "reward_std": 0.2986975312232971, + "rewards/accuracy_reward_stage2": 0.611268937587738, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 664 + }, + { + "completion_length": 15.109375, + "epoch": 0.1165235675486245, + "grad_norm": 17.454920372904077, + "kl": 0.05859375, + "learning_rate": 8.83651655861223e-07, + "loss": 0.0068, + "reward": 1.364925742149353, + "reward_std": 0.24069397151470184, + "rewards/accuracy_reward_stage2": 0.3805507719516754, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 665 + }, + { + "completion_length": 10.09375, + "epoch": 0.11669879095847205, + "grad_norm": 22.190376227479046, + "kl": 0.064453125, + "learning_rate": 8.834764324513754e-07, + "loss": 0.0258, + "reward": 1.6114730834960938, + "reward_std": 0.22121301293373108, + "rewards/accuracy_reward_stage2": 0.6114731431007385, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 666 + }, + { + "completion_length": 12.09375, + "epoch": 0.11687401436831961, + "grad_norm": 24.39844815882228, + "kl": 0.0498046875, + "learning_rate": 8.833012090415279e-07, + "loss": 0.0199, + "reward": 1.733590841293335, + "reward_std": 0.25560033321380615, + "rewards/accuracy_reward_stage2": 0.7335907220840454, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 667 + }, + { + "completion_length": 8.90625, + "epoch": 0.11704923777816717, + "grad_norm": 20.17060417802813, + "kl": 0.06298828125, + "learning_rate": 8.831259856316804e-07, + "loss": 0.0252, + "reward": 1.3896734714508057, + "reward_std": 0.1829843521118164, + "rewards/accuracy_reward_stage2": 0.3896734118461609, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 668 + }, + { + "completion_length": 8.1875, + "epoch": 0.11722446118801472, + "grad_norm": 19.397346877665314, + "kl": 0.0361328125, + "learning_rate": 8.829507622218328e-07, + "loss": -0.0298, + "reward": 1.5043643712997437, + "reward_std": 0.15379469096660614, + "rewards/accuracy_reward_stage2": 0.5199893712997437, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 669 + }, + { + "completion_length": 10.453125, + "epoch": 0.11739968459786228, + "grad_norm": 14.634540399364296, + "kl": 0.06640625, + "learning_rate": 8.827755388119852e-07, + "loss": 0.0265, + "reward": 1.3958325386047363, + "reward_std": 0.1272956132888794, + "rewards/accuracy_reward_stage2": 0.39583244919776917, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 670 + }, + { + "completion_length": 12.5625, + "epoch": 0.11757490800770982, + "grad_norm": 19.871858144020802, + "kl": 0.059814453125, + "learning_rate": 8.826003154021377e-07, + "loss": -0.0274, + "reward": 1.6321520805358887, + "reward_std": 0.21269144117832184, + "rewards/accuracy_reward_stage2": 0.6634020209312439, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 671 + }, + { + "completion_length": 7.171875, + "epoch": 0.11775013141755739, + "grad_norm": 17.158601344446673, + "kl": 0.0830078125, + "learning_rate": 8.824250919922901e-07, + "loss": 0.0333, + "reward": 1.6193318367004395, + "reward_std": 0.17965298891067505, + "rewards/accuracy_reward_stage2": 0.6193318963050842, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 672 + }, + { + "completion_length": 16.296875, + "epoch": 0.11792535482740495, + "grad_norm": 19.247189611936488, + "kl": 0.0308837890625, + "learning_rate": 8.822498685824426e-07, + "loss": 0.0124, + "reward": 1.6456239223480225, + "reward_std": 0.11864829808473587, + "rewards/accuracy_reward_stage2": 0.6456239223480225, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 673 + }, + { + "completion_length": 10.296875, + "epoch": 0.1181005782372525, + "grad_norm": 16.813617413768394, + "kl": 0.01214599609375, + "learning_rate": 8.82074645172595e-07, + "loss": 0.0049, + "reward": 1.1939867734909058, + "reward_std": 0.16266238689422607, + "rewards/accuracy_reward_stage2": 0.19398674368858337, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 674 + }, + { + "completion_length": 14.578125, + "epoch": 0.11827580164710005, + "grad_norm": 22.171691413512914, + "kl": 0.2373046875, + "learning_rate": 8.818994217627474e-07, + "loss": 0.0948, + "reward": 1.4289348125457764, + "reward_std": 0.21636059880256653, + "rewards/accuracy_reward_stage2": 0.5539346933364868, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 675 + }, + { + "completion_length": 15.3125, + "epoch": 0.11845102505694761, + "grad_norm": 16.51389196229471, + "kl": 0.060791015625, + "learning_rate": 8.817241983528999e-07, + "loss": -0.0046, + "reward": 1.4913157224655151, + "reward_std": 0.1663581132888794, + "rewards/accuracy_reward_stage2": 0.6163157224655151, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 676 + }, + { + "completion_length": 8.984375, + "epoch": 0.11862624846679516, + "grad_norm": 17.816697430476907, + "kl": 0.19921875, + "learning_rate": 8.815489749430523e-07, + "loss": 0.0795, + "reward": 1.4166667461395264, + "reward_std": 0.2742938995361328, + "rewards/accuracy_reward_stage2": 0.6666666865348816, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 677 + }, + { + "completion_length": 9.09375, + "epoch": 0.11880147187664272, + "grad_norm": 22.066474738023967, + "kl": 0.037841796875, + "learning_rate": 8.813737515332048e-07, + "loss": 0.0152, + "reward": 1.782374382019043, + "reward_std": 0.2402632236480713, + "rewards/accuracy_reward_stage2": 0.7823742628097534, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 678 + }, + { + "completion_length": 11.796875, + "epoch": 0.11897669528649027, + "grad_norm": 21.59322311043083, + "kl": 0.072265625, + "learning_rate": 8.811985281233573e-07, + "loss": -0.0513, + "reward": 1.4617195129394531, + "reward_std": 0.2623371481895447, + "rewards/accuracy_reward_stage2": 0.4929695129394531, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 679 + }, + { + "completion_length": 19.984375, + "epoch": 0.11915191869633783, + "grad_norm": 21.791897157131835, + "kl": 0.263671875, + "learning_rate": 8.810233047135097e-07, + "loss": 0.0271, + "reward": 1.276839017868042, + "reward_std": 0.2978595793247223, + "rewards/accuracy_reward_stage2": 0.433089017868042, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 680 + }, + { + "completion_length": 15.28125, + "epoch": 0.11932714210618539, + "grad_norm": 20.228697346514785, + "kl": 0.05859375, + "learning_rate": 8.808480813036622e-07, + "loss": -0.0547, + "reward": 1.6011157035827637, + "reward_std": 0.2627042233943939, + "rewards/accuracy_reward_stage2": 0.6323657035827637, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 681 + }, + { + "completion_length": 15.8125, + "epoch": 0.11950236551603294, + "grad_norm": 14.047270245441503, + "kl": 0.1142578125, + "learning_rate": 8.806728578938146e-07, + "loss": 0.0361, + "reward": 1.3606376647949219, + "reward_std": 0.14999261498451233, + "rewards/accuracy_reward_stage2": 0.4856376647949219, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 682 + }, + { + "completion_length": 9.5, + "epoch": 0.1196775889258805, + "grad_norm": 25.027738822698772, + "kl": 0.064453125, + "learning_rate": 8.80497634483967e-07, + "loss": 0.0257, + "reward": 1.554718255996704, + "reward_std": 0.17526723444461823, + "rewards/accuracy_reward_stage2": 0.5547182559967041, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 683 + }, + { + "completion_length": 18.984375, + "epoch": 0.11985281233572806, + "grad_norm": 21.291836763245577, + "kl": 0.0341796875, + "learning_rate": 8.803224110741195e-07, + "loss": 0.0136, + "reward": 1.7058907747268677, + "reward_std": 0.16722923517227173, + "rewards/accuracy_reward_stage2": 0.7058907747268677, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 684 + }, + { + "completion_length": 9.9375, + "epoch": 0.1200280357455756, + "grad_norm": 10.990732505324662, + "kl": 0.016845703125, + "learning_rate": 8.801471876642718e-07, + "loss": 0.0067, + "reward": 1.7997299432754517, + "reward_std": 0.08413556218147278, + "rewards/accuracy_reward_stage2": 0.7997298240661621, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 685 + }, + { + "completion_length": 6.921875, + "epoch": 0.12020325915542317, + "grad_norm": 11.626939281046583, + "kl": 0.050537109375, + "learning_rate": 8.799719642544243e-07, + "loss": -0.024, + "reward": 1.4167120456695557, + "reward_std": 0.14424577355384827, + "rewards/accuracy_reward_stage2": 0.4323369860649109, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 686 + }, + { + "completion_length": 9.53125, + "epoch": 0.12037848256527071, + "grad_norm": 23.820420447554763, + "kl": 0.0208740234375, + "learning_rate": 8.797967408445768e-07, + "loss": 0.0084, + "reward": 1.7314132452011108, + "reward_std": 0.2912830412387848, + "rewards/accuracy_reward_stage2": 0.7314131855964661, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 687 + }, + { + "completion_length": 8.421875, + "epoch": 0.12055370597511827, + "grad_norm": 25.086453810433557, + "kl": 0.17578125, + "learning_rate": 8.796215174347292e-07, + "loss": -0.0079, + "reward": 1.603489637374878, + "reward_std": 0.16268977522850037, + "rewards/accuracy_reward_stage2": 0.6347395181655884, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 688 + }, + { + "completion_length": 18.28125, + "epoch": 0.12072892938496584, + "grad_norm": 22.8929620555088, + "kl": 0.224609375, + "learning_rate": 8.794462940248817e-07, + "loss": 0.0898, + "reward": 1.3817732334136963, + "reward_std": 0.3009188175201416, + "rewards/accuracy_reward_stage2": 0.5067732334136963, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 689 + }, + { + "completion_length": 7.46875, + "epoch": 0.12090415279481338, + "grad_norm": 14.703281851075689, + "kl": 0.056884765625, + "learning_rate": 8.792710706150341e-07, + "loss": -0.0184, + "reward": 1.5921062231063843, + "reward_std": 0.19487033784389496, + "rewards/accuracy_reward_stage2": 0.6077312231063843, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 690 + }, + { + "completion_length": 9.34375, + "epoch": 0.12107937620466094, + "grad_norm": 20.135150925561963, + "kl": 0.0712890625, + "learning_rate": 8.790958472051866e-07, + "loss": 0.0285, + "reward": 1.5368764400482178, + "reward_std": 0.29632821679115295, + "rewards/accuracy_reward_stage2": 0.5368764996528625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 691 + }, + { + "completion_length": 19.84375, + "epoch": 0.1212545996145085, + "grad_norm": 23.68837901261757, + "kl": 0.03955078125, + "learning_rate": 8.789206237953391e-07, + "loss": 0.0158, + "reward": 1.5159263610839844, + "reward_std": 0.1295163780450821, + "rewards/accuracy_reward_stage2": 0.5159264206886292, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 692 + }, + { + "completion_length": 7.828125, + "epoch": 0.12142982302435605, + "grad_norm": 20.499563576511616, + "kl": 0.078125, + "learning_rate": 8.787454003854915e-07, + "loss": -0.0018, + "reward": 1.6112077236175537, + "reward_std": 0.27959445118904114, + "rewards/accuracy_reward_stage2": 0.6268327236175537, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 693 + }, + { + "completion_length": 9.171875, + "epoch": 0.12160504643420361, + "grad_norm": 15.815312044878585, + "kl": 0.11474609375, + "learning_rate": 8.78570176975644e-07, + "loss": 0.0458, + "reward": 1.4532971382141113, + "reward_std": 0.22508825361728668, + "rewards/accuracy_reward_stage2": 0.4532972276210785, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 694 + }, + { + "completion_length": 14.328125, + "epoch": 0.12178026984405116, + "grad_norm": 21.787091332554773, + "kl": 0.051513671875, + "learning_rate": 8.783949535657964e-07, + "loss": -0.022, + "reward": 1.662217617034912, + "reward_std": 0.21388936042785645, + "rewards/accuracy_reward_stage2": 0.6778424978256226, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 695 + }, + { + "completion_length": 9.140625, + "epoch": 0.12195549325389872, + "grad_norm": 24.559787032583806, + "kl": 0.08447265625, + "learning_rate": 8.782197301559487e-07, + "loss": -0.0419, + "reward": 1.6562061309814453, + "reward_std": 0.3602675795555115, + "rewards/accuracy_reward_stage2": 0.6874561905860901, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 696 + }, + { + "completion_length": 10.109375, + "epoch": 0.12213071666374628, + "grad_norm": 20.95511007029554, + "kl": 0.0228271484375, + "learning_rate": 8.780445067461012e-07, + "loss": 0.0091, + "reward": 1.3176965713500977, + "reward_std": 0.13378259539604187, + "rewards/accuracy_reward_stage2": 0.5676966309547424, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 697 + }, + { + "completion_length": 12.0625, + "epoch": 0.12230594007359383, + "grad_norm": 21.98721888405554, + "kl": 0.2421875, + "learning_rate": 8.778692833362536e-07, + "loss": 0.0969, + "reward": 1.233135461807251, + "reward_std": 0.12779664993286133, + "rewards/accuracy_reward_stage2": 0.483135461807251, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 698 + }, + { + "completion_length": 21.8125, + "epoch": 0.12248116348344139, + "grad_norm": 16.042916838294357, + "kl": 0.020751953125, + "learning_rate": 8.776940599264061e-07, + "loss": 0.0083, + "reward": 1.2561603784561157, + "reward_std": 0.10535544157028198, + "rewards/accuracy_reward_stage2": 0.3811603784561157, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 699 + }, + { + "completion_length": 7.5, + "epoch": 0.12265638689328895, + "grad_norm": 6.820091036219988, + "kl": 0.0162353515625, + "learning_rate": 8.775188365165586e-07, + "loss": -0.0377, + "reward": 1.359375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward_stage2": 0.375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 700 + }, + { + "completion_length": 19.34375, + "epoch": 0.1228316103031365, + "grad_norm": 26.132183313864143, + "kl": 0.10107421875, + "learning_rate": 8.77343613106711e-07, + "loss": -0.0037, + "reward": 1.4717097282409668, + "reward_std": 0.33925962448120117, + "rewards/accuracy_reward_stage2": 0.612334668636322, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 701 + }, + { + "completion_length": 12.765625, + "epoch": 0.12300683371298406, + "grad_norm": 23.06283365798747, + "kl": 0.36328125, + "learning_rate": 8.771683896968635e-07, + "loss": 0.1453, + "reward": 1.3122766017913818, + "reward_std": 0.2527972161769867, + "rewards/accuracy_reward_stage2": 0.5622766017913818, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 702 + }, + { + "completion_length": 14.0625, + "epoch": 0.1231820571228316, + "grad_norm": 21.00144003227733, + "kl": 0.051513671875, + "learning_rate": 8.76993166287016e-07, + "loss": 0.0206, + "reward": 1.542011022567749, + "reward_std": 0.21726180613040924, + "rewards/accuracy_reward_stage2": 0.542011022567749, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 703 + }, + { + "completion_length": 16.015625, + "epoch": 0.12335728053267916, + "grad_norm": 19.729022686331653, + "kl": 0.03564453125, + "learning_rate": 8.768179428771684e-07, + "loss": 0.0142, + "reward": 1.6703829765319824, + "reward_std": 0.14785850048065186, + "rewards/accuracy_reward_stage2": 0.6703829169273376, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 704 + }, + { + "completion_length": 12.265625, + "epoch": 0.12353250394252673, + "grad_norm": 16.84608154286035, + "kl": 0.068359375, + "learning_rate": 8.766427194673208e-07, + "loss": -0.0499, + "reward": 1.5830492973327637, + "reward_std": 0.22908511757850647, + "rewards/accuracy_reward_stage2": 0.6142992973327637, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 705 + }, + { + "completion_length": 9.578125, + "epoch": 0.12370772735237427, + "grad_norm": 17.416974869118555, + "kl": 0.17578125, + "learning_rate": 8.764674960574732e-07, + "loss": 0.0703, + "reward": 1.3125, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward_stage2": 0.4375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 706 + }, + { + "completion_length": 11.453125, + "epoch": 0.12388295076222183, + "grad_norm": 19.79245621798528, + "kl": 0.03173828125, + "learning_rate": 8.762922726476257e-07, + "loss": -0.0604, + "reward": 1.5851845741271973, + "reward_std": 0.2783687710762024, + "rewards/accuracy_reward_stage2": 0.6164345741271973, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 707 + }, + { + "completion_length": 16.046875, + "epoch": 0.1240581741720694, + "grad_norm": 17.210066397566948, + "kl": 0.0751953125, + "learning_rate": 8.761170492377782e-07, + "loss": -0.0125, + "reward": 1.521512508392334, + "reward_std": 0.23234155774116516, + "rewards/accuracy_reward_stage2": 0.537137508392334, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 708 + }, + { + "completion_length": 13.171875, + "epoch": 0.12423339758191694, + "grad_norm": 37.86191898225597, + "kl": 0.283203125, + "learning_rate": 8.759418258279305e-07, + "loss": 0.1262, + "reward": 1.515625, + "reward_std": 0.17358146607875824, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 709 + }, + { + "completion_length": 9.078125, + "epoch": 0.1244086209917645, + "grad_norm": 20.46731923834019, + "kl": 0.02783203125, + "learning_rate": 8.75766602418083e-07, + "loss": 0.0112, + "reward": 1.5376777648925781, + "reward_std": 0.19346807897090912, + "rewards/accuracy_reward_stage2": 0.6626777648925781, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 710 + }, + { + "completion_length": 7.734375, + "epoch": 0.12458384440161206, + "grad_norm": 27.163653480328417, + "kl": 0.054443359375, + "learning_rate": 8.755913790082355e-07, + "loss": -0.0225, + "reward": 1.4373842477798462, + "reward_std": 0.24737679958343506, + "rewards/accuracy_reward_stage2": 0.4530092477798462, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 711 + }, + { + "completion_length": 10.1875, + "epoch": 0.12475906781145961, + "grad_norm": 24.58636403392442, + "kl": 0.047607421875, + "learning_rate": 8.754161555983879e-07, + "loss": 0.0191, + "reward": 1.4863896369934082, + "reward_std": 0.309207558631897, + "rewards/accuracy_reward_stage2": 0.4863896369934082, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 712 + }, + { + "completion_length": 9.984375, + "epoch": 0.12493429122130717, + "grad_norm": 23.66613349846893, + "kl": 0.0390625, + "learning_rate": 8.752409321885404e-07, + "loss": -0.0281, + "reward": 1.6329691410064697, + "reward_std": 0.22451892495155334, + "rewards/accuracy_reward_stage2": 0.6485941410064697, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 713 + }, + { + "completion_length": 8.921875, + "epoch": 0.12510951463115472, + "grad_norm": 22.08982170844661, + "kl": 0.125, + "learning_rate": 8.750657087786927e-07, + "loss": 0.0083, + "reward": 1.7729127407073975, + "reward_std": 0.22566775977611542, + "rewards/accuracy_reward_stage2": 0.7885376811027527, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 714 + }, + { + "completion_length": 10.96875, + "epoch": 0.1252847380410023, + "grad_norm": 15.29384643846092, + "kl": 0.059814453125, + "learning_rate": 8.748904853688452e-07, + "loss": -0.021, + "reward": 1.3826444149017334, + "reward_std": 0.2546432912349701, + "rewards/accuracy_reward_stage2": 0.4138944447040558, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 715 + }, + { + "completion_length": 9.25, + "epoch": 0.12545996145084984, + "grad_norm": 22.25942061435046, + "kl": 0.125, + "learning_rate": 8.747152619589977e-07, + "loss": -0.016, + "reward": 1.588760256767273, + "reward_std": 0.27730366587638855, + "rewards/accuracy_reward_stage2": 0.854385256767273, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 716 + }, + { + "completion_length": 11.9375, + "epoch": 0.12563518486069739, + "grad_norm": 14.361927992686452, + "kl": 0.0517578125, + "learning_rate": 8.745400385491501e-07, + "loss": 0.0207, + "reward": 1.789434552192688, + "reward_std": 0.07365694642066956, + "rewards/accuracy_reward_stage2": 0.789434552192688, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 717 + }, + { + "completion_length": 15.640625, + "epoch": 0.12581040827054493, + "grad_norm": 24.44647306222038, + "kl": 0.15625, + "learning_rate": 8.743648151393026e-07, + "loss": 0.0408, + "reward": 1.407584309577942, + "reward_std": 0.14255505800247192, + "rewards/accuracy_reward_stage2": 0.6732093691825867, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 718 + }, + { + "completion_length": 12.5, + "epoch": 0.1259856316803925, + "grad_norm": 23.288244008568896, + "kl": 0.091796875, + "learning_rate": 8.741895917294551e-07, + "loss": 0.0052, + "reward": 1.5710008144378662, + "reward_std": 0.24018634855747223, + "rewards/accuracy_reward_stage2": 0.5866257548332214, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 719 + }, + { + "completion_length": 29.15625, + "epoch": 0.12616085509024005, + "grad_norm": 15.350773622109923, + "kl": 0.048828125, + "learning_rate": 8.740143683196075e-07, + "loss": 0.0196, + "reward": 1.360360026359558, + "reward_std": 0.15090447664260864, + "rewards/accuracy_reward_stage2": 0.4853600561618805, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 720 + }, + { + "completion_length": 9.359375, + "epoch": 0.1263360785000876, + "grad_norm": 18.454024671060672, + "kl": 0.115234375, + "learning_rate": 8.738391449097599e-07, + "loss": 0.0461, + "reward": 1.3818800449371338, + "reward_std": 0.16841940581798553, + "rewards/accuracy_reward_stage2": 0.5068800449371338, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 721 + }, + { + "completion_length": 8.265625, + "epoch": 0.12651130190993518, + "grad_norm": 21.160670778734957, + "kl": 0.046142578125, + "learning_rate": 8.736639214999123e-07, + "loss": 0.0185, + "reward": 1.6255755424499512, + "reward_std": 0.22785469889640808, + "rewards/accuracy_reward_stage2": 0.6255755424499512, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 722 + }, + { + "completion_length": 9.59375, + "epoch": 0.12668652531978272, + "grad_norm": 22.416171315171262, + "kl": 0.09521484375, + "learning_rate": 8.734886980900648e-07, + "loss": -0.0139, + "reward": 1.3465453386306763, + "reward_std": 0.19358889758586884, + "rewards/accuracy_reward_stage2": 0.3777953088283539, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 723 + }, + { + "completion_length": 8.0625, + "epoch": 0.12686174872963027, + "grad_norm": 19.61083070842855, + "kl": 0.0242919921875, + "learning_rate": 8.733134746802173e-07, + "loss": 0.0097, + "reward": 1.5615843534469604, + "reward_std": 0.19774232804775238, + "rewards/accuracy_reward_stage2": 0.5615843534469604, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 724 + }, + { + "completion_length": 10.0625, + "epoch": 0.12703697213947784, + "grad_norm": 13.690877366216116, + "kl": 0.11279296875, + "learning_rate": 8.731382512703696e-07, + "loss": 0.0451, + "reward": 1.3139506578445435, + "reward_std": 0.13442152738571167, + "rewards/accuracy_reward_stage2": 0.5639506578445435, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 725 + }, + { + "completion_length": 8.828125, + "epoch": 0.1272121955493254, + "grad_norm": 18.62268067901579, + "kl": 0.267578125, + "learning_rate": 8.729630278605221e-07, + "loss": 0.094, + "reward": 1.4589645862579346, + "reward_std": 0.18263386189937592, + "rewards/accuracy_reward_stage2": 0.7245896458625793, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 726 + }, + { + "completion_length": 15.21875, + "epoch": 0.12738741895917294, + "grad_norm": 23.71829613698294, + "kl": 0.061279296875, + "learning_rate": 8.727878044506745e-07, + "loss": 0.0245, + "reward": 1.4625771045684814, + "reward_std": 0.17948952317237854, + "rewards/accuracy_reward_stage2": 0.5875771045684814, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 727 + }, + { + "completion_length": 8.171875, + "epoch": 0.1275626423690205, + "grad_norm": 15.271887827957038, + "kl": 0.03173828125, + "learning_rate": 8.72612581040827e-07, + "loss": -0.0669, + "reward": 1.7864583730697632, + "reward_std": 0.23897382616996765, + "rewards/accuracy_reward_stage2": 0.8177083134651184, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 728 + }, + { + "completion_length": 14.65625, + "epoch": 0.12773786577886806, + "grad_norm": 17.48789694547715, + "kl": 0.173828125, + "learning_rate": 8.724373576309795e-07, + "loss": 0.0255, + "reward": 1.3286545276641846, + "reward_std": 0.13117444515228271, + "rewards/accuracy_reward_stage2": 0.46927955746650696, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 729 + }, + { + "completion_length": 12.328125, + "epoch": 0.1279130891887156, + "grad_norm": 23.96242945197875, + "kl": 0.060546875, + "learning_rate": 8.722621342211319e-07, + "loss": -0.0112, + "reward": 1.722947120666504, + "reward_std": 0.23015879094600677, + "rewards/accuracy_reward_stage2": 0.7385720610618591, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 730 + }, + { + "completion_length": 11.234375, + "epoch": 0.12808831259856318, + "grad_norm": 14.144418959421241, + "kl": 0.06689453125, + "learning_rate": 8.720869108112844e-07, + "loss": -0.0175, + "reward": 1.603621006011963, + "reward_std": 0.16533318161964417, + "rewards/accuracy_reward_stage2": 0.6192460656166077, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 731 + }, + { + "completion_length": 12.03125, + "epoch": 0.12826353600841073, + "grad_norm": 18.61312573748659, + "kl": 0.052734375, + "learning_rate": 8.719116874014369e-07, + "loss": 0.0048, + "reward": 1.4712536334991455, + "reward_std": 0.15344056487083435, + "rewards/accuracy_reward_stage2": 0.611878514289856, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 732 + }, + { + "completion_length": 8.625, + "epoch": 0.12843875941825827, + "grad_norm": 17.441433878109546, + "kl": 0.050537109375, + "learning_rate": 8.717364639915893e-07, + "loss": 0.0202, + "reward": 1.3996421098709106, + "reward_std": 0.2853778600692749, + "rewards/accuracy_reward_stage2": 0.5246421694755554, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 733 + }, + { + "completion_length": 11.15625, + "epoch": 0.12861398282810582, + "grad_norm": 24.980574455079825, + "kl": 0.189453125, + "learning_rate": 8.715612405817416e-07, + "loss": 0.0045, + "reward": 1.6333026885986328, + "reward_std": 0.26741790771484375, + "rewards/accuracy_reward_stage2": 0.7895527482032776, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 734 + }, + { + "completion_length": 11.109375, + "epoch": 0.1287892062379534, + "grad_norm": 50.068285086941245, + "kl": 0.255859375, + "learning_rate": 8.71386017171894e-07, + "loss": 0.0624, + "reward": 1.6093500852584839, + "reward_std": 0.18894584476947784, + "rewards/accuracy_reward_stage2": 0.6249750852584839, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 735 + }, + { + "completion_length": 10.46875, + "epoch": 0.12896442964780094, + "grad_norm": 16.488258189073505, + "kl": 0.0198974609375, + "learning_rate": 8.712107937620465e-07, + "loss": -0.005, + "reward": 1.542205572128296, + "reward_std": 0.139317587018013, + "rewards/accuracy_reward_stage2": 0.5578306913375854, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 736 + }, + { + "completion_length": 11.5625, + "epoch": 0.1291396530576485, + "grad_norm": 34.105571787658626, + "kl": 0.0439453125, + "learning_rate": 8.71035570352199e-07, + "loss": -0.0257, + "reward": 1.604642391204834, + "reward_std": 0.18298792839050293, + "rewards/accuracy_reward_stage2": 0.7452673316001892, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 737 + }, + { + "completion_length": 11.609375, + "epoch": 0.12931487646749606, + "grad_norm": 23.348912361605613, + "kl": 0.0751953125, + "learning_rate": 8.708603469423514e-07, + "loss": 0.0301, + "reward": 1.808213710784912, + "reward_std": 0.1791677325963974, + "rewards/accuracy_reward_stage2": 0.8082137107849121, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 738 + }, + { + "completion_length": 11.59375, + "epoch": 0.1294900998773436, + "grad_norm": 21.941403584613536, + "kl": 0.07470703125, + "learning_rate": 8.706851235325039e-07, + "loss": 0.0299, + "reward": 1.4388988018035889, + "reward_std": 0.20526067912578583, + "rewards/accuracy_reward_stage2": 0.43889886140823364, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 739 + }, + { + "completion_length": 13.375, + "epoch": 0.12966532328719116, + "grad_norm": 21.150108215201115, + "kl": 0.06298828125, + "learning_rate": 8.705099001226564e-07, + "loss": -0.019, + "reward": 1.7087092399597168, + "reward_std": 0.2277108132839203, + "rewards/accuracy_reward_stage2": 0.7243342399597168, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 740 + }, + { + "completion_length": 8.78125, + "epoch": 0.12984054669703873, + "grad_norm": 16.65691581922075, + "kl": 0.0224609375, + "learning_rate": 8.703346767128088e-07, + "loss": -0.0352, + "reward": 1.3385417461395264, + "reward_std": 0.13152071833610535, + "rewards/accuracy_reward_stage2": 0.3541666567325592, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 741 + }, + { + "completion_length": 9.9375, + "epoch": 0.13001577010688628, + "grad_norm": 17.551427941488406, + "kl": 0.09033203125, + "learning_rate": 8.701594533029613e-07, + "loss": 0.0361, + "reward": 1.6374205350875854, + "reward_std": 0.1315646469593048, + "rewards/accuracy_reward_stage2": 0.762420654296875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 742 + }, + { + "completion_length": 11.859375, + "epoch": 0.13019099351673383, + "grad_norm": 19.7964491837443, + "kl": 0.0147705078125, + "learning_rate": 8.699842298931137e-07, + "loss": 0.0059, + "reward": 1.7438607215881348, + "reward_std": 0.0982588678598404, + "rewards/accuracy_reward_stage2": 0.7438607811927795, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 743 + }, + { + "completion_length": 11.375, + "epoch": 0.1303662169265814, + "grad_norm": 15.932552856525444, + "kl": 0.054931640625, + "learning_rate": 8.698090064832662e-07, + "loss": 0.022, + "reward": 1.5394132137298584, + "reward_std": 0.10703323036432266, + "rewards/accuracy_reward_stage2": 0.5394132137298584, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 744 + }, + { + "completion_length": 8.40625, + "epoch": 0.13054144033642895, + "grad_norm": 18.34859237019442, + "kl": 0.023193359375, + "learning_rate": 8.696337830734186e-07, + "loss": 0.0093, + "reward": 1.5473082065582275, + "reward_std": 0.2439563274383545, + "rewards/accuracy_reward_stage2": 0.6723082065582275, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 745 + }, + { + "completion_length": 7.46875, + "epoch": 0.1307166637462765, + "grad_norm": 24.168096070523397, + "kl": 0.13671875, + "learning_rate": 8.69458559663571e-07, + "loss": 0.0316, + "reward": 1.4857832193374634, + "reward_std": 0.2854121923446655, + "rewards/accuracy_reward_stage2": 0.5014082193374634, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 746 + }, + { + "completion_length": 42.578125, + "epoch": 0.13089188715612407, + "grad_norm": 23.125506213020195, + "kl": 0.1162109375, + "learning_rate": 8.692833362537234e-07, + "loss": 0.0466, + "reward": 1.48673677444458, + "reward_std": 0.24000920355319977, + "rewards/accuracy_reward_stage2": 0.4867367744445801, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 747 + }, + { + "completion_length": 7.421875, + "epoch": 0.13106711056597162, + "grad_norm": 18.38991310797759, + "kl": 0.05078125, + "learning_rate": 8.691081128438759e-07, + "loss": 0.0203, + "reward": 1.53125, + "reward_std": 0.22461533546447754, + "rewards/accuracy_reward_stage2": 0.65625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 748 + }, + { + "completion_length": 15.359375, + "epoch": 0.13124233397581916, + "grad_norm": 30.97799225674079, + "kl": 0.1630859375, + "learning_rate": 8.689328894340283e-07, + "loss": 0.0653, + "reward": 1.4413138628005981, + "reward_std": 0.22766022384166718, + "rewards/accuracy_reward_stage2": 0.5663139224052429, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 749 + }, + { + "completion_length": 9.9375, + "epoch": 0.13141755738566674, + "grad_norm": 15.125011554316435, + "kl": 0.06201171875, + "learning_rate": 8.687576660241808e-07, + "loss": 0.0248, + "reward": 1.7050046920776367, + "reward_std": 0.11424589902162552, + "rewards/accuracy_reward_stage2": 0.7050046324729919, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 750 + }, + { + "completion_length": 7.0625, + "epoch": 0.13159278079551429, + "grad_norm": 20.962470991988887, + "kl": 0.059814453125, + "learning_rate": 8.685824426143332e-07, + "loss": 0.0239, + "reward": 1.678983211517334, + "reward_std": 0.19982093572616577, + "rewards/accuracy_reward_stage2": 0.678983211517334, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 751 + }, + { + "completion_length": 14.6875, + "epoch": 0.13176800420536183, + "grad_norm": 385.8362173270209, + "kl": 1.5625, + "learning_rate": 8.684072192044857e-07, + "loss": 0.6266, + "reward": 1.203751564025879, + "reward_std": 0.18451911211013794, + "rewards/accuracy_reward_stage2": 0.4537515342235565, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 752 + }, + { + "completion_length": 7.046875, + "epoch": 0.13194322761520938, + "grad_norm": 10.755434994237342, + "kl": 0.005523681640625, + "learning_rate": 8.682319957946382e-07, + "loss": -0.042, + "reward": 1.6046037673950195, + "reward_std": 0.0576893612742424, + "rewards/accuracy_reward_stage2": 0.6202287077903748, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 753 + }, + { + "completion_length": 12.109375, + "epoch": 0.13211845102505695, + "grad_norm": 20.873092091818556, + "kl": 0.1015625, + "learning_rate": 8.680567723847905e-07, + "loss": -0.0454, + "reward": 1.3576622009277344, + "reward_std": 0.25579172372817993, + "rewards/accuracy_reward_stage2": 0.3889121115207672, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 754 + }, + { + "completion_length": 10.21875, + "epoch": 0.1322936744349045, + "grad_norm": 23.23743047363629, + "kl": 0.08740234375, + "learning_rate": 8.67881548974943e-07, + "loss": -0.0093, + "reward": 1.6699610948562622, + "reward_std": 0.2566087543964386, + "rewards/accuracy_reward_stage2": 0.6855860948562622, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 755 + }, + { + "completion_length": 7.328125, + "epoch": 0.13246889784475205, + "grad_norm": 20.70546052020614, + "kl": 0.140625, + "learning_rate": 8.677063255650955e-07, + "loss": 0.0561, + "reward": 1.5664639472961426, + "reward_std": 0.17884564399719238, + "rewards/accuracy_reward_stage2": 0.6914640069007874, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 756 + }, + { + "completion_length": 11.5, + "epoch": 0.13264412125459962, + "grad_norm": 20.88423913244621, + "kl": 0.244140625, + "learning_rate": 8.675311021552479e-07, + "loss": 0.1123, + "reward": 1.1417410373687744, + "reward_std": 0.1422545462846756, + "rewards/accuracy_reward_stage2": 0.3917410969734192, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 757 + }, + { + "completion_length": 9.015625, + "epoch": 0.13281934466444717, + "grad_norm": 18.603362315080933, + "kl": 0.07763671875, + "learning_rate": 8.673558787454004e-07, + "loss": -0.0318, + "reward": 1.6441401243209839, + "reward_std": 0.2794819474220276, + "rewards/accuracy_reward_stage2": 0.6753901243209839, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 758 + }, + { + "completion_length": 10.109375, + "epoch": 0.13299456807429472, + "grad_norm": 20.419572301243445, + "kl": 0.048828125, + "learning_rate": 8.671806553355527e-07, + "loss": 0.0195, + "reward": 1.6706733703613281, + "reward_std": 0.17810383439064026, + "rewards/accuracy_reward_stage2": 0.6706732511520386, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 759 + }, + { + "completion_length": 11.234375, + "epoch": 0.1331697914841423, + "grad_norm": 25.885192159188893, + "kl": 0.04833984375, + "learning_rate": 8.670054319257052e-07, + "loss": -0.0248, + "reward": 1.4116337299346924, + "reward_std": 0.2717921733856201, + "rewards/accuracy_reward_stage2": 0.5522586107254028, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 760 + }, + { + "completion_length": 11.96875, + "epoch": 0.13334501489398984, + "grad_norm": 18.97680260993474, + "kl": 0.068359375, + "learning_rate": 8.668302085158577e-07, + "loss": -0.0168, + "reward": 1.7507286071777344, + "reward_std": 0.17205798625946045, + "rewards/accuracy_reward_stage2": 0.7663537263870239, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 761 + }, + { + "completion_length": 9.796875, + "epoch": 0.13352023830383739, + "grad_norm": 16.862781802573416, + "kl": 0.04833984375, + "learning_rate": 8.666549851060101e-07, + "loss": -0.014, + "reward": 1.2919607162475586, + "reward_std": 0.22537767887115479, + "rewards/accuracy_reward_stage2": 0.3075857162475586, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 762 + }, + { + "completion_length": 9.0, + "epoch": 0.13369546171368496, + "grad_norm": 13.026380521166306, + "kl": 0.0380859375, + "learning_rate": 8.664797616961626e-07, + "loss": -0.0596, + "reward": 1.5166369676589966, + "reward_std": 0.19904598593711853, + "rewards/accuracy_reward_stage2": 0.6728869676589966, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 763 + }, + { + "completion_length": 16.875, + "epoch": 0.1338706851235325, + "grad_norm": 20.06725931064897, + "kl": 0.09423828125, + "learning_rate": 8.66304538286315e-07, + "loss": 0.0377, + "reward": 1.4519935846328735, + "reward_std": 0.21910575032234192, + "rewards/accuracy_reward_stage2": 0.5769935846328735, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 764 + }, + { + "completion_length": 12.96875, + "epoch": 0.13404590853338005, + "grad_norm": 21.98710630877217, + "kl": 0.0361328125, + "learning_rate": 8.661293148764674e-07, + "loss": 0.0144, + "reward": 1.4242117404937744, + "reward_std": 0.29334086179733276, + "rewards/accuracy_reward_stage2": 0.6742118000984192, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 765 + }, + { + "completion_length": 17.03125, + "epoch": 0.13422113194322763, + "grad_norm": 20.425342566894855, + "kl": 0.12451171875, + "learning_rate": 8.659540914666199e-07, + "loss": 0.0499, + "reward": 1.2799370288848877, + "reward_std": 0.18060024082660675, + "rewards/accuracy_reward_stage2": 0.4049370586872101, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 766 + }, + { + "completion_length": 10.234375, + "epoch": 0.13439635535307518, + "grad_norm": 18.339545297817114, + "kl": 0.0380859375, + "learning_rate": 8.657788680567723e-07, + "loss": 0.0152, + "reward": 1.4003806114196777, + "reward_std": 0.18729178607463837, + "rewards/accuracy_reward_stage2": 0.5253806710243225, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 767 + }, + { + "completion_length": 11.953125, + "epoch": 0.13457157876292272, + "grad_norm": 25.01143507989166, + "kl": 0.0458984375, + "learning_rate": 8.656036446469248e-07, + "loss": 0.0184, + "reward": 1.4315991401672363, + "reward_std": 0.2495567500591278, + "rewards/accuracy_reward_stage2": 0.5565991401672363, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 768 + }, + { + "completion_length": 8.5, + "epoch": 0.13474680217277027, + "grad_norm": 22.864818091288953, + "kl": 0.05224609375, + "learning_rate": 8.654284212370773e-07, + "loss": 0.0209, + "reward": 1.5170884132385254, + "reward_std": 0.2796719968318939, + "rewards/accuracy_reward_stage2": 0.5170884132385254, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 769 + }, + { + "completion_length": 8.375, + "epoch": 0.13492202558261784, + "grad_norm": 19.1641610611154, + "kl": 0.08349609375, + "learning_rate": 8.652531978272297e-07, + "loss": 0.0334, + "reward": 1.553787350654602, + "reward_std": 0.22851644456386566, + "rewards/accuracy_reward_stage2": 0.553787350654602, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 770 + }, + { + "completion_length": 10.21875, + "epoch": 0.1350972489924654, + "grad_norm": 19.934467521338437, + "kl": 0.0537109375, + "learning_rate": 8.650779744173822e-07, + "loss": -0.0216, + "reward": 1.4445722103118896, + "reward_std": 0.2468641996383667, + "rewards/accuracy_reward_stage2": 0.47582218050956726, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 771 + }, + { + "completion_length": 10.9375, + "epoch": 0.13527247240231294, + "grad_norm": 13.701970829268843, + "kl": 0.06982421875, + "learning_rate": 8.649027510075346e-07, + "loss": 0.0279, + "reward": 1.4476916790008545, + "reward_std": 0.12115681171417236, + "rewards/accuracy_reward_stage2": 0.5726916790008545, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 772 + }, + { + "completion_length": 10.109375, + "epoch": 0.1354476958121605, + "grad_norm": 18.066531116861547, + "kl": 0.29296875, + "learning_rate": 8.64727527597687e-07, + "loss": 0.0854, + "reward": 1.426032543182373, + "reward_std": 0.21603938937187195, + "rewards/accuracy_reward_stage2": 0.566657543182373, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 773 + }, + { + "completion_length": 9.09375, + "epoch": 0.13562291922200806, + "grad_norm": 16.71484519661642, + "kl": 0.1220703125, + "learning_rate": 8.645523041878394e-07, + "loss": -0.0187, + "reward": 1.326295256614685, + "reward_std": 0.24095875024795532, + "rewards/accuracy_reward_stage2": 0.48254525661468506, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 774 + }, + { + "completion_length": 7.9375, + "epoch": 0.1357981426318556, + "grad_norm": 19.917155663833235, + "kl": 0.09326171875, + "learning_rate": 8.643770807779918e-07, + "loss": -0.0468, + "reward": 1.4075981378555298, + "reward_std": 0.23457500338554382, + "rewards/accuracy_reward_stage2": 0.563848078250885, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 775 + }, + { + "completion_length": 5.484375, + "epoch": 0.13597336604170318, + "grad_norm": 21.14923073069886, + "kl": 0.1435546875, + "learning_rate": 8.642018573681443e-07, + "loss": 0.0134, + "reward": 1.250086784362793, + "reward_std": 0.19084270298480988, + "rewards/accuracy_reward_stage2": 0.26571181416511536, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 776 + }, + { + "completion_length": 10.0, + "epoch": 0.13614858945155073, + "grad_norm": 24.381992165789516, + "kl": 0.07763671875, + "learning_rate": 8.640266339582968e-07, + "loss": -0.0125, + "reward": 1.502030611038208, + "reward_std": 0.2354775220155716, + "rewards/accuracy_reward_stage2": 0.517655611038208, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 777 + }, + { + "completion_length": 12.875, + "epoch": 0.13632381286139827, + "grad_norm": 19.508155852089544, + "kl": 0.0478515625, + "learning_rate": 8.638514105484492e-07, + "loss": -0.0251, + "reward": 1.6137158870697021, + "reward_std": 0.21956676244735718, + "rewards/accuracy_reward_stage2": 0.6293408870697021, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 778 + }, + { + "completion_length": 13.265625, + "epoch": 0.13649903627124585, + "grad_norm": 16.56376238689246, + "kl": 0.06640625, + "learning_rate": 8.636761871386017e-07, + "loss": 0.0266, + "reward": 0.9685095548629761, + "reward_std": 0.17524616420269012, + "rewards/accuracy_reward_stage2": 0.2185094952583313, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 779 + }, + { + "completion_length": 7.4375, + "epoch": 0.1366742596810934, + "grad_norm": 16.918923047432216, + "kl": 0.0693359375, + "learning_rate": 8.635009637287542e-07, + "loss": 0.0278, + "reward": 1.5394725799560547, + "reward_std": 0.1202455535531044, + "rewards/accuracy_reward_stage2": 0.6644724607467651, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 780 + }, + { + "completion_length": 8.359375, + "epoch": 0.13684948309094094, + "grad_norm": 25.712473621519223, + "kl": 0.0223388671875, + "learning_rate": 8.633257403189066e-07, + "loss": 0.0089, + "reward": 1.6351406574249268, + "reward_std": 0.1843852698802948, + "rewards/accuracy_reward_stage2": 0.6351406574249268, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 781 + }, + { + "completion_length": 9.328125, + "epoch": 0.13702470650078852, + "grad_norm": 15.671890042035438, + "kl": 0.021484375, + "learning_rate": 8.631505169090591e-07, + "loss": 0.0023, + "reward": 1.4858630895614624, + "reward_std": 0.12837354838848114, + "rewards/accuracy_reward_stage2": 0.6108630895614624, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 782 + }, + { + "completion_length": 10.984375, + "epoch": 0.13719992991063606, + "grad_norm": 20.30097845606878, + "kl": 0.12255859375, + "learning_rate": 8.629752934992115e-07, + "loss": 0.0126, + "reward": 1.5040788650512695, + "reward_std": 0.2858182489871979, + "rewards/accuracy_reward_stage2": 0.5197038650512695, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 783 + }, + { + "completion_length": 9.203125, + "epoch": 0.1373751533204836, + "grad_norm": 26.037028675184818, + "kl": 0.10009765625, + "learning_rate": 8.62800070089364e-07, + "loss": -0.0872, + "reward": 1.5257666110992432, + "reward_std": 0.29896122217178345, + "rewards/accuracy_reward_stage2": 0.5726416110992432, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 784 + }, + { + "completion_length": 11.796875, + "epoch": 0.13755037673033116, + "grad_norm": 22.34753247816572, + "kl": 0.061279296875, + "learning_rate": 8.626248466795163e-07, + "loss": 0.0246, + "reward": 1.5781292915344238, + "reward_std": 0.14735251665115356, + "rewards/accuracy_reward_stage2": 0.5781292915344238, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 785 + }, + { + "completion_length": 9.640625, + "epoch": 0.13772560014017873, + "grad_norm": 21.799564810232653, + "kl": 0.01904296875, + "learning_rate": 8.624496232696687e-07, + "loss": 0.0076, + "reward": 1.606956958770752, + "reward_std": 0.2769896984100342, + "rewards/accuracy_reward_stage2": 0.6069568395614624, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 786 + }, + { + "completion_length": 8.125, + "epoch": 0.13790082355002628, + "grad_norm": 15.977768888328422, + "kl": 0.05078125, + "learning_rate": 8.622743998598212e-07, + "loss": 0.0202, + "reward": 1.5291006565093994, + "reward_std": 0.22113242745399475, + "rewards/accuracy_reward_stage2": 0.5291005373001099, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 787 + }, + { + "completion_length": 9.625, + "epoch": 0.13807604695987383, + "grad_norm": 24.547348319862273, + "kl": 0.10791015625, + "learning_rate": 8.620991764499737e-07, + "loss": 0.0431, + "reward": 1.7083591222763062, + "reward_std": 0.20269346237182617, + "rewards/accuracy_reward_stage2": 0.7083591222763062, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 788 + }, + { + "completion_length": 7.203125, + "epoch": 0.1382512703697214, + "grad_norm": 9.08047735946729, + "kl": 0.009521484375, + "learning_rate": 8.619239530401261e-07, + "loss": 0.0038, + "reward": 1.3945484161376953, + "reward_std": 0.012436339631676674, + "rewards/accuracy_reward_stage2": 0.3945484161376953, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 789 + }, + { + "completion_length": 8.859375, + "epoch": 0.13842649377956895, + "grad_norm": 22.58653722630102, + "kl": 0.0859375, + "learning_rate": 8.617487296302786e-07, + "loss": 0.001, + "reward": 1.6167235374450684, + "reward_std": 0.26534900069236755, + "rewards/accuracy_reward_stage2": 0.6323485374450684, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 790 + }, + { + "completion_length": 7.90625, + "epoch": 0.1386017171894165, + "grad_norm": 25.853314882203968, + "kl": 0.1376953125, + "learning_rate": 8.61573506220431e-07, + "loss": -0.032, + "reward": 1.5990020036697388, + "reward_std": 0.3643280267715454, + "rewards/accuracy_reward_stage2": 0.6458768844604492, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 791 + }, + { + "completion_length": 17.46875, + "epoch": 0.13877694059926407, + "grad_norm": 22.407136209035535, + "kl": 0.09716796875, + "learning_rate": 8.613982828105835e-07, + "loss": -0.0052, + "reward": 1.6806354522705078, + "reward_std": 0.2374819815158844, + "rewards/accuracy_reward_stage2": 0.6962604522705078, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 792 + }, + { + "completion_length": 7.734375, + "epoch": 0.13895216400911162, + "grad_norm": 22.541418888160006, + "kl": 0.1689453125, + "learning_rate": 8.61223059400736e-07, + "loss": 0.0677, + "reward": 1.4915530681610107, + "reward_std": 0.15434393286705017, + "rewards/accuracy_reward_stage2": 0.6165530681610107, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 793 + }, + { + "completion_length": 9.453125, + "epoch": 0.13912738741895916, + "grad_norm": 20.819147207825274, + "kl": 0.2265625, + "learning_rate": 8.610478359908883e-07, + "loss": 0.011, + "reward": 1.2268931865692139, + "reward_std": 0.28804537653923035, + "rewards/accuracy_reward_stage2": 0.38314324617385864, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 794 + }, + { + "completion_length": 8.765625, + "epoch": 0.13930261082880674, + "grad_norm": 18.854480783612633, + "kl": 0.0595703125, + "learning_rate": 8.608726125810408e-07, + "loss": -0.0646, + "reward": 1.582197904586792, + "reward_std": 0.2034272849559784, + "rewards/accuracy_reward_stage2": 0.613447904586792, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 795 + }, + { + "completion_length": 12.375, + "epoch": 0.13947783423865429, + "grad_norm": 19.002903839692486, + "kl": 0.078125, + "learning_rate": 8.606973891711933e-07, + "loss": 0.0312, + "reward": 1.512641429901123, + "reward_std": 0.20425836741924286, + "rewards/accuracy_reward_stage2": 0.5126413702964783, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 796 + }, + { + "completion_length": 9.9375, + "epoch": 0.13965305764850183, + "grad_norm": 24.877281554306364, + "kl": 0.240234375, + "learning_rate": 8.605221657613457e-07, + "loss": 0.0961, + "reward": 1.546425700187683, + "reward_std": 0.27219367027282715, + "rewards/accuracy_reward_stage2": 0.6714255809783936, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 797 + }, + { + "completion_length": 8.984375, + "epoch": 0.1398282810583494, + "grad_norm": 23.992827145250047, + "kl": 0.07958984375, + "learning_rate": 8.603469423514981e-07, + "loss": -0.0549, + "reward": 1.747768521308899, + "reward_std": 0.25258371233940125, + "rewards/accuracy_reward_stage2": 0.7790185213088989, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 798 + }, + { + "completion_length": 5.484375, + "epoch": 0.14000350446819695, + "grad_norm": 21.729766925610043, + "kl": 0.06689453125, + "learning_rate": 8.601717189416505e-07, + "loss": 0.0267, + "reward": 1.8072917461395264, + "reward_std": 0.1921348124742508, + "rewards/accuracy_reward_stage2": 0.8072916865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 799 + }, + { + "completion_length": 9.21875, + "epoch": 0.1401787278780445, + "grad_norm": 21.856434941735007, + "kl": 0.047607421875, + "learning_rate": 8.59996495531803e-07, + "loss": 0.019, + "reward": 1.5184874534606934, + "reward_std": 0.10601860284805298, + "rewards/accuracy_reward_stage2": 0.5184873938560486, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 800 + }, + { + "completion_length": 11.5, + "epoch": 0.14035395128789208, + "grad_norm": 24.821815379910944, + "kl": 0.0869140625, + "learning_rate": 8.598212721219555e-07, + "loss": -0.0094, + "reward": 1.6020259857177734, + "reward_std": 0.25857317447662354, + "rewards/accuracy_reward_stage2": 0.7426510453224182, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 801 + }, + { + "completion_length": 10.890625, + "epoch": 0.14052917469773962, + "grad_norm": 16.76642314310539, + "kl": 0.041748046875, + "learning_rate": 8.596460487121079e-07, + "loss": 0.0167, + "reward": 1.5585997104644775, + "reward_std": 0.10192655771970749, + "rewards/accuracy_reward_stage2": 0.5585997104644775, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 802 + }, + { + "completion_length": 13.1875, + "epoch": 0.14070439810758717, + "grad_norm": 41.3087590593791, + "kl": 0.2392578125, + "learning_rate": 8.594708253022604e-07, + "loss": 0.0622, + "reward": 1.2422547340393066, + "reward_std": 0.24303309619426727, + "rewards/accuracy_reward_stage2": 0.3828798234462738, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 803 + }, + { + "completion_length": 10.078125, + "epoch": 0.14087962151743472, + "grad_norm": 21.448868861727423, + "kl": 0.1103515625, + "learning_rate": 8.592956018924127e-07, + "loss": -0.0151, + "reward": 1.6200649738311768, + "reward_std": 0.2951427698135376, + "rewards/accuracy_reward_stage2": 0.6513150334358215, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 804 + }, + { + "completion_length": 11.8125, + "epoch": 0.1410548449272823, + "grad_norm": 20.462640582577773, + "kl": 0.041259765625, + "learning_rate": 8.591203784825652e-07, + "loss": 0.0166, + "reward": 1.4762279987335205, + "reward_std": 0.20103763043880463, + "rewards/accuracy_reward_stage2": 0.4762280285358429, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 805 + }, + { + "completion_length": 9.0, + "epoch": 0.14123006833712984, + "grad_norm": 17.430891887761103, + "kl": 0.2119140625, + "learning_rate": 8.589451550727177e-07, + "loss": 0.0849, + "reward": 1.347902774810791, + "reward_std": 0.15564867854118347, + "rewards/accuracy_reward_stage2": 0.5979026556015015, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 806 + }, + { + "completion_length": 9.484375, + "epoch": 0.14140529174697739, + "grad_norm": 21.318720238994356, + "kl": 0.12451171875, + "learning_rate": 8.587699316628701e-07, + "loss": -0.0666, + "reward": 1.6182410717010498, + "reward_std": 0.2888680100440979, + "rewards/accuracy_reward_stage2": 0.665116012096405, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 807 + }, + { + "completion_length": 9.640625, + "epoch": 0.14158051515682496, + "grad_norm": 1125.174475735105, + "kl": 0.703125, + "learning_rate": 8.585947082530226e-07, + "loss": 0.2047, + "reward": 1.3404356241226196, + "reward_std": 0.15762722492218018, + "rewards/accuracy_reward_stage2": 0.49668562412261963, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 808 + }, + { + "completion_length": 13.03125, + "epoch": 0.1417557385666725, + "grad_norm": 24.047162978568288, + "kl": 0.1376953125, + "learning_rate": 8.584194848431751e-07, + "loss": 0.0551, + "reward": 1.5736751556396484, + "reward_std": 0.2321637123823166, + "rewards/accuracy_reward_stage2": 0.5736752152442932, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 809 + }, + { + "completion_length": 14.578125, + "epoch": 0.14193096197652005, + "grad_norm": 16.987052420807263, + "kl": 0.0308837890625, + "learning_rate": 8.582442614333274e-07, + "loss": -0.0231, + "reward": 1.4605519771575928, + "reward_std": 0.2577477991580963, + "rewards/accuracy_reward_stage2": 0.6011769771575928, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 810 + }, + { + "completion_length": 8.625, + "epoch": 0.14210618538636763, + "grad_norm": 23.109130665223027, + "kl": 0.2392578125, + "learning_rate": 8.580690380234799e-07, + "loss": 0.0646, + "reward": 1.4605047702789307, + "reward_std": 0.27034828066825867, + "rewards/accuracy_reward_stage2": 0.7261297106742859, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 811 + }, + { + "completion_length": 10.140625, + "epoch": 0.14228140879621518, + "grad_norm": 16.502025071920517, + "kl": 0.11474609375, + "learning_rate": 8.578938146136323e-07, + "loss": -0.0425, + "reward": 1.5705902576446533, + "reward_std": 0.1804032325744629, + "rewards/accuracy_reward_stage2": 0.6018401980400085, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 812 + }, + { + "completion_length": 7.875, + "epoch": 0.14245663220606272, + "grad_norm": 7.228563122762854, + "kl": 0.00823974609375, + "learning_rate": 8.577185912037847e-07, + "loss": -0.0409, + "reward": 1.75, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward_stage2": 0.765625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 813 + }, + { + "completion_length": 12.6875, + "epoch": 0.1426318556159103, + "grad_norm": 18.22673671823769, + "kl": 0.10693359375, + "learning_rate": 8.575433677939372e-07, + "loss": -0.0646, + "reward": 1.5301176309585571, + "reward_std": 0.2613670229911804, + "rewards/accuracy_reward_stage2": 0.5769926309585571, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 814 + }, + { + "completion_length": 13.328125, + "epoch": 0.14280707902575784, + "grad_norm": 17.23263122771429, + "kl": 0.037353515625, + "learning_rate": 8.573681443840896e-07, + "loss": 0.0149, + "reward": 1.6051459312438965, + "reward_std": 0.15718679130077362, + "rewards/accuracy_reward_stage2": 0.605146050453186, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 815 + }, + { + "completion_length": 6.40625, + "epoch": 0.1429823024356054, + "grad_norm": 16.364062402792296, + "kl": 0.08642578125, + "learning_rate": 8.571929209742421e-07, + "loss": -0.0096, + "reward": 1.6421375274658203, + "reward_std": 0.16563481092453003, + "rewards/accuracy_reward_stage2": 0.6577625274658203, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 816 + }, + { + "completion_length": 8.078125, + "epoch": 0.14315752584545297, + "grad_norm": 21.1458214999794, + "kl": 0.12109375, + "learning_rate": 8.570176975643946e-07, + "loss": 0.0485, + "reward": 1.6554884910583496, + "reward_std": 0.18612952530384064, + "rewards/accuracy_reward_stage2": 0.6554884910583496, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 817 + }, + { + "completion_length": 14.75, + "epoch": 0.1433327492553005, + "grad_norm": 26.119798669726237, + "kl": 0.01556396484375, + "learning_rate": 8.56842474154547e-07, + "loss": 0.0062, + "reward": 1.6673200130462646, + "reward_std": 0.29138341546058655, + "rewards/accuracy_reward_stage2": 0.6673198938369751, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 818 + }, + { + "completion_length": 12.75, + "epoch": 0.14350797266514806, + "grad_norm": 17.38674288999391, + "kl": 0.0150146484375, + "learning_rate": 8.566672507446995e-07, + "loss": 0.006, + "reward": 1.7542085647583008, + "reward_std": 0.29505500197410583, + "rewards/accuracy_reward_stage2": 0.7542085647583008, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 819 + }, + { + "completion_length": 8.4375, + "epoch": 0.1436831960749956, + "grad_norm": 21.875096570193925, + "kl": 0.038330078125, + "learning_rate": 8.564920273348519e-07, + "loss": 0.0153, + "reward": 1.4888389110565186, + "reward_std": 0.27264735102653503, + "rewards/accuracy_reward_stage2": 0.6138389706611633, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 820 + }, + { + "completion_length": 10.6875, + "epoch": 0.14385841948484318, + "grad_norm": 10.463498557034104, + "kl": 0.03759765625, + "learning_rate": 8.563168039250044e-07, + "loss": 0.015, + "reward": 1.4326601028442383, + "reward_std": 0.030810590833425522, + "rewards/accuracy_reward_stage2": 0.4326601028442383, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 821 + }, + { + "completion_length": 10.765625, + "epoch": 0.14403364289469073, + "grad_norm": 20.18488498768583, + "kl": 0.1806640625, + "learning_rate": 8.561415805151569e-07, + "loss": 0.0725, + "reward": 1.5884678363800049, + "reward_std": 0.2005475014448166, + "rewards/accuracy_reward_stage2": 0.7134678363800049, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 822 + }, + { + "completion_length": 10.9375, + "epoch": 0.14420886630453827, + "grad_norm": 16.441621674165216, + "kl": 0.02734375, + "learning_rate": 8.559663571053091e-07, + "loss": 0.011, + "reward": 1.583137035369873, + "reward_std": 0.22141794860363007, + "rewards/accuracy_reward_stage2": 0.583137035369873, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 823 + }, + { + "completion_length": 11.796875, + "epoch": 0.14438408971438585, + "grad_norm": 17.290747179596657, + "kl": 0.07568359375, + "learning_rate": 8.557911336954616e-07, + "loss": -0.0138, + "reward": 1.5539031028747559, + "reward_std": 0.15610679984092712, + "rewards/accuracy_reward_stage2": 0.6945281624794006, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 824 + }, + { + "completion_length": 9.421875, + "epoch": 0.1445593131242334, + "grad_norm": 13.948054655485015, + "kl": 0.0281982421875, + "learning_rate": 8.556159102856141e-07, + "loss": -0.0329, + "reward": 1.8782212734222412, + "reward_std": 0.12385688722133636, + "rewards/accuracy_reward_stage2": 0.8938462734222412, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 825 + }, + { + "completion_length": 13.625, + "epoch": 0.14473453653408094, + "grad_norm": 16.201832903904826, + "kl": 0.1796875, + "learning_rate": 8.554406868757665e-07, + "loss": 0.0718, + "reward": 1.442164421081543, + "reward_std": 0.09635643661022186, + "rewards/accuracy_reward_stage2": 0.5671643614768982, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 826 + }, + { + "completion_length": 8.296875, + "epoch": 0.14490975994392852, + "grad_norm": 24.460771275572338, + "kl": 0.05029296875, + "learning_rate": 8.55265463465919e-07, + "loss": 0.0201, + "reward": 1.6931114196777344, + "reward_std": 0.35366058349609375, + "rewards/accuracy_reward_stage2": 0.6931114196777344, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 827 + }, + { + "completion_length": 10.234375, + "epoch": 0.14508498335377606, + "grad_norm": 24.600109281918645, + "kl": 0.1005859375, + "learning_rate": 8.550902400560714e-07, + "loss": -0.0306, + "reward": 1.4568983316421509, + "reward_std": 0.3055163323879242, + "rewards/accuracy_reward_stage2": 0.4881483018398285, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 828 + }, + { + "completion_length": 14.203125, + "epoch": 0.1452602067636236, + "grad_norm": 20.48543817048711, + "kl": 0.049560546875, + "learning_rate": 8.549150166462239e-07, + "loss": 0.0198, + "reward": 1.5052090883255005, + "reward_std": 0.1374053657054901, + "rewards/accuracy_reward_stage2": 0.5052090287208557, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 829 + }, + { + "completion_length": 8.015625, + "epoch": 0.1454354301734712, + "grad_norm": 18.253365710861136, + "kl": 0.12060546875, + "learning_rate": 8.547397932363764e-07, + "loss": 0.0483, + "reward": 1.5600733757019043, + "reward_std": 0.14701932668685913, + "rewards/accuracy_reward_stage2": 0.5600734949111938, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 830 + }, + { + "completion_length": 10.484375, + "epoch": 0.14561065358331873, + "grad_norm": 15.571074165782326, + "kl": 0.0458984375, + "learning_rate": 8.545645698265288e-07, + "loss": 0.012, + "reward": 1.5902339220046997, + "reward_std": 0.07367925345897675, + "rewards/accuracy_reward_stage2": 0.7152339220046997, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 831 + }, + { + "completion_length": 14.140625, + "epoch": 0.14578587699316628, + "grad_norm": 51.077344535450706, + "kl": 0.1884765625, + "learning_rate": 8.543893464166813e-07, + "loss": 0.0313, + "reward": 1.1851284503936768, + "reward_std": 0.21774765849113464, + "rewards/accuracy_reward_stage2": 0.3257533311843872, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 832 + }, + { + "completion_length": 10.328125, + "epoch": 0.14596110040301385, + "grad_norm": 24.55740055745479, + "kl": 0.09814453125, + "learning_rate": 8.542141230068338e-07, + "loss": 0.0062, + "reward": 1.5574613809585571, + "reward_std": 0.357519268989563, + "rewards/accuracy_reward_stage2": 0.5730863213539124, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 833 + }, + { + "completion_length": 16.515625, + "epoch": 0.1461363238128614, + "grad_norm": 15.594915814044413, + "kl": 0.0546875, + "learning_rate": 8.540388995969861e-07, + "loss": -0.0201, + "reward": 1.5986196994781494, + "reward_std": 0.13833385705947876, + "rewards/accuracy_reward_stage2": 0.6142447590827942, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 834 + }, + { + "completion_length": 8.84375, + "epoch": 0.14631154722270895, + "grad_norm": 14.683705793659195, + "kl": 0.052734375, + "learning_rate": 8.538636761871386e-07, + "loss": 0.0211, + "reward": 1.8413678407669067, + "reward_std": 0.10930629074573517, + "rewards/accuracy_reward_stage2": 0.8413679003715515, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 835 + }, + { + "completion_length": 14.640625, + "epoch": 0.14648677063255652, + "grad_norm": 16.716252647384675, + "kl": 0.044921875, + "learning_rate": 8.536884527772909e-07, + "loss": -0.0704, + "reward": 1.5871949195861816, + "reward_std": 0.18210293352603912, + "rewards/accuracy_reward_stage2": 0.6184448599815369, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 836 + }, + { + "completion_length": 8.703125, + "epoch": 0.14666199404240407, + "grad_norm": 14.968499708337697, + "kl": 0.0625, + "learning_rate": 8.535132293674434e-07, + "loss": 0.025, + "reward": 1.6192355155944824, + "reward_std": 0.13152600824832916, + "rewards/accuracy_reward_stage2": 0.619235634803772, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 837 + }, + { + "completion_length": 14.25, + "epoch": 0.14683721745225162, + "grad_norm": 22.841885046960563, + "kl": 0.050048828125, + "learning_rate": 8.533380059575959e-07, + "loss": -0.0335, + "reward": 1.5910993814468384, + "reward_std": 0.18110352754592896, + "rewards/accuracy_reward_stage2": 0.6223493814468384, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 838 + }, + { + "completion_length": 9.828125, + "epoch": 0.14701244086209916, + "grad_norm": 20.15835105543159, + "kl": 0.0908203125, + "learning_rate": 8.531627825477483e-07, + "loss": 0.0363, + "reward": 1.7946337461471558, + "reward_std": 0.18121492862701416, + "rewards/accuracy_reward_stage2": 0.794633686542511, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 839 + }, + { + "completion_length": 12.734375, + "epoch": 0.14718766427194674, + "grad_norm": 14.433049590453853, + "kl": 0.06884765625, + "learning_rate": 8.529875591379008e-07, + "loss": 0.0276, + "reward": 1.7972837686538696, + "reward_std": 0.14100028574466705, + "rewards/accuracy_reward_stage2": 0.7972837686538696, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 840 + }, + { + "completion_length": 10.140625, + "epoch": 0.14736288768179429, + "grad_norm": 26.699421426615817, + "kl": 0.1845703125, + "learning_rate": 8.528123357280533e-07, + "loss": 0.0182, + "reward": 1.4089363813400269, + "reward_std": 0.26331624388694763, + "rewards/accuracy_reward_stage2": 0.44018638134002686, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 841 + }, + { + "completion_length": 9.59375, + "epoch": 0.14753811109164183, + "grad_norm": 17.884239561013167, + "kl": 0.0615234375, + "learning_rate": 8.526371123182057e-07, + "loss": -0.0197, + "reward": 1.5981876850128174, + "reward_std": 0.2483270913362503, + "rewards/accuracy_reward_stage2": 0.6138126254081726, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 842 + }, + { + "completion_length": 9.78125, + "epoch": 0.1477133345014894, + "grad_norm": 15.32685599876304, + "kl": 0.0400390625, + "learning_rate": 8.524618889083582e-07, + "loss": 0.016, + "reward": 1.6972908973693848, + "reward_std": 0.09243768453598022, + "rewards/accuracy_reward_stage2": 0.6972908973693848, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 843 + }, + { + "completion_length": 23.53125, + "epoch": 0.14788855791133695, + "grad_norm": 20.172619708324884, + "kl": 0.072265625, + "learning_rate": 8.522866654985105e-07, + "loss": -0.0064, + "reward": 1.3043758869171143, + "reward_std": 0.23132845759391785, + "rewards/accuracy_reward_stage2": 0.44500094652175903, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 844 + }, + { + "completion_length": 6.78125, + "epoch": 0.1480637813211845, + "grad_norm": 14.049334173065237, + "kl": 0.08935546875, + "learning_rate": 8.52111442088663e-07, + "loss": 0.0357, + "reward": 1.6144025325775146, + "reward_std": 0.06398695707321167, + "rewards/accuracy_reward_stage2": 0.6144025325775146, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 845 + }, + { + "completion_length": 6.671875, + "epoch": 0.14823900473103208, + "grad_norm": 9.94233896025104, + "kl": 0.03564453125, + "learning_rate": 8.519362186788155e-07, + "loss": -0.0299, + "reward": 1.5922174453735352, + "reward_std": 0.07707421481609344, + "rewards/accuracy_reward_stage2": 0.6078425049781799, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 846 + }, + { + "completion_length": 24.484375, + "epoch": 0.14841422814087962, + "grad_norm": 20.63713262817355, + "kl": 0.16796875, + "learning_rate": 8.517609952689679e-07, + "loss": -0.0195, + "reward": 1.3995568752288818, + "reward_std": 0.3477005660533905, + "rewards/accuracy_reward_stage2": 0.5558068156242371, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 847 + }, + { + "completion_length": 15.78125, + "epoch": 0.14858945155072717, + "grad_norm": 20.264020472537457, + "kl": 0.044677734375, + "learning_rate": 8.515857718591204e-07, + "loss": 0.0179, + "reward": 1.6210176944732666, + "reward_std": 0.20520731806755066, + "rewards/accuracy_reward_stage2": 0.621017575263977, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 848 + }, + { + "completion_length": 14.3125, + "epoch": 0.14876467496057474, + "grad_norm": 9.600883326034557, + "kl": 0.06640625, + "learning_rate": 8.514105484492728e-07, + "loss": 0.0266, + "reward": 1.5826388597488403, + "reward_std": 0.08929072320461273, + "rewards/accuracy_reward_stage2": 0.5826388597488403, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 849 + }, + { + "completion_length": 12.046875, + "epoch": 0.1489398983704223, + "grad_norm": 16.59982588749903, + "kl": 0.1318359375, + "learning_rate": 8.512353250394252e-07, + "loss": -0.0356, + "reward": 1.71842360496521, + "reward_std": 0.25606924295425415, + "rewards/accuracy_reward_stage2": 0.76529860496521, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 850 + }, + { + "completion_length": 15.203125, + "epoch": 0.14911512178026984, + "grad_norm": 25.532052798300754, + "kl": 0.2490234375, + "learning_rate": 8.510601016295777e-07, + "loss": 0.066, + "reward": 1.5083532333374023, + "reward_std": 0.27988600730895996, + "rewards/accuracy_reward_stage2": 0.6489783525466919, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 851 + }, + { + "completion_length": 10.5, + "epoch": 0.1492903451901174, + "grad_norm": 12.56724203301642, + "kl": 0.0213623046875, + "learning_rate": 8.508848782197301e-07, + "loss": -0.0204, + "reward": 1.546875, + "reward_std": 0.16887323558330536, + "rewards/accuracy_reward_stage2": 0.6875, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 852 + }, + { + "completion_length": 10.78125, + "epoch": 0.14946556859996496, + "grad_norm": 17.700019567042943, + "kl": 0.01123046875, + "learning_rate": 8.507096548098825e-07, + "loss": -0.0397, + "reward": 1.685826063156128, + "reward_std": 0.13276247680187225, + "rewards/accuracy_reward_stage2": 0.7014511227607727, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 853 + }, + { + "completion_length": 10.015625, + "epoch": 0.1496407920098125, + "grad_norm": 13.651544327069582, + "kl": 0.01806640625, + "learning_rate": 8.50534431400035e-07, + "loss": -0.037, + "reward": 1.03125, + "reward_std": 0.1462521106004715, + "rewards/accuracy_reward_stage2": 0.296875, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 854 + }, + { + "completion_length": 11.953125, + "epoch": 0.14981601541966005, + "grad_norm": 24.999448901305616, + "kl": 0.05517578125, + "learning_rate": 8.503592079901874e-07, + "loss": -0.0788, + "reward": 1.5360831022262573, + "reward_std": 0.34290796518325806, + "rewards/accuracy_reward_stage2": 0.5829581618309021, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 855 + }, + { + "completion_length": 9.078125, + "epoch": 0.14999123882950763, + "grad_norm": 16.70769597449124, + "kl": 0.05859375, + "learning_rate": 8.501839845803399e-07, + "loss": -0.0649, + "reward": 1.453125, + "reward_std": 0.308285653591156, + "rewards/accuracy_reward_stage2": 0.484375, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 856 + }, + { + "completion_length": 13.453125, + "epoch": 0.15016646223935518, + "grad_norm": 17.40194584377364, + "kl": 0.111328125, + "learning_rate": 8.500087611704924e-07, + "loss": 0.0166, + "reward": 1.473811149597168, + "reward_std": 0.22556474804878235, + "rewards/accuracy_reward_stage2": 0.6144360303878784, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 857 + }, + { + "completion_length": 7.65625, + "epoch": 0.15034168564920272, + "grad_norm": 22.17923847228564, + "kl": 0.07421875, + "learning_rate": 8.498335377606448e-07, + "loss": -0.0146, + "reward": 1.602588415145874, + "reward_std": 0.2952241003513336, + "rewards/accuracy_reward_stage2": 0.7432133555412292, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 858 + }, + { + "completion_length": 7.796875, + "epoch": 0.1505169090590503, + "grad_norm": 15.660354355947947, + "kl": 0.03759765625, + "learning_rate": 8.496583143507973e-07, + "loss": 0.015, + "reward": 1.6695375442504883, + "reward_std": 0.17127437889575958, + "rewards/accuracy_reward_stage2": 0.6695374846458435, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 859 + }, + { + "completion_length": 11.90625, + "epoch": 0.15069213246889784, + "grad_norm": 21.1108280984021, + "kl": 0.048583984375, + "learning_rate": 8.494830909409497e-07, + "loss": -0.0199, + "reward": 1.5457628965377808, + "reward_std": 0.22555433213710785, + "rewards/accuracy_reward_stage2": 0.561387836933136, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 860 + }, + { + "completion_length": 9.328125, + "epoch": 0.1508673558787454, + "grad_norm": 19.86410560849335, + "kl": 0.0634765625, + "learning_rate": 8.493078675311021e-07, + "loss": -0.0189, + "reward": 1.4820592403411865, + "reward_std": 0.20549719035625458, + "rewards/accuracy_reward_stage2": 0.6226842403411865, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 861 + }, + { + "completion_length": 11.46875, + "epoch": 0.15104257928859297, + "grad_norm": 15.593853856372352, + "kl": 0.04296875, + "learning_rate": 8.491326441212546e-07, + "loss": -0.0765, + "reward": 1.6471900939941406, + "reward_std": 0.2422153800725937, + "rewards/accuracy_reward_stage2": 0.6940651535987854, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 862 + }, + { + "completion_length": 12.953125, + "epoch": 0.1512178026984405, + "grad_norm": 18.740518266665386, + "kl": 0.11865234375, + "learning_rate": 8.489574207114069e-07, + "loss": 0.0186, + "reward": 1.422258734703064, + "reward_std": 0.24776926636695862, + "rewards/accuracy_reward_stage2": 0.4378837049007416, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 863 + }, + { + "completion_length": 10.734375, + "epoch": 0.15139302610828806, + "grad_norm": 20.18460613831639, + "kl": 0.052734375, + "learning_rate": 8.487821973015594e-07, + "loss": 0.0211, + "reward": 1.3923512697219849, + "reward_std": 0.31910377740859985, + "rewards/accuracy_reward_stage2": 0.39235132932662964, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 864 + }, + { + "completion_length": 20.421875, + "epoch": 0.15156824951813563, + "grad_norm": 19.765628340117512, + "kl": 0.1044921875, + "learning_rate": 8.486069738917118e-07, + "loss": 0.0021, + "reward": 1.2905793190002441, + "reward_std": 0.1945052146911621, + "rewards/accuracy_reward_stage2": 0.5562041997909546, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 865 + }, + { + "completion_length": 11.65625, + "epoch": 0.15174347292798318, + "grad_norm": 23.360094504906623, + "kl": 0.171875, + "learning_rate": 8.484317504818643e-07, + "loss": -0.0099, + "reward": 1.372206449508667, + "reward_std": 0.3116145730018616, + "rewards/accuracy_reward_stage2": 0.544081449508667, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 866 + }, + { + "completion_length": 8.109375, + "epoch": 0.15191869633783073, + "grad_norm": 21.28420476628764, + "kl": 0.12255859375, + "learning_rate": 8.482565270720168e-07, + "loss": -0.0343, + "reward": 1.772420883178711, + "reward_std": 0.27512839436531067, + "rewards/accuracy_reward_stage2": 0.8036710619926453, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 867 + }, + { + "completion_length": 12.890625, + "epoch": 0.1520939197476783, + "grad_norm": 13.792981077834618, + "kl": 0.0111083984375, + "learning_rate": 8.480813036621692e-07, + "loss": 0.0044, + "reward": 1.7508642673492432, + "reward_std": 0.050229497253894806, + "rewards/accuracy_reward_stage2": 0.7508642673492432, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 868 + }, + { + "completion_length": 9.0, + "epoch": 0.15226914315752585, + "grad_norm": 20.417553629433254, + "kl": 0.04248046875, + "learning_rate": 8.479060802523217e-07, + "loss": -0.0271, + "reward": 1.6638281345367432, + "reward_std": 0.1942887008190155, + "rewards/accuracy_reward_stage2": 0.6794531941413879, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 869 + }, + { + "completion_length": 10.046875, + "epoch": 0.1524443665673734, + "grad_norm": 15.65798750344904, + "kl": 0.0546875, + "learning_rate": 8.477308568424742e-07, + "loss": 0.0219, + "reward": 1.4875478744506836, + "reward_std": 0.13235189020633698, + "rewards/accuracy_reward_stage2": 0.4875478744506836, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 870 + }, + { + "completion_length": 8.609375, + "epoch": 0.15261958997722094, + "grad_norm": 18.692786962291954, + "kl": 0.107421875, + "learning_rate": 8.475556334326266e-07, + "loss": -0.0761, + "reward": 1.7316548824310303, + "reward_std": 0.24129626154899597, + "rewards/accuracy_reward_stage2": 0.778529942035675, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 871 + }, + { + "completion_length": 11.21875, + "epoch": 0.15279481338706852, + "grad_norm": 36.74883998202966, + "kl": 0.1533203125, + "learning_rate": 8.473804100227791e-07, + "loss": -0.0271, + "reward": 1.359375, + "reward_std": 0.2198973000049591, + "rewards/accuracy_reward_stage2": 0.515625, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 872 + }, + { + "completion_length": 19.890625, + "epoch": 0.15297003679691606, + "grad_norm": 19.386833518301408, + "kl": 0.017333984375, + "learning_rate": 8.472051866129316e-07, + "loss": 0.0069, + "reward": 1.4666125774383545, + "reward_std": 0.1620863825082779, + "rewards/accuracy_reward_stage2": 0.4666125476360321, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 873 + }, + { + "completion_length": 10.96875, + "epoch": 0.1531452602067636, + "grad_norm": 24.861548593450724, + "kl": 0.064453125, + "learning_rate": 8.470299632030838e-07, + "loss": 0.0258, + "reward": 1.605391502380371, + "reward_std": 0.24627715349197388, + "rewards/accuracy_reward_stage2": 0.6053914427757263, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 874 + }, + { + "completion_length": 9.25, + "epoch": 0.1533204836166112, + "grad_norm": 23.9841767692309, + "kl": 0.08740234375, + "learning_rate": 8.468547397932363e-07, + "loss": -0.0403, + "reward": 1.6885817050933838, + "reward_std": 0.39420628547668457, + "rewards/accuracy_reward_stage2": 0.7198317050933838, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 875 + }, + { + "completion_length": 11.59375, + "epoch": 0.15349570702645873, + "grad_norm": 18.208345060683527, + "kl": 0.11279296875, + "learning_rate": 8.466795163833887e-07, + "loss": 0.0008, + "reward": 1.663696050643921, + "reward_std": 0.1911657601594925, + "rewards/accuracy_reward_stage2": 0.6793211102485657, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 876 + }, + { + "completion_length": 9.3125, + "epoch": 0.15367093043630628, + "grad_norm": 18.970527826997365, + "kl": 0.1513671875, + "learning_rate": 8.465042929735412e-07, + "loss": 0.0607, + "reward": 1.5123016834259033, + "reward_std": 0.21880435943603516, + "rewards/accuracy_reward_stage2": 0.6373016834259033, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 877 + }, + { + "completion_length": 8.625, + "epoch": 0.15384615384615385, + "grad_norm": 23.75217986780291, + "kl": 0.11279296875, + "learning_rate": 8.463290695636937e-07, + "loss": -0.0357, + "reward": 1.7432327270507812, + "reward_std": 0.30611133575439453, + "rewards/accuracy_reward_stage2": 0.7744826078414917, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 878 + }, + { + "completion_length": 9.25, + "epoch": 0.1540213772560014, + "grad_norm": 20.18899090894407, + "kl": 0.07421875, + "learning_rate": 8.461538461538461e-07, + "loss": -0.0626, + "reward": 1.6393646001815796, + "reward_std": 0.16706180572509766, + "rewards/accuracy_reward_stage2": 0.6862396597862244, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 879 + }, + { + "completion_length": 11.59375, + "epoch": 0.15419660066584895, + "grad_norm": 22.636159987310997, + "kl": 0.1337890625, + "learning_rate": 8.459786227439986e-07, + "loss": 0.0534, + "reward": 1.5441811084747314, + "reward_std": 0.28956350684165955, + "rewards/accuracy_reward_stage2": 0.6691809892654419, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 880 + }, + { + "completion_length": 7.578125, + "epoch": 0.15437182407569652, + "grad_norm": 90.53936940551583, + "kl": 0.349609375, + "learning_rate": 8.45803399334151e-07, + "loss": 0.0987, + "reward": 1.4019708633422852, + "reward_std": 0.2559443712234497, + "rewards/accuracy_reward_stage2": 0.5425958037376404, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 881 + }, + { + "completion_length": 11.4375, + "epoch": 0.15454704748554407, + "grad_norm": 19.73605396407536, + "kl": 0.1201171875, + "learning_rate": 8.456281759243035e-07, + "loss": 0.048, + "reward": 1.716138243675232, + "reward_std": 0.25902819633483887, + "rewards/accuracy_reward_stage2": 0.7161382436752319, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 882 + }, + { + "completion_length": 13.046875, + "epoch": 0.15472227089539162, + "grad_norm": 25.45542294657479, + "kl": 0.140625, + "learning_rate": 8.45452952514456e-07, + "loss": 0.0561, + "reward": 1.733203411102295, + "reward_std": 0.29068833589553833, + "rewards/accuracy_reward_stage2": 0.7332033514976501, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 883 + }, + { + "completion_length": 9.1875, + "epoch": 0.1548974943052392, + "grad_norm": 20.634460745406873, + "kl": 0.054931640625, + "learning_rate": 8.452777291046083e-07, + "loss": 0.022, + "reward": 1.4329566955566406, + "reward_std": 0.2863396406173706, + "rewards/accuracy_reward_stage2": 0.5579568147659302, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 884 + }, + { + "completion_length": 8.6875, + "epoch": 0.15507271771508674, + "grad_norm": 24.606510940394227, + "kl": 0.07421875, + "learning_rate": 8.451025056947608e-07, + "loss": 0.0047, + "reward": 1.554135799407959, + "reward_std": 0.2531256675720215, + "rewards/accuracy_reward_stage2": 0.5697606801986694, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 885 + }, + { + "completion_length": 11.875, + "epoch": 0.15524794112493429, + "grad_norm": 21.4730685645679, + "kl": 0.138671875, + "learning_rate": 8.449272822849133e-07, + "loss": 0.0334, + "reward": 1.5125616788864136, + "reward_std": 0.1945052444934845, + "rewards/accuracy_reward_stage2": 0.5281866788864136, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 886 + }, + { + "completion_length": 11.328125, + "epoch": 0.15542316453478186, + "grad_norm": 21.448196825835407, + "kl": 0.03125, + "learning_rate": 8.447520588750656e-07, + "loss": -0.0308, + "reward": 1.57716965675354, + "reward_std": 0.2582663893699646, + "rewards/accuracy_reward_stage2": 0.59279465675354, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 887 + }, + { + "completion_length": 10.234375, + "epoch": 0.1555983879446294, + "grad_norm": 23.030439291257906, + "kl": 0.205078125, + "learning_rate": 8.445768354652181e-07, + "loss": 0.0097, + "reward": 1.4215422868728638, + "reward_std": 0.2988489866256714, + "rewards/accuracy_reward_stage2": 0.7027922868728638, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 888 + }, + { + "completion_length": 7.484375, + "epoch": 0.15577361135447695, + "grad_norm": 17.137018589062855, + "kl": 0.09228515625, + "learning_rate": 8.444016120553705e-07, + "loss": 0.0307, + "reward": 1.6334525346755981, + "reward_std": 0.19943121075630188, + "rewards/accuracy_reward_stage2": 0.6490775942802429, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 889 + }, + { + "completion_length": 7.46875, + "epoch": 0.1559488347643245, + "grad_norm": 21.648857530110167, + "kl": 0.10595703125, + "learning_rate": 8.44226388645523e-07, + "loss": 0.0069, + "reward": 1.5306628942489624, + "reward_std": 0.2571268379688263, + "rewards/accuracy_reward_stage2": 0.5462879538536072, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 890 + }, + { + "completion_length": 8.65625, + "epoch": 0.15612405817417208, + "grad_norm": 20.660084693200343, + "kl": 0.011474609375, + "learning_rate": 8.440511652356755e-07, + "loss": 0.0046, + "reward": 1.5625, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward_stage2": 0.5625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 891 + }, + { + "completion_length": 10.84375, + "epoch": 0.15629928158401962, + "grad_norm": 16.66053317450666, + "kl": 0.333984375, + "learning_rate": 8.438759418258279e-07, + "loss": 0.089, + "reward": 1.17367422580719, + "reward_std": 0.2187984734773636, + "rewards/accuracy_reward_stage2": 0.31429922580718994, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 892 + }, + { + "completion_length": 7.75, + "epoch": 0.15647450499386717, + "grad_norm": 16.696832713661816, + "kl": 0.0986328125, + "learning_rate": 8.437007184159803e-07, + "loss": -0.0046, + "reward": 1.5517808198928833, + "reward_std": 0.1454845815896988, + "rewards/accuracy_reward_stage2": 0.5674057602882385, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 893 + }, + { + "completion_length": 13.90625, + "epoch": 0.15664972840371474, + "grad_norm": 18.134303764193838, + "kl": 0.0184326171875, + "learning_rate": 8.435254950061328e-07, + "loss": -0.0657, + "reward": 1.4824566841125488, + "reward_std": 0.10660809278488159, + "rewards/accuracy_reward_stage2": 0.5137066841125488, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 894 + }, + { + "completion_length": 11.546875, + "epoch": 0.1568249518135623, + "grad_norm": 17.47601020099436, + "kl": 0.193359375, + "learning_rate": 8.433502715962852e-07, + "loss": 0.0331, + "reward": 1.3741912841796875, + "reward_std": 0.16278903186321259, + "rewards/accuracy_reward_stage2": 0.514816164970398, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 895 + }, + { + "completion_length": 11.109375, + "epoch": 0.15700017522340984, + "grad_norm": 21.664510515701206, + "kl": 0.08544921875, + "learning_rate": 8.431750481864377e-07, + "loss": 0.0051, + "reward": 1.4644746780395508, + "reward_std": 0.28877538442611694, + "rewards/accuracy_reward_stage2": 0.4800996482372284, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 896 + }, + { + "completion_length": 8.21875, + "epoch": 0.1571753986332574, + "grad_norm": 20.922720090845928, + "kl": 0.08203125, + "learning_rate": 8.429998247765901e-07, + "loss": 0.001, + "reward": 1.4854588508605957, + "reward_std": 0.16852207481861115, + "rewards/accuracy_reward_stage2": 0.5010839104652405, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 897 + }, + { + "completion_length": 10.09375, + "epoch": 0.15735062204310496, + "grad_norm": 20.48289200755099, + "kl": 0.08203125, + "learning_rate": 8.428246013667426e-07, + "loss": 0.0329, + "reward": 1.5313962697982788, + "reward_std": 0.14356286823749542, + "rewards/accuracy_reward_stage2": 0.6563963294029236, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 898 + }, + { + "completion_length": 15.921875, + "epoch": 0.1575258454529525, + "grad_norm": 21.49636538136034, + "kl": 0.11376953125, + "learning_rate": 8.426493779568951e-07, + "loss": 0.0085, + "reward": 1.56718111038208, + "reward_std": 0.15211787819862366, + "rewards/accuracy_reward_stage2": 0.5828061103820801, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 899 + }, + { + "completion_length": 9.859375, + "epoch": 0.15770106886280008, + "grad_norm": 24.25647930376624, + "kl": 0.0615234375, + "learning_rate": 8.424741545470474e-07, + "loss": -0.0028, + "reward": 1.715989589691162, + "reward_std": 0.2878607511520386, + "rewards/accuracy_reward_stage2": 0.7316147089004517, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 900 + }, + { + "completion_length": 10.75, + "epoch": 0.15787629227264763, + "grad_norm": 19.215338734644746, + "kl": 0.0179443359375, + "learning_rate": 8.422989311371999e-07, + "loss": -0.0262, + "reward": 1.4582719802856445, + "reward_std": 0.32733142375946045, + "rewards/accuracy_reward_stage2": 0.5988969802856445, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 901 + }, + { + "completion_length": 14.046875, + "epoch": 0.15805151568249518, + "grad_norm": 23.475241913665684, + "kl": 0.05029296875, + "learning_rate": 8.421237077273524e-07, + "loss": -0.024, + "reward": 1.646759033203125, + "reward_std": 0.231346994638443, + "rewards/accuracy_reward_stage2": 0.662384033203125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 902 + }, + { + "completion_length": 8.859375, + "epoch": 0.15822673909234275, + "grad_norm": 19.976502771099383, + "kl": 0.1640625, + "learning_rate": 8.419484843175047e-07, + "loss": -0.1622, + "reward": 1.475749135017395, + "reward_std": 0.40651267766952515, + "rewards/accuracy_reward_stage2": 0.569499135017395, + "rewards/format_reward_stage1_pointerpad": 0.90625, + "scores/accuracy_reward_stage2": 0.90625, + "step": 903 + }, + { + "completion_length": 16.453125, + "epoch": 0.1584019625021903, + "grad_norm": 11.343613400181686, + "kl": 0.058837890625, + "learning_rate": 8.417732609076572e-07, + "loss": -0.0648, + "reward": 1.4348111152648926, + "reward_std": 0.1632448136806488, + "rewards/accuracy_reward_stage2": 0.4660611152648926, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 904 + }, + { + "completion_length": 9.171875, + "epoch": 0.15857718591203784, + "grad_norm": 19.639344625728686, + "kl": 0.083984375, + "learning_rate": 8.415980374978096e-07, + "loss": -0.0086, + "reward": 1.4668049812316895, + "reward_std": 0.10977669060230255, + "rewards/accuracy_reward_stage2": 0.4824299216270447, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 905 + }, + { + "completion_length": 18.25, + "epoch": 0.1587524093218854, + "grad_norm": 20.04975456715171, + "kl": 0.107421875, + "learning_rate": 8.414228140879621e-07, + "loss": 0.0046, + "reward": 1.3382542133331299, + "reward_std": 0.23458905518054962, + "rewards/accuracy_reward_stage2": 0.47887933254241943, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 906 + }, + { + "completion_length": 11.21875, + "epoch": 0.15892763273173297, + "grad_norm": 20.769525897884474, + "kl": 0.0625, + "learning_rate": 8.412475906781146e-07, + "loss": -0.0192, + "reward": 1.5473453998565674, + "reward_std": 0.1797097623348236, + "rewards/accuracy_reward_stage2": 0.5629702806472778, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 907 + }, + { + "completion_length": 9.15625, + "epoch": 0.1591028561415805, + "grad_norm": 19.600389057571586, + "kl": 0.06298828125, + "learning_rate": 8.41072367268267e-07, + "loss": -0.0082, + "reward": 1.6035189628601074, + "reward_std": 0.16768498718738556, + "rewards/accuracy_reward_stage2": 0.6191439628601074, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 908 + }, + { + "completion_length": 8.765625, + "epoch": 0.15927807955142806, + "grad_norm": 42.576306532869665, + "kl": 0.38671875, + "learning_rate": 8.408971438584195e-07, + "loss": 0.1551, + "reward": 1.4866942167282104, + "reward_std": 0.23718145489692688, + "rewards/accuracy_reward_stage2": 0.6116942763328552, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 909 + }, + { + "completion_length": 11.09375, + "epoch": 0.15945330296127563, + "grad_norm": 20.471116146597804, + "kl": 0.0286865234375, + "learning_rate": 8.40721920448572e-07, + "loss": 0.0115, + "reward": 1.4572649002075195, + "reward_std": 0.15716366469860077, + "rewards/accuracy_reward_stage2": 0.4572649598121643, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 910 + }, + { + "completion_length": 12.765625, + "epoch": 0.15962852637112318, + "grad_norm": 19.459024514081843, + "kl": 0.1103515625, + "learning_rate": 8.405466970387244e-07, + "loss": 0.0006, + "reward": 1.7537041902542114, + "reward_std": 0.11763329803943634, + "rewards/accuracy_reward_stage2": 0.7693291902542114, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 911 + }, + { + "completion_length": 11.390625, + "epoch": 0.15980374978097073, + "grad_norm": 25.867143049061717, + "kl": 0.126953125, + "learning_rate": 8.403714736288767e-07, + "loss": 0.0117, + "reward": 1.6886465549468994, + "reward_std": 0.22849583625793457, + "rewards/accuracy_reward_stage2": 0.7042715549468994, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 912 + }, + { + "completion_length": 10.375, + "epoch": 0.1599789731908183, + "grad_norm": 24.869482371507228, + "kl": 0.036376953125, + "learning_rate": 8.401962502190291e-07, + "loss": -0.0296, + "reward": 1.4990651607513428, + "reward_std": 0.38119715452194214, + "rewards/accuracy_reward_stage2": 0.6396902203559875, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 913 + }, + { + "completion_length": 11.8125, + "epoch": 0.16015419660066585, + "grad_norm": 15.483895225098495, + "kl": 0.130859375, + "learning_rate": 8.400210268091816e-07, + "loss": 0.0523, + "reward": 1.4544987678527832, + "reward_std": 0.18442535400390625, + "rewards/accuracy_reward_stage2": 0.5794986486434937, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 914 + }, + { + "completion_length": 18.265625, + "epoch": 0.1603294200105134, + "grad_norm": 23.179724791748395, + "kl": 0.050048828125, + "learning_rate": 8.398458033993341e-07, + "loss": 0.02, + "reward": 1.7503162622451782, + "reward_std": 0.1276683211326599, + "rewards/accuracy_reward_stage2": 0.7503161430358887, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 915 + }, + { + "completion_length": 6.546875, + "epoch": 0.16050464342036097, + "grad_norm": 18.057542812191738, + "kl": 0.0390625, + "learning_rate": 8.396705799894865e-07, + "loss": -0.1062, + "reward": 1.418050765991211, + "reward_std": 0.23071405291557312, + "rewards/accuracy_reward_stage2": 0.46492570638656616, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 916 + }, + { + "completion_length": 9.3125, + "epoch": 0.16067986683020852, + "grad_norm": 19.653550716601625, + "kl": 0.0546875, + "learning_rate": 8.39495356579639e-07, + "loss": -0.007, + "reward": 1.4791667461395264, + "reward_std": 0.2661178410053253, + "rewards/accuracy_reward_stage2": 0.4947916865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 917 + }, + { + "completion_length": 19.21875, + "epoch": 0.16085509024005606, + "grad_norm": 14.689042788013323, + "kl": 0.06396484375, + "learning_rate": 8.393201331697915e-07, + "loss": -0.0184, + "reward": 1.3778043985366821, + "reward_std": 0.18908536434173584, + "rewards/accuracy_reward_stage2": 0.5184293985366821, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 918 + }, + { + "completion_length": 10.53125, + "epoch": 0.16103031364990364, + "grad_norm": 24.26621589117923, + "kl": 0.126953125, + "learning_rate": 8.391449097599439e-07, + "loss": 0.0219, + "reward": 1.4894888401031494, + "reward_std": 0.2673390507698059, + "rewards/accuracy_reward_stage2": 0.505113959312439, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 919 + }, + { + "completion_length": 7.390625, + "epoch": 0.16120553705975119, + "grad_norm": 18.517823793201053, + "kl": 0.07470703125, + "learning_rate": 8.389696863500964e-07, + "loss": 0.0132, + "reward": 1.5432384014129639, + "reward_std": 0.24379214644432068, + "rewards/accuracy_reward_stage2": 0.5588634014129639, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 920 + }, + { + "completion_length": 12.3125, + "epoch": 0.16138076046959873, + "grad_norm": 17.44714427018299, + "kl": 0.25390625, + "learning_rate": 8.387944629402488e-07, + "loss": 0.0665, + "reward": 1.4797606468200684, + "reward_std": 0.20411017537117004, + "rewards/accuracy_reward_stage2": 0.6203855276107788, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 921 + }, + { + "completion_length": 12.015625, + "epoch": 0.1615559838794463, + "grad_norm": 18.831065124465535, + "kl": 0.060302734375, + "learning_rate": 8.386192395304013e-07, + "loss": 0.0241, + "reward": 1.7214339971542358, + "reward_std": 0.19486960768699646, + "rewards/accuracy_reward_stage2": 0.7214340567588806, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 922 + }, + { + "completion_length": 8.40625, + "epoch": 0.16173120728929385, + "grad_norm": 16.571351956065644, + "kl": 0.08251953125, + "learning_rate": 8.384440161205537e-07, + "loss": -0.0004, + "reward": 1.573890209197998, + "reward_std": 0.15192250907421112, + "rewards/accuracy_reward_stage2": 0.589515209197998, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 923 + }, + { + "completion_length": 10.5625, + "epoch": 0.1619064306991414, + "grad_norm": 21.658736598327444, + "kl": 0.026123046875, + "learning_rate": 8.382687927107061e-07, + "loss": 0.0104, + "reward": 1.7161989212036133, + "reward_std": 0.1691042184829712, + "rewards/accuracy_reward_stage2": 0.7161988019943237, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 924 + }, + { + "completion_length": 5.546875, + "epoch": 0.16208165410898895, + "grad_norm": 15.044499024913339, + "kl": 0.025634765625, + "learning_rate": 8.380935693008585e-07, + "loss": 0.0102, + "reward": 1.7326302528381348, + "reward_std": 0.18306495249271393, + "rewards/accuracy_reward_stage2": 0.7326301336288452, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 925 + }, + { + "completion_length": 14.03125, + "epoch": 0.16225687751883652, + "grad_norm": 16.145061763378322, + "kl": 0.050048828125, + "learning_rate": 8.37918345891011e-07, + "loss": 0.02, + "reward": 1.646541953086853, + "reward_std": 0.09941836446523666, + "rewards/accuracy_reward_stage2": 0.771541953086853, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 926 + }, + { + "completion_length": 12.109375, + "epoch": 0.16243210092868407, + "grad_norm": 22.317007705230505, + "kl": 0.01904296875, + "learning_rate": 8.377431224811634e-07, + "loss": 0.0077, + "reward": 1.5018961429595947, + "reward_std": 0.20814120769500732, + "rewards/accuracy_reward_stage2": 0.5018961429595947, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 927 + }, + { + "completion_length": 11.640625, + "epoch": 0.16260732433853162, + "grad_norm": 15.262903698390526, + "kl": 0.036376953125, + "learning_rate": 8.375678990713159e-07, + "loss": -0.0296, + "reward": 1.6556651592254639, + "reward_std": 0.18859100341796875, + "rewards/accuracy_reward_stage2": 0.6712901592254639, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 928 + }, + { + "completion_length": 8.03125, + "epoch": 0.1627825477483792, + "grad_norm": 17.09237768184502, + "kl": 0.04638671875, + "learning_rate": 8.373926756614683e-07, + "loss": 0.0185, + "reward": 1.5406862497329712, + "reward_std": 0.18566709756851196, + "rewards/accuracy_reward_stage2": 0.5406862497329712, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 929 + }, + { + "completion_length": 14.859375, + "epoch": 0.16295777115822674, + "grad_norm": 18.91427736792877, + "kl": 0.06787109375, + "learning_rate": 8.372174522516208e-07, + "loss": 0.0271, + "reward": 1.4712986946105957, + "reward_std": 0.097105011343956, + "rewards/accuracy_reward_stage2": 0.47129860520362854, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 930 + }, + { + "completion_length": 9.75, + "epoch": 0.16313299456807429, + "grad_norm": 22.340739109880076, + "kl": 0.0123291015625, + "learning_rate": 8.370422288417733e-07, + "loss": 0.0049, + "reward": 1.4322266578674316, + "reward_std": 0.3248811960220337, + "rewards/accuracy_reward_stage2": 0.43222665786743164, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 931 + }, + { + "completion_length": 8.71875, + "epoch": 0.16330821797792186, + "grad_norm": 19.42803457345458, + "kl": 0.07275390625, + "learning_rate": 8.368670054319256e-07, + "loss": 0.0291, + "reward": 1.2958667278289795, + "reward_std": 0.1573130041360855, + "rewards/accuracy_reward_stage2": 0.29586678743362427, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 932 + }, + { + "completion_length": 12.328125, + "epoch": 0.1634834413877694, + "grad_norm": 15.734435814984927, + "kl": 0.12451171875, + "learning_rate": 8.366917820220781e-07, + "loss": -0.035, + "reward": 1.3563389778137207, + "reward_std": 0.21657343208789825, + "rewards/accuracy_reward_stage2": 0.5125889778137207, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 933 + }, + { + "completion_length": 11.765625, + "epoch": 0.16365866479761695, + "grad_norm": 19.72464887053107, + "kl": 0.04541015625, + "learning_rate": 8.365165586122306e-07, + "loss": 0.0182, + "reward": 1.3644483089447021, + "reward_std": 0.20462146401405334, + "rewards/accuracy_reward_stage2": 0.36444830894470215, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 934 + }, + { + "completion_length": 8.5, + "epoch": 0.16383388820746453, + "grad_norm": 17.043259339917427, + "kl": 0.0595703125, + "learning_rate": 8.36341335202383e-07, + "loss": 0.0239, + "reward": 1.6852599382400513, + "reward_std": 0.15038639307022095, + "rewards/accuracy_reward_stage2": 0.6852599382400513, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 935 + }, + { + "completion_length": 9.65625, + "epoch": 0.16400911161731208, + "grad_norm": 17.623879303196922, + "kl": 0.0673828125, + "learning_rate": 8.361661117925355e-07, + "loss": 0.0269, + "reward": 1.4992897510528564, + "reward_std": 0.14591173827648163, + "rewards/accuracy_reward_stage2": 0.49928975105285645, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 936 + }, + { + "completion_length": 10.71875, + "epoch": 0.16418433502715962, + "grad_norm": 20.812568655675257, + "kl": 0.051513671875, + "learning_rate": 8.359908883826879e-07, + "loss": 0.0206, + "reward": 1.5781810283660889, + "reward_std": 0.2800504267215729, + "rewards/accuracy_reward_stage2": 0.5781811475753784, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 937 + }, + { + "completion_length": 10.171875, + "epoch": 0.1643595584370072, + "grad_norm": 17.797227923047306, + "kl": 0.0172119140625, + "learning_rate": 8.358156649728403e-07, + "loss": 0.0069, + "reward": 1.610327124595642, + "reward_std": 0.06501858681440353, + "rewards/accuracy_reward_stage2": 0.7353270649909973, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 938 + }, + { + "completion_length": 11.09375, + "epoch": 0.16453478184685474, + "grad_norm": 13.469802078992393, + "kl": 0.048583984375, + "learning_rate": 8.356404415629928e-07, + "loss": 0.0194, + "reward": 1.607242465019226, + "reward_std": 0.07041595876216888, + "rewards/accuracy_reward_stage2": 0.6072424054145813, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 939 + }, + { + "completion_length": 8.734375, + "epoch": 0.1647100052567023, + "grad_norm": 17.907630506038068, + "kl": 0.10595703125, + "learning_rate": 8.354652181531452e-07, + "loss": -0.0203, + "reward": 1.6189165115356445, + "reward_std": 0.20212598145008087, + "rewards/accuracy_reward_stage2": 0.7751665115356445, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 940 + }, + { + "completion_length": 22.96875, + "epoch": 0.16488522866654984, + "grad_norm": 24.025578235260905, + "kl": 0.054931640625, + "learning_rate": 8.352899947432977e-07, + "loss": 0.022, + "reward": 1.6079862117767334, + "reward_std": 0.24116870760917664, + "rewards/accuracy_reward_stage2": 0.6079861521720886, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 941 + }, + { + "completion_length": 9.046875, + "epoch": 0.1650604520763974, + "grad_norm": 19.68392029533682, + "kl": 0.17578125, + "learning_rate": 8.3511477133345e-07, + "loss": 0.0702, + "reward": 1.2127020359039307, + "reward_std": 0.26052191853523254, + "rewards/accuracy_reward_stage2": 0.33770206570625305, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 942 + }, + { + "completion_length": 14.4375, + "epoch": 0.16523567548624496, + "grad_norm": 15.842916725269227, + "kl": 0.04345703125, + "learning_rate": 8.349395479236025e-07, + "loss": 0.0174, + "reward": 1.5224876403808594, + "reward_std": 0.16868887841701508, + "rewards/accuracy_reward_stage2": 0.5224875807762146, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 943 + }, + { + "completion_length": 9.125, + "epoch": 0.1654108988960925, + "grad_norm": 14.834285405679069, + "kl": 0.04345703125, + "learning_rate": 8.34764324513755e-07, + "loss": 0.0174, + "reward": 1.6600130796432495, + "reward_std": 0.0960196852684021, + "rewards/accuracy_reward_stage2": 0.6600130796432495, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 944 + }, + { + "completion_length": 17.140625, + "epoch": 0.16558612230594008, + "grad_norm": 16.652320078685968, + "kl": 0.0152587890625, + "learning_rate": 8.345891011039074e-07, + "loss": 0.0061, + "reward": 1.234375, + "reward_std": 0.23144522309303284, + "rewards/accuracy_reward_stage2": 0.359375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 945 + }, + { + "completion_length": 15.6875, + "epoch": 0.16576134571578763, + "grad_norm": 15.300518563687973, + "kl": 0.052490234375, + "learning_rate": 8.344138776940599e-07, + "loss": 0.021, + "reward": 1.638373851776123, + "reward_std": 0.1297706514596939, + "rewards/accuracy_reward_stage2": 0.6383737921714783, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 946 + }, + { + "completion_length": 19.375, + "epoch": 0.16593656912563517, + "grad_norm": 21.360435306543938, + "kl": 0.11328125, + "learning_rate": 8.342386542842124e-07, + "loss": 0.0452, + "reward": 1.4355685710906982, + "reward_std": 0.14541111886501312, + "rewards/accuracy_reward_stage2": 0.5605685710906982, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 947 + }, + { + "completion_length": 8.15625, + "epoch": 0.16611179253548275, + "grad_norm": 20.031436354993605, + "kl": 0.06787109375, + "learning_rate": 8.340634308743648e-07, + "loss": 0.0272, + "reward": 1.5705227851867676, + "reward_std": 0.1583014577627182, + "rewards/accuracy_reward_stage2": 0.5705227255821228, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 948 + }, + { + "completion_length": 12.109375, + "epoch": 0.1662870159453303, + "grad_norm": 18.51744509104956, + "kl": 0.04736328125, + "learning_rate": 8.338882074645173e-07, + "loss": 0.0189, + "reward": 1.668715476989746, + "reward_std": 0.2258095145225525, + "rewards/accuracy_reward_stage2": 0.6687155961990356, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 949 + }, + { + "completion_length": 7.25, + "epoch": 0.16646223935517784, + "grad_norm": 21.24530602480943, + "kl": 0.1337890625, + "learning_rate": 8.337129840546698e-07, + "loss": 0.0094, + "reward": 1.4853681325912476, + "reward_std": 0.21900716423988342, + "rewards/accuracy_reward_stage2": 0.6259931325912476, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 950 + }, + { + "completion_length": 7.953125, + "epoch": 0.16663746276502542, + "grad_norm": 18.566300126795337, + "kl": 0.060546875, + "learning_rate": 8.335377606448221e-07, + "loss": -0.02, + "reward": 1.4947917461395264, + "reward_std": 0.2537845969200134, + "rewards/accuracy_reward_stage2": 0.5104166865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 951 + }, + { + "completion_length": 11.390625, + "epoch": 0.16681268617487297, + "grad_norm": 17.93695120274296, + "kl": 0.0308837890625, + "learning_rate": 8.333625372349745e-07, + "loss": 0.0123, + "reward": 1.4117063283920288, + "reward_std": 0.21608895063400269, + "rewards/accuracy_reward_stage2": 0.5367063283920288, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 952 + }, + { + "completion_length": 12.546875, + "epoch": 0.1669879095847205, + "grad_norm": 17.191049053892815, + "kl": 0.08056640625, + "learning_rate": 8.331873138251269e-07, + "loss": 0.0321, + "reward": 1.3560242652893066, + "reward_std": 0.19221973419189453, + "rewards/accuracy_reward_stage2": 0.4810241460800171, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 953 + }, + { + "completion_length": 11.21875, + "epoch": 0.1671631329945681, + "grad_norm": 14.71217338958482, + "kl": 0.0576171875, + "learning_rate": 8.330120904152794e-07, + "loss": 0.0231, + "reward": 1.5724213123321533, + "reward_std": 0.11429198831319809, + "rewards/accuracy_reward_stage2": 0.5724212527275085, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 954 + }, + { + "completion_length": 9.203125, + "epoch": 0.16733835640441563, + "grad_norm": 17.366397954858957, + "kl": 0.0830078125, + "learning_rate": 8.328368670054319e-07, + "loss": 0.0087, + "reward": 1.718414306640625, + "reward_std": 0.16092431545257568, + "rewards/accuracy_reward_stage2": 0.7340393662452698, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 955 + }, + { + "completion_length": 10.0625, + "epoch": 0.16751357981426318, + "grad_norm": 19.431897152020653, + "kl": 0.0255126953125, + "learning_rate": 8.326616435955843e-07, + "loss": 0.0102, + "reward": 1.502739667892456, + "reward_std": 0.2304997444152832, + "rewards/accuracy_reward_stage2": 0.5027396082878113, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 956 + }, + { + "completion_length": 9.4375, + "epoch": 0.16768880322411073, + "grad_norm": 20.418223177440694, + "kl": 0.2353515625, + "learning_rate": 8.324864201857368e-07, + "loss": 0.0057, + "reward": 1.495002269744873, + "reward_std": 0.1964532732963562, + "rewards/accuracy_reward_stage2": 0.651252269744873, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 957 + }, + { + "completion_length": 8.375, + "epoch": 0.1678640266339583, + "grad_norm": 15.965850962778783, + "kl": 0.0233154296875, + "learning_rate": 8.323111967758892e-07, + "loss": 0.0093, + "reward": 1.4397320747375488, + "reward_std": 0.22162576019763947, + "rewards/accuracy_reward_stage2": 0.4397321343421936, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 958 + }, + { + "completion_length": 8.15625, + "epoch": 0.16803925004380585, + "grad_norm": 18.8964180423816, + "kl": 0.12060546875, + "learning_rate": 8.321359733660417e-07, + "loss": 0.012, + "reward": 1.560694694519043, + "reward_std": 0.3037213087081909, + "rewards/accuracy_reward_stage2": 0.5763195753097534, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 959 + }, + { + "completion_length": 11.453125, + "epoch": 0.1682144734536534, + "grad_norm": 13.521559828752565, + "kl": 0.040771484375, + "learning_rate": 8.319607499561942e-07, + "loss": -0.0185, + "reward": 1.584068775177002, + "reward_std": 0.15472474694252014, + "rewards/accuracy_reward_stage2": 0.599693775177002, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 960 + }, + { + "completion_length": 11.578125, + "epoch": 0.16838969686350097, + "grad_norm": 26.76151915493076, + "kl": 0.119140625, + "learning_rate": 8.317855265463466e-07, + "loss": 0.0478, + "reward": 1.4074745178222656, + "reward_std": 0.27609723806381226, + "rewards/accuracy_reward_stage2": 0.4074746072292328, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 961 + }, + { + "completion_length": 6.59375, + "epoch": 0.16856492027334852, + "grad_norm": 13.049430572037243, + "kl": 0.16015625, + "learning_rate": 8.31610303136499e-07, + "loss": 0.0638, + "reward": 1.546875, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.796875, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 962 + }, + { + "completion_length": 8.734375, + "epoch": 0.16874014368319606, + "grad_norm": 15.147946605641504, + "kl": 0.2216796875, + "learning_rate": 8.314350797266514e-07, + "loss": 0.0887, + "reward": 1.2102738618850708, + "reward_std": 0.09944657981395721, + "rewards/accuracy_reward_stage2": 0.3352738320827484, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 963 + }, + { + "completion_length": 13.09375, + "epoch": 0.16891536709304364, + "grad_norm": 23.67007480900033, + "kl": 0.17578125, + "learning_rate": 8.312598563168038e-07, + "loss": 0.0703, + "reward": 1.0486290454864502, + "reward_std": 0.28356683254241943, + "rewards/accuracy_reward_stage2": 0.4236289858818054, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 964 + }, + { + "completion_length": 23.234375, + "epoch": 0.16909059050289119, + "grad_norm": 14.766032051622496, + "kl": 0.04248046875, + "learning_rate": 8.310846329069563e-07, + "loss": 0.017, + "reward": 1.5564314126968384, + "reward_std": 0.15278059244155884, + "rewards/accuracy_reward_stage2": 0.6814314723014832, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 965 + }, + { + "completion_length": 10.578125, + "epoch": 0.16926581391273873, + "grad_norm": 19.302599742311433, + "kl": 0.1376953125, + "learning_rate": 8.309094094971087e-07, + "loss": 0.0112, + "reward": 1.621635913848877, + "reward_std": 0.28337401151657104, + "rewards/accuracy_reward_stage2": 0.6372608542442322, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 966 + }, + { + "completion_length": 7.296875, + "epoch": 0.1694410373225863, + "grad_norm": 20.30266988191586, + "kl": 0.11181640625, + "learning_rate": 8.307341860872612e-07, + "loss": 0.0449, + "reward": 1.5777642726898193, + "reward_std": 0.18352115154266357, + "rewards/accuracy_reward_stage2": 0.5777642726898193, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 967 + }, + { + "completion_length": 10.15625, + "epoch": 0.16961626073243385, + "grad_norm": 25.58995419244153, + "kl": 0.03515625, + "learning_rate": 8.305589626774137e-07, + "loss": -0.0632, + "reward": 1.4791667461395264, + "reward_std": 0.34395408630371094, + "rewards/accuracy_reward_stage2": 0.6354166865348816, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 968 + }, + { + "completion_length": 23.40625, + "epoch": 0.1697914841422814, + "grad_norm": 19.70117347558748, + "kl": 0.0712890625, + "learning_rate": 8.303837392675661e-07, + "loss": 0.0285, + "reward": 1.3151779174804688, + "reward_std": 0.16388216614723206, + "rewards/accuracy_reward_stage2": 0.31517791748046875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 969 + }, + { + "completion_length": 7.390625, + "epoch": 0.16996670755212898, + "grad_norm": 13.108220090014575, + "kl": 0.04150390625, + "learning_rate": 8.302085158577186e-07, + "loss": 0.0166, + "reward": 1.4289811849594116, + "reward_std": 0.14381805062294006, + "rewards/accuracy_reward_stage2": 0.42898115515708923, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 970 + }, + { + "completion_length": 9.09375, + "epoch": 0.17014193096197652, + "grad_norm": 15.970366158750508, + "kl": 0.037841796875, + "learning_rate": 8.300332924478711e-07, + "loss": -0.029, + "reward": 1.5887812376022339, + "reward_std": 0.10244224965572357, + "rewards/accuracy_reward_stage2": 0.6044061779975891, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 971 + }, + { + "completion_length": 7.359375, + "epoch": 0.17031715437182407, + "grad_norm": 17.71757056881451, + "kl": 0.10107421875, + "learning_rate": 8.298580690380234e-07, + "loss": -0.0037, + "reward": 1.5167280435562134, + "reward_std": 0.16086438298225403, + "rewards/accuracy_reward_stage2": 0.5323530435562134, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 972 + }, + { + "completion_length": 9.125, + "epoch": 0.17049237778167164, + "grad_norm": 15.226591161141798, + "kl": 0.053955078125, + "learning_rate": 8.296828456281759e-07, + "loss": -0.0073, + "reward": 1.4342520236968994, + "reward_std": 0.18709491193294525, + "rewards/accuracy_reward_stage2": 0.5592520833015442, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 973 + }, + { + "completion_length": 11.28125, + "epoch": 0.1706676011915192, + "grad_norm": 23.202770013131985, + "kl": 0.037109375, + "learning_rate": 8.295076222183283e-07, + "loss": -0.0268, + "reward": 1.741548776626587, + "reward_std": 0.2346932739019394, + "rewards/accuracy_reward_stage2": 0.7571737766265869, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 974 + }, + { + "completion_length": 10.46875, + "epoch": 0.17084282460136674, + "grad_norm": 20.769549195676348, + "kl": 0.08349609375, + "learning_rate": 8.293323988084808e-07, + "loss": 0.0334, + "reward": 1.4408023357391357, + "reward_std": 0.36021432280540466, + "rewards/accuracy_reward_stage2": 0.4408022463321686, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 975 + }, + { + "completion_length": 9.5625, + "epoch": 0.17101804801121429, + "grad_norm": 15.462756460452283, + "kl": 0.05419921875, + "learning_rate": 8.291571753986332e-07, + "loss": -0.0226, + "reward": 1.5718014240264893, + "reward_std": 0.17255185544490814, + "rewards/accuracy_reward_stage2": 0.7124263644218445, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 976 + }, + { + "completion_length": 23.484375, + "epoch": 0.17119327142106186, + "grad_norm": 17.53958845633692, + "kl": 0.051025390625, + "learning_rate": 8.289819519887856e-07, + "loss": -0.0238, + "reward": 1.3811914920806885, + "reward_std": 0.2425132691860199, + "rewards/accuracy_reward_stage2": 0.3968164622783661, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 977 + }, + { + "completion_length": 7.828125, + "epoch": 0.1713684948309094, + "grad_norm": 22.068611094884353, + "kl": 0.08349609375, + "learning_rate": 8.288067285789381e-07, + "loss": -0.0325, + "reward": 1.6086739301681519, + "reward_std": 0.31228816509246826, + "rewards/accuracy_reward_stage2": 0.6399239301681519, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 978 + }, + { + "completion_length": 8.265625, + "epoch": 0.17154371824075695, + "grad_norm": 19.72837271320354, + "kl": 0.060791015625, + "learning_rate": 8.286315051690906e-07, + "loss": 0.0243, + "reward": 1.3992738723754883, + "reward_std": 0.26514434814453125, + "rewards/accuracy_reward_stage2": 0.3992738425731659, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 979 + }, + { + "completion_length": 19.65625, + "epoch": 0.17171894165060453, + "grad_norm": 20.081802914517276, + "kl": 0.035400390625, + "learning_rate": 8.28456281759243e-07, + "loss": 0.0142, + "reward": 1.3881545066833496, + "reward_std": 0.09738902747631073, + "rewards/accuracy_reward_stage2": 0.5131544470787048, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 980 + }, + { + "completion_length": 10.109375, + "epoch": 0.17189416506045208, + "grad_norm": 151.70299207827858, + "kl": 0.82421875, + "learning_rate": 8.282810583493955e-07, + "loss": 0.3295, + "reward": 1.504793643951416, + "reward_std": 0.12093257904052734, + "rewards/accuracy_reward_stage2": 0.6297937631607056, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 981 + }, + { + "completion_length": 13.046875, + "epoch": 0.17206938847029962, + "grad_norm": 15.725761134737253, + "kl": 0.048828125, + "learning_rate": 8.281058349395478e-07, + "loss": -0.0246, + "reward": 1.321092128753662, + "reward_std": 0.25256600975990295, + "rewards/accuracy_reward_stage2": 0.3367171585559845, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 982 + }, + { + "completion_length": 11.515625, + "epoch": 0.1722446118801472, + "grad_norm": 19.530650469401575, + "kl": 0.072265625, + "learning_rate": 8.279306115297003e-07, + "loss": 0.0027, + "reward": 1.491995096206665, + "reward_std": 0.2790702283382416, + "rewards/accuracy_reward_stage2": 0.507620096206665, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 983 + }, + { + "completion_length": 6.390625, + "epoch": 0.17241983528999474, + "grad_norm": 18.369641912161285, + "kl": 0.07958984375, + "learning_rate": 8.277553881198528e-07, + "loss": -0.1004, + "reward": 1.7411483526229858, + "reward_std": 0.2977067530155182, + "rewards/accuracy_reward_stage2": 0.7880233526229858, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 984 + }, + { + "completion_length": 12.4375, + "epoch": 0.1725950586998423, + "grad_norm": 21.629425902778692, + "kl": 0.10302734375, + "learning_rate": 8.275801647100052e-07, + "loss": -0.0549, + "reward": 1.7416812181472778, + "reward_std": 0.29112160205841064, + "rewards/accuracy_reward_stage2": 0.7885562777519226, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 985 + }, + { + "completion_length": 8.890625, + "epoch": 0.17277028210968987, + "grad_norm": 14.636965958095939, + "kl": 0.0673828125, + "learning_rate": 8.274049413001577e-07, + "loss": 0.0269, + "reward": 1.5075805187225342, + "reward_std": 0.12128952145576477, + "rewards/accuracy_reward_stage2": 0.5075805187225342, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 986 + }, + { + "completion_length": 11.28125, + "epoch": 0.1729455055195374, + "grad_norm": 16.800360042148473, + "kl": 0.1484375, + "learning_rate": 8.272297178903102e-07, + "loss": -0.0479, + "reward": 1.6010843515396118, + "reward_std": 0.21340158581733704, + "rewards/accuracy_reward_stage2": 0.647959291934967, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 987 + }, + { + "completion_length": 9.359375, + "epoch": 0.17312072892938496, + "grad_norm": 19.29185552280403, + "kl": 0.0673828125, + "learning_rate": 8.270544944804626e-07, + "loss": -0.0171, + "reward": 1.329951286315918, + "reward_std": 0.21836236119270325, + "rewards/accuracy_reward_stage2": 0.47057637572288513, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 988 + }, + { + "completion_length": 10.828125, + "epoch": 0.17329595233923253, + "grad_norm": 17.61933481117207, + "kl": 0.162109375, + "learning_rate": 8.26879271070615e-07, + "loss": 0.0647, + "reward": 1.7506136894226074, + "reward_std": 0.11140866577625275, + "rewards/accuracy_reward_stage2": 0.7506136894226074, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 989 + }, + { + "completion_length": 15.75, + "epoch": 0.17347117574908008, + "grad_norm": 16.550933351259392, + "kl": 0.0216064453125, + "learning_rate": 8.267040476607674e-07, + "loss": -0.074, + "reward": 1.5046889781951904, + "reward_std": 0.23458629846572876, + "rewards/accuracy_reward_stage2": 0.5359390377998352, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 990 + }, + { + "completion_length": 10.5625, + "epoch": 0.17364639915892763, + "grad_norm": 18.65045253777362, + "kl": 0.0810546875, + "learning_rate": 8.265288242509199e-07, + "loss": 0.0323, + "reward": 1.4617888927459717, + "reward_std": 0.3118630647659302, + "rewards/accuracy_reward_stage2": 0.4617888927459717, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 991 + }, + { + "completion_length": 10.171875, + "epoch": 0.17382162256877517, + "grad_norm": 21.35463925406452, + "kl": 0.228515625, + "learning_rate": 8.263536008410723e-07, + "loss": 0.0918, + "reward": 1.6203045845031738, + "reward_std": 0.1743711233139038, + "rewards/accuracy_reward_stage2": 0.7453045845031738, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 992 + }, + { + "completion_length": 8.78125, + "epoch": 0.17399684597862275, + "grad_norm": 16.277081608292, + "kl": 0.1943359375, + "learning_rate": 8.261783774312247e-07, + "loss": 0.0777, + "reward": 1.471451997756958, + "reward_std": 0.17242538928985596, + "rewards/accuracy_reward_stage2": 0.721451997756958, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 993 + }, + { + "completion_length": 8.671875, + "epoch": 0.1741720693884703, + "grad_norm": 20.263183545075382, + "kl": 0.049560546875, + "learning_rate": 8.260031540213772e-07, + "loss": 0.0084, + "reward": 1.3886260986328125, + "reward_std": 0.3107958137989044, + "rewards/accuracy_reward_stage2": 0.5136260986328125, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 994 + }, + { + "completion_length": 9.296875, + "epoch": 0.17434729279831784, + "grad_norm": 19.59160552675768, + "kl": 0.09033203125, + "learning_rate": 8.258279306115297e-07, + "loss": 0.0362, + "reward": 1.3483185768127441, + "reward_std": 0.2468905746936798, + "rewards/accuracy_reward_stage2": 0.34831857681274414, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 995 + }, + { + "completion_length": 10.65625, + "epoch": 0.17452251620816542, + "grad_norm": 18.529946519101127, + "kl": 0.11474609375, + "learning_rate": 8.256527072016821e-07, + "loss": 0.0459, + "reward": 1.512170433998108, + "reward_std": 0.12816551327705383, + "rewards/accuracy_reward_stage2": 0.6371704339981079, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 996 + }, + { + "completion_length": 8.078125, + "epoch": 0.17469773961801296, + "grad_norm": 24.896906912430655, + "kl": 0.310546875, + "learning_rate": 8.254774837918346e-07, + "loss": 0.1239, + "reward": 1.4386553764343262, + "reward_std": 0.1602693349123001, + "rewards/accuracy_reward_stage2": 0.5636553764343262, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 997 + }, + { + "completion_length": 8.09375, + "epoch": 0.1748729630278605, + "grad_norm": 24.22248073843513, + "kl": 0.0458984375, + "learning_rate": 8.25302260381987e-07, + "loss": 0.0183, + "reward": 1.6866368055343628, + "reward_std": 0.20650362968444824, + "rewards/accuracy_reward_stage2": 0.6866368055343628, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 998 + }, + { + "completion_length": 11.0625, + "epoch": 0.1750481864377081, + "grad_norm": 18.275631526717135, + "kl": 0.037353515625, + "learning_rate": 8.251270369721395e-07, + "loss": 0.0149, + "reward": 1.853606939315796, + "reward_std": 0.21327659487724304, + "rewards/accuracy_reward_stage2": 0.8536069393157959, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 999 + }, + { + "completion_length": 7.328125, + "epoch": 0.17522340984755563, + "grad_norm": 20.812654899712545, + "kl": 0.07080078125, + "learning_rate": 8.24951813562292e-07, + "loss": -0.016, + "reward": 1.7883012294769287, + "reward_std": 0.17655321955680847, + "rewards/accuracy_reward_stage2": 0.8039262294769287, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1000 + }, + { + "completion_length": 12.421875, + "epoch": 0.17539863325740318, + "grad_norm": 20.401303313595758, + "kl": 0.0537109375, + "learning_rate": 8.247765901524442e-07, + "loss": -0.0227, + "reward": 1.476668119430542, + "reward_std": 0.19259199500083923, + "rewards/accuracy_reward_stage2": 0.6172930002212524, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1001 + }, + { + "completion_length": 8.328125, + "epoch": 0.17557385666725076, + "grad_norm": 16.901598014848172, + "kl": 0.0576171875, + "learning_rate": 8.246013667425967e-07, + "loss": -0.0211, + "reward": 1.639136552810669, + "reward_std": 0.27491295337677, + "rewards/accuracy_reward_stage2": 0.6547614336013794, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1002 + }, + { + "completion_length": 7.671875, + "epoch": 0.1757490800770983, + "grad_norm": 22.6402288692317, + "kl": 0.0791015625, + "learning_rate": 8.244261433327491e-07, + "loss": -0.0026, + "reward": 1.515639305114746, + "reward_std": 0.28191104531288147, + "rewards/accuracy_reward_stage2": 0.6562642455101013, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1003 + }, + { + "completion_length": 11.640625, + "epoch": 0.17592430348694585, + "grad_norm": 20.971542924107762, + "kl": 0.04443359375, + "learning_rate": 8.242509199229016e-07, + "loss": 0.0178, + "reward": 1.5278730392456055, + "reward_std": 0.22605562210083008, + "rewards/accuracy_reward_stage2": 0.5278730392456055, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1004 + }, + { + "completion_length": 9.796875, + "epoch": 0.17609952689679342, + "grad_norm": 16.783145596403724, + "kl": 0.23046875, + "learning_rate": 8.240756965130541e-07, + "loss": 0.0924, + "reward": 1.3827335834503174, + "reward_std": 0.1471167504787445, + "rewards/accuracy_reward_stage2": 0.5077335834503174, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1005 + }, + { + "completion_length": 8.453125, + "epoch": 0.17627475030664097, + "grad_norm": 21.650322269214524, + "kl": 0.2451171875, + "learning_rate": 8.239004731032065e-07, + "loss": 0.065, + "reward": 1.471717119216919, + "reward_std": 0.31655657291412354, + "rewards/accuracy_reward_stage2": 0.6123421788215637, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1006 + }, + { + "completion_length": 11.453125, + "epoch": 0.17644997371648852, + "grad_norm": 20.356875919340375, + "kl": 0.140625, + "learning_rate": 8.23725249693359e-07, + "loss": 0.0562, + "reward": 1.6293368339538574, + "reward_std": 0.22882044315338135, + "rewards/accuracy_reward_stage2": 0.7543368339538574, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1007 + }, + { + "completion_length": 18.09375, + "epoch": 0.1766251971263361, + "grad_norm": 14.100834330812283, + "kl": 0.1318359375, + "learning_rate": 8.235500262835115e-07, + "loss": 0.0525, + "reward": 1.4905918836593628, + "reward_std": 0.07823127508163452, + "rewards/accuracy_reward_stage2": 0.6155918836593628, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1008 + }, + { + "completion_length": 7.3125, + "epoch": 0.17680042053618364, + "grad_norm": 21.476913992157833, + "kl": 0.09130859375, + "learning_rate": 8.233748028736639e-07, + "loss": -0.0076, + "reward": 1.5450880527496338, + "reward_std": 0.23528814315795898, + "rewards/accuracy_reward_stage2": 0.560712993144989, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1009 + }, + { + "completion_length": 9.3125, + "epoch": 0.17697564394603119, + "grad_norm": 16.438543339296952, + "kl": 0.02978515625, + "learning_rate": 8.231995794638164e-07, + "loss": 0.0119, + "reward": 1.4612115621566772, + "reward_std": 0.12375926971435547, + "rewards/accuracy_reward_stage2": 0.46121156215667725, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1010 + }, + { + "completion_length": 10.734375, + "epoch": 0.17715086735587873, + "grad_norm": 26.435355144409534, + "kl": 0.0908203125, + "learning_rate": 8.230243560539689e-07, + "loss": 0.0148, + "reward": 1.6115930080413818, + "reward_std": 0.27439337968826294, + "rewards/accuracy_reward_stage2": 0.6272180676460266, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1011 + }, + { + "completion_length": 10.921875, + "epoch": 0.1773260907657263, + "grad_norm": 47.63378987105592, + "kl": 0.0283203125, + "learning_rate": 8.228491326441212e-07, + "loss": 0.0113, + "reward": 1.6302083730697632, + "reward_std": 0.13152070343494415, + "rewards/accuracy_reward_stage2": 0.7552083134651184, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1012 + }, + { + "completion_length": 11.03125, + "epoch": 0.17750131417557385, + "grad_norm": 18.101819542619275, + "kl": 0.134765625, + "learning_rate": 8.226739092342737e-07, + "loss": 0.0181, + "reward": 1.4931879043579102, + "reward_std": 0.24275703728199005, + "rewards/accuracy_reward_stage2": 0.6338127851486206, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1013 + }, + { + "completion_length": 9.15625, + "epoch": 0.1776765375854214, + "grad_norm": 16.102376244932238, + "kl": 0.07373046875, + "learning_rate": 8.22498685824426e-07, + "loss": 0.0295, + "reward": 1.5167219638824463, + "reward_std": 0.15145519375801086, + "rewards/accuracy_reward_stage2": 0.5167218446731567, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1014 + }, + { + "completion_length": 8.9375, + "epoch": 0.17785176099526898, + "grad_norm": 17.936021523356104, + "kl": 0.0546875, + "learning_rate": 8.223234624145785e-07, + "loss": -0.0061, + "reward": 1.4813058376312256, + "reward_std": 0.2451373040676117, + "rewards/accuracy_reward_stage2": 0.4969308078289032, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1015 + }, + { + "completion_length": 14.453125, + "epoch": 0.17802698440511652, + "grad_norm": 18.444340845595043, + "kl": 0.06298828125, + "learning_rate": 8.22148239004731e-07, + "loss": 0.0251, + "reward": 1.5098751783370972, + "reward_std": 0.251526415348053, + "rewards/accuracy_reward_stage2": 0.5098751783370972, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1016 + }, + { + "completion_length": 9.234375, + "epoch": 0.17820220781496407, + "grad_norm": 15.845520083072273, + "kl": 0.0250244140625, + "learning_rate": 8.219730155948834e-07, + "loss": 0.01, + "reward": 1.5925273895263672, + "reward_std": 0.1007295772433281, + "rewards/accuracy_reward_stage2": 0.7175273895263672, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1017 + }, + { + "completion_length": 8.515625, + "epoch": 0.17837743122481164, + "grad_norm": 16.774498969471463, + "kl": 0.06787109375, + "learning_rate": 8.217977921850359e-07, + "loss": -0.0613, + "reward": 1.5572917461395264, + "reward_std": 0.27777281403541565, + "rewards/accuracy_reward_stage2": 0.5885416865348816, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1018 + }, + { + "completion_length": 12.453125, + "epoch": 0.1785526546346592, + "grad_norm": 18.360925497310827, + "kl": 0.0595703125, + "learning_rate": 8.216225687751883e-07, + "loss": -0.0179, + "reward": 1.7169744968414307, + "reward_std": 0.23899121582508087, + "rewards/accuracy_reward_stage2": 0.7325994372367859, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1019 + }, + { + "completion_length": 17.859375, + "epoch": 0.17872787804450674, + "grad_norm": 19.006972211077844, + "kl": 0.029296875, + "learning_rate": 8.214473453653408e-07, + "loss": 0.0117, + "reward": 1.5444114208221436, + "reward_std": 0.08622656762599945, + "rewards/accuracy_reward_stage2": 0.5444114804267883, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1020 + }, + { + "completion_length": 6.828125, + "epoch": 0.1789031014543543, + "grad_norm": 20.900425709632742, + "kl": 0.130859375, + "learning_rate": 8.212721219554933e-07, + "loss": -0.0142, + "reward": 1.8227179050445557, + "reward_std": 0.19578629732131958, + "rewards/accuracy_reward_stage2": 0.8539679050445557, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1021 + }, + { + "completion_length": 10.40625, + "epoch": 0.17907832486420186, + "grad_norm": 13.043553359133417, + "kl": 0.03271484375, + "learning_rate": 8.210968985456456e-07, + "loss": 0.013, + "reward": 1.7412645816802979, + "reward_std": 0.1495451033115387, + "rewards/accuracy_reward_stage2": 0.7412645816802979, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1022 + }, + { + "completion_length": 12.03125, + "epoch": 0.1792535482740494, + "grad_norm": 20.33707074188944, + "kl": 0.047119140625, + "learning_rate": 8.209216751357981e-07, + "loss": 0.0188, + "reward": 1.577462077140808, + "reward_std": 0.18190613389015198, + "rewards/accuracy_reward_stage2": 0.5774620771408081, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1023 + }, + { + "completion_length": 12.734375, + "epoch": 0.17942877168389698, + "grad_norm": 30.677168098528167, + "kl": 0.0458984375, + "learning_rate": 8.207464517259506e-07, + "loss": 0.0183, + "reward": 1.5067112445831299, + "reward_std": 0.12453323602676392, + "rewards/accuracy_reward_stage2": 0.5067112445831299, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1024 + }, + { + "completion_length": 12.296875, + "epoch": 0.17960399509374453, + "grad_norm": 19.917300368702044, + "kl": 0.12109375, + "learning_rate": 8.20571228316103e-07, + "loss": 0.0485, + "reward": 1.4255033731460571, + "reward_std": 0.12041162699460983, + "rewards/accuracy_reward_stage2": 0.5505033731460571, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1025 + }, + { + "completion_length": 5.90625, + "epoch": 0.17977921850359208, + "grad_norm": 14.500310886298085, + "kl": 0.0322265625, + "learning_rate": 8.203960049062555e-07, + "loss": -0.0313, + "reward": 1.8072917461395264, + "reward_std": 0.1236192062497139, + "rewards/accuracy_reward_stage2": 0.8229166865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1026 + }, + { + "completion_length": 10.703125, + "epoch": 0.17995444191343962, + "grad_norm": 16.120327694641297, + "kl": 0.140625, + "learning_rate": 8.202207814964078e-07, + "loss": 0.0119, + "reward": 1.4947917461395264, + "reward_std": 0.2251920998096466, + "rewards/accuracy_reward_stage2": 0.6354166865348816, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1027 + }, + { + "completion_length": 11.1875, + "epoch": 0.1801296653232872, + "grad_norm": 17.841738073436776, + "kl": 0.020751953125, + "learning_rate": 8.200455580865603e-07, + "loss": 0.0083, + "reward": 1.6875, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward_stage2": 0.6875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1028 + }, + { + "completion_length": 9.5, + "epoch": 0.18030488873313474, + "grad_norm": 14.691140252447976, + "kl": 0.07470703125, + "learning_rate": 8.198703346767128e-07, + "loss": -0.0121, + "reward": 1.6741013526916504, + "reward_std": 0.252849280834198, + "rewards/accuracy_reward_stage2": 0.6897263526916504, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1029 + }, + { + "completion_length": 7.140625, + "epoch": 0.1804801121429823, + "grad_norm": 15.284062287517404, + "kl": 0.0159912109375, + "learning_rate": 8.196951112668652e-07, + "loss": 0.0064, + "reward": 1.6623451709747314, + "reward_std": 0.12901227176189423, + "rewards/accuracy_reward_stage2": 0.6623451709747314, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1030 + }, + { + "completion_length": 11.671875, + "epoch": 0.18065533555282987, + "grad_norm": 20.77060669760343, + "kl": 0.1162109375, + "learning_rate": 8.195198878570176e-07, + "loss": -0.0166, + "reward": 1.5759544372558594, + "reward_std": 0.3065808415412903, + "rewards/accuracy_reward_stage2": 0.6072044968605042, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1031 + }, + { + "completion_length": 9.328125, + "epoch": 0.1808305589626774, + "grad_norm": 22.582798418645474, + "kl": 0.265625, + "learning_rate": 8.193446644471701e-07, + "loss": 0.1057, + "reward": 1.3950915336608887, + "reward_std": 0.31523001194000244, + "rewards/accuracy_reward_stage2": 0.6450915336608887, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1032 + }, + { + "completion_length": 12.5, + "epoch": 0.18100578237252496, + "grad_norm": 23.123052879635157, + "kl": 0.169921875, + "learning_rate": 8.191694410373225e-07, + "loss": 0.0238, + "reward": 1.5907235145568848, + "reward_std": 0.24119962751865387, + "rewards/accuracy_reward_stage2": 0.7313483953475952, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1033 + }, + { + "completion_length": 8.921875, + "epoch": 0.18118100578237253, + "grad_norm": 16.125358149381995, + "kl": 0.0908203125, + "learning_rate": 8.18994217627475e-07, + "loss": 0.0363, + "reward": 1.5684726238250732, + "reward_std": 0.2518884241580963, + "rewards/accuracy_reward_stage2": 0.5684726238250732, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1034 + }, + { + "completion_length": 11.71875, + "epoch": 0.18135622919222008, + "grad_norm": 17.152327203738277, + "kl": 0.10205078125, + "learning_rate": 8.188189942176274e-07, + "loss": 0.0075, + "reward": 1.4885659217834473, + "reward_std": 0.2130601704120636, + "rewards/accuracy_reward_stage2": 0.504190981388092, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1035 + }, + { + "completion_length": 8.5625, + "epoch": 0.18153145260206763, + "grad_norm": 12.17071070502903, + "kl": 0.044921875, + "learning_rate": 8.186437708077799e-07, + "loss": -0.0263, + "reward": 1.5416667461395264, + "reward_std": 0.16781337559223175, + "rewards/accuracy_reward_stage2": 0.5572916865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1036 + }, + { + "completion_length": 9.765625, + "epoch": 0.1817066760119152, + "grad_norm": 15.998631392491633, + "kl": 0.06103515625, + "learning_rate": 8.184685473979324e-07, + "loss": -0.0197, + "reward": 1.7481575012207031, + "reward_std": 0.12084738910198212, + "rewards/accuracy_reward_stage2": 0.7637824416160583, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1037 + }, + { + "completion_length": 11.734375, + "epoch": 0.18188189942176275, + "grad_norm": 15.049644417962663, + "kl": 0.08203125, + "learning_rate": 8.182933239880848e-07, + "loss": 0.0328, + "reward": 1.6555730104446411, + "reward_std": 0.14731593430042267, + "rewards/accuracy_reward_stage2": 0.6555730104446411, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1038 + }, + { + "completion_length": 32.265625, + "epoch": 0.1820571228316103, + "grad_norm": 20.944749777286034, + "kl": 0.10546875, + "learning_rate": 8.181181005782373e-07, + "loss": 0.0004, + "reward": 1.3465502262115479, + "reward_std": 0.11534099280834198, + "rewards/accuracy_reward_stage2": 0.48717522621154785, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1039 + }, + { + "completion_length": 7.0625, + "epoch": 0.18223234624145787, + "grad_norm": 18.674889870148007, + "kl": 0.06396484375, + "learning_rate": 8.179428771683897e-07, + "loss": 0.0256, + "reward": 1.4174654483795166, + "reward_std": 0.17789804935455322, + "rewards/accuracy_reward_stage2": 0.5424654483795166, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1040 + }, + { + "completion_length": 12.375, + "epoch": 0.18240756965130542, + "grad_norm": 15.68352013594671, + "kl": 0.0181884765625, + "learning_rate": 8.17767653758542e-07, + "loss": 0.0073, + "reward": 1.6470057964324951, + "reward_std": 0.15792571008205414, + "rewards/accuracy_reward_stage2": 0.6470057368278503, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1041 + }, + { + "completion_length": 11.78125, + "epoch": 0.18258279306115296, + "grad_norm": 17.814980651109845, + "kl": 0.07568359375, + "learning_rate": 8.175924303486945e-07, + "loss": -0.0027, + "reward": 1.720482349395752, + "reward_std": 0.28667330741882324, + "rewards/accuracy_reward_stage2": 0.736107349395752, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1042 + }, + { + "completion_length": 17.796875, + "epoch": 0.18275801647100054, + "grad_norm": 21.4349531985855, + "kl": 0.059326171875, + "learning_rate": 8.174172069388469e-07, + "loss": 0.0237, + "reward": 1.185448169708252, + "reward_std": 0.19691388309001923, + "rewards/accuracy_reward_stage2": 0.31044822931289673, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1043 + }, + { + "completion_length": 12.515625, + "epoch": 0.1829332398808481, + "grad_norm": 17.88483618320598, + "kl": 0.078125, + "learning_rate": 8.172419835289994e-07, + "loss": -0.0127, + "reward": 1.491915225982666, + "reward_std": 0.18654459714889526, + "rewards/accuracy_reward_stage2": 0.5075401663780212, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1044 + }, + { + "completion_length": 10.296875, + "epoch": 0.18310846329069563, + "grad_norm": 22.59994444123705, + "kl": 0.027099609375, + "learning_rate": 8.170667601191519e-07, + "loss": 0.0108, + "reward": 1.53125, + "reward_std": 0.3119301199913025, + "rewards/accuracy_reward_stage2": 0.53125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1045 + }, + { + "completion_length": 5.375, + "epoch": 0.18328368670054318, + "grad_norm": 9.016221394372321, + "kl": 0.035400390625, + "learning_rate": 8.168915367093043e-07, + "loss": 0.0141, + "reward": 1.695914387702942, + "reward_std": 0.025557324290275574, + "rewards/accuracy_reward_stage2": 0.6959144473075867, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1046 + }, + { + "completion_length": 10.8125, + "epoch": 0.18345891011039075, + "grad_norm": 24.762047009674582, + "kl": 0.1064453125, + "learning_rate": 8.167163132994568e-07, + "loss": -0.0191, + "reward": 1.7301406860351562, + "reward_std": 0.292434424161911, + "rewards/accuracy_reward_stage2": 0.7613905668258667, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1047 + }, + { + "completion_length": 10.859375, + "epoch": 0.1836341335202383, + "grad_norm": 35.97968114566077, + "kl": 0.291015625, + "learning_rate": 8.165410898896093e-07, + "loss": 0.0854, + "reward": 1.3661143779754639, + "reward_std": 0.21945153176784515, + "rewards/accuracy_reward_stage2": 0.6317393779754639, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1048 + }, + { + "completion_length": 8.78125, + "epoch": 0.18380935693008585, + "grad_norm": 14.925876419456184, + "kl": 0.0269775390625, + "learning_rate": 8.163658664797617e-07, + "loss": 0.0108, + "reward": 1.5434216260910034, + "reward_std": 0.04544178768992424, + "rewards/accuracy_reward_stage2": 0.5434216260910034, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1049 + }, + { + "completion_length": 11.859375, + "epoch": 0.18398458033993342, + "grad_norm": 27.873893943359306, + "kl": 0.1494140625, + "learning_rate": 8.161906430699142e-07, + "loss": 0.0696, + "reward": 1.3853431940078735, + "reward_std": 0.20109063386917114, + "rewards/accuracy_reward_stage2": 0.510343074798584, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1050 + }, + { + "completion_length": 7.90625, + "epoch": 0.18415980374978097, + "grad_norm": 18.855523440997825, + "kl": 0.07666015625, + "learning_rate": 8.160154196600665e-07, + "loss": 0.0306, + "reward": 1.6734848022460938, + "reward_std": 0.2122466266155243, + "rewards/accuracy_reward_stage2": 0.6734848022460938, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1051 + }, + { + "completion_length": 13.25, + "epoch": 0.18433502715962852, + "grad_norm": 22.084077966222758, + "kl": 0.06787109375, + "learning_rate": 8.158401962502189e-07, + "loss": 0.0144, + "reward": 1.6123440265655518, + "reward_std": 0.1913946568965912, + "rewards/accuracy_reward_stage2": 0.627968966960907, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1052 + }, + { + "completion_length": 9.28125, + "epoch": 0.1845102505694761, + "grad_norm": 21.391734235107787, + "kl": 0.06787109375, + "learning_rate": 8.156649728403714e-07, + "loss": 0.0273, + "reward": 1.7385876178741455, + "reward_std": 0.15550118684768677, + "rewards/accuracy_reward_stage2": 0.7385876178741455, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1053 + }, + { + "completion_length": 9.21875, + "epoch": 0.18468547397932364, + "grad_norm": 22.317421155534973, + "kl": 0.0615234375, + "learning_rate": 8.154897494305238e-07, + "loss": 0.0246, + "reward": 1.710514783859253, + "reward_std": 0.30146676301956177, + "rewards/accuracy_reward_stage2": 0.7105147838592529, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1054 + }, + { + "completion_length": 6.28125, + "epoch": 0.18486069738917119, + "grad_norm": 17.435864193181782, + "kl": 0.125, + "learning_rate": 8.153145260206763e-07, + "loss": -0.0174, + "reward": 1.7268104553222656, + "reward_std": 0.16393327713012695, + "rewards/accuracy_reward_stage2": 0.7580605745315552, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1055 + }, + { + "completion_length": 11.96875, + "epoch": 0.18503592079901876, + "grad_norm": 51.940177880679094, + "kl": 0.357421875, + "learning_rate": 8.151393026108288e-07, + "loss": 0.181, + "reward": 1.4652410745620728, + "reward_std": 0.19520485401153564, + "rewards/accuracy_reward_stage2": 0.5902410745620728, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1056 + }, + { + "completion_length": 9.171875, + "epoch": 0.1852111442088663, + "grad_norm": 15.935770458923814, + "kl": 0.083984375, + "learning_rate": 8.149640792009812e-07, + "loss": 0.0337, + "reward": 1.5994462966918945, + "reward_std": 0.12744741141796112, + "rewards/accuracy_reward_stage2": 0.5994464159011841, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1057 + }, + { + "completion_length": 7.703125, + "epoch": 0.18538636761871385, + "grad_norm": 19.994315655368574, + "kl": 0.042724609375, + "learning_rate": 8.147888557911337e-07, + "loss": 0.017, + "reward": 1.5583534240722656, + "reward_std": 0.16728171706199646, + "rewards/accuracy_reward_stage2": 0.5583534836769104, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1058 + }, + { + "completion_length": 9.046875, + "epoch": 0.18556159102856143, + "grad_norm": 27.119082446679823, + "kl": 0.095703125, + "learning_rate": 8.146136323812861e-07, + "loss": 0.0319, + "reward": 1.4803056716918945, + "reward_std": 0.30267736315727234, + "rewards/accuracy_reward_stage2": 0.4959307312965393, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1059 + }, + { + "completion_length": 10.65625, + "epoch": 0.18573681443840898, + "grad_norm": 14.351166107837374, + "kl": 0.0693359375, + "learning_rate": 8.144384089714386e-07, + "loss": 0.0276, + "reward": 1.7468219995498657, + "reward_std": 0.07223241031169891, + "rewards/accuracy_reward_stage2": 0.7468219995498657, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1060 + }, + { + "completion_length": 9.484375, + "epoch": 0.18591203784825652, + "grad_norm": 18.448592069829477, + "kl": 0.10107421875, + "learning_rate": 8.14263185561591e-07, + "loss": 0.0019, + "reward": 1.4668774604797363, + "reward_std": 0.147735133767128, + "rewards/accuracy_reward_stage2": 0.4825023412704468, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1061 + }, + { + "completion_length": 8.078125, + "epoch": 0.18608726125810407, + "grad_norm": 18.84877306794847, + "kl": 0.07861328125, + "learning_rate": 8.140879621517434e-07, + "loss": -0.0128, + "reward": 1.6677536964416504, + "reward_std": 0.16473901271820068, + "rewards/accuracy_reward_stage2": 0.8083786368370056, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1062 + }, + { + "completion_length": 10.328125, + "epoch": 0.18626248466795164, + "grad_norm": 20.90295345468802, + "kl": 0.103515625, + "learning_rate": 8.139127387418959e-07, + "loss": -0.0027, + "reward": 1.4815478324890137, + "reward_std": 0.2598685026168823, + "rewards/accuracy_reward_stage2": 0.4971729516983032, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1063 + }, + { + "completion_length": 17.0, + "epoch": 0.1864377080777992, + "grad_norm": 23.64176392441905, + "kl": 0.07568359375, + "learning_rate": 8.137375153320484e-07, + "loss": 0.0302, + "reward": 1.4952113628387451, + "reward_std": 0.1622573733329773, + "rewards/accuracy_reward_stage2": 0.49521133303642273, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1064 + }, + { + "completion_length": 7.640625, + "epoch": 0.18661293148764674, + "grad_norm": 18.15053174337808, + "kl": 0.275390625, + "learning_rate": 8.135622919222007e-07, + "loss": 0.1104, + "reward": 1.2247974872589111, + "reward_std": 0.14716273546218872, + "rewards/accuracy_reward_stage2": 0.47479745745658875, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1065 + }, + { + "completion_length": 7.171875, + "epoch": 0.1867881548974943, + "grad_norm": 20.80874547850332, + "kl": 0.087890625, + "learning_rate": 8.133870685123532e-07, + "loss": 0.0351, + "reward": 1.516782283782959, + "reward_std": 0.19680972397327423, + "rewards/accuracy_reward_stage2": 0.7667823433876038, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1066 + }, + { + "completion_length": 9.59375, + "epoch": 0.18696337830734186, + "grad_norm": 18.272476449941177, + "kl": 0.197265625, + "learning_rate": 8.132118451025056e-07, + "loss": 0.0066, + "reward": 1.2306230068206787, + "reward_std": 0.2887798845767975, + "rewards/accuracy_reward_stage2": 0.5118729472160339, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1067 + }, + { + "completion_length": 7.90625, + "epoch": 0.1871386017171894, + "grad_norm": 18.320944156003165, + "kl": 0.0908203125, + "learning_rate": 8.130366216926581e-07, + "loss": -0.008, + "reward": 1.4491363763809204, + "reward_std": 0.26713356375694275, + "rewards/accuracy_reward_stage2": 0.5897614359855652, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1068 + }, + { + "completion_length": 5.15625, + "epoch": 0.18731382512703698, + "grad_norm": 5.6172327173402214, + "kl": 0.030517578125, + "learning_rate": 8.128613982828106e-07, + "loss": 0.0122, + "reward": 1.34375, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward_stage2": 0.46875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1069 + }, + { + "completion_length": 10.78125, + "epoch": 0.18748904853688453, + "grad_norm": 13.968815746488696, + "kl": 0.07763671875, + "learning_rate": 8.12686174872963e-07, + "loss": 0.031, + "reward": 1.6498345136642456, + "reward_std": 0.10637789964675903, + "rewards/accuracy_reward_stage2": 0.6498345136642456, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1070 + }, + { + "completion_length": 8.9375, + "epoch": 0.18766427194673208, + "grad_norm": 12.70060536444373, + "kl": 0.05615234375, + "learning_rate": 8.125109514631154e-07, + "loss": -0.0206, + "reward": 1.636265754699707, + "reward_std": 0.16409549117088318, + "rewards/accuracy_reward_stage2": 0.651890754699707, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1071 + }, + { + "completion_length": 10.46875, + "epoch": 0.18783949535657965, + "grad_norm": 16.49288173549537, + "kl": 0.0595703125, + "learning_rate": 8.123357280532679e-07, + "loss": -0.0119, + "reward": 1.7313894033432007, + "reward_std": 0.21923525631427765, + "rewards/accuracy_reward_stage2": 0.7470144033432007, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1072 + }, + { + "completion_length": 12.359375, + "epoch": 0.1880147187664272, + "grad_norm": 19.559616270302996, + "kl": 0.1982421875, + "learning_rate": 8.121605046434203e-07, + "loss": 0.0018, + "reward": 1.7189399003982544, + "reward_std": 0.16802644729614258, + "rewards/accuracy_reward_stage2": 0.750190019607544, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1073 + }, + { + "completion_length": 8.484375, + "epoch": 0.18818994217627474, + "grad_norm": 21.374718124967487, + "kl": 0.08935546875, + "learning_rate": 8.119852812335728e-07, + "loss": -0.0418, + "reward": 1.489177942276001, + "reward_std": 0.21345767378807068, + "rewards/accuracy_reward_stage2": 0.629802942276001, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1074 + }, + { + "completion_length": 9.78125, + "epoch": 0.18836516558612232, + "grad_norm": 13.154409710619879, + "kl": 0.01904296875, + "learning_rate": 8.118100578237252e-07, + "loss": 0.0076, + "reward": 1.484375, + "reward_std": 0.16887325048446655, + "rewards/accuracy_reward_stage2": 0.609375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1075 + }, + { + "completion_length": 9.265625, + "epoch": 0.18854038899596987, + "grad_norm": 20.690764265380892, + "kl": 0.1337890625, + "learning_rate": 8.116348344138777e-07, + "loss": 0.0095, + "reward": 1.607621669769287, + "reward_std": 0.2766973674297333, + "rewards/accuracy_reward_stage2": 0.6232466697692871, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1076 + }, + { + "completion_length": 10.46875, + "epoch": 0.1887156124058174, + "grad_norm": 22.12174077968315, + "kl": 0.1279296875, + "learning_rate": 8.114596110040302e-07, + "loss": 0.0222, + "reward": 1.641465425491333, + "reward_std": 0.22600020468235016, + "rewards/accuracy_reward_stage2": 0.6570904850959778, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1077 + }, + { + "completion_length": 12.671875, + "epoch": 0.18889083581566496, + "grad_norm": 18.34750471892336, + "kl": 0.0458984375, + "learning_rate": 8.112843875941825e-07, + "loss": 0.0183, + "reward": 1.5698916912078857, + "reward_std": 0.1975686103105545, + "rewards/accuracy_reward_stage2": 0.5698915719985962, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1078 + }, + { + "completion_length": 10.953125, + "epoch": 0.18906605922551253, + "grad_norm": 29.019484580187594, + "kl": 0.0283203125, + "learning_rate": 8.11109164184335e-07, + "loss": 0.0113, + "reward": 1.8850083351135254, + "reward_std": 0.14863698184490204, + "rewards/accuracy_reward_stage2": 0.8850083351135254, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1079 + }, + { + "completion_length": 6.34375, + "epoch": 0.18924128263536008, + "grad_norm": 18.771028895482477, + "kl": 0.0859375, + "learning_rate": 8.109339407744873e-07, + "loss": -0.0387, + "reward": 1.7000000476837158, + "reward_std": 0.1992851197719574, + "rewards/accuracy_reward_stage2": 0.8562500476837158, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1080 + }, + { + "completion_length": 8.953125, + "epoch": 0.18941650604520763, + "grad_norm": 26.366143313068836, + "kl": 0.08935546875, + "learning_rate": 8.107587173646398e-07, + "loss": -0.0084, + "reward": 1.587983250617981, + "reward_std": 0.23805229365825653, + "rewards/accuracy_reward_stage2": 0.603608250617981, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1081 + }, + { + "completion_length": 6.390625, + "epoch": 0.1895917294550552, + "grad_norm": 10.240158184262638, + "kl": 0.004119873046875, + "learning_rate": 8.105834939547923e-07, + "loss": 0.0016, + "reward": 1.703125, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.703125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1082 + }, + { + "completion_length": 10.34375, + "epoch": 0.18976695286490275, + "grad_norm": 15.66851708461809, + "kl": 0.1318359375, + "learning_rate": 8.104082705449447e-07, + "loss": 0.0528, + "reward": 1.4515047073364258, + "reward_std": 0.16618230938911438, + "rewards/accuracy_reward_stage2": 0.5765047073364258, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1083 + }, + { + "completion_length": 13.140625, + "epoch": 0.1899421762747503, + "grad_norm": 19.031333710317966, + "kl": 0.0703125, + "learning_rate": 8.102330471350972e-07, + "loss": 0.0282, + "reward": 1.310152530670166, + "reward_std": 0.12037193030118942, + "rewards/accuracy_reward_stage2": 0.31015244126319885, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1084 + }, + { + "completion_length": 6.3125, + "epoch": 0.19011739968459787, + "grad_norm": 19.075249545090323, + "kl": 0.056884765625, + "learning_rate": 8.100578237252497e-07, + "loss": 0.0227, + "reward": 1.6960257291793823, + "reward_std": 0.1582869291305542, + "rewards/accuracy_reward_stage2": 0.6960256695747375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1085 + }, + { + "completion_length": 8.609375, + "epoch": 0.19029262309444542, + "grad_norm": 22.746907077342723, + "kl": 0.1337890625, + "learning_rate": 8.098826003154021e-07, + "loss": -0.0, + "reward": 1.8112388849258423, + "reward_std": 0.22928564250469208, + "rewards/accuracy_reward_stage2": 0.8424888849258423, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1086 + }, + { + "completion_length": 8.734375, + "epoch": 0.19046784650429296, + "grad_norm": 14.58116819960654, + "kl": 0.0162353515625, + "learning_rate": 8.097073769055546e-07, + "loss": 0.0065, + "reward": 1.4780704975128174, + "reward_std": 0.1812051385641098, + "rewards/accuracy_reward_stage2": 0.4780704975128174, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1087 + }, + { + "completion_length": 15.671875, + "epoch": 0.19064306991414054, + "grad_norm": 13.710019110530325, + "kl": 0.060302734375, + "learning_rate": 8.095321534957071e-07, + "loss": -0.0181, + "reward": 1.3493430614471436, + "reward_std": 0.12040011584758759, + "rewards/accuracy_reward_stage2": 0.36496806144714355, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1088 + }, + { + "completion_length": 9.65625, + "epoch": 0.1908182933239881, + "grad_norm": 18.8022343699883, + "kl": 0.0205078125, + "learning_rate": 8.093569300858595e-07, + "loss": 0.0082, + "reward": 1.6593749523162842, + "reward_std": 0.16405992209911346, + "rewards/accuracy_reward_stage2": 0.6593749523162842, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1089 + }, + { + "completion_length": 12.40625, + "epoch": 0.19099351673383563, + "grad_norm": 144.1566137298228, + "kl": 0.66796875, + "learning_rate": 8.09181706676012e-07, + "loss": 0.2231, + "reward": 1.3865861892700195, + "reward_std": 0.2962217926979065, + "rewards/accuracy_reward_stage2": 0.5272111296653748, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1090 + }, + { + "completion_length": 9.625, + "epoch": 0.1911687401436832, + "grad_norm": 16.503467268366663, + "kl": 0.05859375, + "learning_rate": 8.090064832661642e-07, + "loss": -0.0207, + "reward": 1.767581820487976, + "reward_std": 0.2052784264087677, + "rewards/accuracy_reward_stage2": 0.7832068204879761, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1091 + }, + { + "completion_length": 9.9375, + "epoch": 0.19134396355353075, + "grad_norm": 22.629686000416033, + "kl": 0.09130859375, + "learning_rate": 8.088312598563167e-07, + "loss": -0.078, + "reward": 1.576542615890503, + "reward_std": 0.2770374119281769, + "rewards/accuracy_reward_stage2": 0.6234177350997925, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1092 + }, + { + "completion_length": 11.890625, + "epoch": 0.1915191869633783, + "grad_norm": 19.855391563751468, + "kl": 0.12353515625, + "learning_rate": 8.086560364464692e-07, + "loss": -0.0374, + "reward": 1.4056425094604492, + "reward_std": 0.17462682723999023, + "rewards/accuracy_reward_stage2": 0.43689244985580444, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1093 + }, + { + "completion_length": 5.71875, + "epoch": 0.19169441037322588, + "grad_norm": 17.63095417389011, + "kl": 0.06201171875, + "learning_rate": 8.084808130366216e-07, + "loss": 0.0247, + "reward": 1.6471445560455322, + "reward_std": 0.18837015330791473, + "rewards/accuracy_reward_stage2": 0.6471446752548218, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1094 + }, + { + "completion_length": 18.28125, + "epoch": 0.19186963378307342, + "grad_norm": 25.517680180087687, + "kl": 0.1513671875, + "learning_rate": 8.083055896267741e-07, + "loss": 0.0571, + "reward": 1.354015588760376, + "reward_std": 0.2112516313791275, + "rewards/accuracy_reward_stage2": 0.4946404993534088, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1095 + }, + { + "completion_length": 10.90625, + "epoch": 0.19204485719292097, + "grad_norm": 31.255011942515864, + "kl": 0.05029296875, + "learning_rate": 8.081303662169265e-07, + "loss": 0.0201, + "reward": 1.5470197200775146, + "reward_std": 0.26016050577163696, + "rewards/accuracy_reward_stage2": 0.5470197796821594, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1096 + }, + { + "completion_length": 8.625, + "epoch": 0.19222008060276852, + "grad_norm": 20.68222206734905, + "kl": 0.1376953125, + "learning_rate": 8.07955142807079e-07, + "loss": 0.0214, + "reward": 1.4428706169128418, + "reward_std": 0.2221945971250534, + "rewards/accuracy_reward_stage2": 0.5834957361221313, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1097 + }, + { + "completion_length": 8.015625, + "epoch": 0.1923953040126161, + "grad_norm": 56.24965357387562, + "kl": 0.41015625, + "learning_rate": 8.077799193972315e-07, + "loss": 0.1637, + "reward": 1.48624587059021, + "reward_std": 0.09766960889101028, + "rewards/accuracy_reward_stage2": 0.6112458109855652, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1098 + }, + { + "completion_length": 10.640625, + "epoch": 0.19257052742246364, + "grad_norm": 12.780882453737103, + "kl": 0.007537841796875, + "learning_rate": 8.076046959873839e-07, + "loss": 0.003, + "reward": 1.65625, + "reward_std": 0.1462521106004715, + "rewards/accuracy_reward_stage2": 0.65625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1099 + }, + { + "completion_length": 9.265625, + "epoch": 0.19274575083231119, + "grad_norm": 20.199563575560575, + "kl": 0.09228515625, + "learning_rate": 8.074294725775364e-07, + "loss": -0.0072, + "reward": 1.8605185747146606, + "reward_std": 0.15882590413093567, + "rewards/accuracy_reward_stage2": 0.8761435747146606, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1100 + }, + { + "completion_length": 6.984375, + "epoch": 0.19292097424215876, + "grad_norm": 18.088546653390626, + "kl": 0.061767578125, + "learning_rate": 8.072542491676888e-07, + "loss": 0.0248, + "reward": 1.664806604385376, + "reward_std": 0.14888063073158264, + "rewards/accuracy_reward_stage2": 0.6648065447807312, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1101 + }, + { + "completion_length": 12.1875, + "epoch": 0.1930961976520063, + "grad_norm": 21.685034859130564, + "kl": 0.068359375, + "learning_rate": 8.070790257578412e-07, + "loss": -0.011, + "reward": 1.5130434036254883, + "reward_std": 0.2805604338645935, + "rewards/accuracy_reward_stage2": 0.5442932844161987, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1102 + }, + { + "completion_length": 8.390625, + "epoch": 0.19327142106185385, + "grad_norm": 19.693827267743735, + "kl": 0.2265625, + "learning_rate": 8.069038023479936e-07, + "loss": 0.0145, + "reward": 1.4980933666229248, + "reward_std": 0.3202298879623413, + "rewards/accuracy_reward_stage2": 0.54496830701828, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1103 + }, + { + "completion_length": 8.46875, + "epoch": 0.19344664447170143, + "grad_norm": 14.74632575144857, + "kl": 0.04248046875, + "learning_rate": 8.06728578938146e-07, + "loss": -0.0141, + "reward": 1.7438368797302246, + "reward_std": 0.20524749159812927, + "rewards/accuracy_reward_stage2": 0.7594619393348694, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1104 + }, + { + "completion_length": 10.671875, + "epoch": 0.19362186788154898, + "grad_norm": 13.627598413449007, + "kl": 0.060791015625, + "learning_rate": 8.065533555282985e-07, + "loss": 0.0243, + "reward": 1.5157394409179688, + "reward_std": 0.1647764891386032, + "rewards/accuracy_reward_stage2": 0.5157395005226135, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1105 + }, + { + "completion_length": 12.28125, + "epoch": 0.19379709129139652, + "grad_norm": 20.02739663095883, + "kl": 0.1943359375, + "learning_rate": 8.06378132118451e-07, + "loss": 0.0777, + "reward": 1.4368054866790771, + "reward_std": 0.27020564675331116, + "rewards/accuracy_reward_stage2": 0.5618055462837219, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1106 + }, + { + "completion_length": 13.40625, + "epoch": 0.1939723147012441, + "grad_norm": 14.013964860105306, + "kl": 0.02294921875, + "learning_rate": 8.062029087086034e-07, + "loss": -0.0327, + "reward": 1.385578989982605, + "reward_std": 0.132847398519516, + "rewards/accuracy_reward_stage2": 0.401203989982605, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1107 + }, + { + "completion_length": 13.703125, + "epoch": 0.19414753811109164, + "grad_norm": 21.173019061399607, + "kl": 0.205078125, + "learning_rate": 8.060276852987559e-07, + "loss": 0.0657, + "reward": 1.1857408285140991, + "reward_std": 0.24654100835323334, + "rewards/accuracy_reward_stage2": 0.5763658285140991, + "rewards/format_reward_stage1_pointerpad": 0.609375, + "scores/accuracy_reward_stage2": 0.609375, + "step": 1108 + }, + { + "completion_length": 17.8125, + "epoch": 0.1943227615209392, + "grad_norm": 14.208974126460554, + "kl": 0.045654296875, + "learning_rate": 8.058524618889084e-07, + "loss": -0.0934, + "reward": 1.469606876373291, + "reward_std": 0.14302250742912292, + "rewards/accuracy_reward_stage2": 0.5164818167686462, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1109 + }, + { + "completion_length": 12.609375, + "epoch": 0.19449798493078677, + "grad_norm": 20.904985420063163, + "kl": 0.2041015625, + "learning_rate": 8.056772384790608e-07, + "loss": 0.0817, + "reward": 1.2993509769439697, + "reward_std": 0.12314928323030472, + "rewards/accuracy_reward_stage2": 0.5493509769439697, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1110 + }, + { + "completion_length": 9.53125, + "epoch": 0.1946732083406343, + "grad_norm": 19.75010243636568, + "kl": 0.057861328125, + "learning_rate": 8.055020150692132e-07, + "loss": 0.0231, + "reward": 1.7738691568374634, + "reward_std": 0.14646458625793457, + "rewards/accuracy_reward_stage2": 0.7738690972328186, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1111 + }, + { + "completion_length": 18.140625, + "epoch": 0.19484843175048186, + "grad_norm": 16.508736731493936, + "kl": 0.0849609375, + "learning_rate": 8.053267916593656e-07, + "loss": -0.0515, + "reward": 1.4216002225875854, + "reward_std": 0.19240014255046844, + "rewards/accuracy_reward_stage2": 0.45285022258758545, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1112 + }, + { + "completion_length": 9.75, + "epoch": 0.1950236551603294, + "grad_norm": 19.065605788174146, + "kl": 0.115234375, + "learning_rate": 8.051515682495181e-07, + "loss": -0.106, + "reward": 1.5236477851867676, + "reward_std": 0.2754653990268707, + "rewards/accuracy_reward_stage2": 0.7111477851867676, + "rewards/format_reward_stage1_pointerpad": 0.8125, + "scores/accuracy_reward_stage2": 0.8125, + "step": 1113 + }, + { + "completion_length": 9.65625, + "epoch": 0.19519887857017698, + "grad_norm": 18.696100390969345, + "kl": 0.061279296875, + "learning_rate": 8.049763448396706e-07, + "loss": -0.0196, + "reward": 1.6031370162963867, + "reward_std": 0.27101612091064453, + "rewards/accuracy_reward_stage2": 0.6187620162963867, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1114 + }, + { + "completion_length": 11.3125, + "epoch": 0.19537410198002453, + "grad_norm": 16.51187072505447, + "kl": 0.046142578125, + "learning_rate": 8.04801121429823e-07, + "loss": -0.0233, + "reward": 1.6047179698944092, + "reward_std": 0.10883722454309464, + "rewards/accuracy_reward_stage2": 0.8703429102897644, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1115 + }, + { + "completion_length": 9.578125, + "epoch": 0.19554932538987208, + "grad_norm": 13.640423112555817, + "kl": 0.040283203125, + "learning_rate": 8.046258980199754e-07, + "loss": -0.0718, + "reward": 1.6661803722381592, + "reward_std": 0.14868390560150146, + "rewards/accuracy_reward_stage2": 0.6974303722381592, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1116 + }, + { + "completion_length": 10.15625, + "epoch": 0.19572454879971965, + "grad_norm": 22.98236965229761, + "kl": 0.1357421875, + "learning_rate": 8.044506746101279e-07, + "loss": 0.0115, + "reward": 1.5336987972259521, + "reward_std": 0.22573234140872955, + "rewards/accuracy_reward_stage2": 0.5493239164352417, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1117 + }, + { + "completion_length": 8.515625, + "epoch": 0.1958997722095672, + "grad_norm": 15.808069400903483, + "kl": 0.0693359375, + "learning_rate": 8.042754512002803e-07, + "loss": 0.0277, + "reward": 1.6645541191101074, + "reward_std": 0.18062806129455566, + "rewards/accuracy_reward_stage2": 0.6645541787147522, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1118 + }, + { + "completion_length": 18.234375, + "epoch": 0.19607499561941474, + "grad_norm": 18.34917213634432, + "kl": 0.046875, + "learning_rate": 8.041002277904328e-07, + "loss": -0.0234, + "reward": 1.5347862243652344, + "reward_std": 0.2595851719379425, + "rewards/accuracy_reward_stage2": 0.5504111051559448, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1119 + }, + { + "completion_length": 10.296875, + "epoch": 0.19625021902926232, + "grad_norm": 39.987764020386464, + "kl": 0.2099609375, + "learning_rate": 8.039250043805851e-07, + "loss": 0.0216, + "reward": 1.5678634643554688, + "reward_std": 0.2782178819179535, + "rewards/accuracy_reward_stage2": 0.5991134643554688, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1120 + }, + { + "completion_length": 15.1875, + "epoch": 0.19642544243910987, + "grad_norm": 8.103500483325902, + "kl": 0.00994873046875, + "learning_rate": 8.037497809707376e-07, + "loss": 0.004, + "reward": 1.453125, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward_stage2": 0.453125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1121 + }, + { + "completion_length": 9.9375, + "epoch": 0.1966006658489574, + "grad_norm": 21.186066478652183, + "kl": 0.296875, + "learning_rate": 8.035745575608901e-07, + "loss": 0.1183, + "reward": 1.4764658212661743, + "reward_std": 0.217693030834198, + "rewards/accuracy_reward_stage2": 0.8514657020568848, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1122 + }, + { + "completion_length": 10.5625, + "epoch": 0.196775889258805, + "grad_norm": 14.332701954367625, + "kl": 0.024169921875, + "learning_rate": 8.033993341510425e-07, + "loss": 0.0096, + "reward": 1.5120658874511719, + "reward_std": 0.1503337323665619, + "rewards/accuracy_reward_stage2": 0.5120658874511719, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1123 + }, + { + "completion_length": 5.0625, + "epoch": 0.19695111266865253, + "grad_norm": 14.324934707265692, + "kl": 0.021728515625, + "learning_rate": 8.03224110741195e-07, + "loss": 0.0087, + "reward": 1.8125, + "reward_std": 0.10888782143592834, + "rewards/accuracy_reward_stage2": 0.8125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1124 + }, + { + "completion_length": 10.390625, + "epoch": 0.19712633607850008, + "grad_norm": 16.529729841606866, + "kl": 0.08056640625, + "learning_rate": 8.030488873313475e-07, + "loss": -0.0786, + "reward": 1.5661100149154663, + "reward_std": 0.1992965042591095, + "rewards/accuracy_reward_stage2": 0.6129850149154663, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1125 + }, + { + "completion_length": 7.421875, + "epoch": 0.19730155948834766, + "grad_norm": 17.60318401877113, + "kl": 0.0654296875, + "learning_rate": 8.028736639214999e-07, + "loss": 0.0263, + "reward": 1.5065890550613403, + "reward_std": 0.19483307003974915, + "rewards/accuracy_reward_stage2": 0.5065890550613403, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1126 + }, + { + "completion_length": 10.359375, + "epoch": 0.1974767828981952, + "grad_norm": 13.272451524780983, + "kl": 0.083984375, + "learning_rate": 8.026984405116524e-07, + "loss": -0.0231, + "reward": 1.5966994762420654, + "reward_std": 0.16520971059799194, + "rewards/accuracy_reward_stage2": 0.6279494762420654, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1127 + }, + { + "completion_length": 8.453125, + "epoch": 0.19765200630804275, + "grad_norm": 19.345697034013444, + "kl": 0.046875, + "learning_rate": 8.025232171018048e-07, + "loss": -0.0255, + "reward": 1.320874810218811, + "reward_std": 0.1730649173259735, + "rewards/accuracy_reward_stage2": 0.33649978041648865, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1128 + }, + { + "completion_length": 10.71875, + "epoch": 0.19782722971789032, + "grad_norm": 22.72010394636336, + "kl": 0.12158203125, + "learning_rate": 8.023479936919572e-07, + "loss": -0.0043, + "reward": 1.314929723739624, + "reward_std": 0.2263418585062027, + "rewards/accuracy_reward_stage2": 0.596179723739624, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1129 + }, + { + "completion_length": 12.515625, + "epoch": 0.19800245312773787, + "grad_norm": 21.640102442615657, + "kl": 0.212890625, + "learning_rate": 8.021727702821096e-07, + "loss": -0.0032, + "reward": 1.4405428171157837, + "reward_std": 0.33114007115364075, + "rewards/accuracy_reward_stage2": 0.7217926979064941, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1130 + }, + { + "completion_length": 8.921875, + "epoch": 0.19817767653758542, + "grad_norm": 16.96837016642232, + "kl": 0.049072265625, + "learning_rate": 8.01997546872262e-07, + "loss": 0.0196, + "reward": 1.5607819557189941, + "reward_std": 0.2285585105419159, + "rewards/accuracy_reward_stage2": 0.6857819557189941, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1131 + }, + { + "completion_length": 11.03125, + "epoch": 0.19835289994743296, + "grad_norm": 20.74462179618685, + "kl": 0.08642578125, + "learning_rate": 8.018223234624145e-07, + "loss": 0.0057, + "reward": 1.6989161968231201, + "reward_std": 0.18676617741584778, + "rewards/accuracy_reward_stage2": 0.7145411968231201, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1132 + }, + { + "completion_length": 9.90625, + "epoch": 0.19852812335728054, + "grad_norm": 22.856482124259504, + "kl": 0.1015625, + "learning_rate": 8.01647100052567e-07, + "loss": -0.0499, + "reward": 1.6860418319702148, + "reward_std": 0.2933948040008545, + "rewards/accuracy_reward_stage2": 0.7329168319702148, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1133 + }, + { + "completion_length": 9.578125, + "epoch": 0.1987033467671281, + "grad_norm": 10.13055659742841, + "kl": 0.2255859375, + "learning_rate": 8.014718766427194e-07, + "loss": 0.0901, + "reward": 1.4931246042251587, + "reward_std": 0.06938232481479645, + "rewards/accuracy_reward_stage2": 0.7431246042251587, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1134 + }, + { + "completion_length": 13.234375, + "epoch": 0.19887857017697563, + "grad_norm": 21.296227271607425, + "kl": 0.06591796875, + "learning_rate": 8.012966532328719e-07, + "loss": 0.0263, + "reward": 1.3034307956695557, + "reward_std": 0.2627410888671875, + "rewards/accuracy_reward_stage2": 0.42843079566955566, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1135 + }, + { + "completion_length": 11.03125, + "epoch": 0.1990537935868232, + "grad_norm": 19.25565519002808, + "kl": 0.1337890625, + "learning_rate": 8.011214298230243e-07, + "loss": 0.0093, + "reward": 1.3711471557617188, + "reward_std": 0.26397189497947693, + "rewards/accuracy_reward_stage2": 0.5117720365524292, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1136 + }, + { + "completion_length": 12.6875, + "epoch": 0.19922901699667075, + "grad_norm": 16.864201753153427, + "kl": 0.0703125, + "learning_rate": 8.009462064131768e-07, + "loss": -0.0496, + "reward": 1.494227647781372, + "reward_std": 0.22331348061561584, + "rewards/accuracy_reward_stage2": 0.5254777073860168, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1137 + }, + { + "completion_length": 24.53125, + "epoch": 0.1994042404065183, + "grad_norm": 22.756765542105306, + "kl": 0.0283203125, + "learning_rate": 8.007709830033293e-07, + "loss": -0.0079, + "reward": 1.4776358604431152, + "reward_std": 0.29558607935905457, + "rewards/accuracy_reward_stage2": 0.49326080083847046, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1138 + }, + { + "completion_length": 9.28125, + "epoch": 0.19957946381636588, + "grad_norm": 23.208713725384428, + "kl": 0.0986328125, + "learning_rate": 8.005957595934817e-07, + "loss": -0.0951, + "reward": 1.4900455474853516, + "reward_std": 0.29236066341400146, + "rewards/accuracy_reward_stage2": 0.5525454878807068, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 1139 + }, + { + "completion_length": 8.265625, + "epoch": 0.19975468722621342, + "grad_norm": 17.131297352821566, + "kl": 0.0634765625, + "learning_rate": 8.004205361836342e-07, + "loss": 0.0253, + "reward": 1.5214886665344238, + "reward_std": 0.21488597989082336, + "rewards/accuracy_reward_stage2": 0.5214886665344238, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1140 + }, + { + "completion_length": 14.359375, + "epoch": 0.19992991063606097, + "grad_norm": 20.921675496913988, + "kl": 0.1806640625, + "learning_rate": 8.002453127737866e-07, + "loss": -0.0048, + "reward": 1.1669869422912598, + "reward_std": 0.22576582431793213, + "rewards/accuracy_reward_stage2": 0.4482370615005493, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1141 + }, + { + "completion_length": 10.484375, + "epoch": 0.20010513404590854, + "grad_norm": 20.421989513086785, + "kl": 0.051025390625, + "learning_rate": 8.000700893639389e-07, + "loss": 0.0204, + "reward": 1.701317310333252, + "reward_std": 0.13738197088241577, + "rewards/accuracy_reward_stage2": 0.7013173699378967, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1142 + }, + { + "completion_length": 8.890625, + "epoch": 0.2002803574557561, + "grad_norm": 11.718815490151254, + "kl": 0.034423828125, + "learning_rate": 7.998948659540914e-07, + "loss": -0.0276, + "reward": 1.5396586656570435, + "reward_std": 0.09947885572910309, + "rewards/accuracy_reward_stage2": 0.5552836656570435, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1143 + }, + { + "completion_length": 10.265625, + "epoch": 0.20045558086560364, + "grad_norm": 18.279605016875873, + "kl": 0.126953125, + "learning_rate": 7.997196425442438e-07, + "loss": 0.051, + "reward": 1.6181336641311646, + "reward_std": 0.15057526528835297, + "rewards/accuracy_reward_stage2": 0.7431336641311646, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1144 + }, + { + "completion_length": 8.828125, + "epoch": 0.2006308042754512, + "grad_norm": 26.921773092556197, + "kl": 0.25, + "learning_rate": 7.995444191343963e-07, + "loss": 0.1002, + "reward": 1.542344570159912, + "reward_std": 0.2922079563140869, + "rewards/accuracy_reward_stage2": 0.5423445105552673, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1145 + }, + { + "completion_length": 9.140625, + "epoch": 0.20080602768529876, + "grad_norm": 19.701859450137892, + "kl": 0.111328125, + "learning_rate": 7.993691957245488e-07, + "loss": -0.0622, + "reward": 1.340492606163025, + "reward_std": 0.24292413890361786, + "rewards/accuracy_reward_stage2": 0.4029926657676697, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 1146 + }, + { + "completion_length": 9.890625, + "epoch": 0.2009812510951463, + "grad_norm": 17.120098765809743, + "kl": 0.0233154296875, + "learning_rate": 7.991939723147012e-07, + "loss": 0.0093, + "reward": 1.623031497001648, + "reward_std": 0.15811499953269958, + "rewards/accuracy_reward_stage2": 0.623031497001648, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1147 + }, + { + "completion_length": 10.0625, + "epoch": 0.20115647450499385, + "grad_norm": 16.414510733056197, + "kl": 0.0194091796875, + "learning_rate": 7.990187489048537e-07, + "loss": -0.0364, + "reward": 1.7732515335083008, + "reward_std": 0.17782685160636902, + "rewards/accuracy_reward_stage2": 0.788876473903656, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1148 + }, + { + "completion_length": 11.65625, + "epoch": 0.20133169791484143, + "grad_norm": 21.158467097461774, + "kl": 0.107421875, + "learning_rate": 7.988435254950062e-07, + "loss": -0.0013, + "reward": 1.5274202823638916, + "reward_std": 0.2117118239402771, + "rewards/accuracy_reward_stage2": 0.5430452823638916, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1149 + }, + { + "completion_length": 8.703125, + "epoch": 0.20150692132468898, + "grad_norm": 21.15810096062497, + "kl": 0.11474609375, + "learning_rate": 7.986683020851585e-07, + "loss": 0.046, + "reward": 1.2723078727722168, + "reward_std": 0.282155305147171, + "rewards/accuracy_reward_stage2": 0.3973078727722168, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1150 + }, + { + "completion_length": 10.46875, + "epoch": 0.20168214473453652, + "grad_norm": 19.455305358264336, + "kl": 0.0615234375, + "learning_rate": 7.98493078675311e-07, + "loss": 0.0246, + "reward": 1.549159049987793, + "reward_std": 0.1756734549999237, + "rewards/accuracy_reward_stage2": 0.549159049987793, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1151 + }, + { + "completion_length": 29.0, + "epoch": 0.2018573681443841, + "grad_norm": 18.905730682719934, + "kl": 0.1669921875, + "learning_rate": 7.983178552654634e-07, + "loss": 0.0669, + "reward": 1.4018785953521729, + "reward_std": 0.14914950728416443, + "rewards/accuracy_reward_stage2": 0.5268786549568176, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1152 + }, + { + "completion_length": 12.53125, + "epoch": 0.20203259155423164, + "grad_norm": 26.675549604761773, + "kl": 0.181640625, + "learning_rate": 7.981426318556159e-07, + "loss": 0.0728, + "reward": 1.3172643184661865, + "reward_std": 0.2368382215499878, + "rewards/accuracy_reward_stage2": 0.5672642588615417, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1153 + }, + { + "completion_length": 20.140625, + "epoch": 0.2022078149640792, + "grad_norm": 22.721907098494817, + "kl": 0.171875, + "learning_rate": 7.979674084457683e-07, + "loss": 0.0687, + "reward": 1.3109116554260254, + "reward_std": 0.16556920111179352, + "rewards/accuracy_reward_stage2": 0.43591174483299255, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1154 + }, + { + "completion_length": 11.921875, + "epoch": 0.20238303837392677, + "grad_norm": 20.857879895249667, + "kl": 0.1240234375, + "learning_rate": 7.977921850359207e-07, + "loss": -0.0352, + "reward": 1.4702496528625488, + "reward_std": 0.24368052184581757, + "rewards/accuracy_reward_stage2": 0.5014996528625488, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1155 + }, + { + "completion_length": 12.453125, + "epoch": 0.2025582617837743, + "grad_norm": 20.071391682574166, + "kl": 0.0458984375, + "learning_rate": 7.976169616260732e-07, + "loss": 0.0183, + "reward": 1.1320414543151855, + "reward_std": 0.1970067322254181, + "rewards/accuracy_reward_stage2": 0.25704148411750793, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1156 + }, + { + "completion_length": 11.859375, + "epoch": 0.20273348519362186, + "grad_norm": 29.57065952608139, + "kl": 0.06201171875, + "learning_rate": 7.974417382162256e-07, + "loss": -0.1075, + "reward": 1.6288851499557495, + "reward_std": 0.263146311044693, + "rewards/accuracy_reward_stage2": 0.6757600903511047, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1157 + }, + { + "completion_length": 11.078125, + "epoch": 0.20290870860346943, + "grad_norm": 20.549246695365362, + "kl": 0.07177734375, + "learning_rate": 7.972665148063781e-07, + "loss": 0.0287, + "reward": 1.6410633325576782, + "reward_std": 0.2141045480966568, + "rewards/accuracy_reward_stage2": 0.6410633325576782, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1158 + }, + { + "completion_length": 10.640625, + "epoch": 0.20308393201331698, + "grad_norm": 18.21590409691188, + "kl": 0.07861328125, + "learning_rate": 7.970912913965306e-07, + "loss": 0.0315, + "reward": 1.7376164197921753, + "reward_std": 0.146676704287529, + "rewards/accuracy_reward_stage2": 0.7376164793968201, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1159 + }, + { + "completion_length": 12.03125, + "epoch": 0.20325915542316453, + "grad_norm": 16.926699738446533, + "kl": 0.024658203125, + "learning_rate": 7.969160679866829e-07, + "loss": -0.0343, + "reward": 1.622064471244812, + "reward_std": 0.19049572944641113, + "rewards/accuracy_reward_stage2": 0.6376894116401672, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1160 + }, + { + "completion_length": 12.203125, + "epoch": 0.2034343788330121, + "grad_norm": 17.57185430695999, + "kl": 0.0400390625, + "learning_rate": 7.967408445768354e-07, + "loss": 0.0159, + "reward": 1.5260417461395264, + "reward_std": 0.2976905405521393, + "rewards/accuracy_reward_stage2": 0.5260416269302368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1161 + }, + { + "completion_length": 15.5625, + "epoch": 0.20360960224285965, + "grad_norm": 20.624347335706137, + "kl": 0.1328125, + "learning_rate": 7.965656211669879e-07, + "loss": 0.053, + "reward": 1.6277599334716797, + "reward_std": 0.20827616751194, + "rewards/accuracy_reward_stage2": 0.6277599334716797, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1162 + }, + { + "completion_length": 8.734375, + "epoch": 0.2037848256527072, + "grad_norm": 20.888572067895698, + "kl": 0.09912109375, + "learning_rate": 7.963903977571403e-07, + "loss": 0.0396, + "reward": 1.7622499465942383, + "reward_std": 0.2863914966583252, + "rewards/accuracy_reward_stage2": 0.7622500658035278, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1163 + }, + { + "completion_length": 15.09375, + "epoch": 0.20396004906255474, + "grad_norm": 17.51885785997261, + "kl": 0.068359375, + "learning_rate": 7.962151743472928e-07, + "loss": 0.0272, + "reward": 1.4528738260269165, + "reward_std": 0.17152443528175354, + "rewards/accuracy_reward_stage2": 0.5778738260269165, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1164 + }, + { + "completion_length": 12.234375, + "epoch": 0.20413527247240232, + "grad_norm": 257.1767706508973, + "kl": 1.2421875, + "learning_rate": 7.960399509374453e-07, + "loss": 0.4498, + "reward": 1.5711274147033691, + "reward_std": 0.19762010872364044, + "rewards/accuracy_reward_stage2": 0.7117522954940796, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1165 + }, + { + "completion_length": 9.75, + "epoch": 0.20431049588224987, + "grad_norm": 23.04372942522331, + "kl": 0.20703125, + "learning_rate": 7.958647275275977e-07, + "loss": 0.0419, + "reward": 1.4802830219268799, + "reward_std": 0.21664901077747345, + "rewards/accuracy_reward_stage2": 0.6209080219268799, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1166 + }, + { + "completion_length": 9.0, + "epoch": 0.2044857192920974, + "grad_norm": 25.731014770808162, + "kl": 0.17578125, + "learning_rate": 7.956895041177501e-07, + "loss": 0.0223, + "reward": 1.528602123260498, + "reward_std": 0.1948903203010559, + "rewards/accuracy_reward_stage2": 0.669227123260498, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1167 + }, + { + "completion_length": 12.0625, + "epoch": 0.204660942701945, + "grad_norm": 23.776557724679314, + "kl": 0.044677734375, + "learning_rate": 7.955142807079025e-07, + "loss": 0.0178, + "reward": 1.709090232849121, + "reward_std": 0.1950330287218094, + "rewards/accuracy_reward_stage2": 0.7090902328491211, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1168 + }, + { + "completion_length": 9.765625, + "epoch": 0.20483616611179253, + "grad_norm": 15.919983081141853, + "kl": 0.032470703125, + "learning_rate": 7.95339057298055e-07, + "loss": 0.013, + "reward": 1.5694223642349243, + "reward_std": 0.08031494915485382, + "rewards/accuracy_reward_stage2": 0.5694223046302795, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1169 + }, + { + "completion_length": 18.828125, + "epoch": 0.20501138952164008, + "grad_norm": 21.286745727148634, + "kl": 0.08154296875, + "learning_rate": 7.951638338882074e-07, + "loss": 0.0262, + "reward": 1.190812110900879, + "reward_std": 0.18454615771770477, + "rewards/accuracy_reward_stage2": 0.2064370959997177, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1170 + }, + { + "completion_length": 8.125, + "epoch": 0.20518661293148766, + "grad_norm": 13.777441072502276, + "kl": 0.013916015625, + "learning_rate": 7.949886104783598e-07, + "loss": 0.0056, + "reward": 1.5902838706970215, + "reward_std": 0.11751788854598999, + "rewards/accuracy_reward_stage2": 0.590283989906311, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1171 + }, + { + "completion_length": 6.140625, + "epoch": 0.2053618363413352, + "grad_norm": 16.392015761201815, + "kl": 0.0284423828125, + "learning_rate": 7.948133870685123e-07, + "loss": 0.0114, + "reward": 1.8313522338867188, + "reward_std": 0.09155251830816269, + "rewards/accuracy_reward_stage2": 0.8313522338867188, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1172 + }, + { + "completion_length": 8.25, + "epoch": 0.20553705975118275, + "grad_norm": 20.217002311756907, + "kl": 0.11962890625, + "learning_rate": 7.946381636586647e-07, + "loss": 0.0479, + "reward": 1.2145620584487915, + "reward_std": 0.21878552436828613, + "rewards/accuracy_reward_stage2": 0.4645621180534363, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1173 + }, + { + "completion_length": 11.671875, + "epoch": 0.20571228316103032, + "grad_norm": 16.081412388073563, + "kl": 0.038330078125, + "learning_rate": 7.944629402488172e-07, + "loss": 0.0153, + "reward": 1.54155695438385, + "reward_std": 0.1580614596605301, + "rewards/accuracy_reward_stage2": 0.5415569543838501, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1174 + }, + { + "completion_length": 10.984375, + "epoch": 0.20588750657087787, + "grad_norm": 22.670026601608498, + "kl": 0.024658203125, + "learning_rate": 7.942877168389697e-07, + "loss": 0.0099, + "reward": 1.6144170761108398, + "reward_std": 0.2522222399711609, + "rewards/accuracy_reward_stage2": 0.6144170761108398, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1175 + }, + { + "completion_length": 8.3125, + "epoch": 0.20606272998072542, + "grad_norm": 17.841750146010423, + "kl": 0.06396484375, + "learning_rate": 7.941124934291221e-07, + "loss": 0.0255, + "reward": 1.8633146286010742, + "reward_std": 0.21237066388130188, + "rewards/accuracy_reward_stage2": 0.8633145689964294, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1176 + }, + { + "completion_length": 13.203125, + "epoch": 0.206237953390573, + "grad_norm": 19.983949385651133, + "kl": 0.1552734375, + "learning_rate": 7.939372700192746e-07, + "loss": 0.0684, + "reward": 1.2808198928833008, + "reward_std": 0.039055947214365005, + "rewards/accuracy_reward_stage2": 0.405819833278656, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1177 + }, + { + "completion_length": 8.46875, + "epoch": 0.20641317680042054, + "grad_norm": 19.892008522113688, + "kl": 0.052001953125, + "learning_rate": 7.937620466094271e-07, + "loss": 0.0208, + "reward": 1.7783706188201904, + "reward_std": 0.18616268038749695, + "rewards/accuracy_reward_stage2": 0.7783706188201904, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1178 + }, + { + "completion_length": 11.15625, + "epoch": 0.20658840021026809, + "grad_norm": 14.320012639942059, + "kl": 0.296875, + "learning_rate": 7.935868231995795e-07, + "loss": 0.0851, + "reward": 1.4547843933105469, + "reward_std": 0.12215742468833923, + "rewards/accuracy_reward_stage2": 0.5954092741012573, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1179 + }, + { + "completion_length": 13.421875, + "epoch": 0.20676362362011566, + "grad_norm": 22.20258067675751, + "kl": 0.0771484375, + "learning_rate": 7.934115997897318e-07, + "loss": 0.0308, + "reward": 1.740135908126831, + "reward_std": 0.22515861690044403, + "rewards/accuracy_reward_stage2": 0.7401360273361206, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1180 + }, + { + "completion_length": 6.859375, + "epoch": 0.2069388470299632, + "grad_norm": 12.56155836903543, + "kl": 0.048095703125, + "learning_rate": 7.932363763798842e-07, + "loss": 0.0129, + "reward": 1.2953832149505615, + "reward_std": 0.1523396372795105, + "rewards/accuracy_reward_stage2": 0.4203832149505615, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1181 + }, + { + "completion_length": 8.40625, + "epoch": 0.20711407043981075, + "grad_norm": 24.90537422362036, + "kl": 0.1455078125, + "learning_rate": 7.930611529700367e-07, + "loss": 0.0302, + "reward": 1.7267037630081177, + "reward_std": 0.26153823733329773, + "rewards/accuracy_reward_stage2": 0.7423287630081177, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1182 + }, + { + "completion_length": 12.484375, + "epoch": 0.2072892938496583, + "grad_norm": 21.687780028830804, + "kl": 0.046630859375, + "learning_rate": 7.928859295601892e-07, + "loss": 0.0187, + "reward": 1.5232006311416626, + "reward_std": 0.20009201765060425, + "rewards/accuracy_reward_stage2": 0.5232006311416626, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1183 + }, + { + "completion_length": 14.703125, + "epoch": 0.20746451725950588, + "grad_norm": 29.365889674879565, + "kl": 0.057861328125, + "learning_rate": 7.927107061503416e-07, + "loss": -0.0271, + "reward": 1.4428789615631104, + "reward_std": 0.24434049427509308, + "rewards/accuracy_reward_stage2": 0.5991290807723999, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1184 + }, + { + "completion_length": 13.59375, + "epoch": 0.20763974066935342, + "grad_norm": 18.703243296211422, + "kl": 0.01495361328125, + "learning_rate": 7.925354827404941e-07, + "loss": 0.006, + "reward": 1.7748501300811768, + "reward_std": 0.16142131388187408, + "rewards/accuracy_reward_stage2": 0.7748501300811768, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1185 + }, + { + "completion_length": 8.0625, + "epoch": 0.20781496407920097, + "grad_norm": 14.896985352193926, + "kl": 0.052001953125, + "learning_rate": 7.923602593306466e-07, + "loss": 0.0208, + "reward": 1.5153526067733765, + "reward_std": 0.23511351644992828, + "rewards/accuracy_reward_stage2": 0.5153526067733765, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1186 + }, + { + "completion_length": 9.828125, + "epoch": 0.20799018748904854, + "grad_norm": 15.184774328423192, + "kl": 0.060546875, + "learning_rate": 7.92185035920799e-07, + "loss": 0.0242, + "reward": 1.5610758066177368, + "reward_std": 0.2549077570438385, + "rewards/accuracy_reward_stage2": 0.686075747013092, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1187 + }, + { + "completion_length": 12.5, + "epoch": 0.2081654108988961, + "grad_norm": 28.950774793478022, + "kl": 0.052001953125, + "learning_rate": 7.920098125109515e-07, + "loss": 0.0208, + "reward": 1.5520353317260742, + "reward_std": 0.3283025622367859, + "rewards/accuracy_reward_stage2": 0.6770353317260742, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1188 + }, + { + "completion_length": 15.640625, + "epoch": 0.20834063430874364, + "grad_norm": 20.61263740578102, + "kl": 0.059814453125, + "learning_rate": 7.918345891011039e-07, + "loss": -0.0683, + "reward": 1.24334716796875, + "reward_std": 0.31474292278289795, + "rewards/accuracy_reward_stage2": 0.3995971381664276, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1189 + }, + { + "completion_length": 8.890625, + "epoch": 0.2085158577185912, + "grad_norm": 23.985866977400512, + "kl": 0.1953125, + "learning_rate": 7.916593656912563e-07, + "loss": 0.0932, + "reward": 1.5583475828170776, + "reward_std": 0.2929634153842926, + "rewards/accuracy_reward_stage2": 0.6833474636077881, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1190 + }, + { + "completion_length": 38.8125, + "epoch": 0.20869108112843876, + "grad_norm": 17.87360965600017, + "kl": 0.1220703125, + "learning_rate": 7.914841422814088e-07, + "loss": 0.0489, + "reward": 1.627746820449829, + "reward_std": 0.1904383897781372, + "rewards/accuracy_reward_stage2": 0.6277468204498291, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1191 + }, + { + "completion_length": 10.953125, + "epoch": 0.2088663045382863, + "grad_norm": 18.58192828258147, + "kl": 0.05712890625, + "learning_rate": 7.913089188715612e-07, + "loss": -0.0061, + "reward": 1.4184027910232544, + "reward_std": 0.2182597517967224, + "rewards/accuracy_reward_stage2": 0.434027761220932, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1192 + }, + { + "completion_length": 7.671875, + "epoch": 0.20904152794813388, + "grad_norm": 1.353597979884702, + "kl": 0.025634765625, + "learning_rate": 7.911336954617136e-07, + "loss": 0.0103, + "reward": 1.597916603088379, + "reward_std": 0.0, + "rewards/accuracy_reward_stage2": 0.5979166626930237, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1193 + }, + { + "completion_length": 15.390625, + "epoch": 0.20921675135798143, + "grad_norm": 16.06473391090549, + "kl": 0.046875, + "learning_rate": 7.909584720518661e-07, + "loss": 0.0188, + "reward": 1.383378505706787, + "reward_std": 0.21060232818126678, + "rewards/accuracy_reward_stage2": 0.3833785355091095, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1194 + }, + { + "completion_length": 10.828125, + "epoch": 0.20939197476782898, + "grad_norm": 14.175264395757077, + "kl": 0.11279296875, + "learning_rate": 7.907832486420185e-07, + "loss": 0.045, + "reward": 1.3746671676635742, + "reward_std": 0.1269018054008484, + "rewards/accuracy_reward_stage2": 0.49966704845428467, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1195 + }, + { + "completion_length": 9.578125, + "epoch": 0.20956719817767655, + "grad_norm": 22.205500444522485, + "kl": 0.107421875, + "learning_rate": 7.90608025232171e-07, + "loss": 0.0431, + "reward": 1.5016958713531494, + "reward_std": 0.23775088787078857, + "rewards/accuracy_reward_stage2": 0.5016958117485046, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1196 + }, + { + "completion_length": 9.765625, + "epoch": 0.2097424215875241, + "grad_norm": 18.843440652713703, + "kl": 0.232421875, + "learning_rate": 7.904328018223234e-07, + "loss": 0.0927, + "reward": 1.6348192691802979, + "reward_std": 0.14728645980358124, + "rewards/accuracy_reward_stage2": 0.7598193287849426, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1197 + }, + { + "completion_length": 17.21875, + "epoch": 0.20991764499737164, + "grad_norm": 25.964716086110375, + "kl": 0.1455078125, + "learning_rate": 7.902575784124759e-07, + "loss": 0.0583, + "reward": 1.3283579349517822, + "reward_std": 0.19651073217391968, + "rewards/accuracy_reward_stage2": 0.45335784554481506, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1198 + }, + { + "completion_length": 8.6875, + "epoch": 0.2100928684072192, + "grad_norm": 17.365380157487795, + "kl": 0.043701171875, + "learning_rate": 7.900823550026284e-07, + "loss": 0.0175, + "reward": 1.6050076484680176, + "reward_std": 0.16576728224754333, + "rewards/accuracy_reward_stage2": 0.6050077080726624, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1199 + }, + { + "completion_length": 7.40625, + "epoch": 0.21026809181706677, + "grad_norm": 17.84908015245948, + "kl": 0.046875, + "learning_rate": 7.899071315927807e-07, + "loss": 0.0188, + "reward": 1.2840076684951782, + "reward_std": 0.17957797646522522, + "rewards/accuracy_reward_stage2": 0.40900763869285583, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1200 + }, + { + "completion_length": 11.421875, + "epoch": 0.2104433152269143, + "grad_norm": 22.669593862703703, + "kl": 0.039306640625, + "learning_rate": 7.897319081829332e-07, + "loss": -0.0284, + "reward": 1.7310082912445068, + "reward_std": 0.16316458582878113, + "rewards/accuracy_reward_stage2": 0.7466332912445068, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1201 + }, + { + "completion_length": 9.828125, + "epoch": 0.21061853863676186, + "grad_norm": 30.363784239849362, + "kl": 0.1806640625, + "learning_rate": 7.895566847730857e-07, + "loss": 0.0535, + "reward": 1.4026780128479004, + "reward_std": 0.15883180499076843, + "rewards/accuracy_reward_stage2": 0.4183030128479004, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1202 + }, + { + "completion_length": 6.46875, + "epoch": 0.21079376204660943, + "grad_norm": 16.30063889671874, + "kl": 0.027587890625, + "learning_rate": 7.893814613632381e-07, + "loss": -0.0331, + "reward": 1.6259620189666748, + "reward_std": 0.1683189868927002, + "rewards/accuracy_reward_stage2": 0.64158695936203, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1203 + }, + { + "completion_length": 9.59375, + "epoch": 0.21096898545645698, + "grad_norm": 16.699157121205253, + "kl": 0.06298828125, + "learning_rate": 7.892062379533906e-07, + "loss": 0.0158, + "reward": 1.4885568618774414, + "reward_std": 0.12854987382888794, + "rewards/accuracy_reward_stage2": 0.6135568618774414, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1204 + }, + { + "completion_length": 8.9375, + "epoch": 0.21114420886630453, + "grad_norm": 14.78587528185712, + "kl": 0.04296875, + "learning_rate": 7.890310145435429e-07, + "loss": 0.0172, + "reward": 1.6338293552398682, + "reward_std": 0.12834054231643677, + "rewards/accuracy_reward_stage2": 0.7588293552398682, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1205 + }, + { + "completion_length": 12.3125, + "epoch": 0.2113194322761521, + "grad_norm": 19.546222852809866, + "kl": 0.060791015625, + "learning_rate": 7.888557911336954e-07, + "loss": 0.0244, + "reward": 1.721550464630127, + "reward_std": 0.17753317952156067, + "rewards/accuracy_reward_stage2": 0.7215505242347717, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1206 + }, + { + "completion_length": 7.703125, + "epoch": 0.21149465568599965, + "grad_norm": 20.43738576748487, + "kl": 0.091796875, + "learning_rate": 7.886805677238479e-07, + "loss": 0.0054, + "reward": 1.551939606666565, + "reward_std": 0.2607957422733307, + "rewards/accuracy_reward_stage2": 0.5675646066665649, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1207 + }, + { + "completion_length": 11.953125, + "epoch": 0.2116698790958472, + "grad_norm": 17.672123595755643, + "kl": 0.169921875, + "learning_rate": 7.885053443140003e-07, + "loss": 0.0312, + "reward": 1.431882381439209, + "reward_std": 0.2033139318227768, + "rewards/accuracy_reward_stage2": 0.5725074410438538, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1208 + }, + { + "completion_length": 11.28125, + "epoch": 0.21184510250569477, + "grad_norm": 17.456375274733755, + "kl": 0.12255859375, + "learning_rate": 7.883301209041528e-07, + "loss": -0.0308, + "reward": 1.4818658828735352, + "reward_std": 0.2782401740550995, + "rewards/accuracy_reward_stage2": 0.5131158828735352, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1209 + }, + { + "completion_length": 22.578125, + "epoch": 0.21202032591554232, + "grad_norm": 19.43033143294077, + "kl": 0.111328125, + "learning_rate": 7.881548974943052e-07, + "loss": 0.0113, + "reward": 1.272879719734192, + "reward_std": 0.1519315093755722, + "rewards/accuracy_reward_stage2": 0.4135046601295471, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1210 + }, + { + "completion_length": 10.390625, + "epoch": 0.21219554932538987, + "grad_norm": 21.513740888250048, + "kl": 0.068359375, + "learning_rate": 7.879796740844576e-07, + "loss": 0.0274, + "reward": 1.415919542312622, + "reward_std": 0.20253877341747284, + "rewards/accuracy_reward_stage2": 0.41591957211494446, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1211 + }, + { + "completion_length": 7.140625, + "epoch": 0.21237077273523744, + "grad_norm": 18.76383666387577, + "kl": 0.0181884765625, + "learning_rate": 7.878044506746101e-07, + "loss": 0.0073, + "reward": 1.9206148386001587, + "reward_std": 0.16348612308502197, + "rewards/accuracy_reward_stage2": 0.9206147789955139, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1212 + }, + { + "completion_length": 16.046875, + "epoch": 0.212545996145085, + "grad_norm": 5.082339464017986, + "kl": 0.015625, + "learning_rate": 7.876292272647625e-07, + "loss": 0.0062, + "reward": 1.40625, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward_stage2": 0.40625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1213 + }, + { + "completion_length": 11.53125, + "epoch": 0.21272121955493253, + "grad_norm": 16.474658851171878, + "kl": 0.017822265625, + "learning_rate": 7.87454003854915e-07, + "loss": 0.0071, + "reward": 1.4888964891433716, + "reward_std": 0.23521284759044647, + "rewards/accuracy_reward_stage2": 0.48889651894569397, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1214 + }, + { + "completion_length": 14.90625, + "epoch": 0.2128964429647801, + "grad_norm": 17.607668248281357, + "kl": 0.007659912109375, + "learning_rate": 7.872787804450675e-07, + "loss": -0.0218, + "reward": 1.5345828533172607, + "reward_std": 0.16702640056610107, + "rewards/accuracy_reward_stage2": 0.5502078533172607, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1215 + }, + { + "completion_length": 15.578125, + "epoch": 0.21307166637462766, + "grad_norm": 17.512366393353393, + "kl": 0.078125, + "learning_rate": 7.871035570352199e-07, + "loss": -0.013, + "reward": 1.507341980934143, + "reward_std": 0.10357113182544708, + "rewards/accuracy_reward_stage2": 0.5229669809341431, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1216 + }, + { + "completion_length": 11.40625, + "epoch": 0.2132468897844752, + "grad_norm": 17.65170516624198, + "kl": 0.056640625, + "learning_rate": 7.869283336253724e-07, + "loss": 0.0226, + "reward": 1.2889139652252197, + "reward_std": 0.12118306756019592, + "rewards/accuracy_reward_stage2": 0.2889139652252197, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1217 + }, + { + "completion_length": 9.5625, + "epoch": 0.21342211319432275, + "grad_norm": 22.66795441655081, + "kl": 0.1474609375, + "learning_rate": 7.867531102155247e-07, + "loss": 0.059, + "reward": 1.6077473163604736, + "reward_std": 0.1633305847644806, + "rewards/accuracy_reward_stage2": 0.6077473759651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1218 + }, + { + "completion_length": 9.359375, + "epoch": 0.21359733660417032, + "grad_norm": 14.421986062058423, + "kl": 0.04833984375, + "learning_rate": 7.865778868056771e-07, + "loss": 0.0193, + "reward": 1.2509424686431885, + "reward_std": 0.1293540596961975, + "rewards/accuracy_reward_stage2": 0.5009424686431885, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1219 + }, + { + "completion_length": 8.921875, + "epoch": 0.21377256001401787, + "grad_norm": 17.1096991822818, + "kl": 0.07568359375, + "learning_rate": 7.864026633958296e-07, + "loss": 0.0302, + "reward": 1.1151213645935059, + "reward_std": 0.13514229655265808, + "rewards/accuracy_reward_stage2": 0.11512142419815063, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1220 + }, + { + "completion_length": 27.3125, + "epoch": 0.21394778342386542, + "grad_norm": 16.857618479926604, + "kl": 0.025146484375, + "learning_rate": 7.86227439985982e-07, + "loss": 0.0101, + "reward": 1.5993309020996094, + "reward_std": 0.13720698654651642, + "rewards/accuracy_reward_stage2": 0.5993307828903198, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1221 + }, + { + "completion_length": 8.5, + "epoch": 0.214123006833713, + "grad_norm": 18.28127660466508, + "kl": 0.0245361328125, + "learning_rate": 7.860522165761345e-07, + "loss": 0.0098, + "reward": 1.831869125366211, + "reward_std": 0.20181122422218323, + "rewards/accuracy_reward_stage2": 0.8318691253662109, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1222 + }, + { + "completion_length": 12.78125, + "epoch": 0.21429823024356054, + "grad_norm": 14.07413822583387, + "kl": 0.06494140625, + "learning_rate": 7.85876993166287e-07, + "loss": -0.0619, + "reward": 1.5926318168640137, + "reward_std": 0.21398116648197174, + "rewards/accuracy_reward_stage2": 0.6238818168640137, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1223 + }, + { + "completion_length": 10.984375, + "epoch": 0.21447345365340809, + "grad_norm": 20.191094882292095, + "kl": 0.1181640625, + "learning_rate": 7.857017697564394e-07, + "loss": 0.0067, + "reward": 1.5521167516708374, + "reward_std": 0.1629626452922821, + "rewards/accuracy_reward_stage2": 0.5677417516708374, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1224 + }, + { + "completion_length": 8.5, + "epoch": 0.21464867706325566, + "grad_norm": 19.044635303856023, + "kl": 0.06591796875, + "learning_rate": 7.855265463465919e-07, + "loss": -0.0304, + "reward": 1.675663709640503, + "reward_std": 0.2704814374446869, + "rewards/accuracy_reward_stage2": 0.7069137096405029, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1225 + }, + { + "completion_length": 9.546875, + "epoch": 0.2148239004731032, + "grad_norm": 12.892689964479764, + "kl": 0.205078125, + "learning_rate": 7.853513229367444e-07, + "loss": -0.0004, + "reward": 1.5591697692871094, + "reward_std": 0.1387302577495575, + "rewards/accuracy_reward_stage2": 0.7154197096824646, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1226 + }, + { + "completion_length": 9.65625, + "epoch": 0.21499912388295075, + "grad_norm": 24.68468555159018, + "kl": 0.203125, + "learning_rate": 7.851760995268968e-07, + "loss": 0.0811, + "reward": 1.470663070678711, + "reward_std": 0.3228445053100586, + "rewards/accuracy_reward_stage2": 0.5956631302833557, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1227 + }, + { + "completion_length": 10.375, + "epoch": 0.21517434729279833, + "grad_norm": 19.866727346698354, + "kl": 0.05859375, + "learning_rate": 7.850008761170493e-07, + "loss": 0.0234, + "reward": 1.6230573654174805, + "reward_std": 0.2438812255859375, + "rewards/accuracy_reward_stage2": 0.6230573654174805, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1228 + }, + { + "completion_length": 11.9375, + "epoch": 0.21534957070264588, + "grad_norm": 323.85482590509264, + "kl": 0.58984375, + "learning_rate": 7.848256527072016e-07, + "loss": 0.1682, + "reward": 1.5124356746673584, + "reward_std": 0.1736292541027069, + "rewards/accuracy_reward_stage2": 0.6686856746673584, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1229 + }, + { + "completion_length": 9.3125, + "epoch": 0.21552479411249342, + "grad_norm": 18.499353836268497, + "kl": 0.064453125, + "learning_rate": 7.846504292973541e-07, + "loss": -0.0072, + "reward": 1.6767133474349976, + "reward_std": 0.15823645889759064, + "rewards/accuracy_reward_stage2": 0.6923382878303528, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1230 + }, + { + "completion_length": 6.359375, + "epoch": 0.215700017522341, + "grad_norm": 17.16666874669316, + "kl": 0.05322265625, + "learning_rate": 7.844752058875065e-07, + "loss": 0.0212, + "reward": 1.5173816680908203, + "reward_std": 0.21086975932121277, + "rewards/accuracy_reward_stage2": 0.5173816680908203, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1231 + }, + { + "completion_length": 7.25, + "epoch": 0.21587524093218854, + "grad_norm": 18.453030271685524, + "kl": 0.07421875, + "learning_rate": 7.842999824776589e-07, + "loss": 0.0296, + "reward": 1.5241796970367432, + "reward_std": 0.16060179471969604, + "rewards/accuracy_reward_stage2": 0.5241796970367432, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1232 + }, + { + "completion_length": 11.09375, + "epoch": 0.2160504643420361, + "grad_norm": 20.878559833643852, + "kl": 0.10986328125, + "learning_rate": 7.841247590678114e-07, + "loss": 0.0439, + "reward": 1.5652143955230713, + "reward_std": 0.23995351791381836, + "rewards/accuracy_reward_stage2": 0.5652143955230713, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1233 + }, + { + "completion_length": 7.875, + "epoch": 0.21622568775188364, + "grad_norm": 10.92429529273633, + "kl": 0.02197265625, + "learning_rate": 7.839495356579638e-07, + "loss": 0.0088, + "reward": 1.546875, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1234 + }, + { + "completion_length": 9.625, + "epoch": 0.2164009111617312, + "grad_norm": 22.419497200292163, + "kl": 0.12451171875, + "learning_rate": 7.837743122481163e-07, + "loss": 0.0614, + "reward": 1.256831407546997, + "reward_std": 0.3416307270526886, + "rewards/accuracy_reward_stage2": 0.3818313181400299, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1235 + }, + { + "completion_length": 8.53125, + "epoch": 0.21657613457157876, + "grad_norm": 43.141486161066105, + "kl": 0.283203125, + "learning_rate": 7.835990888382688e-07, + "loss": 0.0824, + "reward": 1.4279000759124756, + "reward_std": 0.19505921006202698, + "rewards/accuracy_reward_stage2": 0.568524956703186, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1236 + }, + { + "completion_length": 20.890625, + "epoch": 0.2167513579814263, + "grad_norm": 22.738050502586738, + "kl": 0.08251953125, + "learning_rate": 7.834238654284212e-07, + "loss": 0.033, + "reward": 1.67447829246521, + "reward_std": 0.2933582365512848, + "rewards/accuracy_reward_stage2": 0.67447829246521, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1237 + }, + { + "completion_length": 12.828125, + "epoch": 0.21692658139127388, + "grad_norm": 19.737080524499078, + "kl": 0.09912109375, + "learning_rate": 7.832486420185737e-07, + "loss": 0.0396, + "reward": 1.4670445919036865, + "reward_std": 0.1632198989391327, + "rewards/accuracy_reward_stage2": 0.46704450249671936, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1238 + }, + { + "completion_length": 9.71875, + "epoch": 0.21710180480112143, + "grad_norm": 18.02312666630985, + "kl": 0.1259765625, + "learning_rate": 7.830734186087262e-07, + "loss": 0.0504, + "reward": 1.2427325248718262, + "reward_std": 0.1590673327445984, + "rewards/accuracy_reward_stage2": 0.49273252487182617, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1239 + }, + { + "completion_length": 11.515625, + "epoch": 0.21727702821096898, + "grad_norm": 16.867177885985814, + "kl": 0.0888671875, + "learning_rate": 7.828981951988785e-07, + "loss": -0.0482, + "reward": 1.6229031085968018, + "reward_std": 0.26744771003723145, + "rewards/accuracy_reward_stage2": 0.6541531085968018, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1240 + }, + { + "completion_length": 14.78125, + "epoch": 0.21745225162081655, + "grad_norm": 14.87929735312399, + "kl": 0.0242919921875, + "learning_rate": 7.82722971789031e-07, + "loss": 0.0097, + "reward": 1.7083333730697632, + "reward_std": 0.10346909612417221, + "rewards/accuracy_reward_stage2": 0.7083333134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1241 + }, + { + "completion_length": 8.53125, + "epoch": 0.2176274750306641, + "grad_norm": 23.08135670217196, + "kl": 0.05908203125, + "learning_rate": 7.825477483791834e-07, + "loss": 0.0236, + "reward": 1.829587697982788, + "reward_std": 0.12539014220237732, + "rewards/accuracy_reward_stage2": 0.8295876979827881, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1242 + }, + { + "completion_length": 9.484375, + "epoch": 0.21780269844051164, + "grad_norm": 18.350211164053533, + "kl": 0.04638671875, + "learning_rate": 7.823725249693359e-07, + "loss": 0.0186, + "reward": 1.3802655935287476, + "reward_std": 0.17971576750278473, + "rewards/accuracy_reward_stage2": 0.38026559352874756, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1243 + }, + { + "completion_length": 16.0, + "epoch": 0.21797792185035922, + "grad_norm": 17.128190072983514, + "kl": 0.051025390625, + "learning_rate": 7.821973015594883e-07, + "loss": -0.0085, + "reward": 1.6977180242538452, + "reward_std": 0.1646936684846878, + "rewards/accuracy_reward_stage2": 0.7133429646492004, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1244 + }, + { + "completion_length": 26.921875, + "epoch": 0.21815314526020677, + "grad_norm": 12.042671829195628, + "kl": 0.0634765625, + "learning_rate": 7.820220781496407e-07, + "loss": -0.1071, + "reward": 1.4990663528442383, + "reward_std": 0.17997947335243225, + "rewards/accuracy_reward_stage2": 0.5459413528442383, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1245 + }, + { + "completion_length": 10.296875, + "epoch": 0.2183283686700543, + "grad_norm": 21.10255172023999, + "kl": 0.09375, + "learning_rate": 7.818468547397932e-07, + "loss": 0.0374, + "reward": 1.6483008861541748, + "reward_std": 0.2902987599372864, + "rewards/accuracy_reward_stage2": 0.6483009457588196, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1246 + }, + { + "completion_length": 9.03125, + "epoch": 0.2185035920799019, + "grad_norm": 21.7959643852111, + "kl": 0.09033203125, + "learning_rate": 7.816716313299457e-07, + "loss": -0.0081, + "reward": 1.255446434020996, + "reward_std": 0.2230614423751831, + "rewards/accuracy_reward_stage2": 0.2710713744163513, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1247 + }, + { + "completion_length": 17.5, + "epoch": 0.21867881548974943, + "grad_norm": 19.42697648439749, + "kl": 0.076171875, + "learning_rate": 7.814964079200981e-07, + "loss": -0.0075, + "reward": 1.437554121017456, + "reward_std": 0.12493880093097687, + "rewards/accuracy_reward_stage2": 0.45317918062210083, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1248 + }, + { + "completion_length": 12.71875, + "epoch": 0.21885403889959698, + "grad_norm": 17.63240550187539, + "kl": 0.057373046875, + "learning_rate": 7.813211845102505e-07, + "loss": 0.0229, + "reward": 1.2602894306182861, + "reward_std": 0.11471651494503021, + "rewards/accuracy_reward_stage2": 0.38528940081596375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1249 + }, + { + "completion_length": 10.71875, + "epoch": 0.21902926230944456, + "grad_norm": 20.388080966035275, + "kl": 0.06787109375, + "learning_rate": 7.811459611004029e-07, + "loss": -0.0611, + "reward": 1.572366714477539, + "reward_std": 0.14886946976184845, + "rewards/accuracy_reward_stage2": 0.6036166548728943, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1250 + }, + { + "completion_length": 8.734375, + "epoch": 0.2192044857192921, + "grad_norm": 13.291214271289837, + "kl": 0.0986328125, + "learning_rate": 7.809707376905554e-07, + "loss": 0.0107, + "reward": 1.7387276887893677, + "reward_std": 0.16672708094120026, + "rewards/accuracy_reward_stage2": 0.7543526887893677, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1251 + }, + { + "completion_length": 13.453125, + "epoch": 0.21937970912913965, + "grad_norm": 20.069113743487662, + "kl": 0.020751953125, + "learning_rate": 7.807955142807079e-07, + "loss": 0.0083, + "reward": 1.3933387994766235, + "reward_std": 0.3103262484073639, + "rewards/accuracy_reward_stage2": 0.39333879947662354, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1252 + }, + { + "completion_length": 10.015625, + "epoch": 0.2195549325389872, + "grad_norm": 16.660368209284208, + "kl": 0.064453125, + "learning_rate": 7.806202908708603e-07, + "loss": 0.0257, + "reward": 1.4384819269180298, + "reward_std": 0.16353179514408112, + "rewards/accuracy_reward_stage2": 0.4384819269180298, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1253 + }, + { + "completion_length": 7.375, + "epoch": 0.21973015594883477, + "grad_norm": 7.523798852671286, + "kl": 0.0115966796875, + "learning_rate": 7.804450674610128e-07, + "loss": 0.0047, + "reward": 1.6167200803756714, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward_stage2": 0.6167200803756714, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1254 + }, + { + "completion_length": 12.671875, + "epoch": 0.21990537935868232, + "grad_norm": 14.542038526776427, + "kl": 0.044189453125, + "learning_rate": 7.802698440511653e-07, + "loss": -0.0266, + "reward": 1.515625, + "reward_std": 0.19044628739356995, + "rewards/accuracy_reward_stage2": 0.65625, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1255 + }, + { + "completion_length": 10.8125, + "epoch": 0.22008060276852986, + "grad_norm": 17.201209817778793, + "kl": 0.055419921875, + "learning_rate": 7.800946206413176e-07, + "loss": 0.0221, + "reward": 1.429174780845642, + "reward_std": 0.26050835847854614, + "rewards/accuracy_reward_stage2": 0.4291748106479645, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1256 + }, + { + "completion_length": 9.296875, + "epoch": 0.22025582617837744, + "grad_norm": 21.338852752745392, + "kl": 0.265625, + "learning_rate": 7.799193972314701e-07, + "loss": 0.0755, + "reward": 1.5294039249420166, + "reward_std": 0.27403104305267334, + "rewards/accuracy_reward_stage2": 0.6700288653373718, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1257 + }, + { + "completion_length": 11.15625, + "epoch": 0.220431049588225, + "grad_norm": 22.23905656233593, + "kl": 0.060302734375, + "learning_rate": 7.797441738216225e-07, + "loss": -0.0201, + "reward": 1.869028091430664, + "reward_std": 0.17269808053970337, + "rewards/accuracy_reward_stage2": 0.8846530318260193, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1258 + }, + { + "completion_length": 9.984375, + "epoch": 0.22060627299807253, + "grad_norm": 19.399824642943887, + "kl": 0.1103515625, + "learning_rate": 7.795689504117749e-07, + "loss": 0.0441, + "reward": 1.223279595375061, + "reward_std": 0.2005893588066101, + "rewards/accuracy_reward_stage2": 0.5982796549797058, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1259 + }, + { + "completion_length": 15.875, + "epoch": 0.2207814964079201, + "grad_norm": 24.27584542312224, + "kl": 0.322265625, + "learning_rate": 7.793937270019274e-07, + "loss": 0.0848, + "reward": 1.191131830215454, + "reward_std": 0.23304371535778046, + "rewards/accuracy_reward_stage2": 0.33175671100616455, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1260 + }, + { + "completion_length": 10.6875, + "epoch": 0.22095671981776766, + "grad_norm": 20.4149420187181, + "kl": 0.1142578125, + "learning_rate": 7.792185035920798e-07, + "loss": 0.0455, + "reward": 1.4384210109710693, + "reward_std": 0.22694987058639526, + "rewards/accuracy_reward_stage2": 0.5634210109710693, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1261 + }, + { + "completion_length": 7.359375, + "epoch": 0.2211319432276152, + "grad_norm": 12.284447759112405, + "kl": 0.0673828125, + "learning_rate": 7.790432801822323e-07, + "loss": -0.0046, + "reward": 1.6275393962860107, + "reward_std": 0.15898236632347107, + "rewards/accuracy_reward_stage2": 0.6431642770767212, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1262 + }, + { + "completion_length": 9.53125, + "epoch": 0.22130716663746278, + "grad_norm": 19.201363227623997, + "kl": 0.0791015625, + "learning_rate": 7.788680567723848e-07, + "loss": -0.0125, + "reward": 1.521449089050293, + "reward_std": 0.2546595335006714, + "rewards/accuracy_reward_stage2": 0.5370742082595825, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1263 + }, + { + "completion_length": 13.0625, + "epoch": 0.22148239004731032, + "grad_norm": 17.83997321001346, + "kl": 0.1513671875, + "learning_rate": 7.786928333625372e-07, + "loss": 0.0164, + "reward": 1.5619481801986694, + "reward_std": 0.27089670300483704, + "rewards/accuracy_reward_stage2": 0.5775731801986694, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1264 + }, + { + "completion_length": 9.421875, + "epoch": 0.22165761345715787, + "grad_norm": 18.783396663696035, + "kl": 0.07373046875, + "learning_rate": 7.785176099526897e-07, + "loss": 0.0006, + "reward": 1.6510417461395264, + "reward_std": 0.2089996337890625, + "rewards/accuracy_reward_stage2": 0.6822916269302368, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1265 + }, + { + "completion_length": 16.65625, + "epoch": 0.22183283686700545, + "grad_norm": 23.50690187272042, + "kl": 0.11083984375, + "learning_rate": 7.783423865428421e-07, + "loss": 0.012, + "reward": 1.626581072807312, + "reward_std": 0.2647040784358978, + "rewards/accuracy_reward_stage2": 0.6422061324119568, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1266 + }, + { + "completion_length": 15.734375, + "epoch": 0.222008060276853, + "grad_norm": 19.475958082591323, + "kl": 0.09423828125, + "learning_rate": 7.781671631329946e-07, + "loss": 0.0294, + "reward": 1.0696529150009155, + "reward_std": 0.07191064208745956, + "rewards/accuracy_reward_stage2": 0.3196529150009155, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1267 + }, + { + "completion_length": 8.125, + "epoch": 0.22218328368670054, + "grad_norm": 23.898456160776146, + "kl": 0.123046875, + "learning_rate": 7.779919397231471e-07, + "loss": 0.0493, + "reward": 1.5231192111968994, + "reward_std": 0.20473015308380127, + "rewards/accuracy_reward_stage2": 0.5231192111968994, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1268 + }, + { + "completion_length": 8.1875, + "epoch": 0.22235850709654809, + "grad_norm": 13.176356060911349, + "kl": 0.0703125, + "learning_rate": 7.778167163132993e-07, + "loss": 0.0282, + "reward": 1.746025800704956, + "reward_std": 0.14434634149074554, + "rewards/accuracy_reward_stage2": 0.7460259199142456, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1269 + }, + { + "completion_length": 13.078125, + "epoch": 0.22253373050639566, + "grad_norm": 12.526083904205054, + "kl": 0.04931640625, + "learning_rate": 7.776414929034518e-07, + "loss": -0.0245, + "reward": 1.6939078569412231, + "reward_std": 0.13472694158554077, + "rewards/accuracy_reward_stage2": 0.7095328569412231, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1270 + }, + { + "completion_length": 19.03125, + "epoch": 0.2227089539162432, + "grad_norm": 15.443856012437893, + "kl": 0.06201171875, + "learning_rate": 7.774662694936043e-07, + "loss": 0.0248, + "reward": 1.3781781196594238, + "reward_std": 0.12267878651618958, + "rewards/accuracy_reward_stage2": 0.378178209066391, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1271 + }, + { + "completion_length": 12.859375, + "epoch": 0.22288417732609075, + "grad_norm": 21.835257227780716, + "kl": 0.140625, + "learning_rate": 7.772910460837567e-07, + "loss": 0.0224, + "reward": 1.552232265472412, + "reward_std": 0.2832726240158081, + "rewards/accuracy_reward_stage2": 0.5678572654724121, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1272 + }, + { + "completion_length": 9.0, + "epoch": 0.22305940073593833, + "grad_norm": 20.60717806692071, + "kl": 0.07177734375, + "learning_rate": 7.771158226739092e-07, + "loss": -0.0155, + "reward": 1.5970072746276855, + "reward_std": 0.30943915247917175, + "rewards/accuracy_reward_stage2": 0.6126322746276855, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1273 + }, + { + "completion_length": 11.46875, + "epoch": 0.22323462414578588, + "grad_norm": 19.321778472115792, + "kl": 0.07861328125, + "learning_rate": 7.769405992640616e-07, + "loss": -0.0128, + "reward": 1.7938032150268555, + "reward_std": 0.20492716133594513, + "rewards/accuracy_reward_stage2": 0.8094281554222107, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1274 + }, + { + "completion_length": 7.265625, + "epoch": 0.22340984755563342, + "grad_norm": 17.917894454080603, + "kl": 0.1162109375, + "learning_rate": 7.767653758542141e-07, + "loss": 0.0464, + "reward": 1.718109130859375, + "reward_std": 0.22331853210926056, + "rewards/accuracy_reward_stage2": 0.7181090116500854, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1275 + }, + { + "completion_length": 8.25, + "epoch": 0.223585070965481, + "grad_norm": 18.712665480373918, + "kl": 0.2451171875, + "learning_rate": 7.765901524443666e-07, + "loss": -0.0028, + "reward": 1.2784273624420166, + "reward_std": 0.35105907917022705, + "rewards/accuracy_reward_stage2": 0.4503024220466614, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1276 + }, + { + "completion_length": 11.296875, + "epoch": 0.22376029437532854, + "grad_norm": 29.74251938860607, + "kl": 0.06494140625, + "learning_rate": 7.76414929034519e-07, + "loss": 0.0259, + "reward": 1.7380640506744385, + "reward_std": 0.2355988770723343, + "rewards/accuracy_reward_stage2": 0.7380639910697937, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1277 + }, + { + "completion_length": 11.359375, + "epoch": 0.2239355177851761, + "grad_norm": 19.119455330636775, + "kl": 0.058349609375, + "learning_rate": 7.762397056246715e-07, + "loss": 0.0234, + "reward": 1.6018481254577637, + "reward_std": 0.14455029368400574, + "rewards/accuracy_reward_stage2": 0.6018481254577637, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1278 + }, + { + "completion_length": 14.21875, + "epoch": 0.22411074119502367, + "grad_norm": 18.31859900135893, + "kl": 0.05078125, + "learning_rate": 7.76064482214824e-07, + "loss": 0.0203, + "reward": 1.4314525127410889, + "reward_std": 0.16612425446510315, + "rewards/accuracy_reward_stage2": 0.4314524531364441, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1279 + }, + { + "completion_length": 10.578125, + "epoch": 0.2242859646048712, + "grad_norm": 14.4401733471884, + "kl": 0.0216064453125, + "learning_rate": 7.758892588049763e-07, + "loss": -0.0355, + "reward": 1.5586047172546387, + "reward_std": 0.1084030419588089, + "rewards/accuracy_reward_stage2": 0.5742297172546387, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1280 + }, + { + "completion_length": 9.953125, + "epoch": 0.22446118801471876, + "grad_norm": 12.963432488454982, + "kl": 0.07666015625, + "learning_rate": 7.757140353951288e-07, + "loss": -0.0135, + "reward": 1.4432322978973389, + "reward_std": 0.08829830586910248, + "rewards/accuracy_reward_stage2": 0.45885732769966125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1281 + }, + { + "completion_length": 12.75, + "epoch": 0.22463641142456633, + "grad_norm": 18.285143581142435, + "kl": 0.072265625, + "learning_rate": 7.755388119852811e-07, + "loss": 0.029, + "reward": 1.6616626977920532, + "reward_std": 0.16342474520206451, + "rewards/accuracy_reward_stage2": 0.7866626977920532, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1282 + }, + { + "completion_length": 10.828125, + "epoch": 0.22481163483441388, + "grad_norm": 25.22350216516759, + "kl": 0.24609375, + "learning_rate": 7.753635885754336e-07, + "loss": 0.069, + "reward": 1.5536048412322998, + "reward_std": 0.22804740071296692, + "rewards/accuracy_reward_stage2": 0.694229781627655, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1283 + }, + { + "completion_length": 15.65625, + "epoch": 0.22498685824426143, + "grad_norm": 13.930013600400436, + "kl": 0.0537109375, + "learning_rate": 7.751883651655861e-07, + "loss": -0.0542, + "reward": 1.5508832931518555, + "reward_std": 0.18876992166042328, + "rewards/accuracy_reward_stage2": 0.7071333527565002, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1284 + }, + { + "completion_length": 14.03125, + "epoch": 0.22516208165410898, + "grad_norm": 24.37646426933664, + "kl": 0.044921875, + "learning_rate": 7.750131417557385e-07, + "loss": -0.021, + "reward": 1.4661248922348022, + "reward_std": 0.2995033264160156, + "rewards/accuracy_reward_stage2": 0.48174989223480225, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1285 + }, + { + "completion_length": 24.984375, + "epoch": 0.22533730506395655, + "grad_norm": 21.56030833367531, + "kl": 0.0751953125, + "learning_rate": 7.74837918345891e-07, + "loss": -0.0029, + "reward": 1.5520787239074707, + "reward_std": 0.163404643535614, + "rewards/accuracy_reward_stage2": 0.5677036046981812, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1286 + }, + { + "completion_length": 11.484375, + "epoch": 0.2255125284738041, + "grad_norm": 20.697940818242685, + "kl": 0.1787109375, + "learning_rate": 7.746626949360435e-07, + "loss": -0.0314, + "reward": 1.4901411533355713, + "reward_std": 0.3281293213367462, + "rewards/accuracy_reward_stage2": 0.5370161533355713, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1287 + }, + { + "completion_length": 12.859375, + "epoch": 0.22568775188365164, + "grad_norm": 17.43483491224403, + "kl": 0.0260009765625, + "learning_rate": 7.744874715261959e-07, + "loss": 0.0104, + "reward": 1.7811851501464844, + "reward_std": 0.20009824633598328, + "rewards/accuracy_reward_stage2": 0.7811851501464844, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1288 + }, + { + "completion_length": 9.796875, + "epoch": 0.22586297529349922, + "grad_norm": 18.708135005309416, + "kl": 0.06298828125, + "learning_rate": 7.743122481163483e-07, + "loss": -0.0178, + "reward": 1.512305736541748, + "reward_std": 0.21341320872306824, + "rewards/accuracy_reward_stage2": 0.5279307961463928, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1289 + }, + { + "completion_length": 10.828125, + "epoch": 0.22603819870334677, + "grad_norm": 16.460682608123427, + "kl": 0.04638671875, + "learning_rate": 7.741370247065007e-07, + "loss": 0.0186, + "reward": 1.7030662298202515, + "reward_std": 0.15714354813098907, + "rewards/accuracy_reward_stage2": 0.7030661106109619, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1290 + }, + { + "completion_length": 12.8125, + "epoch": 0.2262134221131943, + "grad_norm": 17.695886350155817, + "kl": 0.04296875, + "learning_rate": 7.739618012966532e-07, + "loss": 0.0172, + "reward": 1.398033618927002, + "reward_std": 0.1854480504989624, + "rewards/accuracy_reward_stage2": 0.39803367853164673, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1291 + }, + { + "completion_length": 7.71875, + "epoch": 0.2263886455230419, + "grad_norm": 13.61432730167237, + "kl": 0.0252685546875, + "learning_rate": 7.737865778868057e-07, + "loss": 0.0101, + "reward": 1.7604167461395264, + "reward_std": 0.1167893186211586, + "rewards/accuracy_reward_stage2": 0.7604166865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1292 + }, + { + "completion_length": 9.109375, + "epoch": 0.22656386893288943, + "grad_norm": 15.963181167643066, + "kl": 0.091796875, + "learning_rate": 7.736113544769581e-07, + "loss": -0.0023, + "reward": 1.4881972074508667, + "reward_std": 0.11426497250795364, + "rewards/accuracy_reward_stage2": 0.5038222074508667, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1293 + }, + { + "completion_length": 9.3125, + "epoch": 0.22673909234273698, + "grad_norm": 22.062928498011892, + "kl": 0.20703125, + "learning_rate": 7.734361310671105e-07, + "loss": 0.0224, + "reward": 1.3682682514190674, + "reward_std": 0.2822038233280182, + "rewards/accuracy_reward_stage2": 0.6495183110237122, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1294 + }, + { + "completion_length": 11.046875, + "epoch": 0.22691431575258456, + "grad_norm": 15.61429349005288, + "kl": 0.099609375, + "learning_rate": 7.732609076572629e-07, + "loss": -0.0333, + "reward": 1.3251614570617676, + "reward_std": 0.3028058409690857, + "rewards/accuracy_reward_stage2": 0.4814113974571228, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1295 + }, + { + "completion_length": 7.9375, + "epoch": 0.2270895391624321, + "grad_norm": 15.122875763850107, + "kl": 0.08544921875, + "learning_rate": 7.730856842474154e-07, + "loss": 0.0341, + "reward": 1.5477509498596191, + "reward_std": 0.14309881627559662, + "rewards/accuracy_reward_stage2": 0.5477508902549744, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1296 + }, + { + "completion_length": 12.703125, + "epoch": 0.22726476257227965, + "grad_norm": 19.00074915929077, + "kl": 0.02685546875, + "learning_rate": 7.729104608375679e-07, + "loss": 0.0107, + "reward": 1.6535483598709106, + "reward_std": 0.1936911642551422, + "rewards/accuracy_reward_stage2": 0.7785484790802002, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1297 + }, + { + "completion_length": 11.4375, + "epoch": 0.22743998598212722, + "grad_norm": 15.655251618086494, + "kl": 0.1328125, + "learning_rate": 7.727352374277202e-07, + "loss": 0.0528, + "reward": 1.3991637229919434, + "reward_std": 0.19552862644195557, + "rewards/accuracy_reward_stage2": 0.6491636633872986, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1298 + }, + { + "completion_length": 10.171875, + "epoch": 0.22761520939197477, + "grad_norm": 23.824028580200068, + "kl": 0.048095703125, + "learning_rate": 7.725600140178727e-07, + "loss": -0.0142, + "reward": 1.5, + "reward_std": 0.35824593901634216, + "rewards/accuracy_reward_stage2": 0.515625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1299 + }, + { + "completion_length": 9.296875, + "epoch": 0.22779043280182232, + "grad_norm": 9.523524049899153, + "kl": 0.007110595703125, + "learning_rate": 7.723847906080252e-07, + "loss": 0.0029, + "reward": 1.641369104385376, + "reward_std": 0.10163542628288269, + "rewards/accuracy_reward_stage2": 0.6413690447807312, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1300 + }, + { + "completion_length": 9.796875, + "epoch": 0.2279656562116699, + "grad_norm": 14.469235347826526, + "kl": 0.02587890625, + "learning_rate": 7.722095671981776e-07, + "loss": 0.0103, + "reward": 1.6671922206878662, + "reward_std": 0.16057150065898895, + "rewards/accuracy_reward_stage2": 0.6671922206878662, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1301 + }, + { + "completion_length": 14.734375, + "epoch": 0.22814087962151744, + "grad_norm": 19.157426995880886, + "kl": 0.09033203125, + "learning_rate": 7.720343437883301e-07, + "loss": -0.0081, + "reward": 1.5279631614685059, + "reward_std": 0.30399948358535767, + "rewards/accuracy_reward_stage2": 0.5435882210731506, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1302 + }, + { + "completion_length": 6.640625, + "epoch": 0.228316103031365, + "grad_norm": 19.37342779340734, + "kl": 0.1357421875, + "learning_rate": 7.718591203784826e-07, + "loss": -0.0341, + "reward": 1.626155138015747, + "reward_std": 0.21683211624622345, + "rewards/accuracy_reward_stage2": 0.7824052572250366, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1303 + }, + { + "completion_length": 10.671875, + "epoch": 0.22849132644121253, + "grad_norm": 16.792216050058997, + "kl": 0.061767578125, + "learning_rate": 7.71683896968635e-07, + "loss": -0.0508, + "reward": 1.5081243515014648, + "reward_std": 0.2527463436126709, + "rewards/accuracy_reward_stage2": 0.6643743515014648, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1304 + }, + { + "completion_length": 11.03125, + "epoch": 0.2286665498510601, + "grad_norm": 18.21345290572861, + "kl": 0.1474609375, + "learning_rate": 7.715086735587875e-07, + "loss": 0.0376, + "reward": 1.1700676679611206, + "reward_std": 0.2538905739784241, + "rewards/accuracy_reward_stage2": 0.3106927275657654, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1305 + }, + { + "completion_length": 9.78125, + "epoch": 0.22884177326090765, + "grad_norm": 25.267173688859856, + "kl": 0.1357421875, + "learning_rate": 7.713334501489399e-07, + "loss": 0.0214, + "reward": 1.5649652481079102, + "reward_std": 0.2843823730945587, + "rewards/accuracy_reward_stage2": 0.5805902481079102, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1306 + }, + { + "completion_length": 14.625, + "epoch": 0.2290169966707552, + "grad_norm": 17.060570429206486, + "kl": 0.053466796875, + "learning_rate": 7.711582267390923e-07, + "loss": -0.0228, + "reward": 1.5452286005020142, + "reward_std": 0.20523126423358917, + "rewards/accuracy_reward_stage2": 0.5608536005020142, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1307 + }, + { + "completion_length": 10.859375, + "epoch": 0.22919222008060278, + "grad_norm": 17.399015919757105, + "kl": 0.053955078125, + "learning_rate": 7.709830033292448e-07, + "loss": -0.0227, + "reward": 1.46875, + "reward_std": 0.2845909595489502, + "rewards/accuracy_reward_stage2": 0.484375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1308 + }, + { + "completion_length": 13.828125, + "epoch": 0.22936744349045032, + "grad_norm": 19.079762022572663, + "kl": 0.08447265625, + "learning_rate": 7.708077799193971e-07, + "loss": -0.0535, + "reward": 1.495568871498108, + "reward_std": 0.2711693048477173, + "rewards/accuracy_reward_stage2": 0.5268188118934631, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1309 + }, + { + "completion_length": 10.3125, + "epoch": 0.22954266690029787, + "grad_norm": 21.410291018790012, + "kl": 0.0830078125, + "learning_rate": 7.706325565095496e-07, + "loss": -0.0103, + "reward": 1.66116201877594, + "reward_std": 0.2185249626636505, + "rewards/accuracy_reward_stage2": 0.6767870187759399, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1310 + }, + { + "completion_length": 5.578125, + "epoch": 0.22971789031014545, + "grad_norm": 23.749171801536022, + "kl": 0.06787109375, + "learning_rate": 7.70457333099702e-07, + "loss": -0.0018, + "reward": 1.860271692276001, + "reward_std": 0.19402723014354706, + "rewards/accuracy_reward_stage2": 0.8758968114852905, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1311 + }, + { + "completion_length": 7.546875, + "epoch": 0.229893113719993, + "grad_norm": 16.81050162349492, + "kl": 0.050048828125, + "learning_rate": 7.702821096898545e-07, + "loss": 0.0201, + "reward": 1.6367621421813965, + "reward_std": 0.12695074081420898, + "rewards/accuracy_reward_stage2": 0.7617621421813965, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1312 + }, + { + "completion_length": 10.46875, + "epoch": 0.23006833712984054, + "grad_norm": 18.882004316200497, + "kl": 0.091796875, + "learning_rate": 7.70106886280007e-07, + "loss": 0.0368, + "reward": 1.577049970626831, + "reward_std": 0.28842049837112427, + "rewards/accuracy_reward_stage2": 0.7020500302314758, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1313 + }, + { + "completion_length": 7.609375, + "epoch": 0.2302435605396881, + "grad_norm": 18.144597604562488, + "kl": 0.0869140625, + "learning_rate": 7.699316628701594e-07, + "loss": 0.0348, + "reward": 1.497538685798645, + "reward_std": 0.23568907380104065, + "rewards/accuracy_reward_stage2": 0.49753862619400024, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1314 + }, + { + "completion_length": 7.125, + "epoch": 0.23041878394953566, + "grad_norm": 32.72252036712734, + "kl": 0.25390625, + "learning_rate": 7.697564394603119e-07, + "loss": 0.101, + "reward": 1.4892593622207642, + "reward_std": 0.09943016618490219, + "rewards/accuracy_reward_stage2": 0.6142593622207642, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1315 + }, + { + "completion_length": 14.59375, + "epoch": 0.2305940073593832, + "grad_norm": 23.14253804576249, + "kl": 0.11279296875, + "learning_rate": 7.695812160504644e-07, + "loss": 0.045, + "reward": 1.3638789653778076, + "reward_std": 0.26762983202934265, + "rewards/accuracy_reward_stage2": 0.48887893557548523, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1316 + }, + { + "completion_length": 6.953125, + "epoch": 0.23076923076923078, + "grad_norm": 16.876861475002965, + "kl": 0.04736328125, + "learning_rate": 7.694059926406168e-07, + "loss": -0.0486, + "reward": 1.484375, + "reward_std": 0.15992169082164764, + "rewards/accuracy_reward_stage2": 0.515625, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1317 + }, + { + "completion_length": 8.375, + "epoch": 0.23094445417907833, + "grad_norm": 16.856181738487386, + "kl": 0.15625, + "learning_rate": 7.692307692307693e-07, + "loss": 0.0186, + "reward": 1.605655550956726, + "reward_std": 0.2316904067993164, + "rewards/accuracy_reward_stage2": 0.7462804317474365, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1318 + }, + { + "completion_length": 8.65625, + "epoch": 0.23111967758892588, + "grad_norm": 21.496554558856914, + "kl": 0.1220703125, + "learning_rate": 7.690555458209216e-07, + "loss": 0.0047, + "reward": 1.4125896692276, + "reward_std": 0.2990317940711975, + "rewards/accuracy_reward_stage2": 0.4282146990299225, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1319 + }, + { + "completion_length": 13.046875, + "epoch": 0.23129490099877342, + "grad_norm": 22.226835762626035, + "kl": 0.1640625, + "learning_rate": 7.68880322411074e-07, + "loss": -0.0087, + "reward": 1.5795575380325317, + "reward_std": 0.34644702076911926, + "rewards/accuracy_reward_stage2": 0.6108075976371765, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1320 + }, + { + "completion_length": 9.28125, + "epoch": 0.231470124408621, + "grad_norm": 18.118326027742203, + "kl": 0.0361328125, + "learning_rate": 7.687050990012265e-07, + "loss": 0.0145, + "reward": 1.8229167461395264, + "reward_std": 0.21129511296749115, + "rewards/accuracy_reward_stage2": 0.8229166269302368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1321 + }, + { + "completion_length": 10.765625, + "epoch": 0.23164534781846854, + "grad_norm": 14.929477462062813, + "kl": 0.1298828125, + "learning_rate": 7.685298755913789e-07, + "loss": 0.0075, + "reward": 1.837185263633728, + "reward_std": 0.22572728991508484, + "rewards/accuracy_reward_stage2": 0.852810263633728, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1322 + }, + { + "completion_length": 6.34375, + "epoch": 0.2318205712283161, + "grad_norm": 16.885100313656853, + "kl": 0.040771484375, + "learning_rate": 7.683546521815314e-07, + "loss": 0.0163, + "reward": 1.641325831413269, + "reward_std": 0.2138734608888626, + "rewards/accuracy_reward_stage2": 0.641325831413269, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1323 + }, + { + "completion_length": 9.765625, + "epoch": 0.23199579463816367, + "grad_norm": 21.435876816815696, + "kl": 0.11572265625, + "learning_rate": 7.681794287716839e-07, + "loss": 0.0066, + "reward": 1.4557948112487793, + "reward_std": 0.37678366899490356, + "rewards/accuracy_reward_stage2": 0.4714197516441345, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1324 + }, + { + "completion_length": 9.640625, + "epoch": 0.2321710180480112, + "grad_norm": 19.439716800962223, + "kl": 0.04638671875, + "learning_rate": 7.680042053618363e-07, + "loss": -0.032, + "reward": 1.5691554546356201, + "reward_std": 0.2068910300731659, + "rewards/accuracy_reward_stage2": 0.6004054546356201, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1325 + }, + { + "completion_length": 7.359375, + "epoch": 0.23234624145785876, + "grad_norm": 17.288923569988533, + "kl": 0.0751953125, + "learning_rate": 7.678289819519888e-07, + "loss": -0.014, + "reward": 1.7781198024749756, + "reward_std": 0.15428794920444489, + "rewards/accuracy_reward_stage2": 0.7937447428703308, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1326 + }, + { + "completion_length": 9.0625, + "epoch": 0.23252146486770633, + "grad_norm": 17.876378399465597, + "kl": 0.1533203125, + "learning_rate": 7.676537585421412e-07, + "loss": 0.0171, + "reward": 1.6630034446716309, + "reward_std": 0.15936848521232605, + "rewards/accuracy_reward_stage2": 0.8036285042762756, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1327 + }, + { + "completion_length": 11.75, + "epoch": 0.23269668827755388, + "grad_norm": 17.121016301907396, + "kl": 0.2197265625, + "learning_rate": 7.674785351322936e-07, + "loss": 0.0876, + "reward": 1.5210437774658203, + "reward_std": 0.18795861303806305, + "rewards/accuracy_reward_stage2": 0.6460438966751099, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1328 + }, + { + "completion_length": 9.46875, + "epoch": 0.23287191168740143, + "grad_norm": 13.808690989852431, + "kl": 0.05322265625, + "learning_rate": 7.673033117224461e-07, + "loss": 0.0213, + "reward": 1.421875, + "reward_std": 0.13258251547813416, + "rewards/accuracy_reward_stage2": 0.421875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1329 + }, + { + "completion_length": 8.21875, + "epoch": 0.233047135097249, + "grad_norm": 16.41141039654268, + "kl": 0.1396484375, + "learning_rate": 7.671280883125985e-07, + "loss": 0.0561, + "reward": 1.5604474544525146, + "reward_std": 0.2582498788833618, + "rewards/accuracy_reward_stage2": 0.5604474544525146, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1330 + }, + { + "completion_length": 6.734375, + "epoch": 0.23322235850709655, + "grad_norm": 13.308302497276948, + "kl": 0.03662109375, + "learning_rate": 7.66952864902751e-07, + "loss": 0.0147, + "reward": 1.5406548976898193, + "reward_std": 0.16961881518363953, + "rewards/accuracy_reward_stage2": 0.5406548976898193, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1331 + }, + { + "completion_length": 6.015625, + "epoch": 0.2333975819169441, + "grad_norm": 18.64517313452226, + "kl": 0.060791015625, + "learning_rate": 7.667776414929035e-07, + "loss": -0.0506, + "reward": 1.6535872220993042, + "reward_std": 0.22229741513729095, + "rewards/accuracy_reward_stage2": 0.8098372220993042, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1332 + }, + { + "completion_length": 8.125, + "epoch": 0.23357280532679167, + "grad_norm": 15.161287245748868, + "kl": 0.06298828125, + "learning_rate": 7.666024180830558e-07, + "loss": -0.0138, + "reward": 1.6737689971923828, + "reward_std": 0.30542173981666565, + "rewards/accuracy_reward_stage2": 0.689393937587738, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1333 + }, + { + "completion_length": 6.5625, + "epoch": 0.23374802873663922, + "grad_norm": 19.129160927851164, + "kl": 0.169921875, + "learning_rate": 7.664271946732083e-07, + "loss": -0.0139, + "reward": 1.543736457824707, + "reward_std": 0.14306402206420898, + "rewards/accuracy_reward_stage2": 0.5749865770339966, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1334 + }, + { + "completion_length": 8.8125, + "epoch": 0.23392325214648677, + "grad_norm": 20.079367351448898, + "kl": 0.16015625, + "learning_rate": 7.662519712633607e-07, + "loss": 0.0195, + "reward": 1.2349598407745361, + "reward_std": 0.11756610125303268, + "rewards/accuracy_reward_stage2": 0.3755849003791809, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1335 + }, + { + "completion_length": 11.703125, + "epoch": 0.23409847555633434, + "grad_norm": 16.942394215976904, + "kl": 0.06396484375, + "learning_rate": 7.660767478535132e-07, + "loss": 0.0256, + "reward": 1.2687785625457764, + "reward_std": 0.22143125534057617, + "rewards/accuracy_reward_stage2": 0.5187786221504211, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1336 + }, + { + "completion_length": 10.421875, + "epoch": 0.2342736989661819, + "grad_norm": 18.10765071993869, + "kl": 0.06689453125, + "learning_rate": 7.659015244436657e-07, + "loss": -0.0175, + "reward": 1.4496071338653564, + "reward_std": 0.11517606675624847, + "rewards/accuracy_reward_stage2": 0.46523213386535645, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1337 + }, + { + "completion_length": 8.40625, + "epoch": 0.23444892237602943, + "grad_norm": 18.622226862732052, + "kl": 0.13671875, + "learning_rate": 7.65726301033818e-07, + "loss": -0.0338, + "reward": 1.515625, + "reward_std": 0.28778618574142456, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1338 + }, + { + "completion_length": 10.953125, + "epoch": 0.23462414578587698, + "grad_norm": 20.568536347605033, + "kl": 0.16796875, + "learning_rate": 7.655510776239705e-07, + "loss": 0.023, + "reward": 1.3898448944091797, + "reward_std": 0.24200965464115143, + "rewards/accuracy_reward_stage2": 0.5304698944091797, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1339 + }, + { + "completion_length": 15.25, + "epoch": 0.23479936919572456, + "grad_norm": 20.4797597642588, + "kl": 0.041259765625, + "learning_rate": 7.65375854214123e-07, + "loss": -0.0277, + "reward": 1.3560901880264282, + "reward_std": 0.1921404004096985, + "rewards/accuracy_reward_stage2": 0.3717151880264282, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1340 + }, + { + "completion_length": 9.8125, + "epoch": 0.2349745926055721, + "grad_norm": 21.944151781117228, + "kl": 0.09326171875, + "learning_rate": 7.652006308042754e-07, + "loss": -0.0051, + "reward": 1.6057844161987305, + "reward_std": 0.29269227385520935, + "rewards/accuracy_reward_stage2": 0.6214094161987305, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1341 + }, + { + "completion_length": 9.359375, + "epoch": 0.23514981601541965, + "grad_norm": 12.40325893539299, + "kl": 0.08154296875, + "learning_rate": 7.650254073944279e-07, + "loss": -0.0117, + "reward": 1.6456931829452515, + "reward_std": 0.08253457397222519, + "rewards/accuracy_reward_stage2": 0.6613181233406067, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1342 + }, + { + "completion_length": 8.234375, + "epoch": 0.23532503942526722, + "grad_norm": 19.208591877134815, + "kl": 0.1220703125, + "learning_rate": 7.648501839845803e-07, + "loss": 0.0489, + "reward": 1.7316572666168213, + "reward_std": 0.14084160327911377, + "rewards/accuracy_reward_stage2": 0.8566572070121765, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1343 + }, + { + "completion_length": 11.65625, + "epoch": 0.23550026283511477, + "grad_norm": 17.0946448588103, + "kl": 0.1279296875, + "learning_rate": 7.646749605747328e-07, + "loss": -0.0369, + "reward": 1.2201182842254639, + "reward_std": 0.23437920212745667, + "rewards/accuracy_reward_stage2": 0.37636837363243103, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1344 + }, + { + "completion_length": 6.921875, + "epoch": 0.23567548624496232, + "grad_norm": 10.852411604630273, + "kl": 0.06201171875, + "learning_rate": 7.644997371648852e-07, + "loss": 0.0248, + "reward": 1.7287862300872803, + "reward_std": 0.09362059086561203, + "rewards/accuracy_reward_stage2": 0.7287862300872803, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1345 + }, + { + "completion_length": 11.640625, + "epoch": 0.2358507096548099, + "grad_norm": 17.03273169319801, + "kl": 0.028076171875, + "learning_rate": 7.643245137550376e-07, + "loss": 0.0112, + "reward": 1.5563149452209473, + "reward_std": 0.21286053955554962, + "rewards/accuracy_reward_stage2": 0.5563148856163025, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1346 + }, + { + "completion_length": 16.9375, + "epoch": 0.23602593306465744, + "grad_norm": 23.584844649922754, + "kl": 0.1455078125, + "learning_rate": 7.641492903451901e-07, + "loss": -0.0076, + "reward": 1.402522325515747, + "reward_std": 0.36791902780532837, + "rewards/accuracy_reward_stage2": 0.43377232551574707, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1347 + }, + { + "completion_length": 9.015625, + "epoch": 0.236201156474505, + "grad_norm": 14.40621714172456, + "kl": 0.123046875, + "learning_rate": 7.639740669353425e-07, + "loss": -0.0711, + "reward": 1.4438755512237549, + "reward_std": 0.3174234628677368, + "rewards/accuracy_reward_stage2": 0.4907504916191101, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1348 + }, + { + "completion_length": 6.453125, + "epoch": 0.23637637988435256, + "grad_norm": 17.89363169126588, + "kl": 0.02734375, + "learning_rate": 7.637988435254949e-07, + "loss": 0.011, + "reward": 1.8724802732467651, + "reward_std": 0.13984158635139465, + "rewards/accuracy_reward_stage2": 0.8724802136421204, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1349 + }, + { + "completion_length": 10.96875, + "epoch": 0.2365516032942001, + "grad_norm": 20.342646184614797, + "kl": 0.041748046875, + "learning_rate": 7.636236201156474e-07, + "loss": -0.0717, + "reward": 1.6354167461395264, + "reward_std": 0.2051776647567749, + "rewards/accuracy_reward_stage2": 0.6666666269302368, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1350 + }, + { + "completion_length": 8.375, + "epoch": 0.23672682670404765, + "grad_norm": 29.274146939478232, + "kl": 0.21484375, + "learning_rate": 7.634483967057998e-07, + "loss": 0.0107, + "reward": 1.5348470211029053, + "reward_std": 0.24751737713813782, + "rewards/accuracy_reward_stage2": 0.56609708070755, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1351 + }, + { + "completion_length": 7.140625, + "epoch": 0.23690205011389523, + "grad_norm": 15.083267265104967, + "kl": 0.1201171875, + "learning_rate": 7.632731732959523e-07, + "loss": 0.048, + "reward": 1.5895814895629883, + "reward_std": 0.15502366423606873, + "rewards/accuracy_reward_stage2": 0.5895814895629883, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1352 + }, + { + "completion_length": 13.671875, + "epoch": 0.23707727352374278, + "grad_norm": 23.24766828227649, + "kl": 0.2080078125, + "learning_rate": 7.630979498861048e-07, + "loss": 0.0387, + "reward": 1.4341652393341064, + "reward_std": 0.2539823651313782, + "rewards/accuracy_reward_stage2": 0.5747902393341064, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1353 + }, + { + "completion_length": 23.40625, + "epoch": 0.23725249693359032, + "grad_norm": 18.812711379812775, + "kl": 0.05859375, + "learning_rate": 7.629227264762572e-07, + "loss": -0.0052, + "reward": 1.6507964134216309, + "reward_std": 0.12219469249248505, + "rewards/accuracy_reward_stage2": 0.6664214134216309, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1354 + }, + { + "completion_length": 12.15625, + "epoch": 0.23742772034343787, + "grad_norm": 20.974248217569382, + "kl": 0.0888671875, + "learning_rate": 7.627475030664097e-07, + "loss": 0.0355, + "reward": 1.4771928787231445, + "reward_std": 0.2657541334629059, + "rewards/accuracy_reward_stage2": 0.47719287872314453, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1355 + }, + { + "completion_length": 7.59375, + "epoch": 0.23760294375328544, + "grad_norm": 14.147872821836522, + "kl": 0.0751953125, + "learning_rate": 7.625722796565622e-07, + "loss": 0.0302, + "reward": 1.8256654739379883, + "reward_std": 0.16003847122192383, + "rewards/accuracy_reward_stage2": 0.8256654739379883, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1356 + }, + { + "completion_length": 8.4375, + "epoch": 0.237778167163133, + "grad_norm": 23.388735536574366, + "kl": 0.146484375, + "learning_rate": 7.623970562467146e-07, + "loss": -0.0669, + "reward": 1.6211612224578857, + "reward_std": 0.28670769929885864, + "rewards/accuracy_reward_stage2": 0.6680362224578857, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1357 + }, + { + "completion_length": 8.140625, + "epoch": 0.23795339057298054, + "grad_norm": 1026.974996111092, + "kl": 5.0625, + "learning_rate": 7.622218328368669e-07, + "loss": 1.9528, + "reward": 1.3219940662384033, + "reward_std": 0.2393598109483719, + "rewards/accuracy_reward_stage2": 0.4938691258430481, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1358 + }, + { + "completion_length": 11.953125, + "epoch": 0.2381286139828281, + "grad_norm": 25.40176586494032, + "kl": 0.12060546875, + "learning_rate": 7.620466094270193e-07, + "loss": -0.0242, + "reward": 1.5815457105636597, + "reward_std": 0.29680225253105164, + "rewards/accuracy_reward_stage2": 0.6127958297729492, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1359 + }, + { + "completion_length": 6.578125, + "epoch": 0.23830383739267566, + "grad_norm": 18.639160968237462, + "kl": 0.0693359375, + "learning_rate": 7.618713860171718e-07, + "loss": -0.0165, + "reward": 1.541497826576233, + "reward_std": 0.24797038733959198, + "rewards/accuracy_reward_stage2": 0.5571227669715881, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1360 + }, + { + "completion_length": 16.125, + "epoch": 0.2384790608025232, + "grad_norm": 18.559946755867603, + "kl": 0.2578125, + "learning_rate": 7.616961626073243e-07, + "loss": 0.0012, + "reward": 1.537853717803955, + "reward_std": 0.2887413203716278, + "rewards/accuracy_reward_stage2": 0.7097286581993103, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1361 + }, + { + "completion_length": 5.953125, + "epoch": 0.23865428421237078, + "grad_norm": 15.60884525772603, + "kl": 0.057373046875, + "learning_rate": 7.615209391974767e-07, + "loss": -0.0105, + "reward": 1.5885417461395264, + "reward_std": 0.24286779761314392, + "rewards/accuracy_reward_stage2": 0.6041666269302368, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1362 + }, + { + "completion_length": 9.46875, + "epoch": 0.23882950762221833, + "grad_norm": 24.530372357681458, + "kl": 0.10009765625, + "learning_rate": 7.613457157876292e-07, + "loss": -0.0649, + "reward": 1.209742784500122, + "reward_std": 0.3205549716949463, + "rewards/accuracy_reward_stage2": 0.38161781430244446, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1363 + }, + { + "completion_length": 10.71875, + "epoch": 0.23900473103206588, + "grad_norm": 18.32753281244687, + "kl": 0.1005859375, + "learning_rate": 7.611704923777817e-07, + "loss": 0.0403, + "reward": 1.592026710510254, + "reward_std": 0.1409560739994049, + "rewards/accuracy_reward_stage2": 0.7170267105102539, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1364 + }, + { + "completion_length": 10.78125, + "epoch": 0.23917995444191345, + "grad_norm": 18.624788183315577, + "kl": 0.1630859375, + "learning_rate": 7.609952689679341e-07, + "loss": -0.0905, + "reward": 1.6103694438934326, + "reward_std": 0.23033568263053894, + "rewards/accuracy_reward_stage2": 0.6728694438934326, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 1365 + }, + { + "completion_length": 11.890625, + "epoch": 0.239355177851761, + "grad_norm": 17.579634638243046, + "kl": 0.16015625, + "learning_rate": 7.608200455580866e-07, + "loss": 0.0284, + "reward": 1.3834011554718018, + "reward_std": 0.2378893941640854, + "rewards/accuracy_reward_stage2": 0.3990260362625122, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1366 + }, + { + "completion_length": 16.71875, + "epoch": 0.23953040126160854, + "grad_norm": 21.561179085008593, + "kl": 0.10107421875, + "learning_rate": 7.60644822148239e-07, + "loss": 0.0404, + "reward": 1.5523983240127563, + "reward_std": 0.19578197598457336, + "rewards/accuracy_reward_stage2": 0.5523982644081116, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1367 + }, + { + "completion_length": 6.9375, + "epoch": 0.23970562467145612, + "grad_norm": 21.208497450352066, + "kl": 0.1767578125, + "learning_rate": 7.604695987383914e-07, + "loss": 0.0009, + "reward": 1.471160650253296, + "reward_std": 0.20998351275920868, + "rewards/accuracy_reward_stage2": 0.6274106502532959, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1368 + }, + { + "completion_length": 9.859375, + "epoch": 0.23988084808130367, + "grad_norm": 15.544292327181843, + "kl": 0.11669921875, + "learning_rate": 7.602943753285439e-07, + "loss": 0.0025, + "reward": 1.3777086734771729, + "reward_std": 0.14471843838691711, + "rewards/accuracy_reward_stage2": 0.39333376288414, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1369 + }, + { + "completion_length": 12.6875, + "epoch": 0.2400560714911512, + "grad_norm": 20.253928877390308, + "kl": 0.05908203125, + "learning_rate": 7.601191519186963e-07, + "loss": 0.0236, + "reward": 1.4667612314224243, + "reward_std": 0.2103850245475769, + "rewards/accuracy_reward_stage2": 0.4667612612247467, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1370 + }, + { + "completion_length": 10.09375, + "epoch": 0.24023129490099876, + "grad_norm": 18.476012392364993, + "kl": 0.1513671875, + "learning_rate": 7.599439285088487e-07, + "loss": 0.0163, + "reward": 1.645654320716858, + "reward_std": 0.13692879676818848, + "rewards/accuracy_reward_stage2": 0.6612793803215027, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1371 + }, + { + "completion_length": 14.140625, + "epoch": 0.24040651831084633, + "grad_norm": 15.660536838371636, + "kl": 0.07177734375, + "learning_rate": 7.597687050990011e-07, + "loss": 0.0288, + "reward": 1.5812649726867676, + "reward_std": 0.17887036502361298, + "rewards/accuracy_reward_stage2": 0.5812650918960571, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1372 + }, + { + "completion_length": 10.96875, + "epoch": 0.24058174172069388, + "grad_norm": 20.22050450069256, + "kl": 0.08154296875, + "learning_rate": 7.595934816891536e-07, + "loss": 0.0088, + "reward": 1.6261334419250488, + "reward_std": 0.14943452179431915, + "rewards/accuracy_reward_stage2": 0.6417584419250488, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1373 + }, + { + "completion_length": 18.953125, + "epoch": 0.24075696513054143, + "grad_norm": 15.262480413366804, + "kl": 0.06494140625, + "learning_rate": 7.594182582793061e-07, + "loss": -0.0157, + "reward": 1.6218539476394653, + "reward_std": 0.18022483587265015, + "rewards/accuracy_reward_stage2": 0.6374789476394653, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1374 + }, + { + "completion_length": 12.796875, + "epoch": 0.240932188540389, + "grad_norm": 21.09257487877772, + "kl": 0.11962890625, + "learning_rate": 7.592430348694585e-07, + "loss": 0.0119, + "reward": 1.6718034744262695, + "reward_std": 0.36308354139328003, + "rewards/accuracy_reward_stage2": 0.68742835521698, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1375 + }, + { + "completion_length": 8.25, + "epoch": 0.24110741195023655, + "grad_norm": 19.02651654352321, + "kl": 0.059814453125, + "learning_rate": 7.59067811459611e-07, + "loss": 0.0239, + "reward": 1.585327386856079, + "reward_std": 0.21454137563705444, + "rewards/accuracy_reward_stage2": 0.5853273868560791, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1376 + }, + { + "completion_length": 9.0, + "epoch": 0.2412826353600841, + "grad_norm": 19.488328207175734, + "kl": 0.0927734375, + "learning_rate": 7.588925880497635e-07, + "loss": -0.0065, + "reward": 1.418020486831665, + "reward_std": 0.3297243118286133, + "rewards/accuracy_reward_stage2": 0.5586454272270203, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1377 + }, + { + "completion_length": 11.3125, + "epoch": 0.24145785876993167, + "grad_norm": 21.364367900934393, + "kl": 0.244140625, + "learning_rate": 7.587173646399158e-07, + "loss": 0.0623, + "reward": 1.37074613571167, + "reward_std": 0.20727571845054626, + "rewards/accuracy_reward_stage2": 0.5113711357116699, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1378 + }, + { + "completion_length": 16.78125, + "epoch": 0.24163308217977922, + "grad_norm": 15.906057313581638, + "kl": 0.018310546875, + "learning_rate": 7.585421412300683e-07, + "loss": 0.0073, + "reward": 1.6640890836715698, + "reward_std": 0.12738674879074097, + "rewards/accuracy_reward_stage2": 0.7890890836715698, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1379 + }, + { + "completion_length": 7.4375, + "epoch": 0.24180830558962677, + "grad_norm": 17.62827161364021, + "kl": 0.111328125, + "learning_rate": 7.583669178202208e-07, + "loss": 0.0005, + "reward": 1.6967413425445557, + "reward_std": 0.13932161033153534, + "rewards/accuracy_reward_stage2": 0.7123663425445557, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1380 + }, + { + "completion_length": 13.5, + "epoch": 0.24198352899947434, + "grad_norm": 24.47590495691864, + "kl": 0.232421875, + "learning_rate": 7.581916944103732e-07, + "loss": 0.0992, + "reward": 1.4481374025344849, + "reward_std": 0.3019064962863922, + "rewards/accuracy_reward_stage2": 0.5731374621391296, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1381 + }, + { + "completion_length": 8.296875, + "epoch": 0.2421587524093219, + "grad_norm": 27.811506620771162, + "kl": 0.1357421875, + "learning_rate": 7.580164710005257e-07, + "loss": 0.0543, + "reward": 1.6210970878601074, + "reward_std": 0.22113262116909027, + "rewards/accuracy_reward_stage2": 0.6210970878601074, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1382 + }, + { + "completion_length": 14.390625, + "epoch": 0.24233397581916943, + "grad_norm": 16.569060349237095, + "kl": 0.052734375, + "learning_rate": 7.578412475906781e-07, + "loss": 0.0211, + "reward": 1.3238930702209473, + "reward_std": 0.1760530024766922, + "rewards/accuracy_reward_stage2": 0.3238930106163025, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1383 + }, + { + "completion_length": 14.671875, + "epoch": 0.242509199229017, + "grad_norm": 9.046751874397618, + "kl": 0.02685546875, + "learning_rate": 7.576660241808305e-07, + "loss": -0.0334, + "reward": 1.515625, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.53125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1384 + }, + { + "completion_length": 12.09375, + "epoch": 0.24268442263886456, + "grad_norm": 18.023372522650554, + "kl": 0.072265625, + "learning_rate": 7.57490800770983e-07, + "loss": 0.0289, + "reward": 1.1849263906478882, + "reward_std": 0.14363038539886475, + "rewards/accuracy_reward_stage2": 0.4349263310432434, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1385 + }, + { + "completion_length": 10.625, + "epoch": 0.2428596460487121, + "grad_norm": 18.403407774950747, + "kl": 0.138671875, + "learning_rate": 7.573155773611354e-07, + "loss": 0.0259, + "reward": 1.6670591831207275, + "reward_std": 0.20200037956237793, + "rewards/accuracy_reward_stage2": 0.682684063911438, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1386 + }, + { + "completion_length": 13.890625, + "epoch": 0.24303486945855968, + "grad_norm": 20.79383527590382, + "kl": 0.357421875, + "learning_rate": 7.571403539512879e-07, + "loss": 0.15, + "reward": 1.3685318231582642, + "reward_std": 0.25985872745513916, + "rewards/accuracy_reward_stage2": 0.6185318231582642, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1387 + }, + { + "completion_length": 10.03125, + "epoch": 0.24321009286840722, + "grad_norm": 18.86950437113182, + "kl": 0.126953125, + "learning_rate": 7.569651305414402e-07, + "loss": 0.0067, + "reward": 1.7178502082824707, + "reward_std": 0.1606583297252655, + "rewards/accuracy_reward_stage2": 0.7334751486778259, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1388 + }, + { + "completion_length": 6.796875, + "epoch": 0.24338531627825477, + "grad_norm": 22.82460816866117, + "kl": 0.1572265625, + "learning_rate": 7.567899071315927e-07, + "loss": 0.0632, + "reward": 1.465291976928711, + "reward_std": 0.38037338852882385, + "rewards/accuracy_reward_stage2": 0.46529191732406616, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1389 + }, + { + "completion_length": 10.1875, + "epoch": 0.24356053968810232, + "grad_norm": 24.642991253870704, + "kl": 0.11279296875, + "learning_rate": 7.566146837217452e-07, + "loss": 0.0009, + "reward": 1.7565643787384033, + "reward_std": 0.19197356700897217, + "rewards/accuracy_reward_stage2": 0.8971893787384033, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1390 + }, + { + "completion_length": 11.515625, + "epoch": 0.2437357630979499, + "grad_norm": 20.364150468143993, + "kl": 0.29296875, + "learning_rate": 7.564394603118976e-07, + "loss": 0.0728, + "reward": 1.351882815361023, + "reward_std": 0.29016733169555664, + "rewards/accuracy_reward_stage2": 0.4925077557563782, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1391 + }, + { + "completion_length": 13.09375, + "epoch": 0.24391098650779744, + "grad_norm": 20.290293173883178, + "kl": 0.345703125, + "learning_rate": 7.562642369020501e-07, + "loss": 0.1379, + "reward": 1.0814404487609863, + "reward_std": 0.20886757969856262, + "rewards/accuracy_reward_stage2": 0.45644041895866394, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1392 + }, + { + "completion_length": 9.453125, + "epoch": 0.244086209917645, + "grad_norm": 19.97602043621162, + "kl": 0.296875, + "learning_rate": 7.560890134922026e-07, + "loss": 0.0348, + "reward": 1.3737891912460327, + "reward_std": 0.2441052496433258, + "rewards/accuracy_reward_stage2": 0.5456641912460327, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1393 + }, + { + "completion_length": 9.953125, + "epoch": 0.24426143332749256, + "grad_norm": 21.983461650003445, + "kl": 0.083984375, + "learning_rate": 7.55913790082355e-07, + "loss": 0.0334, + "reward": 1.4830906391143799, + "reward_std": 0.21525192260742188, + "rewards/accuracy_reward_stage2": 0.6080905795097351, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1394 + }, + { + "completion_length": 8.1875, + "epoch": 0.2444366567373401, + "grad_norm": 21.998258110612007, + "kl": 0.1337890625, + "learning_rate": 7.557385666725075e-07, + "loss": 0.0248, + "reward": 1.452605128288269, + "reward_std": 0.28352901339530945, + "rewards/accuracy_reward_stage2": 0.46823009848594666, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1395 + }, + { + "completion_length": 7.84375, + "epoch": 0.24461188014718765, + "grad_norm": 19.909572146545123, + "kl": 0.08056640625, + "learning_rate": 7.555633432626598e-07, + "loss": 0.0323, + "reward": 1.5185023546218872, + "reward_std": 0.2108551412820816, + "rewards/accuracy_reward_stage2": 0.5185023546218872, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1396 + }, + { + "completion_length": 9.53125, + "epoch": 0.24478710355703523, + "grad_norm": 12.74340987455773, + "kl": 0.0174560546875, + "learning_rate": 7.553881198528122e-07, + "loss": 0.007, + "reward": 1.3910613059997559, + "reward_std": 0.15033581852912903, + "rewards/accuracy_reward_stage2": 0.5160612463951111, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1397 + }, + { + "completion_length": 10.5, + "epoch": 0.24496232696688278, + "grad_norm": 18.40246201674656, + "kl": 0.11181640625, + "learning_rate": 7.552128964429647e-07, + "loss": -0.026, + "reward": 1.5389893054962158, + "reward_std": 0.20025639235973358, + "rewards/accuracy_reward_stage2": 0.570239245891571, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1398 + }, + { + "completion_length": 10.296875, + "epoch": 0.24513755037673032, + "grad_norm": 13.009913705130106, + "kl": 0.055419921875, + "learning_rate": 7.550376730331171e-07, + "loss": 0.0222, + "reward": 1.4153332710266113, + "reward_std": 0.13312244415283203, + "rewards/accuracy_reward_stage2": 0.41533327102661133, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1399 + }, + { + "completion_length": 8.1875, + "epoch": 0.2453127737865779, + "grad_norm": 18.3058249826422, + "kl": 0.032470703125, + "learning_rate": 7.548624496232696e-07, + "loss": -0.0312, + "reward": 1.7342438697814941, + "reward_std": 0.2937045693397522, + "rewards/accuracy_reward_stage2": 0.7498688697814941, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1400 + }, + { + "completion_length": 9.171875, + "epoch": 0.24548799719642544, + "grad_norm": 23.01036824762215, + "kl": 0.052001953125, + "learning_rate": 7.546872262134221e-07, + "loss": 0.0208, + "reward": 1.603689432144165, + "reward_std": 0.320762038230896, + "rewards/accuracy_reward_stage2": 0.6036894917488098, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1401 + }, + { + "completion_length": 12.171875, + "epoch": 0.245663220606273, + "grad_norm": 19.966849327865962, + "kl": 0.05224609375, + "learning_rate": 7.545120028035745e-07, + "loss": 0.0209, + "reward": 1.2532411813735962, + "reward_std": 0.23318162560462952, + "rewards/accuracy_reward_stage2": 0.3782411813735962, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1402 + }, + { + "completion_length": 11.25, + "epoch": 0.24583844401612057, + "grad_norm": 32.40288041494598, + "kl": 0.1103515625, + "learning_rate": 7.54336779393727e-07, + "loss": 0.0442, + "reward": 1.524993658065796, + "reward_std": 0.18178695440292358, + "rewards/accuracy_reward_stage2": 0.6499937176704407, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1403 + }, + { + "completion_length": 12.96875, + "epoch": 0.2460136674259681, + "grad_norm": 13.635345821381598, + "kl": 0.0189208984375, + "learning_rate": 7.541615559838794e-07, + "loss": 0.0076, + "reward": 1.600611925125122, + "reward_std": 0.11209513992071152, + "rewards/accuracy_reward_stage2": 0.6006119251251221, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1404 + }, + { + "completion_length": 7.671875, + "epoch": 0.24618889083581566, + "grad_norm": 16.000875268863858, + "kl": 0.193359375, + "learning_rate": 7.539863325740319e-07, + "loss": 0.0774, + "reward": 1.7346065044403076, + "reward_std": 0.09460826218128204, + "rewards/accuracy_reward_stage2": 0.8596064448356628, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1405 + }, + { + "completion_length": 8.84375, + "epoch": 0.2463641142456632, + "grad_norm": 16.47259843018963, + "kl": 0.08203125, + "learning_rate": 7.538111091641844e-07, + "loss": 0.0327, + "reward": 1.893869161605835, + "reward_std": 0.1298605501651764, + "rewards/accuracy_reward_stage2": 0.8938692212104797, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1406 + }, + { + "completion_length": 15.265625, + "epoch": 0.24653933765551078, + "grad_norm": 17.276454759130427, + "kl": 0.0546875, + "learning_rate": 7.536358857543368e-07, + "loss": -0.0223, + "reward": 1.3059229850769043, + "reward_std": 0.17960919439792633, + "rewards/accuracy_reward_stage2": 0.3215479254722595, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1407 + }, + { + "completion_length": 11.796875, + "epoch": 0.24671456106535833, + "grad_norm": 23.06353585461065, + "kl": 0.061767578125, + "learning_rate": 7.534606623444892e-07, + "loss": -0.0092, + "reward": 1.5690124034881592, + "reward_std": 0.2022152692079544, + "rewards/accuracy_reward_stage2": 0.5846374034881592, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1408 + }, + { + "completion_length": 10.28125, + "epoch": 0.24688978447520588, + "grad_norm": 16.828407269287137, + "kl": 0.0306396484375, + "learning_rate": 7.532854389346416e-07, + "loss": -0.0319, + "reward": 1.7287945747375488, + "reward_std": 0.23023012280464172, + "rewards/accuracy_reward_stage2": 0.7444195747375488, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1409 + }, + { + "completion_length": 16.59375, + "epoch": 0.24706500788505345, + "grad_norm": 15.25377782281171, + "kl": 0.10546875, + "learning_rate": 7.53110215524794e-07, + "loss": 0.0423, + "reward": 1.3533527851104736, + "reward_std": 0.13732056319713593, + "rewards/accuracy_reward_stage2": 0.47835278511047363, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1410 + }, + { + "completion_length": 12.125, + "epoch": 0.247240231294901, + "grad_norm": 21.936185903908854, + "kl": 0.06982421875, + "learning_rate": 7.529349921149465e-07, + "loss": 0.028, + "reward": 1.7008384466171265, + "reward_std": 0.2988817095756531, + "rewards/accuracy_reward_stage2": 0.7008384466171265, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1411 + }, + { + "completion_length": 4.3125, + "epoch": 0.24741545470474854, + "grad_norm": 25.73348775481858, + "kl": 0.232421875, + "learning_rate": 7.527597687050989e-07, + "loss": 0.0256, + "reward": 1.4827898740768433, + "reward_std": 0.125931054353714, + "rewards/accuracy_reward_stage2": 0.5140398740768433, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1412 + }, + { + "completion_length": 11.59375, + "epoch": 0.24759067811459612, + "grad_norm": 21.094142841819078, + "kl": 0.11376953125, + "learning_rate": 7.525845452952514e-07, + "loss": 0.0141, + "reward": 1.4062385559082031, + "reward_std": 0.16928933560848236, + "rewards/accuracy_reward_stage2": 0.4218636453151703, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1413 + }, + { + "completion_length": 11.296875, + "epoch": 0.24776590152444367, + "grad_norm": 19.519594526717743, + "kl": 0.0732421875, + "learning_rate": 7.524093218854039e-07, + "loss": 0.0292, + "reward": 1.7651951313018799, + "reward_std": 0.2411029040813446, + "rewards/accuracy_reward_stage2": 0.7651951909065247, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1414 + }, + { + "completion_length": 10.484375, + "epoch": 0.2479411249342912, + "grad_norm": 16.276921952277196, + "kl": 0.072265625, + "learning_rate": 7.522340984755563e-07, + "loss": -0.0152, + "reward": 1.505523920059204, + "reward_std": 0.2533569037914276, + "rewards/accuracy_reward_stage2": 0.5211489796638489, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1415 + }, + { + "completion_length": 6.9375, + "epoch": 0.2481163483441388, + "grad_norm": 17.29022823380657, + "kl": 0.0400390625, + "learning_rate": 7.520588750657088e-07, + "loss": -0.0611, + "reward": 1.5840046405792236, + "reward_std": 0.29173704981803894, + "rewards/accuracy_reward_stage2": 0.6152546405792236, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1416 + }, + { + "completion_length": 9.0, + "epoch": 0.24829157175398633, + "grad_norm": 20.546807374945057, + "kl": 0.08203125, + "learning_rate": 7.518836516558613e-07, + "loss": -0.0403, + "reward": 1.6551421880722046, + "reward_std": 0.2729400098323822, + "rewards/accuracy_reward_stage2": 0.6863921880722046, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1417 + }, + { + "completion_length": 10.171875, + "epoch": 0.24846679516383388, + "grad_norm": 16.351239914609064, + "kl": 0.095703125, + "learning_rate": 7.517084282460136e-07, + "loss": 0.0383, + "reward": 1.6802784204483032, + "reward_std": 0.20798176527023315, + "rewards/accuracy_reward_stage2": 0.6802783608436584, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1418 + }, + { + "completion_length": 11.15625, + "epoch": 0.24864201857368146, + "grad_norm": 15.355436275669865, + "kl": 0.09716796875, + "learning_rate": 7.515332048361661e-07, + "loss": 0.0389, + "reward": 1.7547039985656738, + "reward_std": 0.19607709348201752, + "rewards/accuracy_reward_stage2": 0.7547039985656738, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1419 + }, + { + "completion_length": 14.3125, + "epoch": 0.248817241983529, + "grad_norm": 26.208048459448875, + "kl": 0.14453125, + "learning_rate": 7.513579814263185e-07, + "loss": 0.0579, + "reward": 1.5602327585220337, + "reward_std": 0.19080065190792084, + "rewards/accuracy_reward_stage2": 0.6852326989173889, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1420 + }, + { + "completion_length": 6.875, + "epoch": 0.24899246539337655, + "grad_norm": 20.93084846517713, + "kl": 0.0439453125, + "learning_rate": 7.51182758016471e-07, + "loss": -0.0266, + "reward": 1.5314676761627197, + "reward_std": 0.1645144373178482, + "rewards/accuracy_reward_stage2": 0.5470925569534302, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1421 + }, + { + "completion_length": 12.65625, + "epoch": 0.24916768880322412, + "grad_norm": 22.8515591698515, + "kl": 0.06689453125, + "learning_rate": 7.510075346066234e-07, + "loss": -0.0175, + "reward": 1.425929069519043, + "reward_std": 0.34283581376075745, + "rewards/accuracy_reward_stage2": 0.44155409932136536, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1422 + }, + { + "completion_length": 10.265625, + "epoch": 0.24934291221307167, + "grad_norm": 20.10466292441384, + "kl": 0.119140625, + "learning_rate": 7.508323111967758e-07, + "loss": 0.0308, + "reward": 1.6480023860931396, + "reward_std": 0.2787425220012665, + "rewards/accuracy_reward_stage2": 0.6636273860931396, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1423 + }, + { + "completion_length": 7.53125, + "epoch": 0.24951813562291922, + "grad_norm": 18.74219789011929, + "kl": 0.181640625, + "learning_rate": 7.506570877869283e-07, + "loss": -0.0252, + "reward": 1.5641932487487793, + "reward_std": 0.1766408234834671, + "rewards/accuracy_reward_stage2": 0.6110682487487793, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1424 + }, + { + "completion_length": 12.25, + "epoch": 0.24969335903276677, + "grad_norm": 21.74483115845773, + "kl": 0.11572265625, + "learning_rate": 7.504818643770808e-07, + "loss": 0.0464, + "reward": 1.3805147409439087, + "reward_std": 0.08259022235870361, + "rewards/accuracy_reward_stage2": 0.5055146813392639, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1425 + }, + { + "completion_length": 7.53125, + "epoch": 0.24986858244261434, + "grad_norm": 28.82118561441724, + "kl": 0.25390625, + "learning_rate": 7.503066409672332e-07, + "loss": 0.0226, + "reward": 1.5050541162490845, + "reward_std": 0.23860237002372742, + "rewards/accuracy_reward_stage2": 0.5363041162490845, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1426 + }, + { + "completion_length": 15.484375, + "epoch": 0.2500438058524619, + "grad_norm": 14.751929645062313, + "kl": 0.029541015625, + "learning_rate": 7.501314175573856e-07, + "loss": 0.0118, + "reward": 1.4637627601623535, + "reward_std": 0.1221727728843689, + "rewards/accuracy_reward_stage2": 0.46376264095306396, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1427 + }, + { + "completion_length": 9.953125, + "epoch": 0.25021902926230943, + "grad_norm": 20.848579861484406, + "kl": 0.06298828125, + "learning_rate": 7.49956194147538e-07, + "loss": 0.0251, + "reward": 1.6132272481918335, + "reward_std": 0.24379153549671173, + "rewards/accuracy_reward_stage2": 0.6132272481918335, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1428 + }, + { + "completion_length": 10.484375, + "epoch": 0.250394252672157, + "grad_norm": 12.875223980001039, + "kl": 0.04638671875, + "learning_rate": 7.497809707376905e-07, + "loss": -0.0246, + "reward": 1.5445420742034912, + "reward_std": 0.10616233944892883, + "rewards/accuracy_reward_stage2": 0.5601670742034912, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1429 + }, + { + "completion_length": 11.40625, + "epoch": 0.2505694760820046, + "grad_norm": 20.47718883591059, + "kl": 0.1630859375, + "learning_rate": 7.49605747327843e-07, + "loss": 0.0323, + "reward": 1.5031461715698242, + "reward_std": 0.21613532304763794, + "rewards/accuracy_reward_stage2": 0.6437711715698242, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1430 + }, + { + "completion_length": 7.171875, + "epoch": 0.25074469949185213, + "grad_norm": 16.85359317480518, + "kl": 0.0830078125, + "learning_rate": 7.494305239179954e-07, + "loss": 0.0333, + "reward": 1.4066383838653564, + "reward_std": 0.13117088377475739, + "rewards/accuracy_reward_stage2": 0.40663841366767883, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1431 + }, + { + "completion_length": 8.234375, + "epoch": 0.2509199229016997, + "grad_norm": 16.17823877879919, + "kl": 0.02392578125, + "learning_rate": 7.492553005081479e-07, + "loss": 0.0096, + "reward": 1.5708149671554565, + "reward_std": 0.175631582736969, + "rewards/accuracy_reward_stage2": 0.5708150267601013, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1432 + }, + { + "completion_length": 10.921875, + "epoch": 0.2510951463115472, + "grad_norm": 26.57272633941125, + "kl": 0.06591796875, + "learning_rate": 7.490800770983004e-07, + "loss": 0.0264, + "reward": 1.4339896440505981, + "reward_std": 0.2509709894657135, + "rewards/accuracy_reward_stage2": 0.43398961424827576, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1433 + }, + { + "completion_length": 9.953125, + "epoch": 0.25127036972139477, + "grad_norm": 17.38723444203358, + "kl": 0.1396484375, + "learning_rate": 7.489048536884528e-07, + "loss": 0.056, + "reward": 1.7741584777832031, + "reward_std": 0.19155940413475037, + "rewards/accuracy_reward_stage2": 0.7741584181785583, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1434 + }, + { + "completion_length": 6.390625, + "epoch": 0.2514455931312423, + "grad_norm": 28.928724087604284, + "kl": 0.240234375, + "learning_rate": 7.487296302786052e-07, + "loss": 0.0556, + "reward": 1.4168357849121094, + "reward_std": 0.19237574934959412, + "rewards/accuracy_reward_stage2": 0.4480857849121094, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1435 + }, + { + "completion_length": 32.203125, + "epoch": 0.25162081654108986, + "grad_norm": 17.414773448675557, + "kl": 0.07373046875, + "learning_rate": 7.485544068687576e-07, + "loss": 0.0295, + "reward": 1.1635406017303467, + "reward_std": 0.22368814051151276, + "rewards/accuracy_reward_stage2": 0.4135405719280243, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1436 + }, + { + "completion_length": 12.75, + "epoch": 0.25179603995093747, + "grad_norm": 14.257635558313762, + "kl": 0.07275390625, + "learning_rate": 7.4837918345891e-07, + "loss": 0.0291, + "reward": 1.6434564590454102, + "reward_std": 0.10967773199081421, + "rewards/accuracy_reward_stage2": 0.6434564590454102, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1437 + }, + { + "completion_length": 8.65625, + "epoch": 0.251971263360785, + "grad_norm": 23.17077825610406, + "kl": 0.166015625, + "learning_rate": 7.482039600490625e-07, + "loss": -0.0219, + "reward": 1.4026463031768799, + "reward_std": 0.28032857179641724, + "rewards/accuracy_reward_stage2": 0.4495212733745575, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1438 + }, + { + "completion_length": 12.28125, + "epoch": 0.25214648677063256, + "grad_norm": 24.845948232527043, + "kl": 0.11962890625, + "learning_rate": 7.480287366392149e-07, + "loss": 0.0477, + "reward": 1.4077582359313965, + "reward_std": 0.24148832261562347, + "rewards/accuracy_reward_stage2": 0.5327582359313965, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1439 + }, + { + "completion_length": 16.875, + "epoch": 0.2523217101804801, + "grad_norm": 19.20260433661532, + "kl": 0.048828125, + "learning_rate": 7.478535132293674e-07, + "loss": 0.0196, + "reward": 1.3172272443771362, + "reward_std": 0.14055949449539185, + "rewards/accuracy_reward_stage2": 0.31722724437713623, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1440 + }, + { + "completion_length": 11.921875, + "epoch": 0.25249693359032765, + "grad_norm": 19.72320501947569, + "kl": 0.099609375, + "learning_rate": 7.476782898195199e-07, + "loss": 0.0001, + "reward": 1.4920825958251953, + "reward_std": 0.2956010699272156, + "rewards/accuracy_reward_stage2": 0.5077076554298401, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1441 + }, + { + "completion_length": 7.734375, + "epoch": 0.2526721570001752, + "grad_norm": 19.618789319973512, + "kl": 0.07763671875, + "learning_rate": 7.475030664096723e-07, + "loss": -0.0043, + "reward": 1.4340300559997559, + "reward_std": 0.2557186782360077, + "rewards/accuracy_reward_stage2": 0.5746550559997559, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1442 + }, + { + "completion_length": 12.609375, + "epoch": 0.2528473804100228, + "grad_norm": 17.932373270098704, + "kl": 0.068359375, + "learning_rate": 7.473278429998248e-07, + "loss": -0.0061, + "reward": 1.3050273656845093, + "reward_std": 0.2368200719356537, + "rewards/accuracy_reward_stage2": 0.3206523656845093, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1443 + }, + { + "completion_length": 10.3125, + "epoch": 0.25302260381987035, + "grad_norm": 17.592314152355126, + "kl": 0.015625, + "learning_rate": 7.471526195899772e-07, + "loss": 0.0063, + "reward": 1.6176997423171997, + "reward_std": 0.23842398822307587, + "rewards/accuracy_reward_stage2": 0.6176997423171997, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1444 + }, + { + "completion_length": 7.828125, + "epoch": 0.2531978272297179, + "grad_norm": 17.54833559818638, + "kl": 0.0625, + "learning_rate": 7.469773961801297e-07, + "loss": -0.0066, + "reward": 1.583640456199646, + "reward_std": 0.2253027856349945, + "rewards/accuracy_reward_stage2": 0.599265456199646, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1445 + }, + { + "completion_length": 7.34375, + "epoch": 0.25337305063956544, + "grad_norm": 18.76032144403276, + "kl": 0.02783203125, + "learning_rate": 7.468021727702822e-07, + "loss": -0.033, + "reward": 1.8142331838607788, + "reward_std": 0.22606132924556732, + "rewards/accuracy_reward_stage2": 0.829858124256134, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1446 + }, + { + "completion_length": 13.171875, + "epoch": 0.253548274049413, + "grad_norm": 19.281481149835077, + "kl": 0.21875, + "learning_rate": 7.466269493604344e-07, + "loss": 0.012, + "reward": 1.2332404851913452, + "reward_std": 0.2107694447040558, + "rewards/accuracy_reward_stage2": 0.3894904851913452, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1447 + }, + { + "completion_length": 12.515625, + "epoch": 0.25372349745926054, + "grad_norm": 16.387474225377876, + "kl": 0.06005859375, + "learning_rate": 7.464517259505869e-07, + "loss": -0.0201, + "reward": 1.6585381031036377, + "reward_std": 0.18613138794898987, + "rewards/accuracy_reward_stage2": 0.6741631031036377, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1448 + }, + { + "completion_length": 8.734375, + "epoch": 0.2538987208691081, + "grad_norm": 23.012981197680062, + "kl": 0.1591796875, + "learning_rate": 7.462765025407393e-07, + "loss": 0.0196, + "reward": 1.4184439182281494, + "reward_std": 0.311229944229126, + "rewards/accuracy_reward_stage2": 0.5590689182281494, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1449 + }, + { + "completion_length": 12.015625, + "epoch": 0.2540739442789557, + "grad_norm": 20.388117986982774, + "kl": 0.0966796875, + "learning_rate": 7.461012791308918e-07, + "loss": 0.0387, + "reward": 1.2674095630645752, + "reward_std": 0.179796040058136, + "rewards/accuracy_reward_stage2": 0.3924095034599304, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1450 + }, + { + "completion_length": 11.859375, + "epoch": 0.25424916768880323, + "grad_norm": 23.739127987190525, + "kl": 0.08740234375, + "learning_rate": 7.459260557210443e-07, + "loss": -0.0093, + "reward": 1.3971586227416992, + "reward_std": 0.24291972815990448, + "rewards/accuracy_reward_stage2": 0.4127836227416992, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1451 + }, + { + "completion_length": 9.953125, + "epoch": 0.2544243910986508, + "grad_norm": 18.38959476405202, + "kl": 0.04541015625, + "learning_rate": 7.457508323111967e-07, + "loss": 0.0182, + "reward": 1.6254558563232422, + "reward_std": 0.16483592987060547, + "rewards/accuracy_reward_stage2": 0.750455915927887, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1452 + }, + { + "completion_length": 8.953125, + "epoch": 0.25459961450849833, + "grad_norm": 18.41502038751911, + "kl": 0.07080078125, + "learning_rate": 7.455756089013492e-07, + "loss": -0.0368, + "reward": 1.500500202178955, + "reward_std": 0.23161879181861877, + "rewards/accuracy_reward_stage2": 0.5317501425743103, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1453 + }, + { + "completion_length": 8.484375, + "epoch": 0.2547748379183459, + "grad_norm": 21.173867795926054, + "kl": 0.080078125, + "learning_rate": 7.454003854915017e-07, + "loss": 0.032, + "reward": 1.5174919366836548, + "reward_std": 0.19462689757347107, + "rewards/accuracy_reward_stage2": 0.5174919962882996, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1454 + }, + { + "completion_length": 10.890625, + "epoch": 0.2549500613281934, + "grad_norm": 24.184911493436093, + "kl": 0.1875, + "learning_rate": 7.452251620816541e-07, + "loss": 0.0846, + "reward": 1.4352514743804932, + "reward_std": 0.25682583451271057, + "rewards/accuracy_reward_stage2": 0.5602514743804932, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1455 + }, + { + "completion_length": 17.625, + "epoch": 0.255125284738041, + "grad_norm": 14.765625590550027, + "kl": 0.0595703125, + "learning_rate": 7.450499386718066e-07, + "loss": 0.011, + "reward": 1.5301792621612549, + "reward_std": 0.14312390983104706, + "rewards/accuracy_reward_stage2": 0.6708042025566101, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1456 + }, + { + "completion_length": 14.546875, + "epoch": 0.25530050814788857, + "grad_norm": 26.303298887600445, + "kl": 0.12109375, + "learning_rate": 7.448747152619589e-07, + "loss": 0.0105, + "reward": 1.30367112159729, + "reward_std": 0.248937726020813, + "rewards/accuracy_reward_stage2": 0.5692960619926453, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1457 + }, + { + "completion_length": 10.71875, + "epoch": 0.2554757315577361, + "grad_norm": 15.87483540806158, + "kl": 0.279296875, + "learning_rate": 7.446994918521114e-07, + "loss": 0.0236, + "reward": 1.452156662940979, + "reward_std": 0.25209495425224304, + "rewards/accuracy_reward_stage2": 0.608406662940979, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1458 + }, + { + "completion_length": 23.171875, + "epoch": 0.25565095496758367, + "grad_norm": 16.989660761895617, + "kl": 0.10400390625, + "learning_rate": 7.445242684422639e-07, + "loss": -0.0027, + "reward": 1.3787615299224854, + "reward_std": 0.1453903168439865, + "rewards/accuracy_reward_stage2": 0.5193865299224854, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1459 + }, + { + "completion_length": 18.28125, + "epoch": 0.2558261783774312, + "grad_norm": 17.646162678744076, + "kl": 0.04736328125, + "learning_rate": 7.443490450324162e-07, + "loss": 0.0189, + "reward": 1.3753581047058105, + "reward_std": 0.16490060091018677, + "rewards/accuracy_reward_stage2": 0.37535810470581055, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1460 + }, + { + "completion_length": 11.796875, + "epoch": 0.25600140178727876, + "grad_norm": 17.721008191652405, + "kl": 0.09423828125, + "learning_rate": 7.441738216225687e-07, + "loss": 0.0376, + "reward": 1.703101396560669, + "reward_std": 0.1289398968219757, + "rewards/accuracy_reward_stage2": 0.8281015157699585, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1461 + }, + { + "completion_length": 7.8125, + "epoch": 0.25617662519712636, + "grad_norm": 18.96483793561938, + "kl": 0.031494140625, + "learning_rate": 7.439985982127212e-07, + "loss": 0.0126, + "reward": 1.644444465637207, + "reward_std": 0.18405601382255554, + "rewards/accuracy_reward_stage2": 0.644444465637207, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1462 + }, + { + "completion_length": 13.390625, + "epoch": 0.2563518486069739, + "grad_norm": 17.659693979455977, + "kl": 0.0439453125, + "learning_rate": 7.438233748028736e-07, + "loss": -0.0195, + "reward": 1.7559140920639038, + "reward_std": 0.2098643183708191, + "rewards/accuracy_reward_stage2": 0.7715390920639038, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1463 + }, + { + "completion_length": 5.984375, + "epoch": 0.25652707201682146, + "grad_norm": 15.24752737199156, + "kl": 0.07763671875, + "learning_rate": 7.436481513930261e-07, + "loss": -0.0131, + "reward": 1.7054529190063477, + "reward_std": 0.20000435411930084, + "rewards/accuracy_reward_stage2": 0.7210779786109924, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1464 + }, + { + "completion_length": 10.046875, + "epoch": 0.256702295426669, + "grad_norm": 19.832224778307896, + "kl": 0.053955078125, + "learning_rate": 7.434729279831785e-07, + "loss": 0.0215, + "reward": 1.7961781024932861, + "reward_std": 0.22936266660690308, + "rewards/accuracy_reward_stage2": 0.7961781024932861, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1465 + }, + { + "completion_length": 12.28125, + "epoch": 0.25687751883651655, + "grad_norm": 18.569052563883726, + "kl": 0.0830078125, + "learning_rate": 7.43297704573331e-07, + "loss": -0.0106, + "reward": 1.6345622539520264, + "reward_std": 0.22242167592048645, + "rewards/accuracy_reward_stage2": 0.6501872539520264, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1466 + }, + { + "completion_length": 9.953125, + "epoch": 0.2570527422463641, + "grad_norm": 19.164318358894214, + "kl": 0.130859375, + "learning_rate": 7.431224811634834e-07, + "loss": -0.0306, + "reward": 1.5217496156692505, + "reward_std": 0.26683294773101807, + "rewards/accuracy_reward_stage2": 0.5529996156692505, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1467 + }, + { + "completion_length": 9.828125, + "epoch": 0.25722796565621164, + "grad_norm": 23.655118472656955, + "kl": 0.11376953125, + "learning_rate": 7.429472577536358e-07, + "loss": 0.026, + "reward": 1.4828197956085205, + "reward_std": 0.2148694396018982, + "rewards/accuracy_reward_stage2": 0.4984448552131653, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1468 + }, + { + "completion_length": 12.0625, + "epoch": 0.25740318906605925, + "grad_norm": 19.835704810324053, + "kl": 0.09912109375, + "learning_rate": 7.427720343437883e-07, + "loss": 0.0397, + "reward": 1.1822917461395264, + "reward_std": 0.3223046064376831, + "rewards/accuracy_reward_stage2": 0.4322916865348816, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1469 + }, + { + "completion_length": 8.0625, + "epoch": 0.2575784124759068, + "grad_norm": 16.988692252928395, + "kl": 0.15234375, + "learning_rate": 7.425968109339408e-07, + "loss": 0.0609, + "reward": 1.390625, + "reward_std": 0.16887323558330536, + "rewards/accuracy_reward_stage2": 0.515625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1470 + }, + { + "completion_length": 13.828125, + "epoch": 0.25775363588575434, + "grad_norm": 19.896280413945497, + "kl": 0.171875, + "learning_rate": 7.424215875240932e-07, + "loss": -0.0636, + "reward": 1.4487630128860474, + "reward_std": 0.24810229241847992, + "rewards/accuracy_reward_stage2": 0.49563801288604736, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1471 + }, + { + "completion_length": 10.984375, + "epoch": 0.2579288592956019, + "grad_norm": 24.565980627282247, + "kl": 0.16015625, + "learning_rate": 7.422463641142457e-07, + "loss": 0.0641, + "reward": 1.600043773651123, + "reward_std": 0.23149800300598145, + "rewards/accuracy_reward_stage2": 0.600043773651123, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1472 + }, + { + "completion_length": 11.328125, + "epoch": 0.25810408270544943, + "grad_norm": 16.323516653761956, + "kl": 0.08935546875, + "learning_rate": 7.42071140704398e-07, + "loss": -0.0014, + "reward": 1.389192819595337, + "reward_std": 0.18605825304985046, + "rewards/accuracy_reward_stage2": 0.4048178791999817, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1473 + }, + { + "completion_length": 12.875, + "epoch": 0.258279306115297, + "grad_norm": 13.089491093183998, + "kl": 0.0458984375, + "learning_rate": 7.418959172945505e-07, + "loss": 0.0184, + "reward": 1.8300971984863281, + "reward_std": 0.10606367141008377, + "rewards/accuracy_reward_stage2": 0.8300973176956177, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1474 + }, + { + "completion_length": 7.40625, + "epoch": 0.2584545295251446, + "grad_norm": 21.708425799782695, + "kl": 0.09765625, + "learning_rate": 7.41720693884703e-07, + "loss": 0.0101, + "reward": 1.613210916519165, + "reward_std": 0.24434758722782135, + "rewards/accuracy_reward_stage2": 0.6288357973098755, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1475 + }, + { + "completion_length": 7.609375, + "epoch": 0.25862975293499213, + "grad_norm": 22.158316501149994, + "kl": 0.07177734375, + "learning_rate": 7.415454704748554e-07, + "loss": -0.0089, + "reward": 1.8659720420837402, + "reward_std": 0.14032378792762756, + "rewards/accuracy_reward_stage2": 0.8815969824790955, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1476 + }, + { + "completion_length": 10.140625, + "epoch": 0.2588049763448397, + "grad_norm": 20.15236047195376, + "kl": 0.05029296875, + "learning_rate": 7.413702470650078e-07, + "loss": 0.0201, + "reward": 1.6414086818695068, + "reward_std": 0.16311705112457275, + "rewards/accuracy_reward_stage2": 0.6414086222648621, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1477 + }, + { + "completion_length": 8.625, + "epoch": 0.2589801997546872, + "grad_norm": 11.965558883144823, + "kl": 0.0380859375, + "learning_rate": 7.411950236551603e-07, + "loss": 0.0153, + "reward": 1.6470980644226074, + "reward_std": 0.09230685234069824, + "rewards/accuracy_reward_stage2": 0.6470980644226074, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1478 + }, + { + "completion_length": 13.5, + "epoch": 0.25915542316453477, + "grad_norm": 19.52953275782209, + "kl": 0.09912109375, + "learning_rate": 7.410198002453127e-07, + "loss": 0.0396, + "reward": 1.4840954542160034, + "reward_std": 0.16965684294700623, + "rewards/accuracy_reward_stage2": 0.6090954542160034, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1479 + }, + { + "completion_length": 24.25, + "epoch": 0.2593306465743823, + "grad_norm": 21.65207615786653, + "kl": 0.09375, + "learning_rate": 7.408445768354652e-07, + "loss": 0.0374, + "reward": 1.379471778869629, + "reward_std": 0.26678863167762756, + "rewards/accuracy_reward_stage2": 0.5044718980789185, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1480 + }, + { + "completion_length": 10.28125, + "epoch": 0.2595058699842299, + "grad_norm": 18.114718995075766, + "kl": 0.046142578125, + "learning_rate": 7.406693534256176e-07, + "loss": 0.0185, + "reward": 1.7144222259521484, + "reward_std": 0.24147561192512512, + "rewards/accuracy_reward_stage2": 0.7144221067428589, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1481 + }, + { + "completion_length": 9.171875, + "epoch": 0.25968109339407747, + "grad_norm": 23.559305769873887, + "kl": 0.2314453125, + "learning_rate": 7.404941300157701e-07, + "loss": 0.0559, + "reward": 1.7861251831054688, + "reward_std": 0.2582182288169861, + "rewards/accuracy_reward_stage2": 0.8017500638961792, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1482 + }, + { + "completion_length": 12.125, + "epoch": 0.259856316803925, + "grad_norm": 16.380539897990786, + "kl": 0.09814453125, + "learning_rate": 7.403189066059226e-07, + "loss": 0.0392, + "reward": 1.516692876815796, + "reward_std": 0.14989466965198517, + "rewards/accuracy_reward_stage2": 0.5166928172111511, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1483 + }, + { + "completion_length": 6.640625, + "epoch": 0.26003154021377256, + "grad_norm": 14.736981869826542, + "kl": 0.11181640625, + "learning_rate": 7.40143683196075e-07, + "loss": 0.0004, + "reward": 1.8959097862243652, + "reward_std": 0.15828779339790344, + "rewards/accuracy_reward_stage2": 0.9115347862243652, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1484 + }, + { + "completion_length": 7.328125, + "epoch": 0.2602067636236201, + "grad_norm": 15.499096500204017, + "kl": 0.072265625, + "learning_rate": 7.399684597862275e-07, + "loss": -0.004, + "reward": 1.6398365497589111, + "reward_std": 0.17402216792106628, + "rewards/accuracy_reward_stage2": 0.6554616093635559, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1485 + }, + { + "completion_length": 10.8125, + "epoch": 0.26038198703346765, + "grad_norm": 18.499317534039804, + "kl": 0.06640625, + "learning_rate": 7.397932363763799e-07, + "loss": 0.0265, + "reward": 1.4845848083496094, + "reward_std": 0.10854353755712509, + "rewards/accuracy_reward_stage2": 0.4845846891403198, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1486 + }, + { + "completion_length": 12.578125, + "epoch": 0.2605572104433152, + "grad_norm": 17.074920587332564, + "kl": 0.12060546875, + "learning_rate": 7.396180129665322e-07, + "loss": 0.0481, + "reward": 1.523491621017456, + "reward_std": 0.0827716588973999, + "rewards/accuracy_reward_stage2": 0.648491621017456, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1487 + }, + { + "completion_length": 6.8125, + "epoch": 0.2607324338531628, + "grad_norm": 16.74247186108171, + "kl": 0.10400390625, + "learning_rate": 7.394427895566847e-07, + "loss": 0.0031, + "reward": 1.5125616788864136, + "reward_std": 0.2064923644065857, + "rewards/accuracy_reward_stage2": 0.6531867384910583, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1488 + }, + { + "completion_length": 13.6875, + "epoch": 0.26090765726301035, + "grad_norm": 23.25587684438963, + "kl": 0.1923828125, + "learning_rate": 7.392675661468371e-07, + "loss": 0.0769, + "reward": 1.2328770160675049, + "reward_std": 0.23550641536712646, + "rewards/accuracy_reward_stage2": 0.4828770160675049, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1489 + }, + { + "completion_length": 12.75, + "epoch": 0.2610828806728579, + "grad_norm": 16.46879874350823, + "kl": 0.08984375, + "learning_rate": 7.390923427369896e-07, + "loss": 0.0359, + "reward": 1.3310246467590332, + "reward_std": 0.21695610880851746, + "rewards/accuracy_reward_stage2": 0.4560246169567108, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1490 + }, + { + "completion_length": 9.53125, + "epoch": 0.26125810408270544, + "grad_norm": 18.813317538380165, + "kl": 0.150390625, + "learning_rate": 7.389171193271421e-07, + "loss": 0.0603, + "reward": 1.631639003753662, + "reward_std": 0.21653306484222412, + "rewards/accuracy_reward_stage2": 0.7566390633583069, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1491 + }, + { + "completion_length": 8.5625, + "epoch": 0.261433327492553, + "grad_norm": 14.57852348656557, + "kl": 0.1455078125, + "learning_rate": 7.387418959172945e-07, + "loss": 0.0141, + "reward": 1.3212279081344604, + "reward_std": 0.20050299167633057, + "rewards/accuracy_reward_stage2": 0.33685290813446045, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1492 + }, + { + "completion_length": 9.71875, + "epoch": 0.26160855090240054, + "grad_norm": 19.213824798628277, + "kl": 0.1484375, + "learning_rate": 7.38566672507447e-07, + "loss": 0.0593, + "reward": 1.7892357110977173, + "reward_std": 0.31280261278152466, + "rewards/accuracy_reward_stage2": 0.7892358303070068, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1493 + }, + { + "completion_length": 7.71875, + "epoch": 0.26178377431224814, + "grad_norm": 17.296368273180498, + "kl": 0.11572265625, + "learning_rate": 7.383914490975995e-07, + "loss": 0.0462, + "reward": 1.476837158203125, + "reward_std": 0.15431980788707733, + "rewards/accuracy_reward_stage2": 0.4768372178077698, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1494 + }, + { + "completion_length": 6.453125, + "epoch": 0.2619589977220957, + "grad_norm": 15.24005128245555, + "kl": 0.11328125, + "learning_rate": 7.382162256877519e-07, + "loss": -0.0341, + "reward": 1.624133586883545, + "reward_std": 0.1425294131040573, + "rewards/accuracy_reward_stage2": 0.6553836464881897, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1495 + }, + { + "completion_length": 9.375, + "epoch": 0.26213422113194323, + "grad_norm": 24.641600506436927, + "kl": 0.045654296875, + "learning_rate": 7.380410022779044e-07, + "loss": 0.0182, + "reward": 1.531754493713379, + "reward_std": 0.2628553509712219, + "rewards/accuracy_reward_stage2": 0.5317546129226685, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1496 + }, + { + "completion_length": 12.90625, + "epoch": 0.2623094445417908, + "grad_norm": 17.434987564564846, + "kl": 0.057373046875, + "learning_rate": 7.378657788680567e-07, + "loss": 0.0229, + "reward": 1.3142204284667969, + "reward_std": 0.08087074756622314, + "rewards/accuracy_reward_stage2": 0.3142204284667969, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1497 + }, + { + "completion_length": 9.0, + "epoch": 0.26248466795163833, + "grad_norm": 21.262342736610243, + "kl": 0.0634765625, + "learning_rate": 7.376905554582091e-07, + "loss": 0.0254, + "reward": 1.4722018241882324, + "reward_std": 0.24770504236221313, + "rewards/accuracy_reward_stage2": 0.47220176458358765, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1498 + }, + { + "completion_length": 10.125, + "epoch": 0.2626598913614859, + "grad_norm": 26.079360544594778, + "kl": 0.10693359375, + "learning_rate": 7.375153320483616e-07, + "loss": 0.0428, + "reward": 1.614469289779663, + "reward_std": 0.23577997088432312, + "rewards/accuracy_reward_stage2": 0.6144692897796631, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1499 + }, + { + "completion_length": 8.953125, + "epoch": 0.2628351147713335, + "grad_norm": 13.188425069955375, + "kl": 0.07177734375, + "learning_rate": 7.37340108638514e-07, + "loss": 0.0287, + "reward": 1.6197917461395264, + "reward_std": 0.17163534462451935, + "rewards/accuracy_reward_stage2": 0.7447916865348816, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1500 + }, + { + "completion_length": 10.0625, + "epoch": 0.263010338181181, + "grad_norm": 25.910669248652916, + "kl": 0.1787109375, + "learning_rate": 7.371648852286665e-07, + "loss": 0.0382, + "reward": 1.2576980590820312, + "reward_std": 0.421562522649765, + "rewards/accuracy_reward_stage2": 0.39832305908203125, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1501 + }, + { + "completion_length": 9.515625, + "epoch": 0.26318556159102857, + "grad_norm": 36.427488731530936, + "kl": 0.173828125, + "learning_rate": 7.36989661818819e-07, + "loss": 0.0317, + "reward": 1.3031154870986938, + "reward_std": 0.3199523091316223, + "rewards/accuracy_reward_stage2": 0.5687404870986938, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1502 + }, + { + "completion_length": 9.640625, + "epoch": 0.2633607850008761, + "grad_norm": 19.300581958634684, + "kl": 0.236328125, + "learning_rate": 7.368144384089714e-07, + "loss": 0.0945, + "reward": 1.5580551624298096, + "reward_std": 0.18352779746055603, + "rewards/accuracy_reward_stage2": 0.6830551624298096, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1503 + }, + { + "completion_length": 10.375, + "epoch": 0.26353600841072367, + "grad_norm": 20.598441592058176, + "kl": 0.025390625, + "learning_rate": 7.366392149991239e-07, + "loss": 0.0102, + "reward": 1.4132441282272339, + "reward_std": 0.34681200981140137, + "rewards/accuracy_reward_stage2": 0.4132440388202667, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1504 + }, + { + "completion_length": 15.46875, + "epoch": 0.2637112318205712, + "grad_norm": 16.923225965838604, + "kl": 0.0595703125, + "learning_rate": 7.364639915892763e-07, + "loss": -0.0203, + "reward": 1.5988796949386597, + "reward_std": 0.22254234552383423, + "rewards/accuracy_reward_stage2": 0.6145046949386597, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1505 + }, + { + "completion_length": 8.015625, + "epoch": 0.26388645523041876, + "grad_norm": 50.729038664876875, + "kl": 0.220703125, + "learning_rate": 7.362887681794288e-07, + "loss": 0.0882, + "reward": 1.7398931980133057, + "reward_std": 0.1275492012500763, + "rewards/accuracy_reward_stage2": 0.7398930788040161, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1506 + }, + { + "completion_length": 11.875, + "epoch": 0.26406167864026636, + "grad_norm": 24.131657758244867, + "kl": 0.072265625, + "learning_rate": 7.361135447695812e-07, + "loss": 0.0289, + "reward": 1.7232370376586914, + "reward_std": 0.17531737685203552, + "rewards/accuracy_reward_stage2": 0.7232369184494019, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1507 + }, + { + "completion_length": 11.984375, + "epoch": 0.2642369020501139, + "grad_norm": 18.52551523920455, + "kl": 0.04638671875, + "learning_rate": 7.359383213597336e-07, + "loss": 0.0186, + "reward": 1.78243887424469, + "reward_std": 0.06022557616233826, + "rewards/accuracy_reward_stage2": 0.7824387550354004, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1508 + }, + { + "completion_length": 12.34375, + "epoch": 0.26441212545996146, + "grad_norm": 12.259126050769588, + "kl": 0.04052734375, + "learning_rate": 7.357630979498861e-07, + "loss": -0.0206, + "reward": 1.5980116128921509, + "reward_std": 0.06384958326816559, + "rewards/accuracy_reward_stage2": 0.6136366128921509, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1509 + }, + { + "completion_length": 14.21875, + "epoch": 0.264587348869809, + "grad_norm": 113.84216603325396, + "kl": 0.5625, + "learning_rate": 7.355878745400386e-07, + "loss": 0.2247, + "reward": 1.5885417461395264, + "reward_std": 0.20276054739952087, + "rewards/accuracy_reward_stage2": 0.7135416269302368, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1510 + }, + { + "completion_length": 14.0625, + "epoch": 0.26476257227965655, + "grad_norm": 22.257783257834138, + "kl": 0.04443359375, + "learning_rate": 7.354126511301909e-07, + "loss": -0.0152, + "reward": 1.488447666168213, + "reward_std": 0.2847330570220947, + "rewards/accuracy_reward_stage2": 0.5040726661682129, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1511 + }, + { + "completion_length": 13.375, + "epoch": 0.2649377956895041, + "grad_norm": 19.024628703462216, + "kl": 0.10986328125, + "learning_rate": 7.352374277203434e-07, + "loss": 0.0441, + "reward": 1.3006612062454224, + "reward_std": 0.16250282526016235, + "rewards/accuracy_reward_stage2": 0.42566123604774475, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1512 + }, + { + "completion_length": 8.421875, + "epoch": 0.2651130190993517, + "grad_norm": 23.432281970637504, + "kl": 0.11669921875, + "learning_rate": 7.350622043104958e-07, + "loss": 0.0467, + "reward": 1.5951578617095947, + "reward_std": 0.2725307047367096, + "rewards/accuracy_reward_stage2": 0.5951578617095947, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1513 + }, + { + "completion_length": 13.90625, + "epoch": 0.26528824250919925, + "grad_norm": 15.405200044290508, + "kl": 0.1337890625, + "learning_rate": 7.348869809006483e-07, + "loss": 0.0533, + "reward": 1.2321314811706543, + "reward_std": 0.09853121638298035, + "rewards/accuracy_reward_stage2": 0.3571315109729767, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1514 + }, + { + "completion_length": 9.671875, + "epoch": 0.2654634659190468, + "grad_norm": 14.970455451581415, + "kl": 0.0169677734375, + "learning_rate": 7.347117574908008e-07, + "loss": 0.0068, + "reward": 1.8090277910232544, + "reward_std": 0.15713483095169067, + "rewards/accuracy_reward_stage2": 0.8090277910232544, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1515 + }, + { + "completion_length": 10.015625, + "epoch": 0.26563868932889434, + "grad_norm": 20.289129130031835, + "kl": 0.126953125, + "learning_rate": 7.345365340809531e-07, + "loss": 0.0192, + "reward": 1.2822504043579102, + "reward_std": 0.1558304727077484, + "rewards/accuracy_reward_stage2": 0.29787540435791016, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1516 + }, + { + "completion_length": 7.984375, + "epoch": 0.2658139127387419, + "grad_norm": 12.479550386762886, + "kl": 0.0517578125, + "learning_rate": 7.343613106711056e-07, + "loss": -0.0175, + "reward": 1.6450035572052002, + "reward_std": 0.17588838934898376, + "rewards/accuracy_reward_stage2": 0.6606285572052002, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1517 + }, + { + "completion_length": 17.109375, + "epoch": 0.26598913614858943, + "grad_norm": 19.38559620135752, + "kl": 0.1162109375, + "learning_rate": 7.341860872612581e-07, + "loss": 0.0463, + "reward": 1.4216383695602417, + "reward_std": 0.1845116913318634, + "rewards/accuracy_reward_stage2": 0.5466383695602417, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1518 + }, + { + "completion_length": 12.25, + "epoch": 0.266164359558437, + "grad_norm": 13.140767714807309, + "kl": 0.041748046875, + "learning_rate": 7.340108638514105e-07, + "loss": -0.0122, + "reward": 1.6374698877334595, + "reward_std": 0.16872502863407135, + "rewards/accuracy_reward_stage2": 0.6530948877334595, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1519 + }, + { + "completion_length": 12.8125, + "epoch": 0.2663395829682846, + "grad_norm": 17.16037416358004, + "kl": 0.08837890625, + "learning_rate": 7.33835640441563e-07, + "loss": 0.0353, + "reward": 1.5358409881591797, + "reward_std": 0.26287949085235596, + "rewards/accuracy_reward_stage2": 0.7858409881591797, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1520 + }, + { + "completion_length": 12.1875, + "epoch": 0.26651480637813213, + "grad_norm": 20.780652134014453, + "kl": 0.10498046875, + "learning_rate": 7.336604170317154e-07, + "loss": -0.0256, + "reward": 1.5530736446380615, + "reward_std": 0.22033998370170593, + "rewards/accuracy_reward_stage2": 0.7093237042427063, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1521 + }, + { + "completion_length": 9.15625, + "epoch": 0.2666900297879797, + "grad_norm": 21.789248697651665, + "kl": 0.095703125, + "learning_rate": 7.334851936218679e-07, + "loss": -0.0756, + "reward": 1.4975124597549438, + "reward_std": 0.32140904664993286, + "rewards/accuracy_reward_stage2": 0.6693874597549438, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1522 + }, + { + "completion_length": 12.203125, + "epoch": 0.2668652531978272, + "grad_norm": 22.67098179945106, + "kl": 0.125, + "learning_rate": 7.333099702120204e-07, + "loss": 0.05, + "reward": 1.5546128749847412, + "reward_std": 0.25176021456718445, + "rewards/accuracy_reward_stage2": 0.679612934589386, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1523 + }, + { + "completion_length": 16.75, + "epoch": 0.26704047660767477, + "grad_norm": 17.754697761796134, + "kl": 0.1806640625, + "learning_rate": 7.331347468021727e-07, + "loss": 0.0722, + "reward": 1.5393863916397095, + "reward_std": 0.2093697190284729, + "rewards/accuracy_reward_stage2": 0.5393863916397095, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1524 + }, + { + "completion_length": 16.9375, + "epoch": 0.2672157000175223, + "grad_norm": 17.132183072177597, + "kl": 0.0625, + "learning_rate": 7.329595233923252e-07, + "loss": 0.0251, + "reward": 1.7056671380996704, + "reward_std": 0.12278222292661667, + "rewards/accuracy_reward_stage2": 0.7056670784950256, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1525 + }, + { + "completion_length": 11.9375, + "epoch": 0.2673909234273699, + "grad_norm": 21.318333986059294, + "kl": 0.14453125, + "learning_rate": 7.327842999824775e-07, + "loss": 0.0393, + "reward": 1.4820630550384521, + "reward_std": 0.1880171000957489, + "rewards/accuracy_reward_stage2": 0.4976881146430969, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1526 + }, + { + "completion_length": 6.453125, + "epoch": 0.26756614683721747, + "grad_norm": 15.136268546383176, + "kl": 0.12060546875, + "learning_rate": 7.3260907657263e-07, + "loss": -0.0378, + "reward": 1.5672743320465088, + "reward_std": 0.22971659898757935, + "rewards/accuracy_reward_stage2": 0.6141493320465088, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1527 + }, + { + "completion_length": 6.796875, + "epoch": 0.267741370247065, + "grad_norm": 20.14827169845298, + "kl": 0.08544921875, + "learning_rate": 7.324338531627825e-07, + "loss": -0.0082, + "reward": 1.6023056507110596, + "reward_std": 0.26246657967567444, + "rewards/accuracy_reward_stage2": 0.6179307103157043, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1528 + }, + { + "completion_length": 8.84375, + "epoch": 0.26791659365691256, + "grad_norm": 10.198922534215432, + "kl": 0.0478515625, + "learning_rate": 7.322586297529349e-07, + "loss": 0.0192, + "reward": 1.6613264083862305, + "reward_std": 0.07768907397985458, + "rewards/accuracy_reward_stage2": 0.6613264083862305, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1529 + }, + { + "completion_length": 9.140625, + "epoch": 0.2680918170667601, + "grad_norm": 16.248910050517573, + "kl": 0.08203125, + "learning_rate": 7.320834063430874e-07, + "loss": -0.0078, + "reward": 1.5019550323486328, + "reward_std": 0.19075937569141388, + "rewards/accuracy_reward_stage2": 0.6425800323486328, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1530 + }, + { + "completion_length": 7.859375, + "epoch": 0.26826704047660765, + "grad_norm": 20.561452330667606, + "kl": 0.033447265625, + "learning_rate": 7.319081829332399e-07, + "loss": 0.0134, + "reward": 1.5087047815322876, + "reward_std": 0.26103299856185913, + "rewards/accuracy_reward_stage2": 0.5087048411369324, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1531 + }, + { + "completion_length": 21.0625, + "epoch": 0.26844226388645526, + "grad_norm": 18.92039677260036, + "kl": 0.01806640625, + "learning_rate": 7.317329595233923e-07, + "loss": 0.0072, + "reward": 1.5572808980941772, + "reward_std": 0.2519490718841553, + "rewards/accuracy_reward_stage2": 0.6822808980941772, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1532 + }, + { + "completion_length": 8.46875, + "epoch": 0.2686174872963028, + "grad_norm": 16.95047090037599, + "kl": 0.078125, + "learning_rate": 7.315577361135448e-07, + "loss": -0.0128, + "reward": 1.5476700067520142, + "reward_std": 0.22458161413669586, + "rewards/accuracy_reward_stage2": 0.5632950067520142, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1533 + }, + { + "completion_length": 11.53125, + "epoch": 0.26879271070615035, + "grad_norm": 13.105746686622085, + "kl": 0.1044921875, + "learning_rate": 7.313825127036972e-07, + "loss": 0.0011, + "reward": 1.5887963771820068, + "reward_std": 0.20041370391845703, + "rewards/accuracy_reward_stage2": 0.6044213771820068, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1534 + }, + { + "completion_length": 12.71875, + "epoch": 0.2689679341159979, + "grad_norm": 22.064501649963276, + "kl": 0.064453125, + "learning_rate": 7.312072892938497e-07, + "loss": 0.0163, + "reward": 1.5054750442504883, + "reward_std": 0.15925616025924683, + "rewards/accuracy_reward_stage2": 0.5210999846458435, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1535 + }, + { + "completion_length": 10.84375, + "epoch": 0.26914315752584544, + "grad_norm": 16.184712854528378, + "kl": 0.1103515625, + "learning_rate": 7.310320658840022e-07, + "loss": 0.0001, + "reward": 1.243492603302002, + "reward_std": 0.18201014399528503, + "rewards/accuracy_reward_stage2": 0.38411760330200195, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1536 + }, + { + "completion_length": 15.578125, + "epoch": 0.269318380935693, + "grad_norm": 18.757460190128715, + "kl": 0.078125, + "learning_rate": 7.308568424741544e-07, + "loss": 0.0312, + "reward": 1.6463735103607178, + "reward_std": 0.17724749445915222, + "rewards/accuracy_reward_stage2": 0.646373450756073, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1537 + }, + { + "completion_length": 12.90625, + "epoch": 0.26949360434554054, + "grad_norm": 12.440359363732176, + "kl": 0.053466796875, + "learning_rate": 7.306816190643069e-07, + "loss": 0.0214, + "reward": 1.3152844905853271, + "reward_std": 0.08948960155248642, + "rewards/accuracy_reward_stage2": 0.31528446078300476, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1538 + }, + { + "completion_length": 8.828125, + "epoch": 0.26966882775538814, + "grad_norm": 23.314871830339797, + "kl": 0.10693359375, + "learning_rate": 7.305063956544594e-07, + "loss": 0.0031, + "reward": 1.6845183372497559, + "reward_std": 0.31178730726242065, + "rewards/accuracy_reward_stage2": 0.7001434564590454, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1539 + }, + { + "completion_length": 8.265625, + "epoch": 0.2698440511652357, + "grad_norm": 15.128750411864983, + "kl": 0.197265625, + "learning_rate": 7.303311722446118e-07, + "loss": 0.079, + "reward": 1.5312914848327637, + "reward_std": 0.1368497908115387, + "rewards/accuracy_reward_stage2": 0.6562913656234741, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1540 + }, + { + "completion_length": 9.453125, + "epoch": 0.27001927457508323, + "grad_norm": 9.897014047607549, + "kl": 0.06982421875, + "learning_rate": 7.301559488347643e-07, + "loss": 0.028, + "reward": 1.6693193912506104, + "reward_std": 0.1036425307393074, + "rewards/accuracy_reward_stage2": 0.7943194508552551, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1541 + }, + { + "completion_length": 12.96875, + "epoch": 0.2701944979849308, + "grad_norm": 17.61779482500366, + "kl": 0.0260009765625, + "learning_rate": 7.299807254249167e-07, + "loss": 0.0104, + "reward": 1.4325488805770874, + "reward_std": 0.23638816177845, + "rewards/accuracy_reward_stage2": 0.4325488805770874, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1542 + }, + { + "completion_length": 10.296875, + "epoch": 0.27036972139477833, + "grad_norm": 21.49084583990938, + "kl": 0.240234375, + "learning_rate": 7.298055020150692e-07, + "loss": 0.096, + "reward": 1.5804922580718994, + "reward_std": 0.23495854437351227, + "rewards/accuracy_reward_stage2": 0.7054921984672546, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1543 + }, + { + "completion_length": 9.703125, + "epoch": 0.2705449448046259, + "grad_norm": 25.659390561790094, + "kl": 0.1953125, + "learning_rate": 7.296302786052217e-07, + "loss": 0.0779, + "reward": 1.3564759492874146, + "reward_std": 0.2723844647407532, + "rewards/accuracy_reward_stage2": 0.48147594928741455, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1544 + }, + { + "completion_length": 11.140625, + "epoch": 0.2707201682144735, + "grad_norm": 21.15270962339563, + "kl": 0.12060546875, + "learning_rate": 7.294550551953741e-07, + "loss": 0.0751, + "reward": 1.6740481853485107, + "reward_std": 0.16617107391357422, + "rewards/accuracy_reward_stage2": 0.799048125743866, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1545 + }, + { + "completion_length": 6.625, + "epoch": 0.270895391624321, + "grad_norm": 25.622727181263297, + "kl": 0.095703125, + "learning_rate": 7.292798317855265e-07, + "loss": 0.0384, + "reward": 1.722312331199646, + "reward_std": 0.19830942153930664, + "rewards/accuracy_reward_stage2": 0.847312331199646, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1546 + }, + { + "completion_length": 9.796875, + "epoch": 0.27107061503416857, + "grad_norm": 27.517916045250036, + "kl": 0.453125, + "learning_rate": 7.29104608375679e-07, + "loss": 0.1811, + "reward": 1.391111135482788, + "reward_std": 0.3434157371520996, + "rewards/accuracy_reward_stage2": 0.6411112546920776, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1547 + }, + { + "completion_length": 7.59375, + "epoch": 0.2712458384440161, + "grad_norm": 14.200419137196972, + "kl": 0.0751953125, + "learning_rate": 7.289293849658314e-07, + "loss": 0.0301, + "reward": 1.612762212753296, + "reward_std": 0.1002715528011322, + "rewards/accuracy_reward_stage2": 0.7377622127532959, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1548 + }, + { + "completion_length": 15.734375, + "epoch": 0.27142106185386367, + "grad_norm": 19.81161434968578, + "kl": 0.012939453125, + "learning_rate": 7.287541615559838e-07, + "loss": 0.0052, + "reward": 1.5175888538360596, + "reward_std": 0.219430610537529, + "rewards/accuracy_reward_stage2": 0.5175889134407043, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1549 + }, + { + "completion_length": 15.28125, + "epoch": 0.2715962852637112, + "grad_norm": 21.63402553561504, + "kl": 0.1640625, + "learning_rate": 7.285789381461362e-07, + "loss": 0.0034, + "reward": 1.5669314861297607, + "reward_std": 0.19692623615264893, + "rewards/accuracy_reward_stage2": 0.6138066053390503, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1550 + }, + { + "completion_length": 12.21875, + "epoch": 0.2717715086735588, + "grad_norm": 22.15076349499154, + "kl": 0.1708984375, + "learning_rate": 7.284037147362887e-07, + "loss": 0.0324, + "reward": 1.4736478328704834, + "reward_std": 0.13731749355793, + "rewards/accuracy_reward_stage2": 0.4892728924751282, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1551 + }, + { + "completion_length": 6.09375, + "epoch": 0.27194673208340636, + "grad_norm": 17.382817812945337, + "kl": 0.1005859375, + "learning_rate": 7.282284913264412e-07, + "loss": 0.0404, + "reward": 1.7686809301376343, + "reward_std": 0.19698965549468994, + "rewards/accuracy_reward_stage2": 0.7686809301376343, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1552 + }, + { + "completion_length": 8.046875, + "epoch": 0.2721219554932539, + "grad_norm": 20.083754257329154, + "kl": 0.1650390625, + "learning_rate": 7.280532679165936e-07, + "loss": 0.0661, + "reward": 1.3247437477111816, + "reward_std": 0.12370945513248444, + "rewards/accuracy_reward_stage2": 0.44974377751350403, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1553 + }, + { + "completion_length": 10.109375, + "epoch": 0.27229717890310146, + "grad_norm": 16.555514649268687, + "kl": 0.0751953125, + "learning_rate": 7.278780445067461e-07, + "loss": -0.0325, + "reward": 1.443509578704834, + "reward_std": 0.3313031792640686, + "rewards/accuracy_reward_stage2": 0.474759578704834, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1554 + }, + { + "completion_length": 9.859375, + "epoch": 0.272472402312949, + "grad_norm": 17.971828702025466, + "kl": 0.060302734375, + "learning_rate": 7.277028210968986e-07, + "loss": 0.0241, + "reward": 1.5731000900268555, + "reward_std": 0.29691874980926514, + "rewards/accuracy_reward_stage2": 0.5731000900268555, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1555 + }, + { + "completion_length": 15.796875, + "epoch": 0.27264762572279655, + "grad_norm": 14.684691166296368, + "kl": 0.04541015625, + "learning_rate": 7.275275976870509e-07, + "loss": 0.0182, + "reward": 1.428015947341919, + "reward_std": 0.14021556079387665, + "rewards/accuracy_reward_stage2": 0.42801591753959656, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1556 + }, + { + "completion_length": 10.421875, + "epoch": 0.2728228491326441, + "grad_norm": 21.497354071634, + "kl": 0.1943359375, + "learning_rate": 7.273523742772034e-07, + "loss": 0.0378, + "reward": 1.5667054653167725, + "reward_std": 0.31732115149497986, + "rewards/accuracy_reward_stage2": 0.5823305249214172, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1557 + }, + { + "completion_length": 7.53125, + "epoch": 0.2729980725424917, + "grad_norm": 12.187444232549343, + "kl": 0.03515625, + "learning_rate": 7.271771508673558e-07, + "loss": 0.0141, + "reward": 1.6551628112792969, + "reward_std": 0.11131031066179276, + "rewards/accuracy_reward_stage2": 0.6551628708839417, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1558 + }, + { + "completion_length": 10.0625, + "epoch": 0.27317329595233925, + "grad_norm": 17.247051556669394, + "kl": 0.037841796875, + "learning_rate": 7.270019274575083e-07, + "loss": 0.0152, + "reward": 1.6297712326049805, + "reward_std": 0.1473989188671112, + "rewards/accuracy_reward_stage2": 0.6297712922096252, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1559 + }, + { + "completion_length": 8.84375, + "epoch": 0.2733485193621868, + "grad_norm": 18.867441244462896, + "kl": 0.0458984375, + "learning_rate": 7.268267040476608e-07, + "loss": 0.0184, + "reward": 1.7025973796844482, + "reward_std": 0.19792814552783966, + "rewards/accuracy_reward_stage2": 0.7025973796844482, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1560 + }, + { + "completion_length": 7.921875, + "epoch": 0.27352374277203434, + "grad_norm": 16.56831611531489, + "kl": 0.11376953125, + "learning_rate": 7.266514806378132e-07, + "loss": -0.0846, + "reward": 1.465267300605774, + "reward_std": 0.2277645468711853, + "rewards/accuracy_reward_stage2": 0.6371423006057739, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1561 + }, + { + "completion_length": 10.59375, + "epoch": 0.2736989661818819, + "grad_norm": 21.307744690181806, + "kl": 0.04248046875, + "learning_rate": 7.264762572279656e-07, + "loss": -0.0164, + "reward": 1.3307263851165771, + "reward_std": 0.30374133586883545, + "rewards/accuracy_reward_stage2": 0.34635138511657715, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1562 + }, + { + "completion_length": 10.109375, + "epoch": 0.27387418959172943, + "grad_norm": 20.814389143707405, + "kl": 0.08544921875, + "learning_rate": 7.263010338181181e-07, + "loss": -0.0101, + "reward": 1.476302146911621, + "reward_std": 0.35749757289886475, + "rewards/accuracy_reward_stage2": 0.4919270873069763, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1563 + }, + { + "completion_length": 7.890625, + "epoch": 0.27404941300157704, + "grad_norm": 16.58609657046578, + "kl": 0.050537109375, + "learning_rate": 7.261258104082705e-07, + "loss": 0.0202, + "reward": 1.630878210067749, + "reward_std": 0.21472877264022827, + "rewards/accuracy_reward_stage2": 0.6308783292770386, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1564 + }, + { + "completion_length": 12.390625, + "epoch": 0.2742246364114246, + "grad_norm": 13.702475279669857, + "kl": 0.04296875, + "learning_rate": 7.25950586998423e-07, + "loss": 0.0172, + "reward": 1.6737043857574463, + "reward_std": 0.07372551411390305, + "rewards/accuracy_reward_stage2": 0.6737043857574463, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1565 + }, + { + "completion_length": 12.09375, + "epoch": 0.27439985982127213, + "grad_norm": 22.677346899697838, + "kl": 0.109375, + "learning_rate": 7.257753635885753e-07, + "loss": 0.0437, + "reward": 1.6756170988082886, + "reward_std": 0.24650396406650543, + "rewards/accuracy_reward_stage2": 0.6756170988082886, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1566 + }, + { + "completion_length": 13.390625, + "epoch": 0.2745750832311197, + "grad_norm": 23.576489442401506, + "kl": 0.026611328125, + "learning_rate": 7.256001401787278e-07, + "loss": 0.0107, + "reward": 1.3751254081726074, + "reward_std": 0.23284590244293213, + "rewards/accuracy_reward_stage2": 0.5001254677772522, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1567 + }, + { + "completion_length": 11.375, + "epoch": 0.2747503066409672, + "grad_norm": 25.478991718817426, + "kl": 0.038818359375, + "learning_rate": 7.254249167688803e-07, + "loss": -0.0511, + "reward": 1.4270833730697632, + "reward_std": 0.37081876397132874, + "rewards/accuracy_reward_stage2": 0.4583333134651184, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1568 + }, + { + "completion_length": 10.4375, + "epoch": 0.27492553005081477, + "grad_norm": 17.76540063991035, + "kl": 0.078125, + "learning_rate": 7.252496933590327e-07, + "loss": 0.0314, + "reward": 1.5698776245117188, + "reward_std": 0.2609265446662903, + "rewards/accuracy_reward_stage2": 0.5698776245117188, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1569 + }, + { + "completion_length": 9.96875, + "epoch": 0.2751007534606623, + "grad_norm": 20.370761525766085, + "kl": 0.04638671875, + "learning_rate": 7.250744699491852e-07, + "loss": 0.0186, + "reward": 1.5809662342071533, + "reward_std": 0.2464766502380371, + "rewards/accuracy_reward_stage2": 0.5809662342071533, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1570 + }, + { + "completion_length": 8.59375, + "epoch": 0.2752759768705099, + "grad_norm": 21.541839159331936, + "kl": 0.1123046875, + "learning_rate": 7.248992465393377e-07, + "loss": 0.0235, + "reward": 1.5938735008239746, + "reward_std": 0.25188708305358887, + "rewards/accuracy_reward_stage2": 0.6094985008239746, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1571 + }, + { + "completion_length": 9.015625, + "epoch": 0.27545120028035747, + "grad_norm": 11.988388116776276, + "kl": 0.0615234375, + "learning_rate": 7.247240231294901e-07, + "loss": -0.0146, + "reward": 1.7149364948272705, + "reward_std": 0.2069161832332611, + "rewards/accuracy_reward_stage2": 0.7305614948272705, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1572 + }, + { + "completion_length": 10.125, + "epoch": 0.275626423690205, + "grad_norm": 22.24484166978493, + "kl": 0.1923828125, + "learning_rate": 7.245487997196426e-07, + "loss": 0.0769, + "reward": 1.0984094142913818, + "reward_std": 0.1601712703704834, + "rewards/accuracy_reward_stage2": 0.348409503698349, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1573 + }, + { + "completion_length": 8.546875, + "epoch": 0.27580164710005256, + "grad_norm": 19.80297995187602, + "kl": 0.1552734375, + "learning_rate": 7.24373576309795e-07, + "loss": 0.0012, + "reward": 1.5901460647583008, + "reward_std": 0.2562961280345917, + "rewards/accuracy_reward_stage2": 0.6213959455490112, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1574 + }, + { + "completion_length": 7.484375, + "epoch": 0.2759768705099001, + "grad_norm": 18.965599614704047, + "kl": 0.076171875, + "learning_rate": 7.241983528999474e-07, + "loss": 0.0304, + "reward": 1.657869815826416, + "reward_std": 0.15086671710014343, + "rewards/accuracy_reward_stage2": 0.6578697562217712, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1575 + }, + { + "completion_length": 10.65625, + "epoch": 0.27615209391974765, + "grad_norm": 16.051869215816296, + "kl": 0.07373046875, + "learning_rate": 7.240231294900998e-07, + "loss": -0.0147, + "reward": 1.4174940586090088, + "reward_std": 0.2014116644859314, + "rewards/accuracy_reward_stage2": 0.5581189393997192, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1576 + }, + { + "completion_length": 11.71875, + "epoch": 0.27632731732959526, + "grad_norm": 28.32712176310248, + "kl": 0.04150390625, + "learning_rate": 7.238479060802522e-07, + "loss": 0.0166, + "reward": 1.4083119630813599, + "reward_std": 0.32629430294036865, + "rewards/accuracy_reward_stage2": 0.5333119630813599, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1577 + }, + { + "completion_length": 20.421875, + "epoch": 0.2765025407394428, + "grad_norm": 18.039033791626412, + "kl": 0.0634765625, + "learning_rate": 7.236726826704047e-07, + "loss": 0.0254, + "reward": 1.5804221630096436, + "reward_std": 0.16351595520973206, + "rewards/accuracy_reward_stage2": 0.5804222226142883, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1578 + }, + { + "completion_length": 9.171875, + "epoch": 0.27667776414929035, + "grad_norm": 21.66543564045071, + "kl": 0.212890625, + "learning_rate": 7.234974592605572e-07, + "loss": 0.0416, + "reward": 1.7025885581970215, + "reward_std": 0.3499017357826233, + "rewards/accuracy_reward_stage2": 0.7182136178016663, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1579 + }, + { + "completion_length": 17.84375, + "epoch": 0.2768529875591379, + "grad_norm": 18.73158665742104, + "kl": 0.05224609375, + "learning_rate": 7.233222358507096e-07, + "loss": -0.0233, + "reward": 1.4649728536605835, + "reward_std": 0.19344475865364075, + "rewards/accuracy_reward_stage2": 0.4805978834629059, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1580 + }, + { + "completion_length": 10.453125, + "epoch": 0.27702821096898544, + "grad_norm": 19.02839268753434, + "kl": 0.08740234375, + "learning_rate": 7.231470124408621e-07, + "loss": 0.0349, + "reward": 1.4809317588806152, + "reward_std": 0.26230835914611816, + "rewards/accuracy_reward_stage2": 0.6059318780899048, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1581 + }, + { + "completion_length": 12.21875, + "epoch": 0.277203434378833, + "grad_norm": 17.77505642063965, + "kl": 0.052734375, + "learning_rate": 7.229717890310145e-07, + "loss": -0.0187, + "reward": 1.6692399978637695, + "reward_std": 0.23491568863391876, + "rewards/accuracy_reward_stage2": 0.80986487865448, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1582 + }, + { + "completion_length": 8.609375, + "epoch": 0.2773786577886806, + "grad_norm": 14.097807207383974, + "kl": 0.16796875, + "learning_rate": 7.22796565621167e-07, + "loss": 0.0002, + "reward": 1.224002480506897, + "reward_std": 0.1800319105386734, + "rewards/accuracy_reward_stage2": 0.255252480506897, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1583 + }, + { + "completion_length": 8.390625, + "epoch": 0.27755388119852814, + "grad_norm": 18.685066348919854, + "kl": 0.2158203125, + "learning_rate": 7.226213422113195e-07, + "loss": 0.086, + "reward": 1.3312758207321167, + "reward_std": 0.2569296658039093, + "rewards/accuracy_reward_stage2": 0.4562758803367615, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1584 + }, + { + "completion_length": 12.75, + "epoch": 0.2777291046083757, + "grad_norm": 24.130001829627535, + "kl": 0.1962890625, + "learning_rate": 7.224461188014719e-07, + "loss": 0.0452, + "reward": 1.5319864749908447, + "reward_std": 0.31532320380210876, + "rewards/accuracy_reward_stage2": 0.5476114749908447, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1585 + }, + { + "completion_length": 9.25, + "epoch": 0.27790432801822323, + "grad_norm": 19.15274013812705, + "kl": 0.03759765625, + "learning_rate": 7.222708953916243e-07, + "loss": 0.008, + "reward": 1.5639368295669556, + "reward_std": 0.1757792979478836, + "rewards/accuracy_reward_stage2": 0.6889368295669556, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1586 + }, + { + "completion_length": 7.546875, + "epoch": 0.2780795514280708, + "grad_norm": 21.16513159458688, + "kl": 0.0576171875, + "learning_rate": 7.220956719817766e-07, + "loss": 0.0001, + "reward": 1.4314806461334229, + "reward_std": 0.2914354205131531, + "rewards/accuracy_reward_stage2": 0.44710561633110046, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1587 + }, + { + "completion_length": 7.875, + "epoch": 0.27825477483791833, + "grad_norm": 28.105192038087445, + "kl": 0.267578125, + "learning_rate": 7.219204485719291e-07, + "loss": 0.107, + "reward": 1.2831439971923828, + "reward_std": 0.19191929697990417, + "rewards/accuracy_reward_stage2": 0.533143937587738, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1588 + }, + { + "completion_length": 7.71875, + "epoch": 0.2784299982477659, + "grad_norm": 20.911599804476825, + "kl": 0.08642578125, + "learning_rate": 7.217452251620816e-07, + "loss": 0.0346, + "reward": 1.5567564964294434, + "reward_std": 0.18814368546009064, + "rewards/accuracy_reward_stage2": 0.6817566156387329, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1589 + }, + { + "completion_length": 6.484375, + "epoch": 0.2786052216576135, + "grad_norm": 18.09001762878497, + "kl": 0.057861328125, + "learning_rate": 7.21570001752234e-07, + "loss": 0.0232, + "reward": 1.5766370296478271, + "reward_std": 0.16156907379627228, + "rewards/accuracy_reward_stage2": 0.5766370296478271, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1590 + }, + { + "completion_length": 8.90625, + "epoch": 0.278780445067461, + "grad_norm": 12.938723311552208, + "kl": 0.036865234375, + "learning_rate": 7.213947783423865e-07, + "loss": 0.0147, + "reward": 1.4394537210464478, + "reward_std": 0.09740547835826874, + "rewards/accuracy_reward_stage2": 0.43945372104644775, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1591 + }, + { + "completion_length": 11.921875, + "epoch": 0.27895566847730857, + "grad_norm": 17.96221751888626, + "kl": 0.0341796875, + "learning_rate": 7.21219554932539e-07, + "loss": -0.0079, + "reward": 1.4706923961639404, + "reward_std": 0.18699324131011963, + "rewards/accuracy_reward_stage2": 0.48631754517555237, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1592 + }, + { + "completion_length": 9.015625, + "epoch": 0.2791308918871561, + "grad_norm": 18.842290019196636, + "kl": 0.07373046875, + "learning_rate": 7.210443315226914e-07, + "loss": 0.0295, + "reward": 1.8209922313690186, + "reward_std": 0.2510858178138733, + "rewards/accuracy_reward_stage2": 0.8209922909736633, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1593 + }, + { + "completion_length": 11.796875, + "epoch": 0.27930611529700367, + "grad_norm": 20.724838234141696, + "kl": 0.020263671875, + "learning_rate": 7.208691081128439e-07, + "loss": 0.0081, + "reward": 1.7598631381988525, + "reward_std": 0.1263093203306198, + "rewards/accuracy_reward_stage2": 0.7598632574081421, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1594 + }, + { + "completion_length": 11.890625, + "epoch": 0.2794813387068512, + "grad_norm": 21.423489400476658, + "kl": 0.068359375, + "learning_rate": 7.206938847029962e-07, + "loss": 0.0274, + "reward": 1.5535926818847656, + "reward_std": 0.16711866855621338, + "rewards/accuracy_reward_stage2": 0.5535926818847656, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1595 + }, + { + "completion_length": 10.515625, + "epoch": 0.2796565621166988, + "grad_norm": 14.707843957438799, + "kl": 0.03271484375, + "learning_rate": 7.205186612931487e-07, + "loss": 0.013, + "reward": 1.8087513446807861, + "reward_std": 0.07620933651924133, + "rewards/accuracy_reward_stage2": 0.8087514042854309, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1596 + }, + { + "completion_length": 6.96875, + "epoch": 0.27983178552654636, + "grad_norm": 20.941631632466663, + "kl": 0.083984375, + "learning_rate": 7.203434378833012e-07, + "loss": 0.0048, + "reward": 1.7164759635925293, + "reward_std": 0.19856837391853333, + "rewards/accuracy_reward_stage2": 0.8571010828018188, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1597 + }, + { + "completion_length": 12.390625, + "epoch": 0.2800070089363939, + "grad_norm": 13.708390554967615, + "kl": 0.1943359375, + "learning_rate": 7.201682144734536e-07, + "loss": 0.0436, + "reward": 1.4508538246154785, + "reward_std": 0.21650546789169312, + "rewards/accuracy_reward_stage2": 0.5914788246154785, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1598 + }, + { + "completion_length": 17.421875, + "epoch": 0.28018223234624146, + "grad_norm": 15.884490536295282, + "kl": 0.01385498046875, + "learning_rate": 7.199929910636061e-07, + "loss": 0.0055, + "reward": 1.6412231922149658, + "reward_std": 0.1582455039024353, + "rewards/accuracy_reward_stage2": 0.6412231922149658, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1599 + }, + { + "completion_length": 12.75, + "epoch": 0.280357455756089, + "grad_norm": 19.54963850066304, + "kl": 0.2392578125, + "learning_rate": 7.198177676537585e-07, + "loss": 0.0515, + "reward": 1.5843968391418457, + "reward_std": 0.3087541460990906, + "rewards/accuracy_reward_stage2": 0.7250218391418457, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1600 + }, + { + "completion_length": 8.796875, + "epoch": 0.28053267916593655, + "grad_norm": 11.990420921934714, + "kl": 0.1044921875, + "learning_rate": 7.196425442439109e-07, + "loss": -0.0466, + "reward": 1.8261520862579346, + "reward_std": 0.12564031779766083, + "rewards/accuracy_reward_stage2": 0.8574021458625793, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1601 + }, + { + "completion_length": 9.625, + "epoch": 0.28070790257578415, + "grad_norm": 16.45410163995305, + "kl": 0.07177734375, + "learning_rate": 7.194673208340634e-07, + "loss": -0.0153, + "reward": 1.3370466232299805, + "reward_std": 0.2418329268693924, + "rewards/accuracy_reward_stage2": 0.35267162322998047, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1602 + }, + { + "completion_length": 7.515625, + "epoch": 0.2808831259856317, + "grad_norm": 15.062055007504009, + "kl": 0.0269775390625, + "learning_rate": 7.192920974242158e-07, + "loss": 0.0108, + "reward": 1.6208202838897705, + "reward_std": 0.11473879963159561, + "rewards/accuracy_reward_stage2": 0.6208202242851257, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1603 + }, + { + "completion_length": 15.46875, + "epoch": 0.28105834939547925, + "grad_norm": 15.064200762093972, + "kl": 0.045654296875, + "learning_rate": 7.191168740143683e-07, + "loss": -0.0235, + "reward": 1.4908268451690674, + "reward_std": 0.14759297668933868, + "rewards/accuracy_reward_stage2": 0.5064517855644226, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1604 + }, + { + "completion_length": 9.53125, + "epoch": 0.2812335728053268, + "grad_norm": 18.400798164712032, + "kl": 0.28125, + "learning_rate": 7.189416506045208e-07, + "loss": 0.0809, + "reward": 1.1706050634384155, + "reward_std": 0.21874907612800598, + "rewards/accuracy_reward_stage2": 0.4362300634384155, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1605 + }, + { + "completion_length": 9.75, + "epoch": 0.28140879621517434, + "grad_norm": 16.819137140159572, + "kl": 0.07080078125, + "learning_rate": 7.187664271946731e-07, + "loss": 0.0284, + "reward": 1.6809344291687012, + "reward_std": 0.2468591034412384, + "rewards/accuracy_reward_stage2": 0.6809343099594116, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1606 + }, + { + "completion_length": 12.28125, + "epoch": 0.2815840196250219, + "grad_norm": 24.427891880001116, + "kl": 0.2197265625, + "learning_rate": 7.185912037848256e-07, + "loss": 0.0876, + "reward": 1.5013978481292725, + "reward_std": 0.2571167051792145, + "rewards/accuracy_reward_stage2": 0.6263978481292725, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1607 + }, + { + "completion_length": 9.8125, + "epoch": 0.28175924303486943, + "grad_norm": 10.410467714252185, + "kl": 0.059814453125, + "learning_rate": 7.184159803749781e-07, + "loss": 0.0143, + "reward": 1.71875, + "reward_std": 0.1246790662407875, + "rewards/accuracy_reward_stage2": 0.734375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1608 + }, + { + "completion_length": 14.9375, + "epoch": 0.28193446644471704, + "grad_norm": 29.89187810370791, + "kl": 0.050537109375, + "learning_rate": 7.182407569651305e-07, + "loss": 0.014, + "reward": 1.3833160400390625, + "reward_std": 0.2767779231071472, + "rewards/accuracy_reward_stage2": 0.3989410996437073, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1609 + }, + { + "completion_length": 12.703125, + "epoch": 0.2821096898545646, + "grad_norm": 22.31757950701303, + "kl": 0.3203125, + "learning_rate": 7.18065533555283e-07, + "loss": 0.0685, + "reward": 1.545056700706482, + "reward_std": 0.20747987926006317, + "rewards/accuracy_reward_stage2": 0.6856817007064819, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1610 + }, + { + "completion_length": 11.03125, + "epoch": 0.28228491326441213, + "grad_norm": 15.946911886957992, + "kl": 0.03125, + "learning_rate": 7.178903101454354e-07, + "loss": 0.0125, + "reward": 1.7586082220077515, + "reward_std": 0.13714845478534698, + "rewards/accuracy_reward_stage2": 0.7586082220077515, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1611 + }, + { + "completion_length": 23.9375, + "epoch": 0.2824601366742597, + "grad_norm": 21.26908901702462, + "kl": 0.26953125, + "learning_rate": 7.177150867355879e-07, + "loss": 0.1081, + "reward": 1.35392427444458, + "reward_std": 0.16395366191864014, + "rewards/accuracy_reward_stage2": 0.4789242446422577, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1612 + }, + { + "completion_length": 10.5625, + "epoch": 0.2826353600841072, + "grad_norm": 21.791260706148368, + "kl": 0.03759765625, + "learning_rate": 7.175398633257403e-07, + "loss": -0.0184, + "reward": 1.765136480331421, + "reward_std": 0.20289413630962372, + "rewards/accuracy_reward_stage2": 0.9057614803314209, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1613 + }, + { + "completion_length": 8.8125, + "epoch": 0.28281058349395477, + "grad_norm": 23.844684700591703, + "kl": 0.060791015625, + "learning_rate": 7.173646399158927e-07, + "loss": -0.0091, + "reward": 1.355473518371582, + "reward_std": 0.3000277876853943, + "rewards/accuracy_reward_stage2": 0.37109851837158203, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1614 + }, + { + "completion_length": 14.484375, + "epoch": 0.2829858069038024, + "grad_norm": 14.153123323249732, + "kl": 0.06591796875, + "learning_rate": 7.171894165060451e-07, + "loss": -0.0179, + "reward": 1.6771589517593384, + "reward_std": 0.14484737813472748, + "rewards/accuracy_reward_stage2": 0.6927839517593384, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1615 + }, + { + "completion_length": 13.734375, + "epoch": 0.2831610303136499, + "grad_norm": 17.854073307258215, + "kl": 0.0947265625, + "learning_rate": 7.170141930961976e-07, + "loss": 0.0379, + "reward": 1.5207949876785278, + "reward_std": 0.15267062187194824, + "rewards/accuracy_reward_stage2": 0.6457949876785278, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1616 + }, + { + "completion_length": 7.21875, + "epoch": 0.28333625372349747, + "grad_norm": 14.907211623261963, + "kl": 0.205078125, + "learning_rate": 7.1683896968635e-07, + "loss": -0.0062, + "reward": 1.416919231414795, + "reward_std": 0.2574279308319092, + "rewards/accuracy_reward_stage2": 0.5731692314147949, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1617 + }, + { + "completion_length": 9.59375, + "epoch": 0.283511477133345, + "grad_norm": 23.825957362220002, + "kl": 0.115234375, + "learning_rate": 7.166637462765025e-07, + "loss": 0.046, + "reward": 1.5133368968963623, + "reward_std": 0.1434442102909088, + "rewards/accuracy_reward_stage2": 0.7633370161056519, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1618 + }, + { + "completion_length": 9.671875, + "epoch": 0.28368670054319256, + "grad_norm": 12.868527094628538, + "kl": 0.05615234375, + "learning_rate": 7.164885228666549e-07, + "loss": -0.0482, + "reward": 1.6809239387512207, + "reward_std": 0.20865806937217712, + "rewards/accuracy_reward_stage2": 0.7121739387512207, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1619 + }, + { + "completion_length": 13.109375, + "epoch": 0.2838619239530401, + "grad_norm": 16.658702628163443, + "kl": 0.0118408203125, + "learning_rate": 7.163132994568074e-07, + "loss": 0.0047, + "reward": 1.34375, + "reward_std": 0.2630179226398468, + "rewards/accuracy_reward_stage2": 0.46875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1620 + }, + { + "completion_length": 12.203125, + "epoch": 0.28403714736288765, + "grad_norm": 16.104471025041327, + "kl": 0.14453125, + "learning_rate": 7.161380760469599e-07, + "loss": 0.0582, + "reward": 1.785825490951538, + "reward_std": 0.10489936918020248, + "rewards/accuracy_reward_stage2": 0.9108256101608276, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1621 + }, + { + "completion_length": 20.03125, + "epoch": 0.28421237077273526, + "grad_norm": 15.819631796987974, + "kl": 0.1083984375, + "learning_rate": 7.159628526371123e-07, + "loss": -0.0419, + "reward": 1.4067251682281494, + "reward_std": 0.20268367230892181, + "rewards/accuracy_reward_stage2": 0.5629751086235046, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1622 + }, + { + "completion_length": 7.140625, + "epoch": 0.2843875941825828, + "grad_norm": 37.538471453661955, + "kl": 0.06884765625, + "learning_rate": 7.157876292272648e-07, + "loss": 0.0275, + "reward": 1.5744768381118774, + "reward_std": 0.1758725643157959, + "rewards/accuracy_reward_stage2": 0.5744768977165222, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1623 + }, + { + "completion_length": 9.6875, + "epoch": 0.28456281759243035, + "grad_norm": 18.377628674736215, + "kl": 0.18359375, + "learning_rate": 7.156124058174173e-07, + "loss": 0.0447, + "reward": 1.4850250482559204, + "reward_std": 0.2172246277332306, + "rewards/accuracy_reward_stage2": 0.6256501078605652, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1624 + }, + { + "completion_length": 12.078125, + "epoch": 0.2847380410022779, + "grad_norm": 18.698228548749846, + "kl": 0.09130859375, + "learning_rate": 7.154371824075697e-07, + "loss": -0.052, + "reward": 1.5879881381988525, + "reward_std": 0.14833402633666992, + "rewards/accuracy_reward_stage2": 0.6192381381988525, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1625 + }, + { + "completion_length": 7.59375, + "epoch": 0.28491326441212544, + "grad_norm": 13.655609339465885, + "kl": 0.296875, + "learning_rate": 7.15261958997722e-07, + "loss": 0.031, + "reward": 1.6834176778793335, + "reward_std": 0.18731647729873657, + "rewards/accuracy_reward_stage2": 0.7146677374839783, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1626 + }, + { + "completion_length": 10.671875, + "epoch": 0.285088487821973, + "grad_norm": 23.757278282718293, + "kl": 0.13671875, + "learning_rate": 7.150867355878744e-07, + "loss": 0.0546, + "reward": 1.4906599521636963, + "reward_std": 0.10720989108085632, + "rewards/accuracy_reward_stage2": 0.4906599223613739, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1627 + }, + { + "completion_length": 10.25, + "epoch": 0.2852637112318206, + "grad_norm": 55.79335785557363, + "kl": 0.0211181640625, + "learning_rate": 7.149115121780269e-07, + "loss": -0.0306, + "reward": 1.618137240409851, + "reward_std": 0.24807269871234894, + "rewards/accuracy_reward_stage2": 0.7587622404098511, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1628 + }, + { + "completion_length": 6.921875, + "epoch": 0.28543893464166814, + "grad_norm": 18.74178145843626, + "kl": 0.1328125, + "learning_rate": 7.147362887681794e-07, + "loss": -0.0351, + "reward": 1.3534233570098877, + "reward_std": 0.1923113763332367, + "rewards/accuracy_reward_stage2": 0.3846732974052429, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1629 + }, + { + "completion_length": 9.359375, + "epoch": 0.2856141580515157, + "grad_norm": 11.461917013351266, + "kl": 0.2080078125, + "learning_rate": 7.145610653583318e-07, + "loss": 0.0833, + "reward": 1.4668022394180298, + "reward_std": 0.08062316477298737, + "rewards/accuracy_reward_stage2": 0.466802179813385, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1630 + }, + { + "completion_length": 13.796875, + "epoch": 0.28578938146136323, + "grad_norm": 22.880906576728375, + "kl": 0.1318359375, + "learning_rate": 7.143858419484843e-07, + "loss": 0.0087, + "reward": 1.6901664733886719, + "reward_std": 0.30371004343032837, + "rewards/accuracy_reward_stage2": 0.8307914733886719, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1631 + }, + { + "completion_length": 7.015625, + "epoch": 0.2859646048712108, + "grad_norm": 19.54098394102929, + "kl": 0.07275390625, + "learning_rate": 7.142106185386368e-07, + "loss": 0.0292, + "reward": 1.4868440628051758, + "reward_std": 0.1463613510131836, + "rewards/accuracy_reward_stage2": 0.48684412240982056, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1632 + }, + { + "completion_length": 10.375, + "epoch": 0.28613982828105833, + "grad_norm": 21.269052597139854, + "kl": 0.06494140625, + "learning_rate": 7.140353951287892e-07, + "loss": -0.0137, + "reward": 1.6641602516174316, + "reward_std": 0.3177988529205322, + "rewards/accuracy_reward_stage2": 0.6797853112220764, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1633 + }, + { + "completion_length": 5.234375, + "epoch": 0.28631505169090593, + "grad_norm": 16.2130956464234, + "kl": 0.0673828125, + "learning_rate": 7.138601717189417e-07, + "loss": -0.0235, + "reward": 1.3314732313156128, + "reward_std": 0.19711866974830627, + "rewards/accuracy_reward_stage2": 0.4720982015132904, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1634 + }, + { + "completion_length": 7.21875, + "epoch": 0.2864902751007535, + "grad_norm": 45.213911068141066, + "kl": 0.46484375, + "learning_rate": 7.13684948309094e-07, + "loss": 0.1418, + "reward": 1.2447587251663208, + "reward_std": 0.23937861621379852, + "rewards/accuracy_reward_stage2": 0.3853837251663208, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1635 + }, + { + "completion_length": 19.984375, + "epoch": 0.286665498510601, + "grad_norm": 16.039437320650997, + "kl": 0.09912109375, + "learning_rate": 7.135097248992465e-07, + "loss": -0.0685, + "reward": 1.687240719795227, + "reward_std": 0.16071754693984985, + "rewards/accuracy_reward_stage2": 0.7341156601905823, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1636 + }, + { + "completion_length": 13.296875, + "epoch": 0.28684072192044857, + "grad_norm": 14.849111870421927, + "kl": 0.185546875, + "learning_rate": 7.13334501489399e-07, + "loss": 0.074, + "reward": 1.274766445159912, + "reward_std": 0.1772993505001068, + "rewards/accuracy_reward_stage2": 0.5247663855552673, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1637 + }, + { + "completion_length": 13.046875, + "epoch": 0.2870159453302961, + "grad_norm": 23.99212122858826, + "kl": 0.07373046875, + "learning_rate": 7.131592780795513e-07, + "loss": -0.0338, + "reward": 1.5409257411956787, + "reward_std": 0.31629616022109985, + "rewards/accuracy_reward_stage2": 0.5721758008003235, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1638 + }, + { + "completion_length": 34.140625, + "epoch": 0.28719116874014367, + "grad_norm": 20.01264481300106, + "kl": 0.09375, + "learning_rate": 7.129840546697038e-07, + "loss": 0.0375, + "reward": 1.5125254392623901, + "reward_std": 0.23700213432312012, + "rewards/accuracy_reward_stage2": 0.5125254392623901, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1639 + }, + { + "completion_length": 9.0625, + "epoch": 0.2873663921499912, + "grad_norm": 11.016581012451287, + "kl": 0.061767578125, + "learning_rate": 7.128088312598563e-07, + "loss": 0.0247, + "reward": 1.7490017414093018, + "reward_std": 0.10017408430576324, + "rewards/accuracy_reward_stage2": 0.7490018010139465, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1640 + }, + { + "completion_length": 12.375, + "epoch": 0.2875416155598388, + "grad_norm": 25.213433901660334, + "kl": 0.23046875, + "learning_rate": 7.126336078500087e-07, + "loss": 0.1007, + "reward": 1.5582143068313599, + "reward_std": 0.2488112449645996, + "rewards/accuracy_reward_stage2": 0.8082143068313599, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1641 + }, + { + "completion_length": 7.5625, + "epoch": 0.28771683896968636, + "grad_norm": 24.57818343120992, + "kl": 0.201171875, + "learning_rate": 7.124583844401612e-07, + "loss": -0.0179, + "reward": 1.7404444217681885, + "reward_std": 0.25796931982040405, + "rewards/accuracy_reward_stage2": 0.7873194217681885, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1642 + }, + { + "completion_length": 27.875, + "epoch": 0.2878920623795339, + "grad_norm": 22.31498698944003, + "kl": 0.05712890625, + "learning_rate": 7.122831610303136e-07, + "loss": 0.0229, + "reward": 1.6970582008361816, + "reward_std": 0.2050331085920334, + "rewards/accuracy_reward_stage2": 0.6970581412315369, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1643 + }, + { + "completion_length": 16.09375, + "epoch": 0.28806728578938146, + "grad_norm": 19.38288084898735, + "kl": 0.0439453125, + "learning_rate": 7.121079376204661e-07, + "loss": 0.0175, + "reward": 1.7315735816955566, + "reward_std": 0.19836972653865814, + "rewards/accuracy_reward_stage2": 0.7315736413002014, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1644 + }, + { + "completion_length": 7.296875, + "epoch": 0.288242509199229, + "grad_norm": 20.68800808708774, + "kl": 0.04296875, + "learning_rate": 7.119327142106185e-07, + "loss": 0.0172, + "reward": 1.6252111196517944, + "reward_std": 0.2738679349422455, + "rewards/accuracy_reward_stage2": 0.6252111196517944, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1645 + }, + { + "completion_length": 9.328125, + "epoch": 0.28841773260907655, + "grad_norm": 25.418979208262513, + "kl": 0.263671875, + "learning_rate": 7.117574908007709e-07, + "loss": 0.0062, + "reward": 1.490492820739746, + "reward_std": 0.32653310894966125, + "rewards/accuracy_reward_stage2": 0.6623678207397461, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1646 + }, + { + "completion_length": 10.515625, + "epoch": 0.28859295601892415, + "grad_norm": 17.77005226261, + "kl": 0.142578125, + "learning_rate": 7.115822673909234e-07, + "loss": 0.0256, + "reward": 1.305906057357788, + "reward_std": 0.2853769063949585, + "rewards/accuracy_reward_stage2": 0.5715309381484985, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1647 + }, + { + "completion_length": 8.328125, + "epoch": 0.2887681794287717, + "grad_norm": 21.53141032151713, + "kl": 0.1259765625, + "learning_rate": 7.114070439810759e-07, + "loss": 0.0504, + "reward": 1.5774625539779663, + "reward_std": 0.19729429483413696, + "rewards/accuracy_reward_stage2": 0.5774626135826111, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1648 + }, + { + "completion_length": 9.765625, + "epoch": 0.28894340283861925, + "grad_norm": 22.13505312706258, + "kl": 0.029541015625, + "learning_rate": 7.112318205712283e-07, + "loss": -0.0273, + "reward": 1.4544023275375366, + "reward_std": 0.2900089621543884, + "rewards/accuracy_reward_stage2": 0.5950272679328918, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1649 + }, + { + "completion_length": 10.4375, + "epoch": 0.2891186262484668, + "grad_norm": 16.66684202268035, + "kl": 0.030517578125, + "learning_rate": 7.110565971613808e-07, + "loss": 0.0122, + "reward": 1.1875, + "reward_std": 0.16675157845020294, + "rewards/accuracy_reward_stage2": 0.3125, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1650 + }, + { + "completion_length": 6.1875, + "epoch": 0.28929384965831434, + "grad_norm": 7.4505253530629245, + "kl": 0.041748046875, + "learning_rate": 7.108813737515331e-07, + "loss": 0.0167, + "reward": 1.6225404739379883, + "reward_std": 0.0069564878940582275, + "rewards/accuracy_reward_stage2": 0.7475404739379883, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1651 + }, + { + "completion_length": 7.53125, + "epoch": 0.2894690730681619, + "grad_norm": 28.55696586333797, + "kl": 0.1796875, + "learning_rate": 7.107061503416856e-07, + "loss": 0.0279, + "reward": 1.49065363407135, + "reward_std": 0.29641100764274597, + "rewards/accuracy_reward_stage2": 0.5062786340713501, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1652 + }, + { + "completion_length": 11.75, + "epoch": 0.2896442964780095, + "grad_norm": 20.033273583846604, + "kl": 0.042236328125, + "learning_rate": 7.105309269318381e-07, + "loss": 0.0169, + "reward": 1.4843425750732422, + "reward_std": 0.209964781999588, + "rewards/accuracy_reward_stage2": 0.6093425750732422, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1653 + }, + { + "completion_length": 13.109375, + "epoch": 0.28981951988785704, + "grad_norm": 19.51626253625671, + "kl": 0.11669921875, + "learning_rate": 7.103557035219905e-07, + "loss": -0.1495, + "reward": 1.3871873617172241, + "reward_std": 0.32389187812805176, + "rewards/accuracy_reward_stage2": 0.4653124511241913, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 1654 + }, + { + "completion_length": 7.59375, + "epoch": 0.2899947432977046, + "grad_norm": 14.401028332479045, + "kl": 0.0184326171875, + "learning_rate": 7.101804801121429e-07, + "loss": 0.0074, + "reward": 1.8791757822036743, + "reward_std": 0.12005934864282608, + "rewards/accuracy_reward_stage2": 0.8791757225990295, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1655 + }, + { + "completion_length": 16.75, + "epoch": 0.29016996670755213, + "grad_norm": 24.425108474466274, + "kl": 0.1767578125, + "learning_rate": 7.100052567022954e-07, + "loss": 0.0268, + "reward": 1.4417062997817993, + "reward_std": 0.24762201309204102, + "rewards/accuracy_reward_stage2": 0.45733126997947693, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1656 + }, + { + "completion_length": 9.125, + "epoch": 0.2903451901173997, + "grad_norm": 15.885346234048205, + "kl": 0.08251953125, + "learning_rate": 7.098300332924478e-07, + "loss": -0.0457, + "reward": 1.750290870666504, + "reward_std": 0.21685898303985596, + "rewards/accuracy_reward_stage2": 0.7815408706665039, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1657 + }, + { + "completion_length": 7.921875, + "epoch": 0.2905204135272472, + "grad_norm": 14.827047303372789, + "kl": 0.07177734375, + "learning_rate": 7.096548098826003e-07, + "loss": 0.0286, + "reward": 1.8020378351211548, + "reward_std": 0.13259246945381165, + "rewards/accuracy_reward_stage2": 0.8020378351211548, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1658 + }, + { + "completion_length": 10.53125, + "epoch": 0.29069563693709477, + "grad_norm": 21.85151316980941, + "kl": 0.09912109375, + "learning_rate": 7.094795864727527e-07, + "loss": -0.0489, + "reward": 1.6325395107269287, + "reward_std": 0.3101848363876343, + "rewards/accuracy_reward_stage2": 0.7887896299362183, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1659 + }, + { + "completion_length": 6.265625, + "epoch": 0.2908708603469424, + "grad_norm": 18.287190634942217, + "kl": 0.0615234375, + "learning_rate": 7.093043630629052e-07, + "loss": 0.0246, + "reward": 1.5881173610687256, + "reward_std": 0.10431988537311554, + "rewards/accuracy_reward_stage2": 0.5881173610687256, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1660 + }, + { + "completion_length": 8.65625, + "epoch": 0.2910460837567899, + "grad_norm": 15.895496389770273, + "kl": 0.061279296875, + "learning_rate": 7.091291396530577e-07, + "loss": 0.0245, + "reward": 1.6019673347473145, + "reward_std": 0.12393862009048462, + "rewards/accuracy_reward_stage2": 0.6019672155380249, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1661 + }, + { + "completion_length": 13.390625, + "epoch": 0.29122130716663747, + "grad_norm": 10.816467295801187, + "kl": 0.0908203125, + "learning_rate": 7.089539162432101e-07, + "loss": -0.0078, + "reward": 1.8275662660598755, + "reward_std": 0.13626514375209808, + "rewards/accuracy_reward_stage2": 0.8431912660598755, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1662 + }, + { + "completion_length": 8.546875, + "epoch": 0.291396530576485, + "grad_norm": 15.694381347608815, + "kl": 0.1865234375, + "learning_rate": 7.087786928333626e-07, + "loss": -0.0344, + "reward": 1.4604032039642334, + "reward_std": 0.21337604522705078, + "rewards/accuracy_reward_stage2": 0.6322782039642334, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1663 + }, + { + "completion_length": 10.875, + "epoch": 0.29157175398633256, + "grad_norm": 16.364034081347608, + "kl": 0.068359375, + "learning_rate": 7.086034694235148e-07, + "loss": -0.0073, + "reward": 1.4936509132385254, + "reward_std": 0.21664029359817505, + "rewards/accuracy_reward_stage2": 0.6342759728431702, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1664 + }, + { + "completion_length": 10.09375, + "epoch": 0.2917469773961801, + "grad_norm": 26.763510925051527, + "kl": 0.35546875, + "learning_rate": 7.084282460136673e-07, + "loss": 0.0305, + "reward": 1.3588550090789795, + "reward_std": 0.25715306401252747, + "rewards/accuracy_reward_stage2": 0.5307300090789795, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1665 + }, + { + "completion_length": 10.859375, + "epoch": 0.2919222008060277, + "grad_norm": 29.69703159052776, + "kl": 0.283203125, + "learning_rate": 7.082530226038198e-07, + "loss": 0.0743, + "reward": 1.75, + "reward_std": 0.3197399973869324, + "rewards/accuracy_reward_stage2": 0.765625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1666 + }, + { + "completion_length": 9.53125, + "epoch": 0.29209742421587526, + "grad_norm": 22.667034575376125, + "kl": 0.251953125, + "learning_rate": 7.080777991939722e-07, + "loss": 0.0564, + "reward": 1.568906545639038, + "reward_std": 0.211813285946846, + "rewards/accuracy_reward_stage2": 0.8345316052436829, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1667 + }, + { + "completion_length": 17.21875, + "epoch": 0.2922726476257228, + "grad_norm": 22.412634113582342, + "kl": 0.11279296875, + "learning_rate": 7.079025757841247e-07, + "loss": 0.0021, + "reward": 1.3446143865585327, + "reward_std": 0.2523040771484375, + "rewards/accuracy_reward_stage2": 0.5008643865585327, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1668 + }, + { + "completion_length": 8.1875, + "epoch": 0.29244787103557035, + "grad_norm": 17.11055627315877, + "kl": 0.0849609375, + "learning_rate": 7.077273523742772e-07, + "loss": 0.0339, + "reward": 1.4724442958831787, + "reward_std": 0.19884838163852692, + "rewards/accuracy_reward_stage2": 0.47244423627853394, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1669 + }, + { + "completion_length": 8.828125, + "epoch": 0.2926230944454179, + "grad_norm": 18.118629529042312, + "kl": 0.115234375, + "learning_rate": 7.075521289644296e-07, + "loss": 0.0459, + "reward": 1.6591260433197021, + "reward_std": 0.16784465312957764, + "rewards/accuracy_reward_stage2": 0.6591259241104126, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1670 + }, + { + "completion_length": 15.046875, + "epoch": 0.29279831785526544, + "grad_norm": 17.44795830979665, + "kl": 0.04541015625, + "learning_rate": 7.073769055545821e-07, + "loss": -0.0482, + "reward": 1.4219658374786377, + "reward_std": 0.11053664982318878, + "rewards/accuracy_reward_stage2": 0.5782157778739929, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1671 + }, + { + "completion_length": 17.296875, + "epoch": 0.29297354126511305, + "grad_norm": 16.374551074871473, + "kl": 0.03271484375, + "learning_rate": 7.072016821447345e-07, + "loss": 0.013, + "reward": 1.5180377960205078, + "reward_std": 0.18597131967544556, + "rewards/accuracy_reward_stage2": 0.5180378556251526, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1672 + }, + { + "completion_length": 6.328125, + "epoch": 0.2931487646749606, + "grad_norm": 12.337377058119465, + "kl": 0.033935546875, + "learning_rate": 7.07026458734887e-07, + "loss": 0.0136, + "reward": 1.8476905822753906, + "reward_std": 0.08578959107398987, + "rewards/accuracy_reward_stage2": 0.8476906418800354, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1673 + }, + { + "completion_length": 11.5, + "epoch": 0.29332398808480814, + "grad_norm": 22.60062112863413, + "kl": 0.294921875, + "learning_rate": 7.068512353250395e-07, + "loss": -0.0613, + "reward": 1.504547119140625, + "reward_std": 0.2656328082084656, + "rewards/accuracy_reward_stage2": 0.582672119140625, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 1674 + }, + { + "completion_length": 10.234375, + "epoch": 0.2934992114946557, + "grad_norm": 19.392245624011455, + "kl": 0.06787109375, + "learning_rate": 7.066760119151918e-07, + "loss": 0.0273, + "reward": 1.4372313022613525, + "reward_std": 0.12203386425971985, + "rewards/accuracy_reward_stage2": 0.4372313618659973, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1675 + }, + { + "completion_length": 13.59375, + "epoch": 0.29367443490450323, + "grad_norm": 24.5100935157014, + "kl": 0.1240234375, + "learning_rate": 7.065007885053443e-07, + "loss": 0.0054, + "reward": 1.2867053747177124, + "reward_std": 0.23883283138275146, + "rewards/accuracy_reward_stage2": 0.30233034491539, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1676 + }, + { + "completion_length": 14.296875, + "epoch": 0.2938496583143508, + "grad_norm": 21.009404224525824, + "kl": 0.2119140625, + "learning_rate": 7.063255650954967e-07, + "loss": 0.0469, + "reward": 1.4832779169082642, + "reward_std": 0.22361421585083008, + "rewards/accuracy_reward_stage2": 0.6239029169082642, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1677 + }, + { + "completion_length": 5.453125, + "epoch": 0.29402488172419833, + "grad_norm": 18.31135062907212, + "kl": 0.072265625, + "learning_rate": 7.061503416856491e-07, + "loss": -0.0152, + "reward": 1.8268635272979736, + "reward_std": 0.11123533546924591, + "rewards/accuracy_reward_stage2": 0.8424884080886841, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1678 + }, + { + "completion_length": 8.625, + "epoch": 0.29420010513404593, + "grad_norm": 20.70682730963805, + "kl": 0.1884765625, + "learning_rate": 7.059751182758016e-07, + "loss": -0.0091, + "reward": 1.6978143453598022, + "reward_std": 0.3025016486644745, + "rewards/accuracy_reward_stage2": 0.7446893453598022, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1679 + }, + { + "completion_length": 8.453125, + "epoch": 0.2943753285438935, + "grad_norm": 18.365155253399227, + "kl": 0.06689453125, + "learning_rate": 7.05799894865954e-07, + "loss": -0.0174, + "reward": 1.580909252166748, + "reward_std": 0.19966112077236176, + "rewards/accuracy_reward_stage2": 0.5965343117713928, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1680 + }, + { + "completion_length": 8.1875, + "epoch": 0.294550551953741, + "grad_norm": 25.531188137039162, + "kl": 0.30859375, + "learning_rate": 7.056246714561065e-07, + "loss": 0.1237, + "reward": 1.1729844808578491, + "reward_std": 0.16541114449501038, + "rewards/accuracy_reward_stage2": 0.4229844808578491, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1681 + }, + { + "completion_length": 13.703125, + "epoch": 0.29472577536358857, + "grad_norm": 17.57522237064251, + "kl": 0.055908203125, + "learning_rate": 7.05449448046259e-07, + "loss": 0.0222, + "reward": 1.7702456712722778, + "reward_std": 0.1767645925283432, + "rewards/accuracy_reward_stage2": 0.7702457308769226, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1682 + }, + { + "completion_length": 9.515625, + "epoch": 0.2949009987734361, + "grad_norm": 14.555393826814768, + "kl": 0.06494140625, + "learning_rate": 7.052742246364114e-07, + "loss": -0.0164, + "reward": 1.7005529403686523, + "reward_std": 0.09703925251960754, + "rewards/accuracy_reward_stage2": 0.8411779999732971, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1683 + }, + { + "completion_length": 9.78125, + "epoch": 0.29507622218328367, + "grad_norm": 22.386526662514495, + "kl": 0.142578125, + "learning_rate": 7.050990012265639e-07, + "loss": 0.0569, + "reward": 1.701306939125061, + "reward_std": 0.26653289794921875, + "rewards/accuracy_reward_stage2": 0.7013068795204163, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1684 + }, + { + "completion_length": 15.75, + "epoch": 0.29525144559313127, + "grad_norm": 18.142136706976988, + "kl": 0.0576171875, + "learning_rate": 7.049237778167163e-07, + "loss": 0.023, + "reward": 1.2344558238983154, + "reward_std": 0.1106235533952713, + "rewards/accuracy_reward_stage2": 0.23445577919483185, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1685 + }, + { + "completion_length": 9.953125, + "epoch": 0.2954266690029788, + "grad_norm": 16.073543348250226, + "kl": 0.07275390625, + "learning_rate": 7.047485544068687e-07, + "loss": 0.0343, + "reward": 1.5708773136138916, + "reward_std": 0.16317957639694214, + "rewards/accuracy_reward_stage2": 0.6958773732185364, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1686 + }, + { + "completion_length": 12.640625, + "epoch": 0.29560189241282636, + "grad_norm": 23.8509920791397, + "kl": 0.011962890625, + "learning_rate": 7.045733309970212e-07, + "loss": 0.0048, + "reward": 1.6665546894073486, + "reward_std": 0.2417851835489273, + "rewards/accuracy_reward_stage2": 0.6665546894073486, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1687 + }, + { + "completion_length": 9.9375, + "epoch": 0.2957771158226739, + "grad_norm": 20.34102299001797, + "kl": 0.2451171875, + "learning_rate": 7.043981075871736e-07, + "loss": 0.0975, + "reward": 1.5598974227905273, + "reward_std": 0.1706065535545349, + "rewards/accuracy_reward_stage2": 0.6848974227905273, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1688 + }, + { + "completion_length": 27.171875, + "epoch": 0.29595233923252146, + "grad_norm": 24.778954933179413, + "kl": 0.20703125, + "learning_rate": 7.04222884177326e-07, + "loss": 0.0437, + "reward": 1.4397300481796265, + "reward_std": 0.20912772417068481, + "rewards/accuracy_reward_stage2": 0.5803550481796265, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1689 + }, + { + "completion_length": 9.984375, + "epoch": 0.296127562642369, + "grad_norm": 18.30052895572527, + "kl": 0.1416015625, + "learning_rate": 7.040476607674785e-07, + "loss": 0.015, + "reward": 1.3897716999053955, + "reward_std": 0.20197075605392456, + "rewards/accuracy_reward_stage2": 0.4053967595100403, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1690 + }, + { + "completion_length": 9.328125, + "epoch": 0.29630278605221655, + "grad_norm": 21.721276007608935, + "kl": 0.1328125, + "learning_rate": 7.038724373576309e-07, + "loss": 0.0532, + "reward": 1.6735678911209106, + "reward_std": 0.32562583684921265, + "rewards/accuracy_reward_stage2": 0.6735677719116211, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1691 + }, + { + "completion_length": 8.046875, + "epoch": 0.29647800946206415, + "grad_norm": 20.41959856286839, + "kl": 0.12890625, + "learning_rate": 7.036972139477834e-07, + "loss": 0.0513, + "reward": 1.5956950187683105, + "reward_std": 0.18200629949569702, + "rewards/accuracy_reward_stage2": 0.5956949591636658, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1692 + }, + { + "completion_length": 12.109375, + "epoch": 0.2966532328719117, + "grad_norm": 17.309247466482873, + "kl": 0.1044921875, + "learning_rate": 7.035219905379359e-07, + "loss": 0.0252, + "reward": 1.6495327949523926, + "reward_std": 0.2783252000808716, + "rewards/accuracy_reward_stage2": 0.7901579141616821, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1693 + }, + { + "completion_length": 10.421875, + "epoch": 0.29682845628175925, + "grad_norm": 11.817725673933843, + "kl": 0.07763671875, + "learning_rate": 7.033467671280882e-07, + "loss": -0.011, + "reward": 1.3649306297302246, + "reward_std": 0.07383120805025101, + "rewards/accuracy_reward_stage2": 0.38055557012557983, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1694 + }, + { + "completion_length": 10.984375, + "epoch": 0.2970036796916068, + "grad_norm": 20.20455995667041, + "kl": 0.052978515625, + "learning_rate": 7.031715437182407e-07, + "loss": 0.0211, + "reward": 1.615881085395813, + "reward_std": 0.1898079514503479, + "rewards/accuracy_reward_stage2": 0.6158811450004578, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1695 + }, + { + "completion_length": 5.859375, + "epoch": 0.29717890310145434, + "grad_norm": 20.723780937675922, + "kl": 0.1044921875, + "learning_rate": 7.029963203083931e-07, + "loss": 0.0418, + "reward": 1.6208045482635498, + "reward_std": 0.10398261994123459, + "rewards/accuracy_reward_stage2": 0.6208046078681946, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1696 + }, + { + "completion_length": 8.390625, + "epoch": 0.2973541265113019, + "grad_norm": 23.222706582027712, + "kl": 0.09765625, + "learning_rate": 7.028210968985456e-07, + "loss": 0.0391, + "reward": 1.7384235858917236, + "reward_std": 0.19045890867710114, + "rewards/accuracy_reward_stage2": 0.7384235858917236, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1697 + }, + { + "completion_length": 9.453125, + "epoch": 0.2975293499211495, + "grad_norm": 15.779788156610927, + "kl": 0.078125, + "learning_rate": 7.026458734886981e-07, + "loss": -0.013, + "reward": 1.3911956548690796, + "reward_std": 0.242641419172287, + "rewards/accuracy_reward_stage2": 0.5318205952644348, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1698 + }, + { + "completion_length": 9.53125, + "epoch": 0.29770457333099704, + "grad_norm": 22.91272507487531, + "kl": 0.1201171875, + "learning_rate": 7.024706500788505e-07, + "loss": -0.0927, + "reward": 1.5430138111114502, + "reward_std": 0.25438836216926575, + "rewards/accuracy_reward_stage2": 0.6055138111114502, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 1699 + }, + { + "completion_length": 10.625, + "epoch": 0.2978797967408446, + "grad_norm": 19.778327412160536, + "kl": 0.11328125, + "learning_rate": 7.02295426669003e-07, + "loss": 0.0453, + "reward": 1.5060014724731445, + "reward_std": 0.24018427729606628, + "rewards/accuracy_reward_stage2": 0.5060014128684998, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1700 + }, + { + "completion_length": 11.984375, + "epoch": 0.29805502015069213, + "grad_norm": 19.40855478090295, + "kl": 0.08447265625, + "learning_rate": 7.021202032591555e-07, + "loss": -0.0105, + "reward": 1.603383183479309, + "reward_std": 0.3217662572860718, + "rewards/accuracy_reward_stage2": 0.6190081238746643, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1701 + }, + { + "completion_length": 8.796875, + "epoch": 0.2982302435605397, + "grad_norm": 18.222665367205884, + "kl": 0.058837890625, + "learning_rate": 7.019449798493078e-07, + "loss": 0.0235, + "reward": 1.5814367532730103, + "reward_std": 0.1338120698928833, + "rewards/accuracy_reward_stage2": 0.5814367532730103, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1702 + }, + { + "completion_length": 11.953125, + "epoch": 0.2984054669703872, + "grad_norm": 24.934665597811787, + "kl": 0.1875, + "learning_rate": 7.017697564394603e-07, + "loss": 0.0748, + "reward": 1.6412934064865112, + "reward_std": 0.3221040666103363, + "rewards/accuracy_reward_stage2": 0.7662933468818665, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1703 + }, + { + "completion_length": 10.609375, + "epoch": 0.2985806903802348, + "grad_norm": 15.026115592854442, + "kl": 0.09814453125, + "learning_rate": 7.015945330296126e-07, + "loss": -0.0049, + "reward": 1.6630749702453613, + "reward_std": 0.16869525611400604, + "rewards/accuracy_reward_stage2": 0.6786999702453613, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1704 + }, + { + "completion_length": 11.046875, + "epoch": 0.2987559137900824, + "grad_norm": 25.677616356620522, + "kl": 0.30859375, + "learning_rate": 7.014193096197651e-07, + "loss": 0.1236, + "reward": 1.0827093124389648, + "reward_std": 0.2146224081516266, + "rewards/accuracy_reward_stage2": 0.5827093720436096, + "rewards/format_reward_stage1_pointerpad": 0.5, + "scores/accuracy_reward_stage2": 0.5, + "step": 1705 + }, + { + "completion_length": 8.890625, + "epoch": 0.2989311371999299, + "grad_norm": 16.767615751833127, + "kl": 0.008544921875, + "learning_rate": 7.012440862099176e-07, + "loss": 0.0034, + "reward": 1.5104167461395264, + "reward_std": 0.1473139077425003, + "rewards/accuracy_reward_stage2": 0.6354166269302368, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1706 + }, + { + "completion_length": 8.046875, + "epoch": 0.29910636060977747, + "grad_norm": 21.753461446874425, + "kl": 0.08447265625, + "learning_rate": 7.0106886280007e-07, + "loss": 0.0009, + "reward": 1.4956333637237549, + "reward_std": 0.26299524307250977, + "rewards/accuracy_reward_stage2": 0.5112583637237549, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1707 + }, + { + "completion_length": 10.3125, + "epoch": 0.299281584019625, + "grad_norm": 20.184864788281224, + "kl": 0.224609375, + "learning_rate": 7.008936393902225e-07, + "loss": 0.0509, + "reward": 1.5954148769378662, + "reward_std": 0.29817402362823486, + "rewards/accuracy_reward_stage2": 0.7360398769378662, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1708 + }, + { + "completion_length": 12.6875, + "epoch": 0.29945680742947256, + "grad_norm": 15.485288813738334, + "kl": 0.083984375, + "learning_rate": 7.00718415980375e-07, + "loss": -0.0064, + "reward": 1.474943995475769, + "reward_std": 0.19505015015602112, + "rewards/accuracy_reward_stage2": 0.615568995475769, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1709 + }, + { + "completion_length": 27.671875, + "epoch": 0.2996320308393201, + "grad_norm": 17.649240525297987, + "kl": 0.041748046875, + "learning_rate": 7.005431925705274e-07, + "loss": 0.0167, + "reward": 1.5623822212219238, + "reward_std": 0.2002073973417282, + "rewards/accuracy_reward_stage2": 0.5623822212219238, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1710 + }, + { + "completion_length": 7.203125, + "epoch": 0.2998072542491677, + "grad_norm": 23.44514836779095, + "kl": 0.10498046875, + "learning_rate": 7.003679691606799e-07, + "loss": 0.042, + "reward": 1.682141661643982, + "reward_std": 0.28925785422325134, + "rewards/accuracy_reward_stage2": 0.8071417808532715, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1711 + }, + { + "completion_length": 9.28125, + "epoch": 0.29998247765901526, + "grad_norm": 18.377371253216243, + "kl": 0.045166015625, + "learning_rate": 7.001927457508323e-07, + "loss": 0.018, + "reward": 1.7067670822143555, + "reward_std": 0.17415907979011536, + "rewards/accuracy_reward_stage2": 0.7067670822143555, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1712 + }, + { + "completion_length": 9.296875, + "epoch": 0.3001577010688628, + "grad_norm": 20.942129163532762, + "kl": 0.053955078125, + "learning_rate": 7.000175223409848e-07, + "loss": -0.0009, + "reward": 1.6164193153381348, + "reward_std": 0.21394112706184387, + "rewards/accuracy_reward_stage2": 0.74141925573349, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1713 + }, + { + "completion_length": 9.40625, + "epoch": 0.30033292447871035, + "grad_norm": 15.992685381176873, + "kl": 0.150390625, + "learning_rate": 6.998422989311373e-07, + "loss": 0.0267, + "reward": 1.441511869430542, + "reward_std": 0.2877542972564697, + "rewards/accuracy_reward_stage2": 0.45713692903518677, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1714 + }, + { + "completion_length": 18.140625, + "epoch": 0.3005081478885579, + "grad_norm": 20.477620250403163, + "kl": 0.0654296875, + "learning_rate": 6.996670755212895e-07, + "loss": 0.0262, + "reward": 1.519256353378296, + "reward_std": 0.20742157101631165, + "rewards/accuracy_reward_stage2": 0.5192563533782959, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1715 + }, + { + "completion_length": 12.03125, + "epoch": 0.30068337129840544, + "grad_norm": 15.828782229925874, + "kl": 0.236328125, + "learning_rate": 6.99491852111442e-07, + "loss": 0.006, + "reward": 1.2528257369995117, + "reward_std": 0.16298237442970276, + "rewards/accuracy_reward_stage2": 0.5184507966041565, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1716 + }, + { + "completion_length": 16.984375, + "epoch": 0.30085859470825305, + "grad_norm": 19.351136914365497, + "kl": 0.11376953125, + "learning_rate": 6.993166287015945e-07, + "loss": -0.0378, + "reward": 1.2651151418685913, + "reward_std": 0.26157358288764954, + "rewards/accuracy_reward_stage2": 0.4213651418685913, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1717 + }, + { + "completion_length": 16.40625, + "epoch": 0.3010338181181006, + "grad_norm": 22.03734358648279, + "kl": 0.0439453125, + "learning_rate": 6.991414052917469e-07, + "loss": 0.0176, + "reward": 1.6531357765197754, + "reward_std": 0.21220947802066803, + "rewards/accuracy_reward_stage2": 0.7781356573104858, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1718 + }, + { + "completion_length": 7.046875, + "epoch": 0.30120904152794814, + "grad_norm": 18.753470433630223, + "kl": 0.126953125, + "learning_rate": 6.989661818818994e-07, + "loss": -0.0375, + "reward": 1.7411688566207886, + "reward_std": 0.19873298704624176, + "rewards/accuracy_reward_stage2": 0.7724189162254333, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1719 + }, + { + "completion_length": 8.8125, + "epoch": 0.3013842649377957, + "grad_norm": 15.466518379156236, + "kl": 0.07080078125, + "learning_rate": 6.987909584720518e-07, + "loss": 0.0284, + "reward": 1.7029595375061035, + "reward_std": 0.12291057407855988, + "rewards/accuracy_reward_stage2": 0.827959418296814, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1720 + }, + { + "completion_length": 7.796875, + "epoch": 0.30155948834764323, + "grad_norm": 19.501181455296493, + "kl": 0.06298828125, + "learning_rate": 6.986157350622043e-07, + "loss": 0.0252, + "reward": 1.5771433115005493, + "reward_std": 0.17379529774188995, + "rewards/accuracy_reward_stage2": 0.7021433115005493, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1721 + }, + { + "completion_length": 10.421875, + "epoch": 0.3017347117574908, + "grad_norm": 22.98391134712126, + "kl": 0.2119140625, + "learning_rate": 6.984405116523568e-07, + "loss": -0.03, + "reward": 1.4428976774215698, + "reward_std": 0.2138996124267578, + "rewards/accuracy_reward_stage2": 0.5053976774215698, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 1722 + }, + { + "completion_length": 24.34375, + "epoch": 0.3019099351673384, + "grad_norm": 24.585847348472083, + "kl": 0.04638671875, + "learning_rate": 6.982652882425092e-07, + "loss": 0.0185, + "reward": 1.6923989057540894, + "reward_std": 0.1567954421043396, + "rewards/accuracy_reward_stage2": 0.8173988461494446, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1723 + }, + { + "completion_length": 10.015625, + "epoch": 0.30208515857718593, + "grad_norm": 29.306608426277936, + "kl": 0.154296875, + "learning_rate": 6.980900648326617e-07, + "loss": 0.0173, + "reward": 1.7728705406188965, + "reward_std": 0.2877151072025299, + "rewards/accuracy_reward_stage2": 0.7884955406188965, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1724 + }, + { + "completion_length": 9.578125, + "epoch": 0.3022603819870335, + "grad_norm": 20.790074690488744, + "kl": 0.11669921875, + "learning_rate": 6.979148414228141e-07, + "loss": 0.0466, + "reward": 1.8588755130767822, + "reward_std": 0.1497223675251007, + "rewards/accuracy_reward_stage2": 0.8588753938674927, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1725 + }, + { + "completion_length": 10.875, + "epoch": 0.302435605396881, + "grad_norm": 28.348773800842704, + "kl": 0.205078125, + "learning_rate": 6.977396180129665e-07, + "loss": 0.0505, + "reward": 1.4835188388824463, + "reward_std": 0.2762864828109741, + "rewards/accuracy_reward_stage2": 0.4991438388824463, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1726 + }, + { + "completion_length": 7.296875, + "epoch": 0.30261082880672857, + "grad_norm": 19.30322415984756, + "kl": 0.0859375, + "learning_rate": 6.97564394603119e-07, + "loss": 0.0344, + "reward": 1.5627442598342896, + "reward_std": 0.13243502378463745, + "rewards/accuracy_reward_stage2": 0.5627442598342896, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1727 + }, + { + "completion_length": 8.953125, + "epoch": 0.3027860522165761, + "grad_norm": 13.272637700139425, + "kl": 0.138671875, + "learning_rate": 6.973891711932713e-07, + "loss": 0.0557, + "reward": 1.1412205696105957, + "reward_std": 0.20045800507068634, + "rewards/accuracy_reward_stage2": 0.5162205696105957, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1728 + }, + { + "completion_length": 7.765625, + "epoch": 0.30296127562642367, + "grad_norm": 18.56909415470759, + "kl": 0.07958984375, + "learning_rate": 6.972139477834238e-07, + "loss": 0.0319, + "reward": 1.6612675189971924, + "reward_std": 0.19378286600112915, + "rewards/accuracy_reward_stage2": 0.6612674593925476, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1729 + }, + { + "completion_length": 8.296875, + "epoch": 0.30313649903627127, + "grad_norm": 19.996812828528885, + "kl": 0.1953125, + "learning_rate": 6.970387243735763e-07, + "loss": -0.003, + "reward": 1.4460539817810059, + "reward_std": 0.3024117946624756, + "rewards/accuracy_reward_stage2": 0.6023039221763611, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1730 + }, + { + "completion_length": 10.875, + "epoch": 0.3033117224461188, + "grad_norm": 20.133568474644466, + "kl": 0.10009765625, + "learning_rate": 6.968635009637287e-07, + "loss": -0.064, + "reward": 1.5166605710983276, + "reward_std": 0.3427537679672241, + "rewards/accuracy_reward_stage2": 0.5635355710983276, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1731 + }, + { + "completion_length": 10.078125, + "epoch": 0.30348694585596636, + "grad_norm": 26.844065801862286, + "kl": 0.263671875, + "learning_rate": 6.966882775538812e-07, + "loss": 0.1057, + "reward": 1.6550629138946533, + "reward_std": 0.18192754685878754, + "rewards/accuracy_reward_stage2": 0.7800629138946533, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1732 + }, + { + "completion_length": 7.453125, + "epoch": 0.3036621692658139, + "grad_norm": 72.76331505491501, + "kl": 0.53125, + "learning_rate": 6.965130541440337e-07, + "loss": 0.169, + "reward": 1.6019704341888428, + "reward_std": 0.19926634430885315, + "rewards/accuracy_reward_stage2": 0.7425954937934875, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1733 + }, + { + "completion_length": 10.34375, + "epoch": 0.30383739267566146, + "grad_norm": 18.421882393675595, + "kl": 0.07275390625, + "learning_rate": 6.96337830734186e-07, + "loss": -0.0591, + "reward": 1.6065335273742676, + "reward_std": 0.3355046510696411, + "rewards/accuracy_reward_stage2": 0.6377835869789124, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1734 + }, + { + "completion_length": 10.125, + "epoch": 0.304012616085509, + "grad_norm": 11.942224141003042, + "kl": 0.058837890625, + "learning_rate": 6.961626073243385e-07, + "loss": -0.0207, + "reward": 1.4618998765945435, + "reward_std": 0.07960294187068939, + "rewards/accuracy_reward_stage2": 0.47752487659454346, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1735 + }, + { + "completion_length": 9.703125, + "epoch": 0.3041878394953566, + "grad_norm": 19.31158166873272, + "kl": 0.14453125, + "learning_rate": 6.959873839144909e-07, + "loss": -0.0482, + "reward": 1.4449491500854492, + "reward_std": 0.35838043689727783, + "rewards/accuracy_reward_stage2": 0.4918241500854492, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1736 + }, + { + "completion_length": 8.84375, + "epoch": 0.30436306290520415, + "grad_norm": 26.716362549948762, + "kl": 0.0673828125, + "learning_rate": 6.958121605046434e-07, + "loss": 0.027, + "reward": 1.786990761756897, + "reward_std": 0.2307921200990677, + "rewards/accuracy_reward_stage2": 0.786990761756897, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1737 + }, + { + "completion_length": 9.578125, + "epoch": 0.3045382863150517, + "grad_norm": 13.812922311399504, + "kl": 0.1044921875, + "learning_rate": 6.956369370947959e-07, + "loss": 0.0418, + "reward": 1.4359885454177856, + "reward_std": 0.22620894014835358, + "rewards/accuracy_reward_stage2": 0.43598854541778564, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1738 + }, + { + "completion_length": 7.609375, + "epoch": 0.30471350972489925, + "grad_norm": 20.11713933240359, + "kl": 0.0908203125, + "learning_rate": 6.954617136849483e-07, + "loss": 0.0363, + "reward": 1.5567574501037598, + "reward_std": 0.1984756886959076, + "rewards/accuracy_reward_stage2": 0.5567575097084045, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1739 + }, + { + "completion_length": 19.1875, + "epoch": 0.3048887331347468, + "grad_norm": 20.825688050186635, + "kl": 0.0595703125, + "learning_rate": 6.952864902751007e-07, + "loss": 0.0238, + "reward": 1.6817889213562012, + "reward_std": 0.1523667573928833, + "rewards/accuracy_reward_stage2": 0.6817888021469116, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1740 + }, + { + "completion_length": 11.75, + "epoch": 0.30506395654459434, + "grad_norm": 24.818617359584426, + "kl": 0.3046875, + "learning_rate": 6.951112668652531e-07, + "loss": 0.0934, + "reward": 1.1904207468032837, + "reward_std": 0.2955701947212219, + "rewards/accuracy_reward_stage2": 0.4560457468032837, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1741 + }, + { + "completion_length": 14.953125, + "epoch": 0.3052391799544419, + "grad_norm": 17.298674576478618, + "kl": 0.1513671875, + "learning_rate": 6.949360434554056e-07, + "loss": 0.0605, + "reward": 1.2696678638458252, + "reward_std": 0.19690418243408203, + "rewards/accuracy_reward_stage2": 0.3946678340435028, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1742 + }, + { + "completion_length": 13.5, + "epoch": 0.3054144033642895, + "grad_norm": 18.43912249339227, + "kl": 0.287109375, + "learning_rate": 6.947608200455581e-07, + "loss": 0.0735, + "reward": 1.2864735126495361, + "reward_std": 0.29236263036727905, + "rewards/accuracy_reward_stage2": 0.4270986318588257, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1743 + }, + { + "completion_length": 14.703125, + "epoch": 0.30558962677413704, + "grad_norm": 20.53686398022577, + "kl": 0.09423828125, + "learning_rate": 6.945855966357104e-07, + "loss": 0.0377, + "reward": 1.43604576587677, + "reward_std": 0.21306610107421875, + "rewards/accuracy_reward_stage2": 0.43604573607444763, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1744 + }, + { + "completion_length": 10.375, + "epoch": 0.3057648501839846, + "grad_norm": 21.82032382446694, + "kl": 0.046875, + "learning_rate": 6.944103732258629e-07, + "loss": -0.0146, + "reward": 1.505760669708252, + "reward_std": 0.21576610207557678, + "rewards/accuracy_reward_stage2": 0.5213857293128967, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1745 + }, + { + "completion_length": 8.75, + "epoch": 0.30594007359383213, + "grad_norm": 16.48715253308781, + "kl": 0.08056640625, + "learning_rate": 6.942351498160154e-07, + "loss": -0.0041, + "reward": 1.5899578332901, + "reward_std": 0.23990005254745483, + "rewards/accuracy_reward_stage2": 0.6055828332901001, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1746 + }, + { + "completion_length": 13.5625, + "epoch": 0.3061152970036797, + "grad_norm": 25.359738427813962, + "kl": 0.10107421875, + "learning_rate": 6.940599264061678e-07, + "loss": 0.0405, + "reward": 1.7109923362731934, + "reward_std": 0.25352123379707336, + "rewards/accuracy_reward_stage2": 0.7109923362731934, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1747 + }, + { + "completion_length": 9.34375, + "epoch": 0.3062905204135272, + "grad_norm": 20.257427202231625, + "kl": 0.06884765625, + "learning_rate": 6.938847029963203e-07, + "loss": 0.0275, + "reward": 1.3980247974395752, + "reward_std": 0.181054025888443, + "rewards/accuracy_reward_stage2": 0.3980247974395752, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1748 + }, + { + "completion_length": 12.9375, + "epoch": 0.3064657438233748, + "grad_norm": 17.564095255403075, + "kl": 0.294921875, + "learning_rate": 6.937094795864727e-07, + "loss": 0.1018, + "reward": 1.5810174942016602, + "reward_std": 0.2686046361923218, + "rewards/accuracy_reward_stage2": 0.7216424942016602, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1749 + }, + { + "completion_length": 6.25, + "epoch": 0.3066409672332224, + "grad_norm": 14.375651726004179, + "kl": 0.046630859375, + "learning_rate": 6.935342561766252e-07, + "loss": -0.0255, + "reward": 1.6467738151550293, + "reward_std": 0.1914321780204773, + "rewards/accuracy_reward_stage2": 0.6623987555503845, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1750 + }, + { + "completion_length": 12.09375, + "epoch": 0.3068161906430699, + "grad_norm": 12.345937734240898, + "kl": 0.01513671875, + "learning_rate": 6.933590327667777e-07, + "loss": -0.0376, + "reward": 1.5015289783477783, + "reward_std": 0.14592652022838593, + "rewards/accuracy_reward_stage2": 0.5171540379524231, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1751 + }, + { + "completion_length": 11.0, + "epoch": 0.30699141405291747, + "grad_norm": 16.665879700484737, + "kl": 0.1904296875, + "learning_rate": 6.931838093569301e-07, + "loss": 0.0377, + "reward": 1.5536797046661377, + "reward_std": 0.3037300109863281, + "rewards/accuracy_reward_stage2": 0.5693047046661377, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1752 + }, + { + "completion_length": 12.0, + "epoch": 0.307166637462765, + "grad_norm": 18.09600917797856, + "kl": 0.1474609375, + "learning_rate": 6.930085859470825e-07, + "loss": 0.0147, + "reward": 1.3697917461395264, + "reward_std": 0.17622756958007812, + "rewards/accuracy_reward_stage2": 0.5104166269302368, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1753 + }, + { + "completion_length": 8.421875, + "epoch": 0.30734186087261256, + "grad_norm": 17.718409175442556, + "kl": 0.062255859375, + "learning_rate": 6.928333625372349e-07, + "loss": -0.0193, + "reward": 1.8201448917388916, + "reward_std": 0.188689187169075, + "rewards/accuracy_reward_stage2": 0.8357699513435364, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1754 + }, + { + "completion_length": 10.234375, + "epoch": 0.30751708428246016, + "grad_norm": 17.51134890833407, + "kl": 0.1787109375, + "learning_rate": 6.926581391273873e-07, + "loss": 0.0714, + "reward": 1.1596169471740723, + "reward_std": 0.09221327304840088, + "rewards/accuracy_reward_stage2": 0.40961694717407227, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1755 + }, + { + "completion_length": 10.390625, + "epoch": 0.3076923076923077, + "grad_norm": 21.006838236257735, + "kl": 0.224609375, + "learning_rate": 6.924829157175398e-07, + "loss": 0.0447, + "reward": 1.1669657230377197, + "reward_std": 0.3041993975639343, + "rewards/accuracy_reward_stage2": 0.5732156038284302, + "rewards/format_reward_stage1_pointerpad": 0.59375, + "scores/accuracy_reward_stage2": 0.59375, + "step": 1756 + }, + { + "completion_length": 10.5625, + "epoch": 0.30786753110215526, + "grad_norm": 17.512426542672337, + "kl": 0.08740234375, + "learning_rate": 6.923076923076922e-07, + "loss": 0.035, + "reward": 1.471048355102539, + "reward_std": 0.21887768805027008, + "rewards/accuracy_reward_stage2": 0.47104835510253906, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1757 + }, + { + "completion_length": 10.09375, + "epoch": 0.3080427545120028, + "grad_norm": 23.961599386130843, + "kl": 0.2197265625, + "learning_rate": 6.921324688978447e-07, + "loss": 0.0881, + "reward": 1.472118854522705, + "reward_std": 0.1884707510471344, + "rewards/accuracy_reward_stage2": 0.5971187949180603, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1758 + }, + { + "completion_length": 8.671875, + "epoch": 0.30821797792185035, + "grad_norm": 11.807947327421346, + "kl": 0.0185546875, + "learning_rate": 6.919572454879972e-07, + "loss": 0.0074, + "reward": 1.8020833730697632, + "reward_std": 0.15872615575790405, + "rewards/accuracy_reward_stage2": 0.8020833730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1759 + }, + { + "completion_length": 11.125, + "epoch": 0.3083932013316979, + "grad_norm": 21.153286980872252, + "kl": 0.28125, + "learning_rate": 6.917820220781496e-07, + "loss": 0.1127, + "reward": 1.3018338680267334, + "reward_std": 0.21189749240875244, + "rewards/accuracy_reward_stage2": 0.5518338680267334, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1760 + }, + { + "completion_length": 16.3125, + "epoch": 0.30856842474154544, + "grad_norm": 16.93507853922392, + "kl": 0.0927734375, + "learning_rate": 6.916067986683021e-07, + "loss": 0.0062, + "reward": 1.6600141525268555, + "reward_std": 0.14201043546199799, + "rewards/accuracy_reward_stage2": 0.6756391525268555, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1761 + }, + { + "completion_length": 12.46875, + "epoch": 0.30874364815139305, + "grad_norm": 25.109481914584034, + "kl": 0.12255859375, + "learning_rate": 6.914315752584546e-07, + "loss": 0.0048, + "reward": 1.365222692489624, + "reward_std": 0.28223758935928345, + "rewards/accuracy_reward_stage2": 0.5058478116989136, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1762 + }, + { + "completion_length": 12.46875, + "epoch": 0.3089188715612406, + "grad_norm": 17.783663309948565, + "kl": 0.10498046875, + "learning_rate": 6.91256351848607e-07, + "loss": 0.042, + "reward": 1.6931748390197754, + "reward_std": 0.31448644399642944, + "rewards/accuracy_reward_stage2": 0.6931748986244202, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1763 + }, + { + "completion_length": 9.75, + "epoch": 0.30909409497108814, + "grad_norm": 14.788676137177648, + "kl": 0.076171875, + "learning_rate": 6.910811284387594e-07, + "loss": 0.0304, + "reward": 1.4861619472503662, + "reward_std": 0.19302460551261902, + "rewards/accuracy_reward_stage2": 0.4861619770526886, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1764 + }, + { + "completion_length": 17.140625, + "epoch": 0.3092693183809357, + "grad_norm": 16.180602672992737, + "kl": 0.0595703125, + "learning_rate": 6.909059050289118e-07, + "loss": -0.0073, + "reward": 1.5507076978683472, + "reward_std": 0.14039362967014313, + "rewards/accuracy_reward_stage2": 0.5663327574729919, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1765 + }, + { + "completion_length": 7.6875, + "epoch": 0.30944454179078323, + "grad_norm": 18.944751242113163, + "kl": 0.181640625, + "learning_rate": 6.907306816190642e-07, + "loss": 0.0006, + "reward": 1.7289048433303833, + "reward_std": 0.18334315717220306, + "rewards/accuracy_reward_stage2": 0.760154664516449, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1766 + }, + { + "completion_length": 11.421875, + "epoch": 0.3096197652006308, + "grad_norm": 19.16973729447749, + "kl": 0.1494140625, + "learning_rate": 6.905554582092167e-07, + "loss": -0.052, + "reward": 1.4670138359069824, + "reward_std": 0.3148658871650696, + "rewards/accuracy_reward_stage2": 0.5138888955116272, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1767 + }, + { + "completion_length": 9.71875, + "epoch": 0.3097949886104784, + "grad_norm": 17.309743318297446, + "kl": 0.044921875, + "learning_rate": 6.903802347993691e-07, + "loss": 0.018, + "reward": 1.6352248191833496, + "reward_std": 0.28032034635543823, + "rewards/accuracy_reward_stage2": 0.6352247595787048, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1768 + }, + { + "completion_length": 9.5625, + "epoch": 0.30997021202032593, + "grad_norm": 18.063172656913363, + "kl": 0.11181640625, + "learning_rate": 6.902050113895216e-07, + "loss": -0.0092, + "reward": 1.2916667461395264, + "reward_std": 0.31406551599502563, + "rewards/accuracy_reward_stage2": 0.3229166865348816, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1769 + }, + { + "completion_length": 8.296875, + "epoch": 0.3101454354301735, + "grad_norm": 16.536948904301017, + "kl": 0.035888671875, + "learning_rate": 6.900297879796741e-07, + "loss": 0.0143, + "reward": 1.728659987449646, + "reward_std": 0.14718098938465118, + "rewards/accuracy_reward_stage2": 0.728659987449646, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1770 + }, + { + "completion_length": 7.84375, + "epoch": 0.310320658840021, + "grad_norm": 11.889469909971977, + "kl": 0.111328125, + "learning_rate": 6.898545645698265e-07, + "loss": -0.0151, + "reward": 1.8219847679138184, + "reward_std": 0.21930548548698425, + "rewards/accuracy_reward_stage2": 0.8532347083091736, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1771 + }, + { + "completion_length": 15.546875, + "epoch": 0.31049588224986857, + "grad_norm": 15.999502540982576, + "kl": 0.11328125, + "learning_rate": 6.89679341159979e-07, + "loss": -0.0765, + "reward": 1.328125, + "reward_std": 0.19939783215522766, + "rewards/accuracy_reward_stage2": 0.375, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1772 + }, + { + "completion_length": 11.1875, + "epoch": 0.3106711056597161, + "grad_norm": 15.775649129635754, + "kl": 0.049072265625, + "learning_rate": 6.895041177501314e-07, + "loss": 0.0196, + "reward": 1.67244553565979, + "reward_std": 0.1583077609539032, + "rewards/accuracy_reward_stage2": 0.6724455952644348, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1773 + }, + { + "completion_length": 11.671875, + "epoch": 0.3108463290695637, + "grad_norm": 17.128809859484743, + "kl": 0.1494140625, + "learning_rate": 6.893288943402838e-07, + "loss": -0.0652, + "reward": 1.5411221981048584, + "reward_std": 0.3266652524471283, + "rewards/accuracy_reward_stage2": 0.5879971981048584, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1774 + }, + { + "completion_length": 9.84375, + "epoch": 0.31102155247941127, + "grad_norm": 21.203604381045565, + "kl": 0.0947265625, + "learning_rate": 6.891536709304363e-07, + "loss": 0.0378, + "reward": 1.4142817258834839, + "reward_std": 0.2929390072822571, + "rewards/accuracy_reward_stage2": 0.5392817258834839, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1775 + }, + { + "completion_length": 10.1875, + "epoch": 0.3111967758892588, + "grad_norm": 18.289077348581767, + "kl": 0.1865234375, + "learning_rate": 6.889784475205887e-07, + "loss": 0.0554, + "reward": 1.5055460929870605, + "reward_std": 0.18611961603164673, + "rewards/accuracy_reward_stage2": 0.6461710333824158, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1776 + }, + { + "completion_length": 8.21875, + "epoch": 0.31137199929910636, + "grad_norm": 20.461632497981842, + "kl": 0.08984375, + "learning_rate": 6.888032241107412e-07, + "loss": 0.0358, + "reward": 1.6237359046936035, + "reward_std": 0.21165037155151367, + "rewards/accuracy_reward_stage2": 0.6393609046936035, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1777 + }, + { + "completion_length": 13.5625, + "epoch": 0.3115472227089539, + "grad_norm": 15.923489710242217, + "kl": 0.138671875, + "learning_rate": 6.886280007008937e-07, + "loss": 0.0555, + "reward": 1.4647328853607178, + "reward_std": 0.19382403790950775, + "rewards/accuracy_reward_stage2": 0.5897328853607178, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1778 + }, + { + "completion_length": 11.40625, + "epoch": 0.31172244611880146, + "grad_norm": 23.491850619048765, + "kl": 0.1572265625, + "learning_rate": 6.88452777291046e-07, + "loss": -0.0162, + "reward": 1.7011396884918213, + "reward_std": 0.21986907720565796, + "rewards/accuracy_reward_stage2": 0.8417646288871765, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1779 + }, + { + "completion_length": 21.65625, + "epoch": 0.311897669528649, + "grad_norm": 17.648231242026238, + "kl": 0.166015625, + "learning_rate": 6.882775538811985e-07, + "loss": 0.0309, + "reward": 1.2604795694351196, + "reward_std": 0.2557189464569092, + "rewards/accuracy_reward_stage2": 0.40110456943511963, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1780 + }, + { + "completion_length": 33.84375, + "epoch": 0.3120728929384966, + "grad_norm": 20.279169259088963, + "kl": 0.06689453125, + "learning_rate": 6.881023304713509e-07, + "loss": 0.0204, + "reward": 1.2711501121520996, + "reward_std": 0.11733835190534592, + "rewards/accuracy_reward_stage2": 0.5367749929428101, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1781 + }, + { + "completion_length": 13.0625, + "epoch": 0.31224811634834415, + "grad_norm": 19.349168319136698, + "kl": 0.064453125, + "learning_rate": 6.879271070615034e-07, + "loss": -0.0022, + "reward": 1.6423496007919312, + "reward_std": 0.23339146375656128, + "rewards/accuracy_reward_stage2": 0.7829747200012207, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1782 + }, + { + "completion_length": 6.5625, + "epoch": 0.3124233397581917, + "grad_norm": 19.729708472837455, + "kl": 0.11572265625, + "learning_rate": 6.877518836516559e-07, + "loss": 0.0109, + "reward": 1.6721187829971313, + "reward_std": 0.27952146530151367, + "rewards/accuracy_reward_stage2": 0.6877437233924866, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1783 + }, + { + "completion_length": 15.03125, + "epoch": 0.31259856316803925, + "grad_norm": 16.744165717397728, + "kl": 0.037841796875, + "learning_rate": 6.875766602418082e-07, + "loss": -0.0263, + "reward": 1.404069185256958, + "reward_std": 0.1922588050365448, + "rewards/accuracy_reward_stage2": 0.5446941256523132, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1784 + }, + { + "completion_length": 9.359375, + "epoch": 0.3127737865778868, + "grad_norm": 19.139819261721463, + "kl": 0.1142578125, + "learning_rate": 6.874014368319607e-07, + "loss": 0.0458, + "reward": 1.667823314666748, + "reward_std": 0.19142280519008636, + "rewards/accuracy_reward_stage2": 0.6678231954574585, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1785 + }, + { + "completion_length": 13.546875, + "epoch": 0.31294900998773434, + "grad_norm": 18.638743470121007, + "kl": 0.087890625, + "learning_rate": 6.872262134221132e-07, + "loss": 0.0351, + "reward": 1.5992480516433716, + "reward_std": 0.09372645616531372, + "rewards/accuracy_reward_stage2": 0.5992480516433716, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1786 + }, + { + "completion_length": 11.5, + "epoch": 0.31312423339758194, + "grad_norm": 15.238512705166738, + "kl": 0.08447265625, + "learning_rate": 6.870509900122656e-07, + "loss": 0.0338, + "reward": 1.466585636138916, + "reward_std": 0.13504785299301147, + "rewards/accuracy_reward_stage2": 0.591585636138916, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1787 + }, + { + "completion_length": 14.46875, + "epoch": 0.3132994568074295, + "grad_norm": 22.201999425814748, + "kl": 0.1064453125, + "learning_rate": 6.868757666024181e-07, + "loss": -0.0017, + "reward": 1.3887823820114136, + "reward_std": 0.24121630191802979, + "rewards/accuracy_reward_stage2": 0.5294073820114136, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1788 + }, + { + "completion_length": 12.140625, + "epoch": 0.31347468021727704, + "grad_norm": 16.71526611503449, + "kl": 0.061767578125, + "learning_rate": 6.867005431925705e-07, + "loss": 0.0248, + "reward": 1.3335199356079102, + "reward_std": 0.16721726953983307, + "rewards/accuracy_reward_stage2": 0.45851996541023254, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1789 + }, + { + "completion_length": 13.109375, + "epoch": 0.3136499036271246, + "grad_norm": 65.37486727195878, + "kl": 0.369140625, + "learning_rate": 6.86525319782723e-07, + "loss": 0.1083, + "reward": 1.7268718481063843, + "reward_std": 0.1315089464187622, + "rewards/accuracy_reward_stage2": 0.8831217885017395, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1790 + }, + { + "completion_length": 12.203125, + "epoch": 0.31382512703697213, + "grad_norm": 21.970841856829036, + "kl": 0.0751953125, + "learning_rate": 6.863500963728754e-07, + "loss": 0.0237, + "reward": 1.3490945100784302, + "reward_std": 0.28697898983955383, + "rewards/accuracy_reward_stage2": 0.4897195100784302, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1791 + }, + { + "completion_length": 5.40625, + "epoch": 0.3140003504468197, + "grad_norm": 14.203849936543449, + "kl": 0.06640625, + "learning_rate": 6.861748729630278e-07, + "loss": -0.0175, + "reward": 1.6782519817352295, + "reward_std": 0.12433382123708725, + "rewards/accuracy_reward_stage2": 0.6938769817352295, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1792 + }, + { + "completion_length": 12.09375, + "epoch": 0.3141755738566673, + "grad_norm": 19.317828271285112, + "kl": 0.17578125, + "learning_rate": 6.859996495531802e-07, + "loss": 0.026, + "reward": 1.4357094764709473, + "reward_std": 0.252302348613739, + "rewards/accuracy_reward_stage2": 0.5763345956802368, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1793 + }, + { + "completion_length": 10.25, + "epoch": 0.3143507972665148, + "grad_norm": 18.615946255677432, + "kl": 0.3046875, + "learning_rate": 6.858244261433327e-07, + "loss": 0.0775, + "reward": 1.4579432010650635, + "reward_std": 0.19797083735466003, + "rewards/accuracy_reward_stage2": 0.5985681414604187, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1794 + }, + { + "completion_length": 8.609375, + "epoch": 0.3145260206763624, + "grad_norm": 19.174787497092378, + "kl": 0.16015625, + "learning_rate": 6.856492027334851e-07, + "loss": 0.0213, + "reward": 1.614447832107544, + "reward_std": 0.2662124037742615, + "rewards/accuracy_reward_stage2": 0.630072832107544, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1795 + }, + { + "completion_length": 10.734375, + "epoch": 0.3147012440862099, + "grad_norm": 15.360421940382055, + "kl": 0.06787109375, + "learning_rate": 6.854739793236376e-07, + "loss": -0.0331, + "reward": 1.457749605178833, + "reward_std": 0.19533106684684753, + "rewards/accuracy_reward_stage2": 0.48899969458580017, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1796 + }, + { + "completion_length": 20.28125, + "epoch": 0.31487646749605747, + "grad_norm": 658.9442180431508, + "kl": 1.140625, + "learning_rate": 6.8529875591379e-07, + "loss": 0.409, + "reward": 1.5153186321258545, + "reward_std": 0.22788935899734497, + "rewards/accuracy_reward_stage2": 0.6559436917304993, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1797 + }, + { + "completion_length": 8.671875, + "epoch": 0.315051690905905, + "grad_norm": 20.79543805564879, + "kl": 0.21484375, + "learning_rate": 6.851235325039425e-07, + "loss": 0.0206, + "reward": 1.3769316673278809, + "reward_std": 0.2010369449853897, + "rewards/accuracy_reward_stage2": 0.5331815481185913, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1798 + }, + { + "completion_length": 9.5625, + "epoch": 0.31522691431575256, + "grad_norm": 29.209511348518387, + "kl": 0.150390625, + "learning_rate": 6.84948309094095e-07, + "loss": 0.0603, + "reward": 1.6429357528686523, + "reward_std": 0.2625489830970764, + "rewards/accuracy_reward_stage2": 0.7679356932640076, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1799 + }, + { + "completion_length": 9.71875, + "epoch": 0.31540213772560016, + "grad_norm": 16.85321625670232, + "kl": 0.0712890625, + "learning_rate": 6.847730856842474e-07, + "loss": -0.0156, + "reward": 1.6711848974227905, + "reward_std": 0.19875484704971313, + "rewards/accuracy_reward_stage2": 0.6868098974227905, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1800 + }, + { + "completion_length": 6.875, + "epoch": 0.3155773611354477, + "grad_norm": 20.034299414918525, + "kl": 0.1259765625, + "learning_rate": 6.845978622743999e-07, + "loss": 0.0015, + "reward": 1.737661600112915, + "reward_std": 0.2608322203159332, + "rewards/accuracy_reward_stage2": 0.768911600112915, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1801 + }, + { + "completion_length": 9.515625, + "epoch": 0.31575258454529526, + "grad_norm": 16.12460743639783, + "kl": 0.0986328125, + "learning_rate": 6.844226388645524e-07, + "loss": 0.0394, + "reward": 1.5717509984970093, + "reward_std": 0.15481790900230408, + "rewards/accuracy_reward_stage2": 0.6967509984970093, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1802 + }, + { + "completion_length": 10.875, + "epoch": 0.3159278079551428, + "grad_norm": 15.839550154054333, + "kl": 0.12158203125, + "learning_rate": 6.842474154547048e-07, + "loss": 0.0046, + "reward": 1.812552809715271, + "reward_std": 0.1441907435655594, + "rewards/accuracy_reward_stage2": 0.828177809715271, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1803 + }, + { + "completion_length": 9.453125, + "epoch": 0.31610303136499035, + "grad_norm": 13.23966129910894, + "kl": 0.03515625, + "learning_rate": 6.840721920448571e-07, + "loss": 0.0141, + "reward": 1.5005505084991455, + "reward_std": 0.1589551568031311, + "rewards/accuracy_reward_stage2": 0.5005505084991455, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1804 + }, + { + "completion_length": 17.421875, + "epoch": 0.3162782547748379, + "grad_norm": 20.892304554616636, + "kl": 0.087890625, + "learning_rate": 6.838969686350095e-07, + "loss": 0.0351, + "reward": 1.4543895721435547, + "reward_std": 0.18670448660850525, + "rewards/accuracy_reward_stage2": 0.45438963174819946, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1805 + }, + { + "completion_length": 8.203125, + "epoch": 0.3164534781846855, + "grad_norm": 20.082354567975813, + "kl": 0.0634765625, + "learning_rate": 6.83721745225162e-07, + "loss": 0.0253, + "reward": 1.6109774112701416, + "reward_std": 0.20097583532333374, + "rewards/accuracy_reward_stage2": 0.6109773516654968, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1806 + }, + { + "completion_length": 9.359375, + "epoch": 0.31662870159453305, + "grad_norm": 13.84361945350731, + "kl": 0.81640625, + "learning_rate": 6.835465218153145e-07, + "loss": 0.3261, + "reward": 1.1875, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward_stage2": 0.5625, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1807 + }, + { + "completion_length": 9.46875, + "epoch": 0.3168039250043806, + "grad_norm": 22.82033708517315, + "kl": 0.208984375, + "learning_rate": 6.833712984054669e-07, + "loss": 0.0014, + "reward": 1.634639859199524, + "reward_std": 0.24602361023426056, + "rewards/accuracy_reward_stage2": 0.7908898591995239, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1808 + }, + { + "completion_length": 10.515625, + "epoch": 0.31697914841422814, + "grad_norm": 20.550993721079408, + "kl": 0.158203125, + "learning_rate": 6.831960749956194e-07, + "loss": -0.0, + "reward": 1.1504223346710205, + "reward_std": 0.26793819665908813, + "rewards/accuracy_reward_stage2": 0.29104727506637573, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1809 + }, + { + "completion_length": 9.90625, + "epoch": 0.3171543718240757, + "grad_norm": 12.17435510264999, + "kl": 0.051025390625, + "learning_rate": 6.830208515857718e-07, + "loss": -0.0215, + "reward": 1.8293263912200928, + "reward_std": 0.09863705933094025, + "rewards/accuracy_reward_stage2": 0.8449514508247375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1810 + }, + { + "completion_length": 9.828125, + "epoch": 0.31732959523392323, + "grad_norm": 16.92668722984298, + "kl": 0.0966796875, + "learning_rate": 6.828456281759243e-07, + "loss": -0.0056, + "reward": 1.7856630086898804, + "reward_std": 0.26676464080810547, + "rewards/accuracy_reward_stage2": 0.8012880086898804, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1811 + }, + { + "completion_length": 6.65625, + "epoch": 0.3175048186437708, + "grad_norm": 17.29778111581455, + "kl": 0.12890625, + "learning_rate": 6.826704047660768e-07, + "loss": 0.0225, + "reward": 1.6117510795593262, + "reward_std": 0.21526718139648438, + "rewards/accuracy_reward_stage2": 0.6273760795593262, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1812 + }, + { + "completion_length": 9.734375, + "epoch": 0.3176800420536184, + "grad_norm": 13.89711888030353, + "kl": 0.037353515625, + "learning_rate": 6.824951813562291e-07, + "loss": 0.015, + "reward": 1.5531659126281738, + "reward_std": 0.0688503310084343, + "rewards/accuracy_reward_stage2": 0.553165853023529, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1813 + }, + { + "completion_length": 9.328125, + "epoch": 0.31785526546346593, + "grad_norm": 20.5060681062909, + "kl": 0.1376953125, + "learning_rate": 6.823199579463816e-07, + "loss": 0.0135, + "reward": 1.1958703994750977, + "reward_std": 0.17446595430374146, + "rewards/accuracy_reward_stage2": 0.33649545907974243, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1814 + }, + { + "completion_length": 6.484375, + "epoch": 0.3180304888733135, + "grad_norm": 16.327339834068823, + "kl": 0.0322265625, + "learning_rate": 6.821447345365341e-07, + "loss": 0.0129, + "reward": 1.6876778602600098, + "reward_std": 0.13662895560264587, + "rewards/accuracy_reward_stage2": 0.687677800655365, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1815 + }, + { + "completion_length": 11.546875, + "epoch": 0.318205712283161, + "grad_norm": 20.985490210673987, + "kl": 0.0771484375, + "learning_rate": 6.819695111266865e-07, + "loss": 0.0309, + "reward": 1.6510412693023682, + "reward_std": 0.22464652359485626, + "rewards/accuracy_reward_stage2": 0.7760413289070129, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1816 + }, + { + "completion_length": 14.3125, + "epoch": 0.31838093569300857, + "grad_norm": 25.15709767548847, + "kl": 0.08203125, + "learning_rate": 6.817942877168389e-07, + "loss": 0.0329, + "reward": 1.381837248802185, + "reward_std": 0.2987816333770752, + "rewards/accuracy_reward_stage2": 0.6318372488021851, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1817 + }, + { + "completion_length": 9.5625, + "epoch": 0.3185561591028561, + "grad_norm": 21.45777273299857, + "kl": 0.2041015625, + "learning_rate": 6.816190643069913e-07, + "loss": 0.0816, + "reward": 1.4881311655044556, + "reward_std": 0.17458751797676086, + "rewards/accuracy_reward_stage2": 0.6131311058998108, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1818 + }, + { + "completion_length": 10.890625, + "epoch": 0.3187313825127037, + "grad_norm": 22.804407689637223, + "kl": 0.34765625, + "learning_rate": 6.814438408971438e-07, + "loss": 0.1395, + "reward": 1.1429061889648438, + "reward_std": 0.2962269186973572, + "rewards/accuracy_reward_stage2": 0.39290618896484375, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1819 + }, + { + "completion_length": 11.046875, + "epoch": 0.31890660592255127, + "grad_norm": 18.228809141794585, + "kl": 0.0625, + "learning_rate": 6.812686174872963e-07, + "loss": 0.0251, + "reward": 1.617333173751831, + "reward_std": 0.19772064685821533, + "rewards/accuracy_reward_stage2": 0.742333173751831, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1820 + }, + { + "completion_length": 5.90625, + "epoch": 0.3190818293323988, + "grad_norm": 14.05008775941103, + "kl": 0.04296875, + "learning_rate": 6.810933940774487e-07, + "loss": 0.0172, + "reward": 1.57807457447052, + "reward_std": 0.1071772426366806, + "rewards/accuracy_reward_stage2": 0.57807457447052, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1821 + }, + { + "completion_length": 5.4375, + "epoch": 0.31925705274224636, + "grad_norm": 8.770429021155142, + "kl": 0.006134033203125, + "learning_rate": 6.809181706676012e-07, + "loss": 0.0025, + "reward": 1.640625, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1822 + }, + { + "completion_length": 8.78125, + "epoch": 0.3194322761520939, + "grad_norm": 16.03661282647896, + "kl": 0.0947265625, + "learning_rate": 6.807429472577537e-07, + "loss": 0.0379, + "reward": 1.28269624710083, + "reward_std": 0.1685888171195984, + "rewards/accuracy_reward_stage2": 0.2826962471008301, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1823 + }, + { + "completion_length": 9.390625, + "epoch": 0.31960749956194145, + "grad_norm": 20.147449873716155, + "kl": 0.138671875, + "learning_rate": 6.80567723847906e-07, + "loss": -0.0222, + "reward": 1.3721352815628052, + "reward_std": 0.2498307228088379, + "rewards/accuracy_reward_stage2": 0.4033852517604828, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1824 + }, + { + "completion_length": 11.140625, + "epoch": 0.31978272297178906, + "grad_norm": 22.691989026804517, + "kl": 0.2353515625, + "learning_rate": 6.803925004380585e-07, + "loss": 0.0609, + "reward": 1.6414058208465576, + "reward_std": 0.16309432685375214, + "rewards/accuracy_reward_stage2": 0.7820307016372681, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1825 + }, + { + "completion_length": 11.609375, + "epoch": 0.3199579463816366, + "grad_norm": 17.33336402597771, + "kl": 0.0947265625, + "learning_rate": 6.802172770282109e-07, + "loss": -0.0064, + "reward": 1.430842638015747, + "reward_std": 0.23729614913463593, + "rewards/accuracy_reward_stage2": 0.5714677572250366, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1826 + }, + { + "completion_length": 9.8125, + "epoch": 0.32013316979148415, + "grad_norm": 10.20804098641941, + "kl": 0.08056640625, + "learning_rate": 6.800420536183634e-07, + "loss": -0.012, + "reward": 1.8573908805847168, + "reward_std": 0.2043817788362503, + "rewards/accuracy_reward_stage2": 0.8730158805847168, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1827 + }, + { + "completion_length": 11.15625, + "epoch": 0.3203083932013317, + "grad_norm": 16.427886870766105, + "kl": 0.076171875, + "learning_rate": 6.798668302085159e-07, + "loss": -0.0138, + "reward": 1.3155391216278076, + "reward_std": 0.19373847544193268, + "rewards/accuracy_reward_stage2": 0.33116400241851807, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1828 + }, + { + "completion_length": 15.734375, + "epoch": 0.32048361661117925, + "grad_norm": 23.683707696314503, + "kl": 0.0966796875, + "learning_rate": 6.796916067986683e-07, + "loss": -0.0056, + "reward": 1.4888830184936523, + "reward_std": 0.3248387575149536, + "rewards/accuracy_reward_stage2": 0.5045079588890076, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1829 + }, + { + "completion_length": 12.828125, + "epoch": 0.3206588400210268, + "grad_norm": 22.327109222125593, + "kl": 0.0869140625, + "learning_rate": 6.795163833888207e-07, + "loss": 0.0349, + "reward": 1.7786725759506226, + "reward_std": 0.19240732491016388, + "rewards/accuracy_reward_stage2": 0.7786725163459778, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1830 + }, + { + "completion_length": 7.171875, + "epoch": 0.32083406343087434, + "grad_norm": 21.46194590157146, + "kl": 0.236328125, + "learning_rate": 6.793411599789732e-07, + "loss": 0.0848, + "reward": 1.3829593658447266, + "reward_std": 0.21105429530143738, + "rewards/accuracy_reward_stage2": 0.5235843658447266, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1831 + }, + { + "completion_length": 11.828125, + "epoch": 0.32100928684072194, + "grad_norm": 22.802588704471336, + "kl": 0.01806640625, + "learning_rate": 6.791659365691256e-07, + "loss": 0.0072, + "reward": 1.7552083730697632, + "reward_std": 0.2507331371307373, + "rewards/accuracy_reward_stage2": 0.7552083730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1832 + }, + { + "completion_length": 8.40625, + "epoch": 0.3211845102505695, + "grad_norm": 17.738512216103807, + "kl": 0.06787109375, + "learning_rate": 6.78990713159278e-07, + "loss": 0.0272, + "reward": 1.3199025392532349, + "reward_std": 0.22648808360099792, + "rewards/accuracy_reward_stage2": 0.44490253925323486, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1833 + }, + { + "completion_length": 12.53125, + "epoch": 0.32135973366041704, + "grad_norm": 17.953454178427567, + "kl": 0.1552734375, + "learning_rate": 6.788154897494304e-07, + "loss": 0.0624, + "reward": 1.4585304260253906, + "reward_std": 0.19420871138572693, + "rewards/accuracy_reward_stage2": 0.5835304856300354, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1834 + }, + { + "completion_length": 7.953125, + "epoch": 0.3215349570702646, + "grad_norm": 19.099478848368094, + "kl": 0.08740234375, + "learning_rate": 6.786402663395829e-07, + "loss": 0.0349, + "reward": 1.5200002193450928, + "reward_std": 0.24521487951278687, + "rewards/accuracy_reward_stage2": 0.6450002789497375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1835 + }, + { + "completion_length": 9.296875, + "epoch": 0.32171018048011213, + "grad_norm": 14.022198227726895, + "kl": 0.07470703125, + "learning_rate": 6.784650429297354e-07, + "loss": 0.03, + "reward": 1.5641283988952637, + "reward_std": 0.1534004509449005, + "rewards/accuracy_reward_stage2": 0.5641283392906189, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1836 + }, + { + "completion_length": 7.875, + "epoch": 0.3218854038899597, + "grad_norm": 15.95768049558501, + "kl": 0.1650390625, + "learning_rate": 6.782898195198878e-07, + "loss": 0.0661, + "reward": 1.0655455589294434, + "reward_std": 0.13622140884399414, + "rewards/accuracy_reward_stage2": 0.3155454993247986, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1837 + }, + { + "completion_length": 13.5, + "epoch": 0.3220606272998073, + "grad_norm": 30.588601350925913, + "kl": 0.244140625, + "learning_rate": 6.781145961100403e-07, + "loss": 0.0353, + "reward": 1.3970437049865723, + "reward_std": 0.23561923205852509, + "rewards/accuracy_reward_stage2": 0.662668764591217, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1838 + }, + { + "completion_length": 11.9375, + "epoch": 0.3222358507096548, + "grad_norm": 23.826287338434923, + "kl": 0.060546875, + "learning_rate": 6.779393727001928e-07, + "loss": 0.0242, + "reward": 1.3648169040679932, + "reward_std": 0.24242788553237915, + "rewards/accuracy_reward_stage2": 0.4898168742656708, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1839 + }, + { + "completion_length": 12.015625, + "epoch": 0.32241107411950237, + "grad_norm": 16.304101239213786, + "kl": 0.11572265625, + "learning_rate": 6.777641492903452e-07, + "loss": 0.0314, + "reward": 1.4843683242797852, + "reward_std": 0.15928740799427032, + "rewards/accuracy_reward_stage2": 0.49999332427978516, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1840 + }, + { + "completion_length": 12.0625, + "epoch": 0.3225862975293499, + "grad_norm": 19.02631795490684, + "kl": 0.2158203125, + "learning_rate": 6.775889258804977e-07, + "loss": 0.0419, + "reward": 1.5703623294830322, + "reward_std": 0.3020603358745575, + "rewards/accuracy_reward_stage2": 0.710987389087677, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1841 + }, + { + "completion_length": 9.875, + "epoch": 0.32276152093919747, + "grad_norm": 11.51782773806621, + "kl": 0.051513671875, + "learning_rate": 6.7741370247065e-07, + "loss": -0.0237, + "reward": 1.4839015007019043, + "reward_std": 0.1462521106004715, + "rewards/accuracy_reward_stage2": 0.4995265007019043, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1842 + }, + { + "completion_length": 21.875, + "epoch": 0.322936744349045, + "grad_norm": 22.112513404630846, + "kl": 0.1435546875, + "learning_rate": 6.772384790608024e-07, + "loss": 0.0572, + "reward": 1.2472364902496338, + "reward_std": 0.11612291634082794, + "rewards/accuracy_reward_stage2": 0.4972364008426666, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1843 + }, + { + "completion_length": 12.421875, + "epoch": 0.3231119677588926, + "grad_norm": 19.85265521887095, + "kl": 0.08251953125, + "learning_rate": 6.770632556509549e-07, + "loss": 0.0329, + "reward": 1.3744330406188965, + "reward_std": 0.17216874659061432, + "rewards/accuracy_reward_stage2": 0.3744330406188965, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1844 + }, + { + "completion_length": 8.078125, + "epoch": 0.32328719116874016, + "grad_norm": 11.854489737566515, + "kl": 0.0576171875, + "learning_rate": 6.768880322411073e-07, + "loss": 0.0231, + "reward": 1.5750248432159424, + "reward_std": 0.10127189010381699, + "rewards/accuracy_reward_stage2": 0.7000248432159424, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1845 + }, + { + "completion_length": 10.046875, + "epoch": 0.3234624145785877, + "grad_norm": 213.56026056990504, + "kl": 1.328125, + "learning_rate": 6.767128088312598e-07, + "loss": 0.5308, + "reward": 1.5306633710861206, + "reward_std": 0.20582614839076996, + "rewards/accuracy_reward_stage2": 0.6556633114814758, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1846 + }, + { + "completion_length": 7.46875, + "epoch": 0.32363763798843526, + "grad_norm": 24.61251417703207, + "kl": 0.1279296875, + "learning_rate": 6.765375854214123e-07, + "loss": 0.0512, + "reward": 1.7689133882522583, + "reward_std": 0.2784450054168701, + "rewards/accuracy_reward_stage2": 0.7689133882522583, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1847 + }, + { + "completion_length": 11.625, + "epoch": 0.3238128613982828, + "grad_norm": 21.46606276764324, + "kl": 0.279296875, + "learning_rate": 6.763623620115647e-07, + "loss": 0.0673, + "reward": 1.1931835412979126, + "reward_std": 0.28681859374046326, + "rewards/accuracy_reward_stage2": 0.4588085412979126, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1848 + }, + { + "completion_length": 9.5625, + "epoch": 0.32398808480813035, + "grad_norm": 24.369027773600227, + "kl": 0.1552734375, + "learning_rate": 6.761871386017172e-07, + "loss": 0.062, + "reward": 1.7190744876861572, + "reward_std": 0.25476139783859253, + "rewards/accuracy_reward_stage2": 0.7190744280815125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1849 + }, + { + "completion_length": 12.609375, + "epoch": 0.3241633082179779, + "grad_norm": 18.611472393606462, + "kl": 0.09423828125, + "learning_rate": 6.760119151918696e-07, + "loss": 0.0378, + "reward": 1.5120489597320557, + "reward_std": 0.2501325011253357, + "rewards/accuracy_reward_stage2": 0.5120488405227661, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1850 + }, + { + "completion_length": 11.40625, + "epoch": 0.3243385316278255, + "grad_norm": 16.87913889310574, + "kl": 0.036376953125, + "learning_rate": 6.758366917820221e-07, + "loss": 0.0145, + "reward": 1.5233901739120483, + "reward_std": 0.29552075266838074, + "rewards/accuracy_reward_stage2": 0.5233901739120483, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1851 + }, + { + "completion_length": 9.9375, + "epoch": 0.32451375503767305, + "grad_norm": 19.573316617620065, + "kl": 0.060546875, + "learning_rate": 6.756614683721746e-07, + "loss": 0.0242, + "reward": 1.6326043605804443, + "reward_std": 0.23380601406097412, + "rewards/accuracy_reward_stage2": 0.6326042413711548, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1852 + }, + { + "completion_length": 12.109375, + "epoch": 0.3246889784475206, + "grad_norm": 17.457107762068464, + "kl": 0.0654296875, + "learning_rate": 6.754862449623269e-07, + "loss": 0.0262, + "reward": 1.4427083730697632, + "reward_std": 0.17677412927150726, + "rewards/accuracy_reward_stage2": 0.4427083134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1853 + }, + { + "completion_length": 11.34375, + "epoch": 0.32486420185736814, + "grad_norm": 18.220680601291093, + "kl": 0.109375, + "learning_rate": 6.753110215524794e-07, + "loss": 0.0016, + "reward": 1.3575248718261719, + "reward_std": 0.229498952627182, + "rewards/accuracy_reward_stage2": 0.4981498122215271, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1854 + }, + { + "completion_length": 9.390625, + "epoch": 0.3250394252672157, + "grad_norm": 21.931580448265247, + "kl": 0.271484375, + "learning_rate": 6.751357981426318e-07, + "loss": 0.1087, + "reward": 1.392435908317566, + "reward_std": 0.2234029769897461, + "rewards/accuracy_reward_stage2": 0.6424359083175659, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1855 + }, + { + "completion_length": 6.515625, + "epoch": 0.32521464867706323, + "grad_norm": 13.46112374907508, + "kl": 0.05322265625, + "learning_rate": 6.749605747327842e-07, + "loss": 0.0212, + "reward": 1.5464448928833008, + "reward_std": 0.08465109765529633, + "rewards/accuracy_reward_stage2": 0.5464448928833008, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1856 + }, + { + "completion_length": 4.796875, + "epoch": 0.32538987208691084, + "grad_norm": 16.306716966882355, + "kl": 0.1513671875, + "learning_rate": 6.747853513229367e-07, + "loss": -0.0126, + "reward": 1.6233456134796143, + "reward_std": 0.17143097519874573, + "rewards/accuracy_reward_stage2": 0.6545956134796143, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1857 + }, + { + "completion_length": 12.9375, + "epoch": 0.3255650954967584, + "grad_norm": 14.849624955126052, + "kl": 0.2412109375, + "learning_rate": 6.746101279130891e-07, + "loss": 0.0526, + "reward": 1.2325433492660522, + "reward_std": 0.1239112988114357, + "rewards/accuracy_reward_stage2": 0.37316834926605225, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1858 + }, + { + "completion_length": 10.8125, + "epoch": 0.32574031890660593, + "grad_norm": 24.755112462709235, + "kl": 0.18359375, + "learning_rate": 6.744349045032416e-07, + "loss": 0.0399, + "reward": 1.4605742692947388, + "reward_std": 0.29862216114997864, + "rewards/accuracy_reward_stage2": 0.6011992692947388, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1859 + }, + { + "completion_length": 10.1875, + "epoch": 0.3259155423164535, + "grad_norm": 19.06098095277532, + "kl": 0.2119140625, + "learning_rate": 6.742596810933941e-07, + "loss": 0.0409, + "reward": 1.6791045665740967, + "reward_std": 0.21750207245349884, + "rewards/accuracy_reward_stage2": 0.8197296261787415, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1860 + }, + { + "completion_length": 7.484375, + "epoch": 0.326090765726301, + "grad_norm": 13.58793029092376, + "kl": 0.0289306640625, + "learning_rate": 6.740844576835465e-07, + "loss": 0.0116, + "reward": 1.6041667461395264, + "reward_std": 0.1746530830860138, + "rewards/accuracy_reward_stage2": 0.6041666865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1861 + }, + { + "completion_length": 8.828125, + "epoch": 0.32626598913614857, + "grad_norm": 17.06926939592597, + "kl": 0.287109375, + "learning_rate": 6.73909234273699e-07, + "loss": 0.1147, + "reward": 1.2411842346191406, + "reward_std": 0.24955028295516968, + "rewards/accuracy_reward_stage2": 0.49118414521217346, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1862 + }, + { + "completion_length": 6.296875, + "epoch": 0.3264412125459961, + "grad_norm": 21.228109264490534, + "kl": 0.05517578125, + "learning_rate": 6.737340108638514e-07, + "loss": 0.0221, + "reward": 1.6191439628601074, + "reward_std": 0.2491680532693863, + "rewards/accuracy_reward_stage2": 0.6191439032554626, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1863 + }, + { + "completion_length": 7.84375, + "epoch": 0.3266164359558437, + "grad_norm": 15.003439247671935, + "kl": 0.06298828125, + "learning_rate": 6.735587874540038e-07, + "loss": 0.0251, + "reward": 1.6949942111968994, + "reward_std": 0.1083931177854538, + "rewards/accuracy_reward_stage2": 0.694994330406189, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1864 + }, + { + "completion_length": 11.71875, + "epoch": 0.32679165936569127, + "grad_norm": 28.267395553834955, + "kl": 0.1904296875, + "learning_rate": 6.733835640441563e-07, + "loss": 0.076, + "reward": 1.5004314184188843, + "reward_std": 0.27763575315475464, + "rewards/accuracy_reward_stage2": 0.7504312992095947, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1865 + }, + { + "completion_length": 10.265625, + "epoch": 0.3269668827755388, + "grad_norm": 26.120695411410892, + "kl": 0.107421875, + "learning_rate": 6.732083406343087e-07, + "loss": 0.0431, + "reward": 1.693007230758667, + "reward_std": 0.2572506070137024, + "rewards/accuracy_reward_stage2": 0.6930071711540222, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1866 + }, + { + "completion_length": 12.21875, + "epoch": 0.32714210618538636, + "grad_norm": 15.245095474259111, + "kl": 0.0244140625, + "learning_rate": 6.730331172244612e-07, + "loss": 0.0098, + "reward": 1.7239583730697632, + "reward_std": 0.17329776287078857, + "rewards/accuracy_reward_stage2": 0.7239583730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1867 + }, + { + "completion_length": 10.046875, + "epoch": 0.3273173295952339, + "grad_norm": 16.955249727198467, + "kl": 0.0213623046875, + "learning_rate": 6.728578938146136e-07, + "loss": 0.0085, + "reward": 1.2291667461395264, + "reward_std": 0.15133953094482422, + "rewards/accuracy_reward_stage2": 0.3541666567325592, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1868 + }, + { + "completion_length": 9.015625, + "epoch": 0.32749255300508145, + "grad_norm": 32.29386433854243, + "kl": 0.16796875, + "learning_rate": 6.72682670404766e-07, + "loss": 0.0229, + "reward": 1.3476319313049316, + "reward_std": 0.156138077378273, + "rewards/accuracy_reward_stage2": 0.36325690150260925, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1869 + }, + { + "completion_length": 9.5, + "epoch": 0.32766777641492906, + "grad_norm": 12.37315895680729, + "kl": 0.0634765625, + "learning_rate": 6.725074469949185e-07, + "loss": 0.0254, + "reward": 1.488959550857544, + "reward_std": 0.07612180709838867, + "rewards/accuracy_reward_stage2": 0.6139594316482544, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1870 + }, + { + "completion_length": 9.1875, + "epoch": 0.3278429998247766, + "grad_norm": 20.2049118237751, + "kl": 0.1611328125, + "learning_rate": 6.72332223585071e-07, + "loss": 0.0203, + "reward": 1.7136660814285278, + "reward_std": 0.15727868676185608, + "rewards/accuracy_reward_stage2": 0.7292912006378174, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1871 + }, + { + "completion_length": 12.28125, + "epoch": 0.32801822323462415, + "grad_norm": 20.781043538536267, + "kl": 0.1396484375, + "learning_rate": 6.721570001752234e-07, + "loss": 0.0561, + "reward": 1.3187334537506104, + "reward_std": 0.1667216718196869, + "rewards/accuracy_reward_stage2": 0.5687333941459656, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1872 + }, + { + "completion_length": 12.671875, + "epoch": 0.3281934466444717, + "grad_norm": 15.177500825022513, + "kl": 0.043701171875, + "learning_rate": 6.719817767653758e-07, + "loss": 0.0175, + "reward": 1.38564133644104, + "reward_std": 0.12951165437698364, + "rewards/accuracy_reward_stage2": 0.38564133644104004, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1873 + }, + { + "completion_length": 10.21875, + "epoch": 0.32836867005431924, + "grad_norm": 16.988081404749867, + "kl": 0.1904296875, + "learning_rate": 6.718065533555282e-07, + "loss": 0.0763, + "reward": 1.4357813596725464, + "reward_std": 0.11432722210884094, + "rewards/accuracy_reward_stage2": 0.5607813596725464, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1874 + }, + { + "completion_length": 6.078125, + "epoch": 0.3285438934641668, + "grad_norm": 7.521942053848321, + "kl": 0.03759765625, + "learning_rate": 6.716313299456807e-07, + "loss": 0.015, + "reward": 1.671875, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward_stage2": 0.796875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1875 + }, + { + "completion_length": 8.515625, + "epoch": 0.3287191168740144, + "grad_norm": 18.43664423428494, + "kl": 0.0810546875, + "learning_rate": 6.714561065358332e-07, + "loss": -0.0119, + "reward": 1.522249460220337, + "reward_std": 0.2793046236038208, + "rewards/accuracy_reward_stage2": 0.5378744602203369, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1876 + }, + { + "completion_length": 10.78125, + "epoch": 0.32889434028386194, + "grad_norm": 23.785278163967817, + "kl": 0.044921875, + "learning_rate": 6.712808831259856e-07, + "loss": 0.018, + "reward": 1.4928114414215088, + "reward_std": 0.32112449407577515, + "rewards/accuracy_reward_stage2": 0.49281150102615356, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1877 + }, + { + "completion_length": 9.75, + "epoch": 0.3290695636937095, + "grad_norm": 10.868982497146204, + "kl": 0.037841796875, + "learning_rate": 6.711056597161381e-07, + "loss": 0.0151, + "reward": 1.6688337326049805, + "reward_std": 0.055166780948638916, + "rewards/accuracy_reward_stage2": 0.6688336730003357, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1878 + }, + { + "completion_length": 11.125, + "epoch": 0.32924478710355704, + "grad_norm": 17.995101435846504, + "kl": 0.10302734375, + "learning_rate": 6.709304363062906e-07, + "loss": -0.0028, + "reward": 1.4962878227233887, + "reward_std": 0.22154772281646729, + "rewards/accuracy_reward_stage2": 0.5119128227233887, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1879 + }, + { + "completion_length": 10.375, + "epoch": 0.3294200105134046, + "grad_norm": 15.528621184241214, + "kl": 0.06591796875, + "learning_rate": 6.707552128964429e-07, + "loss": 0.0263, + "reward": 1.7492897510528564, + "reward_std": 0.18351225554943085, + "rewards/accuracy_reward_stage2": 0.7492897510528564, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1880 + }, + { + "completion_length": 13.71875, + "epoch": 0.32959523392325213, + "grad_norm": 14.708818987692135, + "kl": 0.0751953125, + "learning_rate": 6.705799894865954e-07, + "loss": 0.03, + "reward": 1.515625, + "reward_std": 0.19939783215522766, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1881 + }, + { + "completion_length": 12.234375, + "epoch": 0.3297704573330997, + "grad_norm": 16.61658553061757, + "kl": 0.064453125, + "learning_rate": 6.704047660767477e-07, + "loss": 0.0258, + "reward": 1.690694808959961, + "reward_std": 0.19719509780406952, + "rewards/accuracy_reward_stage2": 0.8156948089599609, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1882 + }, + { + "completion_length": 22.75, + "epoch": 0.3299456807429473, + "grad_norm": 17.117396245079153, + "kl": 0.119140625, + "learning_rate": 6.702295426669002e-07, + "loss": 0.0477, + "reward": 1.3262156248092651, + "reward_std": 0.13321471214294434, + "rewards/accuracy_reward_stage2": 0.3262156844139099, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1883 + }, + { + "completion_length": 11.265625, + "epoch": 0.3301209041527948, + "grad_norm": 17.60683011247102, + "kl": 0.04541015625, + "learning_rate": 6.700543192570527e-07, + "loss": -0.0702, + "reward": 1.4391136169433594, + "reward_std": 0.3226979374885559, + "rewards/accuracy_reward_stage2": 0.5953635573387146, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1884 + }, + { + "completion_length": 8.390625, + "epoch": 0.33029612756264237, + "grad_norm": 21.465121692233303, + "kl": 0.04248046875, + "learning_rate": 6.698790958472051e-07, + "loss": 0.017, + "reward": 1.71236252784729, + "reward_std": 0.3065722584724426, + "rewards/accuracy_reward_stage2": 0.71236252784729, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1885 + }, + { + "completion_length": 11.09375, + "epoch": 0.3304713509724899, + "grad_norm": 14.173433662062596, + "kl": 0.07666015625, + "learning_rate": 6.697038724373576e-07, + "loss": 0.0307, + "reward": 1.0344958305358887, + "reward_std": 0.1303577572107315, + "rewards/accuracy_reward_stage2": 0.2844958007335663, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1886 + }, + { + "completion_length": 12.640625, + "epoch": 0.33064657438233747, + "grad_norm": 15.206400035440623, + "kl": 0.05078125, + "learning_rate": 6.6952864902751e-07, + "loss": 0.0203, + "reward": 1.540507435798645, + "reward_std": 0.1567876935005188, + "rewards/accuracy_reward_stage2": 0.6655075550079346, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1887 + }, + { + "completion_length": 12.21875, + "epoch": 0.330821797792185, + "grad_norm": 16.109514600038697, + "kl": 0.283203125, + "learning_rate": 6.693534256176625e-07, + "loss": 0.1129, + "reward": 1.6333606243133545, + "reward_std": 0.15332239866256714, + "rewards/accuracy_reward_stage2": 0.8833605647087097, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1888 + }, + { + "completion_length": 7.90625, + "epoch": 0.3309970212020326, + "grad_norm": 16.047303236328855, + "kl": 0.052001953125, + "learning_rate": 6.69178202207815e-07, + "loss": 0.0208, + "reward": 1.561516284942627, + "reward_std": 0.25483137369155884, + "rewards/accuracy_reward_stage2": 0.561516284942627, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1889 + }, + { + "completion_length": 18.453125, + "epoch": 0.33117224461188016, + "grad_norm": 16.776042014411114, + "kl": 0.11669921875, + "learning_rate": 6.690029787979674e-07, + "loss": 0.0467, + "reward": 1.7119944095611572, + "reward_std": 0.1588687002658844, + "rewards/accuracy_reward_stage2": 0.7119944095611572, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1890 + }, + { + "completion_length": 12.15625, + "epoch": 0.3313474680217277, + "grad_norm": 17.90016040850292, + "kl": 0.2080078125, + "learning_rate": 6.688277553881199e-07, + "loss": 0.039, + "reward": 1.617673635482788, + "reward_std": 0.17623630166053772, + "rewards/accuracy_reward_stage2": 0.7582985758781433, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1891 + }, + { + "completion_length": 10.703125, + "epoch": 0.33152269143157526, + "grad_norm": 23.42697168203776, + "kl": 0.1708984375, + "learning_rate": 6.686525319782724e-07, + "loss": 0.0394, + "reward": 1.5168395042419434, + "reward_std": 0.23415003716945648, + "rewards/accuracy_reward_stage2": 0.6574645042419434, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1892 + }, + { + "completion_length": 20.15625, + "epoch": 0.3316979148414228, + "grad_norm": 16.298440462486333, + "kl": 0.0279541015625, + "learning_rate": 6.684773085684246e-07, + "loss": -0.0242, + "reward": 1.6867103576660156, + "reward_std": 0.17012272775173187, + "rewards/accuracy_reward_stage2": 0.7023352384567261, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1893 + }, + { + "completion_length": 14.203125, + "epoch": 0.33187313825127035, + "grad_norm": 22.995134068750403, + "kl": 0.1376953125, + "learning_rate": 6.683020851585771e-07, + "loss": 0.0107, + "reward": 1.4166667461395264, + "reward_std": 0.3733384609222412, + "rewards/accuracy_reward_stage2": 0.4322916865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1894 + }, + { + "completion_length": 8.578125, + "epoch": 0.33204836166111795, + "grad_norm": 19.63325924492353, + "kl": 0.050537109375, + "learning_rate": 6.681268617487295e-07, + "loss": -0.0148, + "reward": 1.6966354846954346, + "reward_std": 0.20570926368236542, + "rewards/accuracy_reward_stage2": 0.8372604250907898, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1895 + }, + { + "completion_length": 13.875, + "epoch": 0.3322235850709655, + "grad_norm": 19.72172048639019, + "kl": 0.1982421875, + "learning_rate": 6.67951638338882e-07, + "loss": 0.0793, + "reward": 1.390139102935791, + "reward_std": 0.20460231602191925, + "rewards/accuracy_reward_stage2": 0.515139102935791, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1896 + }, + { + "completion_length": 10.734375, + "epoch": 0.33239880848081305, + "grad_norm": 17.034895000067117, + "kl": 0.1162109375, + "learning_rate": 6.677764149290345e-07, + "loss": 0.0024, + "reward": 1.743318796157837, + "reward_std": 0.25681620836257935, + "rewards/accuracy_reward_stage2": 0.7589437961578369, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1897 + }, + { + "completion_length": 8.3125, + "epoch": 0.3325740318906606, + "grad_norm": 13.096367317694307, + "kl": 0.09521484375, + "learning_rate": 6.676011915191869e-07, + "loss": 0.0382, + "reward": 1.7255065441131592, + "reward_std": 0.19382783770561218, + "rewards/accuracy_reward_stage2": 0.7255065441131592, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1898 + }, + { + "completion_length": 12.8125, + "epoch": 0.33274925530050814, + "grad_norm": 18.103895780983308, + "kl": 0.11376953125, + "learning_rate": 6.674259681093394e-07, + "loss": 0.0014, + "reward": 1.4381917715072632, + "reward_std": 0.2100490778684616, + "rewards/accuracy_reward_stage2": 0.4538167119026184, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1899 + }, + { + "completion_length": 12.359375, + "epoch": 0.3329244787103557, + "grad_norm": 14.372391687615288, + "kl": 0.0537109375, + "learning_rate": 6.672507446994919e-07, + "loss": -0.0226, + "reward": 1.0958333015441895, + "reward_std": 0.1708841323852539, + "rewards/accuracy_reward_stage2": 0.36145833134651184, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1900 + }, + { + "completion_length": 10.390625, + "epoch": 0.33309970212020323, + "grad_norm": 18.28746014331557, + "kl": 0.07421875, + "learning_rate": 6.670755212896443e-07, + "loss": -0.0066, + "reward": 1.7098007202148438, + "reward_std": 0.21772566437721252, + "rewards/accuracy_reward_stage2": 0.725425660610199, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1901 + }, + { + "completion_length": 22.21875, + "epoch": 0.33327492553005084, + "grad_norm": 33.34818803522203, + "kl": 0.0517578125, + "learning_rate": 6.669002978797968e-07, + "loss": -0.0234, + "reward": 1.6470694541931152, + "reward_std": 0.17796316742897034, + "rewards/accuracy_reward_stage2": 0.6626943349838257, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1902 + }, + { + "completion_length": 14.875, + "epoch": 0.3334501489398984, + "grad_norm": 19.340941210498876, + "kl": 0.08642578125, + "learning_rate": 6.667250744699491e-07, + "loss": -0.0514, + "reward": 1.5270521640777588, + "reward_std": 0.32001104950904846, + "rewards/accuracy_reward_stage2": 0.5583021640777588, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1903 + }, + { + "completion_length": 8.75, + "epoch": 0.33362537234974593, + "grad_norm": 19.62612387488368, + "kl": 0.1708984375, + "learning_rate": 6.665498510601016e-07, + "loss": 0.0683, + "reward": 1.6385350227355957, + "reward_std": 0.0839998871088028, + "rewards/accuracy_reward_stage2": 0.7635350227355957, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1904 + }, + { + "completion_length": 12.109375, + "epoch": 0.3338005957595935, + "grad_norm": 13.862894810729566, + "kl": 0.0184326171875, + "learning_rate": 6.663746276502541e-07, + "loss": 0.0074, + "reward": 1.5989583730697632, + "reward_std": 0.13459712266921997, + "rewards/accuracy_reward_stage2": 0.5989583134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1905 + }, + { + "completion_length": 14.015625, + "epoch": 0.333975819169441, + "grad_norm": 21.690891130630494, + "kl": 0.275390625, + "learning_rate": 6.661994042404064e-07, + "loss": 0.0112, + "reward": 1.396896243095398, + "reward_std": 0.23912891745567322, + "rewards/accuracy_reward_stage2": 0.5687711834907532, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1906 + }, + { + "completion_length": 10.25, + "epoch": 0.33415104257928857, + "grad_norm": 16.18886844350906, + "kl": 0.045654296875, + "learning_rate": 6.660241808305589e-07, + "loss": 0.0182, + "reward": 1.7281818389892578, + "reward_std": 0.18544113636016846, + "rewards/accuracy_reward_stage2": 0.728181779384613, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1907 + }, + { + "completion_length": 11.53125, + "epoch": 0.3343262659891362, + "grad_norm": 8.774059095589669, + "kl": 0.06494140625, + "learning_rate": 6.658489574207114e-07, + "loss": -0.0182, + "reward": 1.3096954822540283, + "reward_std": 0.11100947856903076, + "rewards/accuracy_reward_stage2": 0.3253205120563507, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1908 + }, + { + "completion_length": 8.671875, + "epoch": 0.3345014893989837, + "grad_norm": 26.765411058532464, + "kl": 0.181640625, + "learning_rate": 6.656737340108638e-07, + "loss": 0.0438, + "reward": 1.8735935688018799, + "reward_std": 0.25263744592666626, + "rewards/accuracy_reward_stage2": 0.8892185688018799, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1909 + }, + { + "completion_length": 13.78125, + "epoch": 0.33467671280883127, + "grad_norm": 16.654080115758653, + "kl": 0.042724609375, + "learning_rate": 6.654985106010163e-07, + "loss": 0.0171, + "reward": 1.5321245193481445, + "reward_std": 0.16976571083068848, + "rewards/accuracy_reward_stage2": 0.5321245193481445, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1910 + }, + { + "completion_length": 10.578125, + "epoch": 0.3348519362186788, + "grad_norm": 22.42678703367036, + "kl": 0.279296875, + "learning_rate": 6.653232871911687e-07, + "loss": 0.0479, + "reward": 1.2796870470046997, + "reward_std": 0.23620405793190002, + "rewards/accuracy_reward_stage2": 0.5609370470046997, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1911 + }, + { + "completion_length": 10.859375, + "epoch": 0.33502715962852636, + "grad_norm": 29.145456284362375, + "kl": 0.053955078125, + "learning_rate": 6.651480637813211e-07, + "loss": 0.0216, + "reward": 1.7155293226242065, + "reward_std": 0.15784859657287598, + "rewards/accuracy_reward_stage2": 0.8405293226242065, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1912 + }, + { + "completion_length": 29.1875, + "epoch": 0.3352023830383739, + "grad_norm": 17.007278396942223, + "kl": 0.0556640625, + "learning_rate": 6.649728403714736e-07, + "loss": -0.0637, + "reward": 1.624013900756836, + "reward_std": 0.24514907598495483, + "rewards/accuracy_reward_stage2": 0.6552638411521912, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1913 + }, + { + "completion_length": 13.125, + "epoch": 0.33537760644822145, + "grad_norm": 22.398287940378502, + "kl": 0.142578125, + "learning_rate": 6.64797616961626e-07, + "loss": 0.0262, + "reward": 1.3815398216247559, + "reward_std": 0.299510657787323, + "rewards/accuracy_reward_stage2": 0.5221648216247559, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1914 + }, + { + "completion_length": 11.09375, + "epoch": 0.33555282985806906, + "grad_norm": 21.564872684074892, + "kl": 0.2177734375, + "learning_rate": 6.646223935517785e-07, + "loss": 0.0304, + "reward": 1.7198235988616943, + "reward_std": 0.27606385946273804, + "rewards/accuracy_reward_stage2": 0.7510735988616943, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1915 + }, + { + "completion_length": 10.8125, + "epoch": 0.3357280532679166, + "grad_norm": 24.75448251141867, + "kl": 0.107421875, + "learning_rate": 6.64447170141931e-07, + "loss": 0.0214, + "reward": 1.3904714584350586, + "reward_std": 0.2850678861141205, + "rewards/accuracy_reward_stage2": 0.5310965180397034, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1916 + }, + { + "completion_length": 20.8125, + "epoch": 0.33590327667776415, + "grad_norm": 18.343565887362885, + "kl": 0.14453125, + "learning_rate": 6.642719467320834e-07, + "loss": 0.0578, + "reward": 1.4843181371688843, + "reward_std": 0.11950040608644485, + "rewards/accuracy_reward_stage2": 0.6093181371688843, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1917 + }, + { + "completion_length": 15.0, + "epoch": 0.3360785000876117, + "grad_norm": 16.500056251343526, + "kl": 0.05224609375, + "learning_rate": 6.640967233222359e-07, + "loss": 0.0209, + "reward": 1.7818292379379272, + "reward_std": 0.15213115513324738, + "rewards/accuracy_reward_stage2": 0.7818291783332825, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1918 + }, + { + "completion_length": 11.5, + "epoch": 0.33625372349745924, + "grad_norm": 23.112114220771804, + "kl": 0.158203125, + "learning_rate": 6.639214999123882e-07, + "loss": 0.0634, + "reward": 1.6982868909835815, + "reward_std": 0.29553136229515076, + "rewards/accuracy_reward_stage2": 0.6982868313789368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1919 + }, + { + "completion_length": 13.28125, + "epoch": 0.3364289469073068, + "grad_norm": 26.025706676782015, + "kl": 0.294921875, + "learning_rate": 6.637462765025407e-07, + "loss": 0.0736, + "reward": 1.2600409984588623, + "reward_std": 0.25321489572525024, + "rewards/accuracy_reward_stage2": 0.4006659686565399, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1920 + }, + { + "completion_length": 18.734375, + "epoch": 0.3366041703171544, + "grad_norm": 16.870784749615115, + "kl": 0.1953125, + "learning_rate": 6.635710530926932e-07, + "loss": 0.0007, + "reward": 1.2553613185882568, + "reward_std": 0.16248267889022827, + "rewards/accuracy_reward_stage2": 0.41161128878593445, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1921 + }, + { + "completion_length": 5.203125, + "epoch": 0.33677939372700194, + "grad_norm": 11.88597080650647, + "kl": 0.08740234375, + "learning_rate": 6.633958296828455e-07, + "loss": -0.0092, + "reward": 1.796875, + "reward_std": 0.19044628739356995, + "rewards/accuracy_reward_stage2": 0.8125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1922 + }, + { + "completion_length": 8.8125, + "epoch": 0.3369546171368495, + "grad_norm": 18.00565963799991, + "kl": 0.1044921875, + "learning_rate": 6.63220606272998e-07, + "loss": -0.0396, + "reward": 1.3730442523956299, + "reward_std": 0.24149462580680847, + "rewards/accuracy_reward_stage2": 0.40429437160491943, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1923 + }, + { + "completion_length": 11.953125, + "epoch": 0.33712984054669703, + "grad_norm": 21.296992883781495, + "kl": 0.24609375, + "learning_rate": 6.630453828631505e-07, + "loss": 0.0543, + "reward": 1.3399149179458618, + "reward_std": 0.19342654943466187, + "rewards/accuracy_reward_stage2": 0.4805399179458618, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1924 + }, + { + "completion_length": 6.296875, + "epoch": 0.3373050639565446, + "grad_norm": 18.698121586193267, + "kl": 0.1923828125, + "learning_rate": 6.628701594533029e-07, + "loss": -0.0437, + "reward": 1.8635075092315674, + "reward_std": 0.2860381007194519, + "rewards/accuracy_reward_stage2": 0.9103825092315674, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1925 + }, + { + "completion_length": 11.796875, + "epoch": 0.33748028736639213, + "grad_norm": 15.837593428346915, + "kl": 0.059326171875, + "learning_rate": 6.626949360434554e-07, + "loss": -0.0205, + "reward": 1.519178867340088, + "reward_std": 0.15417931973934174, + "rewards/accuracy_reward_stage2": 0.5348039269447327, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1926 + }, + { + "completion_length": 10.15625, + "epoch": 0.33765551077623973, + "grad_norm": 22.356629840600462, + "kl": 0.201171875, + "learning_rate": 6.625197126336078e-07, + "loss": -0.0407, + "reward": 1.5961397886276245, + "reward_std": 0.3365238308906555, + "rewards/accuracy_reward_stage2": 0.6430148482322693, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1927 + }, + { + "completion_length": 9.640625, + "epoch": 0.3378307341860873, + "grad_norm": 19.981589308553318, + "kl": 0.228515625, + "learning_rate": 6.623444892237603e-07, + "loss": 0.0523, + "reward": 1.538138508796692, + "reward_std": 0.3606716990470886, + "rewards/accuracy_reward_stage2": 0.6787635087966919, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1928 + }, + { + "completion_length": 9.125, + "epoch": 0.3380059575959348, + "grad_norm": 21.779944693905204, + "kl": 0.19921875, + "learning_rate": 6.621692658139128e-07, + "loss": 0.0246, + "reward": 1.6251780986785889, + "reward_std": 0.30405157804489136, + "rewards/accuracy_reward_stage2": 0.6564280986785889, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1929 + }, + { + "completion_length": 10.90625, + "epoch": 0.33818118100578237, + "grad_norm": 17.360682645924115, + "kl": 0.1181640625, + "learning_rate": 6.619940424040652e-07, + "loss": 0.0074, + "reward": 1.4416460990905762, + "reward_std": 0.15304405987262726, + "rewards/accuracy_reward_stage2": 0.5822710394859314, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1930 + }, + { + "completion_length": 9.421875, + "epoch": 0.3383564044156299, + "grad_norm": 15.773455335638124, + "kl": 0.1123046875, + "learning_rate": 6.618188189942176e-07, + "loss": 0.0009, + "reward": 1.5871254205703735, + "reward_std": 0.14930549263954163, + "rewards/accuracy_reward_stage2": 0.6027504205703735, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1931 + }, + { + "completion_length": 11.0, + "epoch": 0.33853162782547747, + "grad_norm": 18.534239535030164, + "kl": 0.04296875, + "learning_rate": 6.6164359558437e-07, + "loss": 0.0172, + "reward": 1.6630206108093262, + "reward_std": 0.1501636952161789, + "rewards/accuracy_reward_stage2": 0.6630206108093262, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1932 + }, + { + "completion_length": 11.328125, + "epoch": 0.338706851235325, + "grad_norm": 16.023803016884205, + "kl": 0.0341796875, + "learning_rate": 6.614683721745224e-07, + "loss": 0.0137, + "reward": 1.7510204315185547, + "reward_std": 0.07846297323703766, + "rewards/accuracy_reward_stage2": 0.7510203123092651, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1933 + }, + { + "completion_length": 11.453125, + "epoch": 0.3388820746451726, + "grad_norm": 21.742080959952997, + "kl": 0.076171875, + "learning_rate": 6.612931487646749e-07, + "loss": 0.0304, + "reward": 1.41162109375, + "reward_std": 0.3468879461288452, + "rewards/accuracy_reward_stage2": 0.4116211533546448, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1934 + }, + { + "completion_length": 10.625, + "epoch": 0.33905729805502016, + "grad_norm": 15.14521136054857, + "kl": 0.18359375, + "learning_rate": 6.611179253548273e-07, + "loss": 0.0733, + "reward": 1.47281813621521, + "reward_std": 0.13576127588748932, + "rewards/accuracy_reward_stage2": 0.59781813621521, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1935 + }, + { + "completion_length": 9.671875, + "epoch": 0.3392325214648677, + "grad_norm": 14.546117917073072, + "kl": 0.09619140625, + "learning_rate": 6.609427019449798e-07, + "loss": -0.0056, + "reward": 1.7761536836624146, + "reward_std": 0.09021301567554474, + "rewards/accuracy_reward_stage2": 0.7917786240577698, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1936 + }, + { + "completion_length": 9.8125, + "epoch": 0.33940774487471526, + "grad_norm": 13.956049087009886, + "kl": 0.12451171875, + "learning_rate": 6.607674785351323e-07, + "loss": 0.0497, + "reward": 1.6214237213134766, + "reward_std": 0.11793522536754608, + "rewards/accuracy_reward_stage2": 0.7464236617088318, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1937 + }, + { + "completion_length": 11.5625, + "epoch": 0.3395829682845628, + "grad_norm": 10.727183178173318, + "kl": 0.07958984375, + "learning_rate": 6.605922551252847e-07, + "loss": -0.0124, + "reward": 1.488537073135376, + "reward_std": 0.1613566279411316, + "rewards/accuracy_reward_stage2": 0.629162073135376, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1938 + }, + { + "completion_length": 6.078125, + "epoch": 0.33975819169441035, + "grad_norm": 20.65504262119513, + "kl": 0.068359375, + "learning_rate": 6.604170317154372e-07, + "loss": 0.0274, + "reward": 1.7564607858657837, + "reward_std": 0.16308678686618805, + "rewards/accuracy_reward_stage2": 0.7564607262611389, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1939 + }, + { + "completion_length": 8.40625, + "epoch": 0.33993341510425795, + "grad_norm": 17.045082527702256, + "kl": 0.1298828125, + "learning_rate": 6.602418083055897e-07, + "loss": -0.035, + "reward": 1.672278642654419, + "reward_std": 0.2258690595626831, + "rewards/accuracy_reward_stage2": 0.7035285830497742, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1940 + }, + { + "completion_length": 9.40625, + "epoch": 0.3401086385141055, + "grad_norm": 20.15182148776487, + "kl": 0.1943359375, + "learning_rate": 6.600665848957421e-07, + "loss": 0.0778, + "reward": 1.5685465335845947, + "reward_std": 0.2734663784503937, + "rewards/accuracy_reward_stage2": 0.6935466527938843, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1941 + }, + { + "completion_length": 14.28125, + "epoch": 0.34028386192395305, + "grad_norm": 16.56521479852848, + "kl": 0.111328125, + "learning_rate": 6.598913614858945e-07, + "loss": 0.0445, + "reward": 1.4376866817474365, + "reward_std": 0.13336199522018433, + "rewards/accuracy_reward_stage2": 0.4376866817474365, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1942 + }, + { + "completion_length": 9.390625, + "epoch": 0.3404590853338006, + "grad_norm": 18.12517967513942, + "kl": 0.0771484375, + "learning_rate": 6.597161380760469e-07, + "loss": 0.0308, + "reward": 1.4623807668685913, + "reward_std": 0.18650619685649872, + "rewards/accuracy_reward_stage2": 0.5873807668685913, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1943 + }, + { + "completion_length": 11.359375, + "epoch": 0.34063430874364814, + "grad_norm": 20.774634464244237, + "kl": 0.11083984375, + "learning_rate": 6.595409146661993e-07, + "loss": 0.0164, + "reward": 1.4170732498168945, + "reward_std": 0.26271551847457886, + "rewards/accuracy_reward_stage2": 0.43269819021224976, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1944 + }, + { + "completion_length": 10.3125, + "epoch": 0.3408095321534957, + "grad_norm": 21.154764502543014, + "kl": 0.1630859375, + "learning_rate": 6.593656912563518e-07, + "loss": 0.0655, + "reward": 1.6541085243225098, + "reward_std": 0.20481356978416443, + "rewards/accuracy_reward_stage2": 0.779108464717865, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1945 + }, + { + "completion_length": 15.8125, + "epoch": 0.3409847555633433, + "grad_norm": 26.25091427777858, + "kl": 0.046875, + "learning_rate": 6.591904678465042e-07, + "loss": 0.0188, + "reward": 1.4044055938720703, + "reward_std": 0.3343871831893921, + "rewards/accuracy_reward_stage2": 0.4044056236743927, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1946 + }, + { + "completion_length": 7.578125, + "epoch": 0.34115997897319084, + "grad_norm": 21.537045200934948, + "kl": 0.0576171875, + "learning_rate": 6.590152444366567e-07, + "loss": -0.021, + "reward": 1.3350812196731567, + "reward_std": 0.2705545425415039, + "rewards/accuracy_reward_stage2": 0.35070618987083435, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1947 + }, + { + "completion_length": 9.828125, + "epoch": 0.3413352023830384, + "grad_norm": 15.870683875822307, + "kl": 0.119140625, + "learning_rate": 6.588400210268091e-07, + "loss": -0.0296, + "reward": 1.4924291372299194, + "reward_std": 0.3276829123497009, + "rewards/accuracy_reward_stage2": 0.5236790776252747, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1948 + }, + { + "completion_length": 11.640625, + "epoch": 0.34151042579288593, + "grad_norm": 19.496774629125934, + "kl": 0.1337890625, + "learning_rate": 6.586647976169616e-07, + "loss": -0.0063, + "reward": 1.6220420598983765, + "reward_std": 0.2530994415283203, + "rewards/accuracy_reward_stage2": 0.7782920002937317, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1949 + }, + { + "completion_length": 9.765625, + "epoch": 0.3416856492027335, + "grad_norm": 21.680354742630147, + "kl": 0.1611328125, + "learning_rate": 6.584895742071141e-07, + "loss": -0.0087, + "reward": 1.5224058628082275, + "reward_std": 0.2604309916496277, + "rewards/accuracy_reward_stage2": 0.6786558628082275, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1950 + }, + { + "completion_length": 14.59375, + "epoch": 0.341860872612581, + "grad_norm": 20.994977503450045, + "kl": 0.046142578125, + "learning_rate": 6.583143507972665e-07, + "loss": 0.0184, + "reward": 1.4522783756256104, + "reward_std": 0.3166598677635193, + "rewards/accuracy_reward_stage2": 0.4522784352302551, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1951 + }, + { + "completion_length": 9.96875, + "epoch": 0.34203609602242857, + "grad_norm": 27.082783672558996, + "kl": 0.1552734375, + "learning_rate": 6.581391273874189e-07, + "loss": 0.062, + "reward": 1.4804890155792236, + "reward_std": 0.19034737348556519, + "rewards/accuracy_reward_stage2": 0.7304890751838684, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1952 + }, + { + "completion_length": 13.4375, + "epoch": 0.3422113194322762, + "grad_norm": 21.054283543273684, + "kl": 0.158203125, + "learning_rate": 6.579639039775714e-07, + "loss": -0.0083, + "reward": 1.4764494895935059, + "reward_std": 0.28760117292404175, + "rewards/accuracy_reward_stage2": 0.5076994299888611, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1953 + }, + { + "completion_length": 7.21875, + "epoch": 0.3423865428421237, + "grad_norm": 13.542532358252753, + "kl": 0.0361328125, + "learning_rate": 6.577886805677238e-07, + "loss": 0.0145, + "reward": 1.671720266342163, + "reward_std": 0.11243344098329544, + "rewards/accuracy_reward_stage2": 0.6717202663421631, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1954 + }, + { + "completion_length": 8.5, + "epoch": 0.34256176625197127, + "grad_norm": 27.08696220045124, + "kl": 0.1416015625, + "learning_rate": 6.576134571578763e-07, + "loss": -0.0228, + "reward": 1.2418166399002075, + "reward_std": 0.2573709487915039, + "rewards/accuracy_reward_stage2": 0.5230665802955627, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1955 + }, + { + "completion_length": 17.078125, + "epoch": 0.3427369896618188, + "grad_norm": 14.514008667861964, + "kl": 0.043701171875, + "learning_rate": 6.574382337480288e-07, + "loss": 0.0174, + "reward": 1.6762876510620117, + "reward_std": 0.12784245610237122, + "rewards/accuracy_reward_stage2": 0.6762876510620117, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1956 + }, + { + "completion_length": 12.046875, + "epoch": 0.34291221307166636, + "grad_norm": 11.57712868725116, + "kl": 0.0712890625, + "learning_rate": 6.572630103381811e-07, + "loss": 0.0285, + "reward": 1.4849703311920166, + "reward_std": 0.0639050081372261, + "rewards/accuracy_reward_stage2": 0.48497024178504944, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1957 + }, + { + "completion_length": 10.484375, + "epoch": 0.3430874364815139, + "grad_norm": 17.600663870878677, + "kl": 0.0810546875, + "learning_rate": 6.570877869283336e-07, + "loss": 0.0324, + "reward": 1.7086526155471802, + "reward_std": 0.1977860927581787, + "rewards/accuracy_reward_stage2": 0.7086526155471802, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1958 + }, + { + "completion_length": 11.53125, + "epoch": 0.3432626598913615, + "grad_norm": 21.655825213453603, + "kl": 0.17578125, + "learning_rate": 6.56912563518486e-07, + "loss": 0.0306, + "reward": 1.259408712387085, + "reward_std": 0.23448513448238373, + "rewards/accuracy_reward_stage2": 0.5250337719917297, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1959 + }, + { + "completion_length": 10.09375, + "epoch": 0.34343788330120906, + "grad_norm": 14.008372896408902, + "kl": 0.0634765625, + "learning_rate": 6.567373401086385e-07, + "loss": 0.0254, + "reward": 1.745302438735962, + "reward_std": 0.04648788273334503, + "rewards/accuracy_reward_stage2": 0.7453025579452515, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1960 + }, + { + "completion_length": 11.453125, + "epoch": 0.3436131067110566, + "grad_norm": 26.769825802585252, + "kl": 0.11083984375, + "learning_rate": 6.56562116698791e-07, + "loss": 0.0444, + "reward": 1.5379596948623657, + "reward_std": 0.06265933811664581, + "rewards/accuracy_reward_stage2": 0.6629596948623657, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1961 + }, + { + "completion_length": 12.53125, + "epoch": 0.34378833012090415, + "grad_norm": 25.466248398573278, + "kl": 0.16796875, + "learning_rate": 6.563868932889433e-07, + "loss": 0.023, + "reward": 1.312827229499817, + "reward_std": 0.31713223457336426, + "rewards/accuracy_reward_stage2": 0.32845228910446167, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1962 + }, + { + "completion_length": 12.515625, + "epoch": 0.3439635535307517, + "grad_norm": 19.458826083401085, + "kl": 0.10400390625, + "learning_rate": 6.562116698790958e-07, + "loss": 0.0415, + "reward": 1.4178786277770996, + "reward_std": 0.06989157199859619, + "rewards/accuracy_reward_stage2": 0.4178787171840668, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1963 + }, + { + "completion_length": 12.296875, + "epoch": 0.34413877694059924, + "grad_norm": 28.785121368008614, + "kl": 0.0927734375, + "learning_rate": 6.560364464692482e-07, + "loss": 0.037, + "reward": 1.6619970798492432, + "reward_std": 0.2762282192707062, + "rewards/accuracy_reward_stage2": 0.6619970202445984, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1964 + }, + { + "completion_length": 12.734375, + "epoch": 0.34431400035044685, + "grad_norm": 17.61475593043083, + "kl": 0.162109375, + "learning_rate": 6.558612230594007e-07, + "loss": 0.0056, + "reward": 1.4520516395568848, + "reward_std": 0.20958861708641052, + "rewards/accuracy_reward_stage2": 0.48330157995224, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1965 + }, + { + "completion_length": 7.171875, + "epoch": 0.3444892237602944, + "grad_norm": 15.181443481188113, + "kl": 0.12353515625, + "learning_rate": 6.556859996495532e-07, + "loss": 0.0052, + "reward": 1.7034376859664917, + "reward_std": 0.15864676237106323, + "rewards/accuracy_reward_stage2": 0.8440626859664917, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1966 + }, + { + "completion_length": 8.703125, + "epoch": 0.34466444717014194, + "grad_norm": 18.706753461209544, + "kl": 0.0732421875, + "learning_rate": 6.555107762397056e-07, + "loss": 0.0293, + "reward": 1.7096948623657227, + "reward_std": 0.15963897109031677, + "rewards/accuracy_reward_stage2": 0.8346949219703674, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1967 + }, + { + "completion_length": 9.28125, + "epoch": 0.3448396705799895, + "grad_norm": 20.080951676498536, + "kl": 0.05908203125, + "learning_rate": 6.553355528298581e-07, + "loss": 0.0236, + "reward": 1.6409235000610352, + "reward_std": 0.18434467911720276, + "rewards/accuracy_reward_stage2": 0.6409235000610352, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1968 + }, + { + "completion_length": 19.921875, + "epoch": 0.34501489398983703, + "grad_norm": 15.480120800250623, + "kl": 0.08984375, + "learning_rate": 6.551603294200106e-07, + "loss": 0.036, + "reward": 1.6779283285140991, + "reward_std": 0.12816794216632843, + "rewards/accuracy_reward_stage2": 0.6779283285140991, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1969 + }, + { + "completion_length": 13.625, + "epoch": 0.3451901173996846, + "grad_norm": 9.019538711132414, + "kl": 0.0255126953125, + "learning_rate": 6.549851060101629e-07, + "loss": 0.0102, + "reward": 1.7239811420440674, + "reward_std": 0.03791867941617966, + "rewards/accuracy_reward_stage2": 0.7239811420440674, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1970 + }, + { + "completion_length": 12.53125, + "epoch": 0.34536534080953213, + "grad_norm": 18.72549133371684, + "kl": 0.076171875, + "learning_rate": 6.548098826003154e-07, + "loss": 0.0304, + "reward": 1.5978374481201172, + "reward_std": 0.24242404103279114, + "rewards/accuracy_reward_stage2": 0.5978374481201172, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1971 + }, + { + "completion_length": 13.5625, + "epoch": 0.34554056421937973, + "grad_norm": 13.629983423344452, + "kl": 0.10595703125, + "learning_rate": 6.546346591904677e-07, + "loss": -0.0019, + "reward": 1.494866132736206, + "reward_std": 0.0999663770198822, + "rewards/accuracy_reward_stage2": 0.510491132736206, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1972 + }, + { + "completion_length": 12.40625, + "epoch": 0.3457157876292273, + "grad_norm": 16.544557598029545, + "kl": 0.1640625, + "learning_rate": 6.544594357806202e-07, + "loss": 0.024, + "reward": 1.320347547531128, + "reward_std": 0.25300195813179016, + "rewards/accuracy_reward_stage2": 0.46097254753112793, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1973 + }, + { + "completion_length": 8.65625, + "epoch": 0.3458910110390748, + "grad_norm": 27.031728264190516, + "kl": 0.1015625, + "learning_rate": 6.542842123707727e-07, + "loss": 0.0407, + "reward": 1.6358357667922974, + "reward_std": 0.2411271631717682, + "rewards/accuracy_reward_stage2": 0.6358357667922974, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1974 + }, + { + "completion_length": 7.03125, + "epoch": 0.34606623444892237, + "grad_norm": 20.65655761948469, + "kl": 0.142578125, + "learning_rate": 6.541089889609251e-07, + "loss": 0.0572, + "reward": 1.5470199584960938, + "reward_std": 0.2650681138038635, + "rewards/accuracy_reward_stage2": 0.6720199584960938, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1975 + }, + { + "completion_length": 10.140625, + "epoch": 0.3462414578587699, + "grad_norm": 17.86962987062657, + "kl": 0.049072265625, + "learning_rate": 6.539337655510776e-07, + "loss": 0.0133, + "reward": 1.5956544876098633, + "reward_std": 0.17510367929935455, + "rewards/accuracy_reward_stage2": 0.6112794280052185, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1976 + }, + { + "completion_length": 10.9375, + "epoch": 0.34641668126861747, + "grad_norm": 14.238587117023814, + "kl": 0.060546875, + "learning_rate": 6.537585421412301e-07, + "loss": 0.0241, + "reward": 1.809149980545044, + "reward_std": 0.15811999142169952, + "rewards/accuracy_reward_stage2": 0.8091498613357544, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1977 + }, + { + "completion_length": 9.203125, + "epoch": 0.34659190467846507, + "grad_norm": 17.02518771036635, + "kl": 0.1845703125, + "learning_rate": 6.535833187313825e-07, + "loss": -0.0057, + "reward": 1.661616325378418, + "reward_std": 0.28009992837905884, + "rewards/accuracy_reward_stage2": 0.8178663849830627, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1978 + }, + { + "completion_length": 7.75, + "epoch": 0.3467671280883126, + "grad_norm": 27.107300154566495, + "kl": 0.2470703125, + "learning_rate": 6.53408095321535e-07, + "loss": 0.055, + "reward": 1.6364436149597168, + "reward_std": 0.17650076746940613, + "rewards/accuracy_reward_stage2": 0.6520686149597168, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1979 + }, + { + "completion_length": 10.71875, + "epoch": 0.34694235149816016, + "grad_norm": 16.632001236146216, + "kl": 0.119140625, + "learning_rate": 6.532328719116874e-07, + "loss": -0.0406, + "reward": 1.2066890001296997, + "reward_std": 0.15964210033416748, + "rewards/accuracy_reward_stage2": 0.3629389703273773, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1980 + }, + { + "completion_length": 11.5625, + "epoch": 0.3471175749080077, + "grad_norm": 25.372168175931847, + "kl": 0.1220703125, + "learning_rate": 6.530576485018399e-07, + "loss": 0.0047, + "reward": 1.4195719957351685, + "reward_std": 0.3210878372192383, + "rewards/accuracy_reward_stage2": 0.5601969957351685, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1981 + }, + { + "completion_length": 13.484375, + "epoch": 0.34729279831785526, + "grad_norm": 18.356621675505927, + "kl": 0.09765625, + "learning_rate": 6.528824250919922e-07, + "loss": 0.039, + "reward": 1.68565034866333, + "reward_std": 0.1419445276260376, + "rewards/accuracy_reward_stage2": 0.6856504678726196, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1982 + }, + { + "completion_length": 15.625, + "epoch": 0.3474680217277028, + "grad_norm": 15.596275383801663, + "kl": 0.064453125, + "learning_rate": 6.527072016821446e-07, + "loss": 0.0258, + "reward": 1.618418574333191, + "reward_std": 0.22926867008209229, + "rewards/accuracy_reward_stage2": 0.6184185147285461, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1983 + }, + { + "completion_length": 10.125, + "epoch": 0.34764324513755035, + "grad_norm": 17.592549545737327, + "kl": 0.16015625, + "learning_rate": 6.525319782722971e-07, + "loss": 0.064, + "reward": 1.397711157798767, + "reward_std": 0.20366990566253662, + "rewards/accuracy_reward_stage2": 0.6477111577987671, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1984 + }, + { + "completion_length": 9.296875, + "epoch": 0.34781846854739795, + "grad_norm": 20.029006910207976, + "kl": 0.208984375, + "learning_rate": 6.523567548624496e-07, + "loss": 0.0344, + "reward": 1.679081916809082, + "reward_std": 0.2575289309024811, + "rewards/accuracy_reward_stage2": 0.835331916809082, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1985 + }, + { + "completion_length": 26.0625, + "epoch": 0.3479936919572455, + "grad_norm": 15.756414153200662, + "kl": 0.10400390625, + "learning_rate": 6.52181531452602e-07, + "loss": -0.0028, + "reward": 1.7272648811340332, + "reward_std": 0.15835845470428467, + "rewards/accuracy_reward_stage2": 0.7428898811340332, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1986 + }, + { + "completion_length": 8.609375, + "epoch": 0.34816891536709305, + "grad_norm": 25.260937057275694, + "kl": 0.1376953125, + "learning_rate": 6.520063080427545e-07, + "loss": 0.0552, + "reward": 1.3206074237823486, + "reward_std": 0.30781957507133484, + "rewards/accuracy_reward_stage2": 0.44560742378234863, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1987 + }, + { + "completion_length": 8.203125, + "epoch": 0.3483441387769406, + "grad_norm": 21.79510382885528, + "kl": 0.267578125, + "learning_rate": 6.518310846329069e-07, + "loss": -0.0048, + "reward": 1.5288782119750977, + "reward_std": 0.19049212336540222, + "rewards/accuracy_reward_stage2": 0.7007532715797424, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1988 + }, + { + "completion_length": 8.34375, + "epoch": 0.34851936218678814, + "grad_norm": 16.02145132189449, + "kl": 0.078125, + "learning_rate": 6.516558612230594e-07, + "loss": -0.0093, + "reward": 1.5895137786865234, + "reward_std": 0.15934374928474426, + "rewards/accuracy_reward_stage2": 0.6051387786865234, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1989 + }, + { + "completion_length": 9.703125, + "epoch": 0.3486945855966357, + "grad_norm": 14.497867470770592, + "kl": 0.0184326171875, + "learning_rate": 6.514806378132119e-07, + "loss": 0.0074, + "reward": 1.5919466018676758, + "reward_std": 0.13881367444992065, + "rewards/accuracy_reward_stage2": 0.5919466018676758, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1990 + }, + { + "completion_length": 12.875, + "epoch": 0.3488698090064833, + "grad_norm": 17.89472880207737, + "kl": 0.208984375, + "learning_rate": 6.513054144033643e-07, + "loss": 0.0546, + "reward": 1.4190115928649902, + "reward_std": 0.1824534833431244, + "rewards/accuracy_reward_stage2": 0.6846365928649902, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1991 + }, + { + "completion_length": 12.5625, + "epoch": 0.34904503241633084, + "grad_norm": 20.924271113255397, + "kl": 0.09130859375, + "learning_rate": 6.511301909935167e-07, + "loss": -0.0075, + "reward": 1.6088534593582153, + "reward_std": 0.2591401934623718, + "rewards/accuracy_reward_stage2": 0.6244784593582153, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1992 + }, + { + "completion_length": 7.453125, + "epoch": 0.3492202558261784, + "grad_norm": 15.743749637089023, + "kl": 0.0267333984375, + "learning_rate": 6.509549675836692e-07, + "loss": -0.0224, + "reward": 1.6175103187561035, + "reward_std": 0.11349479854106903, + "rewards/accuracy_reward_stage2": 0.6331353187561035, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1993 + }, + { + "completion_length": 9.3125, + "epoch": 0.34939547923602593, + "grad_norm": 16.28655284774425, + "kl": 0.1318359375, + "learning_rate": 6.507797441738216e-07, + "loss": 0.0157, + "reward": 1.5714805126190186, + "reward_std": 0.14586706459522247, + "rewards/accuracy_reward_stage2": 0.5871055126190186, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1994 + }, + { + "completion_length": 10.640625, + "epoch": 0.3495707026458735, + "grad_norm": 19.764527745348857, + "kl": 0.1494140625, + "learning_rate": 6.50604520763974e-07, + "loss": 0.0598, + "reward": 1.5106749534606934, + "reward_std": 0.24963447451591492, + "rewards/accuracy_reward_stage2": 0.5106750130653381, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1995 + }, + { + "completion_length": 7.78125, + "epoch": 0.349745926055721, + "grad_norm": 25.807882325692766, + "kl": 0.212890625, + "learning_rate": 6.504292973541264e-07, + "loss": 0.0412, + "reward": 1.402549386024475, + "reward_std": 0.22109973430633545, + "rewards/accuracy_reward_stage2": 0.4181743860244751, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1996 + }, + { + "completion_length": 10.890625, + "epoch": 0.3499211494655686, + "grad_norm": 19.0405343018544, + "kl": 0.05029296875, + "learning_rate": 6.502540739442789e-07, + "loss": 0.0201, + "reward": 1.6537227630615234, + "reward_std": 0.25505587458610535, + "rewards/accuracy_reward_stage2": 0.7787227630615234, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1997 + }, + { + "completion_length": 13.421875, + "epoch": 0.3500963728754162, + "grad_norm": 18.426578115254816, + "kl": 0.033447265625, + "learning_rate": 6.500788505344314e-07, + "loss": -0.0306, + "reward": 1.6478216648101807, + "reward_std": 0.19639632105827332, + "rewards/accuracy_reward_stage2": 0.6634466052055359, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1998 + }, + { + "completion_length": 6.578125, + "epoch": 0.3502715962852637, + "grad_norm": 16.863516374025938, + "kl": 0.10595703125, + "learning_rate": 6.499036271245838e-07, + "loss": 0.0424, + "reward": 1.6519603729248047, + "reward_std": 0.1129794791340828, + "rewards/accuracy_reward_stage2": 0.6519604325294495, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1999 + }, + { + "completion_length": 9.546875, + "epoch": 0.35044681969511127, + "grad_norm": 18.840306610077086, + "kl": 0.1435546875, + "learning_rate": 6.497284037147363e-07, + "loss": 0.013, + "reward": 1.7694342136383057, + "reward_std": 0.15474824607372284, + "rewards/accuracy_reward_stage2": 0.7850591540336609, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2000 + }, + { + "completion_length": 11.3125, + "epoch": 0.3506220431049588, + "grad_norm": 15.429352320389754, + "kl": 0.054931640625, + "learning_rate": 6.495531803048888e-07, + "loss": -0.017, + "reward": 1.5891973972320557, + "reward_std": 0.2515534460544586, + "rewards/accuracy_reward_stage2": 0.6048224568367004, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2001 + }, + { + "completion_length": 10.671875, + "epoch": 0.35079726651480636, + "grad_norm": 18.638199329131833, + "kl": 0.061767578125, + "learning_rate": 6.493779568950411e-07, + "loss": -0.0654, + "reward": 1.734375, + "reward_std": 0.3686423897743225, + "rewards/accuracy_reward_stage2": 0.78125, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2002 + }, + { + "completion_length": 7.65625, + "epoch": 0.3509724899246539, + "grad_norm": 21.399105794557666, + "kl": 0.15234375, + "learning_rate": 6.492027334851936e-07, + "loss": 0.061, + "reward": 1.5191402435302734, + "reward_std": 0.18384888768196106, + "rewards/accuracy_reward_stage2": 0.519140362739563, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2003 + }, + { + "completion_length": 17.203125, + "epoch": 0.3511477133345015, + "grad_norm": 19.156350104124215, + "kl": 0.08447265625, + "learning_rate": 6.49027510075346e-07, + "loss": -0.0482, + "reward": 1.5820919275283813, + "reward_std": 0.2840636372566223, + "rewards/accuracy_reward_stage2": 0.6133419275283813, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2004 + }, + { + "completion_length": 15.453125, + "epoch": 0.35132293674434906, + "grad_norm": 14.893311472170298, + "kl": 0.2255859375, + "learning_rate": 6.488522866654985e-07, + "loss": 0.0459, + "reward": 1.4101537466049194, + "reward_std": 0.23728793859481812, + "rewards/accuracy_reward_stage2": 0.4257788062095642, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2005 + }, + { + "completion_length": 13.59375, + "epoch": 0.3514981601541966, + "grad_norm": 22.347330342450295, + "kl": 0.07470703125, + "learning_rate": 6.48677063255651e-07, + "loss": 0.0299, + "reward": 1.632706880569458, + "reward_std": 0.24964894354343414, + "rewards/accuracy_reward_stage2": 0.632706880569458, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2006 + }, + { + "completion_length": 11.484375, + "epoch": 0.35167338356404415, + "grad_norm": 12.16406297008582, + "kl": 0.057861328125, + "learning_rate": 6.485018398458034e-07, + "loss": 0.0231, + "reward": 1.4031907320022583, + "reward_std": 0.10199880599975586, + "rewards/accuracy_reward_stage2": 0.5281907320022583, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2007 + }, + { + "completion_length": 8.5, + "epoch": 0.3518486069738917, + "grad_norm": 14.241194495526061, + "kl": 0.0576171875, + "learning_rate": 6.483266164359558e-07, + "loss": 0.023, + "reward": 1.467761754989624, + "reward_std": 0.1030719205737114, + "rewards/accuracy_reward_stage2": 0.467761754989624, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2008 + }, + { + "completion_length": 9.6875, + "epoch": 0.35202383038373924, + "grad_norm": 13.251755078633238, + "kl": 0.1259765625, + "learning_rate": 6.481513930261083e-07, + "loss": 0.0504, + "reward": 1.577303409576416, + "reward_std": 0.08948713541030884, + "rewards/accuracy_reward_stage2": 0.7023034691810608, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2009 + }, + { + "completion_length": 10.703125, + "epoch": 0.35219905379358685, + "grad_norm": 19.886765618288138, + "kl": 0.18359375, + "learning_rate": 6.479761696162607e-07, + "loss": 0.0292, + "reward": 1.5866138935089111, + "reward_std": 0.19587093591690063, + "rewards/accuracy_reward_stage2": 0.6022388935089111, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2010 + }, + { + "completion_length": 9.453125, + "epoch": 0.3523742772034344, + "grad_norm": 13.824131232354484, + "kl": 0.10791015625, + "learning_rate": 6.478009462064131e-07, + "loss": 0.0006, + "reward": 1.2880022525787354, + "reward_std": 0.2643819749355316, + "rewards/accuracy_reward_stage2": 0.3036273121833801, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2011 + }, + { + "completion_length": 12.8125, + "epoch": 0.35254950061328194, + "grad_norm": 21.65675317994148, + "kl": 0.052490234375, + "learning_rate": 6.476257227965655e-07, + "loss": -0.0368, + "reward": 1.6309711933135986, + "reward_std": 0.24923476576805115, + "rewards/accuracy_reward_stage2": 0.7872211337089539, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2012 + }, + { + "completion_length": 9.296875, + "epoch": 0.3527247240231295, + "grad_norm": 19.112002940505786, + "kl": 0.21875, + "learning_rate": 6.47450499386718e-07, + "loss": 0.0088, + "reward": 1.6558035612106323, + "reward_std": 0.27317488193511963, + "rewards/accuracy_reward_stage2": 0.6870535612106323, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2013 + }, + { + "completion_length": 5.625, + "epoch": 0.35289994743297703, + "grad_norm": 18.397807988383157, + "kl": 0.06201171875, + "learning_rate": 6.472752759768705e-07, + "loss": -0.0107, + "reward": 1.3729119300842285, + "reward_std": 0.20705291628837585, + "rewards/accuracy_reward_stage2": 0.38853681087493896, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2014 + }, + { + "completion_length": 6.46875, + "epoch": 0.3530751708428246, + "grad_norm": 14.86373871431777, + "kl": 0.201171875, + "learning_rate": 6.471000525670229e-07, + "loss": -0.0046, + "reward": 1.536039113998413, + "reward_std": 0.14704757928848267, + "rewards/accuracy_reward_stage2": 0.6922890543937683, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2015 + }, + { + "completion_length": 7.0, + "epoch": 0.3532503942526722, + "grad_norm": 12.402681811929515, + "kl": 0.072265625, + "learning_rate": 6.469248291571754e-07, + "loss": -0.0154, + "reward": 1.7066841125488281, + "reward_std": 0.14217713475227356, + "rewards/accuracy_reward_stage2": 0.7223089933395386, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2016 + }, + { + "completion_length": 11.3125, + "epoch": 0.35342561766251973, + "grad_norm": 20.751088060330876, + "kl": 0.09228515625, + "learning_rate": 6.467496057473279e-07, + "loss": 0.037, + "reward": 1.7199280261993408, + "reward_std": 0.24458998441696167, + "rewards/accuracy_reward_stage2": 0.8449280858039856, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2017 + }, + { + "completion_length": 14.578125, + "epoch": 0.3536008410723673, + "grad_norm": 23.455831102973022, + "kl": 0.15234375, + "learning_rate": 6.465743823374803e-07, + "loss": -0.0263, + "reward": 1.6048414707183838, + "reward_std": 0.29559487104415894, + "rewards/accuracy_reward_stage2": 0.6360914707183838, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2018 + }, + { + "completion_length": 11.46875, + "epoch": 0.3537760644822148, + "grad_norm": 15.818034624782776, + "kl": 0.0537109375, + "learning_rate": 6.463991589276328e-07, + "loss": 0.0215, + "reward": 1.595871925354004, + "reward_std": 0.16525067389011383, + "rewards/accuracy_reward_stage2": 0.5958719253540039, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2019 + }, + { + "completion_length": 14.140625, + "epoch": 0.35395128789206237, + "grad_norm": 17.47953156713791, + "kl": 0.09765625, + "learning_rate": 6.462239355177852e-07, + "loss": -0.004, + "reward": 1.6608421802520752, + "reward_std": 0.19523340463638306, + "rewards/accuracy_reward_stage2": 0.6764671802520752, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2020 + }, + { + "completion_length": 9.484375, + "epoch": 0.3541265113019099, + "grad_norm": 23.25571658748381, + "kl": 0.13671875, + "learning_rate": 6.460487121079375e-07, + "loss": 0.0546, + "reward": 1.4660158157348633, + "reward_std": 0.38287341594696045, + "rewards/accuracy_reward_stage2": 0.4660158157348633, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2021 + }, + { + "completion_length": 6.046875, + "epoch": 0.35430173471175747, + "grad_norm": 15.16000099952722, + "kl": 0.10498046875, + "learning_rate": 6.4587348869809e-07, + "loss": 0.0066, + "reward": 1.6145833730697632, + "reward_std": 0.23177990317344666, + "rewards/accuracy_reward_stage2": 0.6302083730697632, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2022 + }, + { + "completion_length": 17.296875, + "epoch": 0.35447695812160507, + "grad_norm": 19.871120104217546, + "kl": 0.0517578125, + "learning_rate": 6.456982652882424e-07, + "loss": -0.0235, + "reward": 1.518093466758728, + "reward_std": 0.20393085479736328, + "rewards/accuracy_reward_stage2": 0.533718466758728, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2023 + }, + { + "completion_length": 9.84375, + "epoch": 0.3546521815314526, + "grad_norm": 16.326228486631557, + "kl": 0.049072265625, + "learning_rate": 6.455230418783949e-07, + "loss": 0.0196, + "reward": 1.6053377389907837, + "reward_std": 0.13290469348430634, + "rewards/accuracy_reward_stage2": 0.6053377389907837, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2024 + }, + { + "completion_length": 10.578125, + "epoch": 0.35482740494130016, + "grad_norm": 15.652855871849557, + "kl": 0.1875, + "learning_rate": 6.453478184685473e-07, + "loss": 0.0307, + "reward": 1.299550175666809, + "reward_std": 0.1480276733636856, + "rewards/accuracy_reward_stage2": 0.5651751756668091, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2025 + }, + { + "completion_length": 5.984375, + "epoch": 0.3550026283511477, + "grad_norm": 17.75877666884954, + "kl": 0.10595703125, + "learning_rate": 6.451725950586998e-07, + "loss": -0.0018, + "reward": 1.9087555408477783, + "reward_std": 0.18428705632686615, + "rewards/accuracy_reward_stage2": 0.9243804812431335, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2026 + }, + { + "completion_length": 7.9375, + "epoch": 0.35517785176099526, + "grad_norm": 21.16012729432452, + "kl": 0.138671875, + "learning_rate": 6.449973716488523e-07, + "loss": 0.0242, + "reward": 1.3229167461395264, + "reward_std": 0.31512731313705444, + "rewards/accuracy_reward_stage2": 0.3385416865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2027 + }, + { + "completion_length": 8.9375, + "epoch": 0.3553530751708428, + "grad_norm": 29.08543121262397, + "kl": 0.19921875, + "learning_rate": 6.448221482390047e-07, + "loss": 0.0798, + "reward": 1.4590182304382324, + "reward_std": 0.21175247430801392, + "rewards/accuracy_reward_stage2": 0.5840181708335876, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2028 + }, + { + "completion_length": 7.921875, + "epoch": 0.3555282985806904, + "grad_norm": 22.709142062225904, + "kl": 0.138671875, + "learning_rate": 6.446469248291572e-07, + "loss": 0.0265, + "reward": 1.6932740211486816, + "reward_std": 0.2825216054916382, + "rewards/accuracy_reward_stage2": 0.7088989019393921, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2029 + }, + { + "completion_length": 12.71875, + "epoch": 0.35570352199053795, + "grad_norm": 13.691321079425855, + "kl": 0.07666015625, + "learning_rate": 6.444717014193097e-07, + "loss": 0.0023, + "reward": 1.057405710220337, + "reward_std": 0.08621557056903839, + "rewards/accuracy_reward_stage2": 0.4480307996273041, + "rewards/format_reward_stage1_pointerpad": 0.609375, + "scores/accuracy_reward_stage2": 0.609375, + "step": 2030 + }, + { + "completion_length": 7.15625, + "epoch": 0.3558787454003855, + "grad_norm": 21.772071074721506, + "kl": 0.0830078125, + "learning_rate": 6.44296478009462e-07, + "loss": -0.042, + "reward": 1.594986081123352, + "reward_std": 0.35093414783477783, + "rewards/accuracy_reward_stage2": 0.626236081123352, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2031 + }, + { + "completion_length": 7.9375, + "epoch": 0.35605396881023305, + "grad_norm": 21.906110366807358, + "kl": 0.0947265625, + "learning_rate": 6.441212545996145e-07, + "loss": 0.0163, + "reward": 1.538081407546997, + "reward_std": 0.24354253709316254, + "rewards/accuracy_reward_stage2": 0.5537062883377075, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2032 + }, + { + "completion_length": 10.59375, + "epoch": 0.3562291922200806, + "grad_norm": 19.764452082312108, + "kl": 0.099609375, + "learning_rate": 6.439460311897668e-07, + "loss": -0.0043, + "reward": 1.6770200729370117, + "reward_std": 0.23365044593811035, + "rewards/accuracy_reward_stage2": 0.6926450729370117, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2033 + }, + { + "completion_length": 10.0625, + "epoch": 0.35640441562992814, + "grad_norm": 17.17510404758707, + "kl": 0.1435546875, + "learning_rate": 6.437708077799193e-07, + "loss": -0.0825, + "reward": 1.515505313873291, + "reward_std": 0.25960028171539307, + "rewards/accuracy_reward_stage2": 0.578005313873291, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2034 + }, + { + "completion_length": 9.953125, + "epoch": 0.3565796390397757, + "grad_norm": 8.404722024682297, + "kl": 0.060791015625, + "learning_rate": 6.435955843700718e-07, + "loss": -0.0198, + "reward": 1.578125, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2035 + }, + { + "completion_length": 9.734375, + "epoch": 0.3567548624496233, + "grad_norm": 25.262565696637864, + "kl": 0.201171875, + "learning_rate": 6.434203609602242e-07, + "loss": -0.0703, + "reward": 1.5607659816741943, + "reward_std": 0.3894246518611908, + "rewards/accuracy_reward_stage2": 0.6232660412788391, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2036 + }, + { + "completion_length": 8.875, + "epoch": 0.35693008585947084, + "grad_norm": 17.54284935941715, + "kl": 0.11328125, + "learning_rate": 6.432451375503767e-07, + "loss": 0.0104, + "reward": 1.6565544605255127, + "reward_std": 0.14218929409980774, + "rewards/accuracy_reward_stage2": 0.6721794605255127, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2037 + }, + { + "completion_length": 10.234375, + "epoch": 0.3571053092693184, + "grad_norm": 16.722634304165545, + "kl": 0.07373046875, + "learning_rate": 6.430699141405292e-07, + "loss": -0.0147, + "reward": 1.6033732891082764, + "reward_std": 0.2091490775346756, + "rewards/accuracy_reward_stage2": 0.6189983487129211, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2038 + }, + { + "completion_length": 8.078125, + "epoch": 0.35728053267916593, + "grad_norm": 28.241725109043774, + "kl": 0.1826171875, + "learning_rate": 6.428946907306816e-07, + "loss": 0.0664, + "reward": 1.652994155883789, + "reward_std": 0.322782039642334, + "rewards/accuracy_reward_stage2": 0.6686190366744995, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2039 + }, + { + "completion_length": 10.9375, + "epoch": 0.3574557560890135, + "grad_norm": 17.023308776735536, + "kl": 0.115234375, + "learning_rate": 6.427194673208341e-07, + "loss": -0.0315, + "reward": 1.7313854694366455, + "reward_std": 0.2074601650238037, + "rewards/accuracy_reward_stage2": 0.762635350227356, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2040 + }, + { + "completion_length": 6.78125, + "epoch": 0.357630979498861, + "grad_norm": 21.88276224578183, + "kl": 0.1611328125, + "learning_rate": 6.425442439109864e-07, + "loss": -0.0487, + "reward": 1.6473379135131836, + "reward_std": 0.3075483441352844, + "rewards/accuracy_reward_stage2": 0.6942129731178284, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2041 + }, + { + "completion_length": 9.0625, + "epoch": 0.3578062029087086, + "grad_norm": 22.899850742372728, + "kl": 0.04736328125, + "learning_rate": 6.423690205011389e-07, + "loss": 0.0189, + "reward": 1.5322256088256836, + "reward_std": 0.13407042622566223, + "rewards/accuracy_reward_stage2": 0.5322255492210388, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2042 + }, + { + "completion_length": 6.734375, + "epoch": 0.3579814263185562, + "grad_norm": 31.881156553500293, + "kl": 0.1875, + "learning_rate": 6.421937970912914e-07, + "loss": 0.0749, + "reward": 1.6132948398590088, + "reward_std": 0.1358180195093155, + "rewards/accuracy_reward_stage2": 0.6132948398590088, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2043 + }, + { + "completion_length": 12.28125, + "epoch": 0.3581566497284037, + "grad_norm": 20.593066376423288, + "kl": 0.06494140625, + "learning_rate": 6.420185736814438e-07, + "loss": 0.0259, + "reward": 1.8108373880386353, + "reward_std": 0.1923947036266327, + "rewards/accuracy_reward_stage2": 0.8108373880386353, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2044 + }, + { + "completion_length": 7.875, + "epoch": 0.35833187313825127, + "grad_norm": 20.026577573811554, + "kl": 0.2265625, + "learning_rate": 6.418433502715963e-07, + "loss": 0.0103, + "reward": 1.3905614614486694, + "reward_std": 0.2854178547859192, + "rewards/accuracy_reward_stage2": 0.42181146144866943, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2045 + }, + { + "completion_length": 12.828125, + "epoch": 0.3585070965480988, + "grad_norm": 20.679212222356824, + "kl": 0.1875, + "learning_rate": 6.416681268617487e-07, + "loss": 0.0394, + "reward": 1.506111979484558, + "reward_std": 0.41419732570648193, + "rewards/accuracy_reward_stage2": 0.5217369794845581, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2046 + }, + { + "completion_length": 10.390625, + "epoch": 0.35868231995794636, + "grad_norm": 18.808809960877102, + "kl": 0.1728515625, + "learning_rate": 6.414929034519011e-07, + "loss": -0.0093, + "reward": 1.079209804534912, + "reward_std": 0.29377779364585876, + "rewards/accuracy_reward_stage2": 0.37608474493026733, + "rewards/format_reward_stage1_pointerpad": 0.703125, + "scores/accuracy_reward_stage2": 0.703125, + "step": 2047 + }, + { + "completion_length": 7.28125, + "epoch": 0.35885754336779396, + "grad_norm": 15.818689616065766, + "kl": 0.0458984375, + "learning_rate": 6.413176800420536e-07, + "loss": 0.0183, + "reward": 1.321853518486023, + "reward_std": 0.20559610426425934, + "rewards/accuracy_reward_stage2": 0.32185354828834534, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2048 + }, + { + "completion_length": 10.21875, + "epoch": 0.3590327667776415, + "grad_norm": 23.13419565702121, + "kl": 0.09033203125, + "learning_rate": 6.41142456632206e-07, + "loss": -0.0523, + "reward": 1.5469226837158203, + "reward_std": 0.2661864161491394, + "rewards/accuracy_reward_stage2": 0.5781725645065308, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2049 + }, + { + "completion_length": 9.8125, + "epoch": 0.35920799018748906, + "grad_norm": 20.85199762225754, + "kl": 0.265625, + "learning_rate": 6.409672332223585e-07, + "loss": -0.0781, + "reward": 1.7552984952926636, + "reward_std": 0.37930476665496826, + "rewards/accuracy_reward_stage2": 0.8334234952926636, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 2050 + }, + { + "completion_length": 11.328125, + "epoch": 0.3593832135973366, + "grad_norm": 22.537170988157758, + "kl": 0.181640625, + "learning_rate": 6.407920098125109e-07, + "loss": 0.0028, + "reward": 1.5682830810546875, + "reward_std": 0.24193453788757324, + "rewards/accuracy_reward_stage2": 0.708907961845398, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2051 + }, + { + "completion_length": 28.390625, + "epoch": 0.35955843700718415, + "grad_norm": 23.631049170169213, + "kl": 0.1416015625, + "learning_rate": 6.406167864026633e-07, + "loss": -0.0653, + "reward": 1.147277593612671, + "reward_std": 0.26074129343032837, + "rewards/accuracy_reward_stage2": 0.3191525340080261, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2052 + }, + { + "completion_length": 8.515625, + "epoch": 0.3597336604170317, + "grad_norm": 24.54048817390111, + "kl": 0.11083984375, + "learning_rate": 6.404415629928158e-07, + "loss": -0.0462, + "reward": 1.5549089908599854, + "reward_std": 0.4777141511440277, + "rewards/accuracy_reward_stage2": 0.6017839908599854, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2053 + }, + { + "completion_length": 7.734375, + "epoch": 0.35990888382687924, + "grad_norm": 14.236796163475823, + "kl": 0.12255859375, + "learning_rate": 6.402663395829683e-07, + "loss": -0.0079, + "reward": 1.3742108345031738, + "reward_std": 0.2040639966726303, + "rewards/accuracy_reward_stage2": 0.5304608941078186, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2054 + }, + { + "completion_length": 16.34375, + "epoch": 0.36008410723672685, + "grad_norm": 15.587263426743462, + "kl": 0.083984375, + "learning_rate": 6.400911161731207e-07, + "loss": 0.0335, + "reward": 1.8176294565200806, + "reward_std": 0.10821881890296936, + "rewards/accuracy_reward_stage2": 0.9426294565200806, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2055 + }, + { + "completion_length": 12.078125, + "epoch": 0.3602593306465744, + "grad_norm": 21.937235308292358, + "kl": 0.125, + "learning_rate": 6.399158927632732e-07, + "loss": -0.022, + "reward": 1.440577745437622, + "reward_std": 0.25374239683151245, + "rewards/accuracy_reward_stage2": 0.47182780504226685, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2056 + }, + { + "completion_length": 15.578125, + "epoch": 0.36043455405642194, + "grad_norm": 18.473381578423286, + "kl": 0.140625, + "learning_rate": 6.397406693534256e-07, + "loss": 0.0565, + "reward": 1.441630244255066, + "reward_std": 0.14738719165325165, + "rewards/accuracy_reward_stage2": 0.5666301846504211, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2057 + }, + { + "completion_length": 11.765625, + "epoch": 0.3606097774662695, + "grad_norm": 18.322017197253313, + "kl": 0.150390625, + "learning_rate": 6.395654459435781e-07, + "loss": 0.0186, + "reward": 0.9354975819587708, + "reward_std": 0.25476324558258057, + "rewards/accuracy_reward_stage2": 0.34174755215644836, + "rewards/format_reward_stage1_pointerpad": 0.59375, + "scores/accuracy_reward_stage2": 0.59375, + "step": 2058 + }, + { + "completion_length": 8.046875, + "epoch": 0.36078500087611703, + "grad_norm": 12.411016991104368, + "kl": 0.04150390625, + "learning_rate": 6.393902225337305e-07, + "loss": 0.0165, + "reward": 1.748430848121643, + "reward_std": 0.10755133628845215, + "rewards/accuracy_reward_stage2": 0.7484308481216431, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2059 + }, + { + "completion_length": 21.53125, + "epoch": 0.3609602242859646, + "grad_norm": 25.542296675136917, + "kl": 0.13671875, + "learning_rate": 6.392149991238828e-07, + "loss": 0.0033, + "reward": 1.7117555141448975, + "reward_std": 0.3055155873298645, + "rewards/accuracy_reward_stage2": 0.7430053949356079, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2060 + }, + { + "completion_length": 11.125, + "epoch": 0.3611354476958122, + "grad_norm": 19.080721242770668, + "kl": 0.1513671875, + "learning_rate": 6.390397757140353e-07, + "loss": -0.0493, + "reward": 1.5307211875915527, + "reward_std": 0.23762869834899902, + "rewards/accuracy_reward_stage2": 0.7025961875915527, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2061 + }, + { + "completion_length": 8.171875, + "epoch": 0.36131067110565973, + "grad_norm": 24.843716465626326, + "kl": 0.173828125, + "learning_rate": 6.388645523041878e-07, + "loss": 0.0194, + "reward": 1.4921178817749023, + "reward_std": 0.3609883189201355, + "rewards/accuracy_reward_stage2": 0.5233679413795471, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2062 + }, + { + "completion_length": 16.453125, + "epoch": 0.3614858945155073, + "grad_norm": 17.65292521561761, + "kl": 0.11474609375, + "learning_rate": 6.386893288943402e-07, + "loss": 0.0458, + "reward": 1.544228434562683, + "reward_std": 0.16590997576713562, + "rewards/accuracy_reward_stage2": 0.6692283749580383, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2063 + }, + { + "completion_length": 8.0, + "epoch": 0.3616611179253548, + "grad_norm": 19.760964980028835, + "kl": 0.1259765625, + "learning_rate": 6.385141054844927e-07, + "loss": 0.0063, + "reward": 1.652033805847168, + "reward_std": 0.19817985594272614, + "rewards/accuracy_reward_stage2": 0.6676587462425232, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2064 + }, + { + "completion_length": 8.109375, + "epoch": 0.36183634133520237, + "grad_norm": 15.99382134890339, + "kl": 0.03125, + "learning_rate": 6.383388820746451e-07, + "loss": 0.0125, + "reward": 1.565201997756958, + "reward_std": 0.10470834374427795, + "rewards/accuracy_reward_stage2": 0.565201997756958, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2065 + }, + { + "completion_length": 11.234375, + "epoch": 0.3620115647450499, + "grad_norm": 19.134680179647805, + "kl": 0.0322265625, + "learning_rate": 6.381636586647976e-07, + "loss": 0.0129, + "reward": 1.2855641841888428, + "reward_std": 0.1369982659816742, + "rewards/accuracy_reward_stage2": 0.4105641841888428, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2066 + }, + { + "completion_length": 12.953125, + "epoch": 0.3621867881548975, + "grad_norm": 17.13585526565401, + "kl": 0.07861328125, + "learning_rate": 6.379884352549501e-07, + "loss": 0.0148, + "reward": 1.3759901523590088, + "reward_std": 0.1790485382080078, + "rewards/accuracy_reward_stage2": 0.516615092754364, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2067 + }, + { + "completion_length": 9.140625, + "epoch": 0.36236201156474507, + "grad_norm": 19.919764455309917, + "kl": 0.1533203125, + "learning_rate": 6.378132118451025e-07, + "loss": -0.0473, + "reward": 1.5150220394134521, + "reward_std": 0.2945772409439087, + "rewards/accuracy_reward_stage2": 0.5618971586227417, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2068 + }, + { + "completion_length": 10.71875, + "epoch": 0.3625372349745926, + "grad_norm": 20.46268376467591, + "kl": 0.1259765625, + "learning_rate": 6.37637988435255e-07, + "loss": 0.019, + "reward": 1.4374001026153564, + "reward_std": 0.30063989758491516, + "rewards/accuracy_reward_stage2": 0.4530249834060669, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2069 + }, + { + "completion_length": 11.6875, + "epoch": 0.36271245838444016, + "grad_norm": 19.75936756902344, + "kl": 0.11181640625, + "learning_rate": 6.374627650254075e-07, + "loss": 0.0447, + "reward": 1.327678918838501, + "reward_std": 0.17409127950668335, + "rewards/accuracy_reward_stage2": 0.4526788890361786, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2070 + }, + { + "completion_length": 8.765625, + "epoch": 0.3628876817942877, + "grad_norm": 22.704894291915178, + "kl": 0.2451171875, + "learning_rate": 6.372875416155598e-07, + "loss": -0.0568, + "reward": 1.6028995513916016, + "reward_std": 0.42264601588249207, + "rewards/accuracy_reward_stage2": 0.6810245513916016, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 2071 + }, + { + "completion_length": 15.046875, + "epoch": 0.36306290520413526, + "grad_norm": 22.1590454237176, + "kl": 0.1474609375, + "learning_rate": 6.371123182057122e-07, + "loss": 0.0625, + "reward": 1.362949013710022, + "reward_std": 0.21775904297828674, + "rewards/accuracy_reward_stage2": 0.503574013710022, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2072 + }, + { + "completion_length": 12.078125, + "epoch": 0.3632381286139828, + "grad_norm": 18.402322297906913, + "kl": 0.1318359375, + "learning_rate": 6.369370947958646e-07, + "loss": -0.1177, + "reward": 1.6029870510101318, + "reward_std": 0.27655163407325745, + "rewards/accuracy_reward_stage2": 0.6654870510101318, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2073 + }, + { + "completion_length": 9.53125, + "epoch": 0.3634133520238304, + "grad_norm": 14.489973679958617, + "kl": 0.138671875, + "learning_rate": 6.367618713860171e-07, + "loss": -0.0287, + "reward": 1.4545139074325562, + "reward_std": 0.25849562883377075, + "rewards/accuracy_reward_stage2": 0.6107639074325562, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2074 + }, + { + "completion_length": 9.328125, + "epoch": 0.36358857543367795, + "grad_norm": 15.909096721212883, + "kl": 0.099609375, + "learning_rate": 6.365866479761696e-07, + "loss": -0.0043, + "reward": 1.6145410537719727, + "reward_std": 0.09930635988712311, + "rewards/accuracy_reward_stage2": 0.6301660537719727, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2075 + }, + { + "completion_length": 11.71875, + "epoch": 0.3637637988435255, + "grad_norm": 23.310846273565204, + "kl": 0.2236328125, + "learning_rate": 6.36411424566322e-07, + "loss": 0.0079, + "reward": 1.6138209104537964, + "reward_std": 0.2763480246067047, + "rewards/accuracy_reward_stage2": 0.6606959104537964, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2076 + }, + { + "completion_length": 8.59375, + "epoch": 0.36393902225337305, + "grad_norm": 19.359891452547277, + "kl": 0.0732421875, + "learning_rate": 6.362362011564745e-07, + "loss": 0.0293, + "reward": 1.501549243927002, + "reward_std": 0.20947618782520294, + "rewards/accuracy_reward_stage2": 0.5015493631362915, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2077 + }, + { + "completion_length": 8.0, + "epoch": 0.3641142456632206, + "grad_norm": 55.484372617544224, + "kl": 0.3359375, + "learning_rate": 6.36060977746627e-07, + "loss": 0.1337, + "reward": 1.5378704071044922, + "reward_std": 0.2450861930847168, + "rewards/accuracy_reward_stage2": 0.6628704071044922, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2078 + }, + { + "completion_length": 11.375, + "epoch": 0.36428946907306814, + "grad_norm": 16.416779187832226, + "kl": 0.0181884765625, + "learning_rate": 6.358857543367794e-07, + "loss": 0.0073, + "reward": 1.8072917461395264, + "reward_std": 0.1814829707145691, + "rewards/accuracy_reward_stage2": 0.8072916865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2079 + }, + { + "completion_length": 10.890625, + "epoch": 0.36446469248291574, + "grad_norm": 16.328709056063502, + "kl": 0.12353515625, + "learning_rate": 6.357105309269319e-07, + "loss": 0.0198, + "reward": 1.7240674495697021, + "reward_std": 0.18539977073669434, + "rewards/accuracy_reward_stage2": 0.8646925091743469, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2080 + }, + { + "completion_length": 8.96875, + "epoch": 0.3646399158927633, + "grad_norm": 15.053467686996736, + "kl": 0.10546875, + "learning_rate": 6.355353075170842e-07, + "loss": -0.0015, + "reward": 1.6101465225219727, + "reward_std": 0.16715562343597412, + "rewards/accuracy_reward_stage2": 0.6257715225219727, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2081 + }, + { + "completion_length": 10.75, + "epoch": 0.36481513930261084, + "grad_norm": 18.342628051433906, + "kl": 0.09912109375, + "learning_rate": 6.353600841072367e-07, + "loss": 0.0005, + "reward": 1.6331281661987305, + "reward_std": 0.16434608399868011, + "rewards/accuracy_reward_stage2": 0.6487530469894409, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2082 + }, + { + "completion_length": 10.359375, + "epoch": 0.3649903627124584, + "grad_norm": 20.182663981829865, + "kl": 0.03466796875, + "learning_rate": 6.351848606973892e-07, + "loss": 0.0265, + "reward": 1.6030964851379395, + "reward_std": 0.13247910141944885, + "rewards/accuracy_reward_stage2": 0.8530964255332947, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2083 + }, + { + "completion_length": 7.984375, + "epoch": 0.36516558612230593, + "grad_norm": 15.458876474291781, + "kl": 0.0189208984375, + "learning_rate": 6.350096372875415e-07, + "loss": 0.0076, + "reward": 1.7303240299224854, + "reward_std": 0.23356689512729645, + "rewards/accuracy_reward_stage2": 0.7303240895271301, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2084 + }, + { + "completion_length": 9.46875, + "epoch": 0.3653408095321535, + "grad_norm": 20.7766939779222, + "kl": 0.1767578125, + "learning_rate": 6.34834413877694e-07, + "loss": 0.0551, + "reward": 1.4968338012695312, + "reward_std": 0.19729726016521454, + "rewards/accuracy_reward_stage2": 0.6374588012695312, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2085 + }, + { + "completion_length": 10.96875, + "epoch": 0.3655160329420011, + "grad_norm": 17.868613252036152, + "kl": 0.1240234375, + "learning_rate": 6.346591904678465e-07, + "loss": 0.0182, + "reward": 1.673816204071045, + "reward_std": 0.23597976565361023, + "rewards/accuracy_reward_stage2": 0.6894412040710449, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2086 + }, + { + "completion_length": 7.734375, + "epoch": 0.3656912563518486, + "grad_norm": 17.78504110129761, + "kl": 0.0216064453125, + "learning_rate": 6.344839670579989e-07, + "loss": 0.0086, + "reward": 1.8547598123550415, + "reward_std": 0.04631902277469635, + "rewards/accuracy_reward_stage2": 0.8547598123550415, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2087 + }, + { + "completion_length": 19.875, + "epoch": 0.3658664797616962, + "grad_norm": 17.125101728063743, + "kl": 0.0162353515625, + "learning_rate": 6.343087436481514e-07, + "loss": 0.0065, + "reward": 1.5940463542938232, + "reward_std": 0.19404737651348114, + "rewards/accuracy_reward_stage2": 0.594046413898468, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2088 + }, + { + "completion_length": 15.6875, + "epoch": 0.3660417031715437, + "grad_norm": 23.482317824072272, + "kl": 0.08837890625, + "learning_rate": 6.341335202383038e-07, + "loss": 0.0354, + "reward": 1.4581522941589355, + "reward_std": 0.21087868511676788, + "rewards/accuracy_reward_stage2": 0.45815223455429077, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2089 + }, + { + "completion_length": 8.03125, + "epoch": 0.36621692658139127, + "grad_norm": 13.199062968702936, + "kl": 0.03125, + "learning_rate": 6.339582968284563e-07, + "loss": 0.0125, + "reward": 1.7291667461395264, + "reward_std": 0.1836046278476715, + "rewards/accuracy_reward_stage2": 0.7291666865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2090 + }, + { + "completion_length": 9.40625, + "epoch": 0.3663921499912388, + "grad_norm": 16.858039673473282, + "kl": 0.13671875, + "learning_rate": 6.337830734186087e-07, + "loss": 0.013, + "reward": 1.5398142337799072, + "reward_std": 0.21081207692623138, + "rewards/accuracy_reward_stage2": 0.5554392337799072, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2091 + }, + { + "completion_length": 8.578125, + "epoch": 0.36656737340108636, + "grad_norm": 18.827872921303243, + "kl": 0.06884765625, + "learning_rate": 6.336078500087611e-07, + "loss": 0.0274, + "reward": 1.4628106355667114, + "reward_std": 0.23339983820915222, + "rewards/accuracy_reward_stage2": 0.5878106355667114, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2092 + }, + { + "completion_length": 6.765625, + "epoch": 0.36674259681093396, + "grad_norm": 14.330653120833654, + "kl": 0.0234375, + "learning_rate": 6.334326265989136e-07, + "loss": 0.0094, + "reward": 1.6666667461395264, + "reward_std": 0.15343135595321655, + "rewards/accuracy_reward_stage2": 0.6666666269302368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2093 + }, + { + "completion_length": 9.015625, + "epoch": 0.3669178202207815, + "grad_norm": 20.552324553426114, + "kl": 0.1357421875, + "learning_rate": 6.332574031890661e-07, + "loss": 0.0543, + "reward": 1.632039189338684, + "reward_std": 0.24369873106479645, + "rewards/accuracy_reward_stage2": 0.6320391297340393, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2094 + }, + { + "completion_length": 17.640625, + "epoch": 0.36709304363062906, + "grad_norm": 20.42833604517789, + "kl": 0.10693359375, + "learning_rate": 6.330821797792185e-07, + "loss": 0.0427, + "reward": 1.4707213640213013, + "reward_std": 0.18929457664489746, + "rewards/accuracy_reward_stage2": 0.47072139382362366, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2095 + }, + { + "completion_length": 9.3125, + "epoch": 0.3672682670404766, + "grad_norm": 19.495544878255846, + "kl": 0.08349609375, + "learning_rate": 6.32906956369371e-07, + "loss": -0.0279, + "reward": 1.6371318101882935, + "reward_std": 0.19539067149162292, + "rewards/accuracy_reward_stage2": 0.6683818101882935, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2096 + }, + { + "completion_length": 9.859375, + "epoch": 0.36744349045032415, + "grad_norm": 24.742390078521236, + "kl": 0.30859375, + "learning_rate": 6.327317329595233e-07, + "loss": 0.1236, + "reward": 1.5489616394042969, + "reward_std": 0.14816182851791382, + "rewards/accuracy_reward_stage2": 0.6739615201950073, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2097 + }, + { + "completion_length": 7.109375, + "epoch": 0.3676187138601717, + "grad_norm": 17.907769375524044, + "kl": 0.06787109375, + "learning_rate": 6.325565095496758e-07, + "loss": -0.0171, + "reward": 1.6017649173736572, + "reward_std": 0.18081702291965485, + "rewards/accuracy_reward_stage2": 0.6173898577690125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2098 + }, + { + "completion_length": 8.96875, + "epoch": 0.3677939372700193, + "grad_norm": 10.834800917825248, + "kl": 0.09765625, + "learning_rate": 6.323812861398283e-07, + "loss": -0.005, + "reward": 1.584733009338379, + "reward_std": 0.10907712578773499, + "rewards/accuracy_reward_stage2": 0.6003579497337341, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2099 + }, + { + "completion_length": 12.34375, + "epoch": 0.36796916067986685, + "grad_norm": 19.662688636320638, + "kl": 0.050048828125, + "learning_rate": 6.322060627299806e-07, + "loss": -0.0131, + "reward": 1.6531250476837158, + "reward_std": 0.3001001179218292, + "rewards/accuracy_reward_stage2": 0.668749988079071, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2100 + }, + { + "completion_length": 10.609375, + "epoch": 0.3681443840897144, + "grad_norm": 24.23833484560236, + "kl": 0.34765625, + "learning_rate": 6.320308393201331e-07, + "loss": 0.1391, + "reward": 1.312018871307373, + "reward_std": 0.26352953910827637, + "rewards/accuracy_reward_stage2": 0.43701881170272827, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2101 + }, + { + "completion_length": 4.859375, + "epoch": 0.36831960749956194, + "grad_norm": 14.376870930324024, + "kl": 0.08740234375, + "learning_rate": 6.318556159102855e-07, + "loss": -0.0093, + "reward": 1.8119255304336548, + "reward_std": 0.12499181926250458, + "rewards/accuracy_reward_stage2": 0.8275505304336548, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2102 + }, + { + "completion_length": 18.75, + "epoch": 0.3684948309094095, + "grad_norm": 17.023075703116874, + "kl": 0.023681640625, + "learning_rate": 6.31680392500438e-07, + "loss": -0.0343, + "reward": 1.6704235076904297, + "reward_std": 0.2026294320821762, + "rewards/accuracy_reward_stage2": 0.6860485076904297, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2103 + }, + { + "completion_length": 9.796875, + "epoch": 0.36867005431925703, + "grad_norm": 18.01675915994742, + "kl": 0.11328125, + "learning_rate": 6.315051690905905e-07, + "loss": 0.0011, + "reward": 1.657438039779663, + "reward_std": 0.21235749125480652, + "rewards/accuracy_reward_stage2": 0.6730630397796631, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2104 + }, + { + "completion_length": 11.578125, + "epoch": 0.3688452777291046, + "grad_norm": 20.560044080024486, + "kl": 0.15625, + "learning_rate": 6.313299456807429e-07, + "loss": 0.0184, + "reward": 1.146272897720337, + "reward_std": 0.21671149134635925, + "rewards/accuracy_reward_stage2": 0.2868978977203369, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2105 + }, + { + "completion_length": 9.4375, + "epoch": 0.3690205011389522, + "grad_norm": 18.96185128228613, + "kl": 0.07958984375, + "learning_rate": 6.311547222708954e-07, + "loss": -0.0178, + "reward": 1.6386375427246094, + "reward_std": 0.23830300569534302, + "rewards/accuracy_reward_stage2": 0.6698874235153198, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2106 + }, + { + "completion_length": 8.796875, + "epoch": 0.36919572454879973, + "grad_norm": 19.016078689475286, + "kl": 0.076171875, + "learning_rate": 6.309794988610479e-07, + "loss": 0.0304, + "reward": 1.6456325054168701, + "reward_std": 0.12463878095149994, + "rewards/accuracy_reward_stage2": 0.6456325054168701, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2107 + }, + { + "completion_length": 10.734375, + "epoch": 0.3693709479586473, + "grad_norm": 16.740065149883968, + "kl": 0.0294189453125, + "learning_rate": 6.308042754512003e-07, + "loss": 0.0118, + "reward": 1.7477238178253174, + "reward_std": 0.10108815133571625, + "rewards/accuracy_reward_stage2": 0.7477236986160278, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2108 + }, + { + "completion_length": 12.078125, + "epoch": 0.3695461713684948, + "grad_norm": 24.223329981252363, + "kl": 0.061279296875, + "learning_rate": 6.306290520413528e-07, + "loss": 0.0245, + "reward": 1.6649169921875, + "reward_std": 0.1374625563621521, + "rewards/accuracy_reward_stage2": 0.6649170517921448, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2109 + }, + { + "completion_length": 9.71875, + "epoch": 0.36972139477834237, + "grad_norm": 16.487067073275398, + "kl": 0.1171875, + "learning_rate": 6.30453828631505e-07, + "loss": 0.0027, + "reward": 1.557640790939331, + "reward_std": 0.22457411885261536, + "rewards/accuracy_reward_stage2": 0.573265790939331, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2110 + }, + { + "completion_length": 8.796875, + "epoch": 0.3698966181881899, + "grad_norm": 21.62888433471808, + "kl": 0.140625, + "learning_rate": 6.302786052216575e-07, + "loss": 0.0561, + "reward": 1.6204335689544678, + "reward_std": 0.22641383111476898, + "rewards/accuracy_reward_stage2": 0.6204336881637573, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2111 + }, + { + "completion_length": 11.328125, + "epoch": 0.3700718415980375, + "grad_norm": 16.87788815845728, + "kl": 0.0078125, + "learning_rate": 6.3010338181181e-07, + "loss": 0.0031, + "reward": 1.8675432205200195, + "reward_std": 0.11035715043544769, + "rewards/accuracy_reward_stage2": 0.8675432205200195, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2112 + }, + { + "completion_length": 9.4375, + "epoch": 0.37024706500788507, + "grad_norm": 24.39768532767775, + "kl": 0.2431640625, + "learning_rate": 6.299281584019624e-07, + "loss": 0.0091, + "reward": 1.415531873703003, + "reward_std": 0.25741642713546753, + "rewards/accuracy_reward_stage2": 0.5717819929122925, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2113 + }, + { + "completion_length": 21.625, + "epoch": 0.3704222884177326, + "grad_norm": 15.837403285224246, + "kl": 0.06884765625, + "learning_rate": 6.297529349921149e-07, + "loss": 0.0276, + "reward": 1.5362218618392944, + "reward_std": 0.1085284948348999, + "rewards/accuracy_reward_stage2": 0.6612218022346497, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2114 + }, + { + "completion_length": 12.28125, + "epoch": 0.37059751182758016, + "grad_norm": 13.917895664990166, + "kl": 0.142578125, + "learning_rate": 6.295777115822674e-07, + "loss": 0.032, + "reward": 1.4274253845214844, + "reward_std": 0.17158064246177673, + "rewards/accuracy_reward_stage2": 0.5680503845214844, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2115 + }, + { + "completion_length": 9.8125, + "epoch": 0.3707727352374277, + "grad_norm": 16.353481146815447, + "kl": 0.326171875, + "learning_rate": 6.294024881724198e-07, + "loss": 0.1303, + "reward": 1.4166667461395264, + "reward_std": 0.11135885119438171, + "rewards/accuracy_reward_stage2": 0.6666666269302368, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2116 + }, + { + "completion_length": 12.40625, + "epoch": 0.37094795864727526, + "grad_norm": 23.075007096362388, + "kl": 0.20703125, + "learning_rate": 6.292272647625723e-07, + "loss": 0.0783, + "reward": 1.3603246212005615, + "reward_std": 0.14933273196220398, + "rewards/accuracy_reward_stage2": 0.6103246808052063, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2117 + }, + { + "completion_length": 8.34375, + "epoch": 0.37112318205712286, + "grad_norm": 24.00052084623189, + "kl": 0.140625, + "learning_rate": 6.290520413527247e-07, + "loss": 0.003, + "reward": 1.4961320161819458, + "reward_std": 0.32098907232284546, + "rewards/accuracy_reward_stage2": 0.5273820161819458, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2118 + }, + { + "completion_length": 9.078125, + "epoch": 0.3712984054669704, + "grad_norm": 20.123218248950387, + "kl": 0.32421875, + "learning_rate": 6.288768179428772e-07, + "loss": 0.1298, + "reward": 1.5326968431472778, + "reward_std": 0.24906522035598755, + "rewards/accuracy_reward_stage2": 0.6576968431472778, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2119 + }, + { + "completion_length": 9.203125, + "epoch": 0.37147362887681795, + "grad_norm": 18.376972007455322, + "kl": 0.12890625, + "learning_rate": 6.287015945330297e-07, + "loss": 0.0123, + "reward": 1.8160606622695923, + "reward_std": 0.270508348941803, + "rewards/accuracy_reward_stage2": 0.8316856026649475, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2120 + }, + { + "completion_length": 12.5, + "epoch": 0.3716488522866655, + "grad_norm": 22.271447130988797, + "kl": 0.109375, + "learning_rate": 6.28526371123182e-07, + "loss": 0.0311, + "reward": 1.5290179252624512, + "reward_std": 0.22298547625541687, + "rewards/accuracy_reward_stage2": 0.6696428656578064, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2121 + }, + { + "completion_length": 10.65625, + "epoch": 0.37182407569651305, + "grad_norm": 19.405882578543313, + "kl": 0.126953125, + "learning_rate": 6.283511477133345e-07, + "loss": -0.0267, + "reward": 1.306645154953003, + "reward_std": 0.23632827401161194, + "rewards/accuracy_reward_stage2": 0.3378952145576477, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2122 + }, + { + "completion_length": 16.796875, + "epoch": 0.3719992991063606, + "grad_norm": 20.1689736986702, + "kl": 0.053955078125, + "learning_rate": 6.281759243034869e-07, + "loss": 0.0215, + "reward": 1.6409006118774414, + "reward_std": 0.14495626091957092, + "rewards/accuracy_reward_stage2": 0.6409005522727966, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2123 + }, + { + "completion_length": 7.40625, + "epoch": 0.37217452251620814, + "grad_norm": 22.532524076188334, + "kl": 0.12890625, + "learning_rate": 6.280007008936393e-07, + "loss": 0.0515, + "reward": 1.6531740427017212, + "reward_std": 0.21819378435611725, + "rewards/accuracy_reward_stage2": 0.6531739830970764, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2124 + }, + { + "completion_length": 7.8125, + "epoch": 0.37234974592605574, + "grad_norm": 16.457651486905775, + "kl": 0.0751953125, + "learning_rate": 6.278254774837918e-07, + "loss": -0.014, + "reward": 1.547957181930542, + "reward_std": 0.13146492838859558, + "rewards/accuracy_reward_stage2": 0.563582181930542, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2125 + }, + { + "completion_length": 15.34375, + "epoch": 0.3725249693359033, + "grad_norm": 64.28172636474491, + "kl": 0.494140625, + "learning_rate": 6.276502540739442e-07, + "loss": 0.1979, + "reward": 1.3385417461395264, + "reward_std": 0.19351094961166382, + "rewards/accuracy_reward_stage2": 0.5885416269302368, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2126 + }, + { + "completion_length": 11.328125, + "epoch": 0.37270019274575084, + "grad_norm": 18.082131358708025, + "kl": 0.08251953125, + "learning_rate": 6.274750306640967e-07, + "loss": 0.001, + "reward": 1.559586763381958, + "reward_std": 0.1725788712501526, + "rewards/accuracy_reward_stage2": 0.575211763381958, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2127 + }, + { + "completion_length": 9.84375, + "epoch": 0.3728754161555984, + "grad_norm": 20.991156359291157, + "kl": 0.1875, + "learning_rate": 6.272998072542492e-07, + "loss": -0.0102, + "reward": 1.5073506832122803, + "reward_std": 0.23045286536216736, + "rewards/accuracy_reward_stage2": 0.5386006236076355, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2128 + }, + { + "completion_length": 10.84375, + "epoch": 0.37305063956544593, + "grad_norm": 14.329424297893015, + "kl": 0.04052734375, + "learning_rate": 6.271245838444016e-07, + "loss": -0.028, + "reward": 1.6722835302352905, + "reward_std": 0.19924761354923248, + "rewards/accuracy_reward_stage2": 0.6879085302352905, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2129 + }, + { + "completion_length": 13.828125, + "epoch": 0.3732258629752935, + "grad_norm": 21.446783046742176, + "kl": 0.306640625, + "learning_rate": 6.26949360434554e-07, + "loss": 0.1223, + "reward": 1.3281686305999756, + "reward_std": 0.2296711653470993, + "rewards/accuracy_reward_stage2": 0.453168660402298, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2130 + }, + { + "completion_length": 7.484375, + "epoch": 0.3734010863851411, + "grad_norm": 17.776532396212232, + "kl": 0.05615234375, + "learning_rate": 6.267741370247065e-07, + "loss": -0.0217, + "reward": 1.695128321647644, + "reward_std": 0.215063214302063, + "rewards/accuracy_reward_stage2": 0.710753321647644, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2131 + }, + { + "completion_length": 10.359375, + "epoch": 0.3735763097949886, + "grad_norm": 16.999543337441377, + "kl": 0.11376953125, + "learning_rate": 6.265989136148589e-07, + "loss": 0.0013, + "reward": 1.6943080425262451, + "reward_std": 0.15610839426517487, + "rewards/accuracy_reward_stage2": 0.7099331021308899, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2132 + }, + { + "completion_length": 8.921875, + "epoch": 0.3737515332048362, + "grad_norm": 16.62115120629419, + "kl": 0.111328125, + "learning_rate": 6.264236902050114e-07, + "loss": 0.0154, + "reward": 1.5700395107269287, + "reward_std": 0.18041619658470154, + "rewards/accuracy_reward_stage2": 0.5856645107269287, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2133 + }, + { + "completion_length": 9.5625, + "epoch": 0.3739267566146837, + "grad_norm": 24.958913260953675, + "kl": 0.296875, + "learning_rate": 6.262484667951638e-07, + "loss": 0.1185, + "reward": 1.5284466743469238, + "reward_std": 0.27969932556152344, + "rewards/accuracy_reward_stage2": 0.6534466743469238, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2134 + }, + { + "completion_length": 10.578125, + "epoch": 0.37410198002453127, + "grad_norm": 19.010121391563455, + "kl": 0.09130859375, + "learning_rate": 6.260732433853162e-07, + "loss": -0.0067, + "reward": 1.3212120532989502, + "reward_std": 0.1627998948097229, + "rewards/accuracy_reward_stage2": 0.4618370831012726, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2135 + }, + { + "completion_length": 9.59375, + "epoch": 0.3742772034343788, + "grad_norm": 15.576660675742353, + "kl": 0.0498046875, + "learning_rate": 6.258980199754687e-07, + "loss": 0.0199, + "reward": 1.5728716850280762, + "reward_std": 0.17854374647140503, + "rewards/accuracy_reward_stage2": 0.6978715658187866, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2136 + }, + { + "completion_length": 9.90625, + "epoch": 0.3744524268442264, + "grad_norm": 19.01937287035376, + "kl": 0.10107421875, + "learning_rate": 6.257227965656211e-07, + "loss": -0.0038, + "reward": 1.3226069211959839, + "reward_std": 0.3287656605243683, + "rewards/accuracy_reward_stage2": 0.3382318615913391, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2137 + }, + { + "completion_length": 10.28125, + "epoch": 0.37462765025407396, + "grad_norm": 17.223905236511207, + "kl": 0.1328125, + "learning_rate": 6.255475731557736e-07, + "loss": 0.0092, + "reward": 1.548721194267273, + "reward_std": 0.196788489818573, + "rewards/accuracy_reward_stage2": 0.5643461346626282, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2138 + }, + { + "completion_length": 11.109375, + "epoch": 0.3748028736639215, + "grad_norm": 18.440782368725525, + "kl": 0.09716796875, + "learning_rate": 6.253723497459261e-07, + "loss": -0.0052, + "reward": 1.5827999114990234, + "reward_std": 0.22741162776947021, + "rewards/accuracy_reward_stage2": 0.5984249114990234, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2139 + }, + { + "completion_length": 9.546875, + "epoch": 0.37497809707376906, + "grad_norm": 21.842367934370266, + "kl": 0.224609375, + "learning_rate": 6.251971263360784e-07, + "loss": 0.0565, + "reward": 1.41269850730896, + "reward_std": 0.2589086890220642, + "rewards/accuracy_reward_stage2": 0.5533234477043152, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2140 + }, + { + "completion_length": 13.734375, + "epoch": 0.3751533204836166, + "grad_norm": 23.19318360447688, + "kl": 0.11474609375, + "learning_rate": 6.250219029262309e-07, + "loss": 0.0458, + "reward": 1.5469526052474976, + "reward_std": 0.23762944340705872, + "rewards/accuracy_reward_stage2": 0.5469525456428528, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2141 + }, + { + "completion_length": 13.078125, + "epoch": 0.37532854389346415, + "grad_norm": 20.666520109212396, + "kl": 0.10498046875, + "learning_rate": 6.248466795163833e-07, + "loss": -0.0003, + "reward": 1.459972858428955, + "reward_std": 0.26203304529190063, + "rewards/accuracy_reward_stage2": 0.4755978584289551, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2142 + }, + { + "completion_length": 11.203125, + "epoch": 0.3755037673033117, + "grad_norm": 61.93263278879373, + "kl": 0.48046875, + "learning_rate": 6.246714561065358e-07, + "loss": 0.1529, + "reward": 1.3742291927337646, + "reward_std": 0.282656192779541, + "rewards/accuracy_reward_stage2": 0.3898541331291199, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2143 + }, + { + "completion_length": 7.984375, + "epoch": 0.3756789907131593, + "grad_norm": 15.307871940423698, + "kl": 0.016357421875, + "learning_rate": 6.244962326966883e-07, + "loss": 0.0066, + "reward": 1.7660094499588013, + "reward_std": 0.20985843241214752, + "rewards/accuracy_reward_stage2": 0.7660094499588013, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2144 + }, + { + "completion_length": 13.625, + "epoch": 0.37585421412300685, + "grad_norm": 17.61172102911123, + "kl": 0.1416015625, + "learning_rate": 6.243210092868407e-07, + "loss": -0.0297, + "reward": 1.2838246822357178, + "reward_std": 0.20607344806194305, + "rewards/accuracy_reward_stage2": 0.44007477164268494, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2145 + }, + { + "completion_length": 20.0, + "epoch": 0.3760294375328544, + "grad_norm": 21.1168053036622, + "kl": 0.058837890625, + "learning_rate": 6.241457858769932e-07, + "loss": 0.0236, + "reward": 1.6720213890075684, + "reward_std": 0.15761704742908478, + "rewards/accuracy_reward_stage2": 0.6720214486122131, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2146 + }, + { + "completion_length": 7.796875, + "epoch": 0.37620466094270194, + "grad_norm": 19.460637478896743, + "kl": 0.060791015625, + "learning_rate": 6.239705624671457e-07, + "loss": -0.0062, + "reward": 1.7685046195983887, + "reward_std": 0.25962045788764954, + "rewards/accuracy_reward_stage2": 0.7841296792030334, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2147 + }, + { + "completion_length": 9.984375, + "epoch": 0.3763798843525495, + "grad_norm": 15.049930214687446, + "kl": 0.10009765625, + "learning_rate": 6.23795339057298e-07, + "loss": -0.0192, + "reward": 1.5843532085418701, + "reward_std": 0.18276052176952362, + "rewards/accuracy_reward_stage2": 0.6156031489372253, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2148 + }, + { + "completion_length": 12.6875, + "epoch": 0.37655510776239703, + "grad_norm": 32.813654287269, + "kl": 0.27734375, + "learning_rate": 6.236201156474505e-07, + "loss": 0.0208, + "reward": 1.2662172317504883, + "reward_std": 0.28443384170532227, + "rewards/accuracy_reward_stage2": 0.5474672317504883, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2149 + }, + { + "completion_length": 16.234375, + "epoch": 0.37673033117224464, + "grad_norm": 21.62062817649044, + "kl": 0.1865234375, + "learning_rate": 6.234448922376028e-07, + "loss": 0.0533, + "reward": 1.4966247081756592, + "reward_std": 0.3166399598121643, + "rewards/accuracy_reward_stage2": 0.5122496485710144, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2150 + }, + { + "completion_length": 11.390625, + "epoch": 0.3769055545820922, + "grad_norm": 34.291704019048645, + "kl": 0.21875, + "learning_rate": 6.232696688277553e-07, + "loss": 0.0125, + "reward": 1.5817184448242188, + "reward_std": 0.2617461681365967, + "rewards/accuracy_reward_stage2": 0.6129684448242188, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2151 + }, + { + "completion_length": 5.9375, + "epoch": 0.37708077799193973, + "grad_norm": 18.933549522426574, + "kl": 0.11572265625, + "learning_rate": 6.230944454179078e-07, + "loss": 0.0109, + "reward": 1.6863667964935303, + "reward_std": 0.24173963069915771, + "rewards/accuracy_reward_stage2": 0.7019917964935303, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2152 + }, + { + "completion_length": 12.671875, + "epoch": 0.3772560014017873, + "grad_norm": 20.28812624382422, + "kl": 0.1416015625, + "learning_rate": 6.229192220080602e-07, + "loss": -0.0256, + "reward": 1.601873517036438, + "reward_std": 0.15559428930282593, + "rewards/accuracy_reward_stage2": 0.633123517036438, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2153 + }, + { + "completion_length": 6.96875, + "epoch": 0.3774312248116348, + "grad_norm": 20.570384731892993, + "kl": 0.302734375, + "learning_rate": 6.227439985982127e-07, + "loss": 0.0875, + "reward": 1.512142300605774, + "reward_std": 0.22807812690734863, + "rewards/accuracy_reward_stage2": 0.6527671813964844, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2154 + }, + { + "completion_length": 11.296875, + "epoch": 0.37760644822148237, + "grad_norm": 22.071858525335955, + "kl": 0.1376953125, + "learning_rate": 6.225687751883652e-07, + "loss": 0.0133, + "reward": 1.5736531019210815, + "reward_std": 0.26310211420059204, + "rewards/accuracy_reward_stage2": 0.7142781615257263, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2155 + }, + { + "completion_length": 10.296875, + "epoch": 0.3777816716313299, + "grad_norm": 18.90253121748625, + "kl": 0.038330078125, + "learning_rate": 6.223935517785176e-07, + "loss": 0.0153, + "reward": 1.4485113620758057, + "reward_std": 0.20023290812969208, + "rewards/accuracy_reward_stage2": 0.4485113024711609, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2156 + }, + { + "completion_length": 8.4375, + "epoch": 0.3779568950411775, + "grad_norm": 14.504422161072734, + "kl": 0.0693359375, + "learning_rate": 6.222183283686701e-07, + "loss": -0.0536, + "reward": 1.6381034851074219, + "reward_std": 0.19941505789756775, + "rewards/accuracy_reward_stage2": 0.7943534851074219, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2157 + }, + { + "completion_length": 10.375, + "epoch": 0.37813211845102507, + "grad_norm": 15.329958007937897, + "kl": 0.0986328125, + "learning_rate": 6.220431049588225e-07, + "loss": -0.0046, + "reward": 1.8430249691009521, + "reward_std": 0.17351368069648743, + "rewards/accuracy_reward_stage2": 0.8586499691009521, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2158 + }, + { + "completion_length": 12.71875, + "epoch": 0.3783073418608726, + "grad_norm": 15.56534467237157, + "kl": 0.04296875, + "learning_rate": 6.21867881548975e-07, + "loss": 0.0171, + "reward": 1.3171195983886719, + "reward_std": 0.21159344911575317, + "rewards/accuracy_reward_stage2": 0.44211962819099426, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2159 + }, + { + "completion_length": 9.0625, + "epoch": 0.37848256527072016, + "grad_norm": 48.364954367785906, + "kl": 0.0859375, + "learning_rate": 6.216926581391274e-07, + "loss": 0.0343, + "reward": 1.4293932914733887, + "reward_std": 0.2702651023864746, + "rewards/accuracy_reward_stage2": 0.5543933510780334, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2160 + }, + { + "completion_length": 9.0, + "epoch": 0.3786577886805677, + "grad_norm": 18.0001437695822, + "kl": 0.0751953125, + "learning_rate": 6.215174347292797e-07, + "loss": 0.03, + "reward": 1.7156038284301758, + "reward_std": 0.1932457685470581, + "rewards/accuracy_reward_stage2": 0.7156038284301758, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2161 + }, + { + "completion_length": 11.046875, + "epoch": 0.37883301209041526, + "grad_norm": 19.107410028406957, + "kl": 0.142578125, + "learning_rate": 6.213422113194322e-07, + "loss": 0.0131, + "reward": 1.3738727569580078, + "reward_std": 0.2558482885360718, + "rewards/accuracy_reward_stage2": 0.3894977271556854, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2162 + }, + { + "completion_length": 12.40625, + "epoch": 0.37900823550026286, + "grad_norm": 21.945046097805324, + "kl": 0.28515625, + "learning_rate": 6.211669879095846e-07, + "loss": 0.0744, + "reward": 1.4311002492904663, + "reward_std": 0.19576802849769592, + "rewards/accuracy_reward_stage2": 0.7123501896858215, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2163 + }, + { + "completion_length": 7.078125, + "epoch": 0.3791834589101104, + "grad_norm": 13.820927562745863, + "kl": 0.09814453125, + "learning_rate": 6.209917644997371e-07, + "loss": -0.0338, + "reward": 1.875470757484436, + "reward_std": 0.1988440304994583, + "rewards/accuracy_reward_stage2": 0.906720757484436, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2164 + }, + { + "completion_length": 5.375, + "epoch": 0.37935868231995795, + "grad_norm": 18.20756335986232, + "kl": 0.1865234375, + "learning_rate": 6.208165410898896e-07, + "loss": 0.0307, + "reward": 1.600242018699646, + "reward_std": 0.19384321570396423, + "rewards/accuracy_reward_stage2": 0.740867018699646, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2165 + }, + { + "completion_length": 8.90625, + "epoch": 0.3795339057298055, + "grad_norm": 19.02185897001305, + "kl": 0.1474609375, + "learning_rate": 6.20641317680042e-07, + "loss": -0.0998, + "reward": 1.5020967721939087, + "reward_std": 0.30960702896118164, + "rewards/accuracy_reward_stage2": 0.5645967125892639, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2166 + }, + { + "completion_length": 18.328125, + "epoch": 0.37970912913965305, + "grad_norm": 18.567459094184585, + "kl": 0.1826171875, + "learning_rate": 6.204660942701945e-07, + "loss": 0.0729, + "reward": 1.330150842666626, + "reward_std": 0.11160098016262054, + "rewards/accuracy_reward_stage2": 0.580150842666626, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2167 + }, + { + "completion_length": 9.109375, + "epoch": 0.3798843525495006, + "grad_norm": 16.303999370260797, + "kl": 0.16015625, + "learning_rate": 6.20290870860347e-07, + "loss": -0.0299, + "reward": 1.5709354877471924, + "reward_std": 0.3122965097427368, + "rewards/accuracy_reward_stage2": 0.6178104877471924, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2168 + }, + { + "completion_length": 7.546875, + "epoch": 0.3800595759593482, + "grad_norm": 17.12522028504591, + "kl": 0.189453125, + "learning_rate": 6.201156474504994e-07, + "loss": 0.0314, + "reward": 1.2758276462554932, + "reward_std": 0.24823029339313507, + "rewards/accuracy_reward_stage2": 0.2914525866508484, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2169 + }, + { + "completion_length": 13.0, + "epoch": 0.38023479936919574, + "grad_norm": 18.81176063318736, + "kl": 0.1513671875, + "learning_rate": 6.199404240406518e-07, + "loss": 0.0165, + "reward": 1.4827790260314941, + "reward_std": 0.21302473545074463, + "rewards/accuracy_reward_stage2": 0.6234040260314941, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2170 + }, + { + "completion_length": 9.21875, + "epoch": 0.3804100227790433, + "grad_norm": 16.121698472462196, + "kl": 0.134765625, + "learning_rate": 6.197652006308043e-07, + "loss": -0.0314, + "reward": 1.466298222541809, + "reward_std": 0.17698809504508972, + "rewards/accuracy_reward_stage2": 0.4975482225418091, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2171 + }, + { + "completion_length": 9.203125, + "epoch": 0.38058524618889084, + "grad_norm": 22.993006711543188, + "kl": 0.2265625, + "learning_rate": 6.195899772209567e-07, + "loss": -0.028, + "reward": 1.6543668508529663, + "reward_std": 0.3108653426170349, + "rewards/accuracy_reward_stage2": 0.8262418508529663, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2172 + }, + { + "completion_length": 20.59375, + "epoch": 0.3807604695987384, + "grad_norm": 22.070196626977605, + "kl": 0.07080078125, + "learning_rate": 6.194147538111091e-07, + "loss": 0.0284, + "reward": 1.2055113315582275, + "reward_std": 0.15062808990478516, + "rewards/accuracy_reward_stage2": 0.20551134645938873, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2173 + }, + { + "completion_length": 8.578125, + "epoch": 0.38093569300858593, + "grad_norm": 79.3112215970755, + "kl": 0.34375, + "learning_rate": 6.192395304012615e-07, + "loss": 0.1048, + "reward": 1.4546903371810913, + "reward_std": 0.2561033368110657, + "rewards/accuracy_reward_stage2": 0.5953153371810913, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2174 + }, + { + "completion_length": 14.328125, + "epoch": 0.3811109164184335, + "grad_norm": 13.769850844797942, + "kl": 0.047607421875, + "learning_rate": 6.19064306991414e-07, + "loss": -0.0237, + "reward": 1.4421863555908203, + "reward_std": 0.18708321452140808, + "rewards/accuracy_reward_stage2": 0.4578113257884979, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2175 + }, + { + "completion_length": 11.84375, + "epoch": 0.3812861398282811, + "grad_norm": 17.77441489979722, + "kl": 0.2734375, + "learning_rate": 6.188890835815665e-07, + "loss": -0.0233, + "reward": 1.3345986604690552, + "reward_std": 0.21731144189834595, + "rewards/accuracy_reward_stage2": 0.6314736604690552, + "rewards/format_reward_stage1_pointerpad": 0.703125, + "scores/accuracy_reward_stage2": 0.703125, + "step": 2176 + }, + { + "completion_length": 7.21875, + "epoch": 0.3814613632381286, + "grad_norm": 14.420432239876597, + "kl": 0.08544921875, + "learning_rate": 6.187138601717189e-07, + "loss": -0.0099, + "reward": 1.375, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward_stage2": 0.390625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2177 + }, + { + "completion_length": 4.53125, + "epoch": 0.3816365866479762, + "grad_norm": 16.752767383625837, + "kl": 0.02490234375, + "learning_rate": 6.185386367618714e-07, + "loss": 0.0099, + "reward": 1.7901811599731445, + "reward_std": 0.18780627846717834, + "rewards/accuracy_reward_stage2": 0.7901811599731445, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2178 + }, + { + "completion_length": 11.984375, + "epoch": 0.3818118100578237, + "grad_norm": 19.134729548097912, + "kl": 0.138671875, + "learning_rate": 6.183634133520237e-07, + "loss": 0.0163, + "reward": 1.8155899047851562, + "reward_std": 0.20211075246334076, + "rewards/accuracy_reward_stage2": 0.8312147855758667, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2179 + }, + { + "completion_length": 10.1875, + "epoch": 0.38198703346767127, + "grad_norm": 20.542028737132014, + "kl": 0.10595703125, + "learning_rate": 6.181881899421762e-07, + "loss": -0.0018, + "reward": 1.8165841102600098, + "reward_std": 0.24220114946365356, + "rewards/accuracy_reward_stage2": 0.8322091102600098, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2180 + }, + { + "completion_length": 8.953125, + "epoch": 0.3821622568775188, + "grad_norm": 15.416932155893878, + "kl": 0.087890625, + "learning_rate": 6.180129665323287e-07, + "loss": 0.0351, + "reward": 1.6418914794921875, + "reward_std": 0.26054543256759644, + "rewards/accuracy_reward_stage2": 0.6418914198875427, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2181 + }, + { + "completion_length": 9.34375, + "epoch": 0.3823374802873664, + "grad_norm": 14.650724234460618, + "kl": 0.162109375, + "learning_rate": 6.178377431224811e-07, + "loss": -0.0025, + "reward": 1.7068524360656738, + "reward_std": 0.19050803780555725, + "rewards/accuracy_reward_stage2": 0.7381024360656738, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2182 + }, + { + "completion_length": 9.671875, + "epoch": 0.38251270369721396, + "grad_norm": 18.73601022394114, + "kl": 0.1962890625, + "learning_rate": 6.176625197126336e-07, + "loss": 0.0369, + "reward": 1.4021036624908447, + "reward_std": 0.19758348166942596, + "rewards/accuracy_reward_stage2": 0.5427286028862, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2183 + }, + { + "completion_length": 10.015625, + "epoch": 0.3826879271070615, + "grad_norm": 16.57620370026051, + "kl": 0.1494140625, + "learning_rate": 6.174872963027861e-07, + "loss": -0.0179, + "reward": 1.5209438800811768, + "reward_std": 0.25040262937545776, + "rewards/accuracy_reward_stage2": 0.552193820476532, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2184 + }, + { + "completion_length": 9.140625, + "epoch": 0.38286315051690906, + "grad_norm": 16.435430776689625, + "kl": 0.1328125, + "learning_rate": 6.173120728929385e-07, + "loss": -0.0297, + "reward": 1.881011962890625, + "reward_std": 0.1751585155725479, + "rewards/accuracy_reward_stage2": 0.9122620224952698, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2185 + }, + { + "completion_length": 7.359375, + "epoch": 0.3830383739267566, + "grad_norm": 12.327616399469395, + "kl": 0.07470703125, + "learning_rate": 6.171368494830909e-07, + "loss": 0.0299, + "reward": 1.6680908203125, + "reward_std": 0.1365250200033188, + "rewards/accuracy_reward_stage2": 0.6680908799171448, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2186 + }, + { + "completion_length": 9.34375, + "epoch": 0.38321359733660415, + "grad_norm": 23.12163155547108, + "kl": 0.126953125, + "learning_rate": 6.169616260732433e-07, + "loss": -0.0032, + "reward": 1.4950852394104004, + "reward_std": 0.20893503725528717, + "rewards/accuracy_reward_stage2": 0.5263352394104004, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2187 + }, + { + "completion_length": 14.5625, + "epoch": 0.38338882074645175, + "grad_norm": 12.708155648632918, + "kl": 0.0595703125, + "learning_rate": 6.167864026633958e-07, + "loss": -0.0552, + "reward": 1.695472002029419, + "reward_std": 0.11602778732776642, + "rewards/accuracy_reward_stage2": 0.7267219424247742, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2188 + }, + { + "completion_length": 8.0625, + "epoch": 0.3835640441562993, + "grad_norm": 19.159424648991013, + "kl": 0.1455078125, + "learning_rate": 6.166111792535483e-07, + "loss": 0.0141, + "reward": 1.589672565460205, + "reward_std": 0.26173871755599976, + "rewards/accuracy_reward_stage2": 0.6052975654602051, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2189 + }, + { + "completion_length": 10.921875, + "epoch": 0.38373926756614685, + "grad_norm": 14.923624480497136, + "kl": 0.04443359375, + "learning_rate": 6.164359558437006e-07, + "loss": 0.0178, + "reward": 1.6148505210876465, + "reward_std": 0.1436673402786255, + "rewards/accuracy_reward_stage2": 0.6148505210876465, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2190 + }, + { + "completion_length": 8.90625, + "epoch": 0.3839144909759944, + "grad_norm": 17.51066119218728, + "kl": 0.12353515625, + "learning_rate": 6.162607324338531e-07, + "loss": 0.0053, + "reward": 1.3896396160125732, + "reward_std": 0.29294657707214355, + "rewards/accuracy_reward_stage2": 0.6552645564079285, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2191 + }, + { + "completion_length": 11.859375, + "epoch": 0.38408971438584194, + "grad_norm": 13.80016364569103, + "kl": 0.1953125, + "learning_rate": 6.160855090240056e-07, + "loss": -0.0815, + "reward": 1.6285045146942139, + "reward_std": 0.3758038282394409, + "rewards/accuracy_reward_stage2": 0.6910045146942139, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2192 + }, + { + "completion_length": 9.546875, + "epoch": 0.3842649377956895, + "grad_norm": 22.35046974856224, + "kl": 0.1201171875, + "learning_rate": 6.15910285614158e-07, + "loss": -0.0138, + "reward": 1.4999234676361084, + "reward_std": 0.2815472483634949, + "rewards/accuracy_reward_stage2": 0.6561734676361084, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2193 + }, + { + "completion_length": 10.109375, + "epoch": 0.38444016120553703, + "grad_norm": 16.895726320418603, + "kl": 0.0830078125, + "learning_rate": 6.157350622043105e-07, + "loss": 0.0331, + "reward": 1.51102614402771, + "reward_std": 0.1778338998556137, + "rewards/accuracy_reward_stage2": 0.51102614402771, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2194 + }, + { + "completion_length": 9.78125, + "epoch": 0.38461538461538464, + "grad_norm": 14.773306048930774, + "kl": 0.1298828125, + "learning_rate": 6.155598387944629e-07, + "loss": -0.0137, + "reward": 1.846685767173767, + "reward_std": 0.21043790876865387, + "rewards/accuracy_reward_stage2": 0.8779357671737671, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2195 + }, + { + "completion_length": 9.265625, + "epoch": 0.3847906080252322, + "grad_norm": 20.128255005421288, + "kl": 0.08984375, + "learning_rate": 6.153846153846154e-07, + "loss": 0.0005, + "reward": 1.0712401866912842, + "reward_std": 0.06993351876735687, + "rewards/accuracy_reward_stage2": 0.3524901568889618, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2196 + }, + { + "completion_length": 10.171875, + "epoch": 0.38496583143507973, + "grad_norm": 14.97086397468025, + "kl": 0.2001953125, + "learning_rate": 6.152093919747679e-07, + "loss": 0.0801, + "reward": 1.7159042358398438, + "reward_std": 0.058444324880838394, + "rewards/accuracy_reward_stage2": 0.8409042954444885, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2197 + }, + { + "completion_length": 6.96875, + "epoch": 0.3851410548449273, + "grad_norm": 14.820380862297538, + "kl": 0.0703125, + "learning_rate": 6.150341685649203e-07, + "loss": 0.0002, + "reward": 1.7224714756011963, + "reward_std": 0.21219471096992493, + "rewards/accuracy_reward_stage2": 0.7380965352058411, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2198 + }, + { + "completion_length": 10.140625, + "epoch": 0.3853162782547748, + "grad_norm": 17.29181890196777, + "kl": 0.12255859375, + "learning_rate": 6.148589451550726e-07, + "loss": 0.0049, + "reward": 1.4856467247009277, + "reward_std": 0.1462748497724533, + "rewards/accuracy_reward_stage2": 0.5012717843055725, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2199 + }, + { + "completion_length": 13.578125, + "epoch": 0.38549150166462237, + "grad_norm": 17.56320922297244, + "kl": 0.1015625, + "learning_rate": 6.146837217452251e-07, + "loss": 0.0406, + "reward": 1.3396921157836914, + "reward_std": 0.24257370829582214, + "rewards/accuracy_reward_stage2": 0.3396921157836914, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2200 + }, + { + "completion_length": 12.375, + "epoch": 0.38566672507447, + "grad_norm": 21.98389020678823, + "kl": 0.10986328125, + "learning_rate": 6.145084983353775e-07, + "loss": -0.0367, + "reward": 1.409691572189331, + "reward_std": 0.2644343078136444, + "rewards/accuracy_reward_stage2": 0.44094154238700867, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2201 + }, + { + "completion_length": 8.609375, + "epoch": 0.3858419484843175, + "grad_norm": 20.532977914563833, + "kl": 0.17578125, + "learning_rate": 6.1433327492553e-07, + "loss": -0.0015, + "reward": 1.5434110164642334, + "reward_std": 0.370197057723999, + "rewards/accuracy_reward_stage2": 0.5746610760688782, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2202 + }, + { + "completion_length": 11.953125, + "epoch": 0.38601717189416507, + "grad_norm": 20.94024842889363, + "kl": 0.1591796875, + "learning_rate": 6.141580515156824e-07, + "loss": 0.0322, + "reward": 1.5620813369750977, + "reward_std": 0.3396362066268921, + "rewards/accuracy_reward_stage2": 0.7027062773704529, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2203 + }, + { + "completion_length": 9.671875, + "epoch": 0.3861923953040126, + "grad_norm": 22.13104501755214, + "kl": 0.0233154296875, + "learning_rate": 6.139828281058349e-07, + "loss": 0.0093, + "reward": 1.7760417461395264, + "reward_std": 0.20276054739952087, + "rewards/accuracy_reward_stage2": 0.7760416269302368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2204 + }, + { + "completion_length": 10.828125, + "epoch": 0.38636761871386016, + "grad_norm": 21.760828418146833, + "kl": 0.06884765625, + "learning_rate": 6.138076046959874e-07, + "loss": 0.0276, + "reward": 1.4502272605895996, + "reward_std": 0.25459161400794983, + "rewards/accuracy_reward_stage2": 0.45022720098495483, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2205 + }, + { + "completion_length": 8.5625, + "epoch": 0.3865428421237077, + "grad_norm": 21.217578680048884, + "kl": 0.11376953125, + "learning_rate": 6.136323812861398e-07, + "loss": 0.0037, + "reward": 1.5424821376800537, + "reward_std": 0.16328753530979156, + "rewards/accuracy_reward_stage2": 0.5581071376800537, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2206 + }, + { + "completion_length": 7.921875, + "epoch": 0.38671806553355526, + "grad_norm": 14.634238577480422, + "kl": 0.1103515625, + "learning_rate": 6.134571578762923e-07, + "loss": -0.0, + "reward": 1.578125, + "reward_std": 0.22097086906433105, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2207 + }, + { + "completion_length": 10.40625, + "epoch": 0.38689328894340286, + "grad_norm": 17.851183119937975, + "kl": 0.1376953125, + "learning_rate": 6.132819344664448e-07, + "loss": 0.0161, + "reward": 1.6369647979736328, + "reward_std": 0.20657899975776672, + "rewards/accuracy_reward_stage2": 0.6525896787643433, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2208 + }, + { + "completion_length": 7.625, + "epoch": 0.3870685123532504, + "grad_norm": 26.111489744750457, + "kl": 0.06396484375, + "learning_rate": 6.131067110565971e-07, + "loss": 0.0257, + "reward": 1.4487724304199219, + "reward_std": 0.24807778000831604, + "rewards/accuracy_reward_stage2": 0.5737723112106323, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2209 + }, + { + "completion_length": 12.96875, + "epoch": 0.38724373576309795, + "grad_norm": 16.602603346013872, + "kl": 0.115234375, + "learning_rate": 6.129314876467496e-07, + "loss": 0.046, + "reward": 1.612157940864563, + "reward_std": 0.18975886702537537, + "rewards/accuracy_reward_stage2": 0.7371578812599182, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2210 + }, + { + "completion_length": 8.140625, + "epoch": 0.3874189591729455, + "grad_norm": 15.491528641737077, + "kl": 0.0947265625, + "learning_rate": 6.12756264236902e-07, + "loss": -0.0461, + "reward": 1.6862807273864746, + "reward_std": 0.23824840784072876, + "rewards/accuracy_reward_stage2": 0.7175307869911194, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2211 + }, + { + "completion_length": 13.265625, + "epoch": 0.38759418258279305, + "grad_norm": 16.70501103695778, + "kl": 0.11767578125, + "learning_rate": 6.125810408270544e-07, + "loss": -0.11, + "reward": 1.2275532484054565, + "reward_std": 0.22072014212608337, + "rewards/accuracy_reward_stage2": 0.29005324840545654, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2212 + }, + { + "completion_length": 11.84375, + "epoch": 0.3877694059926406, + "grad_norm": 10.653261541291995, + "kl": 0.1669921875, + "learning_rate": 6.124058174172069e-07, + "loss": 0.0272, + "reward": 1.6923514604568481, + "reward_std": 0.0814502090215683, + "rewards/accuracy_reward_stage2": 0.8329764604568481, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2213 + }, + { + "completion_length": 12.75, + "epoch": 0.3879446294024882, + "grad_norm": 18.466229184417685, + "kl": 0.2265625, + "learning_rate": 6.122305940073593e-07, + "loss": -0.0294, + "reward": 1.4522085189819336, + "reward_std": 0.3213249742984772, + "rewards/accuracy_reward_stage2": 0.499083548784256, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2214 + }, + { + "completion_length": 9.484375, + "epoch": 0.38811985281233574, + "grad_norm": 23.07519279033274, + "kl": 0.203125, + "learning_rate": 6.120553705975118e-07, + "loss": 0.0371, + "reward": 1.647832989692688, + "reward_std": 0.17026206851005554, + "rewards/accuracy_reward_stage2": 0.7884579300880432, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2215 + }, + { + "completion_length": 8.671875, + "epoch": 0.3882950762221833, + "grad_norm": 28.513257819873985, + "kl": 0.2001953125, + "learning_rate": 6.118801471876643e-07, + "loss": 0.0356, + "reward": 1.6165659427642822, + "reward_std": 0.23038463294506073, + "rewards/accuracy_reward_stage2": 0.6321908831596375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2216 + }, + { + "completion_length": 9.765625, + "epoch": 0.38847029963203084, + "grad_norm": 27.15934863307988, + "kl": 0.138671875, + "learning_rate": 6.117049237778167e-07, + "loss": 0.0113, + "reward": 1.5638474225997925, + "reward_std": 0.2967807650566101, + "rewards/accuracy_reward_stage2": 0.5794724225997925, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2217 + }, + { + "completion_length": 10.890625, + "epoch": 0.3886455230418784, + "grad_norm": 16.716283148171772, + "kl": 0.0673828125, + "learning_rate": 6.115297003679692e-07, + "loss": -0.0103, + "reward": 1.5654242038726807, + "reward_std": 0.19815650582313538, + "rewards/accuracy_reward_stage2": 0.5810492038726807, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2218 + }, + { + "completion_length": 8.0625, + "epoch": 0.38882074645172593, + "grad_norm": 13.171143475838283, + "kl": 0.162109375, + "learning_rate": 6.113544769581215e-07, + "loss": 0.0208, + "reward": 1.3602125644683838, + "reward_std": 0.14076007902622223, + "rewards/accuracy_reward_stage2": 0.3758375942707062, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2219 + }, + { + "completion_length": 11.109375, + "epoch": 0.38899596986157353, + "grad_norm": 22.902761311358915, + "kl": 0.1279296875, + "learning_rate": 6.11179253548274e-07, + "loss": 0.0176, + "reward": 1.2605656385421753, + "reward_std": 0.3278544843196869, + "rewards/accuracy_reward_stage2": 0.5261905789375305, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2220 + }, + { + "completion_length": 10.171875, + "epoch": 0.3891711932714211, + "grad_norm": 60.59700362265599, + "kl": 0.345703125, + "learning_rate": 6.110040301384265e-07, + "loss": 0.138, + "reward": 1.541421890258789, + "reward_std": 0.33629554510116577, + "rewards/accuracy_reward_stage2": 0.5414219498634338, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2221 + }, + { + "completion_length": 17.390625, + "epoch": 0.3893464166812686, + "grad_norm": 21.843503785841705, + "kl": 0.09765625, + "learning_rate": 6.108288067285789e-07, + "loss": 0.039, + "reward": 1.5664076805114746, + "reward_std": 0.17039306461811066, + "rewards/accuracy_reward_stage2": 0.5664076805114746, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2222 + }, + { + "completion_length": 22.28125, + "epoch": 0.3895216400911162, + "grad_norm": 22.159828627899486, + "kl": 0.1787109375, + "learning_rate": 6.106535833187314e-07, + "loss": 0.0059, + "reward": 1.702857255935669, + "reward_std": 0.2878776788711548, + "rewards/accuracy_reward_stage2": 0.734107255935669, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2223 + }, + { + "completion_length": 13.4375, + "epoch": 0.3896968635009637, + "grad_norm": 16.887738826370676, + "kl": 0.2099609375, + "learning_rate": 6.104783599088838e-07, + "loss": 0.048, + "reward": 1.2868878841400146, + "reward_std": 0.15713664889335632, + "rewards/accuracy_reward_stage2": 0.3025129437446594, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2224 + }, + { + "completion_length": 9.578125, + "epoch": 0.38987208691081127, + "grad_norm": 21.232434409269125, + "kl": 0.1416015625, + "learning_rate": 6.103031364990362e-07, + "loss": 0.0125, + "reward": 1.5615177154541016, + "reward_std": 0.21830402314662933, + "rewards/accuracy_reward_stage2": 0.5771427154541016, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2225 + }, + { + "completion_length": 8.75, + "epoch": 0.3900473103206588, + "grad_norm": 20.431913799465242, + "kl": 0.087890625, + "learning_rate": 6.101279130891887e-07, + "loss": -0.009, + "reward": 1.498471736907959, + "reward_std": 0.23524844646453857, + "rewards/accuracy_reward_stage2": 0.514096736907959, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2226 + }, + { + "completion_length": 12.40625, + "epoch": 0.3902225337305064, + "grad_norm": 25.779614194701836, + "kl": 0.1201171875, + "learning_rate": 6.099526896793411e-07, + "loss": 0.0482, + "reward": 1.6931931972503662, + "reward_std": 0.19687005877494812, + "rewards/accuracy_reward_stage2": 0.6931931376457214, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2227 + }, + { + "completion_length": 13.359375, + "epoch": 0.39039775714035396, + "grad_norm": 115.30703123252181, + "kl": 0.55078125, + "learning_rate": 6.097774662694936e-07, + "loss": 0.2402, + "reward": 1.5276418924331665, + "reward_std": 0.11854963004589081, + "rewards/accuracy_reward_stage2": 0.6526418924331665, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2228 + }, + { + "completion_length": 10.796875, + "epoch": 0.3905729805502015, + "grad_norm": 14.790021209721399, + "kl": 0.0634765625, + "learning_rate": 6.09602242859646e-07, + "loss": -0.0188, + "reward": 1.562615156173706, + "reward_std": 0.14368489384651184, + "rewards/accuracy_reward_stage2": 0.578240156173706, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2229 + }, + { + "completion_length": 8.203125, + "epoch": 0.39074820396004906, + "grad_norm": 21.47388207177197, + "kl": 0.16796875, + "learning_rate": 6.094270194497984e-07, + "loss": 0.0673, + "reward": 1.5674675703048706, + "reward_std": 0.29743796586990356, + "rewards/accuracy_reward_stage2": 0.6924675703048706, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2230 + }, + { + "completion_length": 14.078125, + "epoch": 0.3909234273698966, + "grad_norm": 18.456799523246854, + "kl": 0.1591796875, + "learning_rate": 6.092517960399509e-07, + "loss": -0.0777, + "reward": 1.2790626287460327, + "reward_std": 0.2566419243812561, + "rewards/accuracy_reward_stage2": 0.3415626287460327, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2231 + }, + { + "completion_length": 11.09375, + "epoch": 0.39109865077974415, + "grad_norm": 13.925303709250686, + "kl": 0.1455078125, + "learning_rate": 6.090765726301034e-07, + "loss": -0.0635, + "reward": 1.607444405555725, + "reward_std": 0.2526930570602417, + "rewards/accuracy_reward_stage2": 0.7793193459510803, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2232 + }, + { + "completion_length": 24.921875, + "epoch": 0.39127387418959175, + "grad_norm": 23.813652296058024, + "kl": 0.0830078125, + "learning_rate": 6.089013492202558e-07, + "loss": -0.011, + "reward": 1.5801374912261963, + "reward_std": 0.28528648614883423, + "rewards/accuracy_reward_stage2": 0.5957625508308411, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2233 + }, + { + "completion_length": 11.25, + "epoch": 0.3914490975994393, + "grad_norm": 17.95375870725181, + "kl": 0.07373046875, + "learning_rate": 6.087261258104083e-07, + "loss": 0.008, + "reward": 1.7711431980133057, + "reward_std": 0.2443651705980301, + "rewards/accuracy_reward_stage2": 0.7867681980133057, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2234 + }, + { + "completion_length": 11.359375, + "epoch": 0.39162432100928685, + "grad_norm": 14.143073454348915, + "kl": 0.10498046875, + "learning_rate": 6.085509024005607e-07, + "loss": -0.0311, + "reward": 1.688242793083191, + "reward_std": 0.17107422649860382, + "rewards/accuracy_reward_stage2": 0.7194927930831909, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2235 + }, + { + "completion_length": 10.171875, + "epoch": 0.3917995444191344, + "grad_norm": 18.96235279859737, + "kl": 0.212890625, + "learning_rate": 6.083756789907132e-07, + "loss": -0.0775, + "reward": 1.7642440795898438, + "reward_std": 0.3353942632675171, + "rewards/accuracy_reward_stage2": 0.826744019985199, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2236 + }, + { + "completion_length": 10.6875, + "epoch": 0.39197476782898194, + "grad_norm": 44.617783503094316, + "kl": 0.26953125, + "learning_rate": 6.082004555808656e-07, + "loss": 0.0191, + "reward": 1.5416667461395264, + "reward_std": 0.3209052085876465, + "rewards/accuracy_reward_stage2": 0.5729166269302368, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2237 + }, + { + "completion_length": 8.359375, + "epoch": 0.3921499912388295, + "grad_norm": 20.41487185448497, + "kl": 0.2158203125, + "learning_rate": 6.08025232171018e-07, + "loss": 0.0209, + "reward": 1.5106935501098633, + "reward_std": 0.19475436210632324, + "rewards/accuracy_reward_stage2": 0.5419436693191528, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2238 + }, + { + "completion_length": 11.28125, + "epoch": 0.3923252146486771, + "grad_norm": 21.018475522284266, + "kl": 0.24609375, + "learning_rate": 6.078500087611704e-07, + "loss": 0.055, + "reward": 1.3984198570251465, + "reward_std": 0.282496839761734, + "rewards/accuracy_reward_stage2": 0.6640447378158569, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2239 + }, + { + "completion_length": 15.265625, + "epoch": 0.39250043805852464, + "grad_norm": 19.170457946095826, + "kl": 0.11328125, + "learning_rate": 6.076747853513228e-07, + "loss": -0.0192, + "reward": 1.558809757232666, + "reward_std": 0.38950616121292114, + "rewards/accuracy_reward_stage2": 0.590059757232666, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2240 + }, + { + "completion_length": 9.5, + "epoch": 0.3926756614683722, + "grad_norm": 23.118453194182642, + "kl": 0.37890625, + "learning_rate": 6.074995619414753e-07, + "loss": 0.1076, + "reward": 1.6901085376739502, + "reward_std": 0.21713736653327942, + "rewards/accuracy_reward_stage2": 0.830733597278595, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2241 + }, + { + "completion_length": 14.609375, + "epoch": 0.39285088487821973, + "grad_norm": 15.373361647993663, + "kl": 0.024658203125, + "learning_rate": 6.073243385316278e-07, + "loss": 0.0099, + "reward": 1.337906837463379, + "reward_std": 0.150638610124588, + "rewards/accuracy_reward_stage2": 0.3379068374633789, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2242 + }, + { + "completion_length": 24.15625, + "epoch": 0.3930261082880673, + "grad_norm": 22.081573519968778, + "kl": 0.0625, + "learning_rate": 6.071491151217802e-07, + "loss": -0.0585, + "reward": 1.5403672456741333, + "reward_std": 0.2464727759361267, + "rewards/accuracy_reward_stage2": 0.5716171860694885, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2243 + }, + { + "completion_length": 8.8125, + "epoch": 0.3932013316979148, + "grad_norm": 22.833636209238247, + "kl": 0.09814453125, + "learning_rate": 6.069738917119327e-07, + "loss": 0.0077, + "reward": 1.6146864891052246, + "reward_std": 0.2203153371810913, + "rewards/accuracy_reward_stage2": 0.6303114295005798, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2244 + }, + { + "completion_length": 12.921875, + "epoch": 0.39337655510776237, + "grad_norm": 23.377203815412372, + "kl": 0.197265625, + "learning_rate": 6.067986683020852e-07, + "loss": 0.0264, + "reward": 1.4449132680892944, + "reward_std": 0.2471257746219635, + "rewards/accuracy_reward_stage2": 0.6011632084846497, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2245 + }, + { + "completion_length": 10.421875, + "epoch": 0.39355177851761, + "grad_norm": 19.72501700323542, + "kl": 0.1708984375, + "learning_rate": 6.066234448922376e-07, + "loss": 0.0244, + "reward": 1.5555421113967896, + "reward_std": 0.21328100562095642, + "rewards/accuracy_reward_stage2": 0.6961670517921448, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2246 + }, + { + "completion_length": 11.375, + "epoch": 0.3937270019274575, + "grad_norm": 22.98459927286484, + "kl": 0.1513671875, + "learning_rate": 6.064482214823901e-07, + "loss": -0.0156, + "reward": 1.5627611875534058, + "reward_std": 0.366780549287796, + "rewards/accuracy_reward_stage2": 0.5940111875534058, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2247 + }, + { + "completion_length": 10.9375, + "epoch": 0.39390222533730507, + "grad_norm": 22.93261313656731, + "kl": 0.1416015625, + "learning_rate": 6.062729980725426e-07, + "loss": 0.0231, + "reward": 1.6208666563034058, + "reward_std": 0.27645695209503174, + "rewards/accuracy_reward_stage2": 0.6364917159080505, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2248 + }, + { + "completion_length": 11.59375, + "epoch": 0.3940774487471526, + "grad_norm": 17.41514520486553, + "kl": 0.054931640625, + "learning_rate": 6.060977746626949e-07, + "loss": 0.022, + "reward": 1.6029356718063354, + "reward_std": 0.0914289802312851, + "rewards/accuracy_reward_stage2": 0.6029355525970459, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2249 + }, + { + "completion_length": 11.65625, + "epoch": 0.39425267215700016, + "grad_norm": 17.63786857950724, + "kl": 0.04736328125, + "learning_rate": 6.059225512528473e-07, + "loss": 0.0189, + "reward": 1.6209280490875244, + "reward_std": 0.18614742159843445, + "rewards/accuracy_reward_stage2": 0.6209280490875244, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2250 + }, + { + "completion_length": 15.296875, + "epoch": 0.3944278955668477, + "grad_norm": 15.877845199573407, + "kl": 0.06494140625, + "learning_rate": 6.057473278429997e-07, + "loss": 0.0041, + "reward": 1.6792845726013184, + "reward_std": 0.17453373968601227, + "rewards/accuracy_reward_stage2": 0.6949096322059631, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2251 + }, + { + "completion_length": 6.40625, + "epoch": 0.3946031189766953, + "grad_norm": 18.188032907525475, + "kl": 0.087890625, + "learning_rate": 6.055721044331522e-07, + "loss": 0.0351, + "reward": 1.784743070602417, + "reward_std": 0.20735129714012146, + "rewards/accuracy_reward_stage2": 0.784743070602417, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2252 + }, + { + "completion_length": 11.40625, + "epoch": 0.39477834238654286, + "grad_norm": 24.27092280061725, + "kl": 0.032470703125, + "learning_rate": 6.053968810233047e-07, + "loss": 0.013, + "reward": 1.7523884773254395, + "reward_std": 0.18452656269073486, + "rewards/accuracy_reward_stage2": 0.7523884773254395, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2253 + }, + { + "completion_length": 10.8125, + "epoch": 0.3949535657963904, + "grad_norm": 31.106394878920838, + "kl": 0.16796875, + "learning_rate": 6.052216576134571e-07, + "loss": -0.0084, + "reward": 1.6320232152938843, + "reward_std": 0.2674624025821686, + "rewards/accuracy_reward_stage2": 0.6788982152938843, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2254 + }, + { + "completion_length": 8.015625, + "epoch": 0.39512878920623795, + "grad_norm": 19.402853907661573, + "kl": 0.11181640625, + "learning_rate": 6.050464342036096e-07, + "loss": 0.0351, + "reward": 1.7537932395935059, + "reward_std": 0.1687513291835785, + "rewards/accuracy_reward_stage2": 0.7694183588027954, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2255 + }, + { + "completion_length": 11.859375, + "epoch": 0.3953040126160855, + "grad_norm": 23.78392267136443, + "kl": 0.06591796875, + "learning_rate": 6.04871210793762e-07, + "loss": -0.0066, + "reward": 1.2469592094421387, + "reward_std": 0.21802271902561188, + "rewards/accuracy_reward_stage2": 0.38758420944213867, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2256 + }, + { + "completion_length": 8.390625, + "epoch": 0.39547923602593305, + "grad_norm": 17.136787258060238, + "kl": 0.08251953125, + "learning_rate": 6.046959873839145e-07, + "loss": -0.0112, + "reward": 1.6200617551803589, + "reward_std": 0.2077975869178772, + "rewards/accuracy_reward_stage2": 0.7606866955757141, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2257 + }, + { + "completion_length": 20.28125, + "epoch": 0.39565445943578065, + "grad_norm": 20.39764284951551, + "kl": 0.0869140625, + "learning_rate": 6.04520763974067e-07, + "loss": 0.0192, + "reward": 1.4464240074157715, + "reward_std": 0.2652880549430847, + "rewards/accuracy_reward_stage2": 0.5870490074157715, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2258 + }, + { + "completion_length": 7.65625, + "epoch": 0.3958296828456282, + "grad_norm": 16.894150235680982, + "kl": 0.1083984375, + "learning_rate": 6.043455405642193e-07, + "loss": 0.0433, + "reward": 1.6962438821792603, + "reward_std": 0.14486585557460785, + "rewards/accuracy_reward_stage2": 0.6962438225746155, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2259 + }, + { + "completion_length": 25.328125, + "epoch": 0.39600490625547574, + "grad_norm": 15.769439083890266, + "kl": 0.1318359375, + "learning_rate": 6.041703171543718e-07, + "loss": 0.0219, + "reward": 1.3447751998901367, + "reward_std": 0.11412560939788818, + "rewards/accuracy_reward_stage2": 0.6104001998901367, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2260 + }, + { + "completion_length": 9.328125, + "epoch": 0.3961801296653233, + "grad_norm": 12.128215396278677, + "kl": 0.034912109375, + "learning_rate": 6.039950937445243e-07, + "loss": 0.0139, + "reward": 1.8489582538604736, + "reward_std": 0.13258251547813416, + "rewards/accuracy_reward_stage2": 0.8489583730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2261 + }, + { + "completion_length": 8.4375, + "epoch": 0.39635535307517084, + "grad_norm": 18.815808354313415, + "kl": 0.05126953125, + "learning_rate": 6.038198703346767e-07, + "loss": 0.0205, + "reward": 1.71771240234375, + "reward_std": 0.3155533969402313, + "rewards/accuracy_reward_stage2": 0.71771240234375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2262 + }, + { + "completion_length": 12.875, + "epoch": 0.3965305764850184, + "grad_norm": 19.072455081759877, + "kl": 0.2021484375, + "learning_rate": 6.036446469248291e-07, + "loss": 0.0241, + "reward": 1.636966347694397, + "reward_std": 0.3073020577430725, + "rewards/accuracy_reward_stage2": 0.6682164072990417, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2263 + }, + { + "completion_length": 9.46875, + "epoch": 0.39670579989486593, + "grad_norm": 14.377658250079735, + "kl": 0.056640625, + "learning_rate": 6.034694235149815e-07, + "loss": 0.0227, + "reward": 1.4962797164916992, + "reward_std": 0.14264775812625885, + "rewards/accuracy_reward_stage2": 0.6212797164916992, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2264 + }, + { + "completion_length": 11.203125, + "epoch": 0.39688102330471353, + "grad_norm": 22.85198974569718, + "kl": 0.20703125, + "learning_rate": 6.03294200105134e-07, + "loss": -0.0026, + "reward": 1.5522370338439941, + "reward_std": 0.2735584080219269, + "rewards/accuracy_reward_stage2": 0.5834869146347046, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2265 + }, + { + "completion_length": 7.65625, + "epoch": 0.3970562467145611, + "grad_norm": 18.775607460968544, + "kl": 0.052734375, + "learning_rate": 6.031189766952865e-07, + "loss": 0.0211, + "reward": 1.533717155456543, + "reward_std": 0.13953596353530884, + "rewards/accuracy_reward_stage2": 0.533717155456543, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2266 + }, + { + "completion_length": 9.21875, + "epoch": 0.3972314701244086, + "grad_norm": 17.46988710403811, + "kl": 0.10546875, + "learning_rate": 6.029437532854389e-07, + "loss": -0.0019, + "reward": 1.6571948528289795, + "reward_std": 0.20851103961467743, + "rewards/accuracy_reward_stage2": 0.6728198528289795, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2267 + }, + { + "completion_length": 9.796875, + "epoch": 0.3974066935342562, + "grad_norm": 16.101983150034496, + "kl": 0.212890625, + "learning_rate": 6.027685298755914e-07, + "loss": 0.0281, + "reward": 1.4166667461395264, + "reward_std": 0.16781339049339294, + "rewards/accuracy_reward_stage2": 0.5729166865348816, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2268 + }, + { + "completion_length": 7.703125, + "epoch": 0.3975819169441037, + "grad_norm": 18.603337507415958, + "kl": 0.11767578125, + "learning_rate": 6.025933064657438e-07, + "loss": 0.0028, + "reward": 1.6792187690734863, + "reward_std": 0.2111339271068573, + "rewards/accuracy_reward_stage2": 0.6948437690734863, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2269 + }, + { + "completion_length": 10.34375, + "epoch": 0.39775714035395127, + "grad_norm": 21.344244615987552, + "kl": 0.09912109375, + "learning_rate": 6.024180830558962e-07, + "loss": 0.0397, + "reward": 1.2726788520812988, + "reward_std": 0.29367613792419434, + "rewards/accuracy_reward_stage2": 0.39767885208129883, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2270 + }, + { + "completion_length": 9.5625, + "epoch": 0.39793236376379887, + "grad_norm": 18.185559517665077, + "kl": 0.048583984375, + "learning_rate": 6.022428596460487e-07, + "loss": 0.0194, + "reward": 1.5903193950653076, + "reward_std": 0.22018752992153168, + "rewards/accuracy_reward_stage2": 0.5903194546699524, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2271 + }, + { + "completion_length": 8.15625, + "epoch": 0.3981075871736464, + "grad_norm": 19.903045893225336, + "kl": 0.150390625, + "learning_rate": 6.020676362362011e-07, + "loss": 0.06, + "reward": 1.56424081325531, + "reward_std": 0.20303234457969666, + "rewards/accuracy_reward_stage2": 0.5642408132553101, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2272 + }, + { + "completion_length": 7.953125, + "epoch": 0.39828281058349396, + "grad_norm": 8.3917061329364, + "kl": 0.0556640625, + "learning_rate": 6.018924128263536e-07, + "loss": 0.0222, + "reward": 1.484375, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.484375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2273 + }, + { + "completion_length": 10.71875, + "epoch": 0.3984580339933415, + "grad_norm": 17.9917725511186, + "kl": 0.099609375, + "learning_rate": 6.017171894165061e-07, + "loss": -0.0044, + "reward": 1.5761617422103882, + "reward_std": 0.10360636562108994, + "rewards/accuracy_reward_stage2": 0.5917867422103882, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2274 + }, + { + "completion_length": 16.703125, + "epoch": 0.39863325740318906, + "grad_norm": 19.771813163606605, + "kl": 0.10595703125, + "learning_rate": 6.015419660066584e-07, + "loss": 0.0057, + "reward": 1.7596876621246338, + "reward_std": 0.17410920560359955, + "rewards/accuracy_reward_stage2": 0.7753127217292786, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2275 + }, + { + "completion_length": 10.15625, + "epoch": 0.3988084808130366, + "grad_norm": 20.262330099879136, + "kl": 0.03662109375, + "learning_rate": 6.013667425968109e-07, + "loss": 0.0147, + "reward": 1.481999397277832, + "reward_std": 0.11602069437503815, + "rewards/accuracy_reward_stage2": 0.4819994568824768, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2276 + }, + { + "completion_length": 11.34375, + "epoch": 0.39898370422288415, + "grad_norm": 17.02612667675846, + "kl": 0.0673828125, + "learning_rate": 6.011915191869634e-07, + "loss": 0.027, + "reward": 1.57454514503479, + "reward_std": 0.18871784210205078, + "rewards/accuracy_reward_stage2": 0.57454514503479, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2277 + }, + { + "completion_length": 13.0625, + "epoch": 0.39915892763273175, + "grad_norm": 22.694585700918292, + "kl": 0.09326171875, + "learning_rate": 6.010162957771157e-07, + "loss": 0.0373, + "reward": 1.4261221885681152, + "reward_std": 0.23612135648727417, + "rewards/accuracy_reward_stage2": 0.42612212896347046, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2278 + }, + { + "completion_length": 13.890625, + "epoch": 0.3993341510425793, + "grad_norm": 18.353435530372224, + "kl": 0.087890625, + "learning_rate": 6.008410723672682e-07, + "loss": 0.0352, + "reward": 1.4055817127227783, + "reward_std": 0.1411152333021164, + "rewards/accuracy_reward_stage2": 0.4055817723274231, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2279 + }, + { + "completion_length": 11.0, + "epoch": 0.39950937445242685, + "grad_norm": 20.489860262338013, + "kl": 0.17578125, + "learning_rate": 6.006658489574206e-07, + "loss": 0.0699, + "reward": 1.3175370693206787, + "reward_std": 0.13264381885528564, + "rewards/accuracy_reward_stage2": 0.44253700971603394, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2280 + }, + { + "completion_length": 10.59375, + "epoch": 0.3996845978622744, + "grad_norm": 19.79224638731199, + "kl": 0.154296875, + "learning_rate": 6.004906255475731e-07, + "loss": 0.0617, + "reward": 1.0645630359649658, + "reward_std": 0.23653094470500946, + "rewards/accuracy_reward_stage2": 0.4395630359649658, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 2281 + }, + { + "completion_length": 7.859375, + "epoch": 0.39985982127212194, + "grad_norm": 18.134076910029414, + "kl": 0.07373046875, + "learning_rate": 6.003154021377256e-07, + "loss": 0.0296, + "reward": 1.6970256567001343, + "reward_std": 0.06270062178373337, + "rewards/accuracy_reward_stage2": 0.6970256567001343, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2282 + }, + { + "completion_length": 9.625, + "epoch": 0.4000350446819695, + "grad_norm": 17.74429963997985, + "kl": 0.05908203125, + "learning_rate": 6.00140178727878e-07, + "loss": 0.0237, + "reward": 1.6750741004943848, + "reward_std": 0.163091778755188, + "rewards/accuracy_reward_stage2": 0.6750742197036743, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2283 + }, + { + "completion_length": 10.984375, + "epoch": 0.4002102680918171, + "grad_norm": 16.65654588186162, + "kl": 0.1572265625, + "learning_rate": 5.999649553180305e-07, + "loss": 0.063, + "reward": 1.5580189228057861, + "reward_std": 0.2063732147216797, + "rewards/accuracy_reward_stage2": 0.5580189228057861, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2284 + }, + { + "completion_length": 8.609375, + "epoch": 0.40038549150166464, + "grad_norm": 25.418214645346826, + "kl": 0.2333984375, + "learning_rate": 5.99789731908183e-07, + "loss": 0.0503, + "reward": 1.7506150007247925, + "reward_std": 0.248035728931427, + "rewards/accuracy_reward_stage2": 0.7662400007247925, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2285 + }, + { + "completion_length": 9.109375, + "epoch": 0.4005607149115122, + "grad_norm": 24.176216380847496, + "kl": 0.1376953125, + "learning_rate": 5.996145084983354e-07, + "loss": 0.0109, + "reward": 1.501746416091919, + "reward_std": 0.2735890746116638, + "rewards/accuracy_reward_stage2": 0.517371416091919, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2286 + }, + { + "completion_length": 10.515625, + "epoch": 0.40073593832135973, + "grad_norm": 23.296848488888745, + "kl": 0.055908203125, + "learning_rate": 5.994392850884879e-07, + "loss": -0.0108, + "reward": 1.4851511716842651, + "reward_std": 0.2239103764295578, + "rewards/accuracy_reward_stage2": 0.5007761716842651, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2287 + }, + { + "completion_length": 14.796875, + "epoch": 0.4009111617312073, + "grad_norm": 15.295213469553111, + "kl": 0.06396484375, + "learning_rate": 5.992640616786401e-07, + "loss": -0.0134, + "reward": 1.421875, + "reward_std": 0.2597545385360718, + "rewards/accuracy_reward_stage2": 0.4375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2288 + }, + { + "completion_length": 6.953125, + "epoch": 0.4010863851410548, + "grad_norm": 22.493392811606345, + "kl": 0.078125, + "learning_rate": 5.990888382687926e-07, + "loss": 0.0313, + "reward": 1.4117205142974854, + "reward_std": 0.3008676767349243, + "rewards/accuracy_reward_stage2": 0.5367204546928406, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2289 + }, + { + "completion_length": 10.921875, + "epoch": 0.4012616085509024, + "grad_norm": 19.29072294069736, + "kl": 0.0966796875, + "learning_rate": 5.989136148589451e-07, + "loss": 0.0387, + "reward": 1.4010417461395264, + "reward_std": 0.24621228873729706, + "rewards/accuracy_reward_stage2": 0.5260416269302368, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2290 + }, + { + "completion_length": 10.375, + "epoch": 0.40143683196075, + "grad_norm": 17.345347687316323, + "kl": 0.10498046875, + "learning_rate": 5.987383914490975e-07, + "loss": 0.042, + "reward": 1.8262851238250732, + "reward_std": 0.21577240526676178, + "rewards/accuracy_reward_stage2": 0.8262850642204285, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2291 + }, + { + "completion_length": 19.953125, + "epoch": 0.4016120553705975, + "grad_norm": 22.370097121820592, + "kl": 0.1083984375, + "learning_rate": 5.9856316803925e-07, + "loss": 0.0432, + "reward": 1.4479179382324219, + "reward_std": 0.32231512665748596, + "rewards/accuracy_reward_stage2": 0.44791799783706665, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2292 + }, + { + "completion_length": 10.46875, + "epoch": 0.40178727878044507, + "grad_norm": 19.718412661688117, + "kl": 0.146484375, + "learning_rate": 5.983879446294025e-07, + "loss": 0.0143, + "reward": 1.53125, + "reward_std": 0.25513994693756104, + "rewards/accuracy_reward_stage2": 0.671875, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2293 + }, + { + "completion_length": 21.390625, + "epoch": 0.4019625021902926, + "grad_norm": 32.51152529262389, + "kl": 0.07861328125, + "learning_rate": 5.982127212195549e-07, + "loss": 0.0314, + "reward": 1.2806692123413086, + "reward_std": 0.2547403573989868, + "rewards/accuracy_reward_stage2": 0.40566927194595337, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2294 + }, + { + "completion_length": 8.65625, + "epoch": 0.40213772560014016, + "grad_norm": 32.45483359709806, + "kl": 0.078125, + "learning_rate": 5.980374978097074e-07, + "loss": 0.0312, + "reward": 1.7657719850540161, + "reward_std": 0.2733202576637268, + "rewards/accuracy_reward_stage2": 0.7657719850540161, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2295 + }, + { + "completion_length": 9.671875, + "epoch": 0.4023129490099877, + "grad_norm": 18.02935670992023, + "kl": 0.1005859375, + "learning_rate": 5.978622743998598e-07, + "loss": 0.0402, + "reward": 1.7565680742263794, + "reward_std": 0.2380290925502777, + "rewards/accuracy_reward_stage2": 0.7565680742263794, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2296 + }, + { + "completion_length": 7.65625, + "epoch": 0.4024881724198353, + "grad_norm": 16.789220677853052, + "kl": 0.1005859375, + "learning_rate": 5.976870509900123e-07, + "loss": -0.0481, + "reward": 1.4566528797149658, + "reward_std": 0.22974956035614014, + "rewards/accuracy_reward_stage2": 0.48790284991264343, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2297 + }, + { + "completion_length": 8.5, + "epoch": 0.40266339582968286, + "grad_norm": 19.418589942374066, + "kl": 0.0947265625, + "learning_rate": 5.975118275801648e-07, + "loss": -0.0062, + "reward": 1.598454236984253, + "reward_std": 0.19560183584690094, + "rewards/accuracy_reward_stage2": 0.6140791773796082, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2298 + }, + { + "completion_length": 20.4375, + "epoch": 0.4028386192395304, + "grad_norm": 14.144931450364158, + "kl": 0.0174560546875, + "learning_rate": 5.973366041703171e-07, + "loss": -0.0372, + "reward": 1.5348436832427979, + "reward_std": 0.15786908566951752, + "rewards/accuracy_reward_stage2": 0.5504686832427979, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2299 + }, + { + "completion_length": 7.109375, + "epoch": 0.40301384264937795, + "grad_norm": 25.985920547810785, + "kl": 0.08740234375, + "learning_rate": 5.971613807604696e-07, + "loss": 0.035, + "reward": 1.701280117034912, + "reward_std": 0.3256889581680298, + "rewards/accuracy_reward_stage2": 0.7012800574302673, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2300 + }, + { + "completion_length": 9.65625, + "epoch": 0.4031890660592255, + "grad_norm": 18.26426575589906, + "kl": 0.1044921875, + "learning_rate": 5.969861573506219e-07, + "loss": 0.0417, + "reward": 1.7318881750106812, + "reward_std": 0.18695884943008423, + "rewards/accuracy_reward_stage2": 0.8568881750106812, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2301 + }, + { + "completion_length": 8.34375, + "epoch": 0.40336428946907305, + "grad_norm": 19.11727770163386, + "kl": 0.1044921875, + "learning_rate": 5.968109339407744e-07, + "loss": 0.0099, + "reward": 1.6046810150146484, + "reward_std": 0.24243220686912537, + "rewards/accuracy_reward_stage2": 0.6203060150146484, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2302 + }, + { + "completion_length": 9.359375, + "epoch": 0.40353951287892065, + "grad_norm": 23.491406867573176, + "kl": 0.050048828125, + "learning_rate": 5.966357105309269e-07, + "loss": 0.02, + "reward": 1.4263932704925537, + "reward_std": 0.3580125868320465, + "rewards/accuracy_reward_stage2": 0.5513932704925537, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2303 + }, + { + "completion_length": 8.5625, + "epoch": 0.4037147362887682, + "grad_norm": 15.3331068572184, + "kl": 0.041259765625, + "learning_rate": 5.964604871210793e-07, + "loss": 0.0165, + "reward": 1.697649598121643, + "reward_std": 0.19590801000595093, + "rewards/accuracy_reward_stage2": 0.6976495981216431, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2304 + }, + { + "completion_length": 13.046875, + "epoch": 0.40388995969861574, + "grad_norm": 11.462365031990299, + "kl": 0.024658203125, + "learning_rate": 5.962852637112318e-07, + "loss": 0.0099, + "reward": 1.6736334562301636, + "reward_std": 0.063841812312603, + "rewards/accuracy_reward_stage2": 0.6736334562301636, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2305 + }, + { + "completion_length": 8.40625, + "epoch": 0.4040651831084633, + "grad_norm": 16.405845023216393, + "kl": 0.0673828125, + "learning_rate": 5.961100403013843e-07, + "loss": 0.027, + "reward": 1.499981164932251, + "reward_std": 0.10144259035587311, + "rewards/accuracy_reward_stage2": 0.624981164932251, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2306 + }, + { + "completion_length": 10.453125, + "epoch": 0.40424040651831084, + "grad_norm": 15.784375987182974, + "kl": 0.15234375, + "learning_rate": 5.959348168915367e-07, + "loss": -0.0628, + "reward": 1.589674949645996, + "reward_std": 0.27123454213142395, + "rewards/accuracy_reward_stage2": 0.6365499496459961, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2307 + }, + { + "completion_length": 6.421875, + "epoch": 0.4044156299281584, + "grad_norm": 18.311048533606936, + "kl": 0.087890625, + "learning_rate": 5.957595934816891e-07, + "loss": 0.0039, + "reward": 1.6572329998016357, + "reward_std": 0.26454687118530273, + "rewards/accuracy_reward_stage2": 0.672857940196991, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2308 + }, + { + "completion_length": 6.46875, + "epoch": 0.404590853338006, + "grad_norm": 17.68654151954203, + "kl": 0.1416015625, + "learning_rate": 5.955843700718416e-07, + "loss": -0.0318, + "reward": 1.57929265499115, + "reward_std": 0.13953736424446106, + "rewards/accuracy_reward_stage2": 0.6105427145957947, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2309 + }, + { + "completion_length": 12.875, + "epoch": 0.40476607674785353, + "grad_norm": 19.35674883065028, + "kl": 0.12353515625, + "learning_rate": 5.95409146661994e-07, + "loss": 0.0494, + "reward": 1.546875, + "reward_std": 0.2867125868797302, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2310 + }, + { + "completion_length": 10.421875, + "epoch": 0.4049413001577011, + "grad_norm": 19.56185076460949, + "kl": 0.1875, + "learning_rate": 5.952339232521465e-07, + "loss": 0.0361, + "reward": 1.436431646347046, + "reward_std": 0.3601900339126587, + "rewards/accuracy_reward_stage2": 0.7020567059516907, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2311 + }, + { + "completion_length": 7.828125, + "epoch": 0.4051165235675486, + "grad_norm": 22.221393548545816, + "kl": 0.08056640625, + "learning_rate": 5.950586998422989e-07, + "loss": -0.0119, + "reward": 1.5891380310058594, + "reward_std": 0.1557682603597641, + "rewards/accuracy_reward_stage2": 0.7297629714012146, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2312 + }, + { + "completion_length": 9.546875, + "epoch": 0.4052917469773962, + "grad_norm": 21.028475980947228, + "kl": 0.2265625, + "learning_rate": 5.948834764324514e-07, + "loss": 0.016, + "reward": 1.4670155048370361, + "reward_std": 0.20655208826065063, + "rewards/accuracy_reward_stage2": 0.6232655048370361, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2313 + }, + { + "completion_length": 7.890625, + "epoch": 0.4054669703872437, + "grad_norm": 18.888146521184282, + "kl": 0.1494140625, + "learning_rate": 5.947082530226038e-07, + "loss": 0.0155, + "reward": 1.5098905563354492, + "reward_std": 0.2939804792404175, + "rewards/accuracy_reward_stage2": 0.5255155563354492, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2314 + }, + { + "completion_length": 8.03125, + "epoch": 0.40564219379709127, + "grad_norm": 18.779321483798796, + "kl": 0.0888671875, + "learning_rate": 5.945330296127562e-07, + "loss": 0.0355, + "reward": 1.5954867601394653, + "reward_std": 0.27154141664505005, + "rewards/accuracy_reward_stage2": 0.7204867601394653, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2315 + }, + { + "completion_length": 8.671875, + "epoch": 0.40581741720693887, + "grad_norm": 18.485802326126173, + "kl": 0.13671875, + "learning_rate": 5.943578062029087e-07, + "loss": -0.0229, + "reward": 1.25, + "reward_std": 0.24359199404716492, + "rewards/accuracy_reward_stage2": 0.40625, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2316 + }, + { + "completion_length": 10.015625, + "epoch": 0.4059926406167864, + "grad_norm": 28.565253954106154, + "kl": 0.283203125, + "learning_rate": 5.941825827930611e-07, + "loss": 0.0337, + "reward": 1.1393563747406006, + "reward_std": 0.27226772904396057, + "rewards/accuracy_reward_stage2": 0.4206062853336334, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2317 + }, + { + "completion_length": 10.875, + "epoch": 0.40616786402663396, + "grad_norm": 18.355085386228502, + "kl": 0.12890625, + "learning_rate": 5.940073593832135e-07, + "loss": 0.0146, + "reward": 1.6291195154190063, + "reward_std": 0.2628851532936096, + "rewards/accuracy_reward_stage2": 0.6447445154190063, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2318 + }, + { + "completion_length": 9.0, + "epoch": 0.4063430874364815, + "grad_norm": 19.009951811781782, + "kl": 0.087890625, + "learning_rate": 5.93832135973366e-07, + "loss": 0.0351, + "reward": 1.7401200532913208, + "reward_std": 0.21193233132362366, + "rewards/accuracy_reward_stage2": 0.740119993686676, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2319 + }, + { + "completion_length": 7.84375, + "epoch": 0.40651831084632906, + "grad_norm": 16.304528563870416, + "kl": 0.0439453125, + "learning_rate": 5.936569125635184e-07, + "loss": 0.0175, + "reward": 1.640625, + "reward_std": 0.32878512144088745, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2320 + }, + { + "completion_length": 15.15625, + "epoch": 0.4066935342561766, + "grad_norm": 15.160611953847901, + "kl": 0.037109375, + "learning_rate": 5.934816891536709e-07, + "loss": 0.0148, + "reward": 1.5934399366378784, + "reward_std": 0.07537385821342468, + "rewards/accuracy_reward_stage2": 0.5934398770332336, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2321 + }, + { + "completion_length": 9.328125, + "epoch": 0.4068687576660242, + "grad_norm": 17.838232315322752, + "kl": 0.0400390625, + "learning_rate": 5.933064657438234e-07, + "loss": 0.0159, + "reward": 1.8585360050201416, + "reward_std": 0.12488370388746262, + "rewards/accuracy_reward_stage2": 0.8585360050201416, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2322 + }, + { + "completion_length": 11.21875, + "epoch": 0.40704398107587175, + "grad_norm": 18.85560208158748, + "kl": 0.076171875, + "learning_rate": 5.931312423339758e-07, + "loss": 0.0306, + "reward": 1.368743658065796, + "reward_std": 0.16224510967731476, + "rewards/accuracy_reward_stage2": 0.3687437176704407, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2323 + }, + { + "completion_length": 12.53125, + "epoch": 0.4072192044857193, + "grad_norm": 19.332274366874387, + "kl": 0.056884765625, + "learning_rate": 5.929560189241283e-07, + "loss": 0.0228, + "reward": 1.4458041191101074, + "reward_std": 0.19428783655166626, + "rewards/accuracy_reward_stage2": 0.5708041787147522, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2324 + }, + { + "completion_length": 10.421875, + "epoch": 0.40739442789556685, + "grad_norm": 18.71638987852407, + "kl": 0.0732421875, + "learning_rate": 5.927807955142807e-07, + "loss": -0.0591, + "reward": 1.426041603088379, + "reward_std": 0.261094331741333, + "rewards/accuracy_reward_stage2": 0.4572916626930237, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2325 + }, + { + "completion_length": 9.234375, + "epoch": 0.4075696513054144, + "grad_norm": 11.874790893429372, + "kl": 0.06396484375, + "learning_rate": 5.926055721044331e-07, + "loss": 0.0256, + "reward": 1.5826786756515503, + "reward_std": 0.08243951201438904, + "rewards/accuracy_reward_stage2": 0.5826787948608398, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2326 + }, + { + "completion_length": 7.5625, + "epoch": 0.40774487471526194, + "grad_norm": 20.37028603700548, + "kl": 0.09033203125, + "learning_rate": 5.924303486945856e-07, + "loss": 0.0362, + "reward": 1.448150634765625, + "reward_std": 0.20492343604564667, + "rewards/accuracy_reward_stage2": 0.4481506943702698, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2327 + }, + { + "completion_length": 10.703125, + "epoch": 0.4079200981251095, + "grad_norm": 17.434897565743288, + "kl": 0.1474609375, + "learning_rate": 5.922551252847379e-07, + "loss": 0.0147, + "reward": 1.474340558052063, + "reward_std": 0.2231925129890442, + "rewards/accuracy_reward_stage2": 0.4899655282497406, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2328 + }, + { + "completion_length": 9.25, + "epoch": 0.4080953215349571, + "grad_norm": 16.34813909602192, + "kl": 0.06201171875, + "learning_rate": 5.920799018748904e-07, + "loss": 0.0248, + "reward": 1.3414571285247803, + "reward_std": 0.09716267138719559, + "rewards/accuracy_reward_stage2": 0.3414571285247803, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2329 + }, + { + "completion_length": 10.53125, + "epoch": 0.40827054494480464, + "grad_norm": 22.23078087484954, + "kl": 0.091796875, + "learning_rate": 5.919046784650429e-07, + "loss": 0.0368, + "reward": 1.6912882328033447, + "reward_std": 0.2064078152179718, + "rewards/accuracy_reward_stage2": 0.8162882328033447, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2330 + }, + { + "completion_length": 16.40625, + "epoch": 0.4084457683546522, + "grad_norm": 24.824321168016088, + "kl": 0.04248046875, + "learning_rate": 5.917294550551953e-07, + "loss": 0.017, + "reward": 1.4140516519546509, + "reward_std": 0.2925530672073364, + "rewards/accuracy_reward_stage2": 0.5390516519546509, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2331 + }, + { + "completion_length": 16.28125, + "epoch": 0.40862099176449973, + "grad_norm": 18.417940232007503, + "kl": 0.10205078125, + "learning_rate": 5.915542316453478e-07, + "loss": 0.0408, + "reward": 1.3555889129638672, + "reward_std": 0.1305437982082367, + "rewards/accuracy_reward_stage2": 0.48058879375457764, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2332 + }, + { + "completion_length": 10.234375, + "epoch": 0.4087962151743473, + "grad_norm": 18.67898869078281, + "kl": 0.158203125, + "learning_rate": 5.913790082355002e-07, + "loss": 0.0191, + "reward": 1.4799710512161255, + "reward_std": 0.2335529327392578, + "rewards/accuracy_reward_stage2": 0.6205961108207703, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2333 + }, + { + "completion_length": 9.296875, + "epoch": 0.4089714385841948, + "grad_norm": 20.62020309808911, + "kl": 0.119140625, + "learning_rate": 5.912037848256527e-07, + "loss": 0.0477, + "reward": 1.198518991470337, + "reward_std": 0.3195599317550659, + "rewards/accuracy_reward_stage2": 0.4485190510749817, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2334 + }, + { + "completion_length": 9.1875, + "epoch": 0.4091466619940424, + "grad_norm": 15.477641285474997, + "kl": 0.048583984375, + "learning_rate": 5.910285614158052e-07, + "loss": 0.0194, + "reward": 1.541133999824524, + "reward_std": 0.17401783168315887, + "rewards/accuracy_reward_stage2": 0.5411341190338135, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2335 + }, + { + "completion_length": 22.546875, + "epoch": 0.40932188540389, + "grad_norm": 16.15447248636756, + "kl": 0.171875, + "learning_rate": 5.908533380059576e-07, + "loss": 0.025, + "reward": 1.6095988750457764, + "reward_std": 0.27953198552131653, + "rewards/accuracy_reward_stage2": 0.6252239942550659, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2336 + }, + { + "completion_length": 7.8125, + "epoch": 0.4094971088137375, + "grad_norm": 14.97687341250913, + "kl": 0.060791015625, + "learning_rate": 5.906781145961101e-07, + "loss": 0.0244, + "reward": 1.546875, + "reward_std": 0.1530819982290268, + "rewards/accuracy_reward_stage2": 0.671875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2337 + }, + { + "completion_length": 9.25, + "epoch": 0.40967233222358507, + "grad_norm": 20.206446418912705, + "kl": 0.055908203125, + "learning_rate": 5.905028911862626e-07, + "loss": 0.0224, + "reward": 1.4281362295150757, + "reward_std": 0.1827203780412674, + "rewards/accuracy_reward_stage2": 0.4281362295150757, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2338 + }, + { + "completion_length": 14.078125, + "epoch": 0.4098475556334326, + "grad_norm": 18.36376649072188, + "kl": 0.080078125, + "learning_rate": 5.903276677764148e-07, + "loss": -0.0034, + "reward": 1.5158047676086426, + "reward_std": 0.28939586877822876, + "rewards/accuracy_reward_stage2": 0.6564297080039978, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2339 + }, + { + "completion_length": 7.484375, + "epoch": 0.41002277904328016, + "grad_norm": 19.562603721538245, + "kl": 0.08447265625, + "learning_rate": 5.901524443665673e-07, + "loss": 0.0014, + "reward": 1.4270833730697632, + "reward_std": 0.32159775495529175, + "rewards/accuracy_reward_stage2": 0.4427083432674408, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2340 + }, + { + "completion_length": 14.015625, + "epoch": 0.41019800245312776, + "grad_norm": 77.16144099299103, + "kl": 0.37109375, + "learning_rate": 5.899772209567197e-07, + "loss": 0.0955, + "reward": 1.4920990467071533, + "reward_std": 0.1890915185213089, + "rewards/accuracy_reward_stage2": 0.6483490467071533, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2341 + }, + { + "completion_length": 9.3125, + "epoch": 0.4103732258629753, + "grad_norm": 21.558413026021842, + "kl": 0.2490234375, + "learning_rate": 5.898019975468722e-07, + "loss": 0.0554, + "reward": 1.7347142696380615, + "reward_std": 0.21133540570735931, + "rewards/accuracy_reward_stage2": 0.7503393888473511, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2342 + }, + { + "completion_length": 10.125, + "epoch": 0.41054844927282286, + "grad_norm": 22.687231445611207, + "kl": 0.16796875, + "learning_rate": 5.896267741370247e-07, + "loss": 0.0671, + "reward": 1.3260161876678467, + "reward_std": 0.19513970613479614, + "rewards/accuracy_reward_stage2": 0.5760161876678467, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2343 + }, + { + "completion_length": 9.734375, + "epoch": 0.4107236726826704, + "grad_norm": 18.09101369627577, + "kl": 0.0947265625, + "learning_rate": 5.894515507271771e-07, + "loss": 0.0018, + "reward": 1.4208391904830933, + "reward_std": 0.28820210695266724, + "rewards/accuracy_reward_stage2": 0.43646419048309326, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2344 + }, + { + "completion_length": 9.921875, + "epoch": 0.41089889609251795, + "grad_norm": 13.054076648742582, + "kl": 0.047607421875, + "learning_rate": 5.892763273173296e-07, + "loss": 0.0191, + "reward": 1.3740955591201782, + "reward_std": 0.06014459952712059, + "rewards/accuracy_reward_stage2": 0.49909549951553345, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2345 + }, + { + "completion_length": 11.375, + "epoch": 0.4110741195023655, + "grad_norm": 12.061553291437448, + "kl": 0.07275390625, + "learning_rate": 5.891011039074821e-07, + "loss": 0.0292, + "reward": 1.8303499221801758, + "reward_std": 0.11832354962825775, + "rewards/accuracy_reward_stage2": 0.8303500413894653, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2346 + }, + { + "completion_length": 17.546875, + "epoch": 0.41124934291221305, + "grad_norm": 15.867100667252611, + "kl": 0.050537109375, + "learning_rate": 5.889258804976345e-07, + "loss": -0.024, + "reward": 1.6114246845245361, + "reward_std": 0.2627890110015869, + "rewards/accuracy_reward_stage2": 0.6270497441291809, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2347 + }, + { + "completion_length": 12.171875, + "epoch": 0.41142456632206065, + "grad_norm": 20.27755333621668, + "kl": 0.0703125, + "learning_rate": 5.887506570877869e-07, + "loss": 0.0282, + "reward": 1.559401035308838, + "reward_std": 0.2290632128715515, + "rewards/accuracy_reward_stage2": 0.5594009160995483, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2348 + }, + { + "completion_length": 11.90625, + "epoch": 0.4115997897319082, + "grad_norm": 23.597797783521305, + "kl": 0.07470703125, + "learning_rate": 5.885754336779393e-07, + "loss": 0.0299, + "reward": 1.4080188274383545, + "reward_std": 0.2232939600944519, + "rewards/accuracy_reward_stage2": 0.5330188274383545, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2349 + }, + { + "completion_length": 17.25, + "epoch": 0.41177501314175574, + "grad_norm": 9.278143499351788, + "kl": 0.06640625, + "learning_rate": 5.884002102680918e-07, + "loss": -0.0134, + "reward": 1.5017169713974, + "reward_std": 0.0607755072414875, + "rewards/accuracy_reward_stage2": 0.5173419117927551, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2350 + }, + { + "completion_length": 12.953125, + "epoch": 0.4119502365516033, + "grad_norm": 23.03371984485041, + "kl": 0.10595703125, + "learning_rate": 5.882249868582443e-07, + "loss": 0.0423, + "reward": 1.599036693572998, + "reward_std": 0.14129234850406647, + "rewards/accuracy_reward_stage2": 0.5990367531776428, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2351 + }, + { + "completion_length": 15.171875, + "epoch": 0.41212545996145084, + "grad_norm": 18.022645161745377, + "kl": 0.054931640625, + "learning_rate": 5.880497634483966e-07, + "loss": -0.0222, + "reward": 1.696798324584961, + "reward_std": 0.17698150873184204, + "rewards/accuracy_reward_stage2": 0.7124233245849609, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2352 + }, + { + "completion_length": 9.84375, + "epoch": 0.4123006833712984, + "grad_norm": 12.495739623364917, + "kl": 0.034912109375, + "learning_rate": 5.878745400385491e-07, + "loss": 0.014, + "reward": 1.3732178211212158, + "reward_std": 0.17081034183502197, + "rewards/accuracy_reward_stage2": 0.37321779131889343, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2353 + }, + { + "completion_length": 17.03125, + "epoch": 0.412475906781146, + "grad_norm": 20.169046026792728, + "kl": 0.0478515625, + "learning_rate": 5.876993166287016e-07, + "loss": -0.0167, + "reward": 1.5913221836090088, + "reward_std": 0.24479767680168152, + "rewards/accuracy_reward_stage2": 0.6069472432136536, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2354 + }, + { + "completion_length": 9.75, + "epoch": 0.41265113019099353, + "grad_norm": 19.99644756259702, + "kl": 0.1630859375, + "learning_rate": 5.87524093218854e-07, + "loss": 0.0653, + "reward": 1.5812091827392578, + "reward_std": 0.24733664095401764, + "rewards/accuracy_reward_stage2": 0.581209123134613, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2355 + }, + { + "completion_length": 10.484375, + "epoch": 0.4128263536008411, + "grad_norm": 17.799954110495957, + "kl": 0.1416015625, + "learning_rate": 5.873488698090065e-07, + "loss": 0.0207, + "reward": 1.379780888557434, + "reward_std": 0.19786177575588226, + "rewards/accuracy_reward_stage2": 0.5204058289527893, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2356 + }, + { + "completion_length": 8.453125, + "epoch": 0.4130015770106886, + "grad_norm": 20.148851520890076, + "kl": 0.08203125, + "learning_rate": 5.871736463991589e-07, + "loss": -0.0114, + "reward": 1.6698863506317139, + "reward_std": 0.26759451627731323, + "rewards/accuracy_reward_stage2": 0.6855113506317139, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2357 + }, + { + "completion_length": 8.734375, + "epoch": 0.41317680042053617, + "grad_norm": 10.823426387963599, + "kl": 0.1337890625, + "learning_rate": 5.869984229893113e-07, + "loss": -0.0319, + "reward": 1.6802399158477783, + "reward_std": 0.14132839441299438, + "rewards/accuracy_reward_stage2": 0.7114899754524231, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2358 + }, + { + "completion_length": 7.265625, + "epoch": 0.4133520238303837, + "grad_norm": 19.50099203681281, + "kl": 0.1533203125, + "learning_rate": 5.868231995794638e-07, + "loss": -0.0199, + "reward": 1.6012730598449707, + "reward_std": 0.31847089529037476, + "rewards/accuracy_reward_stage2": 0.6481481194496155, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2359 + }, + { + "completion_length": 7.390625, + "epoch": 0.4135272472402313, + "grad_norm": 24.237481396274326, + "kl": 0.369140625, + "learning_rate": 5.866479761696162e-07, + "loss": 0.0941, + "reward": 1.1497249603271484, + "reward_std": 0.31039959192276, + "rewards/accuracy_reward_stage2": 0.4153498709201813, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2360 + }, + { + "completion_length": 9.921875, + "epoch": 0.41370247065007887, + "grad_norm": 16.566273456490055, + "kl": 0.1748046875, + "learning_rate": 5.864727527597687e-07, + "loss": 0.0308, + "reward": 1.4874138832092285, + "reward_std": 0.17307858169078827, + "rewards/accuracy_reward_stage2": 0.6280390024185181, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2361 + }, + { + "completion_length": 8.625, + "epoch": 0.4138776940599264, + "grad_norm": 11.367175152946368, + "kl": 0.037109375, + "learning_rate": 5.862975293499212e-07, + "loss": 0.0148, + "reward": 1.4189951419830322, + "reward_std": 0.07283923774957657, + "rewards/accuracy_reward_stage2": 0.41899508237838745, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2362 + }, + { + "completion_length": 14.890625, + "epoch": 0.41405291746977396, + "grad_norm": 23.14965223243577, + "kl": 0.11279296875, + "learning_rate": 5.861223059400736e-07, + "loss": 0.0601, + "reward": 1.551171064376831, + "reward_std": 0.19559019804000854, + "rewards/accuracy_reward_stage2": 0.6761711239814758, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2363 + }, + { + "completion_length": 12.125, + "epoch": 0.4142281408796215, + "grad_norm": 13.994240957679672, + "kl": 0.1396484375, + "learning_rate": 5.859470825302261e-07, + "loss": 0.0115, + "reward": 1.59100341796875, + "reward_std": 0.14728805422782898, + "rewards/accuracy_reward_stage2": 0.6066284775733948, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2364 + }, + { + "completion_length": 9.78125, + "epoch": 0.41440336428946906, + "grad_norm": 21.323259031008334, + "kl": 0.1328125, + "learning_rate": 5.857718591203784e-07, + "loss": 0.0089, + "reward": 1.6100748777389526, + "reward_std": 0.28893154859542847, + "rewards/accuracy_reward_stage2": 0.6256999373435974, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2365 + }, + { + "completion_length": 14.53125, + "epoch": 0.4145785876993166, + "grad_norm": 27.66782558140099, + "kl": 0.1904296875, + "learning_rate": 5.855966357105309e-07, + "loss": 0.0761, + "reward": 1.4185956716537476, + "reward_std": 0.2524084746837616, + "rewards/accuracy_reward_stage2": 0.5435957312583923, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2366 + }, + { + "completion_length": 6.34375, + "epoch": 0.4147538111091642, + "grad_norm": 18.43566490915197, + "kl": 0.040283203125, + "learning_rate": 5.854214123006834e-07, + "loss": 0.0161, + "reward": 1.792750358581543, + "reward_std": 0.12878680229187012, + "rewards/accuracy_reward_stage2": 0.7927502393722534, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2367 + }, + { + "completion_length": 14.59375, + "epoch": 0.41492903451901175, + "grad_norm": 13.91831957839429, + "kl": 0.083984375, + "learning_rate": 5.852461888908357e-07, + "loss": 0.0056, + "reward": 1.3589837551116943, + "reward_std": 0.1351110190153122, + "rewards/accuracy_reward_stage2": 0.37460869550704956, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2368 + }, + { + "completion_length": 10.5, + "epoch": 0.4151042579288593, + "grad_norm": 16.737366881330868, + "kl": 0.1142578125, + "learning_rate": 5.850709654809882e-07, + "loss": 0.0019, + "reward": 1.6517565250396729, + "reward_std": 0.16313903033733368, + "rewards/accuracy_reward_stage2": 0.6673814654350281, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2369 + }, + { + "completion_length": 14.828125, + "epoch": 0.41527948133870685, + "grad_norm": 17.248787576180106, + "kl": 0.220703125, + "learning_rate": 5.848957420711407e-07, + "loss": -0.0111, + "reward": 1.3924638032913208, + "reward_std": 0.207502081990242, + "rewards/accuracy_reward_stage2": 0.6893388032913208, + "rewards/format_reward_stage1_pointerpad": 0.703125, + "scores/accuracy_reward_stage2": 0.703125, + "step": 2370 + }, + { + "completion_length": 20.984375, + "epoch": 0.4154547047485544, + "grad_norm": 20.195264394665625, + "kl": 0.06982421875, + "learning_rate": 5.847205186612931e-07, + "loss": 0.0116, + "reward": 1.4419444799423218, + "reward_std": 0.1619192659854889, + "rewards/accuracy_reward_stage2": 0.582569420337677, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2371 + }, + { + "completion_length": 8.703125, + "epoch": 0.41562992815840194, + "grad_norm": 23.724709031407677, + "kl": 0.1279296875, + "learning_rate": 5.845452952514456e-07, + "loss": 0.0069, + "reward": 1.5115091800689697, + "reward_std": 0.25413644313812256, + "rewards/accuracy_reward_stage2": 0.6521342396736145, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2372 + }, + { + "completion_length": 11.46875, + "epoch": 0.41580515156824954, + "grad_norm": 24.900258630232408, + "kl": 0.2255859375, + "learning_rate": 5.84370071841598e-07, + "loss": 0.0916, + "reward": 1.3193838596343994, + "reward_std": 0.2140500545501709, + "rewards/accuracy_reward_stage2": 0.44438380002975464, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2373 + }, + { + "completion_length": 12.390625, + "epoch": 0.4159803749780971, + "grad_norm": 13.963494157079705, + "kl": 0.134765625, + "learning_rate": 5.841948484317505e-07, + "loss": 0.0195, + "reward": 1.5970426797866821, + "reward_std": 0.18137384951114655, + "rewards/accuracy_reward_stage2": 0.6126677989959717, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2374 + }, + { + "completion_length": 10.265625, + "epoch": 0.41615559838794464, + "grad_norm": 26.68313987733041, + "kl": 0.0576171875, + "learning_rate": 5.84019625021903e-07, + "loss": 0.023, + "reward": 1.7117927074432373, + "reward_std": 0.14283138513565063, + "rewards/accuracy_reward_stage2": 0.7117927670478821, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2375 + }, + { + "completion_length": 6.828125, + "epoch": 0.4163308217977922, + "grad_norm": 19.976590487399438, + "kl": 0.038330078125, + "learning_rate": 5.838444016120554e-07, + "loss": 0.0153, + "reward": 1.5760102272033691, + "reward_std": 0.1239052414894104, + "rewards/accuracy_reward_stage2": 0.5760102868080139, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2376 + }, + { + "completion_length": 13.8125, + "epoch": 0.41650604520763973, + "grad_norm": 24.706278623692512, + "kl": 0.109375, + "learning_rate": 5.836691782022077e-07, + "loss": 0.0437, + "reward": 1.5468864440917969, + "reward_std": 0.24405643343925476, + "rewards/accuracy_reward_stage2": 0.5468865036964417, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2377 + }, + { + "completion_length": 9.078125, + "epoch": 0.4166812686174873, + "grad_norm": 23.394912393901517, + "kl": 0.171875, + "learning_rate": 5.834939547923601e-07, + "loss": 0.0247, + "reward": 1.769432544708252, + "reward_std": 0.27068987488746643, + "rewards/accuracy_reward_stage2": 0.785057544708252, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2378 + }, + { + "completion_length": 11.515625, + "epoch": 0.4168564920273349, + "grad_norm": 14.7960789603516, + "kl": 0.0257568359375, + "learning_rate": 5.833187313825126e-07, + "loss": 0.0103, + "reward": 1.6770833730697632, + "reward_std": 0.13835059106349945, + "rewards/accuracy_reward_stage2": 0.6770833730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2379 + }, + { + "completion_length": 14.671875, + "epoch": 0.4170317154371824, + "grad_norm": 14.686541826227506, + "kl": 0.04296875, + "learning_rate": 5.831435079726651e-07, + "loss": -0.0271, + "reward": 1.234375, + "reward_std": 0.19044627249240875, + "rewards/accuracy_reward_stage2": 0.25, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2380 + }, + { + "completion_length": 9.390625, + "epoch": 0.41720693884703, + "grad_norm": 16.51517195261775, + "kl": 0.1875, + "learning_rate": 5.829682845628175e-07, + "loss": 0.0748, + "reward": 1.4675534963607788, + "reward_std": 0.16185539960861206, + "rewards/accuracy_reward_stage2": 0.4675534665584564, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2381 + }, + { + "completion_length": 21.046875, + "epoch": 0.4173821622568775, + "grad_norm": 19.278164592265732, + "kl": 0.091796875, + "learning_rate": 5.8279306115297e-07, + "loss": 0.0366, + "reward": 1.4626104831695557, + "reward_std": 0.12916041910648346, + "rewards/accuracy_reward_stage2": 0.5876104831695557, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2382 + }, + { + "completion_length": 10.4375, + "epoch": 0.41755738566672507, + "grad_norm": 17.740228123681344, + "kl": 0.1787109375, + "learning_rate": 5.826178377431225e-07, + "loss": 0.0323, + "reward": 1.526477336883545, + "reward_std": 0.1329089105129242, + "rewards/accuracy_reward_stage2": 0.6671023964881897, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2383 + }, + { + "completion_length": 13.078125, + "epoch": 0.4177326090765726, + "grad_norm": 38.03817298589301, + "kl": 0.09716796875, + "learning_rate": 5.824426143332749e-07, + "loss": 0.0174, + "reward": 1.5752477645874023, + "reward_std": 0.19630175828933716, + "rewards/accuracy_reward_stage2": 0.5908727645874023, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2384 + }, + { + "completion_length": 10.453125, + "epoch": 0.41790783248642016, + "grad_norm": 20.96990598678958, + "kl": 0.1630859375, + "learning_rate": 5.822673909234274e-07, + "loss": 0.022, + "reward": 1.7025611400604248, + "reward_std": 0.22266198694705963, + "rewards/accuracy_reward_stage2": 0.7181861400604248, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2385 + }, + { + "completion_length": 11.296875, + "epoch": 0.41808305589626776, + "grad_norm": 11.484983822112438, + "kl": 0.050048828125, + "learning_rate": 5.820921675135799e-07, + "loss": -0.0242, + "reward": 1.604400873184204, + "reward_std": 0.05826300382614136, + "rewards/accuracy_reward_stage2": 0.6200259327888489, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2386 + }, + { + "completion_length": 14.078125, + "epoch": 0.4182582793061153, + "grad_norm": 18.563600984589137, + "kl": 0.1982421875, + "learning_rate": 5.819169441037323e-07, + "loss": -0.009, + "reward": 1.8097143173217773, + "reward_std": 0.20771729946136475, + "rewards/accuracy_reward_stage2": 0.8409643173217773, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2387 + }, + { + "completion_length": 13.890625, + "epoch": 0.41843350271596286, + "grad_norm": 19.485960652460495, + "kl": 0.125, + "learning_rate": 5.817417206938847e-07, + "loss": 0.05, + "reward": 1.6337437629699707, + "reward_std": 0.258608341217041, + "rewards/accuracy_reward_stage2": 0.6337437033653259, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2388 + }, + { + "completion_length": 9.421875, + "epoch": 0.4186087261258104, + "grad_norm": 14.326349224837957, + "kl": 0.045654296875, + "learning_rate": 5.815664972840371e-07, + "loss": 0.0183, + "reward": 1.6053376197814941, + "reward_std": 0.13150310516357422, + "rewards/accuracy_reward_stage2": 0.6053376197814941, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2389 + }, + { + "completion_length": 16.984375, + "epoch": 0.41878394953565795, + "grad_norm": 14.73197467489026, + "kl": 0.1318359375, + "learning_rate": 5.813912738741895e-07, + "loss": 0.0025, + "reward": 1.8244647979736328, + "reward_std": 0.14700627326965332, + "rewards/accuracy_reward_stage2": 0.8557147979736328, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2390 + }, + { + "completion_length": 7.65625, + "epoch": 0.4189591729455055, + "grad_norm": 18.481419837735878, + "kl": 0.158203125, + "learning_rate": 5.81216050464342e-07, + "loss": 0.0193, + "reward": 1.6163908243179321, + "reward_std": 0.2330131232738495, + "rewards/accuracy_reward_stage2": 0.7570158243179321, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2391 + }, + { + "completion_length": 12.140625, + "epoch": 0.4191343963553531, + "grad_norm": 18.545690051338077, + "kl": 0.1328125, + "learning_rate": 5.810408270544944e-07, + "loss": 0.0532, + "reward": 1.7197383642196655, + "reward_std": 0.2599320113658905, + "rewards/accuracy_reward_stage2": 0.7197383046150208, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2392 + }, + { + "completion_length": 7.421875, + "epoch": 0.41930961976520065, + "grad_norm": 21.630837755379087, + "kl": 0.20703125, + "learning_rate": 5.808656036446469e-07, + "loss": 0.0539, + "reward": 1.6220672130584717, + "reward_std": 0.21142326295375824, + "rewards/accuracy_reward_stage2": 0.6376922130584717, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2393 + }, + { + "completion_length": 15.875, + "epoch": 0.4194848431750482, + "grad_norm": 22.5945277461123, + "kl": 0.1240234375, + "learning_rate": 5.806903802347993e-07, + "loss": 0.0053, + "reward": 1.5958011150360107, + "reward_std": 0.2829567790031433, + "rewards/accuracy_reward_stage2": 0.6114259958267212, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2394 + }, + { + "completion_length": 8.8125, + "epoch": 0.41966006658489574, + "grad_norm": 22.150968540044698, + "kl": 0.16015625, + "learning_rate": 5.805151568249518e-07, + "loss": 0.0473, + "reward": 1.3075213432312012, + "reward_std": 0.33274227380752563, + "rewards/accuracy_reward_stage2": 0.4481462836265564, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2395 + }, + { + "completion_length": 15.859375, + "epoch": 0.4198352899947433, + "grad_norm": 58.532722707916875, + "kl": 0.275390625, + "learning_rate": 5.803399334151043e-07, + "loss": 0.1387, + "reward": 1.5200127363204956, + "reward_std": 0.15271207690238953, + "rewards/accuracy_reward_stage2": 0.6450127363204956, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2396 + }, + { + "completion_length": 11.015625, + "epoch": 0.42001051340459084, + "grad_norm": 15.509242191723505, + "kl": 0.07861328125, + "learning_rate": 5.801647100052566e-07, + "loss": -0.0127, + "reward": 1.5883493423461914, + "reward_std": 0.15838760137557983, + "rewards/accuracy_reward_stage2": 0.6039743423461914, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2397 + }, + { + "completion_length": 7.078125, + "epoch": 0.4201857368144384, + "grad_norm": 15.094137285652373, + "kl": 0.1279296875, + "learning_rate": 5.799894865954091e-07, + "loss": -0.006, + "reward": 1.484375, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2398 + }, + { + "completion_length": 12.078125, + "epoch": 0.420360960224286, + "grad_norm": 19.19974924019669, + "kl": 0.306640625, + "learning_rate": 5.798142631855616e-07, + "loss": 0.0341, + "reward": 1.3660731315612793, + "reward_std": 0.29376906156539917, + "rewards/accuracy_reward_stage2": 0.5223231315612793, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2399 + }, + { + "completion_length": 11.984375, + "epoch": 0.42053618363413353, + "grad_norm": 14.368830703427102, + "kl": 0.062255859375, + "learning_rate": 5.79639039775714e-07, + "loss": 0.0249, + "reward": 1.515123724937439, + "reward_std": 0.2202598750591278, + "rewards/accuracy_reward_stage2": 0.5151236653327942, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2400 + }, + { + "completion_length": 8.796875, + "epoch": 0.4207114070439811, + "grad_norm": 19.06424785954498, + "kl": 0.11865234375, + "learning_rate": 5.794638163658665e-07, + "loss": 0.0065, + "reward": 1.5015455484390259, + "reward_std": 0.3422601819038391, + "rewards/accuracy_reward_stage2": 0.5327955484390259, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2401 + }, + { + "completion_length": 8.953125, + "epoch": 0.4208866304538286, + "grad_norm": 19.732187174356824, + "kl": 0.0625, + "learning_rate": 5.792885929560189e-07, + "loss": 0.025, + "reward": 1.4542206525802612, + "reward_std": 0.17934425175189972, + "rewards/accuracy_reward_stage2": 0.5792206525802612, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2402 + }, + { + "completion_length": 7.515625, + "epoch": 0.42106185386367617, + "grad_norm": 17.66601038639952, + "kl": 0.07861328125, + "learning_rate": 5.791133695461713e-07, + "loss": -0.0128, + "reward": 1.6888391971588135, + "reward_std": 0.20628750324249268, + "rewards/accuracy_reward_stage2": 0.7044641375541687, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2403 + }, + { + "completion_length": 8.359375, + "epoch": 0.4212370772735237, + "grad_norm": 22.577077448823406, + "kl": 0.068359375, + "learning_rate": 5.789381461363238e-07, + "loss": -0.0056, + "reward": 1.601570725440979, + "reward_std": 0.3855169117450714, + "rewards/accuracy_reward_stage2": 0.617195725440979, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2404 + }, + { + "completion_length": 15.1875, + "epoch": 0.4214123006833713, + "grad_norm": 8.304898189657713, + "kl": 0.0174560546875, + "learning_rate": 5.787629227264762e-07, + "loss": 0.007, + "reward": 1.7204861640930176, + "reward_std": 0.054775021970272064, + "rewards/accuracy_reward_stage2": 0.7204861044883728, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2405 + }, + { + "completion_length": 9.78125, + "epoch": 0.42158752409321887, + "grad_norm": 21.344555928253595, + "kl": 0.068359375, + "learning_rate": 5.785876993166287e-07, + "loss": 0.0274, + "reward": 1.5492045879364014, + "reward_std": 0.21212232112884521, + "rewards/accuracy_reward_stage2": 0.6742044687271118, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2406 + }, + { + "completion_length": 12.0625, + "epoch": 0.4217627475030664, + "grad_norm": 16.746001989010427, + "kl": 0.140625, + "learning_rate": 5.784124759067811e-07, + "loss": 0.0188, + "reward": 1.6990876197814941, + "reward_std": 0.20534729957580566, + "rewards/accuracy_reward_stage2": 0.7147126197814941, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2407 + }, + { + "completion_length": 28.90625, + "epoch": 0.42193797091291396, + "grad_norm": 18.15173262927103, + "kl": 0.0257568359375, + "learning_rate": 5.782372524969335e-07, + "loss": -0.0337, + "reward": 1.58535635471344, + "reward_std": 0.2794279158115387, + "rewards/accuracy_reward_stage2": 0.6009812951087952, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2408 + }, + { + "completion_length": 9.203125, + "epoch": 0.4221131943227615, + "grad_norm": 21.973624282288817, + "kl": 0.11083984375, + "learning_rate": 5.78062029087086e-07, + "loss": 0.0086, + "reward": 1.8336703777313232, + "reward_std": 0.14962394535541534, + "rewards/accuracy_reward_stage2": 0.8492953777313232, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2409 + }, + { + "completion_length": 8.8125, + "epoch": 0.42228841773260906, + "grad_norm": 18.160116177629245, + "kl": 0.061279296875, + "learning_rate": 5.778868056772384e-07, + "loss": 0.0245, + "reward": 1.5550178289413452, + "reward_std": 0.17941808700561523, + "rewards/accuracy_reward_stage2": 0.5550177693367004, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2410 + }, + { + "completion_length": 10.015625, + "epoch": 0.42246364114245666, + "grad_norm": 14.5878725632844, + "kl": 0.0771484375, + "learning_rate": 5.777115822673909e-07, + "loss": -0.0131, + "reward": 1.8403640985488892, + "reward_std": 0.15390296280384064, + "rewards/accuracy_reward_stage2": 0.8559890985488892, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2411 + }, + { + "completion_length": 8.1875, + "epoch": 0.4226388645523042, + "grad_norm": 18.022444942085286, + "kl": 0.19921875, + "learning_rate": 5.775363588575434e-07, + "loss": -0.0373, + "reward": 1.6093087196350098, + "reward_std": 0.26443642377853394, + "rewards/accuracy_reward_stage2": 0.6561837196350098, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2412 + }, + { + "completion_length": 9.109375, + "epoch": 0.42281408796215175, + "grad_norm": 14.90491302423205, + "kl": 0.1328125, + "learning_rate": 5.773611354476958e-07, + "loss": 0.053, + "reward": 1.486379861831665, + "reward_std": 0.10205523669719696, + "rewards/accuracy_reward_stage2": 0.4863799214363098, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2413 + }, + { + "completion_length": 9.453125, + "epoch": 0.4229893113719993, + "grad_norm": 19.41092038526134, + "kl": 0.33203125, + "learning_rate": 5.771859120378483e-07, + "loss": 0.0286, + "reward": 1.4454668760299683, + "reward_std": 0.23624253273010254, + "rewards/accuracy_reward_stage2": 0.6173418760299683, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2414 + }, + { + "completion_length": 7.765625, + "epoch": 0.42316453478184685, + "grad_norm": 18.530930531417717, + "kl": 0.2451171875, + "learning_rate": 5.770106886280008e-07, + "loss": 0.0182, + "reward": 1.3699134588241577, + "reward_std": 0.2863275408744812, + "rewards/accuracy_reward_stage2": 0.5261634588241577, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2415 + }, + { + "completion_length": 7.953125, + "epoch": 0.4233397581916944, + "grad_norm": 19.940209981188683, + "kl": 0.23828125, + "learning_rate": 5.768354652181531e-07, + "loss": -0.0313, + "reward": 1.5837643146514893, + "reward_std": 0.2522251605987549, + "rewards/accuracy_reward_stage2": 0.6306391954421997, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2416 + }, + { + "completion_length": 12.578125, + "epoch": 0.42351498160154194, + "grad_norm": 20.306264227490708, + "kl": 0.2490234375, + "learning_rate": 5.766602418083055e-07, + "loss": -0.0231, + "reward": 1.3948123455047607, + "reward_std": 0.3546648919582367, + "rewards/accuracy_reward_stage2": 0.5666873455047607, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2417 + }, + { + "completion_length": 11.53125, + "epoch": 0.42369020501138954, + "grad_norm": 20.154209703831523, + "kl": 0.12890625, + "learning_rate": 5.764850183984579e-07, + "loss": 0.0349, + "reward": 1.2792103290557861, + "reward_std": 0.2176147699356079, + "rewards/accuracy_reward_stage2": 0.4198353886604309, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2418 + }, + { + "completion_length": 12.59375, + "epoch": 0.4238654284212371, + "grad_norm": 15.276341266991679, + "kl": 0.068359375, + "learning_rate": 5.763097949886104e-07, + "loss": -0.0056, + "reward": 1.411638617515564, + "reward_std": 0.21439909934997559, + "rewards/accuracy_reward_stage2": 0.4272635877132416, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2419 + }, + { + "completion_length": 13.375, + "epoch": 0.42404065183108464, + "grad_norm": 25.83525267716047, + "kl": 0.166015625, + "learning_rate": 5.761345715787629e-07, + "loss": 0.009, + "reward": 1.4438656568527222, + "reward_std": 0.40633732080459595, + "rewards/accuracy_reward_stage2": 0.47511565685272217, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2420 + }, + { + "completion_length": 30.53125, + "epoch": 0.4242158752409322, + "grad_norm": 19.080239012482636, + "kl": 0.1708984375, + "learning_rate": 5.759593481689153e-07, + "loss": 0.0242, + "reward": 1.6823811531066895, + "reward_std": 0.23308956623077393, + "rewards/accuracy_reward_stage2": 0.6980061531066895, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2421 + }, + { + "completion_length": 14.0625, + "epoch": 0.42439109865077973, + "grad_norm": 17.221754256398736, + "kl": 0.1279296875, + "learning_rate": 5.757841247590678e-07, + "loss": 0.0179, + "reward": 1.3743314743041992, + "reward_std": 0.16007143259048462, + "rewards/accuracy_reward_stage2": 0.5149564743041992, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2422 + }, + { + "completion_length": 10.765625, + "epoch": 0.4245663220606273, + "grad_norm": 17.92302781537834, + "kl": 0.13671875, + "learning_rate": 5.756089013492203e-07, + "loss": -0.0002, + "reward": 1.69929838180542, + "reward_std": 0.11890119314193726, + "rewards/accuracy_reward_stage2": 0.8399233222007751, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2423 + }, + { + "completion_length": 8.21875, + "epoch": 0.4247415454704749, + "grad_norm": 19.420077030170336, + "kl": 0.2158203125, + "learning_rate": 5.754336779393727e-07, + "loss": -0.0744, + "reward": 1.741911768913269, + "reward_std": 0.3283819556236267, + "rewards/accuracy_reward_stage2": 0.804411768913269, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2424 + }, + { + "completion_length": 13.265625, + "epoch": 0.4249167688803224, + "grad_norm": 20.955694465793627, + "kl": 0.2451171875, + "learning_rate": 5.752584545295252e-07, + "loss": 0.0293, + "reward": 1.463441252708435, + "reward_std": 0.2972392439842224, + "rewards/accuracy_reward_stage2": 0.49469125270843506, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2425 + }, + { + "completion_length": 15.734375, + "epoch": 0.42509199229017, + "grad_norm": 21.360009650947255, + "kl": 0.044677734375, + "learning_rate": 5.750832311196776e-07, + "loss": 0.0179, + "reward": 1.5914231538772583, + "reward_std": 0.16339072585105896, + "rewards/accuracy_reward_stage2": 0.7164231538772583, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2426 + }, + { + "completion_length": 9.859375, + "epoch": 0.4252672157000175, + "grad_norm": 13.966446126628513, + "kl": 0.1357421875, + "learning_rate": 5.7490800770983e-07, + "loss": 0.0545, + "reward": 1.5450599193572998, + "reward_std": 0.10719159245491028, + "rewards/accuracy_reward_stage2": 0.6700599789619446, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2427 + }, + { + "completion_length": 11.625, + "epoch": 0.42544243910986507, + "grad_norm": 19.51937307944042, + "kl": 0.15234375, + "learning_rate": 5.747327842999824e-07, + "loss": 0.0611, + "reward": 1.6079663038253784, + "reward_std": 0.1823212057352066, + "rewards/accuracy_reward_stage2": 0.7329663038253784, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2428 + }, + { + "completion_length": 9.90625, + "epoch": 0.4256176625197126, + "grad_norm": 14.795791933246239, + "kl": 0.201171875, + "learning_rate": 5.745575608901348e-07, + "loss": -0.0012, + "reward": 1.6733319759368896, + "reward_std": 0.28265178203582764, + "rewards/accuracy_reward_stage2": 0.7045819759368896, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2429 + }, + { + "completion_length": 12.796875, + "epoch": 0.4257928859295602, + "grad_norm": 22.40763134045858, + "kl": 0.130859375, + "learning_rate": 5.743823374802873e-07, + "loss": 0.0081, + "reward": 1.3852684497833252, + "reward_std": 0.20095396041870117, + "rewards/accuracy_reward_stage2": 0.4008934497833252, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2430 + }, + { + "completion_length": 12.4375, + "epoch": 0.42596810933940776, + "grad_norm": 17.35337680952318, + "kl": 0.1767578125, + "learning_rate": 5.742071140704398e-07, + "loss": -0.0526, + "reward": 1.5718660354614258, + "reward_std": 0.3506343364715576, + "rewards/accuracy_reward_stage2": 0.6187410354614258, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2431 + }, + { + "completion_length": 7.578125, + "epoch": 0.4261433327492553, + "grad_norm": 19.778617819967412, + "kl": 0.0419921875, + "learning_rate": 5.740318906605922e-07, + "loss": 0.0168, + "reward": 1.5186505317687988, + "reward_std": 0.27518704533576965, + "rewards/accuracy_reward_stage2": 0.518650472164154, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2432 + }, + { + "completion_length": 16.5, + "epoch": 0.42631855615910286, + "grad_norm": 15.30317538140248, + "kl": 0.04345703125, + "learning_rate": 5.738566672507447e-07, + "loss": 0.0174, + "reward": 1.603685736656189, + "reward_std": 0.08226728439331055, + "rewards/accuracy_reward_stage2": 0.603685736656189, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2433 + }, + { + "completion_length": 11.25, + "epoch": 0.4264937795689504, + "grad_norm": 20.356807009322075, + "kl": 0.2041015625, + "learning_rate": 5.736814438408971e-07, + "loss": 0.0398, + "reward": 1.5039076805114746, + "reward_std": 0.2799040675163269, + "rewards/accuracy_reward_stage2": 0.5195327401161194, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2434 + }, + { + "completion_length": 8.015625, + "epoch": 0.42666900297879795, + "grad_norm": 14.960749798331605, + "kl": 0.053466796875, + "learning_rate": 5.735062204310496e-07, + "loss": 0.0214, + "reward": 1.6923701763153076, + "reward_std": 0.10996302962303162, + "rewards/accuracy_reward_stage2": 0.6923701763153076, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2435 + }, + { + "completion_length": 11.65625, + "epoch": 0.4268442263886455, + "grad_norm": 17.103308154395133, + "kl": 0.05419921875, + "learning_rate": 5.733309970212021e-07, + "loss": 0.0216, + "reward": 1.2994627952575684, + "reward_std": 0.1993103176355362, + "rewards/accuracy_reward_stage2": 0.29946279525756836, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2436 + }, + { + "completion_length": 9.296875, + "epoch": 0.4270194497984931, + "grad_norm": 29.964224876637175, + "kl": 0.11474609375, + "learning_rate": 5.731557736113544e-07, + "loss": -0.0325, + "reward": 1.4375, + "reward_std": 0.303472638130188, + "rewards/accuracy_reward_stage2": 0.46875, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2437 + }, + { + "completion_length": 7.3125, + "epoch": 0.42719467320834065, + "grad_norm": 17.02039408599391, + "kl": 0.1513671875, + "learning_rate": 5.729805502015069e-07, + "loss": 0.0035, + "reward": 1.6351255178451538, + "reward_std": 0.1954609751701355, + "rewards/accuracy_reward_stage2": 0.666375458240509, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2438 + }, + { + "completion_length": 7.84375, + "epoch": 0.4273698966181882, + "grad_norm": 17.43461625512826, + "kl": 0.1484375, + "learning_rate": 5.728053267916594e-07, + "loss": -0.0059, + "reward": 1.4844474792480469, + "reward_std": 0.20424222946166992, + "rewards/accuracy_reward_stage2": 0.5156975984573364, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2439 + }, + { + "completion_length": 7.171875, + "epoch": 0.42754512002803574, + "grad_norm": 12.179955934075089, + "kl": 0.0947265625, + "learning_rate": 5.726301033818118e-07, + "loss": -0.0061, + "reward": 1.6276520490646362, + "reward_std": 0.09620348364114761, + "rewards/accuracy_reward_stage2": 0.6432770490646362, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2440 + }, + { + "completion_length": 8.4375, + "epoch": 0.4277203434378833, + "grad_norm": 19.9623888755173, + "kl": 0.14453125, + "learning_rate": 5.724548799719642e-07, + "loss": 0.0579, + "reward": 1.653172254562378, + "reward_std": 0.1910550743341446, + "rewards/accuracy_reward_stage2": 0.6531723737716675, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2441 + }, + { + "completion_length": 9.46875, + "epoch": 0.42789556684773083, + "grad_norm": 23.38544505707681, + "kl": 0.1513671875, + "learning_rate": 5.722796565621166e-07, + "loss": -0.0094, + "reward": 1.4377069473266602, + "reward_std": 0.3157484829425812, + "rewards/accuracy_reward_stage2": 0.4689568877220154, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2442 + }, + { + "completion_length": 6.890625, + "epoch": 0.42807079025757844, + "grad_norm": 19.817111523775036, + "kl": 0.059814453125, + "learning_rate": 5.721044331522691e-07, + "loss": 0.0239, + "reward": 1.669250249862671, + "reward_std": 0.2423802763223648, + "rewards/accuracy_reward_stage2": 0.6692502498626709, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2443 + }, + { + "completion_length": 10.296875, + "epoch": 0.428246013667426, + "grad_norm": 17.613478551009646, + "kl": 0.10302734375, + "learning_rate": 5.719292097424216e-07, + "loss": -0.0006, + "reward": 1.6044498682022095, + "reward_std": 0.27946341037750244, + "rewards/accuracy_reward_stage2": 0.6200748682022095, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2444 + }, + { + "completion_length": 10.3125, + "epoch": 0.42842123707727353, + "grad_norm": 22.91097469403058, + "kl": 0.2734375, + "learning_rate": 5.71753986332574e-07, + "loss": 0.0262, + "reward": 1.5360618829727173, + "reward_std": 0.28243163228034973, + "rewards/accuracy_reward_stage2": 0.5673118829727173, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2445 + }, + { + "completion_length": 6.71875, + "epoch": 0.4285964604871211, + "grad_norm": 23.222689234508, + "kl": 0.06640625, + "learning_rate": 5.715787629227265e-07, + "loss": -0.0014, + "reward": 1.4264297485351562, + "reward_std": 0.2040741741657257, + "rewards/accuracy_reward_stage2": 0.6920547485351562, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2446 + }, + { + "completion_length": 13.90625, + "epoch": 0.4287716838969686, + "grad_norm": 19.669761903072615, + "kl": 0.150390625, + "learning_rate": 5.714035395128789e-07, + "loss": 0.0601, + "reward": 1.4472002983093262, + "reward_std": 0.2749719023704529, + "rewards/accuracy_reward_stage2": 0.6972004175186157, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2447 + }, + { + "completion_length": 9.671875, + "epoch": 0.42894690730681617, + "grad_norm": 16.145787029771405, + "kl": 0.1640625, + "learning_rate": 5.712283161030313e-07, + "loss": 0.0152, + "reward": 1.766361951828003, + "reward_std": 0.14357280731201172, + "rewards/accuracy_reward_stage2": 0.7976118922233582, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2448 + }, + { + "completion_length": 12.640625, + "epoch": 0.4291221307166637, + "grad_norm": 21.49360682772942, + "kl": 0.30078125, + "learning_rate": 5.710530926931838e-07, + "loss": 0.0347, + "reward": 1.4998382329940796, + "reward_std": 0.23182250559329987, + "rewards/accuracy_reward_stage2": 0.6560881733894348, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2449 + }, + { + "completion_length": 11.59375, + "epoch": 0.4292973541265113, + "grad_norm": 24.51503238020705, + "kl": 0.203125, + "learning_rate": 5.708778692833362e-07, + "loss": 0.0413, + "reward": 1.3393845558166504, + "reward_std": 0.26047730445861816, + "rewards/accuracy_reward_stage2": 0.6050096154212952, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2450 + }, + { + "completion_length": 22.6875, + "epoch": 0.42947257753635887, + "grad_norm": 20.936424541852354, + "kl": 0.30078125, + "learning_rate": 5.707026458734887e-07, + "loss": 0.026, + "reward": 1.1758679151535034, + "reward_std": 0.2872629165649414, + "rewards/accuracy_reward_stage2": 0.3477429151535034, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2451 + }, + { + "completion_length": 9.09375, + "epoch": 0.4296478009462064, + "grad_norm": 14.344676006512493, + "kl": 0.072265625, + "learning_rate": 5.705274224636412e-07, + "loss": -0.0042, + "reward": 1.702180027961731, + "reward_std": 0.12235504388809204, + "rewards/accuracy_reward_stage2": 0.7178049683570862, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2452 + }, + { + "completion_length": 12.453125, + "epoch": 0.42982302435605396, + "grad_norm": 21.047318587932917, + "kl": 0.1376953125, + "learning_rate": 5.703521990537936e-07, + "loss": -0.0279, + "reward": 1.59375, + "reward_std": 0.354972779750824, + "rewards/accuracy_reward_stage2": 0.75, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2453 + }, + { + "completion_length": 10.015625, + "epoch": 0.4299982477659015, + "grad_norm": 18.28343107689635, + "kl": 0.0693359375, + "learning_rate": 5.70176975643946e-07, + "loss": -0.0035, + "reward": 1.4171864986419678, + "reward_std": 0.26342296600341797, + "rewards/accuracy_reward_stage2": 0.4328114986419678, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2454 + }, + { + "completion_length": 9.765625, + "epoch": 0.43017347117574906, + "grad_norm": 22.09226668402471, + "kl": 0.142578125, + "learning_rate": 5.700017522340984e-07, + "loss": 0.0131, + "reward": 1.3958333730697632, + "reward_std": 0.3385624885559082, + "rewards/accuracy_reward_stage2": 0.6614583730697632, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2455 + }, + { + "completion_length": 8.984375, + "epoch": 0.43034869458559666, + "grad_norm": 16.869530097214756, + "kl": 0.2099609375, + "learning_rate": 5.698265288242509e-07, + "loss": -0.093, + "reward": 1.744043231010437, + "reward_std": 0.2645736336708069, + "rewards/accuracy_reward_stage2": 0.931543231010437, + "rewards/format_reward_stage1_pointerpad": 0.8125, + "scores/accuracy_reward_stage2": 0.8125, + "step": 2456 + }, + { + "completion_length": 8.28125, + "epoch": 0.4305239179954442, + "grad_norm": 18.478454670590494, + "kl": 0.25, + "learning_rate": 5.696513054144033e-07, + "loss": -0.0083, + "reward": 1.5073299407958984, + "reward_std": 0.231063574552536, + "rewards/accuracy_reward_stage2": 0.5542050004005432, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2457 + }, + { + "completion_length": 12.03125, + "epoch": 0.43069914140529175, + "grad_norm": 16.2323364003327, + "kl": 0.130859375, + "learning_rate": 5.694760820045557e-07, + "loss": 0.0108, + "reward": 1.563699722290039, + "reward_std": 0.20556166768074036, + "rewards/accuracy_reward_stage2": 0.5793246030807495, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2458 + }, + { + "completion_length": 8.9375, + "epoch": 0.4308743648151393, + "grad_norm": 18.48203471270133, + "kl": 0.19140625, + "learning_rate": 5.693008585947082e-07, + "loss": 0.0595, + "reward": 1.7979397773742676, + "reward_std": 0.07218047231435776, + "rewards/accuracy_reward_stage2": 0.8291897773742676, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2459 + }, + { + "completion_length": 12.96875, + "epoch": 0.43104958822498685, + "grad_norm": 21.08552299144359, + "kl": 0.08984375, + "learning_rate": 5.691256351848607e-07, + "loss": 0.0359, + "reward": 1.418628215789795, + "reward_std": 0.1847638040781021, + "rewards/accuracy_reward_stage2": 0.4186283051967621, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2460 + }, + { + "completion_length": 9.75, + "epoch": 0.4312248116348344, + "grad_norm": 9.440874169401996, + "kl": 0.0859375, + "learning_rate": 5.689504117750131e-07, + "loss": -0.054, + "reward": 1.639630913734436, + "reward_std": 0.0937529057264328, + "rewards/accuracy_reward_stage2": 0.795880913734436, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2461 + }, + { + "completion_length": 10.859375, + "epoch": 0.431400035044682, + "grad_norm": 20.556394887831715, + "kl": 0.0654296875, + "learning_rate": 5.687751883651656e-07, + "loss": 0.0262, + "reward": 1.713507056236267, + "reward_std": 0.21507734060287476, + "rewards/accuracy_reward_stage2": 0.7135070562362671, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2462 + }, + { + "completion_length": 8.765625, + "epoch": 0.43157525845452954, + "grad_norm": 16.17523527305367, + "kl": 0.115234375, + "learning_rate": 5.685999649553181e-07, + "loss": 0.0405, + "reward": 1.5567264556884766, + "reward_std": 0.10888748615980148, + "rewards/accuracy_reward_stage2": 0.6817264556884766, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2463 + }, + { + "completion_length": 7.734375, + "epoch": 0.4317504818643771, + "grad_norm": 20.05211199152606, + "kl": 0.2216796875, + "learning_rate": 5.684247415454705e-07, + "loss": 0.0134, + "reward": 1.4463741779327393, + "reward_std": 0.3639988303184509, + "rewards/accuracy_reward_stage2": 0.47762417793273926, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2464 + }, + { + "completion_length": 10.96875, + "epoch": 0.43192570527422464, + "grad_norm": 19.119738395479125, + "kl": 0.056640625, + "learning_rate": 5.68249518135623e-07, + "loss": 0.0226, + "reward": 1.7154829502105713, + "reward_std": 0.20708030462265015, + "rewards/accuracy_reward_stage2": 0.7154829502105713, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2465 + }, + { + "completion_length": 23.453125, + "epoch": 0.4321009286840722, + "grad_norm": 16.74592932741985, + "kl": 0.06396484375, + "learning_rate": 5.680742947257752e-07, + "loss": -0.0185, + "reward": 1.7202057838439941, + "reward_std": 0.13266494870185852, + "rewards/accuracy_reward_stage2": 0.7358307838439941, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2466 + }, + { + "completion_length": 11.796875, + "epoch": 0.43227615209391973, + "grad_norm": 19.32836770464296, + "kl": 0.11279296875, + "learning_rate": 5.678990713159277e-07, + "loss": 0.0023, + "reward": 1.5369431972503662, + "reward_std": 0.2311927080154419, + "rewards/accuracy_reward_stage2": 0.5525681376457214, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2467 + }, + { + "completion_length": 6.59375, + "epoch": 0.4324513755037673, + "grad_norm": 15.640132670257081, + "kl": 0.236328125, + "learning_rate": 5.677238479060802e-07, + "loss": 0.0211, + "reward": 1.5886476039886475, + "reward_std": 0.2265038788318634, + "rewards/accuracy_reward_stage2": 0.6198976635932922, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2468 + }, + { + "completion_length": 8.421875, + "epoch": 0.4326265989136149, + "grad_norm": 22.452989347045982, + "kl": 0.1298828125, + "learning_rate": 5.675486244962326e-07, + "loss": 0.0077, + "reward": 1.7098286151885986, + "reward_std": 0.256533682346344, + "rewards/accuracy_reward_stage2": 0.7254536151885986, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2469 + }, + { + "completion_length": 12.65625, + "epoch": 0.4328018223234624, + "grad_norm": 22.12282683860938, + "kl": 0.12353515625, + "learning_rate": 5.673734010863851e-07, + "loss": 0.0181, + "reward": 1.2503278255462646, + "reward_std": 0.3297951817512512, + "rewards/accuracy_reward_stage2": 0.5159528255462646, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2470 + }, + { + "completion_length": 12.390625, + "epoch": 0.43297704573331, + "grad_norm": 13.672982165021379, + "kl": 0.126953125, + "learning_rate": 5.671981776765375e-07, + "loss": -0.0294, + "reward": 1.3744654655456543, + "reward_std": 0.14306974411010742, + "rewards/accuracy_reward_stage2": 0.5307154059410095, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2471 + }, + { + "completion_length": 9.171875, + "epoch": 0.4331522691431575, + "grad_norm": 20.3049908134314, + "kl": 0.1943359375, + "learning_rate": 5.6702295426669e-07, + "loss": 0.0004, + "reward": 1.2787423133850098, + "reward_std": 0.34057146310806274, + "rewards/accuracy_reward_stage2": 0.4349922835826874, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2472 + }, + { + "completion_length": 8.375, + "epoch": 0.43332749255300507, + "grad_norm": 15.57066797986624, + "kl": 0.1728515625, + "learning_rate": 5.668477308568425e-07, + "loss": -0.0273, + "reward": 1.227855920791626, + "reward_std": 0.2600477337837219, + "rewards/accuracy_reward_stage2": 0.649730920791626, + "rewards/format_reward_stage1_pointerpad": 0.578125, + "scores/accuracy_reward_stage2": 0.578125, + "step": 2473 + }, + { + "completion_length": 9.015625, + "epoch": 0.4335027159628526, + "grad_norm": 16.92854945026168, + "kl": 0.06689453125, + "learning_rate": 5.666725074469949e-07, + "loss": 0.0267, + "reward": 1.605328917503357, + "reward_std": 0.1650751680135727, + "rewards/accuracy_reward_stage2": 0.6053289175033569, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2474 + }, + { + "completion_length": 9.140625, + "epoch": 0.4336779393727002, + "grad_norm": 36.341656245826364, + "kl": 0.32421875, + "learning_rate": 5.664972840371474e-07, + "loss": 0.0901, + "reward": 1.2757874727249146, + "reward_std": 0.2893187999725342, + "rewards/accuracy_reward_stage2": 0.5414124727249146, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2475 + }, + { + "completion_length": 8.640625, + "epoch": 0.43385316278254776, + "grad_norm": 16.659823748323067, + "kl": 0.0390625, + "learning_rate": 5.663220606272999e-07, + "loss": 0.0156, + "reward": 1.8644938468933105, + "reward_std": 0.10486021637916565, + "rewards/accuracy_reward_stage2": 0.8644937872886658, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2476 + }, + { + "completion_length": 8.1875, + "epoch": 0.4340283861923953, + "grad_norm": 13.634458932508217, + "kl": 0.19140625, + "learning_rate": 5.661468372174522e-07, + "loss": -0.0392, + "reward": 1.6913225650787354, + "reward_std": 0.19711565971374512, + "rewards/accuracy_reward_stage2": 0.7381975054740906, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2477 + }, + { + "completion_length": 10.078125, + "epoch": 0.43420360960224286, + "grad_norm": 20.019412732245556, + "kl": 0.087890625, + "learning_rate": 5.659716138076047e-07, + "loss": 0.0352, + "reward": 1.625917911529541, + "reward_std": 0.22597362101078033, + "rewards/accuracy_reward_stage2": 0.6259177923202515, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2478 + }, + { + "completion_length": 8.421875, + "epoch": 0.4343788330120904, + "grad_norm": 20.72840852940728, + "kl": 0.322265625, + "learning_rate": 5.65796390397757e-07, + "loss": 0.0199, + "reward": 1.2296040058135986, + "reward_std": 0.256480872631073, + "rewards/accuracy_reward_stage2": 0.40147897601127625, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2479 + }, + { + "completion_length": 15.046875, + "epoch": 0.43455405642193795, + "grad_norm": 16.062794937831228, + "kl": 0.19140625, + "learning_rate": 5.656211669879095e-07, + "loss": -0.0408, + "reward": 1.5773541927337646, + "reward_std": 0.19875936210155487, + "rewards/accuracy_reward_stage2": 0.6242291927337646, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2480 + }, + { + "completion_length": 9.59375, + "epoch": 0.43472927983178555, + "grad_norm": 20.873383707835117, + "kl": 0.11328125, + "learning_rate": 5.65445943578062e-07, + "loss": 0.0454, + "reward": 1.7643051147460938, + "reward_std": 0.12677878141403198, + "rewards/accuracy_reward_stage2": 0.7643051147460938, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2481 + }, + { + "completion_length": 8.40625, + "epoch": 0.4349045032416331, + "grad_norm": 15.811934560487519, + "kl": 0.03125, + "learning_rate": 5.652707201682144e-07, + "loss": 0.0125, + "reward": 1.6651852130889893, + "reward_std": 0.25956788659095764, + "rewards/accuracy_reward_stage2": 0.6651852130889893, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2482 + }, + { + "completion_length": 10.484375, + "epoch": 0.43507972665148065, + "grad_norm": 18.44281784049491, + "kl": 0.2294921875, + "learning_rate": 5.650954967583669e-07, + "loss": 0.0474, + "reward": 1.1519947052001953, + "reward_std": 0.1992306411266327, + "rewards/accuracy_reward_stage2": 0.30824464559555054, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2483 + }, + { + "completion_length": 9.453125, + "epoch": 0.4352549500613282, + "grad_norm": 17.669081506895804, + "kl": 0.32421875, + "learning_rate": 5.649202733485194e-07, + "loss": 0.019, + "reward": 1.6348530054092407, + "reward_std": 0.2544611692428589, + "rewards/accuracy_reward_stage2": 0.6817280054092407, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2484 + }, + { + "completion_length": 11.28125, + "epoch": 0.43543017347117574, + "grad_norm": 18.882120225824227, + "kl": 0.1455078125, + "learning_rate": 5.647450499386718e-07, + "loss": 0.0354, + "reward": 1.2779840230941772, + "reward_std": 0.32248374819755554, + "rewards/accuracy_reward_stage2": 0.41860899329185486, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2485 + }, + { + "completion_length": 7.59375, + "epoch": 0.4356053968810233, + "grad_norm": 15.706969735056607, + "kl": 0.171875, + "learning_rate": 5.645698265288243e-07, + "loss": -0.0103, + "reward": 1.625713586807251, + "reward_std": 0.20755544304847717, + "rewards/accuracy_reward_stage2": 0.656963586807251, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2486 + }, + { + "completion_length": 9.25, + "epoch": 0.43578062029087083, + "grad_norm": 16.539471166479995, + "kl": 0.09619140625, + "learning_rate": 5.643946031189766e-07, + "loss": -0.0499, + "reward": 1.5612146854400635, + "reward_std": 0.24608927965164185, + "rewards/accuracy_reward_stage2": 0.5924647450447083, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2487 + }, + { + "completion_length": 7.0, + "epoch": 0.43595584370071844, + "grad_norm": 20.675547669779483, + "kl": 0.265625, + "learning_rate": 5.642193797091291e-07, + "loss": -0.036, + "reward": 1.549863576889038, + "reward_std": 0.3346082270145416, + "rewards/accuracy_reward_stage2": 0.6123635768890381, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2488 + }, + { + "completion_length": 10.0625, + "epoch": 0.436131067110566, + "grad_norm": 22.613480108024827, + "kl": 0.2353515625, + "learning_rate": 5.640441562992816e-07, + "loss": -0.0745, + "reward": 1.6920068264007568, + "reward_std": 0.29510557651519775, + "rewards/accuracy_reward_stage2": 0.7545068264007568, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2489 + }, + { + "completion_length": 14.296875, + "epoch": 0.43630629052041353, + "grad_norm": 19.553744432186637, + "kl": 0.07568359375, + "learning_rate": 5.63868932889434e-07, + "loss": -0.014, + "reward": 1.7241294384002686, + "reward_std": 0.29626208543777466, + "rewards/accuracy_reward_stage2": 0.7397544980049133, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2490 + }, + { + "completion_length": 11.0, + "epoch": 0.4364815139302611, + "grad_norm": 16.8282524901791, + "kl": 0.08837890625, + "learning_rate": 5.636937094795865e-07, + "loss": 0.0355, + "reward": 1.6007554531097412, + "reward_std": 0.17432302236557007, + "rewards/accuracy_reward_stage2": 0.6007554531097412, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2491 + }, + { + "completion_length": 10.65625, + "epoch": 0.4366567373401086, + "grad_norm": 21.60463160566603, + "kl": 0.2333984375, + "learning_rate": 5.635184860697389e-07, + "loss": -0.0129, + "reward": 1.5278030633926392, + "reward_std": 0.3193010687828064, + "rewards/accuracy_reward_stage2": 0.5746780633926392, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2492 + }, + { + "completion_length": 12.875, + "epoch": 0.43683196074995617, + "grad_norm": 21.031349941173335, + "kl": 0.103515625, + "learning_rate": 5.633432626598913e-07, + "loss": 0.0103, + "reward": 1.7680280208587646, + "reward_std": 0.19699907302856445, + "rewards/accuracy_reward_stage2": 0.7836530208587646, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2493 + }, + { + "completion_length": 9.875, + "epoch": 0.4370071841598038, + "grad_norm": 17.252817207871704, + "kl": 0.1865234375, + "learning_rate": 5.631680392500438e-07, + "loss": 0.0122, + "reward": 1.5799731016159058, + "reward_std": 0.20395085215568542, + "rewards/accuracy_reward_stage2": 0.736223042011261, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2494 + }, + { + "completion_length": 10.046875, + "epoch": 0.4371824075696513, + "grad_norm": 16.459408311866355, + "kl": 0.10595703125, + "learning_rate": 5.629928158401962e-07, + "loss": 0.0425, + "reward": 1.7014847993850708, + "reward_std": 0.27053919434547424, + "rewards/accuracy_reward_stage2": 0.7014847993850708, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2495 + }, + { + "completion_length": 9.0, + "epoch": 0.43735763097949887, + "grad_norm": 17.406967530913867, + "kl": 0.212890625, + "learning_rate": 5.628175924303486e-07, + "loss": 0.0406, + "reward": 1.4516146183013916, + "reward_std": 0.2875097990036011, + "rewards/accuracy_reward_stage2": 0.5922396183013916, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2496 + }, + { + "completion_length": 10.90625, + "epoch": 0.4375328543893464, + "grad_norm": 18.29723073565994, + "kl": 0.2138671875, + "learning_rate": 5.626423690205011e-07, + "loss": 0.0414, + "reward": 1.6410624980926514, + "reward_std": 0.24239271879196167, + "rewards/accuracy_reward_stage2": 0.6566874980926514, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2497 + }, + { + "completion_length": 10.96875, + "epoch": 0.43770807779919396, + "grad_norm": 13.061474527595225, + "kl": 0.08251953125, + "learning_rate": 5.624671456106535e-07, + "loss": 0.033, + "reward": 1.4094147682189941, + "reward_std": 0.125525563955307, + "rewards/accuracy_reward_stage2": 0.4094148278236389, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2498 + }, + { + "completion_length": 11.171875, + "epoch": 0.4378833012090415, + "grad_norm": 19.58076173683503, + "kl": 0.2197265625, + "learning_rate": 5.62291922200806e-07, + "loss": 0.0198, + "reward": 1.470327377319336, + "reward_std": 0.30748701095581055, + "rewards/accuracy_reward_stage2": 0.6265773177146912, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2499 + }, + { + "completion_length": 7.59375, + "epoch": 0.4380585246188891, + "grad_norm": 18.040021273363966, + "kl": 0.07861328125, + "learning_rate": 5.621166987909585e-07, + "loss": -0.0081, + "reward": 1.828190803527832, + "reward_std": 0.21691644191741943, + "rewards/accuracy_reward_stage2": 0.843815803527832, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2500 + }, + { + "completion_length": 8.265625, + "epoch": 0.43823374802873666, + "grad_norm": 17.384233035604485, + "kl": 0.05029296875, + "learning_rate": 5.619414753811109e-07, + "loss": 0.0201, + "reward": 1.7455376386642456, + "reward_std": 0.14652395248413086, + "rewards/accuracy_reward_stage2": 0.7455376386642456, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2501 + }, + { + "completion_length": 10.203125, + "epoch": 0.4384089714385842, + "grad_norm": 17.737734790062877, + "kl": 0.1767578125, + "learning_rate": 5.617662519712634e-07, + "loss": 0.0613, + "reward": 1.6611220836639404, + "reward_std": 0.20425570011138916, + "rewards/accuracy_reward_stage2": 0.6767470836639404, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2502 + }, + { + "completion_length": 14.140625, + "epoch": 0.43858419484843175, + "grad_norm": 12.045441698437324, + "kl": 0.10107421875, + "learning_rate": 5.615910285614158e-07, + "loss": -0.0251, + "reward": 1.0958956480026245, + "reward_std": 0.14148275554180145, + "rewards/accuracy_reward_stage2": 0.12714560329914093, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2503 + }, + { + "completion_length": 10.875, + "epoch": 0.4387594182582793, + "grad_norm": 15.969767938919299, + "kl": 0.04345703125, + "learning_rate": 5.614158051515683e-07, + "loss": 0.0174, + "reward": 1.5387020111083984, + "reward_std": 0.125541090965271, + "rewards/accuracy_reward_stage2": 0.6637020111083984, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2504 + }, + { + "completion_length": 10.046875, + "epoch": 0.43893464166812685, + "grad_norm": 16.597279311330908, + "kl": 0.0751953125, + "learning_rate": 5.612405817417207e-07, + "loss": -0.0134, + "reward": 1.6844640970230103, + "reward_std": 0.338733047246933, + "rewards/accuracy_reward_stage2": 0.7000890374183655, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2505 + }, + { + "completion_length": 10.640625, + "epoch": 0.4391098650779744, + "grad_norm": 14.121928210049473, + "kl": 0.03466796875, + "learning_rate": 5.61065358331873e-07, + "loss": 0.0139, + "reward": 1.6286423206329346, + "reward_std": 0.1547921895980835, + "rewards/accuracy_reward_stage2": 0.6286423206329346, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2506 + }, + { + "completion_length": 6.203125, + "epoch": 0.439285088487822, + "grad_norm": 35.73116912990075, + "kl": 0.058837890625, + "learning_rate": 5.608901349220255e-07, + "loss": 0.0139, + "reward": 1.3072917461395264, + "reward_std": 0.16098350286483765, + "rewards/accuracy_reward_stage2": 0.3229166865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2507 + }, + { + "completion_length": 11.25, + "epoch": 0.43946031189766954, + "grad_norm": 17.708270132972558, + "kl": 0.02001953125, + "learning_rate": 5.60714911512178e-07, + "loss": 0.008, + "reward": 1.609148621559143, + "reward_std": 0.16704700887203217, + "rewards/accuracy_reward_stage2": 0.6091486215591431, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2508 + }, + { + "completion_length": 14.078125, + "epoch": 0.4396355353075171, + "grad_norm": 18.329023276387215, + "kl": 0.1064453125, + "learning_rate": 5.605396881023304e-07, + "loss": 0.0036, + "reward": 1.6048202514648438, + "reward_std": 0.23036982119083405, + "rewards/accuracy_reward_stage2": 0.620445191860199, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2509 + }, + { + "completion_length": 7.984375, + "epoch": 0.43981075871736464, + "grad_norm": 20.726709280170827, + "kl": 0.06591796875, + "learning_rate": 5.603644646924829e-07, + "loss": 0.0264, + "reward": 1.8156670331954956, + "reward_std": 0.13929487764835358, + "rewards/accuracy_reward_stage2": 0.8156670331954956, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2510 + }, + { + "completion_length": 8.515625, + "epoch": 0.4399859821272122, + "grad_norm": 27.31374661746669, + "kl": 0.2255859375, + "learning_rate": 5.601892412826353e-07, + "loss": 0.0273, + "reward": 1.1579368114471436, + "reward_std": 0.18273042142391205, + "rewards/accuracy_reward_stage2": 0.4235617518424988, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2511 + }, + { + "completion_length": 10.90625, + "epoch": 0.44016120553705973, + "grad_norm": 21.205034695123313, + "kl": 0.026611328125, + "learning_rate": 5.600140178727878e-07, + "loss": 0.0107, + "reward": 1.3730418682098389, + "reward_std": 0.19322770833969116, + "rewards/accuracy_reward_stage2": 0.3730418086051941, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2512 + }, + { + "completion_length": 15.515625, + "epoch": 0.44033642894690733, + "grad_norm": 25.29220340718031, + "kl": 0.2392578125, + "learning_rate": 5.598387944629403e-07, + "loss": 0.064, + "reward": 1.3425215482711792, + "reward_std": 0.3277851343154907, + "rewards/accuracy_reward_stage2": 0.4831465482711792, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2513 + }, + { + "completion_length": 8.359375, + "epoch": 0.4405116523567549, + "grad_norm": 17.103435466073208, + "kl": 0.1328125, + "learning_rate": 5.596635710530927e-07, + "loss": 0.0529, + "reward": 1.6302083730697632, + "reward_std": 0.04419417679309845, + "rewards/accuracy_reward_stage2": 0.7552083730697632, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2514 + }, + { + "completion_length": 9.578125, + "epoch": 0.4406868757666024, + "grad_norm": 13.641401001570447, + "kl": 0.056640625, + "learning_rate": 5.594883476432452e-07, + "loss": -0.0191, + "reward": 1.6454601287841797, + "reward_std": 0.08494816720485687, + "rewards/accuracy_reward_stage2": 0.6610851883888245, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2515 + }, + { + "completion_length": 12.875, + "epoch": 0.44086209917645, + "grad_norm": 18.981704398952896, + "kl": 0.1708984375, + "learning_rate": 5.593131242333977e-07, + "loss": 0.0682, + "reward": 1.295891284942627, + "reward_std": 0.20116037130355835, + "rewards/accuracy_reward_stage2": 0.5458913445472717, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2516 + }, + { + "completion_length": 6.625, + "epoch": 0.4410373225862975, + "grad_norm": 11.751108110567436, + "kl": 0.1015625, + "learning_rate": 5.591379008235499e-07, + "loss": 0.001, + "reward": 1.798114538192749, + "reward_std": 0.20511700212955475, + "rewards/accuracy_reward_stage2": 0.8137395977973938, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2517 + }, + { + "completion_length": 10.3125, + "epoch": 0.44121254599614507, + "grad_norm": 13.022570147089745, + "kl": 0.03369140625, + "learning_rate": 5.589626774137024e-07, + "loss": 0.0135, + "reward": 1.6906923055648804, + "reward_std": 0.08085846900939941, + "rewards/accuracy_reward_stage2": 0.6906922459602356, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2518 + }, + { + "completion_length": 12.0625, + "epoch": 0.4413877694059926, + "grad_norm": 18.866592945670142, + "kl": 0.07373046875, + "learning_rate": 5.587874540038548e-07, + "loss": 0.0294, + "reward": 1.6115610599517822, + "reward_std": 0.21088054776191711, + "rewards/accuracy_reward_stage2": 0.6115610599517822, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2519 + }, + { + "completion_length": 14.078125, + "epoch": 0.4415629928158402, + "grad_norm": 22.588152551889987, + "kl": 0.193359375, + "learning_rate": 5.586122305940073e-07, + "loss": 0.045, + "reward": 1.5260608196258545, + "reward_std": 0.2753870487213135, + "rewards/accuracy_reward_stage2": 0.7916858196258545, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2520 + }, + { + "completion_length": 10.125, + "epoch": 0.44173821622568776, + "grad_norm": 19.79433337645499, + "kl": 0.158203125, + "learning_rate": 5.584370071841598e-07, + "loss": 0.063, + "reward": 1.4418141841888428, + "reward_std": 0.11664269864559174, + "rewards/accuracy_reward_stage2": 0.5668141841888428, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2521 + }, + { + "completion_length": 9.75, + "epoch": 0.4419134396355353, + "grad_norm": 16.81120708295592, + "kl": 0.189453125, + "learning_rate": 5.582617837743122e-07, + "loss": -0.0077, + "reward": 1.4906489849090576, + "reward_std": 0.25340473651885986, + "rewards/accuracy_reward_stage2": 0.5218990445137024, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2522 + }, + { + "completion_length": 9.71875, + "epoch": 0.44208866304538286, + "grad_norm": 28.142505492164215, + "kl": 0.130859375, + "learning_rate": 5.580865603644647e-07, + "loss": 0.043, + "reward": 1.1741572618484497, + "reward_std": 0.4262906312942505, + "rewards/accuracy_reward_stage2": 0.42415720224380493, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2523 + }, + { + "completion_length": 23.515625, + "epoch": 0.4422638864552304, + "grad_norm": 14.510789188327777, + "kl": 0.0703125, + "learning_rate": 5.579113369546172e-07, + "loss": -0.0161, + "reward": 1.7277365922927856, + "reward_std": 0.17168009281158447, + "rewards/accuracy_reward_stage2": 0.7433614730834961, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2524 + }, + { + "completion_length": 9.546875, + "epoch": 0.44243910986507795, + "grad_norm": 29.374054399951252, + "kl": 0.10107421875, + "learning_rate": 5.577361135447696e-07, + "loss": 0.0403, + "reward": 1.5435447692871094, + "reward_std": 0.29414820671081543, + "rewards/accuracy_reward_stage2": 0.5435448288917542, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2525 + }, + { + "completion_length": 8.953125, + "epoch": 0.44261433327492555, + "grad_norm": 17.304382130864624, + "kl": 0.134765625, + "learning_rate": 5.57560890134922e-07, + "loss": -0.0765, + "reward": 1.5247858762741089, + "reward_std": 0.20727473497390747, + "rewards/accuracy_reward_stage2": 0.5872858762741089, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2526 + }, + { + "completion_length": 8.125, + "epoch": 0.4427895566847731, + "grad_norm": 17.846323420295455, + "kl": 0.1552734375, + "learning_rate": 5.573856667250744e-07, + "loss": -0.0814, + "reward": 1.5099225044250488, + "reward_std": 0.3462476134300232, + "rewards/accuracy_reward_stage2": 0.5724225044250488, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2527 + }, + { + "completion_length": 14.125, + "epoch": 0.44296478009462065, + "grad_norm": 17.30740806674459, + "kl": 0.12109375, + "learning_rate": 5.572104433152269e-07, + "loss": 0.0042, + "reward": 1.4569900035858154, + "reward_std": 0.17400649189949036, + "rewards/accuracy_reward_stage2": 0.5976149439811707, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2528 + }, + { + "completion_length": 15.890625, + "epoch": 0.4431400035044682, + "grad_norm": 15.564204835103096, + "kl": 0.1005859375, + "learning_rate": 5.570352199053794e-07, + "loss": -0.01, + "reward": 1.460471510887146, + "reward_std": 0.15011385083198547, + "rewards/accuracy_reward_stage2": 0.491721510887146, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2529 + }, + { + "completion_length": 9.203125, + "epoch": 0.44331522691431574, + "grad_norm": 18.140320819227657, + "kl": 0.07080078125, + "learning_rate": 5.568599964955317e-07, + "loss": -0.0288, + "reward": 1.5155820846557617, + "reward_std": 0.3032262623310089, + "rewards/accuracy_reward_stage2": 0.5468320250511169, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2530 + }, + { + "completion_length": 7.453125, + "epoch": 0.4434904503241633, + "grad_norm": 19.07138088025204, + "kl": 0.1220703125, + "learning_rate": 5.566847730856842e-07, + "loss": -0.0371, + "reward": 1.4525704383850098, + "reward_std": 0.2989710867404938, + "rewards/accuracy_reward_stage2": 0.6088204383850098, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2531 + }, + { + "completion_length": 20.734375, + "epoch": 0.4436656737340109, + "grad_norm": 18.482553685228076, + "kl": 0.1572265625, + "learning_rate": 5.565095496758366e-07, + "loss": 0.0299, + "reward": 1.2634599208831787, + "reward_std": 0.2991285026073456, + "rewards/accuracy_reward_stage2": 0.40408504009246826, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2532 + }, + { + "completion_length": 11.84375, + "epoch": 0.44384089714385844, + "grad_norm": 20.09217595030906, + "kl": 0.1748046875, + "learning_rate": 5.563343262659891e-07, + "loss": 0.07, + "reward": 1.6363379955291748, + "reward_std": 0.18992879986763, + "rewards/accuracy_reward_stage2": 0.7613379955291748, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2533 + }, + { + "completion_length": 19.71875, + "epoch": 0.444016120553706, + "grad_norm": 21.313434613331474, + "kl": 0.2001953125, + "learning_rate": 5.561591028561416e-07, + "loss": -0.0077, + "reward": 1.2119635343551636, + "reward_std": 0.20474477112293243, + "rewards/accuracy_reward_stage2": 0.4932134747505188, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2534 + }, + { + "completion_length": 18.046875, + "epoch": 0.44419134396355353, + "grad_norm": 22.413468625805006, + "kl": 0.310546875, + "learning_rate": 5.55983879446294e-07, + "loss": 0.0457, + "reward": 1.422907829284668, + "reward_std": 0.159887433052063, + "rewards/accuracy_reward_stage2": 0.594782829284668, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2535 + }, + { + "completion_length": 16.5, + "epoch": 0.4443665673734011, + "grad_norm": 62.449080252584, + "kl": 0.068359375, + "learning_rate": 5.558086560364464e-07, + "loss": -0.008, + "reward": 1.517016887664795, + "reward_std": 0.2230542004108429, + "rewards/accuracy_reward_stage2": 0.5326418876647949, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2536 + }, + { + "completion_length": 9.421875, + "epoch": 0.4445417907832486, + "grad_norm": 16.9292753906615, + "kl": 0.04150390625, + "learning_rate": 5.556334326265989e-07, + "loss": 0.0165, + "reward": 1.482633113861084, + "reward_std": 0.1710350215435028, + "rewards/accuracy_reward_stage2": 0.607633113861084, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2537 + }, + { + "completion_length": 10.5625, + "epoch": 0.44471701419309617, + "grad_norm": 26.703555972127315, + "kl": 0.31640625, + "learning_rate": 5.554582092167513e-07, + "loss": -0.0319, + "reward": 1.3041858673095703, + "reward_std": 0.2861517071723938, + "rewards/accuracy_reward_stage2": 0.5073109269142151, + "rewards/format_reward_stage1_pointerpad": 0.796875, + "scores/accuracy_reward_stage2": 0.796875, + "step": 2538 + }, + { + "completion_length": 11.4375, + "epoch": 0.4448922376029438, + "grad_norm": 7.063004974438899, + "kl": 0.044677734375, + "learning_rate": 5.552829858069038e-07, + "loss": -0.0263, + "reward": 1.78125, + "reward_std": 0.10888782143592834, + "rewards/accuracy_reward_stage2": 0.796875, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2539 + }, + { + "completion_length": 7.328125, + "epoch": 0.4450674610127913, + "grad_norm": 14.845076825348983, + "kl": 0.15234375, + "learning_rate": 5.551077623970562e-07, + "loss": -0.0562, + "reward": 1.6265857219696045, + "reward_std": 0.2581363916397095, + "rewards/accuracy_reward_stage2": 0.6734606623649597, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2540 + }, + { + "completion_length": 8.453125, + "epoch": 0.44524268442263887, + "grad_norm": 25.135986150588074, + "kl": 0.1669921875, + "learning_rate": 5.549325389872087e-07, + "loss": 0.0133, + "reward": 1.6385996341705322, + "reward_std": 0.3358069062232971, + "rewards/accuracy_reward_stage2": 0.6698496341705322, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2541 + }, + { + "completion_length": 12.46875, + "epoch": 0.4454179078324864, + "grad_norm": 43.303538715189156, + "kl": 0.490234375, + "learning_rate": 5.547573155773612e-07, + "loss": 0.1526, + "reward": 1.3292334079742432, + "reward_std": 0.25464826822280884, + "rewards/accuracy_reward_stage2": 0.5948582887649536, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2542 + }, + { + "completion_length": 14.09375, + "epoch": 0.44559313124233396, + "grad_norm": 16.19933901607236, + "kl": 0.044189453125, + "learning_rate": 5.545820921675135e-07, + "loss": 0.0177, + "reward": 1.329080581665039, + "reward_std": 0.20220564305782318, + "rewards/accuracy_reward_stage2": 0.3290805220603943, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2543 + }, + { + "completion_length": 9.953125, + "epoch": 0.4457683546521815, + "grad_norm": 21.785405769856855, + "kl": 0.062255859375, + "learning_rate": 5.54406868757666e-07, + "loss": 0.0249, + "reward": 1.7413477897644043, + "reward_std": 0.24454760551452637, + "rewards/accuracy_reward_stage2": 0.7413477301597595, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2544 + }, + { + "completion_length": 8.390625, + "epoch": 0.4459435780620291, + "grad_norm": 19.600424294478405, + "kl": 0.1162109375, + "learning_rate": 5.542316453478185e-07, + "loss": 0.0464, + "reward": 1.4191194772720337, + "reward_std": 0.27458277344703674, + "rewards/accuracy_reward_stage2": 0.5441195368766785, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2545 + }, + { + "completion_length": 9.3125, + "epoch": 0.44611880147187666, + "grad_norm": 19.44479956550256, + "kl": 0.15234375, + "learning_rate": 5.540564219379708e-07, + "loss": 0.0606, + "reward": 0.9888094663619995, + "reward_std": 0.180924654006958, + "rewards/accuracy_reward_stage2": 0.2388094961643219, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2546 + }, + { + "completion_length": 10.828125, + "epoch": 0.4462940248817242, + "grad_norm": 19.593423184652256, + "kl": 0.1572265625, + "learning_rate": 5.538811985281233e-07, + "loss": 0.0239, + "reward": 1.71272873878479, + "reward_std": 0.3122587203979492, + "rewards/accuracy_reward_stage2": 0.7283537983894348, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2547 + }, + { + "completion_length": 12.53125, + "epoch": 0.44646924829157175, + "grad_norm": 17.261805816189508, + "kl": 0.10009765625, + "learning_rate": 5.537059751182757e-07, + "loss": -0.0192, + "reward": 1.4522144794464111, + "reward_std": 0.22447730600833893, + "rewards/accuracy_reward_stage2": 0.48346447944641113, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2548 + }, + { + "completion_length": 8.734375, + "epoch": 0.4466444717014193, + "grad_norm": 14.751921509927948, + "kl": 0.0174560546875, + "learning_rate": 5.535307517084282e-07, + "loss": 0.007, + "reward": 1.827867031097412, + "reward_std": 0.1435975730419159, + "rewards/accuracy_reward_stage2": 0.8278670907020569, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2549 + }, + { + "completion_length": 13.875, + "epoch": 0.44681969511126685, + "grad_norm": 21.018803559215076, + "kl": 0.240234375, + "learning_rate": 5.533555282985807e-07, + "loss": 0.0675, + "reward": 1.5392227172851562, + "reward_std": 0.20909258723258972, + "rewards/accuracy_reward_stage2": 0.6798477172851562, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2550 + }, + { + "completion_length": 15.71875, + "epoch": 0.44699491852111445, + "grad_norm": 18.16554039242802, + "kl": 0.10009765625, + "learning_rate": 5.531803048887331e-07, + "loss": -0.0031, + "reward": 1.5369999408721924, + "reward_std": 0.21140316128730774, + "rewards/accuracy_reward_stage2": 0.5526249408721924, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2551 + }, + { + "completion_length": 10.859375, + "epoch": 0.447170141930962, + "grad_norm": 21.735984742936957, + "kl": 0.0810546875, + "learning_rate": 5.530050814788856e-07, + "loss": 0.0324, + "reward": 1.5851173400878906, + "reward_std": 0.2794300317764282, + "rewards/accuracy_reward_stage2": 0.5851173400878906, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2552 + }, + { + "completion_length": 7.96875, + "epoch": 0.44734536534080954, + "grad_norm": 17.867846182029798, + "kl": 0.12890625, + "learning_rate": 5.528298580690381e-07, + "loss": 0.0085, + "reward": 1.6685552597045898, + "reward_std": 0.20725038647651672, + "rewards/accuracy_reward_stage2": 0.6841802597045898, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2553 + }, + { + "completion_length": 11.953125, + "epoch": 0.4475205887506571, + "grad_norm": 34.389850460739886, + "kl": 0.3203125, + "learning_rate": 5.526546346591905e-07, + "loss": 0.0838, + "reward": 1.4835262298583984, + "reward_std": 0.2388857752084732, + "rewards/accuracy_reward_stage2": 0.6241511106491089, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2554 + }, + { + "completion_length": 8.40625, + "epoch": 0.44769581216050464, + "grad_norm": 27.431755120229397, + "kl": 0.19140625, + "learning_rate": 5.52479411249343e-07, + "loss": 0.0482, + "reward": 1.5072015523910522, + "reward_std": 0.28641408681869507, + "rewards/accuracy_reward_stage2": 0.5228264927864075, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2555 + }, + { + "completion_length": 8.890625, + "epoch": 0.4478710355703522, + "grad_norm": 16.111416884167895, + "kl": 0.03515625, + "learning_rate": 5.523041878394952e-07, + "loss": -0.0301, + "reward": 1.78125, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward_stage2": 0.796875, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2556 + }, + { + "completion_length": 9.453125, + "epoch": 0.44804625898019973, + "grad_norm": 14.573244310740995, + "kl": 0.10546875, + "learning_rate": 5.521289644296477e-07, + "loss": 0.0421, + "reward": 1.6069750785827637, + "reward_std": 0.20344749093055725, + "rewards/accuracy_reward_stage2": 0.6069749593734741, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2557 + }, + { + "completion_length": 8.40625, + "epoch": 0.44822148239004733, + "grad_norm": 14.188444483313175, + "kl": 0.06982421875, + "learning_rate": 5.519537410198002e-07, + "loss": -0.0322, + "reward": 1.6511962413787842, + "reward_std": 0.12565788626670837, + "rewards/accuracy_reward_stage2": 0.6824462413787842, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2558 + }, + { + "completion_length": 8.28125, + "epoch": 0.4483967057998949, + "grad_norm": 20.127520754602703, + "kl": 0.09033203125, + "learning_rate": 5.517785176099526e-07, + "loss": -0.0522, + "reward": 1.349897861480713, + "reward_std": 0.26652413606643677, + "rewards/accuracy_reward_stage2": 0.3811478316783905, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2559 + }, + { + "completion_length": 13.34375, + "epoch": 0.4485719292097424, + "grad_norm": 20.592003423289654, + "kl": 0.0810546875, + "learning_rate": 5.516032942001051e-07, + "loss": -0.0117, + "reward": 1.5917490720748901, + "reward_std": 0.18286369740962982, + "rewards/accuracy_reward_stage2": 0.6073740720748901, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2560 + }, + { + "completion_length": 8.5625, + "epoch": 0.44874715261959, + "grad_norm": 17.541758776799224, + "kl": 0.12255859375, + "learning_rate": 5.514280707902576e-07, + "loss": 0.0119, + "reward": 1.6510417461395264, + "reward_std": 0.2547297775745392, + "rewards/accuracy_reward_stage2": 0.6666666269302368, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2561 + }, + { + "completion_length": 11.921875, + "epoch": 0.4489223760294375, + "grad_norm": 13.059778536734967, + "kl": 0.1455078125, + "learning_rate": 5.5125284738041e-07, + "loss": 0.0583, + "reward": 1.515625, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2562 + }, + { + "completion_length": 14.734375, + "epoch": 0.44909759943928507, + "grad_norm": 17.78107440024237, + "kl": 0.06005859375, + "learning_rate": 5.510776239705625e-07, + "loss": 0.024, + "reward": 1.5911427736282349, + "reward_std": 0.23703868687152863, + "rewards/accuracy_reward_stage2": 0.5911428332328796, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2563 + }, + { + "completion_length": 15.09375, + "epoch": 0.44927282284913267, + "grad_norm": 21.778189739327654, + "kl": 0.080078125, + "learning_rate": 5.509024005607149e-07, + "loss": 0.0319, + "reward": 1.6581635475158691, + "reward_std": 0.24475786089897156, + "rewards/accuracy_reward_stage2": 0.6581635475158691, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2564 + }, + { + "completion_length": 12.15625, + "epoch": 0.4494480462589802, + "grad_norm": 17.34065307299987, + "kl": 0.11328125, + "learning_rate": 5.507271771508674e-07, + "loss": 0.0453, + "reward": 1.4566253423690796, + "reward_std": 0.17970743775367737, + "rewards/accuracy_reward_stage2": 0.4566253423690796, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2565 + }, + { + "completion_length": 11.09375, + "epoch": 0.44962326966882776, + "grad_norm": 26.66239812379797, + "kl": 0.21875, + "learning_rate": 5.505519537410198e-07, + "loss": 0.0198, + "reward": 1.3697327375411987, + "reward_std": 0.2223256528377533, + "rewards/accuracy_reward_stage2": 0.5259827971458435, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2566 + }, + { + "completion_length": 15.34375, + "epoch": 0.4497984930786753, + "grad_norm": 17.23732687057513, + "kl": 0.09033203125, + "learning_rate": 5.503767303311722e-07, + "loss": -0.0081, + "reward": 1.6454896926879883, + "reward_std": 0.23397132754325867, + "rewards/accuracy_reward_stage2": 0.6611147522926331, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2567 + }, + { + "completion_length": 18.5, + "epoch": 0.44997371648852286, + "grad_norm": 20.48343016070601, + "kl": 0.091796875, + "learning_rate": 5.502015069213246e-07, + "loss": 0.0366, + "reward": 1.6218650341033936, + "reward_std": 0.15606439113616943, + "rewards/accuracy_reward_stage2": 0.6218649744987488, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2568 + }, + { + "completion_length": 13.25, + "epoch": 0.4501489398983704, + "grad_norm": 20.279168042824203, + "kl": 0.1435546875, + "learning_rate": 5.500262835114771e-07, + "loss": 0.0575, + "reward": 1.488884449005127, + "reward_std": 0.24019384384155273, + "rewards/accuracy_reward_stage2": 0.4888843894004822, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2569 + }, + { + "completion_length": 7.75, + "epoch": 0.45032416330821795, + "grad_norm": 16.791397881227955, + "kl": 0.0576171875, + "learning_rate": 5.498510601016295e-07, + "loss": 0.0231, + "reward": 1.7581019401550293, + "reward_std": 0.2148759514093399, + "rewards/accuracy_reward_stage2": 0.7581018209457397, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2570 + }, + { + "completion_length": 18.171875, + "epoch": 0.45049938671806555, + "grad_norm": 15.321118233041698, + "kl": 0.07666015625, + "learning_rate": 5.49675836691782e-07, + "loss": 0.0018, + "reward": 1.543554425239563, + "reward_std": 0.16047553718090057, + "rewards/accuracy_reward_stage2": 0.559179425239563, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2571 + }, + { + "completion_length": 13.046875, + "epoch": 0.4506746101279131, + "grad_norm": 17.57267956385793, + "kl": 0.2099609375, + "learning_rate": 5.495006132819344e-07, + "loss": 0.0399, + "reward": 1.5052083730697632, + "reward_std": 0.23056091368198395, + "rewards/accuracy_reward_stage2": 0.6458333134651184, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2572 + }, + { + "completion_length": 11.421875, + "epoch": 0.45084983353776065, + "grad_norm": 19.669061029863865, + "kl": 0.0693359375, + "learning_rate": 5.493253898720869e-07, + "loss": 0.0278, + "reward": 1.5255483388900757, + "reward_std": 0.22976790368556976, + "rewards/accuracy_reward_stage2": 0.5255483984947205, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2573 + }, + { + "completion_length": 10.125, + "epoch": 0.4510250569476082, + "grad_norm": 13.546947500812909, + "kl": 0.056396484375, + "learning_rate": 5.491501664622394e-07, + "loss": 0.0225, + "reward": 1.3796207904815674, + "reward_std": 0.10605320334434509, + "rewards/accuracy_reward_stage2": 0.5046207904815674, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2574 + }, + { + "completion_length": 11.0, + "epoch": 0.45120028035745574, + "grad_norm": 22.4200432270858, + "kl": 0.251953125, + "learning_rate": 5.489749430523917e-07, + "loss": 0.0591, + "reward": 1.4289274215698242, + "reward_std": 0.19315001368522644, + "rewards/accuracy_reward_stage2": 0.569552481174469, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2575 + }, + { + "completion_length": 10.453125, + "epoch": 0.4513755037673033, + "grad_norm": 17.1520528815772, + "kl": 0.08740234375, + "learning_rate": 5.487997196425442e-07, + "loss": -0.0092, + "reward": 1.49286687374115, + "reward_std": 0.20367136597633362, + "rewards/accuracy_reward_stage2": 0.5084918737411499, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2576 + }, + { + "completion_length": 12.125, + "epoch": 0.4515507271771509, + "grad_norm": 16.746090519646597, + "kl": 0.06494140625, + "learning_rate": 5.486244962326967e-07, + "loss": 0.0261, + "reward": 1.6583220958709717, + "reward_std": 0.18301549553871155, + "rewards/accuracy_reward_stage2": 0.7833219766616821, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2577 + }, + { + "completion_length": 9.484375, + "epoch": 0.45172595058699844, + "grad_norm": 18.860574402696923, + "kl": 0.1630859375, + "learning_rate": 5.484492728228491e-07, + "loss": 0.032, + "reward": 1.6444649696350098, + "reward_std": 0.22633656859397888, + "rewards/accuracy_reward_stage2": 0.6600899696350098, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2578 + }, + { + "completion_length": 15.453125, + "epoch": 0.451901173996846, + "grad_norm": 18.134086534099705, + "kl": 0.06982421875, + "learning_rate": 5.482740494130016e-07, + "loss": 0.028, + "reward": 1.4077608585357666, + "reward_std": 0.12077020853757858, + "rewards/accuracy_reward_stage2": 0.407760888338089, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2579 + }, + { + "completion_length": 10.625, + "epoch": 0.45207639740669353, + "grad_norm": 18.336216197197302, + "kl": 0.0250244140625, + "learning_rate": 5.48098826003154e-07, + "loss": 0.01, + "reward": 1.5207476615905762, + "reward_std": 0.16083654761314392, + "rewards/accuracy_reward_stage2": 0.5207476615905762, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2580 + }, + { + "completion_length": 18.53125, + "epoch": 0.4522516208165411, + "grad_norm": 18.966009229323692, + "kl": 0.224609375, + "learning_rate": 5.479236025933064e-07, + "loss": 0.0455, + "reward": 1.5232280492782593, + "reward_std": 0.17001014947891235, + "rewards/accuracy_reward_stage2": 0.663853108882904, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2581 + }, + { + "completion_length": 10.03125, + "epoch": 0.4524268442263886, + "grad_norm": 19.36451871278005, + "kl": 0.1611328125, + "learning_rate": 5.477483791834589e-07, + "loss": 0.0194, + "reward": 1.6093440055847168, + "reward_std": 0.31875163316726685, + "rewards/accuracy_reward_stage2": 0.6405940055847168, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2582 + }, + { + "completion_length": 11.640625, + "epoch": 0.4526020676362362, + "grad_norm": 18.495981870617012, + "kl": 0.1884765625, + "learning_rate": 5.475731557736113e-07, + "loss": 0.0753, + "reward": 1.604927897453308, + "reward_std": 0.17703410983085632, + "rewards/accuracy_reward_stage2": 0.7299278378486633, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2583 + }, + { + "completion_length": 8.4375, + "epoch": 0.4527772910460838, + "grad_norm": 13.749480254627533, + "kl": 0.1875, + "learning_rate": 5.473979323637638e-07, + "loss": 0.0306, + "reward": 1.4577176570892334, + "reward_std": 0.20654311776161194, + "rewards/accuracy_reward_stage2": 0.4733426570892334, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2584 + }, + { + "completion_length": 8.390625, + "epoch": 0.4529525144559313, + "grad_norm": 25.879623238170172, + "kl": 0.1728515625, + "learning_rate": 5.472227089539163e-07, + "loss": 0.0479, + "reward": 1.7104345560073853, + "reward_std": 0.2433592975139618, + "rewards/accuracy_reward_stage2": 0.7260594964027405, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2585 + }, + { + "completion_length": 9.546875, + "epoch": 0.45312773786577887, + "grad_norm": 18.655052078316817, + "kl": 0.271484375, + "learning_rate": 5.470474855440686e-07, + "loss": -0.0886, + "reward": 1.6952078342437744, + "reward_std": 0.2790555953979492, + "rewards/accuracy_reward_stage2": 0.7733327746391296, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 2586 + }, + { + "completion_length": 8.75, + "epoch": 0.4533029612756264, + "grad_norm": 17.11928499202011, + "kl": 0.16796875, + "learning_rate": 5.468722621342211e-07, + "loss": 0.0673, + "reward": 1.462017297744751, + "reward_std": 0.3036963939666748, + "rewards/accuracy_reward_stage2": 0.587017297744751, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2587 + }, + { + "completion_length": 8.28125, + "epoch": 0.45347818468547396, + "grad_norm": 17.212160066694576, + "kl": 0.054443359375, + "learning_rate": 5.466970387243735e-07, + "loss": 0.0217, + "reward": 1.844616413116455, + "reward_std": 0.07760395854711533, + "rewards/accuracy_reward_stage2": 0.8446164727210999, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2588 + }, + { + "completion_length": 7.734375, + "epoch": 0.4536534080953215, + "grad_norm": 27.47549068286514, + "kl": 0.298828125, + "learning_rate": 5.46521815314526e-07, + "loss": 0.0384, + "reward": 1.6829417943954468, + "reward_std": 0.24853834509849548, + "rewards/accuracy_reward_stage2": 0.714191734790802, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2589 + }, + { + "completion_length": 6.890625, + "epoch": 0.4538286315051691, + "grad_norm": 17.530731905229523, + "kl": 0.1484375, + "learning_rate": 5.463465919046785e-07, + "loss": -0.0719, + "reward": 1.6600399017333984, + "reward_std": 0.2958260476589203, + "rewards/accuracy_reward_stage2": 0.7069148421287537, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2590 + }, + { + "completion_length": 11.546875, + "epoch": 0.45400385491501666, + "grad_norm": 19.476134745790667, + "kl": 0.12890625, + "learning_rate": 5.461713684948309e-07, + "loss": 0.0514, + "reward": 1.619103193283081, + "reward_std": 0.23953868448734283, + "rewards/accuracy_reward_stage2": 0.6191032528877258, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2591 + }, + { + "completion_length": 10.59375, + "epoch": 0.4541790783248642, + "grad_norm": 16.844721399664373, + "kl": 0.125, + "learning_rate": 5.459961450849834e-07, + "loss": 0.0059, + "reward": 1.814000129699707, + "reward_std": 0.18107157945632935, + "rewards/accuracy_reward_stage2": 0.8296250700950623, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2592 + }, + { + "completion_length": 19.125, + "epoch": 0.45435430173471175, + "grad_norm": 15.050078423215997, + "kl": 0.09912109375, + "learning_rate": 5.458209216751359e-07, + "loss": -0.0045, + "reward": 1.384594202041626, + "reward_std": 0.1534789651632309, + "rewards/accuracy_reward_stage2": 0.40021926164627075, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2593 + }, + { + "completion_length": 12.578125, + "epoch": 0.4545295251445593, + "grad_norm": 16.303858847797237, + "kl": 0.125, + "learning_rate": 5.456456982652882e-07, + "loss": 0.0059, + "reward": 1.482431411743164, + "reward_std": 0.2253151684999466, + "rewards/accuracy_reward_stage2": 0.4980563521385193, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2594 + }, + { + "completion_length": 11.203125, + "epoch": 0.45470474855440685, + "grad_norm": 16.395967548946256, + "kl": 0.169921875, + "learning_rate": 5.454704748554406e-07, + "loss": 0.0237, + "reward": 1.3767869472503662, + "reward_std": 0.22264955937862396, + "rewards/accuracy_reward_stage2": 0.5174120664596558, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2595 + }, + { + "completion_length": 10.578125, + "epoch": 0.45487997196425445, + "grad_norm": 18.742319177335087, + "kl": 0.12060546875, + "learning_rate": 5.45295251445593e-07, + "loss": 0.0086, + "reward": 1.7379519939422607, + "reward_std": 0.2344542145729065, + "rewards/accuracy_reward_stage2": 0.7535768747329712, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2596 + }, + { + "completion_length": 11.515625, + "epoch": 0.455055195374102, + "grad_norm": 20.001536055255976, + "kl": 0.23046875, + "learning_rate": 5.451200280357455e-07, + "loss": -0.0223, + "reward": 1.3056724071502686, + "reward_std": 0.3078497648239136, + "rewards/accuracy_reward_stage2": 0.35254743695259094, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2597 + }, + { + "completion_length": 15.28125, + "epoch": 0.45523041878394954, + "grad_norm": 23.3106633392216, + "kl": 0.140625, + "learning_rate": 5.44944804625898e-07, + "loss": 0.0286, + "reward": 1.4137800931930542, + "reward_std": 0.29428166151046753, + "rewards/accuracy_reward_stage2": 0.4294050335884094, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2598 + }, + { + "completion_length": 11.515625, + "epoch": 0.4554056421937971, + "grad_norm": 17.17033481514972, + "kl": 0.08837890625, + "learning_rate": 5.447695812160504e-07, + "loss": 0.0054, + "reward": 1.3880894184112549, + "reward_std": 0.25328290462493896, + "rewards/accuracy_reward_stage2": 0.5287142992019653, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2599 + }, + { + "completion_length": 9.171875, + "epoch": 0.45558086560364464, + "grad_norm": 21.993536038979904, + "kl": 0.142578125, + "learning_rate": 5.445943578062029e-07, + "loss": 0.0572, + "reward": 1.56821870803833, + "reward_std": 0.24604782462120056, + "rewards/accuracy_reward_stage2": 0.6932187676429749, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2600 + }, + { + "completion_length": 9.03125, + "epoch": 0.4557560890134922, + "grad_norm": 19.09370863576669, + "kl": 0.076171875, + "learning_rate": 5.444191343963554e-07, + "loss": -0.0137, + "reward": 1.6924455165863037, + "reward_std": 0.19052401185035706, + "rewards/accuracy_reward_stage2": 0.7080705165863037, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2601 + }, + { + "completion_length": 9.96875, + "epoch": 0.4559313124233398, + "grad_norm": 20.253305860465996, + "kl": 0.1416015625, + "learning_rate": 5.442439109865078e-07, + "loss": 0.0236, + "reward": 1.4954473972320557, + "reward_std": 0.2541544735431671, + "rewards/accuracy_reward_stage2": 0.6360723972320557, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2602 + }, + { + "completion_length": 12.765625, + "epoch": 0.45610653583318733, + "grad_norm": 15.689281208872066, + "kl": 0.1240234375, + "learning_rate": 5.440686875766603e-07, + "loss": 0.0056, + "reward": 1.3455251455307007, + "reward_std": 0.13567785918712616, + "rewards/accuracy_reward_stage2": 0.4861501157283783, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2603 + }, + { + "completion_length": 11.90625, + "epoch": 0.4562817592430349, + "grad_norm": 18.400359885372882, + "kl": 0.150390625, + "learning_rate": 5.438934641668127e-07, + "loss": 0.0386, + "reward": 1.3754087686538696, + "reward_std": 0.26961177587509155, + "rewards/accuracy_reward_stage2": 0.6410337686538696, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2604 + }, + { + "completion_length": 6.015625, + "epoch": 0.4564569826528824, + "grad_norm": 18.298089152870244, + "kl": 0.038330078125, + "learning_rate": 5.437182407569652e-07, + "loss": 0.0154, + "reward": 1.7267228364944458, + "reward_std": 0.15918239951133728, + "rewards/accuracy_reward_stage2": 0.8517228960990906, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2605 + }, + { + "completion_length": 11.609375, + "epoch": 0.45663220606273, + "grad_norm": 14.534911045435273, + "kl": 0.1318359375, + "learning_rate": 5.435430173471176e-07, + "loss": -0.0358, + "reward": 1.7274062633514404, + "reward_std": 0.11909748613834381, + "rewards/accuracy_reward_stage2": 0.7586562633514404, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2606 + }, + { + "completion_length": 12.1875, + "epoch": 0.4568074294725775, + "grad_norm": 15.895748749987368, + "kl": 0.08544921875, + "learning_rate": 5.433677939372699e-07, + "loss": 0.0341, + "reward": 1.7754074335098267, + "reward_std": 0.17999312281608582, + "rewards/accuracy_reward_stage2": 0.7754074335098267, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2607 + }, + { + "completion_length": 8.453125, + "epoch": 0.45698265288242507, + "grad_norm": 15.343178933225655, + "kl": 0.08154296875, + "learning_rate": 5.431925705274224e-07, + "loss": -0.0116, + "reward": 1.7874504327774048, + "reward_std": 0.17973880469799042, + "rewards/accuracy_reward_stage2": 0.8030754327774048, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2608 + }, + { + "completion_length": 9.96875, + "epoch": 0.45715787629227267, + "grad_norm": 16.31743124314278, + "kl": 0.2197265625, + "learning_rate": 5.430173471175748e-07, + "loss": 0.088, + "reward": 1.2728722095489502, + "reward_std": 0.06837272644042969, + "rewards/accuracy_reward_stage2": 0.39787212014198303, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2609 + }, + { + "completion_length": 12.921875, + "epoch": 0.4573330997021202, + "grad_norm": 22.3630972019856, + "kl": 0.3046875, + "learning_rate": 5.428421237077273e-07, + "loss": 0.0188, + "reward": 1.1350054740905762, + "reward_std": 0.45940613746643066, + "rewards/accuracy_reward_stage2": 0.3068804144859314, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2610 + }, + { + "completion_length": 10.03125, + "epoch": 0.45750832311196776, + "grad_norm": 18.02245735574043, + "kl": 0.11669921875, + "learning_rate": 5.426669002978798e-07, + "loss": 0.01, + "reward": 1.3457145690917969, + "reward_std": 0.18275277316570282, + "rewards/accuracy_reward_stage2": 0.4863395094871521, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2611 + }, + { + "completion_length": 11.625, + "epoch": 0.4576835465218153, + "grad_norm": 27.94120506568601, + "kl": 0.224609375, + "learning_rate": 5.424916768880322e-07, + "loss": 0.0393, + "reward": 1.4052271842956543, + "reward_std": 0.3087007403373718, + "rewards/accuracy_reward_stage2": 0.5458522439002991, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2612 + }, + { + "completion_length": 13.28125, + "epoch": 0.45785876993166286, + "grad_norm": 18.883931756205346, + "kl": 0.1494140625, + "learning_rate": 5.423164534781847e-07, + "loss": 0.0286, + "reward": 1.4941810369491577, + "reward_std": 0.17316259443759918, + "rewards/accuracy_reward_stage2": 0.5098060369491577, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2613 + }, + { + "completion_length": 15.90625, + "epoch": 0.4580339933415104, + "grad_norm": 19.042972465701816, + "kl": 0.0771484375, + "learning_rate": 5.421412300683372e-07, + "loss": -0.0133, + "reward": 1.240898609161377, + "reward_std": 0.19125321507453918, + "rewards/accuracy_reward_stage2": 0.3815236985683441, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2614 + }, + { + "completion_length": 15.765625, + "epoch": 0.458209216751358, + "grad_norm": 60.96562398209649, + "kl": 0.380859375, + "learning_rate": 5.419660066584895e-07, + "loss": 0.0644, + "reward": 1.300868034362793, + "reward_std": 0.1779276430606842, + "rewards/accuracy_reward_stage2": 0.33211806416511536, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2615 + }, + { + "completion_length": 7.171875, + "epoch": 0.45838444016120555, + "grad_norm": 16.982198405407203, + "kl": 0.06494140625, + "learning_rate": 5.41790783248642e-07, + "loss": 0.0259, + "reward": 1.6714116334915161, + "reward_std": 0.10098038613796234, + "rewards/accuracy_reward_stage2": 0.6714116334915161, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2616 + }, + { + "completion_length": 8.40625, + "epoch": 0.4585596635710531, + "grad_norm": 14.23052773010471, + "kl": 0.1220703125, + "learning_rate": 5.416155598387944e-07, + "loss": 0.0047, + "reward": 1.5308772325515747, + "reward_std": 0.17923077940940857, + "rewards/accuracy_reward_stage2": 0.6715022325515747, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2617 + }, + { + "completion_length": 9.015625, + "epoch": 0.45873488698090065, + "grad_norm": 16.07430260605479, + "kl": 0.08544921875, + "learning_rate": 5.414403364289469e-07, + "loss": -0.0068, + "reward": 1.5297343730926514, + "reward_std": 0.19875817000865936, + "rewards/accuracy_reward_stage2": 0.5453594326972961, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2618 + }, + { + "completion_length": 6.75, + "epoch": 0.4589101103907482, + "grad_norm": 10.922705313788358, + "kl": 0.1181640625, + "learning_rate": 5.412651130190993e-07, + "loss": 0.003, + "reward": 1.6112689971923828, + "reward_std": 0.12722565233707428, + "rewards/accuracy_reward_stage2": 0.626893937587738, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2619 + }, + { + "completion_length": 11.375, + "epoch": 0.45908533380059574, + "grad_norm": 16.884670927133442, + "kl": 0.099609375, + "learning_rate": 5.410898896092517e-07, + "loss": -0.0378, + "reward": 1.529841661453247, + "reward_std": 0.28377997875213623, + "rewards/accuracy_reward_stage2": 0.5610915422439575, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2620 + }, + { + "completion_length": 8.515625, + "epoch": 0.4592605572104433, + "grad_norm": 19.32312477942592, + "kl": 0.1376953125, + "learning_rate": 5.409146661994042e-07, + "loss": 0.055, + "reward": 1.831881046295166, + "reward_std": 0.24564093351364136, + "rewards/accuracy_reward_stage2": 0.831881046295166, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2621 + }, + { + "completion_length": 13.1875, + "epoch": 0.4594357806202909, + "grad_norm": 50.20033941318913, + "kl": 0.059814453125, + "learning_rate": 5.407394427895567e-07, + "loss": -0.0075, + "reward": 1.488027811050415, + "reward_std": 0.25944140553474426, + "rewards/accuracy_reward_stage2": 0.503652811050415, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2622 + }, + { + "completion_length": 9.53125, + "epoch": 0.45961100403013844, + "grad_norm": 20.21375048286647, + "kl": 0.1640625, + "learning_rate": 5.405642193797091e-07, + "loss": -0.0056, + "reward": 1.4095327854156494, + "reward_std": 0.1797855645418167, + "rewards/accuracy_reward_stage2": 0.6907828450202942, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2623 + }, + { + "completion_length": 9.84375, + "epoch": 0.459786227439986, + "grad_norm": 16.485504483917524, + "kl": 0.12060546875, + "learning_rate": 5.403889959698616e-07, + "loss": 0.004, + "reward": 1.4300655126571655, + "reward_std": 0.24736399948596954, + "rewards/accuracy_reward_stage2": 0.4456905424594879, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2624 + }, + { + "completion_length": 11.71875, + "epoch": 0.45996145084983353, + "grad_norm": 24.078458947001245, + "kl": 0.11181640625, + "learning_rate": 5.402137725600139e-07, + "loss": 0.0212, + "reward": 1.616410493850708, + "reward_std": 0.3158206343650818, + "rewards/accuracy_reward_stage2": 0.6320353746414185, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2625 + }, + { + "completion_length": 11.734375, + "epoch": 0.4601366742596811, + "grad_norm": 20.69459400538519, + "kl": 0.10498046875, + "learning_rate": 5.400385491501664e-07, + "loss": -0.0401, + "reward": 1.7175240516662598, + "reward_std": 0.2727046310901642, + "rewards/accuracy_reward_stage2": 0.7487740516662598, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2626 + }, + { + "completion_length": 12.984375, + "epoch": 0.4603118976695286, + "grad_norm": 27.47238303244588, + "kl": 0.1318359375, + "learning_rate": 5.398633257403189e-07, + "loss": 0.0526, + "reward": 1.394019365310669, + "reward_std": 0.2805423140525818, + "rewards/accuracy_reward_stage2": 0.39401930570602417, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2627 + }, + { + "completion_length": 7.921875, + "epoch": 0.4604871210793762, + "grad_norm": 23.372854969361217, + "kl": 0.08251953125, + "learning_rate": 5.396881023304713e-07, + "loss": -0.0553, + "reward": 1.5483975410461426, + "reward_std": 0.2884424328804016, + "rewards/accuracy_reward_stage2": 0.5796475410461426, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2628 + }, + { + "completion_length": 8.109375, + "epoch": 0.4606623444892238, + "grad_norm": 21.654997769290077, + "kl": 0.234375, + "learning_rate": 5.395128789206238e-07, + "loss": -0.0155, + "reward": 1.647642731666565, + "reward_std": 0.29530832171440125, + "rewards/accuracy_reward_stage2": 0.6945177316665649, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2629 + }, + { + "completion_length": 7.1875, + "epoch": 0.4608375678990713, + "grad_norm": 20.993264285496924, + "kl": 0.0498046875, + "learning_rate": 5.393376555107763e-07, + "loss": 0.0199, + "reward": 1.6677969694137573, + "reward_std": 0.250203400850296, + "rewards/accuracy_reward_stage2": 0.6677969694137573, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2630 + }, + { + "completion_length": 12.375, + "epoch": 0.46101279130891887, + "grad_norm": 22.242581404062868, + "kl": 0.1572265625, + "learning_rate": 5.391624321009287e-07, + "loss": -0.0134, + "reward": 1.2994139194488525, + "reward_std": 0.27216124534606934, + "rewards/accuracy_reward_stage2": 0.45566391944885254, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2631 + }, + { + "completion_length": 9.0, + "epoch": 0.4611880147187664, + "grad_norm": 16.18477678764415, + "kl": 0.1669921875, + "learning_rate": 5.389872086910811e-07, + "loss": -0.0109, + "reward": 1.5696120262145996, + "reward_std": 0.22004368901252747, + "rewards/accuracy_reward_stage2": 0.7258619666099548, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2632 + }, + { + "completion_length": 10.84375, + "epoch": 0.46136323812861396, + "grad_norm": 18.029162444228128, + "kl": 0.2265625, + "learning_rate": 5.388119852812335e-07, + "loss": -0.0061, + "reward": 1.670124888420105, + "reward_std": 0.34664466977119446, + "rewards/accuracy_reward_stage2": 0.7169998288154602, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2633 + }, + { + "completion_length": 11.203125, + "epoch": 0.46153846153846156, + "grad_norm": 20.919917738046767, + "kl": 0.2021484375, + "learning_rate": 5.38636761871386e-07, + "loss": 0.0468, + "reward": 1.5588070154190063, + "reward_std": 0.20780065655708313, + "rewards/accuracy_reward_stage2": 0.6994320154190063, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2634 + }, + { + "completion_length": 10.734375, + "epoch": 0.4617136849483091, + "grad_norm": 13.869513449660197, + "kl": 0.048583984375, + "learning_rate": 5.384615384615384e-07, + "loss": 0.0194, + "reward": 1.3759428262710571, + "reward_std": 0.11085714399814606, + "rewards/accuracy_reward_stage2": 0.5009427666664124, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2635 + }, + { + "completion_length": 10.484375, + "epoch": 0.46188890835815666, + "grad_norm": 38.51280865681154, + "kl": 0.345703125, + "learning_rate": 5.382863150516908e-07, + "loss": 0.0005, + "reward": 1.303555965423584, + "reward_std": 0.3703456521034241, + "rewards/accuracy_reward_stage2": 0.4910559356212616, + "rewards/format_reward_stage1_pointerpad": 0.8125, + "scores/accuracy_reward_stage2": 0.8125, + "step": 2636 + }, + { + "completion_length": 10.234375, + "epoch": 0.4620641317680042, + "grad_norm": 20.342024074638832, + "kl": 0.09716796875, + "learning_rate": 5.381110916418433e-07, + "loss": 0.0049, + "reward": 1.2301913499832153, + "reward_std": 0.29700982570648193, + "rewards/accuracy_reward_stage2": 0.37081634998321533, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2637 + }, + { + "completion_length": 11.03125, + "epoch": 0.46223935517785175, + "grad_norm": 23.111731352596458, + "kl": 0.103515625, + "learning_rate": 5.379358682319958e-07, + "loss": -0.0029, + "reward": 1.7350542545318604, + "reward_std": 0.3148344159126282, + "rewards/accuracy_reward_stage2": 0.7506792545318604, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2638 + }, + { + "completion_length": 9.828125, + "epoch": 0.4624145785876993, + "grad_norm": 13.631803606055552, + "kl": 0.10546875, + "learning_rate": 5.377606448221482e-07, + "loss": -0.0008, + "reward": 1.5906519889831543, + "reward_std": 0.1260703206062317, + "rewards/accuracy_reward_stage2": 0.6062769293785095, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2639 + }, + { + "completion_length": 9.4375, + "epoch": 0.46258980199754685, + "grad_norm": 19.392433475402512, + "kl": 0.1806640625, + "learning_rate": 5.375854214123007e-07, + "loss": 0.0201, + "reward": 1.4371408224105835, + "reward_std": 0.2100004106760025, + "rewards/accuracy_reward_stage2": 0.4683907926082611, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2640 + }, + { + "completion_length": 23.40625, + "epoch": 0.46276502540739445, + "grad_norm": 17.634424260006245, + "kl": 0.1904296875, + "learning_rate": 5.374101980024531e-07, + "loss": 0.0321, + "reward": 1.568893551826477, + "reward_std": 0.24052214622497559, + "rewards/accuracy_reward_stage2": 0.709518551826477, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2641 + }, + { + "completion_length": 14.765625, + "epoch": 0.462940248817242, + "grad_norm": 47.522481370357156, + "kl": 0.42578125, + "learning_rate": 5.372349745926056e-07, + "loss": 0.1699, + "reward": 1.3623605966567993, + "reward_std": 0.22106300294399261, + "rewards/accuracy_reward_stage2": 0.4873605966567993, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2642 + }, + { + "completion_length": 8.5, + "epoch": 0.46311547222708954, + "grad_norm": 24.112626776811034, + "kl": 0.1923828125, + "learning_rate": 5.370597511827581e-07, + "loss": 0.0769, + "reward": 1.6660329103469849, + "reward_std": 0.2319527268409729, + "rewards/accuracy_reward_stage2": 0.6660328507423401, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2643 + }, + { + "completion_length": 8.859375, + "epoch": 0.4632906956369371, + "grad_norm": 21.399072455362678, + "kl": 0.040283203125, + "learning_rate": 5.368845277729105e-07, + "loss": 0.0161, + "reward": 1.5974323749542236, + "reward_std": 0.17615637183189392, + "rewards/accuracy_reward_stage2": 0.5974323749542236, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2644 + }, + { + "completion_length": 12.625, + "epoch": 0.46346591904678464, + "grad_norm": 22.64683277462908, + "kl": 0.294921875, + "learning_rate": 5.367093043630628e-07, + "loss": 0.1177, + "reward": 1.2825133800506592, + "reward_std": 0.27305760979652405, + "rewards/accuracy_reward_stage2": 0.532513439655304, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2645 + }, + { + "completion_length": 9.59375, + "epoch": 0.4636411424566322, + "grad_norm": 18.268335926025458, + "kl": 0.0966796875, + "learning_rate": 5.365340809532153e-07, + "loss": -0.0055, + "reward": 1.5037181377410889, + "reward_std": 0.28322064876556396, + "rewards/accuracy_reward_stage2": 0.5193430185317993, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2646 + }, + { + "completion_length": 7.234375, + "epoch": 0.4638163658664798, + "grad_norm": 19.29371554660658, + "kl": 0.056396484375, + "learning_rate": 5.363588575433677e-07, + "loss": -0.0108, + "reward": 1.640625, + "reward_std": 0.30721205472946167, + "rewards/accuracy_reward_stage2": 0.65625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2647 + }, + { + "completion_length": 8.671875, + "epoch": 0.46399158927632733, + "grad_norm": 12.55328298580008, + "kl": 0.1484375, + "learning_rate": 5.361836341335202e-07, + "loss": 0.0187, + "reward": 1.6145833730697632, + "reward_std": 0.14359083771705627, + "rewards/accuracy_reward_stage2": 0.7708333730697632, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2648 + }, + { + "completion_length": 13.703125, + "epoch": 0.4641668126861749, + "grad_norm": 28.805720632436707, + "kl": 0.341796875, + "learning_rate": 5.360084107236726e-07, + "loss": 0.0532, + "reward": 1.2130721807479858, + "reward_std": 0.22352425754070282, + "rewards/accuracy_reward_stage2": 0.36932215094566345, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2649 + }, + { + "completion_length": 9.296875, + "epoch": 0.4643420360960224, + "grad_norm": 16.244027746694883, + "kl": 0.1435546875, + "learning_rate": 5.358331873138251e-07, + "loss": 0.0446, + "reward": 1.689650058746338, + "reward_std": 0.1842573583126068, + "rewards/accuracy_reward_stage2": 0.7052749991416931, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2650 + }, + { + "completion_length": 10.390625, + "epoch": 0.46451725950587, + "grad_norm": 14.736478953994629, + "kl": 0.142578125, + "learning_rate": 5.356579639039776e-07, + "loss": 0.0281, + "reward": 1.7371633052825928, + "reward_std": 0.16478168964385986, + "rewards/accuracy_reward_stage2": 0.752788245677948, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2651 + }, + { + "completion_length": 28.796875, + "epoch": 0.4646924829157175, + "grad_norm": 18.855857379978904, + "kl": 0.1201171875, + "learning_rate": 5.3548274049413e-07, + "loss": 0.0142, + "reward": 1.503377914428711, + "reward_std": 0.19588232040405273, + "rewards/accuracy_reward_stage2": 0.6440029740333557, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2652 + }, + { + "completion_length": 11.984375, + "epoch": 0.4648677063255651, + "grad_norm": 18.497622511685186, + "kl": 0.1748046875, + "learning_rate": 5.353075170842825e-07, + "loss": -0.0081, + "reward": 1.4114044904708862, + "reward_std": 0.21059125661849976, + "rewards/accuracy_reward_stage2": 0.45827943086624146, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2653 + }, + { + "completion_length": 7.671875, + "epoch": 0.46504292973541267, + "grad_norm": 15.292450968559894, + "kl": 0.1767578125, + "learning_rate": 5.35132293674435e-07, + "loss": -0.0177, + "reward": 1.621706247329712, + "reward_std": 0.2714681327342987, + "rewards/accuracy_reward_stage2": 0.6529563665390015, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2654 + }, + { + "completion_length": 7.5625, + "epoch": 0.4652181531452602, + "grad_norm": 18.294816417859785, + "kl": 0.08154296875, + "learning_rate": 5.349570702645873e-07, + "loss": 0.0326, + "reward": 1.434023141860962, + "reward_std": 0.22572475671768188, + "rewards/accuracy_reward_stage2": 0.43402308225631714, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2655 + }, + { + "completion_length": 11.78125, + "epoch": 0.46539337655510776, + "grad_norm": 20.490285425162387, + "kl": 0.0380859375, + "learning_rate": 5.347818468547398e-07, + "loss": 0.0153, + "reward": 1.6418755054473877, + "reward_std": 0.273904949426651, + "rewards/accuracy_reward_stage2": 0.6418755054473877, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2656 + }, + { + "completion_length": 6.984375, + "epoch": 0.4655685999649553, + "grad_norm": 16.005220815998094, + "kl": 0.359375, + "learning_rate": 5.346066234448922e-07, + "loss": 0.0993, + "reward": 1.5720269680023193, + "reward_std": 0.21252962946891785, + "rewards/accuracy_reward_stage2": 0.8376519083976746, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2657 + }, + { + "completion_length": 11.59375, + "epoch": 0.46574382337480286, + "grad_norm": 18.98961650790962, + "kl": 0.05224609375, + "learning_rate": 5.344314000350446e-07, + "loss": 0.021, + "reward": 1.7668068408966064, + "reward_std": 0.23004046082496643, + "rewards/accuracy_reward_stage2": 0.7668067812919617, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2658 + }, + { + "completion_length": 8.140625, + "epoch": 0.4659190467846504, + "grad_norm": 15.850391773857606, + "kl": 0.177734375, + "learning_rate": 5.342561766251971e-07, + "loss": -0.0365, + "reward": 1.7493340969085693, + "reward_std": 0.26293256878852844, + "rewards/accuracy_reward_stage2": 0.7962090373039246, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2659 + }, + { + "completion_length": 7.515625, + "epoch": 0.466094270194498, + "grad_norm": 16.976524700254885, + "kl": 0.1396484375, + "learning_rate": 5.340809532153495e-07, + "loss": -0.017, + "reward": 1.4632868766784668, + "reward_std": 0.3231024742126465, + "rewards/accuracy_reward_stage2": 0.494536817073822, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2660 + }, + { + "completion_length": 10.125, + "epoch": 0.46626949360434555, + "grad_norm": 14.97834778598425, + "kl": 0.10888671875, + "learning_rate": 5.33905729805502e-07, + "loss": -0.0117, + "reward": 1.3670856952667236, + "reward_std": 0.11902426183223724, + "rewards/accuracy_reward_stage2": 0.5233356952667236, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2661 + }, + { + "completion_length": 12.609375, + "epoch": 0.4664447170141931, + "grad_norm": 24.159374464410256, + "kl": 0.392578125, + "learning_rate": 5.337305063956545e-07, + "loss": 0.1385, + "reward": 1.3967511653900146, + "reward_std": 0.274562805891037, + "rewards/accuracy_reward_stage2": 0.6780011653900146, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2662 + }, + { + "completion_length": 12.78125, + "epoch": 0.46661994042404065, + "grad_norm": 17.379055498809137, + "kl": 0.1689453125, + "learning_rate": 5.335552829858069e-07, + "loss": -0.0268, + "reward": 1.5766353607177734, + "reward_std": 0.13020777702331543, + "rewards/accuracy_reward_stage2": 0.6235103011131287, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2663 + }, + { + "completion_length": 9.4375, + "epoch": 0.4667951638338882, + "grad_norm": 19.81851006216102, + "kl": 0.09521484375, + "learning_rate": 5.333800595759594e-07, + "loss": 0.0381, + "reward": 1.2878808975219727, + "reward_std": 0.16528277099132538, + "rewards/accuracy_reward_stage2": 0.28788089752197266, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2664 + }, + { + "completion_length": 9.796875, + "epoch": 0.46697038724373574, + "grad_norm": 22.755169857106104, + "kl": 0.26171875, + "learning_rate": 5.332048361661117e-07, + "loss": 0.0705, + "reward": 1.0916426181793213, + "reward_std": 0.2884361147880554, + "rewards/accuracy_reward_stage2": 0.35726767778396606, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2665 + }, + { + "completion_length": 8.078125, + "epoch": 0.46714561065358334, + "grad_norm": 14.03332775424425, + "kl": 0.10791015625, + "learning_rate": 5.330296127562642e-07, + "loss": 0.0075, + "reward": 1.8164703845977783, + "reward_std": 0.17855873703956604, + "rewards/accuracy_reward_stage2": 0.8320953845977783, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2666 + }, + { + "completion_length": 9.265625, + "epoch": 0.4673208340634309, + "grad_norm": 21.478285987226695, + "kl": 0.047607421875, + "learning_rate": 5.328543893464167e-07, + "loss": 0.0191, + "reward": 1.6493043899536133, + "reward_std": 0.20907220244407654, + "rewards/accuracy_reward_stage2": 0.6493044495582581, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2667 + }, + { + "completion_length": 17.453125, + "epoch": 0.46749605747327844, + "grad_norm": 18.982904311577798, + "kl": 0.04248046875, + "learning_rate": 5.326791659365691e-07, + "loss": 0.0171, + "reward": 1.5038068294525146, + "reward_std": 0.24129807949066162, + "rewards/accuracy_reward_stage2": 0.5038068294525146, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2668 + }, + { + "completion_length": 10.21875, + "epoch": 0.467671280883126, + "grad_norm": 18.579744416225406, + "kl": 0.08837890625, + "learning_rate": 5.325039425267216e-07, + "loss": -0.0088, + "reward": 1.5844638347625732, + "reward_std": 0.32090964913368225, + "rewards/accuracy_reward_stage2": 0.6000887751579285, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2669 + }, + { + "completion_length": 8.625, + "epoch": 0.46784650429297353, + "grad_norm": 18.892868541506303, + "kl": 0.2080078125, + "learning_rate": 5.323287191168739e-07, + "loss": 0.0829, + "reward": 1.334208369255066, + "reward_std": 0.2512897253036499, + "rewards/accuracy_reward_stage2": 0.45920833945274353, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2670 + }, + { + "completion_length": 7.5, + "epoch": 0.4680217277028211, + "grad_norm": 19.860807173605743, + "kl": 0.11181640625, + "learning_rate": 5.321534957070264e-07, + "loss": 0.0006, + "reward": 1.6716079711914062, + "reward_std": 0.2117011994123459, + "rewards/accuracy_reward_stage2": 0.6872329115867615, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2671 + }, + { + "completion_length": 7.65625, + "epoch": 0.4681969511126687, + "grad_norm": 12.98836942773075, + "kl": 0.10546875, + "learning_rate": 5.319782722971789e-07, + "loss": -0.0019, + "reward": 1.8471788167953491, + "reward_std": 0.20064638555049896, + "rewards/accuracy_reward_stage2": 0.8628038167953491, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2672 + }, + { + "completion_length": 9.890625, + "epoch": 0.4683721745225162, + "grad_norm": 16.79316864521017, + "kl": 0.11572265625, + "learning_rate": 5.318030488873313e-07, + "loss": 0.0179, + "reward": 1.1738388538360596, + "reward_std": 0.22742962837219238, + "rewards/accuracy_reward_stage2": 0.33008885383605957, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2673 + }, + { + "completion_length": 10.71875, + "epoch": 0.4685473979323638, + "grad_norm": 15.028835319480741, + "kl": 0.09521484375, + "learning_rate": 5.316278254774837e-07, + "loss": 0.0445, + "reward": 1.4867298603057861, + "reward_std": 0.1454581469297409, + "rewards/accuracy_reward_stage2": 0.6117299199104309, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2674 + }, + { + "completion_length": 12.671875, + "epoch": 0.4687226213422113, + "grad_norm": 18.414379794358428, + "kl": 0.1591796875, + "learning_rate": 5.314526020676362e-07, + "loss": -0.0015, + "reward": 1.24593985080719, + "reward_std": 0.25791746377944946, + "rewards/accuracy_reward_stage2": 0.2771899104118347, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2675 + }, + { + "completion_length": 9.078125, + "epoch": 0.46889784475205887, + "grad_norm": 17.428660061327285, + "kl": 0.1435546875, + "learning_rate": 5.312773786577886e-07, + "loss": -0.0857, + "reward": 1.5682522058486938, + "reward_std": 0.3257772922515869, + "rewards/accuracy_reward_stage2": 0.6307522058486938, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2676 + }, + { + "completion_length": 7.984375, + "epoch": 0.4690730681619064, + "grad_norm": 15.516788823611714, + "kl": 0.2060546875, + "learning_rate": 5.311021552479411e-07, + "loss": 0.041, + "reward": 1.3916726112365723, + "reward_std": 0.23419056832790375, + "rewards/accuracy_reward_stage2": 0.5322976112365723, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2677 + }, + { + "completion_length": 14.671875, + "epoch": 0.46924829157175396, + "grad_norm": 18.35877218792254, + "kl": 0.173828125, + "learning_rate": 5.309269318380935e-07, + "loss": 0.0084, + "reward": 1.2911961078643799, + "reward_std": 0.1765722930431366, + "rewards/accuracy_reward_stage2": 0.4474460780620575, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2678 + }, + { + "completion_length": 9.5, + "epoch": 0.46942351498160156, + "grad_norm": 8.312887165929723, + "kl": 0.1572265625, + "learning_rate": 5.30751708428246e-07, + "loss": 0.0626, + "reward": 1.453125, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward_stage2": 0.578125, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2679 + }, + { + "completion_length": 8.375, + "epoch": 0.4695987383914491, + "grad_norm": 16.46782716326156, + "kl": 0.08447265625, + "learning_rate": 5.305764850183985e-07, + "loss": -0.0258, + "reward": 1.4971678256988525, + "reward_std": 0.23257961869239807, + "rewards/accuracy_reward_stage2": 0.5284177660942078, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2680 + }, + { + "completion_length": 9.078125, + "epoch": 0.46977396180129666, + "grad_norm": 19.04409731102832, + "kl": 0.11962890625, + "learning_rate": 5.304012616085509e-07, + "loss": -0.0398, + "reward": 1.68888258934021, + "reward_std": 0.34067055583000183, + "rewards/accuracy_reward_stage2": 0.72013258934021, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2681 + }, + { + "completion_length": 6.578125, + "epoch": 0.4699491852111442, + "grad_norm": 20.867960532935875, + "kl": 0.28515625, + "learning_rate": 5.302260381987034e-07, + "loss": 0.0963, + "reward": 1.5492311716079712, + "reward_std": 0.19412344694137573, + "rewards/accuracy_reward_stage2": 0.6898561716079712, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2682 + }, + { + "completion_length": 11.78125, + "epoch": 0.47012440862099175, + "grad_norm": 16.12013757010351, + "kl": 0.091796875, + "learning_rate": 5.300508147888558e-07, + "loss": 0.0368, + "reward": 1.4084248542785645, + "reward_std": 0.1533108353614807, + "rewards/accuracy_reward_stage2": 0.5334248542785645, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2683 + }, + { + "completion_length": 6.703125, + "epoch": 0.4702996320308393, + "grad_norm": 19.017046076157825, + "kl": 0.140625, + "learning_rate": 5.298755913790081e-07, + "loss": -0.0103, + "reward": 1.6729154586791992, + "reward_std": 0.2343599647283554, + "rewards/accuracy_reward_stage2": 0.7041655778884888, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2684 + }, + { + "completion_length": 10.15625, + "epoch": 0.4704748554406869, + "grad_norm": 20.280683165766458, + "kl": 0.1396484375, + "learning_rate": 5.297003679691606e-07, + "loss": 0.0559, + "reward": 1.545297384262085, + "reward_std": 0.14967718720436096, + "rewards/accuracy_reward_stage2": 0.5452974438667297, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2685 + }, + { + "completion_length": 7.140625, + "epoch": 0.47065007885053445, + "grad_norm": 18.72214225563852, + "kl": 0.189453125, + "learning_rate": 5.29525144559313e-07, + "loss": -0.0698, + "reward": 1.581083059310913, + "reward_std": 0.326572060585022, + "rewards/accuracy_reward_stage2": 0.6435831189155579, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2686 + }, + { + "completion_length": 11.5625, + "epoch": 0.470825302260382, + "grad_norm": 18.248174147174804, + "kl": 0.2275390625, + "learning_rate": 5.293499211494655e-07, + "loss": 0.0168, + "reward": 1.6100983619689941, + "reward_std": 0.376659095287323, + "rewards/accuracy_reward_stage2": 0.6413483619689941, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2687 + }, + { + "completion_length": 7.390625, + "epoch": 0.47100052567022954, + "grad_norm": 17.62123757873847, + "kl": 0.12451171875, + "learning_rate": 5.29174697739618e-07, + "loss": -0.1241, + "reward": 1.5080466270446777, + "reward_std": 0.36043059825897217, + "rewards/accuracy_reward_stage2": 0.5705466866493225, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2688 + }, + { + "completion_length": 10.34375, + "epoch": 0.4711757490800771, + "grad_norm": 19.985883149588634, + "kl": 0.2470703125, + "learning_rate": 5.289994743297704e-07, + "loss": 0.0313, + "reward": 1.5201334953308105, + "reward_std": 0.16913798451423645, + "rewards/accuracy_reward_stage2": 0.6763834357261658, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2689 + }, + { + "completion_length": 10.375, + "epoch": 0.47135097248992464, + "grad_norm": 14.34850276613726, + "kl": 0.25390625, + "learning_rate": 5.288242509199229e-07, + "loss": 0.0128, + "reward": 1.7002893686294556, + "reward_std": 0.21416278183460236, + "rewards/accuracy_reward_stage2": 0.8565393090248108, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2690 + }, + { + "completion_length": 12.828125, + "epoch": 0.4715261958997722, + "grad_norm": 25.408458323983105, + "kl": 0.279296875, + "learning_rate": 5.286490275100754e-07, + "loss": 0.0334, + "reward": 1.489842414855957, + "reward_std": 0.1563968062400818, + "rewards/accuracy_reward_stage2": 0.6460922956466675, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2691 + }, + { + "completion_length": 7.015625, + "epoch": 0.4717014193096198, + "grad_norm": 11.780555474494543, + "kl": 0.19140625, + "learning_rate": 5.284738041002278e-07, + "loss": -0.0559, + "reward": 1.4947917461395264, + "reward_std": 0.2120075523853302, + "rewards/accuracy_reward_stage2": 0.5416666865348816, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2692 + }, + { + "completion_length": 12.078125, + "epoch": 0.47187664271946733, + "grad_norm": 15.257207088001591, + "kl": 0.1552734375, + "learning_rate": 5.282985806903803e-07, + "loss": 0.0179, + "reward": 1.0771777629852295, + "reward_std": 0.24159622192382812, + "rewards/accuracy_reward_stage2": 0.21780279278755188, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2693 + }, + { + "completion_length": 9.0, + "epoch": 0.4720518661293149, + "grad_norm": 21.52324411905624, + "kl": 0.0869140625, + "learning_rate": 5.281233572805326e-07, + "loss": 0.0251, + "reward": 1.5122637748718262, + "reward_std": 0.2604824900627136, + "rewards/accuracy_reward_stage2": 0.5278887152671814, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2694 + }, + { + "completion_length": 9.71875, + "epoch": 0.4722270895391624, + "grad_norm": 16.36077000355877, + "kl": 0.06982421875, + "learning_rate": 5.279481338706851e-07, + "loss": -0.0162, + "reward": 1.5569807291030884, + "reward_std": 0.21449267864227295, + "rewards/accuracy_reward_stage2": 0.5726057291030884, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2695 + }, + { + "completion_length": 10.421875, + "epoch": 0.47240231294901, + "grad_norm": 17.87484625185122, + "kl": 0.236328125, + "learning_rate": 5.277729104608375e-07, + "loss": -0.0391, + "reward": 1.5181585550308228, + "reward_std": 0.32765793800354004, + "rewards/accuracy_reward_stage2": 0.5806585550308228, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2696 + }, + { + "completion_length": 10.109375, + "epoch": 0.4725775363588575, + "grad_norm": 18.244962917000976, + "kl": 0.1875, + "learning_rate": 5.275976870509899e-07, + "loss": -0.0004, + "reward": 1.5278222560882568, + "reward_std": 0.2655717730522156, + "rewards/accuracy_reward_stage2": 0.5590722560882568, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2697 + }, + { + "completion_length": 9.84375, + "epoch": 0.4727527597687051, + "grad_norm": 23.181774765042697, + "kl": 0.283203125, + "learning_rate": 5.274224636411424e-07, + "loss": 0.0402, + "reward": 1.4854509830474854, + "reward_std": 0.2893902063369751, + "rewards/accuracy_reward_stage2": 0.6417009830474854, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2698 + }, + { + "completion_length": 9.640625, + "epoch": 0.47292798317855267, + "grad_norm": 13.396986252061001, + "kl": 0.083984375, + "learning_rate": 5.272472402312949e-07, + "loss": 0.0336, + "reward": 1.7557774782180786, + "reward_std": 0.19559994339942932, + "rewards/accuracy_reward_stage2": 0.7557775378227234, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2699 + }, + { + "completion_length": 6.828125, + "epoch": 0.4731032065884002, + "grad_norm": 35.2728621141681, + "kl": 0.15625, + "learning_rate": 5.270720168214473e-07, + "loss": 0.0181, + "reward": 1.7280571460723877, + "reward_std": 0.1603502482175827, + "rewards/accuracy_reward_stage2": 0.7436821460723877, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2700 + }, + { + "completion_length": 10.5, + "epoch": 0.47327842999824776, + "grad_norm": 16.561588817945058, + "kl": 0.025634765625, + "learning_rate": 5.268967934115998e-07, + "loss": 0.0102, + "reward": 1.59375, + "reward_std": 0.22461533546447754, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2701 + }, + { + "completion_length": 9.71875, + "epoch": 0.4734536534080953, + "grad_norm": 18.49027431198656, + "kl": 0.18359375, + "learning_rate": 5.267215700017522e-07, + "loss": 0.0291, + "reward": 1.696709156036377, + "reward_std": 0.3068729639053345, + "rewards/accuracy_reward_stage2": 0.712334156036377, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2702 + }, + { + "completion_length": 10.375, + "epoch": 0.47362887681794286, + "grad_norm": 17.52710806280741, + "kl": 0.32421875, + "learning_rate": 5.265463465919047e-07, + "loss": -0.0549, + "reward": 1.6311153173446655, + "reward_std": 0.28500238060951233, + "rewards/accuracy_reward_stage2": 0.7248653173446655, + "rewards/format_reward_stage1_pointerpad": 0.90625, + "scores/accuracy_reward_stage2": 0.90625, + "step": 2703 + }, + { + "completion_length": 13.265625, + "epoch": 0.47380410022779046, + "grad_norm": 14.301506601984673, + "kl": 0.08251953125, + "learning_rate": 5.263711231820572e-07, + "loss": -0.0444, + "reward": 1.4702627658843994, + "reward_std": 0.14659681916236877, + "rewards/accuracy_reward_stage2": 0.5015127658843994, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2704 + }, + { + "completion_length": 10.0, + "epoch": 0.473979323637638, + "grad_norm": 14.95292873090198, + "kl": 0.07958984375, + "learning_rate": 5.261958997722095e-07, + "loss": -0.0122, + "reward": 1.3216766119003296, + "reward_std": 0.19845643639564514, + "rewards/accuracy_reward_stage2": 0.3373016119003296, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2705 + }, + { + "completion_length": 12.875, + "epoch": 0.47415454704748555, + "grad_norm": 17.955246882117542, + "kl": 0.1787109375, + "learning_rate": 5.26020676362362e-07, + "loss": 0.0042, + "reward": 1.5646817684173584, + "reward_std": 0.1885027289390564, + "rewards/accuracy_reward_stage2": 0.7209317684173584, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2706 + }, + { + "completion_length": 8.328125, + "epoch": 0.4743297704573331, + "grad_norm": 19.838909273934828, + "kl": 0.09375, + "learning_rate": 5.258454529525145e-07, + "loss": -0.051, + "reward": 1.4685415029525757, + "reward_std": 0.23180589079856873, + "rewards/accuracy_reward_stage2": 0.4997915029525757, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2707 + }, + { + "completion_length": 7.28125, + "epoch": 0.47450499386718065, + "grad_norm": 19.38312558603043, + "kl": 0.142578125, + "learning_rate": 5.256702295426669e-07, + "loss": 0.0572, + "reward": 1.6531894207000732, + "reward_std": 0.3191227316856384, + "rewards/accuracy_reward_stage2": 0.6531893610954285, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2708 + }, + { + "completion_length": 5.28125, + "epoch": 0.4746802172770282, + "grad_norm": 15.183205801849777, + "kl": 0.109375, + "learning_rate": 5.254950061328193e-07, + "loss": 0.0121, + "reward": 1.630523681640625, + "reward_std": 0.2849048376083374, + "rewards/accuracy_reward_stage2": 0.6461487412452698, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2709 + }, + { + "completion_length": 10.671875, + "epoch": 0.47485544068687574, + "grad_norm": 17.944530022570852, + "kl": 0.2099609375, + "learning_rate": 5.253197827229717e-07, + "loss": -0.0046, + "reward": 1.6410465240478516, + "reward_std": 0.3454177677631378, + "rewards/accuracy_reward_stage2": 0.7972965836524963, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2710 + }, + { + "completion_length": 10.03125, + "epoch": 0.47503066409672334, + "grad_norm": 14.387584231770411, + "kl": 0.0537109375, + "learning_rate": 5.251445593131242e-07, + "loss": 0.0215, + "reward": 1.7751755714416504, + "reward_std": 0.06710825860500336, + "rewards/accuracy_reward_stage2": 0.7751755118370056, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2711 + }, + { + "completion_length": 8.5, + "epoch": 0.4752058875065709, + "grad_norm": 18.35149562175783, + "kl": 0.107421875, + "learning_rate": 5.249693359032767e-07, + "loss": -0.0011, + "reward": 1.4777603149414062, + "reward_std": 0.2660313844680786, + "rewards/accuracy_reward_stage2": 0.509010374546051, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2712 + }, + { + "completion_length": 9.109375, + "epoch": 0.47538111091641844, + "grad_norm": 22.14344971109736, + "kl": 0.1328125, + "learning_rate": 5.247941124934291e-07, + "loss": 0.0091, + "reward": 1.499030590057373, + "reward_std": 0.26058027148246765, + "rewards/accuracy_reward_stage2": 0.514655590057373, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2713 + }, + { + "completion_length": 8.09375, + "epoch": 0.475556334326266, + "grad_norm": 18.117297272818654, + "kl": 0.08203125, + "learning_rate": 5.246188890835815e-07, + "loss": -0.0113, + "reward": 1.8141202926635742, + "reward_std": 0.22963713109493256, + "rewards/accuracy_reward_stage2": 0.8297452926635742, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2714 + }, + { + "completion_length": 7.09375, + "epoch": 0.47573155773611353, + "grad_norm": 21.12299950414213, + "kl": 0.0556640625, + "learning_rate": 5.24443665673734e-07, + "loss": 0.0223, + "reward": 1.625364899635315, + "reward_std": 0.26791059970855713, + "rewards/accuracy_reward_stage2": 0.6253648996353149, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2715 + }, + { + "completion_length": 9.46875, + "epoch": 0.4759067811459611, + "grad_norm": 18.622730234407964, + "kl": 0.15234375, + "learning_rate": 5.242684422638864e-07, + "loss": 0.0202, + "reward": 1.7383769750595093, + "reward_std": 0.2080550491809845, + "rewards/accuracy_reward_stage2": 0.754002034664154, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2716 + }, + { + "completion_length": 17.484375, + "epoch": 0.4760820045558087, + "grad_norm": 19.294998296148034, + "kl": 0.0257568359375, + "learning_rate": 5.240932188540389e-07, + "loss": 0.0103, + "reward": 1.5087703466415405, + "reward_std": 0.2145693451166153, + "rewards/accuracy_reward_stage2": 0.5087703466415405, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2717 + }, + { + "completion_length": 14.46875, + "epoch": 0.4762572279656562, + "grad_norm": 16.976148575535277, + "kl": 0.1328125, + "learning_rate": 5.239179954441913e-07, + "loss": 0.0185, + "reward": 1.5493242740631104, + "reward_std": 0.19351491332054138, + "rewards/accuracy_reward_stage2": 0.5649492144584656, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2718 + }, + { + "completion_length": 13.140625, + "epoch": 0.4764324513755038, + "grad_norm": 455.72001856064566, + "kl": 2.078125, + "learning_rate": 5.237427720343438e-07, + "loss": 0.838, + "reward": 1.220144510269165, + "reward_std": 0.18339301645755768, + "rewards/accuracy_reward_stage2": 0.34514448046684265, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2719 + }, + { + "completion_length": 9.359375, + "epoch": 0.4766076747853513, + "grad_norm": 15.895693514019884, + "kl": 0.039794921875, + "learning_rate": 5.235675486244963e-07, + "loss": 0.016, + "reward": 1.7980906963348389, + "reward_std": 0.17123383283615112, + "rewards/accuracy_reward_stage2": 0.7980905771255493, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2720 + }, + { + "completion_length": 26.078125, + "epoch": 0.47678289819519887, + "grad_norm": 20.814016048937567, + "kl": 0.056640625, + "learning_rate": 5.233923252146486e-07, + "loss": 0.0227, + "reward": 1.6870832443237305, + "reward_std": 0.14134840667247772, + "rewards/accuracy_reward_stage2": 0.6870833039283752, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2721 + }, + { + "completion_length": 10.125, + "epoch": 0.4769581216050464, + "grad_norm": 19.542106632980662, + "kl": 0.158203125, + "learning_rate": 5.232171018048011e-07, + "loss": 0.0333, + "reward": 1.585869312286377, + "reward_std": 0.27999138832092285, + "rewards/accuracy_reward_stage2": 0.601494312286377, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2722 + }, + { + "completion_length": 14.953125, + "epoch": 0.477133345014894, + "grad_norm": 18.92187880218421, + "kl": 0.049560546875, + "learning_rate": 5.230418783949536e-07, + "loss": -0.0244, + "reward": 1.6283730268478394, + "reward_std": 0.20701447129249573, + "rewards/accuracy_reward_stage2": 0.6439980268478394, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2723 + }, + { + "completion_length": 9.421875, + "epoch": 0.47730856842474156, + "grad_norm": 17.871778532303395, + "kl": 0.12890625, + "learning_rate": 5.228666549851059e-07, + "loss": -0.0161, + "reward": 1.4331648349761963, + "reward_std": 0.21878674626350403, + "rewards/accuracy_reward_stage2": 0.7144148945808411, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2724 + }, + { + "completion_length": 14.25, + "epoch": 0.4774837918345891, + "grad_norm": 23.580290074841233, + "kl": 0.18359375, + "learning_rate": 5.226914315752584e-07, + "loss": 0.0391, + "reward": 1.5170743465423584, + "reward_std": 0.3067265748977661, + "rewards/accuracy_reward_stage2": 0.5326994061470032, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2725 + }, + { + "completion_length": 9.375, + "epoch": 0.47765901524443666, + "grad_norm": 12.153889116582954, + "kl": 0.142578125, + "learning_rate": 5.225162081654108e-07, + "loss": -0.0313, + "reward": 1.4144365787506104, + "reward_std": 0.24257135391235352, + "rewards/accuracy_reward_stage2": 0.5706866383552551, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2726 + }, + { + "completion_length": 25.015625, + "epoch": 0.4778342386542842, + "grad_norm": 24.166691606219224, + "kl": 0.1279296875, + "learning_rate": 5.223409847555633e-07, + "loss": -0.022, + "reward": 1.6062610149383545, + "reward_std": 0.19640934467315674, + "rewards/accuracy_reward_stage2": 0.6375110149383545, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2727 + }, + { + "completion_length": 12.734375, + "epoch": 0.47800946206413175, + "grad_norm": 15.49223977916494, + "kl": 0.109375, + "learning_rate": 5.221657613457158e-07, + "loss": 0.0146, + "reward": 1.351271390914917, + "reward_std": 0.15009453892707825, + "rewards/accuracy_reward_stage2": 0.4918963313102722, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2728 + }, + { + "completion_length": 10.28125, + "epoch": 0.4781846854739793, + "grad_norm": 18.615076012997733, + "kl": 0.1552734375, + "learning_rate": 5.219905379358682e-07, + "loss": 0.0306, + "reward": 1.407578706741333, + "reward_std": 0.19074060022830963, + "rewards/accuracy_reward_stage2": 0.4232037663459778, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2729 + }, + { + "completion_length": 14.703125, + "epoch": 0.4783599088838269, + "grad_norm": 18.12338121605169, + "kl": 0.275390625, + "learning_rate": 5.218153145260207e-07, + "loss": -0.0129, + "reward": 1.5154175758361816, + "reward_std": 0.34471186995506287, + "rewards/accuracy_reward_stage2": 0.5622925758361816, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2730 + }, + { + "completion_length": 8.5625, + "epoch": 0.47853513229367445, + "grad_norm": 16.168990286816936, + "kl": 0.21875, + "learning_rate": 5.216400911161732e-07, + "loss": 0.0064, + "reward": 1.820338249206543, + "reward_std": 0.19135203957557678, + "rewards/accuracy_reward_stage2": 0.8515881896018982, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2731 + }, + { + "completion_length": 6.75, + "epoch": 0.478710355703522, + "grad_norm": 18.149000957856924, + "kl": 0.134765625, + "learning_rate": 5.214648677063256e-07, + "loss": 0.0191, + "reward": 1.583438515663147, + "reward_std": 0.2895791232585907, + "rewards/accuracy_reward_stage2": 0.5990635752677917, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2732 + }, + { + "completion_length": 6.875, + "epoch": 0.47888557911336954, + "grad_norm": 16.452446078960264, + "kl": 0.06689453125, + "learning_rate": 5.212896442964781e-07, + "loss": 0.0267, + "reward": 1.7636473178863525, + "reward_std": 0.22638459503650665, + "rewards/accuracy_reward_stage2": 0.7636473178863525, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2733 + }, + { + "completion_length": 14.546875, + "epoch": 0.4790608025232171, + "grad_norm": 19.049273000089443, + "kl": 0.14453125, + "learning_rate": 5.211144208866303e-07, + "loss": -0.0298, + "reward": 1.4528450965881348, + "reward_std": 0.2527550458908081, + "rewards/accuracy_reward_stage2": 0.6090949773788452, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2734 + }, + { + "completion_length": 10.90625, + "epoch": 0.47923602593306464, + "grad_norm": 13.796239244227346, + "kl": 0.1455078125, + "learning_rate": 5.209391974767828e-07, + "loss": 0.0142, + "reward": 1.4563570022583008, + "reward_std": 0.16542761027812958, + "rewards/accuracy_reward_stage2": 0.596981942653656, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2735 + }, + { + "completion_length": 12.46875, + "epoch": 0.47941124934291224, + "grad_norm": 16.062767527991255, + "kl": 0.028564453125, + "learning_rate": 5.207639740669353e-07, + "loss": 0.0114, + "reward": 1.559941053390503, + "reward_std": 0.18492963910102844, + "rewards/accuracy_reward_stage2": 0.5599411129951477, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2736 + }, + { + "completion_length": 9.21875, + "epoch": 0.4795864727527598, + "grad_norm": 12.052747871748826, + "kl": 0.10400390625, + "learning_rate": 5.205887506570877e-07, + "loss": -0.0026, + "reward": 1.2457869052886963, + "reward_std": 0.1598706841468811, + "rewards/accuracy_reward_stage2": 0.26141196489334106, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2737 + }, + { + "completion_length": 9.4375, + "epoch": 0.47976169616260733, + "grad_norm": 17.36749522465019, + "kl": 0.2392578125, + "learning_rate": 5.204135272472402e-07, + "loss": 0.052, + "reward": 1.4359642267227173, + "reward_std": 0.2153088003396988, + "rewards/accuracy_reward_stage2": 0.5765892267227173, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2738 + }, + { + "completion_length": 7.8125, + "epoch": 0.4799369195724549, + "grad_norm": 18.251925809945238, + "kl": 0.1376953125, + "learning_rate": 5.202383038373927e-07, + "loss": 0.0109, + "reward": 1.615727186203003, + "reward_std": 0.2166653424501419, + "rewards/accuracy_reward_stage2": 0.7563521265983582, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2739 + }, + { + "completion_length": 15.109375, + "epoch": 0.4801121429823024, + "grad_norm": 19.749273852695445, + "kl": 0.0849609375, + "learning_rate": 5.200630804275451e-07, + "loss": 0.0339, + "reward": 1.3953063488006592, + "reward_std": 0.2997656464576721, + "rewards/accuracy_reward_stage2": 0.3953062891960144, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2740 + }, + { + "completion_length": 7.046875, + "epoch": 0.48028736639215, + "grad_norm": 17.631041499607417, + "kl": 0.185546875, + "learning_rate": 5.198878570176976e-07, + "loss": 0.0303, + "reward": 1.7178881168365479, + "reward_std": 0.12682121992111206, + "rewards/accuracy_reward_stage2": 0.8585132360458374, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2741 + }, + { + "completion_length": 9.953125, + "epoch": 0.4804625898019975, + "grad_norm": 12.619249957281948, + "kl": 0.1328125, + "learning_rate": 5.1971263360785e-07, + "loss": 0.0532, + "reward": 1.65625, + "reward_std": 0.10888782143592834, + "rewards/accuracy_reward_stage2": 0.78125, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2742 + }, + { + "completion_length": 14.015625, + "epoch": 0.4806378132118451, + "grad_norm": 19.162755775127746, + "kl": 0.1103515625, + "learning_rate": 5.195374101980025e-07, + "loss": 0.0018, + "reward": 1.5159637928009033, + "reward_std": 0.2675696015357971, + "rewards/accuracy_reward_stage2": 0.5315887928009033, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2743 + }, + { + "completion_length": 6.6875, + "epoch": 0.48081303662169267, + "grad_norm": 9.318900647205547, + "kl": 0.1181640625, + "learning_rate": 5.19362186788155e-07, + "loss": 0.0182, + "reward": 1.703125, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.71875, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2744 + }, + { + "completion_length": 9.390625, + "epoch": 0.4809882600315402, + "grad_norm": 18.835359657896657, + "kl": 0.2021484375, + "learning_rate": 5.191869633783073e-07, + "loss": 0.052, + "reward": 1.319472074508667, + "reward_std": 0.24716606736183167, + "rewards/accuracy_reward_stage2": 0.4600970447063446, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2745 + }, + { + "completion_length": 8.390625, + "epoch": 0.48116348344138776, + "grad_norm": 23.08480354751511, + "kl": 0.2060546875, + "learning_rate": 5.190117399684598e-07, + "loss": 0.0439, + "reward": 1.4809374809265137, + "reward_std": 0.27444180846214294, + "rewards/accuracy_reward_stage2": 0.6371875405311584, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2746 + }, + { + "completion_length": 11.34375, + "epoch": 0.4813387068512353, + "grad_norm": 20.294096164081342, + "kl": 0.150390625, + "learning_rate": 5.188365165586121e-07, + "loss": 0.0319, + "reward": 1.4782813787460327, + "reward_std": 0.14485354721546173, + "rewards/accuracy_reward_stage2": 0.6189062595367432, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2747 + }, + { + "completion_length": 7.3125, + "epoch": 0.48151393026108286, + "grad_norm": 21.102713733518698, + "kl": 0.10791015625, + "learning_rate": 5.186612931487646e-07, + "loss": 0.0432, + "reward": 1.6589291095733643, + "reward_std": 0.172014981508255, + "rewards/accuracy_reward_stage2": 0.6589291095733643, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2748 + }, + { + "completion_length": 8.765625, + "epoch": 0.48168915367093046, + "grad_norm": 18.910051182893728, + "kl": 0.1259765625, + "learning_rate": 5.184860697389171e-07, + "loss": 0.0569, + "reward": 1.5960662364959717, + "reward_std": 0.12384433299303055, + "rewards/accuracy_reward_stage2": 0.7210662364959717, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2749 + }, + { + "completion_length": 14.671875, + "epoch": 0.481864377080778, + "grad_norm": 17.660883704124444, + "kl": 0.146484375, + "learning_rate": 5.183108463290695e-07, + "loss": -0.0074, + "reward": 1.6546704769134521, + "reward_std": 0.31043297052383423, + "rewards/accuracy_reward_stage2": 0.6859204769134521, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2750 + }, + { + "completion_length": 10.546875, + "epoch": 0.48203960049062555, + "grad_norm": 27.94560857989135, + "kl": 0.2578125, + "learning_rate": 5.18135622919222e-07, + "loss": 0.0145, + "reward": 1.5631669759750366, + "reward_std": 0.3198142647743225, + "rewards/accuracy_reward_stage2": 0.5944169759750366, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2751 + }, + { + "completion_length": 10.15625, + "epoch": 0.4822148239004731, + "grad_norm": 22.23540615294016, + "kl": 0.1611328125, + "learning_rate": 5.179603995093745e-07, + "loss": 0.0311, + "reward": 1.7261333465576172, + "reward_std": 0.28550904989242554, + "rewards/accuracy_reward_stage2": 0.7417583465576172, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2752 + }, + { + "completion_length": 8.40625, + "epoch": 0.48239004731032065, + "grad_norm": 18.09184589875181, + "kl": 0.2353515625, + "learning_rate": 5.177851760995269e-07, + "loss": 0.0502, + "reward": 1.505954623222351, + "reward_std": 0.26220905780792236, + "rewards/accuracy_reward_stage2": 0.6465796232223511, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2753 + }, + { + "completion_length": 22.03125, + "epoch": 0.4825652707201682, + "grad_norm": 22.332245761929883, + "kl": 0.10302734375, + "learning_rate": 5.176099526896793e-07, + "loss": 0.0127, + "reward": 1.3112815618515015, + "reward_std": 0.19790546596050262, + "rewards/accuracy_reward_stage2": 0.4362815022468567, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2754 + }, + { + "completion_length": 10.34375, + "epoch": 0.4827404941300158, + "grad_norm": 20.220182184480546, + "kl": 0.06591796875, + "learning_rate": 5.174347292798317e-07, + "loss": 0.0265, + "reward": 1.569887399673462, + "reward_std": 0.1608072817325592, + "rewards/accuracy_reward_stage2": 0.5698874592781067, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2755 + }, + { + "completion_length": 8.234375, + "epoch": 0.48291571753986334, + "grad_norm": 21.54860264069522, + "kl": 0.039794921875, + "learning_rate": 5.172595058699842e-07, + "loss": 0.0159, + "reward": 1.6516201496124268, + "reward_std": 0.24412159621715546, + "rewards/accuracy_reward_stage2": 0.776620090007782, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2756 + }, + { + "completion_length": 10.515625, + "epoch": 0.4830909409497109, + "grad_norm": 16.106284578983384, + "kl": 0.2236328125, + "learning_rate": 5.170842824601367e-07, + "loss": -0.0316, + "reward": 1.4267677068710327, + "reward_std": 0.24894431233406067, + "rewards/accuracy_reward_stage2": 0.4892677068710327, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2757 + }, + { + "completion_length": 13.125, + "epoch": 0.48326616435955844, + "grad_norm": 16.33349841945425, + "kl": 0.064453125, + "learning_rate": 5.169090590502891e-07, + "loss": -0.0398, + "reward": 1.5895304679870605, + "reward_std": 0.216194748878479, + "rewards/accuracy_reward_stage2": 0.6207805275917053, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2758 + }, + { + "completion_length": 9.390625, + "epoch": 0.483441387769406, + "grad_norm": 16.146027542899503, + "kl": 0.1474609375, + "learning_rate": 5.167338356404415e-07, + "loss": 0.0476, + "reward": 1.5741090774536133, + "reward_std": 0.11098361015319824, + "rewards/accuracy_reward_stage2": 0.8241090178489685, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2759 + }, + { + "completion_length": 12.21875, + "epoch": 0.48361661117925353, + "grad_norm": 20.163523124388767, + "kl": 0.1259765625, + "learning_rate": 5.16558612230594e-07, + "loss": -0.0227, + "reward": 1.4342626333236694, + "reward_std": 0.21140506863594055, + "rewards/accuracy_reward_stage2": 0.5905126333236694, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2760 + }, + { + "completion_length": 10.921875, + "epoch": 0.4837918345891011, + "grad_norm": 13.666142555657446, + "kl": 0.04052734375, + "learning_rate": 5.163833888207464e-07, + "loss": 0.0163, + "reward": 1.6616389751434326, + "reward_std": 0.09498253464698792, + "rewards/accuracy_reward_stage2": 0.7866389155387878, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2761 + }, + { + "completion_length": 11.171875, + "epoch": 0.4839670579989487, + "grad_norm": 19.951044847589454, + "kl": 0.1416015625, + "learning_rate": 5.162081654108989e-07, + "loss": -0.0053, + "reward": 1.637669324874878, + "reward_std": 0.27671176195144653, + "rewards/accuracy_reward_stage2": 0.6689193248748779, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2762 + }, + { + "completion_length": 9.53125, + "epoch": 0.4841422814087962, + "grad_norm": 20.847489567446733, + "kl": 0.10107421875, + "learning_rate": 5.160329420010512e-07, + "loss": 0.0406, + "reward": 1.5803248882293701, + "reward_std": 0.20699653029441833, + "rewards/accuracy_reward_stage2": 0.7053249478340149, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2763 + }, + { + "completion_length": 8.265625, + "epoch": 0.4843175048186438, + "grad_norm": 18.2142109782604, + "kl": 0.115234375, + "learning_rate": 5.158577185912037e-07, + "loss": 0.0461, + "reward": 1.4986896514892578, + "reward_std": 0.255196750164032, + "rewards/accuracy_reward_stage2": 0.623689591884613, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2764 + }, + { + "completion_length": 9.4375, + "epoch": 0.4844927282284913, + "grad_norm": 8.677932747497366, + "kl": 0.1376953125, + "learning_rate": 5.156824951813562e-07, + "loss": 0.055, + "reward": 1.34375, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward_stage2": 0.46875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2765 + }, + { + "completion_length": 10.40625, + "epoch": 0.48466795163833887, + "grad_norm": 30.869785143898685, + "kl": 0.1259765625, + "learning_rate": 5.155072717715086e-07, + "loss": -0.0505, + "reward": 1.3989291191101074, + "reward_std": 0.253897100687027, + "rewards/accuracy_reward_stage2": 0.5708041787147522, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2766 + }, + { + "completion_length": 9.59375, + "epoch": 0.4848431750481864, + "grad_norm": 12.300031714411467, + "kl": 0.0693359375, + "learning_rate": 5.153320483616611e-07, + "loss": 0.0276, + "reward": 1.6179606914520264, + "reward_std": 0.09867061674594879, + "rewards/accuracy_reward_stage2": 0.6179606914520264, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2767 + }, + { + "completion_length": 7.328125, + "epoch": 0.485018398458034, + "grad_norm": 12.288995085247821, + "kl": 0.1787109375, + "learning_rate": 5.151568249518136e-07, + "loss": -0.0257, + "reward": 1.5489342212677002, + "reward_std": 0.16947272419929504, + "rewards/accuracy_reward_stage2": 0.5958091616630554, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2768 + }, + { + "completion_length": 10.703125, + "epoch": 0.48519362186788156, + "grad_norm": 16.628816013262394, + "kl": 0.09033203125, + "learning_rate": 5.14981601541966e-07, + "loss": -0.0414, + "reward": 1.580203890800476, + "reward_std": 0.21251340210437775, + "rewards/accuracy_reward_stage2": 0.6114539504051208, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2769 + }, + { + "completion_length": 9.8125, + "epoch": 0.4853688452777291, + "grad_norm": 19.879967908938465, + "kl": 0.16796875, + "learning_rate": 5.148063781321185e-07, + "loss": 0.0068, + "reward": 1.3905811309814453, + "reward_std": 0.2664790153503418, + "rewards/accuracy_reward_stage2": 0.5468310117721558, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2770 + }, + { + "completion_length": 9.5, + "epoch": 0.48554406868757666, + "grad_norm": 21.409098111522415, + "kl": 0.177734375, + "learning_rate": 5.146311547222709e-07, + "loss": 0.04, + "reward": 1.4670194387435913, + "reward_std": 0.3582940399646759, + "rewards/accuracy_reward_stage2": 0.6076444387435913, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2771 + }, + { + "completion_length": 10.125, + "epoch": 0.4857192920974242, + "grad_norm": 17.443727373495843, + "kl": 0.0771484375, + "learning_rate": 5.144559313124233e-07, + "loss": -0.0068, + "reward": 1.7224445343017578, + "reward_std": 0.1532498300075531, + "rewards/accuracy_reward_stage2": 0.7380695939064026, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2772 + }, + { + "completion_length": 9.375, + "epoch": 0.48589451550727175, + "grad_norm": 18.096878447544068, + "kl": 0.1962890625, + "learning_rate": 5.142807079025758e-07, + "loss": 0.0119, + "reward": 1.708147406578064, + "reward_std": 0.2741885185241699, + "rewards/accuracy_reward_stage2": 0.7393973469734192, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2773 + }, + { + "completion_length": 11.8125, + "epoch": 0.48606973891711935, + "grad_norm": 23.912948047153062, + "kl": 0.259765625, + "learning_rate": 5.141054844927281e-07, + "loss": 0.0469, + "reward": 1.370512843132019, + "reward_std": 0.3569799065589905, + "rewards/accuracy_reward_stage2": 0.5267627835273743, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2774 + }, + { + "completion_length": 15.046875, + "epoch": 0.4862449623269669, + "grad_norm": 20.040363660948454, + "kl": 0.2275390625, + "learning_rate": 5.139302610828806e-07, + "loss": 0.0851, + "reward": 1.113850712776184, + "reward_std": 0.21950891613960266, + "rewards/accuracy_reward_stage2": 0.3638507127761841, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2775 + }, + { + "completion_length": 7.0625, + "epoch": 0.48642018573681445, + "grad_norm": 17.11712035620011, + "kl": 0.04052734375, + "learning_rate": 5.137550376730331e-07, + "loss": 0.0162, + "reward": 1.7094886302947998, + "reward_std": 0.18887673318386078, + "rewards/accuracy_reward_stage2": 0.709488570690155, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2776 + }, + { + "completion_length": 11.046875, + "epoch": 0.486595409146662, + "grad_norm": 14.330378866431904, + "kl": 0.054443359375, + "learning_rate": 5.135798142631855e-07, + "loss": -0.0223, + "reward": 1.7937867641448975, + "reward_std": 0.1975262314081192, + "rewards/accuracy_reward_stage2": 0.8094117641448975, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2777 + }, + { + "completion_length": 8.875, + "epoch": 0.48677063255650954, + "grad_norm": 14.410387426620687, + "kl": 0.09423828125, + "learning_rate": 5.13404590853338e-07, + "loss": 0.0377, + "reward": 1.3828563690185547, + "reward_std": 0.20289446413516998, + "rewards/accuracy_reward_stage2": 0.3828563094139099, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2778 + }, + { + "completion_length": 10.875, + "epoch": 0.4869458559663571, + "grad_norm": 20.288824451679844, + "kl": 0.2578125, + "learning_rate": 5.132293674434904e-07, + "loss": 0.1027, + "reward": 1.5620172023773193, + "reward_std": 0.21671007573604584, + "rewards/accuracy_reward_stage2": 0.6870173215866089, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2779 + }, + { + "completion_length": 9.96875, + "epoch": 0.48712107937620464, + "grad_norm": 20.68604980109284, + "kl": 0.169921875, + "learning_rate": 5.130541440336429e-07, + "loss": 0.0677, + "reward": 1.5459976196289062, + "reward_std": 0.30775919556617737, + "rewards/accuracy_reward_stage2": 0.6709976196289062, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2780 + }, + { + "completion_length": 11.015625, + "epoch": 0.48729630278605224, + "grad_norm": 17.98928611390982, + "kl": 0.05078125, + "learning_rate": 5.128789206237954e-07, + "loss": -0.0015, + "reward": 1.6310027837753296, + "reward_std": 0.28292495012283325, + "rewards/accuracy_reward_stage2": 0.6466277837753296, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2781 + }, + { + "completion_length": 8.15625, + "epoch": 0.4874715261958998, + "grad_norm": 19.448061377008912, + "kl": 0.2412109375, + "learning_rate": 5.127036972139478e-07, + "loss": 0.0522, + "reward": 1.4846069812774658, + "reward_std": 0.21984370052814484, + "rewards/accuracy_reward_stage2": 0.6252319812774658, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2782 + }, + { + "completion_length": 10.046875, + "epoch": 0.48764674960574733, + "grad_norm": 16.371705096836756, + "kl": 0.076171875, + "learning_rate": 5.125284738041003e-07, + "loss": -0.009, + "reward": 1.4768791198730469, + "reward_std": 0.21295681595802307, + "rewards/accuracy_reward_stage2": 0.4925040900707245, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2783 + }, + { + "completion_length": 15.4375, + "epoch": 0.4878219730155949, + "grad_norm": 16.31041929935746, + "kl": 0.07861328125, + "learning_rate": 5.123532503942527e-07, + "loss": 0.0, + "reward": 1.5155887603759766, + "reward_std": 0.2410578578710556, + "rewards/accuracy_reward_stage2": 0.5312137007713318, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2784 + }, + { + "completion_length": 6.984375, + "epoch": 0.4879971964254424, + "grad_norm": 20.353214126347986, + "kl": 0.1748046875, + "learning_rate": 5.12178026984405e-07, + "loss": -0.0075, + "reward": 1.3905422687530518, + "reward_std": 0.25600743293762207, + "rewards/accuracy_reward_stage2": 0.5467923283576965, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2785 + }, + { + "completion_length": 8.265625, + "epoch": 0.48817241983529, + "grad_norm": 21.969311129374322, + "kl": 0.09423828125, + "learning_rate": 5.120028035745575e-07, + "loss": 0.0171, + "reward": 1.532669186592102, + "reward_std": 0.25590693950653076, + "rewards/accuracy_reward_stage2": 0.548294186592102, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2786 + }, + { + "completion_length": 12.5625, + "epoch": 0.4883476432451376, + "grad_norm": 20.324773293148578, + "kl": 0.0294189453125, + "learning_rate": 5.118275801647099e-07, + "loss": 0.0117, + "reward": 1.6665804386138916, + "reward_std": 0.2079043984413147, + "rewards/accuracy_reward_stage2": 0.666580319404602, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2787 + }, + { + "completion_length": 8.484375, + "epoch": 0.4885228666549851, + "grad_norm": 28.316353788852847, + "kl": 0.1669921875, + "learning_rate": 5.116523567548624e-07, + "loss": 0.0384, + "reward": 1.57529616355896, + "reward_std": 0.41628655791282654, + "rewards/accuracy_reward_stage2": 0.59092116355896, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2788 + }, + { + "completion_length": 9.265625, + "epoch": 0.48869809006483267, + "grad_norm": 23.53590174525446, + "kl": 0.2080078125, + "learning_rate": 5.114771333450149e-07, + "loss": -0.012, + "reward": 1.5387616157531738, + "reward_std": 0.22167927026748657, + "rewards/accuracy_reward_stage2": 0.7106366157531738, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2789 + }, + { + "completion_length": 9.3125, + "epoch": 0.4888733134746802, + "grad_norm": 24.876890170809475, + "kl": 0.201171875, + "learning_rate": 5.113019099351673e-07, + "loss": 0.0098, + "reward": 1.638625144958496, + "reward_std": 0.35675370693206787, + "rewards/accuracy_reward_stage2": 0.6698752641677856, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2790 + }, + { + "completion_length": 14.4375, + "epoch": 0.48904853688452776, + "grad_norm": 15.22026672089996, + "kl": 0.1015625, + "learning_rate": 5.111266865253198e-07, + "loss": -0.0435, + "reward": 1.7338073253631592, + "reward_std": 0.19504126906394958, + "rewards/accuracy_reward_stage2": 0.765057384967804, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2791 + }, + { + "completion_length": 8.578125, + "epoch": 0.4892237602943753, + "grad_norm": 24.471302996369715, + "kl": 0.09814453125, + "learning_rate": 5.109514631154723e-07, + "loss": -0.005, + "reward": 1.4096736907958984, + "reward_std": 0.3426571488380432, + "rewards/accuracy_reward_stage2": 0.42529866099357605, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2792 + }, + { + "completion_length": 12.609375, + "epoch": 0.4893989837042229, + "grad_norm": 19.71721072991048, + "kl": 0.1650390625, + "learning_rate": 5.107762397056246e-07, + "loss": 0.0344, + "reward": 1.5647720098495483, + "reward_std": 0.2316807359457016, + "rewards/accuracy_reward_stage2": 0.5803970098495483, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2793 + }, + { + "completion_length": 8.4375, + "epoch": 0.48957420711407046, + "grad_norm": 29.986452757482443, + "kl": 0.2353515625, + "learning_rate": 5.106010162957771e-07, + "loss": 0.0639, + "reward": 1.599445104598999, + "reward_std": 0.4075753092765808, + "rewards/accuracy_reward_stage2": 0.615070104598999, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2794 + }, + { + "completion_length": 16.359375, + "epoch": 0.489749430523918, + "grad_norm": 16.643489660943185, + "kl": 0.119140625, + "learning_rate": 5.104257928859295e-07, + "loss": -0.0184, + "reward": 1.2576736211776733, + "reward_std": 0.22650864720344543, + "rewards/accuracy_reward_stage2": 0.2889236509799957, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2795 + }, + { + "completion_length": 15.671875, + "epoch": 0.48992465393376555, + "grad_norm": 15.229093807517911, + "kl": 0.1982421875, + "learning_rate": 5.10250569476082e-07, + "loss": -0.0527, + "reward": 1.4054017066955566, + "reward_std": 0.18351644277572632, + "rewards/accuracy_reward_stage2": 0.5772767663002014, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2796 + }, + { + "completion_length": 8.109375, + "epoch": 0.4900998773436131, + "grad_norm": 19.304101051588315, + "kl": 0.1962890625, + "learning_rate": 5.100753460662345e-07, + "loss": -0.0152, + "reward": 1.6429263353347778, + "reward_std": 0.30045899748802185, + "rewards/accuracy_reward_stage2": 0.6898013949394226, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2797 + }, + { + "completion_length": 9.84375, + "epoch": 0.49027510075346065, + "grad_norm": 14.141398792127648, + "kl": 0.158203125, + "learning_rate": 5.099001226563868e-07, + "loss": 0.0192, + "reward": 1.5334928035736084, + "reward_std": 0.2365182787179947, + "rewards/accuracy_reward_stage2": 0.549117922782898, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2798 + }, + { + "completion_length": 5.828125, + "epoch": 0.4904503241633082, + "grad_norm": 16.55445750105496, + "kl": 0.0673828125, + "learning_rate": 5.097248992465393e-07, + "loss": 0.027, + "reward": 1.758355975151062, + "reward_std": 0.20726990699768066, + "rewards/accuracy_reward_stage2": 0.758355975151062, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2799 + }, + { + "completion_length": 13.375, + "epoch": 0.4906255475731558, + "grad_norm": 26.371272784505244, + "kl": 0.248046875, + "learning_rate": 5.095496758366918e-07, + "loss": 0.0994, + "reward": 1.5160192251205444, + "reward_std": 0.11810261011123657, + "rewards/accuracy_reward_stage2": 0.6410192251205444, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2800 + }, + { + "completion_length": 13.515625, + "epoch": 0.49080077098300334, + "grad_norm": 14.68377419112434, + "kl": 0.06396484375, + "learning_rate": 5.093744524268442e-07, + "loss": 0.0255, + "reward": 1.655139684677124, + "reward_std": 0.08718352019786835, + "rewards/accuracy_reward_stage2": 0.6551397442817688, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2801 + }, + { + "completion_length": 14.921875, + "epoch": 0.4909759943928509, + "grad_norm": 20.048786580469528, + "kl": 0.2314453125, + "learning_rate": 5.091992290169967e-07, + "loss": -0.0057, + "reward": 1.4953479766845703, + "reward_std": 0.29281944036483765, + "rewards/accuracy_reward_stage2": 0.5422229170799255, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2802 + }, + { + "completion_length": 9.21875, + "epoch": 0.49115121780269844, + "grad_norm": 16.685234932334513, + "kl": 0.08984375, + "learning_rate": 5.09024005607149e-07, + "loss": -0.0077, + "reward": 1.5765955448150635, + "reward_std": 0.2887122929096222, + "rewards/accuracy_reward_stage2": 0.5922205448150635, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2803 + }, + { + "completion_length": 8.453125, + "epoch": 0.491326441212546, + "grad_norm": 19.963744178716368, + "kl": 0.255859375, + "learning_rate": 5.088487821973015e-07, + "loss": -0.0377, + "reward": 1.568144679069519, + "reward_std": 0.2530245780944824, + "rewards/accuracy_reward_stage2": 0.6306445598602295, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2804 + }, + { + "completion_length": 12.703125, + "epoch": 0.49150166462239353, + "grad_norm": 17.01684463077485, + "kl": 0.1748046875, + "learning_rate": 5.08673558787454e-07, + "loss": 0.0697, + "reward": 1.2630748748779297, + "reward_std": 0.22368191182613373, + "rewards/accuracy_reward_stage2": 0.5130749344825745, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2805 + }, + { + "completion_length": 13.1875, + "epoch": 0.49167688803224113, + "grad_norm": 23.596113627385748, + "kl": 0.1279296875, + "learning_rate": 5.084983353776064e-07, + "loss": 0.051, + "reward": 1.5360572338104248, + "reward_std": 0.3088216781616211, + "rewards/accuracy_reward_stage2": 0.6610572338104248, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2806 + }, + { + "completion_length": 10.84375, + "epoch": 0.4918521114420887, + "grad_norm": 14.753660421038694, + "kl": 0.1357421875, + "learning_rate": 5.083231119677589e-07, + "loss": 0.01, + "reward": 1.46462881565094, + "reward_std": 0.18203243613243103, + "rewards/accuracy_reward_stage2": 0.6052538156509399, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2807 + }, + { + "completion_length": 15.875, + "epoch": 0.4920273348519362, + "grad_norm": 20.038379015214403, + "kl": 0.060791015625, + "learning_rate": 5.081478885579114e-07, + "loss": 0.0244, + "reward": 1.5692226886749268, + "reward_std": 0.07769454270601273, + "rewards/accuracy_reward_stage2": 0.569222629070282, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2808 + }, + { + "completion_length": 15.8125, + "epoch": 0.4922025582617838, + "grad_norm": 17.85182485742762, + "kl": 0.09326171875, + "learning_rate": 5.079726651480638e-07, + "loss": -0.0049, + "reward": 1.5670794248580933, + "reward_std": 0.17647606134414673, + "rewards/accuracy_reward_stage2": 0.5827044248580933, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2809 + }, + { + "completion_length": 12.015625, + "epoch": 0.4923777816716313, + "grad_norm": 20.31571255444851, + "kl": 0.279296875, + "learning_rate": 5.077974417382162e-07, + "loss": 0.1007, + "reward": 1.331108808517456, + "reward_std": 0.18084828555583954, + "rewards/accuracy_reward_stage2": 0.4717338979244232, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2810 + }, + { + "completion_length": 12.109375, + "epoch": 0.49255300508147887, + "grad_norm": 14.528217315109835, + "kl": 0.146484375, + "learning_rate": 5.076222183283686e-07, + "loss": -0.0516, + "reward": 1.3952136039733887, + "reward_std": 0.20071014761924744, + "rewards/accuracy_reward_stage2": 0.44208866357803345, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2811 + }, + { + "completion_length": 12.71875, + "epoch": 0.4927282284913264, + "grad_norm": 15.939315200282937, + "kl": 0.1630859375, + "learning_rate": 5.074469949185211e-07, + "loss": -0.0107, + "reward": 1.4925997257232666, + "reward_std": 0.23163168132305145, + "rewards/accuracy_reward_stage2": 0.6488497257232666, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2812 + }, + { + "completion_length": 9.8125, + "epoch": 0.492903451901174, + "grad_norm": 14.135382545836258, + "kl": 0.1171875, + "learning_rate": 5.072717715086735e-07, + "loss": -0.0269, + "reward": 1.4884263277053833, + "reward_std": 0.2511558532714844, + "rewards/accuracy_reward_stage2": 0.5196763277053833, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2813 + }, + { + "completion_length": 20.140625, + "epoch": 0.49307867531102156, + "grad_norm": 18.67766438645424, + "kl": 0.068359375, + "learning_rate": 5.070965480988259e-07, + "loss": 0.0273, + "reward": 1.5892879962921143, + "reward_std": 0.16745012998580933, + "rewards/accuracy_reward_stage2": 0.5892879962921143, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2814 + }, + { + "completion_length": 7.984375, + "epoch": 0.4932538987208691, + "grad_norm": 27.37512850007755, + "kl": 0.2001953125, + "learning_rate": 5.069213246889784e-07, + "loss": 0.0801, + "reward": 1.6561710834503174, + "reward_std": 0.16692574322223663, + "rewards/accuracy_reward_stage2": 0.6561710834503174, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2815 + }, + { + "completion_length": 10.28125, + "epoch": 0.49342912213071666, + "grad_norm": 17.381150811786746, + "kl": 0.1103515625, + "learning_rate": 5.067461012791309e-07, + "loss": 0.0081, + "reward": 1.5132172107696533, + "reward_std": 0.25957632064819336, + "rewards/accuracy_reward_stage2": 0.5288421511650085, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2816 + }, + { + "completion_length": 16.078125, + "epoch": 0.4936043455405642, + "grad_norm": 15.233822447838989, + "kl": 0.037109375, + "learning_rate": 5.065708778692833e-07, + "loss": 0.0149, + "reward": 1.5294086933135986, + "reward_std": 0.2323162704706192, + "rewards/accuracy_reward_stage2": 0.5294086337089539, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2817 + }, + { + "completion_length": 10.578125, + "epoch": 0.49377956895041175, + "grad_norm": 17.999832607856707, + "kl": 0.166015625, + "learning_rate": 5.063956544594358e-07, + "loss": -0.0649, + "reward": 1.5089285373687744, + "reward_std": 0.3682054281234741, + "rewards/accuracy_reward_stage2": 0.5714285969734192, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2818 + }, + { + "completion_length": 13.1875, + "epoch": 0.49395479236025935, + "grad_norm": 16.343371924459856, + "kl": 0.08837890625, + "learning_rate": 5.062204310495882e-07, + "loss": -0.0088, + "reward": 1.4112939834594727, + "reward_std": 0.21046367287635803, + "rewards/accuracy_reward_stage2": 0.42691895365715027, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2819 + }, + { + "completion_length": 16.453125, + "epoch": 0.4941300157701069, + "grad_norm": 20.203369112743434, + "kl": 0.10595703125, + "learning_rate": 5.060452076397407e-07, + "loss": 0.009, + "reward": 1.5565242767333984, + "reward_std": 0.2382836639881134, + "rewards/accuracy_reward_stage2": 0.5721493363380432, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2820 + }, + { + "completion_length": 6.1875, + "epoch": 0.49430523917995445, + "grad_norm": 15.530217389099516, + "kl": 0.1015625, + "learning_rate": 5.058699842298932e-07, + "loss": -0.0477, + "reward": 1.446732997894287, + "reward_std": 0.22207242250442505, + "rewards/accuracy_reward_stage2": 0.47798293828964233, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2821 + }, + { + "completion_length": 8.703125, + "epoch": 0.494480462589802, + "grad_norm": 20.575110208490617, + "kl": 0.134765625, + "learning_rate": 5.056947608200456e-07, + "loss": -0.0344, + "reward": 1.4416877031326294, + "reward_std": 0.29794514179229736, + "rewards/accuracy_reward_stage2": 0.4729377031326294, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2822 + }, + { + "completion_length": 7.578125, + "epoch": 0.49465568599964954, + "grad_norm": 13.743752274823677, + "kl": 0.25, + "learning_rate": 5.055195374101979e-07, + "loss": 0.0217, + "reward": 1.6781415939331055, + "reward_std": 0.2623763084411621, + "rewards/accuracy_reward_stage2": 0.7093915939331055, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2823 + }, + { + "completion_length": 10.75, + "epoch": 0.4948309094094971, + "grad_norm": 16.738230361892004, + "kl": 0.06787109375, + "learning_rate": 5.053443140003503e-07, + "loss": 0.0062, + "reward": 1.3226996660232544, + "reward_std": 0.19135063886642456, + "rewards/accuracy_reward_stage2": 0.4633246660232544, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2824 + }, + { + "completion_length": 8.015625, + "epoch": 0.4950061328193447, + "grad_norm": 13.641295648230404, + "kl": 0.16796875, + "learning_rate": 5.051690905905028e-07, + "loss": -0.021, + "reward": 1.5862393379211426, + "reward_std": 0.2807191014289856, + "rewards/accuracy_reward_stage2": 0.6174893975257874, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2825 + }, + { + "completion_length": 9.609375, + "epoch": 0.49518135622919224, + "grad_norm": 19.495140221339017, + "kl": 0.0302734375, + "learning_rate": 5.049938671806553e-07, + "loss": 0.0121, + "reward": 1.5694129467010498, + "reward_std": 0.2659677267074585, + "rewards/accuracy_reward_stage2": 0.569412887096405, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2826 + }, + { + "completion_length": 8.90625, + "epoch": 0.4953565796390398, + "grad_norm": 19.298477917356017, + "kl": 0.11962890625, + "learning_rate": 5.048186437708077e-07, + "loss": 0.0036, + "reward": 1.730294942855835, + "reward_std": 0.17712485790252686, + "rewards/accuracy_reward_stage2": 0.745919942855835, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2827 + }, + { + "completion_length": 13.28125, + "epoch": 0.49553180304888733, + "grad_norm": 16.150852865470974, + "kl": 0.08251953125, + "learning_rate": 5.046434203609602e-07, + "loss": -0.0552, + "reward": 1.5572917461395264, + "reward_std": 0.2696126699447632, + "rewards/accuracy_reward_stage2": 0.5885416269302368, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2828 + }, + { + "completion_length": 12.265625, + "epoch": 0.4957070264587349, + "grad_norm": 24.21432717723203, + "kl": 0.375, + "learning_rate": 5.044681969511127e-07, + "loss": 0.0217, + "reward": 1.6967790126800537, + "reward_std": 0.31078892946243286, + "rewards/accuracy_reward_stage2": 0.7592791318893433, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2829 + }, + { + "completion_length": 10.671875, + "epoch": 0.4958822498685824, + "grad_norm": 16.31810279334247, + "kl": 0.142578125, + "learning_rate": 5.042929735412651e-07, + "loss": -0.0225, + "reward": 1.5832899808883667, + "reward_std": 0.17362064123153687, + "rewards/accuracy_reward_stage2": 0.6145399808883667, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2830 + }, + { + "completion_length": 9.921875, + "epoch": 0.49605747327843, + "grad_norm": 12.727870552143301, + "kl": 0.0791015625, + "learning_rate": 5.041177501314176e-07, + "loss": 0.0316, + "reward": 1.6768295764923096, + "reward_std": 0.13108648359775543, + "rewards/accuracy_reward_stage2": 0.6768295764923096, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2831 + }, + { + "completion_length": 26.1875, + "epoch": 0.4962326966882776, + "grad_norm": 19.66793729672711, + "kl": 0.458984375, + "learning_rate": 5.0394252672157e-07, + "loss": 0.1178, + "reward": 1.4868590831756592, + "reward_std": 0.2769075632095337, + "rewards/accuracy_reward_stage2": 0.6431090831756592, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2832 + }, + { + "completion_length": 13.5, + "epoch": 0.4964079200981251, + "grad_norm": 18.268128966158116, + "kl": 0.140625, + "learning_rate": 5.037673033117224e-07, + "loss": -0.042, + "reward": 1.4005197286605835, + "reward_std": 0.31960806250572205, + "rewards/accuracy_reward_stage2": 0.4473947286605835, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2833 + }, + { + "completion_length": 8.65625, + "epoch": 0.49658314350797267, + "grad_norm": 20.947171506470056, + "kl": 0.05712890625, + "learning_rate": 5.035920799018749e-07, + "loss": 0.0229, + "reward": 1.4880142211914062, + "reward_std": 0.3103662133216858, + "rewards/accuracy_reward_stage2": 0.48801422119140625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2834 + }, + { + "completion_length": 7.78125, + "epoch": 0.4967583669178202, + "grad_norm": 11.906433925116346, + "kl": 0.1171875, + "learning_rate": 5.034168564920273e-07, + "loss": -0.0209, + "reward": 1.453125, + "reward_std": 0.1804211586713791, + "rewards/accuracy_reward_stage2": 0.609375, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2835 + }, + { + "completion_length": 14.6875, + "epoch": 0.49693359032766776, + "grad_norm": 20.864580568318974, + "kl": 0.1328125, + "learning_rate": 5.032416330821797e-07, + "loss": -0.0142, + "reward": 1.266761302947998, + "reward_std": 0.23117133975028992, + "rewards/accuracy_reward_stage2": 0.29801127314567566, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2836 + }, + { + "completion_length": 7.59375, + "epoch": 0.4971088137375153, + "grad_norm": 17.392723074485005, + "kl": 0.1728515625, + "learning_rate": 5.030664096723322e-07, + "loss": -0.004, + "reward": 1.7567867040634155, + "reward_std": 0.3129429817199707, + "rewards/accuracy_reward_stage2": 0.7880366444587708, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2837 + }, + { + "completion_length": 7.390625, + "epoch": 0.4972840371473629, + "grad_norm": 22.961837058645735, + "kl": 0.216796875, + "learning_rate": 5.028911862624846e-07, + "loss": -0.0142, + "reward": 1.252758502960205, + "reward_std": 0.1922820806503296, + "rewards/accuracy_reward_stage2": 0.2996334433555603, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2838 + }, + { + "completion_length": 8.984375, + "epoch": 0.49745926055721046, + "grad_norm": 12.840102408536092, + "kl": 0.171875, + "learning_rate": 5.027159628526371e-07, + "loss": -0.0292, + "reward": 1.4514057636260986, + "reward_std": 0.15803693234920502, + "rewards/accuracy_reward_stage2": 0.4982808530330658, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2839 + }, + { + "completion_length": 4.90625, + "epoch": 0.497634483967058, + "grad_norm": 16.67158345880587, + "kl": 0.119140625, + "learning_rate": 5.025407394427895e-07, + "loss": -0.0408, + "reward": 1.6197917461395264, + "reward_std": 0.2089996337890625, + "rewards/accuracy_reward_stage2": 0.6510416269302368, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2840 + }, + { + "completion_length": 8.5625, + "epoch": 0.49780970737690555, + "grad_norm": 16.771018106748706, + "kl": 0.052001953125, + "learning_rate": 5.02365516032942e-07, + "loss": 0.0209, + "reward": 1.415387511253357, + "reward_std": 0.2690582871437073, + "rewards/accuracy_reward_stage2": 0.5403875112533569, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2841 + }, + { + "completion_length": 9.984375, + "epoch": 0.4979849307867531, + "grad_norm": 17.667681684297957, + "kl": 0.10546875, + "learning_rate": 5.021902926230945e-07, + "loss": 0.0421, + "reward": 1.6814024448394775, + "reward_std": 0.2051592618227005, + "rewards/accuracy_reward_stage2": 0.6814025640487671, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2842 + }, + { + "completion_length": 9.53125, + "epoch": 0.49816015419660065, + "grad_norm": 17.135267251551593, + "kl": 0.11181640625, + "learning_rate": 5.020150692132468e-07, + "loss": 0.0005, + "reward": 1.5520660877227783, + "reward_std": 0.16430020332336426, + "rewards/accuracy_reward_stage2": 0.6926910877227783, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2843 + }, + { + "completion_length": 11.34375, + "epoch": 0.49833537760644825, + "grad_norm": 23.174818431076805, + "kl": 0.27734375, + "learning_rate": 5.018398458033993e-07, + "loss": -0.081, + "reward": 1.4596951007843018, + "reward_std": 0.29588693380355835, + "rewards/accuracy_reward_stage2": 0.5378201007843018, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 2844 + }, + { + "completion_length": 14.53125, + "epoch": 0.4985106010162958, + "grad_norm": 41.97659001913079, + "kl": 0.453125, + "learning_rate": 5.016646223935518e-07, + "loss": 0.1008, + "reward": 1.3339383602142334, + "reward_std": 0.2974085211753845, + "rewards/accuracy_reward_stage2": 0.4901883602142334, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2845 + }, + { + "completion_length": 8.484375, + "epoch": 0.49868582442614334, + "grad_norm": 19.85600785091683, + "kl": 0.150390625, + "learning_rate": 5.014893989837042e-07, + "loss": 0.0314, + "reward": 1.6979167461395264, + "reward_std": 0.35343262553215027, + "rewards/accuracy_reward_stage2": 0.7135416865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2846 + }, + { + "completion_length": 15.671875, + "epoch": 0.4988610478359909, + "grad_norm": 18.853279175915038, + "kl": 0.162109375, + "learning_rate": 5.013141755738567e-07, + "loss": -0.025, + "reward": 1.6923900842666626, + "reward_std": 0.23192203044891357, + "rewards/accuracy_reward_stage2": 0.7392650246620178, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2847 + }, + { + "completion_length": 6.65625, + "epoch": 0.49903627124583844, + "grad_norm": 22.894855403366556, + "kl": 0.0751953125, + "learning_rate": 5.011389521640091e-07, + "loss": 0.03, + "reward": 1.691043734550476, + "reward_std": 0.20182660222053528, + "rewards/accuracy_reward_stage2": 0.6910437345504761, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2848 + }, + { + "completion_length": 17.234375, + "epoch": 0.499211494655686, + "grad_norm": 18.017055889535754, + "kl": 0.10498046875, + "learning_rate": 5.009637287541615e-07, + "loss": -0.0363, + "reward": 1.4788634777069092, + "reward_std": 0.2652289867401123, + "rewards/accuracy_reward_stage2": 0.6351134181022644, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2849 + }, + { + "completion_length": 10.84375, + "epoch": 0.49938671806553353, + "grad_norm": 18.858962789288007, + "kl": 0.1591796875, + "learning_rate": 5.00788505344314e-07, + "loss": 0.0292, + "reward": 1.5011869668960571, + "reward_std": 0.1766315996646881, + "rewards/accuracy_reward_stage2": 0.6418120265007019, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2850 + }, + { + "completion_length": 8.65625, + "epoch": 0.49956194147538113, + "grad_norm": 18.731961150873207, + "kl": 0.2470703125, + "learning_rate": 5.006132819344664e-07, + "loss": 0.0211, + "reward": 1.4539709091186523, + "reward_std": 0.3791165053844452, + "rewards/accuracy_reward_stage2": 0.4852209687232971, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2851 + }, + { + "completion_length": 9.703125, + "epoch": 0.4997371648852287, + "grad_norm": 16.94499504943766, + "kl": 0.06640625, + "learning_rate": 5.004380585246189e-07, + "loss": -0.0176, + "reward": 1.596168041229248, + "reward_std": 0.15915852785110474, + "rewards/accuracy_reward_stage2": 0.736793041229248, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2852 + }, + { + "completion_length": 12.203125, + "epoch": 0.4999123882950762, + "grad_norm": 21.043689187254092, + "kl": 0.1611328125, + "learning_rate": 5.002628351147713e-07, + "loss": 0.0644, + "reward": 1.3016023635864258, + "reward_std": 0.22436624765396118, + "rewards/accuracy_reward_stage2": 0.426602303981781, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2853 + }, + { + "completion_length": 8.21875, + "epoch": 0.5000876117049238, + "grad_norm": 14.048458594879166, + "kl": 0.03662109375, + "learning_rate": 5.000876117049237e-07, + "loss": 0.0147, + "reward": 1.833432912826538, + "reward_std": 0.153619185090065, + "rewards/accuracy_reward_stage2": 0.8334329128265381, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2854 + }, + { + "completion_length": 10.421875, + "epoch": 0.5002628351147713, + "grad_norm": 21.185086913316407, + "kl": 0.05810546875, + "learning_rate": 4.999123882950762e-07, + "loss": 0.0232, + "reward": 1.7559123039245605, + "reward_std": 0.22805647552013397, + "rewards/accuracy_reward_stage2": 0.7559122443199158, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2855 + }, + { + "completion_length": 14.0, + "epoch": 0.5004380585246189, + "grad_norm": 18.58990052940012, + "kl": 0.04248046875, + "learning_rate": 4.997371648852286e-07, + "loss": 0.017, + "reward": 1.4115819931030273, + "reward_std": 0.21287208795547485, + "rewards/accuracy_reward_stage2": 0.4115820527076721, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2856 + }, + { + "completion_length": 9.78125, + "epoch": 0.5006132819344664, + "grad_norm": 27.798694752355708, + "kl": 0.10302734375, + "learning_rate": 4.995619414753811e-07, + "loss": -0.0028, + "reward": 1.7112268209457397, + "reward_std": 0.28396135568618774, + "rewards/accuracy_reward_stage2": 0.7268518805503845, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2857 + }, + { + "completion_length": 12.65625, + "epoch": 0.500788505344314, + "grad_norm": 18.676237589883304, + "kl": 0.142578125, + "learning_rate": 4.993867180655335e-07, + "loss": -0.0206, + "reward": 1.7398256063461304, + "reward_std": 0.24794884026050568, + "rewards/accuracy_reward_stage2": 0.7710756063461304, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2858 + }, + { + "completion_length": 16.875, + "epoch": 0.5009637287541615, + "grad_norm": 21.658396617934685, + "kl": 0.1142578125, + "learning_rate": 4.99211494655686e-07, + "loss": 0.0457, + "reward": 1.3541054725646973, + "reward_std": 0.24178023636341095, + "rewards/accuracy_reward_stage2": 0.47910550236701965, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2859 + }, + { + "completion_length": 7.6875, + "epoch": 0.5011389521640092, + "grad_norm": 20.286041713919595, + "kl": 0.095703125, + "learning_rate": 4.990362712458384e-07, + "loss": 0.0383, + "reward": 1.5827951431274414, + "reward_std": 0.26222479343414307, + "rewards/accuracy_reward_stage2": 0.7077950835227966, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2860 + }, + { + "completion_length": 10.71875, + "epoch": 0.5013141755738567, + "grad_norm": 18.808117184993183, + "kl": 0.1259765625, + "learning_rate": 4.988610478359909e-07, + "loss": 0.0503, + "reward": 1.3651602268218994, + "reward_std": 0.29143521189689636, + "rewards/accuracy_reward_stage2": 0.6151602268218994, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2861 + }, + { + "completion_length": 9.484375, + "epoch": 0.5014893989837043, + "grad_norm": 19.878420281823956, + "kl": 0.061279296875, + "learning_rate": 4.986858244261434e-07, + "loss": -0.0196, + "reward": 1.3310251235961914, + "reward_std": 0.34876665472984314, + "rewards/accuracy_reward_stage2": 0.3466501832008362, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2862 + }, + { + "completion_length": 11.40625, + "epoch": 0.5016646223935518, + "grad_norm": 14.843342929453732, + "kl": 0.03173828125, + "learning_rate": 4.985106010162957e-07, + "loss": 0.0127, + "reward": 1.3046542406082153, + "reward_std": 0.15470543503761292, + "rewards/accuracy_reward_stage2": 0.30465424060821533, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2863 + }, + { + "completion_length": 8.59375, + "epoch": 0.5018398458033994, + "grad_norm": 16.630839287099747, + "kl": 0.16796875, + "learning_rate": 4.983353776064482e-07, + "loss": -0.0189, + "reward": 1.6525473594665527, + "reward_std": 0.2849022150039673, + "rewards/accuracy_reward_stage2": 0.6837973594665527, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2864 + }, + { + "completion_length": 6.984375, + "epoch": 0.5020150692132469, + "grad_norm": 13.160546414311293, + "kl": 0.2490234375, + "learning_rate": 4.981601541966006e-07, + "loss": 0.0708, + "reward": 1.4527487754821777, + "reward_std": 0.09580159932374954, + "rewards/accuracy_reward_stage2": 0.5933738350868225, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2865 + }, + { + "completion_length": 10.5, + "epoch": 0.5021902926230944, + "grad_norm": 27.994825259025895, + "kl": 0.251953125, + "learning_rate": 4.979849307867531e-07, + "loss": 0.057, + "reward": 1.606134295463562, + "reward_std": 0.30084317922592163, + "rewards/accuracy_reward_stage2": 0.746759295463562, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2866 + }, + { + "completion_length": 10.03125, + "epoch": 0.502365516032942, + "grad_norm": 16.449446402433466, + "kl": 0.435546875, + "learning_rate": 4.978097073769055e-07, + "loss": 0.0975, + "reward": 1.328658938407898, + "reward_std": 0.23123815655708313, + "rewards/accuracy_reward_stage2": 0.609908938407898, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2867 + }, + { + "completion_length": 9.21875, + "epoch": 0.5025407394427895, + "grad_norm": 16.88690992041481, + "kl": 0.06494140625, + "learning_rate": 4.97634483967058e-07, + "loss": -0.0183, + "reward": 1.48995041847229, + "reward_std": 0.12207160145044327, + "rewards/accuracy_reward_stage2": 0.5055753588676453, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2868 + }, + { + "completion_length": 14.171875, + "epoch": 0.5027159628526371, + "grad_norm": 21.897199972009407, + "kl": 0.12353515625, + "learning_rate": 4.974592605572105e-07, + "loss": 0.0114, + "reward": 1.1664299964904785, + "reward_std": 0.267032653093338, + "rewards/accuracy_reward_stage2": 0.43205493688583374, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2869 + }, + { + "completion_length": 10.953125, + "epoch": 0.5028911862624846, + "grad_norm": 29.64452204880094, + "kl": 0.1640625, + "learning_rate": 4.972840371473629e-07, + "loss": 0.0411, + "reward": 1.656315803527832, + "reward_std": 0.14924439787864685, + "rewards/accuracy_reward_stage2": 0.7969407439231873, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2870 + }, + { + "completion_length": 12.234375, + "epoch": 0.5030664096723322, + "grad_norm": 15.783355354362861, + "kl": 0.060791015625, + "learning_rate": 4.971088137375153e-07, + "loss": 0.0243, + "reward": 1.633192539215088, + "reward_std": 0.07455827295780182, + "rewards/accuracy_reward_stage2": 0.6331925988197327, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2871 + }, + { + "completion_length": 17.078125, + "epoch": 0.5032416330821797, + "grad_norm": 20.14334209062457, + "kl": 0.140625, + "learning_rate": 4.969335903276678e-07, + "loss": 0.0122, + "reward": 1.4405739307403564, + "reward_std": 0.2437485158443451, + "rewards/accuracy_reward_stage2": 0.5811989903450012, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2872 + }, + { + "completion_length": 13.125, + "epoch": 0.5034168564920274, + "grad_norm": 19.491096865220893, + "kl": 0.0289306640625, + "learning_rate": 4.967583669178202e-07, + "loss": 0.0115, + "reward": 1.4652515649795532, + "reward_std": 0.14238084852695465, + "rewards/accuracy_reward_stage2": 0.465251624584198, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2873 + }, + { + "completion_length": 11.40625, + "epoch": 0.5035920799018749, + "grad_norm": 21.63362229573334, + "kl": 0.1259765625, + "learning_rate": 4.965831435079726e-07, + "loss": 0.0111, + "reward": 1.641181468963623, + "reward_std": 0.260013610124588, + "rewards/accuracy_reward_stage2": 0.656806468963623, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2874 + }, + { + "completion_length": 9.25, + "epoch": 0.5037673033117225, + "grad_norm": 161.09061090228093, + "kl": 0.83203125, + "learning_rate": 4.964079200981251e-07, + "loss": 0.2849, + "reward": 1.5068423748016357, + "reward_std": 0.2722551226615906, + "rewards/accuracy_reward_stage2": 0.538092315196991, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2875 + }, + { + "completion_length": 11.75, + "epoch": 0.50394252672157, + "grad_norm": 20.61916644423281, + "kl": 0.1064453125, + "learning_rate": 4.962326966882775e-07, + "loss": 0.0426, + "reward": 1.5803353786468506, + "reward_std": 0.17665207386016846, + "rewards/accuracy_reward_stage2": 0.5803354382514954, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2876 + }, + { + "completion_length": 13.4375, + "epoch": 0.5041177501314176, + "grad_norm": 20.31103660975051, + "kl": 0.07421875, + "learning_rate": 4.9605747327843e-07, + "loss": 0.0298, + "reward": 1.6658729314804077, + "reward_std": 0.22268161177635193, + "rewards/accuracy_reward_stage2": 0.6658729314804077, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2877 + }, + { + "completion_length": 9.28125, + "epoch": 0.5042929735412651, + "grad_norm": 22.3694362107489, + "kl": 0.26953125, + "learning_rate": 4.958822498685824e-07, + "loss": 0.0265, + "reward": 1.6044328212738037, + "reward_std": 0.31371209025382996, + "rewards/accuracy_reward_stage2": 0.6513078212738037, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2878 + }, + { + "completion_length": 11.234375, + "epoch": 0.5044681969511127, + "grad_norm": 20.70169028531261, + "kl": 0.0732421875, + "learning_rate": 4.957070264587349e-07, + "loss": 0.0294, + "reward": 1.6221519708633423, + "reward_std": 0.20389091968536377, + "rewards/accuracy_reward_stage2": 0.6221520900726318, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2879 + }, + { + "completion_length": 10.515625, + "epoch": 0.5046434203609602, + "grad_norm": 23.05165408478004, + "kl": 0.11962890625, + "learning_rate": 4.955318030488873e-07, + "loss": -0.0199, + "reward": 1.5865809917449951, + "reward_std": 0.2581280469894409, + "rewards/accuracy_reward_stage2": 0.6178310513496399, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2880 + }, + { + "completion_length": 10.09375, + "epoch": 0.5048186437708078, + "grad_norm": 19.21254274225465, + "kl": 0.054931640625, + "learning_rate": 4.953565796390398e-07, + "loss": 0.022, + "reward": 1.615790605545044, + "reward_std": 0.19559542834758759, + "rewards/accuracy_reward_stage2": 0.6157907247543335, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2881 + }, + { + "completion_length": 12.4375, + "epoch": 0.5049938671806553, + "grad_norm": 18.617819254545374, + "kl": 0.263671875, + "learning_rate": 4.951813562291923e-07, + "loss": 0.0959, + "reward": 1.452277421951294, + "reward_std": 0.1409415304660797, + "rewards/accuracy_reward_stage2": 0.702277421951294, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2882 + }, + { + "completion_length": 6.21875, + "epoch": 0.5051690905905029, + "grad_norm": 16.133214593703105, + "kl": 0.15234375, + "learning_rate": 4.950061328193446e-07, + "loss": -0.0037, + "reward": 1.5024305582046509, + "reward_std": 0.315075159072876, + "rewards/accuracy_reward_stage2": 0.5336805582046509, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2883 + }, + { + "completion_length": 10.359375, + "epoch": 0.5053443140003504, + "grad_norm": 17.226185966153473, + "kl": 0.166015625, + "learning_rate": 4.94830909409497e-07, + "loss": -0.0167, + "reward": 1.3070234060287476, + "reward_std": 0.3721379041671753, + "rewards/accuracy_reward_stage2": 0.47889846563339233, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2884 + }, + { + "completion_length": 11.421875, + "epoch": 0.505519537410198, + "grad_norm": 14.702899239685644, + "kl": 0.169921875, + "learning_rate": 4.946556859996495e-07, + "loss": 0.0237, + "reward": 1.5311923027038574, + "reward_std": 0.2328895628452301, + "rewards/accuracy_reward_stage2": 0.671817421913147, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2885 + }, + { + "completion_length": 11.46875, + "epoch": 0.5056947608200456, + "grad_norm": 23.022843906262594, + "kl": 0.1181640625, + "learning_rate": 4.94480462589802e-07, + "loss": 0.0205, + "reward": 1.471540927886963, + "reward_std": 0.23446643352508545, + "rewards/accuracy_reward_stage2": 0.4871658682823181, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2886 + }, + { + "completion_length": 6.40625, + "epoch": 0.5058699842298932, + "grad_norm": 14.457415335603917, + "kl": 0.0634765625, + "learning_rate": 4.943052391799544e-07, + "loss": 0.0255, + "reward": 1.6342511177062988, + "reward_std": 0.19844907522201538, + "rewards/accuracy_reward_stage2": 0.7592511773109436, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2887 + }, + { + "completion_length": 8.28125, + "epoch": 0.5060452076397407, + "grad_norm": 15.509418136970604, + "kl": 0.06396484375, + "learning_rate": 4.941300157701069e-07, + "loss": -0.0186, + "reward": 1.65625, + "reward_std": 0.1462520956993103, + "rewards/accuracy_reward_stage2": 0.671875, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2888 + }, + { + "completion_length": 12.484375, + "epoch": 0.5062204310495882, + "grad_norm": 15.822741273420396, + "kl": 0.095703125, + "learning_rate": 4.939547923602594e-07, + "loss": -0.0059, + "reward": 1.6550612449645996, + "reward_std": 0.20866025984287262, + "rewards/accuracy_reward_stage2": 0.6706862449645996, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2889 + }, + { + "completion_length": 10.59375, + "epoch": 0.5063956544594358, + "grad_norm": 24.030731546327335, + "kl": 0.0810546875, + "learning_rate": 4.937795689504118e-07, + "loss": -0.056, + "reward": 1.701578140258789, + "reward_std": 0.13695769011974335, + "rewards/accuracy_reward_stage2": 0.7328281402587891, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2890 + }, + { + "completion_length": 11.65625, + "epoch": 0.5065708778692833, + "grad_norm": 11.550308632255781, + "kl": 0.1123046875, + "learning_rate": 4.936043455405642e-07, + "loss": 0.0448, + "reward": 1.6819041967391968, + "reward_std": 0.08682706952095032, + "rewards/accuracy_reward_stage2": 0.8069040775299072, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2891 + }, + { + "completion_length": 8.84375, + "epoch": 0.5067461012791309, + "grad_norm": 18.67077224667281, + "kl": 0.146484375, + "learning_rate": 4.934291221307166e-07, + "loss": 0.0152, + "reward": 1.401998519897461, + "reward_std": 0.17456203699111938, + "rewards/accuracy_reward_stage2": 0.6676234006881714, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2892 + }, + { + "completion_length": 9.71875, + "epoch": 0.5069213246889784, + "grad_norm": 19.36151135012173, + "kl": 0.142578125, + "learning_rate": 4.93253898720869e-07, + "loss": 0.057, + "reward": 1.6398862600326538, + "reward_std": 0.09990248829126358, + "rewards/accuracy_reward_stage2": 0.8898862600326538, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2893 + }, + { + "completion_length": 11.203125, + "epoch": 0.507096548098826, + "grad_norm": 17.91391947313902, + "kl": 0.1328125, + "learning_rate": 4.930786753110215e-07, + "loss": 0.0201, + "reward": 1.5040777921676636, + "reward_std": 0.2481655478477478, + "rewards/accuracy_reward_stage2": 0.6447028517723083, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2894 + }, + { + "completion_length": 6.171875, + "epoch": 0.5072717715086735, + "grad_norm": 19.930073948592227, + "kl": 0.057861328125, + "learning_rate": 4.92903451901174e-07, + "loss": 0.0231, + "reward": 1.7874478101730347, + "reward_std": 0.16011011600494385, + "rewards/accuracy_reward_stage2": 0.7874478697776794, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2895 + }, + { + "completion_length": 9.46875, + "epoch": 0.5074469949185211, + "grad_norm": 19.36729865534549, + "kl": 0.1552734375, + "learning_rate": 4.927282284913264e-07, + "loss": 0.0332, + "reward": 1.514788031578064, + "reward_std": 0.2984526455402374, + "rewards/accuracy_reward_stage2": 0.655413031578064, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2896 + }, + { + "completion_length": 9.203125, + "epoch": 0.5076222183283686, + "grad_norm": 16.005919521985984, + "kl": 0.1318359375, + "learning_rate": 4.925530050814788e-07, + "loss": -0.0286, + "reward": 1.7343182563781738, + "reward_std": 0.271266907453537, + "rewards/accuracy_reward_stage2": 0.7655682563781738, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2897 + }, + { + "completion_length": 17.8125, + "epoch": 0.5077974417382162, + "grad_norm": 25.43410976233458, + "kl": 0.07373046875, + "learning_rate": 4.923777816716313e-07, + "loss": -0.0147, + "reward": 1.6448153257369995, + "reward_std": 0.22978892922401428, + "rewards/accuracy_reward_stage2": 0.6604403257369995, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2898 + }, + { + "completion_length": 8.953125, + "epoch": 0.5079726651480638, + "grad_norm": 21.780308873263436, + "kl": 0.1904296875, + "learning_rate": 4.922025582617838e-07, + "loss": 0.0228, + "reward": 1.4488801956176758, + "reward_std": 0.2935516834259033, + "rewards/accuracy_reward_stage2": 0.6051301956176758, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2899 + }, + { + "completion_length": 12.234375, + "epoch": 0.5081478885579114, + "grad_norm": 25.732968985688775, + "kl": 0.0625, + "learning_rate": 4.920273348519362e-07, + "loss": -0.0162, + "reward": 1.666982650756836, + "reward_std": 0.25394535064697266, + "rewards/accuracy_reward_stage2": 0.6826076507568359, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2900 + }, + { + "completion_length": 12.421875, + "epoch": 0.5083231119677589, + "grad_norm": 15.150632376423873, + "kl": 0.1376953125, + "learning_rate": 4.918521114420887e-07, + "loss": 0.0114, + "reward": 1.6130900382995605, + "reward_std": 0.16419199109077454, + "rewards/accuracy_reward_stage2": 0.6287149786949158, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2901 + }, + { + "completion_length": 11.921875, + "epoch": 0.5084983353776065, + "grad_norm": 16.95905149443227, + "kl": 0.12451171875, + "learning_rate": 4.916768880322412e-07, + "loss": -0.0114, + "reward": 1.4094171524047852, + "reward_std": 0.2724335193634033, + "rewards/accuracy_reward_stage2": 0.4406670928001404, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2902 + }, + { + "completion_length": 10.25, + "epoch": 0.508673558787454, + "grad_norm": 18.371489893566416, + "kl": 0.21875, + "learning_rate": 4.915016646223935e-07, + "loss": 0.0198, + "reward": 1.40625, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward_stage2": 0.5625, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2903 + }, + { + "completion_length": 9.5, + "epoch": 0.5088487821973016, + "grad_norm": 14.49199015434854, + "kl": 0.056884765625, + "learning_rate": 4.913264412125459e-07, + "loss": 0.0228, + "reward": 1.630352258682251, + "reward_std": 0.1146092489361763, + "rewards/accuracy_reward_stage2": 0.630352258682251, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2904 + }, + { + "completion_length": 4.453125, + "epoch": 0.5090240056071491, + "grad_norm": 14.870448324908958, + "kl": 0.21484375, + "learning_rate": 4.911512178026984e-07, + "loss": -0.0382, + "reward": 1.5221229791641235, + "reward_std": 0.2549004852771759, + "rewards/accuracy_reward_stage2": 0.5689980387687683, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2905 + }, + { + "completion_length": 8.78125, + "epoch": 0.5091992290169967, + "grad_norm": 13.835765742239484, + "kl": 0.1279296875, + "learning_rate": 4.909759943928509e-07, + "loss": 0.0069, + "reward": 1.5392454862594604, + "reward_std": 0.1716037094593048, + "rewards/accuracy_reward_stage2": 0.6798704862594604, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2906 + }, + { + "completion_length": 7.875, + "epoch": 0.5093744524268442, + "grad_norm": 16.148864286076268, + "kl": 0.1787109375, + "learning_rate": 4.908007709830033e-07, + "loss": -0.0166, + "reward": 1.460571527481079, + "reward_std": 0.22273962199687958, + "rewards/accuracy_reward_stage2": 0.49182161688804626, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2907 + }, + { + "completion_length": 23.84375, + "epoch": 0.5095496758366918, + "grad_norm": 20.654437519822974, + "kl": 0.07470703125, + "learning_rate": 4.906255475731558e-07, + "loss": 0.03, + "reward": 1.386010766029358, + "reward_std": 0.17949114739894867, + "rewards/accuracy_reward_stage2": 0.3860107660293579, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2908 + }, + { + "completion_length": 10.40625, + "epoch": 0.5097248992465393, + "grad_norm": 18.81732343688654, + "kl": 0.2490234375, + "learning_rate": 4.904503241633082e-07, + "loss": -0.0386, + "reward": 1.593791127204895, + "reward_std": 0.3051733076572418, + "rewards/accuracy_reward_stage2": 0.6562911868095398, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2909 + }, + { + "completion_length": 7.65625, + "epoch": 0.5099001226563868, + "grad_norm": 13.786113372966806, + "kl": 0.1787109375, + "learning_rate": 4.902751007534607e-07, + "loss": -0.0119, + "reward": 1.5133922100067139, + "reward_std": 0.23366865515708923, + "rewards/accuracy_reward_stage2": 0.6696423292160034, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2910 + }, + { + "completion_length": 10.46875, + "epoch": 0.5100753460662345, + "grad_norm": 16.89240087623471, + "kl": 0.1005859375, + "learning_rate": 4.900998773436131e-07, + "loss": -0.004, + "reward": 1.601118803024292, + "reward_std": 0.18279781937599182, + "rewards/accuracy_reward_stage2": 0.616743803024292, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2911 + }, + { + "completion_length": 11.875, + "epoch": 0.510250569476082, + "grad_norm": 16.514867762528304, + "kl": 0.20703125, + "learning_rate": 4.899246539337655e-07, + "loss": 0.0151, + "reward": 1.6914869546890259, + "reward_std": 0.22304078936576843, + "rewards/accuracy_reward_stage2": 0.7227370142936707, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2912 + }, + { + "completion_length": 11.765625, + "epoch": 0.5104257928859296, + "grad_norm": 16.041953386428794, + "kl": 0.11767578125, + "learning_rate": 4.897494305239179e-07, + "loss": 0.0029, + "reward": 1.6377475261688232, + "reward_std": 0.11044105887413025, + "rewards/accuracy_reward_stage2": 0.6533724665641785, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2913 + }, + { + "completion_length": 12.59375, + "epoch": 0.5106010162957771, + "grad_norm": 85.41909589603668, + "kl": 0.474609375, + "learning_rate": 4.895742071140704e-07, + "loss": 0.1459, + "reward": 1.59375, + "reward_std": 0.2756393849849701, + "rewards/accuracy_reward_stage2": 0.734375, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2914 + }, + { + "completion_length": 5.875, + "epoch": 0.5107762397056247, + "grad_norm": 18.587063592887425, + "kl": 0.2109375, + "learning_rate": 4.893989837042229e-07, + "loss": 0.0038, + "reward": 1.5096237659454346, + "reward_std": 0.17004084587097168, + "rewards/accuracy_reward_stage2": 0.6658737063407898, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2915 + }, + { + "completion_length": 7.0, + "epoch": 0.5109514631154722, + "grad_norm": 8.717009220923853, + "kl": 0.1357421875, + "learning_rate": 4.892237602943753e-07, + "loss": -0.034, + "reward": 1.571064829826355, + "reward_std": 0.15255174040794373, + "rewards/accuracy_reward_stage2": 0.602314829826355, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2916 + }, + { + "completion_length": 11.34375, + "epoch": 0.5111266865253198, + "grad_norm": 16.412978357934826, + "kl": 0.039794921875, + "learning_rate": 4.890485368845277e-07, + "loss": 0.0159, + "reward": 1.459886074066162, + "reward_std": 0.23499058187007904, + "rewards/accuracy_reward_stage2": 0.5848859548568726, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2917 + }, + { + "completion_length": 9.125, + "epoch": 0.5113019099351673, + "grad_norm": 16.269261166994866, + "kl": 0.109375, + "learning_rate": 4.888733134746802e-07, + "loss": 0.0438, + "reward": 1.6181046962738037, + "reward_std": 0.12335985898971558, + "rewards/accuracy_reward_stage2": 0.7431047558784485, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2918 + }, + { + "completion_length": 12.296875, + "epoch": 0.5114771333450149, + "grad_norm": 22.491302820648134, + "kl": 0.0849609375, + "learning_rate": 4.886980900648327e-07, + "loss": 0.0339, + "reward": 1.5203516483306885, + "reward_std": 0.29869550466537476, + "rewards/accuracy_reward_stage2": 0.5203516483306885, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2919 + }, + { + "completion_length": 8.234375, + "epoch": 0.5116523567548624, + "grad_norm": 17.17007086433005, + "kl": 0.07958984375, + "learning_rate": 4.885228666549851e-07, + "loss": -0.0412, + "reward": 1.6270606517791748, + "reward_std": 0.23870226740837097, + "rewards/accuracy_reward_stage2": 0.6583105325698853, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2920 + }, + { + "completion_length": 11.828125, + "epoch": 0.51182758016471, + "grad_norm": 21.541425158813357, + "kl": 0.061279296875, + "learning_rate": 4.883476432451376e-07, + "loss": 0.0246, + "reward": 1.8930007219314575, + "reward_std": 0.1855742335319519, + "rewards/accuracy_reward_stage2": 0.8930006623268127, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2921 + }, + { + "completion_length": 10.84375, + "epoch": 0.5120028035745575, + "grad_norm": 15.84245159309022, + "kl": 0.0732421875, + "learning_rate": 4.881724198352899e-07, + "loss": -0.0142, + "reward": 1.7038071155548096, + "reward_std": 0.19769446551799774, + "rewards/accuracy_reward_stage2": 0.7194320559501648, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2922 + }, + { + "completion_length": 9.265625, + "epoch": 0.5121780269844051, + "grad_norm": 19.064149989693348, + "kl": 0.07666015625, + "learning_rate": 4.879971964254424e-07, + "loss": 0.0306, + "reward": 1.669129729270935, + "reward_std": 0.21358340978622437, + "rewards/accuracy_reward_stage2": 0.6691297292709351, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2923 + }, + { + "completion_length": 11.671875, + "epoch": 0.5123532503942527, + "grad_norm": 20.056699028346667, + "kl": 0.181640625, + "learning_rate": 4.878219730155948e-07, + "loss": 0.0364, + "reward": 1.5925509929656982, + "reward_std": 0.3178279399871826, + "rewards/accuracy_reward_stage2": 0.6081759333610535, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2924 + }, + { + "completion_length": 12.03125, + "epoch": 0.5125284738041003, + "grad_norm": 19.022822203075908, + "kl": 0.212890625, + "learning_rate": 4.876467496057473e-07, + "loss": -0.0034, + "reward": 1.4258291721343994, + "reward_std": 0.2871686816215515, + "rewards/accuracy_reward_stage2": 0.45707911252975464, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2925 + }, + { + "completion_length": 6.828125, + "epoch": 0.5127036972139478, + "grad_norm": 17.1522057355123, + "kl": 0.1513671875, + "learning_rate": 4.874715261958998e-07, + "loss": 0.0162, + "reward": 1.7785899639129639, + "reward_std": 0.2176314890384674, + "rewards/accuracy_reward_stage2": 0.7942148447036743, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2926 + }, + { + "completion_length": 9.40625, + "epoch": 0.5128789206237954, + "grad_norm": 20.05783296001098, + "kl": 0.0712890625, + "learning_rate": 4.872963027860522e-07, + "loss": 0.0285, + "reward": 1.6176671981811523, + "reward_std": 0.1611122190952301, + "rewards/accuracy_reward_stage2": 0.6176671981811523, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2927 + }, + { + "completion_length": 10.15625, + "epoch": 0.5130541440336429, + "grad_norm": 20.272883626727722, + "kl": 0.1015625, + "learning_rate": 4.871210793762046e-07, + "loss": 0.024, + "reward": 1.423703908920288, + "reward_std": 0.24741846323013306, + "rewards/accuracy_reward_stage2": 0.4393288493156433, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2928 + }, + { + "completion_length": 8.640625, + "epoch": 0.5132293674434905, + "grad_norm": 15.959972536091819, + "kl": 0.09912109375, + "learning_rate": 4.869458559663571e-07, + "loss": 0.0395, + "reward": 1.4770452976226807, + "reward_std": 0.11607255786657333, + "rewards/accuracy_reward_stage2": 0.6020451784133911, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2929 + }, + { + "completion_length": 10.890625, + "epoch": 0.513404590853338, + "grad_norm": 17.518959100791523, + "kl": 0.09375, + "learning_rate": 4.867706325565096e-07, + "loss": 0.0247, + "reward": 1.5568358898162842, + "reward_std": 0.19567272067070007, + "rewards/accuracy_reward_stage2": 0.572460949420929, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2930 + }, + { + "completion_length": 12.71875, + "epoch": 0.5135798142631856, + "grad_norm": 20.847540218934004, + "kl": 0.12158203125, + "learning_rate": 4.86595409146662e-07, + "loss": 0.0486, + "reward": 1.689253330230713, + "reward_std": 0.16902679204940796, + "rewards/accuracy_reward_stage2": 0.6892533898353577, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2931 + }, + { + "completion_length": 9.25, + "epoch": 0.5137550376730331, + "grad_norm": 15.486698154520228, + "kl": 0.07861328125, + "learning_rate": 4.864201857368144e-07, + "loss": 0.0314, + "reward": 1.7223076820373535, + "reward_std": 0.08451945334672928, + "rewards/accuracy_reward_stage2": 0.722307562828064, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2932 + }, + { + "completion_length": 9.96875, + "epoch": 0.5139302610828806, + "grad_norm": 19.769809054977316, + "kl": 0.37890625, + "learning_rate": 4.862449623269668e-07, + "loss": 0.1053, + "reward": 1.2136902809143066, + "reward_std": 0.11439789831638336, + "rewards/accuracy_reward_stage2": 0.5886902809143066, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 2933 + }, + { + "completion_length": 9.5625, + "epoch": 0.5141054844927282, + "grad_norm": 18.595144713621774, + "kl": 0.1806640625, + "learning_rate": 4.860697389171193e-07, + "loss": -0.0367, + "reward": 1.689985990524292, + "reward_std": 0.26583412289619446, + "rewards/accuracy_reward_stage2": 0.861860990524292, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2934 + }, + { + "completion_length": 6.828125, + "epoch": 0.5142807079025757, + "grad_norm": 12.149659373020459, + "kl": 0.0751953125, + "learning_rate": 4.858945155072717e-07, + "loss": -0.0141, + "reward": 1.6463022232055664, + "reward_std": 0.07076901197433472, + "rewards/accuracy_reward_stage2": 0.6619271636009216, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2935 + }, + { + "completion_length": 12.203125, + "epoch": 0.5144559313124233, + "grad_norm": 15.91180690198129, + "kl": 0.263671875, + "learning_rate": 4.857192920974242e-07, + "loss": -0.0187, + "reward": 1.3695735931396484, + "reward_std": 0.2769041061401367, + "rewards/accuracy_reward_stage2": 0.4164485037326813, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2936 + }, + { + "completion_length": 7.78125, + "epoch": 0.514631154722271, + "grad_norm": 14.274518704484965, + "kl": 0.07861328125, + "learning_rate": 4.855440686875766e-07, + "loss": 0.0034, + "reward": 1.3541667461395264, + "reward_std": 0.18801738321781158, + "rewards/accuracy_reward_stage2": 0.4947916865348816, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2937 + }, + { + "completion_length": 20.28125, + "epoch": 0.5148063781321185, + "grad_norm": 18.40581395743968, + "kl": 0.09521484375, + "learning_rate": 4.853688452777291e-07, + "loss": 0.0046, + "reward": 1.336214542388916, + "reward_std": 0.22887608408927917, + "rewards/accuracy_reward_stage2": 0.47683948278427124, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2938 + }, + { + "completion_length": 7.96875, + "epoch": 0.514981601541966, + "grad_norm": 23.80466503473419, + "kl": 0.2333984375, + "learning_rate": 4.851936218678816e-07, + "loss": 0.0057, + "reward": 1.5006499290466309, + "reward_std": 0.2008485645055771, + "rewards/accuracy_reward_stage2": 0.5318998098373413, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2939 + }, + { + "completion_length": 8.359375, + "epoch": 0.5151568249518136, + "grad_norm": 17.946604121179828, + "kl": 0.1064453125, + "learning_rate": 4.85018398458034e-07, + "loss": 0.0425, + "reward": 1.7962557077407837, + "reward_std": 0.19889307022094727, + "rewards/accuracy_reward_stage2": 0.7962556481361389, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2940 + }, + { + "completion_length": 13.25, + "epoch": 0.5153320483616611, + "grad_norm": 21.0453056279407, + "kl": 0.2216796875, + "learning_rate": 4.848431750481863e-07, + "loss": -0.0437, + "reward": 1.3767890930175781, + "reward_std": 0.2692345082759857, + "rewards/accuracy_reward_stage2": 0.4236640930175781, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2941 + }, + { + "completion_length": 11.1875, + "epoch": 0.5155072717715087, + "grad_norm": 14.213308225508744, + "kl": 0.1337890625, + "learning_rate": 4.846679516383388e-07, + "loss": 0.0537, + "reward": 1.5524367094039917, + "reward_std": 0.16399207711219788, + "rewards/accuracy_reward_stage2": 0.6774367094039917, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2942 + }, + { + "completion_length": 7.578125, + "epoch": 0.5156824951813562, + "grad_norm": 14.856360108084575, + "kl": 0.16796875, + "learning_rate": 4.844927282284913e-07, + "loss": 0.0173, + "reward": 1.4467616081237793, + "reward_std": 0.15504275262355804, + "rewards/accuracy_reward_stage2": 0.4780115783214569, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2943 + }, + { + "completion_length": 10.953125, + "epoch": 0.5158577185912038, + "grad_norm": 11.990319904854841, + "kl": 0.162109375, + "learning_rate": 4.843175048186437e-07, + "loss": -0.0109, + "reward": 1.2604167461395264, + "reward_std": 0.1473139077425003, + "rewards/accuracy_reward_stage2": 0.4166666865348816, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2944 + }, + { + "completion_length": 14.734375, + "epoch": 0.5160329420010513, + "grad_norm": 19.809881898967866, + "kl": 0.08154296875, + "learning_rate": 4.841422814087962e-07, + "loss": -0.0065, + "reward": 1.6608145236968994, + "reward_std": 0.30760252475738525, + "rewards/accuracy_reward_stage2": 0.8014395236968994, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2945 + }, + { + "completion_length": 9.484375, + "epoch": 0.5162081654108989, + "grad_norm": 13.553033797905996, + "kl": 0.09521484375, + "learning_rate": 4.839670579989487e-07, + "loss": -0.006, + "reward": 1.5949015617370605, + "reward_std": 0.19428260624408722, + "rewards/accuracy_reward_stage2": 0.6105265617370605, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2946 + }, + { + "completion_length": 9.34375, + "epoch": 0.5163833888207464, + "grad_norm": 25.443446242256226, + "kl": 0.10888671875, + "learning_rate": 4.837918345891011e-07, + "loss": -0.0042, + "reward": 1.438382625579834, + "reward_std": 0.4678412675857544, + "rewards/accuracy_reward_stage2": 0.4696325361728668, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2947 + }, + { + "completion_length": 15.1875, + "epoch": 0.516558612230594, + "grad_norm": 21.525238194596895, + "kl": 0.11572265625, + "learning_rate": 4.836166111792535e-07, + "loss": 0.002, + "reward": 1.4573842287063599, + "reward_std": 0.3356036841869354, + "rewards/accuracy_reward_stage2": 0.4730091989040375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2948 + }, + { + "completion_length": 14.5, + "epoch": 0.5167338356404415, + "grad_norm": 19.532247302186448, + "kl": 0.1796875, + "learning_rate": 4.83441387769406e-07, + "loss": 0.0462, + "reward": 1.7068278789520264, + "reward_std": 0.15236984193325043, + "rewards/accuracy_reward_stage2": 0.7224528193473816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2949 + }, + { + "completion_length": 10.234375, + "epoch": 0.5169090590502892, + "grad_norm": 17.956263791790803, + "kl": 0.275390625, + "learning_rate": 4.832661643595585e-07, + "loss": -0.018, + "reward": 1.6681300401687622, + "reward_std": 0.2555171251296997, + "rewards/accuracy_reward_stage2": 0.7150050401687622, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2950 + }, + { + "completion_length": 13.546875, + "epoch": 0.5170842824601367, + "grad_norm": 20.608177429703655, + "kl": 0.203125, + "learning_rate": 4.830909409497109e-07, + "loss": 0.0028, + "reward": 1.7684814929962158, + "reward_std": 0.3044443726539612, + "rewards/accuracy_reward_stage2": 0.7997313737869263, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2951 + }, + { + "completion_length": 11.703125, + "epoch": 0.5172595058699843, + "grad_norm": 16.37850263732007, + "kl": 0.09033203125, + "learning_rate": 4.829157175398633e-07, + "loss": -0.0023, + "reward": 1.7461036443710327, + "reward_std": 0.20768775045871735, + "rewards/accuracy_reward_stage2": 0.7617285847663879, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2952 + }, + { + "completion_length": 7.828125, + "epoch": 0.5174347292798318, + "grad_norm": 15.786937308181098, + "kl": 0.09375, + "learning_rate": 4.827404941300157e-07, + "loss": -0.0057, + "reward": 1.6817526817321777, + "reward_std": 0.24270977079868317, + "rewards/accuracy_reward_stage2": 0.697377622127533, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2953 + }, + { + "completion_length": 9.046875, + "epoch": 0.5176099526896794, + "grad_norm": 12.787982745803061, + "kl": 0.05810546875, + "learning_rate": 4.825652707201682e-07, + "loss": -0.0204, + "reward": 1.5325255393981934, + "reward_std": 0.1728992760181427, + "rewards/accuracy_reward_stage2": 0.5481504797935486, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2954 + }, + { + "completion_length": 12.25, + "epoch": 0.5177851760995269, + "grad_norm": 18.344373606655147, + "kl": 0.058349609375, + "learning_rate": 4.823900473103206e-07, + "loss": -0.0209, + "reward": 1.4726002216339111, + "reward_std": 0.22146786749362946, + "rewards/accuracy_reward_stage2": 0.6132252216339111, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2955 + }, + { + "completion_length": 25.1875, + "epoch": 0.5179603995093744, + "grad_norm": 26.56599596321791, + "kl": 0.185546875, + "learning_rate": 4.822148239004731e-07, + "loss": 0.0741, + "reward": 1.4056503772735596, + "reward_std": 0.18882179260253906, + "rewards/accuracy_reward_stage2": 0.6556503176689148, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2956 + }, + { + "completion_length": 20.296875, + "epoch": 0.518135622919222, + "grad_norm": 16.152254574834682, + "kl": 0.0302734375, + "learning_rate": 4.820396004906255e-07, + "loss": 0.0121, + "reward": 1.3913934230804443, + "reward_std": 0.17937754094600677, + "rewards/accuracy_reward_stage2": 0.5163935422897339, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2957 + }, + { + "completion_length": 11.109375, + "epoch": 0.5183108463290695, + "grad_norm": 23.663345562826837, + "kl": 0.2734375, + "learning_rate": 4.81864377080778e-07, + "loss": 0.0676, + "reward": 1.383396029472351, + "reward_std": 0.33867397904396057, + "rewards/accuracy_reward_stage2": 0.5240209698677063, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2958 + }, + { + "completion_length": 7.671875, + "epoch": 0.5184860697389171, + "grad_norm": 15.080706050120135, + "kl": 0.0927734375, + "learning_rate": 4.816891536709305e-07, + "loss": -0.0061, + "reward": 1.7183098793029785, + "reward_std": 0.1505095362663269, + "rewards/accuracy_reward_stage2": 0.7339348793029785, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2959 + }, + { + "completion_length": 9.921875, + "epoch": 0.5186612931487646, + "grad_norm": 20.0616679391178, + "kl": 0.052978515625, + "learning_rate": 4.815139302610829e-07, + "loss": 0.0212, + "reward": 1.5980708599090576, + "reward_std": 0.19968780875205994, + "rewards/accuracy_reward_stage2": 0.5980708003044128, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2960 + }, + { + "completion_length": 12.09375, + "epoch": 0.5188365165586122, + "grad_norm": 19.9463560619159, + "kl": 0.11376953125, + "learning_rate": 4.813387068512352e-07, + "loss": 0.0166, + "reward": 1.589599609375, + "reward_std": 0.2128470540046692, + "rewards/accuracy_reward_stage2": 0.605224609375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2961 + }, + { + "completion_length": 12.53125, + "epoch": 0.5190117399684598, + "grad_norm": 18.95845814797562, + "kl": 0.1376953125, + "learning_rate": 4.811634834413877e-07, + "loss": 0.0203, + "reward": 1.2567017078399658, + "reward_std": 0.2735491394996643, + "rewards/accuracy_reward_stage2": 0.522326648235321, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2962 + }, + { + "completion_length": 8.359375, + "epoch": 0.5191869633783074, + "grad_norm": 18.40982447445592, + "kl": 0.07861328125, + "learning_rate": 4.809882600315402e-07, + "loss": 0.0315, + "reward": 1.7283145189285278, + "reward_std": 0.29172807931900024, + "rewards/accuracy_reward_stage2": 0.7283145189285278, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2963 + }, + { + "completion_length": 8.078125, + "epoch": 0.5193621867881549, + "grad_norm": 16.950258641470114, + "kl": 0.13671875, + "learning_rate": 4.808130366216926e-07, + "loss": 0.0547, + "reward": 1.5028691291809082, + "reward_std": 0.1037883311510086, + "rewards/accuracy_reward_stage2": 0.6278691291809082, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2964 + }, + { + "completion_length": 14.140625, + "epoch": 0.5195374101980025, + "grad_norm": 22.45684219909601, + "kl": 0.15625, + "learning_rate": 4.806378132118451e-07, + "loss": 0.0626, + "reward": 1.3568233251571655, + "reward_std": 0.2835046648979187, + "rewards/accuracy_reward_stage2": 0.48182329535484314, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2965 + }, + { + "completion_length": 12.921875, + "epoch": 0.51971263360785, + "grad_norm": 13.37469094535078, + "kl": 0.07666015625, + "learning_rate": 4.804625898019975e-07, + "loss": -0.0082, + "reward": 1.5271795988082886, + "reward_std": 0.16873487830162048, + "rewards/accuracy_reward_stage2": 0.5428045988082886, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2966 + }, + { + "completion_length": 8.53125, + "epoch": 0.5198878570176976, + "grad_norm": 16.56289414630539, + "kl": 0.1904296875, + "learning_rate": 4.8028736639215e-07, + "loss": 0.017, + "reward": 1.506408452987671, + "reward_std": 0.18759910762310028, + "rewards/accuracy_reward_stage2": 0.6626585721969604, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2967 + }, + { + "completion_length": 6.421875, + "epoch": 0.5200630804275451, + "grad_norm": 16.23170422508012, + "kl": 0.328125, + "learning_rate": 4.801121429823024e-07, + "loss": 0.0439, + "reward": 1.4529410600662231, + "reward_std": 0.29004305601119995, + "rewards/accuracy_reward_stage2": 0.6091910004615784, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2968 + }, + { + "completion_length": 7.421875, + "epoch": 0.5202383038373927, + "grad_norm": 17.728963477828742, + "kl": 0.23046875, + "learning_rate": 4.799369195724549e-07, + "loss": 0.0565, + "reward": 1.7198197841644287, + "reward_std": 0.21634772419929504, + "rewards/accuracy_reward_stage2": 0.8604447841644287, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2969 + }, + { + "completion_length": 8.28125, + "epoch": 0.5204135272472402, + "grad_norm": 17.077717905726193, + "kl": 0.1904296875, + "learning_rate": 4.797616961626073e-07, + "loss": 0.0433, + "reward": 1.4588346481323242, + "reward_std": 0.3143489360809326, + "rewards/accuracy_reward_stage2": 0.5994596481323242, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2970 + }, + { + "completion_length": 12.28125, + "epoch": 0.5205887506570878, + "grad_norm": 20.111701036269782, + "kl": 0.408203125, + "learning_rate": 4.795864727527598e-07, + "loss": 0.0546, + "reward": 1.3059935569763184, + "reward_std": 0.24852296710014343, + "rewards/accuracy_reward_stage2": 0.47786855697631836, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2971 + }, + { + "completion_length": 9.75, + "epoch": 0.5207639740669353, + "grad_norm": 51.80114420123987, + "kl": 0.369140625, + "learning_rate": 4.794112493429122e-07, + "loss": 0.1473, + "reward": 1.6538548469543457, + "reward_std": 0.268999308347702, + "rewards/accuracy_reward_stage2": 0.6538547873497009, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2972 + }, + { + "completion_length": 6.453125, + "epoch": 0.5209391974767829, + "grad_norm": 402.704170101784, + "kl": 2.21875, + "learning_rate": 4.792360259330646e-07, + "loss": 0.7794, + "reward": 1.4323503971099854, + "reward_std": 0.26757895946502686, + "rewards/accuracy_reward_stage2": 0.6042253971099854, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2973 + }, + { + "completion_length": 28.65625, + "epoch": 0.5211144208866304, + "grad_norm": 23.106544974841064, + "kl": 0.1630859375, + "learning_rate": 4.79060802523217e-07, + "loss": -0.0048, + "reward": 1.370764136314392, + "reward_std": 0.33060193061828613, + "rewards/accuracy_reward_stage2": 0.4020141363143921, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2974 + }, + { + "completion_length": 9.0625, + "epoch": 0.5212896442964781, + "grad_norm": 18.194894455569557, + "kl": 0.0498046875, + "learning_rate": 4.788855791133695e-07, + "loss": -0.0131, + "reward": 1.5386393070220947, + "reward_std": 0.36468303203582764, + "rewards/accuracy_reward_stage2": 0.5542643070220947, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2975 + }, + { + "completion_length": 7.515625, + "epoch": 0.5214648677063256, + "grad_norm": 22.506162643097728, + "kl": 0.208984375, + "learning_rate": 4.78710355703522e-07, + "loss": 0.0394, + "reward": 1.7304048538208008, + "reward_std": 0.17070415616035461, + "rewards/accuracy_reward_stage2": 0.8710298538208008, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2976 + }, + { + "completion_length": 10.28125, + "epoch": 0.5216400911161732, + "grad_norm": 272.80223531430477, + "kl": 1.5703125, + "learning_rate": 4.785351322936744e-07, + "loss": 0.4964, + "reward": 1.740801453590393, + "reward_std": 0.23649471998214722, + "rewards/accuracy_reward_stage2": 0.9126764535903931, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2977 + }, + { + "completion_length": 10.15625, + "epoch": 0.5218153145260207, + "grad_norm": 15.03608408563602, + "kl": 0.1552734375, + "learning_rate": 4.783599088838269e-07, + "loss": 0.0229, + "reward": 1.4743565320968628, + "reward_std": 0.1263759732246399, + "rewards/accuracy_reward_stage2": 0.614981472492218, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2978 + }, + { + "completion_length": 7.734375, + "epoch": 0.5219905379358682, + "grad_norm": 12.453380459386771, + "kl": 0.07958984375, + "learning_rate": 4.781846854739793e-07, + "loss": -0.01, + "reward": 1.7916667461395264, + "reward_std": 0.1455363929271698, + "rewards/accuracy_reward_stage2": 0.8072916269302368, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2979 + }, + { + "completion_length": 12.0, + "epoch": 0.5221657613457158, + "grad_norm": 16.390112633942664, + "kl": 0.1875, + "learning_rate": 4.780094620641318e-07, + "loss": -0.0685, + "reward": 1.3277325630187988, + "reward_std": 0.2840336561203003, + "rewards/accuracy_reward_stage2": 0.39023256301879883, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 2980 + }, + { + "completion_length": 10.234375, + "epoch": 0.5223409847555633, + "grad_norm": 17.47778157040926, + "kl": 0.146484375, + "learning_rate": 4.778342386542841e-07, + "loss": -0.0413, + "reward": 1.536125659942627, + "reward_std": 0.23933261632919312, + "rewards/accuracy_reward_stage2": 0.708000659942627, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2981 + }, + { + "completion_length": 9.34375, + "epoch": 0.5225162081654109, + "grad_norm": 19.88076286176023, + "kl": 0.08935546875, + "learning_rate": 4.776590152444366e-07, + "loss": 0.0034, + "reward": 1.3844552040100098, + "reward_std": 0.2918417453765869, + "rewards/accuracy_reward_stage2": 0.525080144405365, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2982 + }, + { + "completion_length": 9.71875, + "epoch": 0.5226914315752584, + "grad_norm": 28.094404649595983, + "kl": 0.189453125, + "learning_rate": 4.774837918345891e-07, + "loss": 0.0445, + "reward": 1.632422924041748, + "reward_std": 0.3590032756328583, + "rewards/accuracy_reward_stage2": 0.6480480432510376, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2983 + }, + { + "completion_length": 9.84375, + "epoch": 0.522866654985106, + "grad_norm": 24.415029498851087, + "kl": 0.31640625, + "learning_rate": 4.773085684247415e-07, + "loss": 0.0669, + "reward": 1.613487720489502, + "reward_std": 0.25573980808258057, + "rewards/accuracy_reward_stage2": 0.7697376608848572, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2984 + }, + { + "completion_length": 8.828125, + "epoch": 0.5230418783949535, + "grad_norm": 24.517554180624, + "kl": 0.2177734375, + "learning_rate": 4.77133345014894e-07, + "loss": 0.0001, + "reward": 1.3647611141204834, + "reward_std": 0.4705469608306885, + "rewards/accuracy_reward_stage2": 0.4116361737251282, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2985 + }, + { + "completion_length": 11.203125, + "epoch": 0.5232171018048011, + "grad_norm": 15.677994477134238, + "kl": 0.126953125, + "learning_rate": 4.769581216050464e-07, + "loss": 0.0064, + "reward": 1.60444176197052, + "reward_std": 0.31227320432662964, + "rewards/accuracy_reward_stage2": 0.6200668215751648, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2986 + }, + { + "completion_length": 9.53125, + "epoch": 0.5233923252146486, + "grad_norm": 18.468711010471285, + "kl": 0.2294921875, + "learning_rate": 4.7678289819519884e-07, + "loss": -0.015, + "reward": 1.2599871158599854, + "reward_std": 0.18454702198505402, + "rewards/accuracy_reward_stage2": 0.43186211585998535, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2987 + }, + { + "completion_length": 7.84375, + "epoch": 0.5235675486244963, + "grad_norm": 17.931504296092047, + "kl": 0.162109375, + "learning_rate": 4.766076747853513e-07, + "loss": -0.0675, + "reward": 1.6289443969726562, + "reward_std": 0.29932835698127747, + "rewards/accuracy_reward_stage2": 0.6758193969726562, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2988 + }, + { + "completion_length": 9.859375, + "epoch": 0.5237427720343438, + "grad_norm": 17.4591338186714, + "kl": 0.0810546875, + "learning_rate": 4.7643245137550377e-07, + "loss": -0.0059, + "reward": 1.6931253671646118, + "reward_std": 0.21513822674751282, + "rewards/accuracy_reward_stage2": 0.7087503671646118, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2989 + }, + { + "completion_length": 11.5625, + "epoch": 0.5239179954441914, + "grad_norm": 19.233603189651088, + "kl": 0.17578125, + "learning_rate": 4.762572279656562e-07, + "loss": -0.0478, + "reward": 1.3866586685180664, + "reward_std": 0.22251750528812408, + "rewards/accuracy_reward_stage2": 0.5585336685180664, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2990 + }, + { + "completion_length": 11.890625, + "epoch": 0.5240932188540389, + "grad_norm": 16.066061189933652, + "kl": 0.083984375, + "learning_rate": 4.7608200455580865e-07, + "loss": 0.0335, + "reward": 1.5238542556762695, + "reward_std": 0.22788162529468536, + "rewards/accuracy_reward_stage2": 0.5238542556762695, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2991 + }, + { + "completion_length": 9.9375, + "epoch": 0.5242684422638865, + "grad_norm": 18.861312346598808, + "kl": 0.1728515625, + "learning_rate": 4.7590678114596104e-07, + "loss": 0.0032, + "reward": 1.3086049556732178, + "reward_std": 0.33977043628692627, + "rewards/accuracy_reward_stage2": 0.3398548364639282, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2992 + }, + { + "completion_length": 9.125, + "epoch": 0.524443665673734, + "grad_norm": 17.42928363573299, + "kl": 0.287109375, + "learning_rate": 4.7573155773611353e-07, + "loss": 0.0014, + "reward": 1.328125, + "reward_std": 0.25688543915748596, + "rewards/accuracy_reward_stage2": 0.5, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2993 + }, + { + "completion_length": 7.40625, + "epoch": 0.5246188890835816, + "grad_norm": 27.018808557675843, + "kl": 0.197265625, + "learning_rate": 4.7555633432626597e-07, + "loss": 0.0057, + "reward": 1.4005868434906006, + "reward_std": 0.3192916214466095, + "rewards/accuracy_reward_stage2": 0.5568368434906006, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2994 + }, + { + "completion_length": 9.46875, + "epoch": 0.5247941124934291, + "grad_norm": 11.750319884502725, + "kl": 0.1630859375, + "learning_rate": 4.753811109164184e-07, + "loss": 0.0002, + "reward": 1.4829835891723633, + "reward_std": 0.14890187978744507, + "rewards/accuracy_reward_stage2": 0.5142335295677185, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2995 + }, + { + "completion_length": 14.453125, + "epoch": 0.5249693359032767, + "grad_norm": 15.66119969286439, + "kl": 0.1611328125, + "learning_rate": 4.7520588750657085e-07, + "loss": -0.066, + "reward": 1.5467138290405273, + "reward_std": 0.2760947048664093, + "rewards/accuracy_reward_stage2": 0.5935889482498169, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2996 + }, + { + "completion_length": 11.859375, + "epoch": 0.5251445593131242, + "grad_norm": 9.586425716781799, + "kl": 0.07470703125, + "learning_rate": 4.7503066409672334e-07, + "loss": -0.0144, + "reward": 1.40625, + "reward_std": 0.10888782143592834, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2997 + }, + { + "completion_length": 11.140625, + "epoch": 0.5253197827229718, + "grad_norm": 18.620989828713803, + "kl": 0.09423828125, + "learning_rate": 4.7485544068687573e-07, + "loss": 0.0023, + "reward": 1.5689308643341064, + "reward_std": 0.26974961161613464, + "rewards/accuracy_reward_stage2": 0.5845559239387512, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2998 + }, + { + "completion_length": 6.46875, + "epoch": 0.5254950061328193, + "grad_norm": 16.402643244855273, + "kl": 0.0849609375, + "learning_rate": 4.7468021727702817e-07, + "loss": -0.0099, + "reward": 1.3922981023788452, + "reward_std": 0.1058889701962471, + "rewards/accuracy_reward_stage2": 0.5329231023788452, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2999 + }, + { + "completion_length": 12.15625, + "epoch": 0.525670229542667, + "grad_norm": 19.33626292725619, + "kl": 0.03564453125, + "learning_rate": 4.745049938671806e-07, + "loss": 0.0143, + "reward": 1.5242962837219238, + "reward_std": 0.12696348130702972, + "rewards/accuracy_reward_stage2": 0.5242962837219238, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3000 + }, + { + "completion_length": 10.125, + "epoch": 0.5258454529525145, + "grad_norm": 19.847748663580717, + "kl": 0.314453125, + "learning_rate": 4.743297704573331e-07, + "loss": 0.0834, + "reward": 1.1907129287719727, + "reward_std": 0.23560258746147156, + "rewards/accuracy_reward_stage2": 0.4563378691673279, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3001 + }, + { + "completion_length": 17.15625, + "epoch": 0.526020676362362, + "grad_norm": 20.586289372649574, + "kl": 0.171875, + "learning_rate": 4.7415454704748554e-07, + "loss": -0.0648, + "reward": 1.4292160272598267, + "reward_std": 0.31085318326950073, + "rewards/accuracy_reward_stage2": 0.6167160272598267, + "rewards/format_reward_stage1_pointerpad": 0.8125, + "scores/accuracy_reward_stage2": 0.8125, + "step": 3002 + }, + { + "completion_length": 12.4375, + "epoch": 0.5261958997722096, + "grad_norm": 17.106269980841894, + "kl": 0.119140625, + "learning_rate": 4.73979323637638e-07, + "loss": -0.0152, + "reward": 1.213785171508789, + "reward_std": 0.33467233180999756, + "rewards/accuracy_reward_stage2": 0.37003517150878906, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3003 + }, + { + "completion_length": 11.234375, + "epoch": 0.5263711231820571, + "grad_norm": 17.56565559200859, + "kl": 0.062255859375, + "learning_rate": 4.738041002277904e-07, + "loss": -0.0091, + "reward": 1.3451817035675049, + "reward_std": 0.22466593980789185, + "rewards/accuracy_reward_stage2": 0.4858066439628601, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3004 + }, + { + "completion_length": 7.21875, + "epoch": 0.5265463465919047, + "grad_norm": 12.920353465243306, + "kl": 0.1044921875, + "learning_rate": 4.7362887681794286e-07, + "loss": 0.0178, + "reward": 1.4469187259674072, + "reward_std": 0.21581503748893738, + "rewards/accuracy_reward_stage2": 0.46254366636276245, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3005 + }, + { + "completion_length": 8.453125, + "epoch": 0.5267215700017522, + "grad_norm": 14.367077784056798, + "kl": 0.2216796875, + "learning_rate": 4.734536534080953e-07, + "loss": -0.0588, + "reward": 1.2829350233078003, + "reward_std": 0.2190045267343521, + "rewards/accuracy_reward_stage2": 0.4704349935054779, + "rewards/format_reward_stage1_pointerpad": 0.8125, + "scores/accuracy_reward_stage2": 0.8125, + "step": 3006 + }, + { + "completion_length": 10.375, + "epoch": 0.5268967934115998, + "grad_norm": 19.523796969621742, + "kl": 0.302734375, + "learning_rate": 4.7327842999824774e-07, + "loss": 0.033, + "reward": 1.7166911363601685, + "reward_std": 0.27443280816078186, + "rewards/accuracy_reward_stage2": 0.7479411363601685, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3007 + }, + { + "completion_length": 11.671875, + "epoch": 0.5270720168214473, + "grad_norm": 28.468412075135017, + "kl": 0.2734375, + "learning_rate": 4.731032065884002e-07, + "loss": 0.0847, + "reward": 1.6269659996032715, + "reward_std": 0.17851251363754272, + "rewards/accuracy_reward_stage2": 0.767591118812561, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3008 + }, + { + "completion_length": 12.6875, + "epoch": 0.5272472402312949, + "grad_norm": 15.867000520209041, + "kl": 0.1923828125, + "learning_rate": 4.7292798317855267e-07, + "loss": 0.0393, + "reward": 1.3693108558654785, + "reward_std": 0.18513169884681702, + "rewards/accuracy_reward_stage2": 0.6349357962608337, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3009 + }, + { + "completion_length": 6.171875, + "epoch": 0.5274224636411424, + "grad_norm": 12.409857153191926, + "kl": 0.041015625, + "learning_rate": 4.727527597687051e-07, + "loss": 0.0165, + "reward": 1.7259947061538696, + "reward_std": 0.11311056464910507, + "rewards/accuracy_reward_stage2": 0.7259947061538696, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3010 + }, + { + "completion_length": 10.984375, + "epoch": 0.52759768705099, + "grad_norm": 20.049126613210415, + "kl": 0.103515625, + "learning_rate": 4.725775363588575e-07, + "loss": -0.0027, + "reward": 1.412109136581421, + "reward_std": 0.29860153794288635, + "rewards/accuracy_reward_stage2": 0.4277341663837433, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3011 + }, + { + "completion_length": 10.703125, + "epoch": 0.5277729104608375, + "grad_norm": 16.696484961038596, + "kl": 0.2158203125, + "learning_rate": 4.7240231294900993e-07, + "loss": -0.0326, + "reward": 1.3450486660003662, + "reward_std": 0.2671396732330322, + "rewards/accuracy_reward_stage2": 0.5169236660003662, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3012 + }, + { + "completion_length": 8.875, + "epoch": 0.5279481338706852, + "grad_norm": 14.689130441389223, + "kl": 0.2265625, + "learning_rate": 4.7222708953916243e-07, + "loss": 0.0248, + "reward": 1.6728073358535767, + "reward_std": 0.25592559576034546, + "rewards/accuracy_reward_stage2": 0.7040572762489319, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3013 + }, + { + "completion_length": 11.609375, + "epoch": 0.5281233572805327, + "grad_norm": 15.071699015429848, + "kl": 0.1845703125, + "learning_rate": 4.7205186612931487e-07, + "loss": -0.0164, + "reward": 1.5697612762451172, + "reward_std": 0.19837264716625214, + "rewards/accuracy_reward_stage2": 0.6166362762451172, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3014 + }, + { + "completion_length": 6.59375, + "epoch": 0.5282985806903803, + "grad_norm": 15.538520030910789, + "kl": 0.103515625, + "learning_rate": 4.718766427194673e-07, + "loss": -0.0027, + "reward": 1.6166914701461792, + "reward_std": 0.2432194948196411, + "rewards/accuracy_reward_stage2": 0.6323164701461792, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3015 + }, + { + "completion_length": 17.875, + "epoch": 0.5284738041002278, + "grad_norm": 17.517113919901284, + "kl": 0.11376953125, + "learning_rate": 4.7170141930961975e-07, + "loss": -0.0641, + "reward": 1.3405852317810059, + "reward_std": 0.17777365446090698, + "rewards/accuracy_reward_stage2": 0.5124603509902954, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3016 + }, + { + "completion_length": 13.578125, + "epoch": 0.5286490275100754, + "grad_norm": 16.46059554496943, + "kl": 0.265625, + "learning_rate": 4.715261958997722e-07, + "loss": 0.1058, + "reward": 1.5327612161636353, + "reward_std": 0.16156886518001556, + "rewards/accuracy_reward_stage2": 0.6577612161636353, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3017 + }, + { + "completion_length": 10.03125, + "epoch": 0.5288242509199229, + "grad_norm": 19.988363017500177, + "kl": 0.091796875, + "learning_rate": 4.713509724899246e-07, + "loss": 0.0367, + "reward": 1.6378380060195923, + "reward_std": 0.21499964594841003, + "rewards/accuracy_reward_stage2": 0.7628380060195923, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3018 + }, + { + "completion_length": 10.78125, + "epoch": 0.5289994743297705, + "grad_norm": 18.54168827454856, + "kl": 0.15234375, + "learning_rate": 4.7117574908007706e-07, + "loss": -0.0272, + "reward": 1.715771198272705, + "reward_std": 0.3070124387741089, + "rewards/accuracy_reward_stage2": 0.7470211982727051, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3019 + }, + { + "completion_length": 8.40625, + "epoch": 0.529174697739618, + "grad_norm": 16.69030502399357, + "kl": 0.2236328125, + "learning_rate": 4.710005256702295e-07, + "loss": -0.0819, + "reward": 1.4025707244873047, + "reward_std": 0.2415388822555542, + "rewards/accuracy_reward_stage2": 0.48069584369659424, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3020 + }, + { + "completion_length": 6.234375, + "epoch": 0.5293499211494656, + "grad_norm": 6.533390983035657, + "kl": 0.02685546875, + "learning_rate": 4.70825302260382e-07, + "loss": 0.0107, + "reward": 1.5693737268447876, + "reward_std": 0.03983701765537262, + "rewards/accuracy_reward_stage2": 0.5693736672401428, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3021 + }, + { + "completion_length": 10.171875, + "epoch": 0.5295251445593131, + "grad_norm": 17.603432652423468, + "kl": 0.09814453125, + "learning_rate": 4.7065007885053444e-07, + "loss": -0.0011, + "reward": 1.7323403358459473, + "reward_std": 0.3000224530696869, + "rewards/accuracy_reward_stage2": 0.7479652762413025, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3022 + }, + { + "completion_length": 12.703125, + "epoch": 0.5297003679691606, + "grad_norm": 20.196880943398675, + "kl": 0.1376953125, + "learning_rate": 4.704748554406869e-07, + "loss": 0.0178, + "reward": 1.5424933433532715, + "reward_std": 0.24900223314762115, + "rewards/accuracy_reward_stage2": 0.5581183433532715, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3023 + }, + { + "completion_length": 10.0625, + "epoch": 0.5298755913790082, + "grad_norm": 25.70022775673109, + "kl": 0.158203125, + "learning_rate": 4.7029963203083926e-07, + "loss": -0.0026, + "reward": 1.3462114334106445, + "reward_std": 0.270582914352417, + "rewards/accuracy_reward_stage2": 0.6274613738059998, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3024 + }, + { + "completion_length": 6.4375, + "epoch": 0.5300508147888557, + "grad_norm": 31.517862165800544, + "kl": 0.162109375, + "learning_rate": 4.7012440862099176e-07, + "loss": -0.0235, + "reward": 1.5729167461395264, + "reward_std": 0.31406548619270325, + "rewards/accuracy_reward_stage2": 0.6041666269302368, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3025 + }, + { + "completion_length": 12.375, + "epoch": 0.5302260381987034, + "grad_norm": 19.99666059416602, + "kl": 0.2421875, + "learning_rate": 4.699491852111442e-07, + "loss": -0.0321, + "reward": 1.5244319438934326, + "reward_std": 0.30928748846054077, + "rewards/accuracy_reward_stage2": 0.6963070034980774, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3026 + }, + { + "completion_length": 11.28125, + "epoch": 0.530401261608551, + "grad_norm": 20.72342344452126, + "kl": 0.1943359375, + "learning_rate": 4.6977396180129663e-07, + "loss": -0.0103, + "reward": 1.847987413406372, + "reward_std": 0.2692791819572449, + "rewards/accuracy_reward_stage2": 0.8792373538017273, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3027 + }, + { + "completion_length": 9.171875, + "epoch": 0.5305764850183985, + "grad_norm": 18.269406103081074, + "kl": 0.06640625, + "learning_rate": 4.695987383914491e-07, + "loss": 0.0265, + "reward": 1.4410545825958252, + "reward_std": 0.12092936038970947, + "rewards/accuracy_reward_stage2": 0.4410546123981476, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3028 + }, + { + "completion_length": 16.828125, + "epoch": 0.530751708428246, + "grad_norm": 24.097913134980068, + "kl": 0.251953125, + "learning_rate": 4.6942351498160157e-07, + "loss": 0.0468, + "reward": 1.2747653722763062, + "reward_std": 0.22782419621944427, + "rewards/accuracy_reward_stage2": 0.30601537227630615, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3029 + }, + { + "completion_length": 9.8125, + "epoch": 0.5309269318380936, + "grad_norm": 15.977143548134446, + "kl": 0.05810546875, + "learning_rate": 4.6924829157175395e-07, + "loss": 0.0232, + "reward": 1.7499685287475586, + "reward_std": 0.2890286445617676, + "rewards/accuracy_reward_stage2": 0.7499684691429138, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3030 + }, + { + "completion_length": 10.9375, + "epoch": 0.5311021552479411, + "grad_norm": 17.89330528689539, + "kl": 0.18359375, + "learning_rate": 4.690730681619064e-07, + "loss": 0.0112, + "reward": 1.570418357849121, + "reward_std": 0.1976543366909027, + "rewards/accuracy_reward_stage2": 0.6016684770584106, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3031 + }, + { + "completion_length": 9.09375, + "epoch": 0.5312773786577887, + "grad_norm": 18.17287625511143, + "kl": 0.0196533203125, + "learning_rate": 4.6889784475205883e-07, + "loss": 0.0079, + "reward": 1.1605315208435059, + "reward_std": 0.210140198469162, + "rewards/accuracy_reward_stage2": 0.28553152084350586, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3032 + }, + { + "completion_length": 10.421875, + "epoch": 0.5314526020676362, + "grad_norm": 20.40117868533523, + "kl": 0.125, + "learning_rate": 4.687226213422113e-07, + "loss": 0.0501, + "reward": 1.4888815879821777, + "reward_std": 0.17244793474674225, + "rewards/accuracy_reward_stage2": 0.4888816177845001, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3033 + }, + { + "completion_length": 10.609375, + "epoch": 0.5316278254774838, + "grad_norm": 25.24115334011376, + "kl": 0.2216796875, + "learning_rate": 4.6854739793236377e-07, + "loss": 0.0879, + "reward": 1.611750841140747, + "reward_std": 0.2850770652294159, + "rewards/accuracy_reward_stage2": 0.7367508411407471, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3034 + }, + { + "completion_length": 11.203125, + "epoch": 0.5318030488873313, + "grad_norm": 16.12155695115901, + "kl": 0.0419921875, + "learning_rate": 4.683721745225162e-07, + "loss": 0.0168, + "reward": 1.5541150569915771, + "reward_std": 0.16996827721595764, + "rewards/accuracy_reward_stage2": 0.5541150569915771, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3035 + }, + { + "completion_length": 7.328125, + "epoch": 0.5319782722971789, + "grad_norm": 12.940864496539184, + "kl": 0.06591796875, + "learning_rate": 4.6819695111266864e-07, + "loss": 0.002, + "reward": 1.6956827640533447, + "reward_std": 0.14037351310253143, + "rewards/accuracy_reward_stage2": 0.7113077640533447, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3036 + }, + { + "completion_length": 11.65625, + "epoch": 0.5321534957070264, + "grad_norm": 19.100056712000704, + "kl": 0.208984375, + "learning_rate": 4.6802172770282103e-07, + "loss": 0.0835, + "reward": 1.3029202222824097, + "reward_std": 0.17705127596855164, + "rewards/accuracy_reward_stage2": 0.5529202222824097, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3037 + }, + { + "completion_length": 9.53125, + "epoch": 0.532328719116874, + "grad_norm": 25.0008873405925, + "kl": 0.1201171875, + "learning_rate": 4.678465042929735e-07, + "loss": 0.0109, + "reward": 1.6256303787231445, + "reward_std": 0.26126065850257874, + "rewards/accuracy_reward_stage2": 0.6412553787231445, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3038 + }, + { + "completion_length": 16.171875, + "epoch": 0.5325039425267216, + "grad_norm": 15.480860684235363, + "kl": 0.1044921875, + "learning_rate": 4.6767128088312596e-07, + "loss": -0.0026, + "reward": 1.7870838642120361, + "reward_std": 0.15268449485301971, + "rewards/accuracy_reward_stage2": 0.8027088642120361, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3039 + }, + { + "completion_length": 13.625, + "epoch": 0.5326791659365692, + "grad_norm": 17.521574298852897, + "kl": 0.203125, + "learning_rate": 4.674960574732784e-07, + "loss": 0.0426, + "reward": 1.1582777500152588, + "reward_std": 0.19577035307884216, + "rewards/accuracy_reward_stage2": 0.4395277798175812, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3040 + }, + { + "completion_length": 10.421875, + "epoch": 0.5328543893464167, + "grad_norm": 15.826143042438844, + "kl": 0.11767578125, + "learning_rate": 4.6732083406343084e-07, + "loss": -0.0206, + "reward": 1.6439828872680664, + "reward_std": 0.2372094690799713, + "rewards/accuracy_reward_stage2": 0.6752328872680664, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3041 + }, + { + "completion_length": 7.28125, + "epoch": 0.5330296127562643, + "grad_norm": 14.504095571093211, + "kl": 0.0693359375, + "learning_rate": 4.6714561065358334e-07, + "loss": 0.0276, + "reward": 1.3369925022125244, + "reward_std": 0.16244575381278992, + "rewards/accuracy_reward_stage2": 0.4619925022125244, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3042 + }, + { + "completion_length": 12.234375, + "epoch": 0.5332048361661118, + "grad_norm": 18.798109631342566, + "kl": 0.0927734375, + "learning_rate": 4.669703872437357e-07, + "loss": -0.0071, + "reward": 1.4485533237457275, + "reward_std": 0.22703000903129578, + "rewards/accuracy_reward_stage2": 0.4641782343387604, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3043 + }, + { + "completion_length": 11.875, + "epoch": 0.5333800595759594, + "grad_norm": 16.32852779302221, + "kl": 0.09912109375, + "learning_rate": 4.6679516383388816e-07, + "loss": 0.0186, + "reward": 1.5693191289901733, + "reward_std": 0.1421196162700653, + "rewards/accuracy_reward_stage2": 0.5849441289901733, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3044 + }, + { + "completion_length": 5.9375, + "epoch": 0.5335552829858069, + "grad_norm": 16.792635822140806, + "kl": 0.15625, + "learning_rate": 4.666199404240406e-07, + "loss": 0.0624, + "reward": 1.726854681968689, + "reward_std": 0.17972619831562042, + "rewards/accuracy_reward_stage2": 0.726854681968689, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3045 + }, + { + "completion_length": 27.21875, + "epoch": 0.5337305063956544, + "grad_norm": 18.453746951075754, + "kl": 0.103515625, + "learning_rate": 4.664447170141931e-07, + "loss": 0.0045, + "reward": 1.5340856313705444, + "reward_std": 0.17024879157543182, + "rewards/accuracy_reward_stage2": 0.5497106313705444, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3046 + }, + { + "completion_length": 15.609375, + "epoch": 0.533905729805502, + "grad_norm": 13.499514206736817, + "kl": 0.01263427734375, + "learning_rate": 4.6626949360434553e-07, + "loss": 0.0051, + "reward": 1.714925765991211, + "reward_std": 0.09920510649681091, + "rewards/accuracy_reward_stage2": 0.7149257063865662, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3047 + }, + { + "completion_length": 8.5625, + "epoch": 0.5340809532153495, + "grad_norm": 25.70082296105864, + "kl": 0.060791015625, + "learning_rate": 4.6609427019449797e-07, + "loss": 0.0243, + "reward": 1.793139934539795, + "reward_std": 0.22477105259895325, + "rewards/accuracy_reward_stage2": 0.7931399345397949, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3048 + }, + { + "completion_length": 8.609375, + "epoch": 0.5342561766251971, + "grad_norm": 17.500307037883275, + "kl": 0.43359375, + "learning_rate": 4.6591904678465036e-07, + "loss": 0.1737, + "reward": 1.3654680252075195, + "reward_std": 0.1928379386663437, + "rewards/accuracy_reward_stage2": 0.6154680252075195, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3049 + }, + { + "completion_length": 17.328125, + "epoch": 0.5344314000350446, + "grad_norm": 17.814760335668577, + "kl": 0.1748046875, + "learning_rate": 4.6574382337480285e-07, + "loss": 0.0375, + "reward": 1.3082983493804932, + "reward_std": 0.2186213731765747, + "rewards/accuracy_reward_stage2": 0.4489234387874603, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3050 + }, + { + "completion_length": 9.3125, + "epoch": 0.5346066234448923, + "grad_norm": 21.43077166996965, + "kl": 0.091796875, + "learning_rate": 4.655685999649553e-07, + "loss": 0.0367, + "reward": 1.7873187065124512, + "reward_std": 0.14868119359016418, + "rewards/accuracy_reward_stage2": 0.7873187065124512, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3051 + }, + { + "completion_length": 10.734375, + "epoch": 0.5347818468547398, + "grad_norm": 20.775333986536143, + "kl": 0.1474609375, + "learning_rate": 4.6539337655510773e-07, + "loss": 0.0148, + "reward": 1.6244642734527588, + "reward_std": 0.2634267210960388, + "rewards/accuracy_reward_stage2": 0.640089213848114, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3052 + }, + { + "completion_length": 10.5, + "epoch": 0.5349570702645874, + "grad_norm": 22.625466011199897, + "kl": 0.0810546875, + "learning_rate": 4.6521815314526017e-07, + "loss": 0.0324, + "reward": 1.4285982847213745, + "reward_std": 0.2603178024291992, + "rewards/accuracy_reward_stage2": 0.5535982251167297, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3053 + }, + { + "completion_length": 13.234375, + "epoch": 0.5351322936744349, + "grad_norm": 19.153618135380867, + "kl": 0.330078125, + "learning_rate": 4.6504292973541266e-07, + "loss": -0.0254, + "reward": 1.3712303638458252, + "reward_std": 0.3846546709537506, + "rewards/accuracy_reward_stage2": 0.5743553638458252, + "rewards/format_reward_stage1_pointerpad": 0.796875, + "scores/accuracy_reward_stage2": 0.796875, + "step": 3054 + }, + { + "completion_length": 9.15625, + "epoch": 0.5353075170842825, + "grad_norm": 15.593493994700411, + "kl": 0.07421875, + "learning_rate": 4.648677063255651e-07, + "loss": -0.0146, + "reward": 1.8773478269577026, + "reward_std": 0.14944106340408325, + "rewards/accuracy_reward_stage2": 0.8929727673530579, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3055 + }, + { + "completion_length": 6.3125, + "epoch": 0.53548274049413, + "grad_norm": 19.66066039095581, + "kl": 0.1513671875, + "learning_rate": 4.646924829157175e-07, + "loss": 0.0363, + "reward": 1.5343431234359741, + "reward_std": 0.19723013043403625, + "rewards/accuracy_reward_stage2": 0.6593431234359741, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3056 + }, + { + "completion_length": 6.78125, + "epoch": 0.5356579639039776, + "grad_norm": 21.526714673787414, + "kl": 0.158203125, + "learning_rate": 4.6451725950586993e-07, + "loss": 0.0342, + "reward": 1.617117166519165, + "reward_std": 0.26397401094436646, + "rewards/accuracy_reward_stage2": 0.7577422261238098, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3057 + }, + { + "completion_length": 15.265625, + "epoch": 0.5358331873138251, + "grad_norm": 12.15510432850708, + "kl": 0.17578125, + "learning_rate": 4.643420360960224e-07, + "loss": -0.0135, + "reward": 1.327011227607727, + "reward_std": 0.17525216937065125, + "rewards/accuracy_reward_stage2": 0.4832611382007599, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3058 + }, + { + "completion_length": 19.828125, + "epoch": 0.5360084107236727, + "grad_norm": 18.80841738380306, + "kl": 0.06787109375, + "learning_rate": 4.6416681268617486e-07, + "loss": -0.017, + "reward": 1.5321886539459229, + "reward_std": 0.249672532081604, + "rewards/accuracy_reward_stage2": 0.5478136539459229, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3059 + }, + { + "completion_length": 10.125, + "epoch": 0.5361836341335202, + "grad_norm": 20.60053039795327, + "kl": 0.09228515625, + "learning_rate": 4.639915892763273e-07, + "loss": 0.0369, + "reward": 1.5177383422851562, + "reward_std": 0.1550036072731018, + "rewards/accuracy_reward_stage2": 0.5177382826805115, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3060 + }, + { + "completion_length": 7.609375, + "epoch": 0.5363588575433678, + "grad_norm": 15.038847005498408, + "kl": 0.016845703125, + "learning_rate": 4.6381636586647974e-07, + "loss": 0.0068, + "reward": 1.6498210430145264, + "reward_std": 0.13509923219680786, + "rewards/accuracy_reward_stage2": 0.8998209834098816, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3061 + }, + { + "completion_length": 14.015625, + "epoch": 0.5365340809532153, + "grad_norm": 9.436819961252333, + "kl": 0.072265625, + "learning_rate": 4.636411424566322e-07, + "loss": -0.0054, + "reward": 1.6160824298858643, + "reward_std": 0.11158134788274765, + "rewards/accuracy_reward_stage2": 0.6317073702812195, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3062 + }, + { + "completion_length": 13.4375, + "epoch": 0.5367093043630629, + "grad_norm": 17.744512185558786, + "kl": 0.0908203125, + "learning_rate": 4.634659190467846e-07, + "loss": -0.0078, + "reward": 1.62375807762146, + "reward_std": 0.17992845177650452, + "rewards/accuracy_reward_stage2": 0.6393829584121704, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3063 + }, + { + "completion_length": 12.109375, + "epoch": 0.5368845277729105, + "grad_norm": 19.430148054219906, + "kl": 0.087890625, + "learning_rate": 4.6329069563693706e-07, + "loss": 0.0351, + "reward": 1.6683220863342285, + "reward_std": 0.2081729918718338, + "rewards/accuracy_reward_stage2": 0.6683220863342285, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3064 + }, + { + "completion_length": 14.953125, + "epoch": 0.5370597511827581, + "grad_norm": 14.130279849007, + "kl": 0.08251953125, + "learning_rate": 4.631154722270895e-07, + "loss": -0.0112, + "reward": 1.2821969985961914, + "reward_std": 0.17817632853984833, + "rewards/accuracy_reward_stage2": 0.4228219985961914, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3065 + }, + { + "completion_length": 11.921875, + "epoch": 0.5372349745926056, + "grad_norm": 16.998675878434767, + "kl": 0.119140625, + "learning_rate": 4.62940248817242e-07, + "loss": -0.0279, + "reward": 1.482663631439209, + "reward_std": 0.3255109190940857, + "rewards/accuracy_reward_stage2": 0.5139136910438538, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3066 + }, + { + "completion_length": 8.71875, + "epoch": 0.5374101980024532, + "grad_norm": 16.177190918806446, + "kl": 0.109375, + "learning_rate": 4.6276502540739443e-07, + "loss": 0.0048, + "reward": 1.3549516201019287, + "reward_std": 0.2306400090456009, + "rewards/accuracy_reward_stage2": 0.3705766797065735, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3067 + }, + { + "completion_length": 8.9375, + "epoch": 0.5375854214123007, + "grad_norm": 27.309356310930397, + "kl": 0.271484375, + "learning_rate": 4.625898019975468e-07, + "loss": -0.0098, + "reward": 1.3389118909835815, + "reward_std": 0.4090281128883362, + "rewards/accuracy_reward_stage2": 0.5107868313789368, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3068 + }, + { + "completion_length": 8.8125, + "epoch": 0.5377606448221482, + "grad_norm": 19.312543083938817, + "kl": 0.0771484375, + "learning_rate": 4.6241457858769926e-07, + "loss": 0.0309, + "reward": 1.7708431482315063, + "reward_std": 0.20213577151298523, + "rewards/accuracy_reward_stage2": 0.7708431482315063, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3069 + }, + { + "completion_length": 8.84375, + "epoch": 0.5379358682319958, + "grad_norm": 19.962249196207807, + "kl": 0.1640625, + "learning_rate": 4.6223935517785175e-07, + "loss": 0.0215, + "reward": 1.5813571214675903, + "reward_std": 0.25529611110687256, + "rewards/accuracy_reward_stage2": 0.5969820618629456, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3070 + }, + { + "completion_length": 8.515625, + "epoch": 0.5381110916418433, + "grad_norm": 15.90380238115599, + "kl": 0.0517578125, + "learning_rate": 4.620641317680042e-07, + "loss": 0.0207, + "reward": 1.7377853393554688, + "reward_std": 0.07723461091518402, + "rewards/accuracy_reward_stage2": 0.7377853393554688, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3071 + }, + { + "completion_length": 12.609375, + "epoch": 0.5382863150516909, + "grad_norm": 16.115969346139185, + "kl": 0.08056640625, + "learning_rate": 4.6188890835815663e-07, + "loss": -0.0343, + "reward": 1.4710911512374878, + "reward_std": 0.17701643705368042, + "rewards/accuracy_reward_stage2": 0.5023411512374878, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3072 + }, + { + "completion_length": 12.359375, + "epoch": 0.5384615384615384, + "grad_norm": 17.9484435026702, + "kl": 0.115234375, + "learning_rate": 4.6171368494830907e-07, + "loss": 0.0171, + "reward": 1.3882776498794556, + "reward_std": 0.29357674717903137, + "rewards/accuracy_reward_stage2": 0.40390264987945557, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3073 + }, + { + "completion_length": 17.203125, + "epoch": 0.538636761871386, + "grad_norm": 16.662349153141072, + "kl": 0.040771484375, + "learning_rate": 4.6153846153846156e-07, + "loss": 0.0164, + "reward": 1.5608493089675903, + "reward_std": 0.09860756993293762, + "rewards/accuracy_reward_stage2": 0.5608493685722351, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3074 + }, + { + "completion_length": 9.59375, + "epoch": 0.5388119852812335, + "grad_norm": 18.106733037371338, + "kl": 0.0703125, + "learning_rate": 4.6136323812861395e-07, + "loss": -0.0106, + "reward": 1.6378639936447144, + "reward_std": 0.27184975147247314, + "rewards/accuracy_reward_stage2": 0.6534889936447144, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3075 + }, + { + "completion_length": 8.3125, + "epoch": 0.5389872086910811, + "grad_norm": 21.56281782764618, + "kl": 0.267578125, + "learning_rate": 4.611880147187664e-07, + "loss": 0.0521, + "reward": 1.5611025094985962, + "reward_std": 0.35041025280952454, + "rewards/accuracy_reward_stage2": 0.717352569103241, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3076 + }, + { + "completion_length": 12.359375, + "epoch": 0.5391624321009287, + "grad_norm": 32.14585568367615, + "kl": 0.05517578125, + "learning_rate": 4.6101279130891883e-07, + "loss": 0.022, + "reward": 1.4294856786727905, + "reward_std": 0.29953643679618835, + "rewards/accuracy_reward_stage2": 0.4294856786727905, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3077 + }, + { + "completion_length": 16.875, + "epoch": 0.5393376555107763, + "grad_norm": 20.091125573660317, + "kl": 0.138671875, + "learning_rate": 4.608375678990713e-07, + "loss": 0.0112, + "reward": 1.5007598400115967, + "reward_std": 0.18287548422813416, + "rewards/accuracy_reward_stage2": 0.5163848996162415, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3078 + }, + { + "completion_length": 10.59375, + "epoch": 0.5395128789206238, + "grad_norm": 16.831158308135656, + "kl": 0.03369140625, + "learning_rate": 4.6066234448922376e-07, + "loss": -0.0307, + "reward": 1.6649608612060547, + "reward_std": 0.13924731314182281, + "rewards/accuracy_reward_stage2": 0.6805858612060547, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3079 + }, + { + "completion_length": 9.109375, + "epoch": 0.5396881023304714, + "grad_norm": 16.293936210247495, + "kl": 0.06884765625, + "learning_rate": 4.604871210793762e-07, + "loss": 0.0275, + "reward": 1.6581212282180786, + "reward_std": 0.10700437426567078, + "rewards/accuracy_reward_stage2": 0.6581212282180786, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3080 + }, + { + "completion_length": 12.046875, + "epoch": 0.5398633257403189, + "grad_norm": 16.85665873852233, + "kl": 0.1640625, + "learning_rate": 4.603118976695286e-07, + "loss": -0.0539, + "reward": 1.603559136390686, + "reward_std": 0.2619991898536682, + "rewards/accuracy_reward_stage2": 0.6504341959953308, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3081 + }, + { + "completion_length": 16.71875, + "epoch": 0.5400385491501665, + "grad_norm": 18.200123926002398, + "kl": 0.07666015625, + "learning_rate": 4.601366742596811e-07, + "loss": 0.0022, + "reward": 1.6544448137283325, + "reward_std": 0.13603055477142334, + "rewards/accuracy_reward_stage2": 0.6700698733329773, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3082 + }, + { + "completion_length": 8.578125, + "epoch": 0.540213772560014, + "grad_norm": 19.496810478238686, + "kl": 0.134765625, + "learning_rate": 4.599614508498335e-07, + "loss": 0.0537, + "reward": 1.5489877462387085, + "reward_std": 0.1919880509376526, + "rewards/accuracy_reward_stage2": 0.5489877462387085, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3083 + }, + { + "completion_length": 10.015625, + "epoch": 0.5403889959698616, + "grad_norm": 17.58848253241211, + "kl": 0.234375, + "learning_rate": 4.5978622743998596e-07, + "loss": -0.0312, + "reward": 1.385695457458496, + "reward_std": 0.14936351776123047, + "rewards/accuracy_reward_stage2": 0.4481954276561737, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3084 + }, + { + "completion_length": 13.8125, + "epoch": 0.5405642193797091, + "grad_norm": 21.304883569980227, + "kl": 0.25, + "learning_rate": 4.596110040301384e-07, + "loss": -0.0186, + "reward": 1.660407543182373, + "reward_std": 0.3340178430080414, + "rewards/accuracy_reward_stage2": 0.707282543182373, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3085 + }, + { + "completion_length": 10.9375, + "epoch": 0.5407394427895567, + "grad_norm": 24.291576382554858, + "kl": 0.08447265625, + "learning_rate": 4.594357806202909e-07, + "loss": 0.0338, + "reward": 1.649717092514038, + "reward_std": 0.26840633153915405, + "rewards/accuracy_reward_stage2": 0.6497172117233276, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3086 + }, + { + "completion_length": 11.46875, + "epoch": 0.5409146661994042, + "grad_norm": 16.023316583348308, + "kl": 0.181640625, + "learning_rate": 4.592605572104433e-07, + "loss": -0.0577, + "reward": 1.7659709453582764, + "reward_std": 0.2128397822380066, + "rewards/accuracy_reward_stage2": 0.8128460645675659, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3087 + }, + { + "completion_length": 9.796875, + "epoch": 0.5410898896092518, + "grad_norm": 19.777055191335933, + "kl": 0.09228515625, + "learning_rate": 4.590853338005957e-07, + "loss": 0.037, + "reward": 1.6594014167785645, + "reward_std": 0.2414485216140747, + "rewards/accuracy_reward_stage2": 0.6594013571739197, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3088 + }, + { + "completion_length": 10.125, + "epoch": 0.5412651130190993, + "grad_norm": 20.59924841280276, + "kl": 0.169921875, + "learning_rate": 4.5891011039074816e-07, + "loss": 0.0291, + "reward": 1.6514942646026611, + "reward_std": 0.36602866649627686, + "rewards/accuracy_reward_stage2": 0.6671192646026611, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3089 + }, + { + "completion_length": 14.1875, + "epoch": 0.541440336428947, + "grad_norm": 18.931291043266487, + "kl": 0.1044921875, + "learning_rate": 4.5873488698090065e-07, + "loss": -0.0038, + "reward": 1.771875023841858, + "reward_std": 0.2878369688987732, + "rewards/accuracy_reward_stage2": 0.8031250238418579, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3090 + }, + { + "completion_length": 9.71875, + "epoch": 0.5416155598387945, + "grad_norm": 18.664500855954135, + "kl": 0.041748046875, + "learning_rate": 4.585596635710531e-07, + "loss": 0.0167, + "reward": 1.3326388597488403, + "reward_std": 0.19817854464054108, + "rewards/accuracy_reward_stage2": 0.4576388895511627, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3091 + }, + { + "completion_length": 11.5, + "epoch": 0.541790783248642, + "grad_norm": 22.570017573761078, + "kl": 0.1376953125, + "learning_rate": 4.5838444016120553e-07, + "loss": 0.0025, + "reward": 1.433104395866394, + "reward_std": 0.2778227627277374, + "rewards/accuracy_reward_stage2": 0.573729395866394, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3092 + }, + { + "completion_length": 9.53125, + "epoch": 0.5419660066584896, + "grad_norm": 14.12496196439735, + "kl": 0.1748046875, + "learning_rate": 4.5820921675135797e-07, + "loss": -0.0091, + "reward": 1.6820327043533325, + "reward_std": 0.170151025056839, + "rewards/accuracy_reward_stage2": 0.7132827043533325, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3093 + }, + { + "completion_length": 8.359375, + "epoch": 0.5421412300683371, + "grad_norm": 15.999409442411078, + "kl": 0.197265625, + "learning_rate": 4.580339933415104e-07, + "loss": -0.0095, + "reward": 1.5700740814208984, + "reward_std": 0.25437554717063904, + "rewards/accuracy_reward_stage2": 0.6013240814208984, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3094 + }, + { + "completion_length": 9.0625, + "epoch": 0.5423164534781847, + "grad_norm": 51.15226086456877, + "kl": 0.2041015625, + "learning_rate": 4.5785876993166285e-07, + "loss": -0.0015, + "reward": 1.4735863208770752, + "reward_std": 0.2881781756877899, + "rewards/accuracy_reward_stage2": 0.6454613208770752, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3095 + }, + { + "completion_length": 9.65625, + "epoch": 0.5424916768880322, + "grad_norm": 22.2332921253652, + "kl": 0.220703125, + "learning_rate": 4.576835465218153e-07, + "loss": 0.0026, + "reward": 1.5193158388137817, + "reward_std": 0.25890079140663147, + "rewards/accuracy_reward_stage2": 0.5505658388137817, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3096 + }, + { + "completion_length": 26.015625, + "epoch": 0.5426669002978798, + "grad_norm": 22.528868259966305, + "kl": 0.11474609375, + "learning_rate": 4.5750832311196773e-07, + "loss": 0.0017, + "reward": 1.4433059692382812, + "reward_std": 0.19515696167945862, + "rewards/accuracy_reward_stage2": 0.458931028842926, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3097 + }, + { + "completion_length": 12.328125, + "epoch": 0.5428421237077273, + "grad_norm": 17.685266609921687, + "kl": 0.146484375, + "learning_rate": 4.573330997021202e-07, + "loss": 0.0095, + "reward": 1.5171079635620117, + "reward_std": 0.20841091871261597, + "rewards/accuracy_reward_stage2": 0.5483580231666565, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3098 + }, + { + "completion_length": 9.828125, + "epoch": 0.5430173471175749, + "grad_norm": 20.5074041806249, + "kl": 0.06640625, + "learning_rate": 4.5715787629227266e-07, + "loss": 0.0267, + "reward": 1.6943392753601074, + "reward_std": 0.2411803901195526, + "rewards/accuracy_reward_stage2": 0.8193392753601074, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3099 + }, + { + "completion_length": 11.09375, + "epoch": 0.5431925705274224, + "grad_norm": 17.269969614582326, + "kl": 0.2021484375, + "learning_rate": 4.5698265288242505e-07, + "loss": 0.0451, + "reward": 1.4439659118652344, + "reward_std": 0.21747536957263947, + "rewards/accuracy_reward_stage2": 0.5845909118652344, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3100 + }, + { + "completion_length": 27.078125, + "epoch": 0.54336779393727, + "grad_norm": 14.248365215207672, + "kl": 0.08935546875, + "learning_rate": 4.568074294725775e-07, + "loss": -0.0084, + "reward": 1.3167316913604736, + "reward_std": 0.1012648195028305, + "rewards/accuracy_reward_stage2": 0.3323565721511841, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3101 + }, + { + "completion_length": 16.828125, + "epoch": 0.5435430173471176, + "grad_norm": 14.166141616292743, + "kl": 0.05224609375, + "learning_rate": 4.5663220606273e-07, + "loss": -0.0204, + "reward": 1.2051842212677002, + "reward_std": 0.13409823179244995, + "rewards/accuracy_reward_stage2": 0.34580928087234497, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3102 + }, + { + "completion_length": 14.671875, + "epoch": 0.5437182407569652, + "grad_norm": 14.638131377176025, + "kl": 0.09521484375, + "learning_rate": 4.564569826528824e-07, + "loss": 0.0023, + "reward": 1.6929941177368164, + "reward_std": 0.1856401562690735, + "rewards/accuracy_reward_stage2": 0.7086191177368164, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3103 + }, + { + "completion_length": 10.515625, + "epoch": 0.5438934641668127, + "grad_norm": 21.71622363695326, + "kl": 0.1728515625, + "learning_rate": 4.5628175924303486e-07, + "loss": -0.0184, + "reward": 1.7654647827148438, + "reward_std": 0.24956831336021423, + "rewards/accuracy_reward_stage2": 0.7967147827148438, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3104 + }, + { + "completion_length": 11.390625, + "epoch": 0.5440686875766603, + "grad_norm": 20.51723862003228, + "kl": 0.09521484375, + "learning_rate": 4.561065358331873e-07, + "loss": 0.0379, + "reward": 1.5989583730697632, + "reward_std": 0.3667879104614258, + "rewards/accuracy_reward_stage2": 0.5989583730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3105 + }, + { + "completion_length": 10.9375, + "epoch": 0.5442439109865078, + "grad_norm": 23.295959325200524, + "kl": 0.16796875, + "learning_rate": 4.559313124233398e-07, + "loss": -0.0211, + "reward": 1.414116382598877, + "reward_std": 0.3159290850162506, + "rewards/accuracy_reward_stage2": 0.4453664720058441, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3106 + }, + { + "completion_length": 6.703125, + "epoch": 0.5444191343963554, + "grad_norm": 14.653730888336844, + "kl": 0.1142578125, + "learning_rate": 4.557560890134922e-07, + "loss": 0.0458, + "reward": 1.6861011981964111, + "reward_std": 0.1108192503452301, + "rewards/accuracy_reward_stage2": 0.6861011385917664, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3107 + }, + { + "completion_length": 11.484375, + "epoch": 0.5445943578062029, + "grad_norm": 12.71283407481726, + "kl": 0.07568359375, + "learning_rate": 4.555808656036446e-07, + "loss": -0.0138, + "reward": 1.5843769311904907, + "reward_std": 0.11037540435791016, + "rewards/accuracy_reward_stage2": 0.6000019311904907, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3108 + }, + { + "completion_length": 7.15625, + "epoch": 0.5447695812160505, + "grad_norm": 9.632664097158752, + "kl": 0.1015625, + "learning_rate": 4.5540564219379706e-07, + "loss": -0.0036, + "reward": 1.7535353899002075, + "reward_std": 0.11451417207717896, + "rewards/accuracy_reward_stage2": 0.7691603899002075, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3109 + }, + { + "completion_length": 11.484375, + "epoch": 0.544944804625898, + "grad_norm": 18.26844472901338, + "kl": 0.2177734375, + "learning_rate": 4.552304187839495e-07, + "loss": 0.0136, + "reward": 1.50527024269104, + "reward_std": 0.23741815984249115, + "rewards/accuracy_reward_stage2": 0.5365201830863953, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3110 + }, + { + "completion_length": 12.109375, + "epoch": 0.5451200280357456, + "grad_norm": 22.22406598511341, + "kl": 0.126953125, + "learning_rate": 4.55055195374102e-07, + "loss": 0.0069, + "reward": 1.6192907094955444, + "reward_std": 0.22934451699256897, + "rewards/accuracy_reward_stage2": 0.6349157691001892, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3111 + }, + { + "completion_length": 10.40625, + "epoch": 0.5452952514455931, + "grad_norm": 19.839667418496607, + "kl": 0.1494140625, + "learning_rate": 4.5487997196425443e-07, + "loss": 0.0197, + "reward": 1.7109891176223755, + "reward_std": 0.15221619606018066, + "rewards/accuracy_reward_stage2": 0.7266141772270203, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3112 + }, + { + "completion_length": 11.21875, + "epoch": 0.5454704748554406, + "grad_norm": 30.394901531165882, + "kl": 0.2236328125, + "learning_rate": 4.547047485544068e-07, + "loss": 0.0018, + "reward": 1.4713342189788818, + "reward_std": 0.26562297344207764, + "rewards/accuracy_reward_stage2": 0.5182092785835266, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3113 + }, + { + "completion_length": 37.921875, + "epoch": 0.5456456982652882, + "grad_norm": 18.317318007212805, + "kl": 0.1513671875, + "learning_rate": 4.5452952514455925e-07, + "loss": 0.0266, + "reward": 1.3703083992004395, + "reward_std": 0.23182103037834167, + "rewards/accuracy_reward_stage2": 0.3859333097934723, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3114 + }, + { + "completion_length": 9.609375, + "epoch": 0.5458209216751359, + "grad_norm": 18.277950300554696, + "kl": 0.3203125, + "learning_rate": 4.5435430173471175e-07, + "loss": 0.0185, + "reward": 1.4545676708221436, + "reward_std": 0.3653530478477478, + "rewards/accuracy_reward_stage2": 0.7514426708221436, + "rewards/format_reward_stage1_pointerpad": 0.703125, + "scores/accuracy_reward_stage2": 0.703125, + "step": 3115 + }, + { + "completion_length": 10.34375, + "epoch": 0.5459961450849834, + "grad_norm": 19.3652231026983, + "kl": 0.232421875, + "learning_rate": 4.541790783248642e-07, + "loss": 0.0545, + "reward": 1.4572173357009888, + "reward_std": 0.3412941098213196, + "rewards/accuracy_reward_stage2": 0.48846733570098877, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3116 + }, + { + "completion_length": 12.28125, + "epoch": 0.5461713684948309, + "grad_norm": 18.07080948296114, + "kl": 0.07958984375, + "learning_rate": 4.540038549150166e-07, + "loss": 0.0318, + "reward": 1.0901241302490234, + "reward_std": 0.22192618250846863, + "rewards/accuracy_reward_stage2": 0.3401240110397339, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3117 + }, + { + "completion_length": 10.8125, + "epoch": 0.5463465919046785, + "grad_norm": 20.13719059887797, + "kl": 0.142578125, + "learning_rate": 4.5382863150516907e-07, + "loss": 0.0568, + "reward": 1.6008598804473877, + "reward_std": 0.19498933851718903, + "rewards/accuracy_reward_stage2": 0.7258598208427429, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3118 + }, + { + "completion_length": 8.140625, + "epoch": 0.546521815314526, + "grad_norm": 19.86921510263278, + "kl": 0.052734375, + "learning_rate": 4.536534080953215e-07, + "loss": 0.021, + "reward": 1.415239691734314, + "reward_std": 0.22165584564208984, + "rewards/accuracy_reward_stage2": 0.41523975133895874, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3119 + }, + { + "completion_length": 11.734375, + "epoch": 0.5466970387243736, + "grad_norm": 17.748671242725997, + "kl": 0.014404296875, + "learning_rate": 4.5347818468547394e-07, + "loss": 0.0058, + "reward": 1.7900738716125488, + "reward_std": 0.18254978954792023, + "rewards/accuracy_reward_stage2": 0.7900738716125488, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3120 + }, + { + "completion_length": 8.84375, + "epoch": 0.5468722621342211, + "grad_norm": 14.524084056879108, + "kl": 0.1982421875, + "learning_rate": 4.533029612756264e-07, + "loss": 0.0353, + "reward": 1.5100059509277344, + "reward_std": 0.16531233489513397, + "rewards/accuracy_reward_stage2": 0.6506309509277344, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3121 + }, + { + "completion_length": 11.734375, + "epoch": 0.5470474855440687, + "grad_norm": 22.884109798469716, + "kl": 0.119140625, + "learning_rate": 4.531277378657788e-07, + "loss": 0.0045, + "reward": 1.4116387367248535, + "reward_std": 0.35598820447921753, + "rewards/accuracy_reward_stage2": 0.5522637367248535, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3122 + }, + { + "completion_length": 10.03125, + "epoch": 0.5472227089539162, + "grad_norm": 20.57263042186901, + "kl": 0.1298828125, + "learning_rate": 4.529525144559313e-07, + "loss": 0.0078, + "reward": 1.525716781616211, + "reward_std": 0.2811649739742279, + "rewards/accuracy_reward_stage2": 0.5413416624069214, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3123 + }, + { + "completion_length": 14.171875, + "epoch": 0.5473979323637638, + "grad_norm": 18.40555272183468, + "kl": 0.1123046875, + "learning_rate": 4.5277729104608376e-07, + "loss": -0.0313, + "reward": 1.426608920097351, + "reward_std": 0.3710615336894989, + "rewards/accuracy_reward_stage2": 0.4578589200973511, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3124 + }, + { + "completion_length": 12.84375, + "epoch": 0.5475731557736113, + "grad_norm": 27.199703543281533, + "kl": 0.01708984375, + "learning_rate": 4.526020676362362e-07, + "loss": 0.0068, + "reward": 1.59226655960083, + "reward_std": 0.19375893473625183, + "rewards/accuracy_reward_stage2": 0.5922665596008301, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3125 + }, + { + "completion_length": 21.703125, + "epoch": 0.5477483791834589, + "grad_norm": 22.707419937102834, + "kl": 0.1513671875, + "learning_rate": 4.524268442263886e-07, + "loss": 0.0219, + "reward": 1.4392061233520508, + "reward_std": 0.1956329196691513, + "rewards/accuracy_reward_stage2": 0.5798312425613403, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3126 + }, + { + "completion_length": 17.0, + "epoch": 0.5479236025933064, + "grad_norm": 25.956664968558947, + "kl": 0.06884765625, + "learning_rate": 4.522516208165411e-07, + "loss": 0.0275, + "reward": 1.4441642761230469, + "reward_std": 0.3204311430454254, + "rewards/accuracy_reward_stage2": 0.4441642761230469, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3127 + }, + { + "completion_length": 10.015625, + "epoch": 0.5480988260031541, + "grad_norm": 18.86980089407399, + "kl": 0.130859375, + "learning_rate": 4.520763974066935e-07, + "loss": -0.0339, + "reward": 1.5849673748016357, + "reward_std": 0.21915815770626068, + "rewards/accuracy_reward_stage2": 0.6318423748016357, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3128 + }, + { + "completion_length": 17.4375, + "epoch": 0.5482740494130016, + "grad_norm": 17.85624212350077, + "kl": 0.0166015625, + "learning_rate": 4.5190117399684595e-07, + "loss": 0.0067, + "reward": 1.5974417924880981, + "reward_std": 0.13922935724258423, + "rewards/accuracy_reward_stage2": 0.7224418520927429, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3129 + }, + { + "completion_length": 12.390625, + "epoch": 0.5484492728228492, + "grad_norm": 16.83195774838754, + "kl": 0.010498046875, + "learning_rate": 4.517259505869984e-07, + "loss": 0.0042, + "reward": 1.7349507808685303, + "reward_std": 0.12773281335830688, + "rewards/accuracy_reward_stage2": 0.8599507212638855, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3130 + }, + { + "completion_length": 11.03125, + "epoch": 0.5486244962326967, + "grad_norm": 20.906730392221608, + "kl": 0.208984375, + "learning_rate": 4.515507271771509e-07, + "loss": 0.0837, + "reward": 1.2577917575836182, + "reward_std": 0.12434166669845581, + "rewards/accuracy_reward_stage2": 0.5077918767929077, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3131 + }, + { + "completion_length": 15.125, + "epoch": 0.5487997196425443, + "grad_norm": 18.725137651408236, + "kl": 0.1064453125, + "learning_rate": 4.5137550376730327e-07, + "loss": -0.041, + "reward": 1.314073085784912, + "reward_std": 0.1571890413761139, + "rewards/accuracy_reward_stage2": 0.3453230857849121, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3132 + }, + { + "completion_length": 7.640625, + "epoch": 0.5489749430523918, + "grad_norm": 20.149659224345918, + "kl": 0.203125, + "learning_rate": 4.512002803574557e-07, + "loss": -0.0268, + "reward": 1.5538485050201416, + "reward_std": 0.3465612530708313, + "rewards/accuracy_reward_stage2": 0.6007235646247864, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3133 + }, + { + "completion_length": 7.703125, + "epoch": 0.5491501664622394, + "grad_norm": 18.960739714891563, + "kl": 0.2041015625, + "learning_rate": 4.5102505694760815e-07, + "loss": -0.0285, + "reward": 1.3470832109451294, + "reward_std": 0.20021668076515198, + "rewards/accuracy_reward_stage2": 0.4095832407474518, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3134 + }, + { + "completion_length": 9.890625, + "epoch": 0.5493253898720869, + "grad_norm": 24.688239604005318, + "kl": 0.13671875, + "learning_rate": 4.5084983353776064e-07, + "loss": -0.0169, + "reward": 1.6175473928451538, + "reward_std": 0.3005647659301758, + "rewards/accuracy_reward_stage2": 0.6487972736358643, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3135 + }, + { + "completion_length": 10.71875, + "epoch": 0.5495006132819344, + "grad_norm": 13.618621416554019, + "kl": 0.059326171875, + "learning_rate": 4.506746101279131e-07, + "loss": 0.0237, + "reward": 1.6671037673950195, + "reward_std": 0.06964041292667389, + "rewards/accuracy_reward_stage2": 0.7921037077903748, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3136 + }, + { + "completion_length": 9.421875, + "epoch": 0.549675836691782, + "grad_norm": 14.399033573800883, + "kl": 0.1015625, + "learning_rate": 4.504993867180655e-07, + "loss": 0.0117, + "reward": 1.5416667461395264, + "reward_std": 0.19621436297893524, + "rewards/accuracy_reward_stage2": 0.5729166865348816, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3137 + }, + { + "completion_length": 11.546875, + "epoch": 0.5498510601016295, + "grad_norm": 11.203890840175257, + "kl": 0.1083984375, + "learning_rate": 4.503241633082179e-07, + "loss": -0.0349, + "reward": 1.8177083730697632, + "reward_std": 0.12734557688236237, + "rewards/accuracy_reward_stage2": 0.8489583730697632, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3138 + }, + { + "completion_length": 9.046875, + "epoch": 0.5500262835114771, + "grad_norm": 15.282853134979353, + "kl": 0.0595703125, + "learning_rate": 4.501489398983704e-07, + "loss": -0.0204, + "reward": 1.546875, + "reward_std": 0.1530819833278656, + "rewards/accuracy_reward_stage2": 0.5625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3139 + }, + { + "completion_length": 11.640625, + "epoch": 0.5502015069213246, + "grad_norm": 22.126106131184848, + "kl": 0.2333984375, + "learning_rate": 4.4997371648852284e-07, + "loss": -0.0416, + "reward": 1.7610794305801392, + "reward_std": 0.23321378231048584, + "rewards/accuracy_reward_stage2": 0.8235794305801392, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3140 + }, + { + "completion_length": 21.234375, + "epoch": 0.5503767303311723, + "grad_norm": 16.174597906243484, + "kl": 0.2275390625, + "learning_rate": 4.497984930786753e-07, + "loss": 0.0187, + "reward": 1.4087018966674805, + "reward_std": 0.23245249688625336, + "rewards/accuracy_reward_stage2": 0.5649518966674805, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3141 + }, + { + "completion_length": 10.265625, + "epoch": 0.5505519537410198, + "grad_norm": 15.144513817072603, + "kl": 0.09765625, + "learning_rate": 4.496232696688277e-07, + "loss": -0.0051, + "reward": 1.6008020639419556, + "reward_std": 0.15015725791454315, + "rewards/accuracy_reward_stage2": 0.6164271235466003, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3142 + }, + { + "completion_length": 13.28125, + "epoch": 0.5507271771508674, + "grad_norm": 15.840311728508933, + "kl": 0.1806640625, + "learning_rate": 4.494480462589802e-07, + "loss": 0.0153, + "reward": 1.4902280569076538, + "reward_std": 0.2487790584564209, + "rewards/accuracy_reward_stage2": 0.5214781165122986, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3143 + }, + { + "completion_length": 10.75, + "epoch": 0.5509024005607149, + "grad_norm": 14.29696411929444, + "kl": 0.107421875, + "learning_rate": 4.4927282284913265e-07, + "loss": 0.0152, + "reward": 1.8940612077713013, + "reward_std": 0.11344745010137558, + "rewards/accuracy_reward_stage2": 0.9096862077713013, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3144 + }, + { + "completion_length": 9.71875, + "epoch": 0.5510776239705625, + "grad_norm": 18.577526022076846, + "kl": 0.11474609375, + "learning_rate": 4.4909759943928504e-07, + "loss": -0.0343, + "reward": 1.6097900867462158, + "reward_std": 0.3654620349407196, + "rewards/accuracy_reward_stage2": 0.7504150867462158, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3145 + }, + { + "completion_length": 9.15625, + "epoch": 0.55125284738041, + "grad_norm": 22.368273469070417, + "kl": 0.447265625, + "learning_rate": 4.489223760294375e-07, + "loss": 0.0436, + "reward": 1.6789296865463257, + "reward_std": 0.3216952681541443, + "rewards/accuracy_reward_stage2": 0.7414296865463257, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3146 + }, + { + "completion_length": 9.453125, + "epoch": 0.5514280707902576, + "grad_norm": 19.66384748677594, + "kl": 0.1240234375, + "learning_rate": 4.4874715261958997e-07, + "loss": 0.0104, + "reward": 1.4158316850662231, + "reward_std": 0.335426926612854, + "rewards/accuracy_reward_stage2": 0.43145668506622314, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3147 + }, + { + "completion_length": 8.03125, + "epoch": 0.5516032942001051, + "grad_norm": 16.66474206130544, + "kl": 0.212890625, + "learning_rate": 4.485719292097424e-07, + "loss": 0.0074, + "reward": 1.716360330581665, + "reward_std": 0.2283344864845276, + "rewards/accuracy_reward_stage2": 0.747610330581665, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3148 + }, + { + "completion_length": 11.125, + "epoch": 0.5517785176099527, + "grad_norm": 13.826401124368541, + "kl": 0.040771484375, + "learning_rate": 4.4839670579989485e-07, + "loss": 0.0163, + "reward": 1.4942870140075684, + "reward_std": 0.1060253232717514, + "rewards/accuracy_reward_stage2": 0.49428704380989075, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3149 + }, + { + "completion_length": 10.125, + "epoch": 0.5519537410198002, + "grad_norm": 16.087893374405123, + "kl": 0.08642578125, + "learning_rate": 4.482214823900473e-07, + "loss": -0.0095, + "reward": 1.328662395477295, + "reward_std": 0.20100915431976318, + "rewards/accuracy_reward_stage2": 0.3442873954772949, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3150 + }, + { + "completion_length": 9.828125, + "epoch": 0.5521289644296478, + "grad_norm": 15.359195592826731, + "kl": 0.06787109375, + "learning_rate": 4.4804625898019973e-07, + "loss": -0.0172, + "reward": 1.4719253778457642, + "reward_std": 0.18595364689826965, + "rewards/accuracy_reward_stage2": 0.4875503182411194, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3151 + }, + { + "completion_length": 14.3125, + "epoch": 0.5523041878394953, + "grad_norm": 16.956441792127453, + "kl": 0.1767578125, + "learning_rate": 4.4787103557035217e-07, + "loss": 0.0493, + "reward": 1.5719506740570068, + "reward_std": 0.21863603591918945, + "rewards/accuracy_reward_stage2": 0.7125757336616516, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3152 + }, + { + "completion_length": 7.84375, + "epoch": 0.552479411249343, + "grad_norm": 14.321242912915837, + "kl": 0.050537109375, + "learning_rate": 4.476958121605046e-07, + "loss": -0.0075, + "reward": 1.6056922674179077, + "reward_std": 0.23552852869033813, + "rewards/accuracy_reward_stage2": 0.6213172674179077, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3153 + }, + { + "completion_length": 11.203125, + "epoch": 0.5526546346591905, + "grad_norm": 19.342404513274424, + "kl": 0.23828125, + "learning_rate": 4.4752058875065705e-07, + "loss": 0.0174, + "reward": 1.4421981573104858, + "reward_std": 0.27634021639823914, + "rewards/accuracy_reward_stage2": 0.6140731573104858, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3154 + }, + { + "completion_length": 10.0, + "epoch": 0.5528298580690381, + "grad_norm": 18.155035009617933, + "kl": 0.1611328125, + "learning_rate": 4.4734536534080954e-07, + "loss": 0.0355, + "reward": 1.4873788356781006, + "reward_std": 0.23260822892189026, + "rewards/accuracy_reward_stage2": 0.503003716468811, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3155 + }, + { + "completion_length": 15.671875, + "epoch": 0.5530050814788856, + "grad_norm": 18.013443759182252, + "kl": 0.1748046875, + "learning_rate": 4.47170141930962e-07, + "loss": 0.0035, + "reward": 1.5860655307769775, + "reward_std": 0.1955593377351761, + "rewards/accuracy_reward_stage2": 0.6173155307769775, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3156 + }, + { + "completion_length": 19.03125, + "epoch": 0.5531803048887332, + "grad_norm": 15.498322945185874, + "kl": 0.1162109375, + "learning_rate": 4.469949185211144e-07, + "loss": 0.0131, + "reward": 1.1478610038757324, + "reward_std": 0.16122110188007355, + "rewards/accuracy_reward_stage2": 0.28848594427108765, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3157 + }, + { + "completion_length": 14.234375, + "epoch": 0.5533555282985807, + "grad_norm": 17.205042099112543, + "kl": 0.1982421875, + "learning_rate": 4.468196951112668e-07, + "loss": 0.0086, + "reward": 1.4277459383010864, + "reward_std": 0.3595418632030487, + "rewards/accuracy_reward_stage2": 0.4589959383010864, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3158 + }, + { + "completion_length": 8.90625, + "epoch": 0.5535307517084282, + "grad_norm": 11.244925743336225, + "kl": 0.052490234375, + "learning_rate": 4.466444717014193e-07, + "loss": -0.0232, + "reward": 1.59375, + "reward_std": 0.2177756428718567, + "rewards/accuracy_reward_stage2": 0.734375, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3159 + }, + { + "completion_length": 17.546875, + "epoch": 0.5537059751182758, + "grad_norm": 21.06181035329859, + "kl": 0.171875, + "learning_rate": 4.4646924829157174e-07, + "loss": 0.0687, + "reward": 1.4755263328552246, + "reward_std": 0.19179841876029968, + "rewards/accuracy_reward_stage2": 0.6005264520645142, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3160 + }, + { + "completion_length": 7.96875, + "epoch": 0.5538811985281233, + "grad_norm": 19.710182016252123, + "kl": 0.130859375, + "learning_rate": 4.462940248817242e-07, + "loss": 0.0205, + "reward": 1.6828030347824097, + "reward_std": 0.30897510051727295, + "rewards/accuracy_reward_stage2": 0.6984280347824097, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3161 + }, + { + "completion_length": 11.65625, + "epoch": 0.5540564219379709, + "grad_norm": 27.181609967617536, + "kl": 0.31640625, + "learning_rate": 4.461188014718766e-07, + "loss": -0.0273, + "reward": 1.2866575717926025, + "reward_std": 0.32743221521377563, + "rewards/accuracy_reward_stage2": 0.3491576611995697, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3162 + }, + { + "completion_length": 9.0625, + "epoch": 0.5542316453478184, + "grad_norm": 17.65054472095913, + "kl": 0.1064453125, + "learning_rate": 4.459435780620291e-07, + "loss": 0.0212, + "reward": 1.4841200113296509, + "reward_std": 0.24171821773052216, + "rewards/accuracy_reward_stage2": 0.4997449815273285, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3163 + }, + { + "completion_length": 8.140625, + "epoch": 0.554406868757666, + "grad_norm": 22.888330804031177, + "kl": 0.1982421875, + "learning_rate": 4.457683546521815e-07, + "loss": -0.0206, + "reward": 1.6924538612365723, + "reward_std": 0.3295304477214813, + "rewards/accuracy_reward_stage2": 0.739328920841217, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3164 + }, + { + "completion_length": 10.546875, + "epoch": 0.5545820921675135, + "grad_norm": 20.007332005698075, + "kl": 0.18359375, + "learning_rate": 4.4559313124233394e-07, + "loss": -0.0148, + "reward": 1.4425759315490723, + "reward_std": 0.29871058464050293, + "rewards/accuracy_reward_stage2": 0.4738258719444275, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3165 + }, + { + "completion_length": 7.640625, + "epoch": 0.5547573155773612, + "grad_norm": 17.282894759042968, + "kl": 0.0322265625, + "learning_rate": 4.454179078324864e-07, + "loss": 0.0129, + "reward": 1.3645219802856445, + "reward_std": 0.20641304552555084, + "rewards/accuracy_reward_stage2": 0.36452198028564453, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3166 + }, + { + "completion_length": 9.109375, + "epoch": 0.5549325389872087, + "grad_norm": 24.610090218699664, + "kl": 0.06982421875, + "learning_rate": 4.4524268442263887e-07, + "loss": 0.028, + "reward": 1.5005807876586914, + "reward_std": 0.32463937997817993, + "rewards/accuracy_reward_stage2": 0.6255807876586914, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3167 + }, + { + "completion_length": 7.171875, + "epoch": 0.5551077623970563, + "grad_norm": 21.88279887889821, + "kl": 0.078125, + "learning_rate": 4.450674610127913e-07, + "loss": 0.0312, + "reward": 1.7056117057800293, + "reward_std": 0.3208841383457184, + "rewards/accuracy_reward_stage2": 0.7056115865707397, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3168 + }, + { + "completion_length": 13.328125, + "epoch": 0.5552829858069038, + "grad_norm": 16.74587152982527, + "kl": 0.11669921875, + "learning_rate": 4.4489223760294375e-07, + "loss": 0.0467, + "reward": 1.2985787391662598, + "reward_std": 0.12602615356445312, + "rewards/accuracy_reward_stage2": 0.5485787391662598, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3169 + }, + { + "completion_length": 13.84375, + "epoch": 0.5554582092167514, + "grad_norm": 21.40490424954572, + "kl": 0.031982421875, + "learning_rate": 4.4471701419309614e-07, + "loss": 0.0128, + "reward": 1.5608049631118774, + "reward_std": 0.15516994893550873, + "rewards/accuracy_reward_stage2": 0.5608049631118774, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3170 + }, + { + "completion_length": 13.21875, + "epoch": 0.5556334326265989, + "grad_norm": 19.239122601412628, + "kl": 0.10791015625, + "learning_rate": 4.4454179078324863e-07, + "loss": 0.004, + "reward": 1.6224453449249268, + "reward_std": 0.2906746566295624, + "rewards/accuracy_reward_stage2": 0.7630704641342163, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3171 + }, + { + "completion_length": 13.09375, + "epoch": 0.5558086560364465, + "grad_norm": 19.094748291275916, + "kl": 0.06396484375, + "learning_rate": 4.4436656737340107e-07, + "loss": 0.0255, + "reward": 1.7014517784118652, + "reward_std": 0.25967666506767273, + "rewards/accuracy_reward_stage2": 0.70145183801651, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3172 + }, + { + "completion_length": 8.625, + "epoch": 0.555983879446294, + "grad_norm": 19.966829369849936, + "kl": 0.275390625, + "learning_rate": 4.441913439635535e-07, + "loss": 0.0069, + "reward": 1.5300395488739014, + "reward_std": 0.35093286633491516, + "rewards/accuracy_reward_stage2": 0.5925396680831909, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3173 + }, + { + "completion_length": 10.859375, + "epoch": 0.5561591028561416, + "grad_norm": 22.17268398172219, + "kl": 0.26953125, + "learning_rate": 4.4401612055370595e-07, + "loss": -0.0581, + "reward": 1.4169352054595947, + "reward_std": 0.2545914649963379, + "rewards/accuracy_reward_stage2": 0.4950602054595947, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3174 + }, + { + "completion_length": 8.578125, + "epoch": 0.5563343262659891, + "grad_norm": 24.07055012064237, + "kl": 0.2578125, + "learning_rate": 4.4384089714385844e-07, + "loss": -0.0169, + "reward": 1.4825234413146973, + "reward_std": 0.3476669192314148, + "rewards/accuracy_reward_stage2": 0.5450234413146973, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3175 + }, + { + "completion_length": 7.109375, + "epoch": 0.5565095496758367, + "grad_norm": 13.07784200629125, + "kl": 0.265625, + "learning_rate": 4.436656737340109e-07, + "loss": -0.0712, + "reward": 1.646902084350586, + "reward_std": 0.23995369672775269, + "rewards/accuracy_reward_stage2": 0.7250271439552307, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3176 + }, + { + "completion_length": 8.4375, + "epoch": 0.5566847730856842, + "grad_norm": 19.977964033841957, + "kl": 0.08447265625, + "learning_rate": 4.4349045032416327e-07, + "loss": -0.0102, + "reward": 1.647351861000061, + "reward_std": 0.29686886072158813, + "rewards/accuracy_reward_stage2": 0.662976861000061, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3177 + }, + { + "completion_length": 10.765625, + "epoch": 0.5568599964955318, + "grad_norm": 21.420053415746505, + "kl": 0.12451171875, + "learning_rate": 4.433152269143157e-07, + "loss": -0.0353, + "reward": 1.5312858819961548, + "reward_std": 0.3580089509487152, + "rewards/accuracy_reward_stage2": 0.5625358819961548, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3178 + }, + { + "completion_length": 10.421875, + "epoch": 0.5570352199053794, + "grad_norm": 17.138495113898916, + "kl": 0.1025390625, + "learning_rate": 4.4314000350446815e-07, + "loss": -0.0031, + "reward": 1.59375, + "reward_std": 0.2845909595489502, + "rewards/accuracy_reward_stage2": 0.609375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3179 + }, + { + "completion_length": 11.625, + "epoch": 0.557210443315227, + "grad_norm": 11.558748314691904, + "kl": 0.0927734375, + "learning_rate": 4.4296478009462064e-07, + "loss": -0.049, + "reward": 1.595902442932129, + "reward_std": 0.1572880744934082, + "rewards/accuracy_reward_stage2": 0.6271524429321289, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3180 + }, + { + "completion_length": 8.390625, + "epoch": 0.5573856667250745, + "grad_norm": 11.752197546197085, + "kl": 0.06005859375, + "learning_rate": 4.427895566847731e-07, + "loss": -0.0202, + "reward": 1.3125, + "reward_std": 0.16675157845020294, + "rewards/accuracy_reward_stage2": 0.328125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3181 + }, + { + "completion_length": 12.796875, + "epoch": 0.557560890134922, + "grad_norm": 15.82381166113847, + "kl": 0.201171875, + "learning_rate": 4.426143332749255e-07, + "loss": -0.0258, + "reward": 1.6532623767852783, + "reward_std": 0.2777688503265381, + "rewards/accuracy_reward_stage2": 0.7001373767852783, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3182 + }, + { + "completion_length": 8.21875, + "epoch": 0.5577361135447696, + "grad_norm": 20.513792709599088, + "kl": 0.2099609375, + "learning_rate": 4.424391098650779e-07, + "loss": -0.0252, + "reward": 1.5753761529922485, + "reward_std": 0.3551340401172638, + "rewards/accuracy_reward_stage2": 0.6222511529922485, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3183 + }, + { + "completion_length": 14.953125, + "epoch": 0.5579113369546171, + "grad_norm": 22.883421913878937, + "kl": 0.1328125, + "learning_rate": 4.422638864552304e-07, + "loss": -0.0242, + "reward": 1.434954047203064, + "reward_std": 0.3454228639602661, + "rewards/accuracy_reward_stage2": 0.46620407700538635, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3184 + }, + { + "completion_length": 6.90625, + "epoch": 0.5580865603644647, + "grad_norm": 19.627035429188577, + "kl": 0.048828125, + "learning_rate": 4.4208866304538284e-07, + "loss": -0.0095, + "reward": 1.5062909126281738, + "reward_std": 0.2934325933456421, + "rewards/accuracy_reward_stage2": 0.521915853023529, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3185 + }, + { + "completion_length": 11.3125, + "epoch": 0.5582617837743122, + "grad_norm": 19.709667689059085, + "kl": 0.23046875, + "learning_rate": 4.419134396355353e-07, + "loss": 0.0353, + "reward": 1.686478853225708, + "reward_std": 0.2373899221420288, + "rewards/accuracy_reward_stage2": 0.8427289128303528, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3186 + }, + { + "completion_length": 13.734375, + "epoch": 0.5584370071841598, + "grad_norm": 22.747635084082575, + "kl": 0.1591796875, + "learning_rate": 4.417382162256877e-07, + "loss": 0.0637, + "reward": 1.4030470848083496, + "reward_std": 0.24681012332439423, + "rewards/accuracy_reward_stage2": 0.5280469655990601, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3187 + }, + { + "completion_length": 10.8125, + "epoch": 0.5586122305940073, + "grad_norm": 16.6917204221685, + "kl": 0.130859375, + "learning_rate": 4.415629928158402e-07, + "loss": 0.0081, + "reward": 1.6642496585845947, + "reward_std": 0.23828163743019104, + "rewards/accuracy_reward_stage2": 0.6798745393753052, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3188 + }, + { + "completion_length": 9.03125, + "epoch": 0.5587874540038549, + "grad_norm": 16.825558367044195, + "kl": 0.1015625, + "learning_rate": 4.413877694059926e-07, + "loss": 0.0018, + "reward": 1.5643525123596191, + "reward_std": 0.2210099995136261, + "rewards/accuracy_reward_stage2": 0.7049775123596191, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3189 + }, + { + "completion_length": 9.703125, + "epoch": 0.5589626774137024, + "grad_norm": 20.641601437340125, + "kl": 0.051513671875, + "learning_rate": 4.4121254599614504e-07, + "loss": 0.0206, + "reward": 1.7616381645202637, + "reward_std": 0.19968253374099731, + "rewards/accuracy_reward_stage2": 0.7616380453109741, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3190 + }, + { + "completion_length": 10.546875, + "epoch": 0.55913790082355, + "grad_norm": 20.21219529381267, + "kl": 0.08349609375, + "learning_rate": 4.410373225862975e-07, + "loss": 0.0332, + "reward": 1.7175178527832031, + "reward_std": 0.15806759893894196, + "rewards/accuracy_reward_stage2": 0.8425179123878479, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3191 + }, + { + "completion_length": 19.6875, + "epoch": 0.5593131242333976, + "grad_norm": 18.111853588114563, + "kl": 0.1240234375, + "learning_rate": 4.4086209917644997e-07, + "loss": -0.0375, + "reward": 1.7544395923614502, + "reward_std": 0.24202269315719604, + "rewards/accuracy_reward_stage2": 0.7856895923614502, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3192 + }, + { + "completion_length": 9.84375, + "epoch": 0.5594883476432452, + "grad_norm": 20.112560254113447, + "kl": 0.1767578125, + "learning_rate": 4.406868757666024e-07, + "loss": -0.0124, + "reward": 1.3385369777679443, + "reward_std": 0.36554330587387085, + "rewards/accuracy_reward_stage2": 0.49478694796562195, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3193 + }, + { + "completion_length": 6.84375, + "epoch": 0.5596635710530927, + "grad_norm": 19.564347720470963, + "kl": 0.283203125, + "learning_rate": 4.4051165235675485e-07, + "loss": -0.0118, + "reward": 1.7647864818572998, + "reward_std": 0.2844822108745575, + "rewards/accuracy_reward_stage2": 0.8272864818572998, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3194 + }, + { + "completion_length": 13.40625, + "epoch": 0.5598387944629403, + "grad_norm": 17.476056355454983, + "kl": 0.0703125, + "learning_rate": 4.403364289469073e-07, + "loss": 0.0281, + "reward": 1.5582443475723267, + "reward_std": 0.26723265647888184, + "rewards/accuracy_reward_stage2": 0.6832443475723267, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3195 + }, + { + "completion_length": 9.625, + "epoch": 0.5600140178727878, + "grad_norm": 20.52858424291766, + "kl": 0.26171875, + "learning_rate": 4.4016120553705973e-07, + "loss": 0.0215, + "reward": 1.647862195968628, + "reward_std": 0.20478901267051697, + "rewards/accuracy_reward_stage2": 0.6947371363639832, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3196 + }, + { + "completion_length": 9.984375, + "epoch": 0.5601892412826354, + "grad_norm": 11.87473457027194, + "kl": 0.049072265625, + "learning_rate": 4.3998598212721217e-07, + "loss": 0.0196, + "reward": 1.3483126163482666, + "reward_std": 0.12753789126873016, + "rewards/accuracy_reward_stage2": 0.4733126163482666, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3197 + }, + { + "completion_length": 11.765625, + "epoch": 0.5603644646924829, + "grad_norm": 16.538140753582518, + "kl": 0.2470703125, + "learning_rate": 4.398107587173646e-07, + "loss": 0.0256, + "reward": 1.498239517211914, + "reward_std": 0.23242546617984772, + "rewards/accuracy_reward_stage2": 0.7638646364212036, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3198 + }, + { + "completion_length": 11.109375, + "epoch": 0.5605396881023305, + "grad_norm": 23.394955904871278, + "kl": 0.2890625, + "learning_rate": 4.3963553530751705e-07, + "loss": 0.0801, + "reward": 1.5308568477630615, + "reward_std": 0.2671172618865967, + "rewards/accuracy_reward_stage2": 0.671481728553772, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3199 + }, + { + "completion_length": 9.875, + "epoch": 0.560714911512178, + "grad_norm": 15.441451965197013, + "kl": 0.109375, + "learning_rate": 4.3946031189766954e-07, + "loss": 0.0436, + "reward": 1.3800475597381592, + "reward_std": 0.08413176238536835, + "rewards/accuracy_reward_stage2": 0.5050475597381592, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3200 + }, + { + "completion_length": 6.890625, + "epoch": 0.5608901349220256, + "grad_norm": 21.575708325957677, + "kl": 0.09228515625, + "learning_rate": 4.39285088487822e-07, + "loss": 0.0369, + "reward": 1.7437188625335693, + "reward_std": 0.21275295317173004, + "rewards/accuracy_reward_stage2": 0.7437188625335693, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3201 + }, + { + "completion_length": 11.671875, + "epoch": 0.5610653583318731, + "grad_norm": 162.16611459049957, + "kl": 1.1640625, + "learning_rate": 4.3910986507797436e-07, + "loss": 0.3752, + "reward": 1.500571846961975, + "reward_std": 0.17933861911296844, + "rewards/accuracy_reward_stage2": 0.6568217873573303, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3202 + }, + { + "completion_length": 7.109375, + "epoch": 0.5612405817417206, + "grad_norm": 12.19803769706041, + "kl": 0.0146484375, + "learning_rate": 4.389346416681268e-07, + "loss": 0.0059, + "reward": 1.796875, + "reward_std": 0.189372718334198, + "rewards/accuracy_reward_stage2": 0.796875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3203 + }, + { + "completion_length": 14.3125, + "epoch": 0.5614158051515683, + "grad_norm": 18.535694073248635, + "kl": 0.034423828125, + "learning_rate": 4.387594182582793e-07, + "loss": 0.0138, + "reward": 1.563488245010376, + "reward_std": 0.1409141570329666, + "rewards/accuracy_reward_stage2": 0.563488245010376, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3204 + }, + { + "completion_length": 11.453125, + "epoch": 0.5615910285614159, + "grad_norm": 17.69222335890946, + "kl": 0.28125, + "learning_rate": 4.3858419484843174e-07, + "loss": -0.0359, + "reward": 1.5310032367706299, + "reward_std": 0.16320835053920746, + "rewards/accuracy_reward_stage2": 0.5935031175613403, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3205 + }, + { + "completion_length": 13.0625, + "epoch": 0.5617662519712634, + "grad_norm": 14.769363797772218, + "kl": 0.1240234375, + "learning_rate": 4.384089714385842e-07, + "loss": 0.0082, + "reward": 1.7722058296203613, + "reward_std": 0.1137927919626236, + "rewards/accuracy_reward_stage2": 0.7878307700157166, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3206 + }, + { + "completion_length": 17.765625, + "epoch": 0.5619414753811109, + "grad_norm": 18.148990937885742, + "kl": 0.078125, + "learning_rate": 4.382337480287366e-07, + "loss": 0.0312, + "reward": 1.401296615600586, + "reward_std": 0.15238088369369507, + "rewards/accuracy_reward_stage2": 0.5262964963912964, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3207 + }, + { + "completion_length": 11.578125, + "epoch": 0.5621166987909585, + "grad_norm": 18.39241430040369, + "kl": 0.19140625, + "learning_rate": 4.380585246188891e-07, + "loss": -0.0008, + "reward": 1.5690919160842896, + "reward_std": 0.36027538776397705, + "rewards/accuracy_reward_stage2": 0.6003419160842896, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3208 + }, + { + "completion_length": 10.71875, + "epoch": 0.562291922200806, + "grad_norm": 19.185613600966136, + "kl": 0.107421875, + "learning_rate": 4.378833012090415e-07, + "loss": 0.022, + "reward": 1.5785123109817505, + "reward_std": 0.32527726888656616, + "rewards/accuracy_reward_stage2": 0.5941373109817505, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3209 + }, + { + "completion_length": 12.421875, + "epoch": 0.5624671456106536, + "grad_norm": 18.097225530702673, + "kl": 0.11328125, + "learning_rate": 4.3770807779919393e-07, + "loss": -0.0333, + "reward": 1.4982638359069824, + "reward_std": 0.25345471501350403, + "rewards/accuracy_reward_stage2": 0.5295138955116272, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3210 + }, + { + "completion_length": 12.3125, + "epoch": 0.5626423690205011, + "grad_norm": 17.231740141514813, + "kl": 0.0673828125, + "learning_rate": 4.375328543893464e-07, + "loss": 0.0269, + "reward": 1.429835319519043, + "reward_std": 0.1284599006175995, + "rewards/accuracy_reward_stage2": 0.5548353791236877, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3211 + }, + { + "completion_length": 19.421875, + "epoch": 0.5628175924303487, + "grad_norm": 20.159795366197336, + "kl": 0.01055908203125, + "learning_rate": 4.3735763097949887e-07, + "loss": 0.0042, + "reward": 1.4341033697128296, + "reward_std": 0.15376178920269012, + "rewards/accuracy_reward_stage2": 0.4341033101081848, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3212 + }, + { + "completion_length": 9.421875, + "epoch": 0.5629928158401962, + "grad_norm": 21.621095116440348, + "kl": 0.07080078125, + "learning_rate": 4.371824075696513e-07, + "loss": -0.016, + "reward": 1.5738193988800049, + "reward_std": 0.2998353838920593, + "rewards/accuracy_reward_stage2": 0.5894443392753601, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3213 + }, + { + "completion_length": 8.5625, + "epoch": 0.5631680392500438, + "grad_norm": 19.44838191969088, + "kl": 0.275390625, + "learning_rate": 4.3700718415980375e-07, + "loss": -0.0095, + "reward": 1.4690642356872559, + "reward_std": 0.39073270559310913, + "rewards/accuracy_reward_stage2": 0.5159392952919006, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3214 + }, + { + "completion_length": 8.828125, + "epoch": 0.5633432626598913, + "grad_norm": 16.7926305447221, + "kl": 0.022216796875, + "learning_rate": 4.3683196074995613e-07, + "loss": 0.0089, + "reward": 1.6463541984558105, + "reward_std": 0.11637798696756363, + "rewards/accuracy_reward_stage2": 0.6463541984558105, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3215 + }, + { + "completion_length": 7.921875, + "epoch": 0.5635184860697389, + "grad_norm": 16.575150696218373, + "kl": 0.1103515625, + "learning_rate": 4.366567373401086e-07, + "loss": 0.0072, + "reward": 1.7634837627410889, + "reward_std": 0.24160407483577728, + "rewards/accuracy_reward_stage2": 0.7791087627410889, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3216 + }, + { + "completion_length": 12.75, + "epoch": 0.5636937094795865, + "grad_norm": 17.25097670858031, + "kl": 0.08837890625, + "learning_rate": 4.3648151393026107e-07, + "loss": 0.0354, + "reward": 1.5628836154937744, + "reward_std": 0.27376827597618103, + "rewards/accuracy_reward_stage2": 0.5628836154937744, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3217 + }, + { + "completion_length": 10.8125, + "epoch": 0.5638689328894341, + "grad_norm": 16.21171001000319, + "kl": 0.16015625, + "learning_rate": 4.363062905204135e-07, + "loss": -0.0196, + "reward": 1.4027849435806274, + "reward_std": 0.2692621946334839, + "rewards/accuracy_reward_stage2": 0.4496598541736603, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3218 + }, + { + "completion_length": 9.828125, + "epoch": 0.5640441562992816, + "grad_norm": 14.414151768836764, + "kl": 0.025146484375, + "learning_rate": 4.3613106711056594e-07, + "loss": 0.0101, + "reward": 1.7850942611694336, + "reward_std": 0.09801465272903442, + "rewards/accuracy_reward_stage2": 0.7850942611694336, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3219 + }, + { + "completion_length": 13.453125, + "epoch": 0.5642193797091292, + "grad_norm": 24.6320558666092, + "kl": 0.25, + "learning_rate": 4.3595584370071844e-07, + "loss": 0.0128, + "reward": 1.4080870151519775, + "reward_std": 0.2708454430103302, + "rewards/accuracy_reward_stage2": 0.43933701515197754, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3220 + }, + { + "completion_length": 8.171875, + "epoch": 0.5643946031189767, + "grad_norm": 28.597504490295133, + "kl": 0.173828125, + "learning_rate": 4.357806202908708e-07, + "loss": 0.0429, + "reward": 1.449331521987915, + "reward_std": 0.32936540246009827, + "rewards/accuracy_reward_stage2": 0.46495649218559265, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3221 + }, + { + "completion_length": 8.234375, + "epoch": 0.5645698265288243, + "grad_norm": 16.205125598210785, + "kl": 0.10595703125, + "learning_rate": 4.3560539688102326e-07, + "loss": -0.0019, + "reward": 1.4258536100387573, + "reward_std": 0.1293002814054489, + "rewards/accuracy_reward_stage2": 0.5664785504341125, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3222 + }, + { + "completion_length": 11.328125, + "epoch": 0.5647450499386718, + "grad_norm": 10.876812503612529, + "kl": 0.07421875, + "learning_rate": 4.354301734711757e-07, + "loss": 0.0297, + "reward": 1.643942952156067, + "reward_std": 0.07415582239627838, + "rewards/accuracy_reward_stage2": 0.7689428925514221, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3223 + }, + { + "completion_length": 9.828125, + "epoch": 0.5649202733485194, + "grad_norm": 27.2544035722533, + "kl": 0.0458984375, + "learning_rate": 4.352549500613282e-07, + "loss": 0.0183, + "reward": 1.785082459449768, + "reward_std": 0.24246467649936676, + "rewards/accuracy_reward_stage2": 0.7850823402404785, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3224 + }, + { + "completion_length": 7.09375, + "epoch": 0.5650954967583669, + "grad_norm": 14.083083155362187, + "kl": 0.07861328125, + "learning_rate": 4.3507972665148064e-07, + "loss": 0.0314, + "reward": 1.7869019508361816, + "reward_std": 0.07399497926235199, + "rewards/accuracy_reward_stage2": 0.7869018316268921, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3225 + }, + { + "completion_length": 10.9375, + "epoch": 0.5652707201682144, + "grad_norm": 15.09002638533396, + "kl": 0.07470703125, + "learning_rate": 4.349045032416331e-07, + "loss": -0.0014, + "reward": 1.5789334774017334, + "reward_std": 0.22475594282150269, + "rewards/accuracy_reward_stage2": 0.5945584774017334, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3226 + }, + { + "completion_length": 9.21875, + "epoch": 0.565445943578062, + "grad_norm": 22.039352896693494, + "kl": 0.205078125, + "learning_rate": 4.347292798317855e-07, + "loss": 0.0593, + "reward": 1.3339595794677734, + "reward_std": 0.2276839017868042, + "rewards/accuracy_reward_stage2": 0.6152095198631287, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3227 + }, + { + "completion_length": 7.640625, + "epoch": 0.5656211669879095, + "grad_norm": 19.61843143593303, + "kl": 0.04150390625, + "learning_rate": 4.3455405642193795e-07, + "loss": 0.0166, + "reward": 1.7379176616668701, + "reward_std": 0.18134930729866028, + "rewards/accuracy_reward_stage2": 0.7379177808761597, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3228 + }, + { + "completion_length": 10.953125, + "epoch": 0.5657963903977571, + "grad_norm": 21.3555429487247, + "kl": 0.2578125, + "learning_rate": 4.343788330120904e-07, + "loss": 0.0654, + "reward": 1.4483630657196045, + "reward_std": 0.38449281454086304, + "rewards/accuracy_reward_stage2": 0.5889881253242493, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3229 + }, + { + "completion_length": 14.984375, + "epoch": 0.5659716138076047, + "grad_norm": 16.71334086382943, + "kl": 0.06396484375, + "learning_rate": 4.3420360960224283e-07, + "loss": 0.0256, + "reward": 1.512737512588501, + "reward_std": 0.15234015882015228, + "rewards/accuracy_reward_stage2": 0.512737512588501, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3230 + }, + { + "completion_length": 8.125, + "epoch": 0.5661468372174523, + "grad_norm": 22.757742414149945, + "kl": 0.162109375, + "learning_rate": 4.3402838619239527e-07, + "loss": 0.0647, + "reward": 1.5299479961395264, + "reward_std": 0.23831304907798767, + "rewards/accuracy_reward_stage2": 0.5299479365348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3231 + }, + { + "completion_length": 12.828125, + "epoch": 0.5663220606272998, + "grad_norm": 19.429740807099193, + "kl": 0.08251953125, + "learning_rate": 4.3385316278254777e-07, + "loss": -0.0057, + "reward": 1.5748710632324219, + "reward_std": 0.21447551250457764, + "rewards/accuracy_reward_stage2": 0.5904961824417114, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3232 + }, + { + "completion_length": 9.625, + "epoch": 0.5664972840371474, + "grad_norm": 17.21733104459262, + "kl": 0.09130859375, + "learning_rate": 4.336779393727002e-07, + "loss": 0.0363, + "reward": 1.4882630109786987, + "reward_std": 0.2861184775829315, + "rewards/accuracy_reward_stage2": 0.6132630109786987, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3233 + }, + { + "completion_length": 8.5625, + "epoch": 0.5666725074469949, + "grad_norm": 17.967560787175596, + "kl": 0.197265625, + "learning_rate": 4.335027159628526e-07, + "loss": -0.007, + "reward": 1.676719307899475, + "reward_std": 0.2228265404701233, + "rewards/accuracy_reward_stage2": 0.7079692482948303, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3234 + }, + { + "completion_length": 11.28125, + "epoch": 0.5668477308568425, + "grad_norm": 16.682095638705565, + "kl": 0.21875, + "learning_rate": 4.3332749255300503e-07, + "loss": -0.0576, + "reward": 1.720862865447998, + "reward_std": 0.27902647852897644, + "rewards/accuracy_reward_stage2": 0.7833628058433533, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3235 + }, + { + "completion_length": 13.171875, + "epoch": 0.56702295426669, + "grad_norm": 22.178935418309496, + "kl": 0.11328125, + "learning_rate": 4.331522691431575e-07, + "loss": -0.0209, + "reward": 1.4929907321929932, + "reward_std": 0.2933758497238159, + "rewards/accuracy_reward_stage2": 0.5242406129837036, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3236 + }, + { + "completion_length": 7.875, + "epoch": 0.5671981776765376, + "grad_norm": 18.121307300594328, + "kl": 0.0888671875, + "learning_rate": 4.3297704573330996e-07, + "loss": 0.0354, + "reward": 1.462658405303955, + "reward_std": 0.20405489206314087, + "rewards/accuracy_reward_stage2": 0.5876583456993103, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3237 + }, + { + "completion_length": 11.8125, + "epoch": 0.5673734010863851, + "grad_norm": 13.614637902537446, + "kl": 0.189453125, + "learning_rate": 4.328018223234624e-07, + "loss": -0.0101, + "reward": 1.1695168018341064, + "reward_std": 0.15401369333267212, + "rewards/accuracy_reward_stage2": 0.4507666826248169, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3238 + }, + { + "completion_length": 12.296875, + "epoch": 0.5675486244962327, + "grad_norm": 19.40824248666292, + "kl": 0.126953125, + "learning_rate": 4.3262659891361484e-07, + "loss": 0.0153, + "reward": 1.4984833002090454, + "reward_std": 0.2742450535297394, + "rewards/accuracy_reward_stage2": 0.5141082406044006, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3239 + }, + { + "completion_length": 12.03125, + "epoch": 0.5677238479060802, + "grad_norm": 18.540362816237884, + "kl": 0.03857421875, + "learning_rate": 4.324513755037673e-07, + "loss": 0.0154, + "reward": 1.5752973556518555, + "reward_std": 0.202922523021698, + "rewards/accuracy_reward_stage2": 0.5752974152565002, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3240 + }, + { + "completion_length": 11.09375, + "epoch": 0.5678990713159278, + "grad_norm": 14.610240774086812, + "kl": 0.2734375, + "learning_rate": 4.322761520939197e-07, + "loss": -0.0418, + "reward": 1.4417760372161865, + "reward_std": 0.21922755241394043, + "rewards/accuracy_reward_stage2": 0.5042760372161865, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3241 + }, + { + "completion_length": 5.53125, + "epoch": 0.5680742947257753, + "grad_norm": 21.848077026052675, + "kl": 0.10693359375, + "learning_rate": 4.3210092868407216e-07, + "loss": -0.0012, + "reward": 1.519402265548706, + "reward_std": 0.22549188137054443, + "rewards/accuracy_reward_stage2": 0.5350273251533508, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3242 + }, + { + "completion_length": 10.984375, + "epoch": 0.568249518135623, + "grad_norm": 18.597432240492093, + "kl": 0.2080078125, + "learning_rate": 4.319257052742246e-07, + "loss": -0.0013, + "reward": 1.4672976732254028, + "reward_std": 0.2723372280597687, + "rewards/accuracy_reward_stage2": 0.49854767322540283, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3243 + }, + { + "completion_length": 8.015625, + "epoch": 0.5684247415454705, + "grad_norm": 15.128065729385757, + "kl": 0.1796875, + "learning_rate": 4.317504818643771e-07, + "loss": 0.0051, + "reward": 1.5128765106201172, + "reward_std": 0.23403945565223694, + "rewards/accuracy_reward_stage2": 0.6535014510154724, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3244 + }, + { + "completion_length": 11.734375, + "epoch": 0.5685999649553181, + "grad_norm": 14.460722192729529, + "kl": 0.2021484375, + "learning_rate": 4.3157525845452953e-07, + "loss": -0.0005, + "reward": 1.4392145872116089, + "reward_std": 0.16994988918304443, + "rewards/accuracy_reward_stage2": 0.5954645276069641, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3245 + }, + { + "completion_length": 7.359375, + "epoch": 0.5687751883651656, + "grad_norm": 28.45521949425175, + "kl": 0.2734375, + "learning_rate": 4.31400035044682e-07, + "loss": 0.0649, + "reward": 1.4389383792877197, + "reward_std": 0.3385145366191864, + "rewards/accuracy_reward_stage2": 0.4545634388923645, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3246 + }, + { + "completion_length": 8.421875, + "epoch": 0.5689504117750132, + "grad_norm": 54.59620973864808, + "kl": 0.5546875, + "learning_rate": 4.3122481163483436e-07, + "loss": 0.003, + "reward": 1.2950458526611328, + "reward_std": 0.1936914175748825, + "rewards/accuracy_reward_stage2": 0.6544209122657776, + "rewards/format_reward_stage1_pointerpad": 0.640625, + "scores/accuracy_reward_stage2": 0.640625, + "step": 3247 + }, + { + "completion_length": 9.9375, + "epoch": 0.5691256351848607, + "grad_norm": 21.733520858543173, + "kl": 0.287109375, + "learning_rate": 4.3104958822498685e-07, + "loss": -0.0068, + "reward": 1.4032741785049438, + "reward_std": 0.3143426775932312, + "rewards/accuracy_reward_stage2": 0.45014917850494385, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3248 + }, + { + "completion_length": 11.03125, + "epoch": 0.5693008585947082, + "grad_norm": 28.34829740951872, + "kl": 0.08642578125, + "learning_rate": 4.308743648151393e-07, + "loss": -0.0097, + "reward": 1.5421795845031738, + "reward_std": 0.2444736212491989, + "rewards/accuracy_reward_stage2": 0.557804524898529, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3249 + }, + { + "completion_length": 12.578125, + "epoch": 0.5694760820045558, + "grad_norm": 19.836738683639865, + "kl": 0.15234375, + "learning_rate": 4.3069914140529173e-07, + "loss": -0.0679, + "reward": 1.429978847503662, + "reward_std": 0.3472153842449188, + "rewards/accuracy_reward_stage2": 0.4768539369106293, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3250 + }, + { + "completion_length": 10.46875, + "epoch": 0.5696513054144033, + "grad_norm": 21.483227411450418, + "kl": 0.0625, + "learning_rate": 4.3052391799544417e-07, + "loss": -0.0191, + "reward": 1.6257601976394653, + "reward_std": 0.26404887437820435, + "rewards/accuracy_reward_stage2": 0.6413851976394653, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3251 + }, + { + "completion_length": 7.140625, + "epoch": 0.5698265288242509, + "grad_norm": 19.57299909193561, + "kl": 0.12158203125, + "learning_rate": 4.3034869458559666e-07, + "loss": 0.0206, + "reward": 1.6792659759521484, + "reward_std": 0.2793692350387573, + "rewards/accuracy_reward_stage2": 0.6948908567428589, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3252 + }, + { + "completion_length": 9.1875, + "epoch": 0.5700017522340984, + "grad_norm": 19.498853474073666, + "kl": 0.05810546875, + "learning_rate": 4.3017347117574905e-07, + "loss": 0.0232, + "reward": 1.5725898742675781, + "reward_std": 0.1827564537525177, + "rewards/accuracy_reward_stage2": 0.6975898742675781, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3253 + }, + { + "completion_length": 9.0625, + "epoch": 0.570176975643946, + "grad_norm": 12.837473511274942, + "kl": 0.11083984375, + "learning_rate": 4.299982477659015e-07, + "loss": 0.0253, + "reward": 1.653282880783081, + "reward_std": 0.1653340756893158, + "rewards/accuracy_reward_stage2": 0.6689077615737915, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3254 + }, + { + "completion_length": 9.640625, + "epoch": 0.5703521990537936, + "grad_norm": 25.24313558707273, + "kl": 0.09912109375, + "learning_rate": 4.2982302435605393e-07, + "loss": 0.0396, + "reward": 1.8017048835754395, + "reward_std": 0.19715824723243713, + "rewards/accuracy_reward_stage2": 0.8017048835754395, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3255 + }, + { + "completion_length": 8.796875, + "epoch": 0.5705274224636412, + "grad_norm": 18.519638123786716, + "kl": 0.06591796875, + "learning_rate": 4.2964780094620637e-07, + "loss": 0.0263, + "reward": 1.553621768951416, + "reward_std": 0.22481247782707214, + "rewards/accuracy_reward_stage2": 0.5536218881607056, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3256 + }, + { + "completion_length": 9.71875, + "epoch": 0.5707026458734887, + "grad_norm": 19.563549003512524, + "kl": 0.1748046875, + "learning_rate": 4.2947257753635886e-07, + "loss": 0.0263, + "reward": 1.7168099880218506, + "reward_std": 0.3916775584220886, + "rewards/accuracy_reward_stage2": 0.7324349880218506, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3257 + }, + { + "completion_length": 15.375, + "epoch": 0.5708778692833363, + "grad_norm": 17.002630755032023, + "kl": 0.0888671875, + "learning_rate": 4.292973541265113e-07, + "loss": -0.0063, + "reward": 1.3024250268936157, + "reward_std": 0.2217259705066681, + "rewards/accuracy_reward_stage2": 0.44304996728897095, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3258 + }, + { + "completion_length": 8.5625, + "epoch": 0.5710530926931838, + "grad_norm": 18.175895476432554, + "kl": 0.158203125, + "learning_rate": 4.291221307166637e-07, + "loss": 0.0192, + "reward": 1.3714404106140137, + "reward_std": 0.17032676935195923, + "rewards/accuracy_reward_stage2": 0.6370653510093689, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3259 + }, + { + "completion_length": 12.046875, + "epoch": 0.5712283161030314, + "grad_norm": 19.595931739238125, + "kl": 0.072265625, + "learning_rate": 4.2894690730681613e-07, + "loss": 0.0289, + "reward": 1.6014660596847534, + "reward_std": 0.24930503964424133, + "rewards/accuracy_reward_stage2": 0.6014660000801086, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3260 + }, + { + "completion_length": 9.359375, + "epoch": 0.5714035395128789, + "grad_norm": 22.378946777705135, + "kl": 0.205078125, + "learning_rate": 4.287716838969686e-07, + "loss": 0.0454, + "reward": 1.6707240343093872, + "reward_std": 0.2596997022628784, + "rewards/accuracy_reward_stage2": 0.6863489747047424, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3261 + }, + { + "completion_length": 8.875, + "epoch": 0.5715787629227265, + "grad_norm": 13.881901021806375, + "kl": 0.099609375, + "learning_rate": 4.2859646048712106e-07, + "loss": -0.002, + "reward": 1.7320775985717773, + "reward_std": 0.10933174937963486, + "rewards/accuracy_reward_stage2": 0.7477025985717773, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3262 + }, + { + "completion_length": 6.3125, + "epoch": 0.571753986332574, + "grad_norm": 17.567583797864298, + "kl": 0.0732421875, + "learning_rate": 4.284212370772735e-07, + "loss": -0.0124, + "reward": 1.6161859035491943, + "reward_std": 0.2978987991809845, + "rewards/accuracy_reward_stage2": 0.6318109035491943, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3263 + }, + { + "completion_length": 11.5625, + "epoch": 0.5719292097424216, + "grad_norm": 22.225003221781638, + "kl": 0.15625, + "learning_rate": 4.2824601366742594e-07, + "loss": 0.0625, + "reward": 1.4586200714111328, + "reward_std": 0.3100087344646454, + "rewards/accuracy_reward_stage2": 0.45862019062042236, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3264 + }, + { + "completion_length": 10.140625, + "epoch": 0.5721044331522691, + "grad_norm": 17.636927566313457, + "kl": 0.0859375, + "learning_rate": 4.2807079025757843e-07, + "loss": -0.0056, + "reward": 1.7124474048614502, + "reward_std": 0.19638592004776, + "rewards/accuracy_reward_stage2": 0.7280724048614502, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3265 + }, + { + "completion_length": 10.859375, + "epoch": 0.5722796565621167, + "grad_norm": 15.538390882765091, + "kl": 0.1376953125, + "learning_rate": 4.278955668477308e-07, + "loss": 0.0109, + "reward": 1.5744549036026, + "reward_std": 0.17407383024692535, + "rewards/accuracy_reward_stage2": 0.5900799036026001, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3266 + }, + { + "completion_length": 6.171875, + "epoch": 0.5724548799719642, + "grad_norm": 8.95489831645316, + "kl": 0.072265625, + "learning_rate": 4.2772034343788326e-07, + "loss": 0.0289, + "reward": 1.5, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward_stage2": 0.625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3267 + }, + { + "completion_length": 9.671875, + "epoch": 0.5726301033818119, + "grad_norm": 23.077475062509656, + "kl": 0.1298828125, + "learning_rate": 4.275451200280357e-07, + "loss": 0.0518, + "reward": 1.4160887002944946, + "reward_std": 0.25131142139434814, + "rewards/accuracy_reward_stage2": 0.5410886406898499, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3268 + }, + { + "completion_length": 9.1875, + "epoch": 0.5728053267916594, + "grad_norm": 64.54471420516117, + "kl": 0.50390625, + "learning_rate": 4.273698966181882e-07, + "loss": 0.2022, + "reward": 1.4653161764144897, + "reward_std": 0.1672295182943344, + "rewards/accuracy_reward_stage2": 0.715316116809845, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3269 + }, + { + "completion_length": 8.71875, + "epoch": 0.572980550201507, + "grad_norm": 20.224070237612608, + "kl": 0.09521484375, + "learning_rate": 4.2719467320834063e-07, + "loss": 0.038, + "reward": 1.7414612770080566, + "reward_std": 0.14439380168914795, + "rewards/accuracy_reward_stage2": 0.7414612174034119, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3270 + }, + { + "completion_length": 11.921875, + "epoch": 0.5731557736113545, + "grad_norm": 11.893100562966017, + "kl": 0.10595703125, + "learning_rate": 4.2701944979849307e-07, + "loss": -0.0901, + "reward": 1.6197917461395264, + "reward_std": 0.2911488711833954, + "rewards/accuracy_reward_stage2": 0.6666666269302368, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3271 + }, + { + "completion_length": 13.0, + "epoch": 0.573330997021202, + "grad_norm": 18.123729535165488, + "kl": 0.1103515625, + "learning_rate": 4.2684422638864546e-07, + "loss": 0.0443, + "reward": 1.8838486671447754, + "reward_std": 0.19200080633163452, + "rewards/accuracy_reward_stage2": 0.8838486671447754, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3272 + }, + { + "completion_length": 12.796875, + "epoch": 0.5735062204310496, + "grad_norm": 16.187098739667356, + "kl": 0.166015625, + "learning_rate": 4.2666900297879795e-07, + "loss": 0.0375, + "reward": 1.6712990999221802, + "reward_std": 0.21961762011051178, + "rewards/accuracy_reward_stage2": 0.6869240999221802, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3273 + }, + { + "completion_length": 11.578125, + "epoch": 0.5736814438408971, + "grad_norm": 17.906472432934265, + "kl": 0.07177734375, + "learning_rate": 4.264937795689504e-07, + "loss": 0.0288, + "reward": 1.4957730770111084, + "reward_std": 0.1836545616388321, + "rewards/accuracy_reward_stage2": 0.4957730174064636, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3274 + }, + { + "completion_length": 10.875, + "epoch": 0.5738566672507447, + "grad_norm": 15.113223087609942, + "kl": 0.126953125, + "learning_rate": 4.2631855615910283e-07, + "loss": 0.0066, + "reward": 1.6197917461395264, + "reward_std": 0.21341678500175476, + "rewards/accuracy_reward_stage2": 0.8697916269302368, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3275 + }, + { + "completion_length": 14.546875, + "epoch": 0.5740318906605922, + "grad_norm": 25.882139996220964, + "kl": 0.1396484375, + "learning_rate": 4.2614333274925527e-07, + "loss": 0.056, + "reward": 1.4892785549163818, + "reward_std": 0.3860580623149872, + "rewards/accuracy_reward_stage2": 0.6142784357070923, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3276 + }, + { + "completion_length": 9.484375, + "epoch": 0.5742071140704398, + "grad_norm": 19.216446272569474, + "kl": 0.036865234375, + "learning_rate": 4.2596810933940776e-07, + "loss": 0.0148, + "reward": 1.5715553760528564, + "reward_std": 0.15116415917873383, + "rewards/accuracy_reward_stage2": 0.6965553760528564, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3277 + }, + { + "completion_length": 12.375, + "epoch": 0.5743823374802873, + "grad_norm": 21.357406798195083, + "kl": 0.130859375, + "learning_rate": 4.257928859295602e-07, + "loss": 0.0244, + "reward": 1.8721911907196045, + "reward_std": 0.24186889827251434, + "rewards/accuracy_reward_stage2": 0.8878160715103149, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3278 + }, + { + "completion_length": 7.1875, + "epoch": 0.5745575608901349, + "grad_norm": 19.448846480307616, + "kl": 0.13671875, + "learning_rate": 4.256176625197126e-07, + "loss": 0.0546, + "reward": 1.5123186111450195, + "reward_std": 0.18857964873313904, + "rewards/accuracy_reward_stage2": 0.5123186111450195, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3279 + }, + { + "completion_length": 15.390625, + "epoch": 0.5747327842999824, + "grad_norm": 21.288459897921552, + "kl": 0.1552734375, + "learning_rate": 4.2544243910986503e-07, + "loss": 0.0333, + "reward": 1.317323923110962, + "reward_std": 0.1799907237291336, + "rewards/accuracy_reward_stage2": 0.4579489529132843, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3280 + }, + { + "completion_length": 15.078125, + "epoch": 0.5749080077098301, + "grad_norm": 21.07082925093428, + "kl": 0.1865234375, + "learning_rate": 4.252672157000175e-07, + "loss": 0.0395, + "reward": 1.291269063949585, + "reward_std": 0.18487222492694855, + "rewards/accuracy_reward_stage2": 0.5568939447402954, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3281 + }, + { + "completion_length": 18.15625, + "epoch": 0.5750832311196776, + "grad_norm": 19.913425559679723, + "kl": 0.138671875, + "learning_rate": 4.2509199229016996e-07, + "loss": 0.0557, + "reward": 1.2240808010101318, + "reward_std": 0.22274786233901978, + "rewards/accuracy_reward_stage2": 0.47408074140548706, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3282 + }, + { + "completion_length": 11.765625, + "epoch": 0.5752584545295252, + "grad_norm": 21.420756353044013, + "kl": 0.05078125, + "learning_rate": 4.249167688803224e-07, + "loss": 0.0203, + "reward": 1.5513889789581299, + "reward_std": 0.2648440897464752, + "rewards/accuracy_reward_stage2": 0.6763889193534851, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3283 + }, + { + "completion_length": 8.15625, + "epoch": 0.5754336779393727, + "grad_norm": 17.8684394818383, + "kl": 0.054931640625, + "learning_rate": 4.2474154547047484e-07, + "loss": 0.0219, + "reward": 1.5660185813903809, + "reward_std": 0.2910422384738922, + "rewards/accuracy_reward_stage2": 0.5660187005996704, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3284 + }, + { + "completion_length": 7.75, + "epoch": 0.5756089013492203, + "grad_norm": 4.057894997719182, + "kl": 0.04150390625, + "learning_rate": 4.245663220606273e-07, + "loss": -0.0276, + "reward": 1.859375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward_stage2": 0.875, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3285 + }, + { + "completion_length": 8.953125, + "epoch": 0.5757841247590678, + "grad_norm": 22.31527931446079, + "kl": 0.1435546875, + "learning_rate": 4.243910986507797e-07, + "loss": 0.0478, + "reward": 1.3714749813079834, + "reward_std": 0.2784489393234253, + "rewards/accuracy_reward_stage2": 0.5120998620986938, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3286 + }, + { + "completion_length": 7.671875, + "epoch": 0.5759593481689154, + "grad_norm": 17.750280283958453, + "kl": 0.1279296875, + "learning_rate": 4.2421587524093216e-07, + "loss": 0.008, + "reward": 1.673346996307373, + "reward_std": 0.20908769965171814, + "rewards/accuracy_reward_stage2": 0.8139719367027283, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3287 + }, + { + "completion_length": 8.515625, + "epoch": 0.5761345715787629, + "grad_norm": 28.452469508830507, + "kl": 0.201171875, + "learning_rate": 4.240406518310846e-07, + "loss": 0.0459, + "reward": 1.407305121421814, + "reward_std": 0.3242839276790619, + "rewards/accuracy_reward_stage2": 0.547930121421814, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3288 + }, + { + "completion_length": 9.625, + "epoch": 0.5763097949886105, + "grad_norm": 20.764420247606466, + "kl": 0.158203125, + "learning_rate": 4.238654284212371e-07, + "loss": 0.0632, + "reward": 1.3577320575714111, + "reward_std": 0.3342527151107788, + "rewards/accuracy_reward_stage2": 0.6077320575714111, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3289 + }, + { + "completion_length": 7.265625, + "epoch": 0.576485018398458, + "grad_norm": 13.598394732574489, + "kl": 0.1103515625, + "learning_rate": 4.2369020501138953e-07, + "loss": -0.0002, + "reward": 1.894986629486084, + "reward_std": 0.11542447656393051, + "rewards/accuracy_reward_stage2": 0.910611629486084, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3290 + }, + { + "completion_length": 7.875, + "epoch": 0.5766602418083056, + "grad_norm": 14.29332734624939, + "kl": 0.0252685546875, + "learning_rate": 4.235149816015419e-07, + "loss": 0.0101, + "reward": 1.828125, + "reward_std": 0.10311973094940186, + "rewards/accuracy_reward_stage2": 0.828125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3291 + }, + { + "completion_length": 10.6875, + "epoch": 0.5768354652181531, + "grad_norm": 18.227115369491855, + "kl": 0.03564453125, + "learning_rate": 4.2333975819169436e-07, + "loss": 0.0143, + "reward": 1.329542875289917, + "reward_std": 0.17716088891029358, + "rewards/accuracy_reward_stage2": 0.5795428156852722, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3292 + }, + { + "completion_length": 9.984375, + "epoch": 0.5770106886280008, + "grad_norm": 17.617103800867277, + "kl": 0.064453125, + "learning_rate": 4.2316453478184685e-07, + "loss": 0.0258, + "reward": 1.662729024887085, + "reward_std": 0.15877869725227356, + "rewards/accuracy_reward_stage2": 0.662729024887085, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3293 + }, + { + "completion_length": 8.28125, + "epoch": 0.5771859120378483, + "grad_norm": 31.01227260578467, + "kl": 0.047607421875, + "learning_rate": 4.229893113719993e-07, + "loss": 0.0191, + "reward": 1.683934211730957, + "reward_std": 0.22675946354866028, + "rewards/accuracy_reward_stage2": 0.683934211730957, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3294 + }, + { + "completion_length": 7.515625, + "epoch": 0.5773611354476959, + "grad_norm": 12.294745744586768, + "kl": 0.1796875, + "learning_rate": 4.2281408796215173e-07, + "loss": -0.0054, + "reward": 1.859375, + "reward_std": 0.20189079642295837, + "rewards/accuracy_reward_stage2": 0.890625, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3295 + }, + { + "completion_length": 8.515625, + "epoch": 0.5775363588575434, + "grad_norm": 12.746122951348797, + "kl": 0.09033203125, + "learning_rate": 4.2263886455230417e-07, + "loss": -0.0081, + "reward": 1.6589066982269287, + "reward_std": 0.1634528785943985, + "rewards/accuracy_reward_stage2": 0.6745317578315735, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3296 + }, + { + "completion_length": 11.21875, + "epoch": 0.5777115822673909, + "grad_norm": 18.010685879221356, + "kl": 0.04736328125, + "learning_rate": 4.2246364114245666e-07, + "loss": 0.0189, + "reward": 1.5710866451263428, + "reward_std": 0.16242891550064087, + "rewards/accuracy_reward_stage2": 0.6960866451263428, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3297 + }, + { + "completion_length": 17.09375, + "epoch": 0.5778868056772385, + "grad_norm": 16.5643178297475, + "kl": 0.034423828125, + "learning_rate": 4.2228841773260905e-07, + "loss": 0.0138, + "reward": 1.6158677339553833, + "reward_std": 0.1544964611530304, + "rewards/accuracy_reward_stage2": 0.6158677339553833, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3298 + }, + { + "completion_length": 7.578125, + "epoch": 0.578062029087086, + "grad_norm": 21.732843806909315, + "kl": 0.060302734375, + "learning_rate": 4.221131943227615e-07, + "loss": 0.0241, + "reward": 1.7844496965408325, + "reward_std": 0.1314665973186493, + "rewards/accuracy_reward_stage2": 0.7844497561454773, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3299 + }, + { + "completion_length": 8.671875, + "epoch": 0.5782372524969336, + "grad_norm": 18.278951218401343, + "kl": 0.033447265625, + "learning_rate": 4.219379709129139e-07, + "loss": 0.0134, + "reward": 1.2312500476837158, + "reward_std": 0.1976664662361145, + "rewards/accuracy_reward_stage2": 0.23125001788139343, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3300 + }, + { + "completion_length": 10.265625, + "epoch": 0.5784124759067811, + "grad_norm": 85.56994101305463, + "kl": 0.466796875, + "learning_rate": 4.217627475030664e-07, + "loss": 0.1873, + "reward": 1.397374153137207, + "reward_std": 0.17502638697624207, + "rewards/accuracy_reward_stage2": 0.5223740935325623, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3301 + }, + { + "completion_length": 17.734375, + "epoch": 0.5785876993166287, + "grad_norm": 13.14692014606543, + "kl": 0.10400390625, + "learning_rate": 4.2158752409321886e-07, + "loss": 0.0026, + "reward": 1.8054771423339844, + "reward_std": 0.22878023982048035, + "rewards/accuracy_reward_stage2": 0.8211021423339844, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3302 + }, + { + "completion_length": 12.46875, + "epoch": 0.5787629227264762, + "grad_norm": 26.435353069087956, + "kl": 0.1005859375, + "learning_rate": 4.214123006833713e-07, + "loss": 0.0402, + "reward": 1.5523478984832764, + "reward_std": 0.25822532176971436, + "rewards/accuracy_reward_stage2": 0.5523478388786316, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3303 + }, + { + "completion_length": 7.5625, + "epoch": 0.5789381461363238, + "grad_norm": 17.94646914207544, + "kl": 0.1337890625, + "learning_rate": 4.212370772735237e-07, + "loss": 0.0168, + "reward": 1.4311730861663818, + "reward_std": 0.22365109622478485, + "rewards/accuracy_reward_stage2": 0.44679805636405945, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3304 + }, + { + "completion_length": 8.125, + "epoch": 0.5791133695461713, + "grad_norm": 20.73112396046394, + "kl": 0.1484375, + "learning_rate": 4.210618538636762e-07, + "loss": 0.0046, + "reward": 1.682603120803833, + "reward_std": 0.2785916328430176, + "rewards/accuracy_reward_stage2": 0.7138531804084778, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3305 + }, + { + "completion_length": 12.78125, + "epoch": 0.579288592956019, + "grad_norm": 17.497448798545236, + "kl": 0.150390625, + "learning_rate": 4.208866304538286e-07, + "loss": 0.0432, + "reward": 1.5918843746185303, + "reward_std": 0.16210268437862396, + "rewards/accuracy_reward_stage2": 0.7325093746185303, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3306 + }, + { + "completion_length": 8.953125, + "epoch": 0.5794638163658665, + "grad_norm": 18.037814272784697, + "kl": 0.10009765625, + "learning_rate": 4.2071140704398106e-07, + "loss": 0.0401, + "reward": 1.8454476594924927, + "reward_std": 0.18337611854076385, + "rewards/accuracy_reward_stage2": 0.8454477190971375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3307 + }, + { + "completion_length": 12.078125, + "epoch": 0.5796390397757141, + "grad_norm": 18.95994598789497, + "kl": 0.08837890625, + "learning_rate": 4.205361836341335e-07, + "loss": 0.0354, + "reward": 1.4737579822540283, + "reward_std": 0.13294795155525208, + "rewards/accuracy_reward_stage2": 0.4737580418586731, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3308 + }, + { + "completion_length": 9.515625, + "epoch": 0.5798142631855616, + "grad_norm": 10.774608652834829, + "kl": 0.12890625, + "learning_rate": 4.20360960224286e-07, + "loss": 0.0075, + "reward": 1.5416667461395264, + "reward_std": 0.1473139077425003, + "rewards/accuracy_reward_stage2": 0.6822916865348816, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3309 + }, + { + "completion_length": 9.1875, + "epoch": 0.5799894865954092, + "grad_norm": 17.368178364877604, + "kl": 0.035400390625, + "learning_rate": 4.201857368144384e-07, + "loss": 0.0141, + "reward": 1.4706447124481201, + "reward_std": 0.23802317678928375, + "rewards/accuracy_reward_stage2": 0.47064468264579773, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3310 + }, + { + "completion_length": 9.328125, + "epoch": 0.5801647100052567, + "grad_norm": 20.77313014791262, + "kl": 0.08203125, + "learning_rate": 4.200105134045908e-07, + "loss": 0.0328, + "reward": 1.747942566871643, + "reward_std": 0.24145202338695526, + "rewards/accuracy_reward_stage2": 0.7479425668716431, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3311 + }, + { + "completion_length": 9.625, + "epoch": 0.5803399334151043, + "grad_norm": 18.866424210680346, + "kl": 0.06591796875, + "learning_rate": 4.1983528999474325e-07, + "loss": 0.0263, + "reward": 1.5332136154174805, + "reward_std": 0.08188802003860474, + "rewards/accuracy_reward_stage2": 0.7832136750221252, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3312 + }, + { + "completion_length": 13.578125, + "epoch": 0.5805151568249518, + "grad_norm": 21.807267192943407, + "kl": 0.07568359375, + "learning_rate": 4.1966006658489575e-07, + "loss": 0.0303, + "reward": 1.578660011291504, + "reward_std": 0.20007401704788208, + "rewards/accuracy_reward_stage2": 0.5786599516868591, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3313 + }, + { + "completion_length": 25.265625, + "epoch": 0.5806903802347994, + "grad_norm": 20.275197723166627, + "kl": 0.046142578125, + "learning_rate": 4.194848431750482e-07, + "loss": 0.0185, + "reward": 1.633068561553955, + "reward_std": 0.09468790143728256, + "rewards/accuracy_reward_stage2": 0.6330685615539551, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3314 + }, + { + "completion_length": 7.609375, + "epoch": 0.5808656036446469, + "grad_norm": 16.043358849152995, + "kl": 0.1015625, + "learning_rate": 4.193096197652006e-07, + "loss": -0.0475, + "reward": 1.3541667461395264, + "reward_std": 0.1178511381149292, + "rewards/accuracy_reward_stage2": 0.5104166865348816, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3315 + }, + { + "completion_length": 13.671875, + "epoch": 0.5810408270544944, + "grad_norm": 23.18812295368805, + "kl": 0.1455078125, + "learning_rate": 4.1913439635535307e-07, + "loss": 0.0208, + "reward": 1.467007040977478, + "reward_std": 0.25777289271354675, + "rewards/accuracy_reward_stage2": 0.4826321005821228, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3316 + }, + { + "completion_length": 9.0, + "epoch": 0.581216050464342, + "grad_norm": 16.94807832750533, + "kl": 0.12060546875, + "learning_rate": 4.189591729455055e-07, + "loss": 0.0483, + "reward": 1.5463520288467407, + "reward_std": 0.20520761609077454, + "rewards/accuracy_reward_stage2": 0.6713520288467407, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3317 + }, + { + "completion_length": 10.484375, + "epoch": 0.5813912738741895, + "grad_norm": 15.590820843421067, + "kl": 0.2158203125, + "learning_rate": 4.1878394953565794e-07, + "loss": 0.0861, + "reward": 1.3699464797973633, + "reward_std": 0.10645575821399689, + "rewards/accuracy_reward_stage2": 0.4949463903903961, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3318 + }, + { + "completion_length": 7.625, + "epoch": 0.5815664972840372, + "grad_norm": 18.720185814431826, + "kl": 0.11083984375, + "learning_rate": 4.186087261258104e-07, + "loss": 0.0002, + "reward": 1.5887812376022339, + "reward_std": 0.19031430780887604, + "rewards/accuracy_reward_stage2": 0.6044062376022339, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3319 + }, + { + "completion_length": 10.375, + "epoch": 0.5817417206938847, + "grad_norm": 18.988882850470866, + "kl": 0.07373046875, + "learning_rate": 4.184335027159628e-07, + "loss": 0.0296, + "reward": 1.48932945728302, + "reward_std": 0.2587651014328003, + "rewards/accuracy_reward_stage2": 0.48932945728302, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3320 + }, + { + "completion_length": 13.140625, + "epoch": 0.5819169441037323, + "grad_norm": 15.464298948409237, + "kl": 0.1171875, + "learning_rate": 4.182582793061153e-07, + "loss": 0.0468, + "reward": 1.3898260593414307, + "reward_std": 0.2174256443977356, + "rewards/accuracy_reward_stage2": 0.5148261189460754, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3321 + }, + { + "completion_length": 5.78125, + "epoch": 0.5820921675135798, + "grad_norm": 14.489600309623796, + "kl": 0.18359375, + "learning_rate": 4.1808305589626776e-07, + "loss": 0.0732, + "reward": 1.6758959293365479, + "reward_std": 0.21336817741394043, + "rewards/accuracy_reward_stage2": 0.8008958101272583, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3322 + }, + { + "completion_length": 8.015625, + "epoch": 0.5822673909234274, + "grad_norm": 18.704860838352552, + "kl": 0.185546875, + "learning_rate": 4.1790783248642014e-07, + "loss": 0.0743, + "reward": 1.3735308647155762, + "reward_std": 0.21026110649108887, + "rewards/accuracy_reward_stage2": 0.4985308051109314, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3323 + }, + { + "completion_length": 9.984375, + "epoch": 0.5824426143332749, + "grad_norm": 20.995400462058033, + "kl": 0.1689453125, + "learning_rate": 4.177326090765726e-07, + "loss": 0.0461, + "reward": 1.6412947177886963, + "reward_std": 0.24216331541538239, + "rewards/accuracy_reward_stage2": 0.6569197177886963, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3324 + }, + { + "completion_length": 5.859375, + "epoch": 0.5826178377431225, + "grad_norm": 18.18017332400372, + "kl": 0.09228515625, + "learning_rate": 4.17557385666725e-07, + "loss": -0.0048, + "reward": 1.6639931201934814, + "reward_std": 0.19962234795093536, + "rewards/accuracy_reward_stage2": 0.6796180605888367, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3325 + }, + { + "completion_length": 11.421875, + "epoch": 0.58279306115297, + "grad_norm": 18.2333607957152, + "kl": 0.1572265625, + "learning_rate": 4.173821622568775e-07, + "loss": -0.0032, + "reward": 1.7223223447799683, + "reward_std": 0.2541934847831726, + "rewards/accuracy_reward_stage2": 0.7535722255706787, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3326 + }, + { + "completion_length": 14.15625, + "epoch": 0.5829682845628176, + "grad_norm": 22.09752305955603, + "kl": 0.09814453125, + "learning_rate": 4.1720693884702995e-07, + "loss": 0.0044, + "reward": 1.6956291198730469, + "reward_std": 0.3532159924507141, + "rewards/accuracy_reward_stage2": 0.7112541198730469, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3327 + }, + { + "completion_length": 10.734375, + "epoch": 0.5831435079726651, + "grad_norm": 21.51765639958224, + "kl": 0.11767578125, + "learning_rate": 4.170317154371824e-07, + "loss": -0.0271, + "reward": 1.7271101474761963, + "reward_std": 0.2501765787601471, + "rewards/accuracy_reward_stage2": 0.7583601474761963, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3328 + }, + { + "completion_length": 17.984375, + "epoch": 0.5833187313825127, + "grad_norm": 18.08290871320803, + "kl": 0.04833984375, + "learning_rate": 4.168564920273349e-07, + "loss": 0.0193, + "reward": 1.63839852809906, + "reward_std": 0.08872491121292114, + "rewards/accuracy_reward_stage2": 0.6383985877037048, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3329 + }, + { + "completion_length": 8.421875, + "epoch": 0.5834939547923602, + "grad_norm": 20.120055979618456, + "kl": 0.1708984375, + "learning_rate": 4.1668126861748727e-07, + "loss": -0.0333, + "reward": 1.407860517501831, + "reward_std": 0.34330761432647705, + "rewards/accuracy_reward_stage2": 0.45473557710647583, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3330 + }, + { + "completion_length": 14.484375, + "epoch": 0.5836691782022078, + "grad_norm": 18.739297757537408, + "kl": 0.07080078125, + "learning_rate": 4.165060452076397e-07, + "loss": 0.0283, + "reward": 1.487917423248291, + "reward_std": 0.20292870700359344, + "rewards/accuracy_reward_stage2": 0.6129173040390015, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3331 + }, + { + "completion_length": 11.3125, + "epoch": 0.5838444016120554, + "grad_norm": 16.014175859750328, + "kl": 0.05029296875, + "learning_rate": 4.1633082179779215e-07, + "loss": 0.0202, + "reward": 1.8262572288513184, + "reward_std": 0.1389392912387848, + "rewards/accuracy_reward_stage2": 0.8262572288513184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3332 + }, + { + "completion_length": 10.40625, + "epoch": 0.584019625021903, + "grad_norm": 22.951480118000163, + "kl": 0.10205078125, + "learning_rate": 4.161555983879446e-07, + "loss": 0.0119, + "reward": 1.6964985132217407, + "reward_std": 0.14089880883693695, + "rewards/accuracy_reward_stage2": 0.7121233940124512, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3333 + }, + { + "completion_length": 8.96875, + "epoch": 0.5841948484317505, + "grad_norm": 19.04509047433968, + "kl": 0.1025390625, + "learning_rate": 4.159803749780971e-07, + "loss": 0.041, + "reward": 1.8162028789520264, + "reward_std": 0.12637542188167572, + "rewards/accuracy_reward_stage2": 0.8162027597427368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3334 + }, + { + "completion_length": 8.359375, + "epoch": 0.5843700718415981, + "grad_norm": 16.61353049886891, + "kl": 0.1181640625, + "learning_rate": 4.158051515682495e-07, + "loss": 0.0474, + "reward": 1.424905776977539, + "reward_std": 0.14749082922935486, + "rewards/accuracy_reward_stage2": 0.5499057769775391, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3335 + }, + { + "completion_length": 12.421875, + "epoch": 0.5845452952514456, + "grad_norm": 15.680900749988403, + "kl": 0.0172119140625, + "learning_rate": 4.156299281584019e-07, + "loss": 0.0069, + "reward": 1.6614583730697632, + "reward_std": 0.12134584784507751, + "rewards/accuracy_reward_stage2": 0.6614583134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3336 + }, + { + "completion_length": 8.53125, + "epoch": 0.5847205186612932, + "grad_norm": 17.680179150240633, + "kl": 0.045166015625, + "learning_rate": 4.1545470474855435e-07, + "loss": 0.0181, + "reward": 1.447710394859314, + "reward_std": 0.159596785902977, + "rewards/accuracy_reward_stage2": 0.572710394859314, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3337 + }, + { + "completion_length": 15.8125, + "epoch": 0.5848957420711407, + "grad_norm": 16.446810362956278, + "kl": 0.10498046875, + "learning_rate": 4.1527948133870684e-07, + "loss": -0.0023, + "reward": 1.7206721305847168, + "reward_std": 0.27663755416870117, + "rewards/accuracy_reward_stage2": 0.736297070980072, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3338 + }, + { + "completion_length": 9.359375, + "epoch": 0.5850709654809882, + "grad_norm": 21.36734395553362, + "kl": 0.1552734375, + "learning_rate": 4.151042579288593e-07, + "loss": 0.0623, + "reward": 1.713259220123291, + "reward_std": 0.25472259521484375, + "rewards/accuracy_reward_stage2": 0.8382592797279358, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3339 + }, + { + "completion_length": 9.859375, + "epoch": 0.5852461888908358, + "grad_norm": 31.907468674326463, + "kl": 0.10400390625, + "learning_rate": 4.149290345190117e-07, + "loss": 0.0103, + "reward": 1.605589747428894, + "reward_std": 0.32869046926498413, + "rewards/accuracy_reward_stage2": 0.6212146878242493, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3340 + }, + { + "completion_length": 10.671875, + "epoch": 0.5854214123006833, + "grad_norm": 19.527174473286316, + "kl": 0.2373046875, + "learning_rate": 4.1475381110916416e-07, + "loss": -0.0196, + "reward": 1.3388493061065674, + "reward_std": 0.3259095251560211, + "rewards/accuracy_reward_stage2": 0.5107242465019226, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3341 + }, + { + "completion_length": 12.9375, + "epoch": 0.5855966357105309, + "grad_norm": 16.606210595175256, + "kl": 0.091796875, + "learning_rate": 4.145785876993166e-07, + "loss": 0.0366, + "reward": 1.2201833724975586, + "reward_std": 0.21854467689990997, + "rewards/accuracy_reward_stage2": 0.34518343210220337, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3342 + }, + { + "completion_length": 11.296875, + "epoch": 0.5857718591203784, + "grad_norm": 24.992915564062983, + "kl": 21.625, + "learning_rate": 4.1440336428946904e-07, + "loss": 8.678, + "reward": 1.3538477420806885, + "reward_std": 0.17989769577980042, + "rewards/accuracy_reward_stage2": 0.4788476824760437, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3343 + }, + { + "completion_length": 7.5, + "epoch": 0.5859470825302261, + "grad_norm": 20.341540212078797, + "kl": 0.17578125, + "learning_rate": 4.142281408796215e-07, + "loss": 0.0704, + "reward": 1.651244044303894, + "reward_std": 0.2022555023431778, + "rewards/accuracy_reward_stage2": 0.6512439846992493, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3344 + }, + { + "completion_length": 7.21875, + "epoch": 0.5861223059400736, + "grad_norm": 23.357858185375914, + "kl": 0.06884765625, + "learning_rate": 4.140529174697739e-07, + "loss": 0.0276, + "reward": 1.7915301322937012, + "reward_std": 0.26167210936546326, + "rewards/accuracy_reward_stage2": 0.7915301322937012, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3345 + }, + { + "completion_length": 12.4375, + "epoch": 0.5862975293499212, + "grad_norm": 7.846556815175532, + "kl": 0.056884765625, + "learning_rate": 4.138776940599264e-07, + "loss": 0.0227, + "reward": 1.734375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward_stage2": 0.859375, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3346 + }, + { + "completion_length": 11.84375, + "epoch": 0.5864727527597687, + "grad_norm": 18.95217996727311, + "kl": 0.1484375, + "learning_rate": 4.1370247065007885e-07, + "loss": 0.0594, + "reward": 1.5207836627960205, + "reward_std": 0.2098308801651001, + "rewards/accuracy_reward_stage2": 0.6457836627960205, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3347 + }, + { + "completion_length": 9.671875, + "epoch": 0.5866479761696163, + "grad_norm": 27.59838884419405, + "kl": 0.259765625, + "learning_rate": 4.135272472402313e-07, + "loss": 0.0232, + "reward": 1.4346290826797485, + "reward_std": 0.3767169117927551, + "rewards/accuracy_reward_stage2": 0.4658791422843933, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3348 + }, + { + "completion_length": 7.0, + "epoch": 0.5868231995794638, + "grad_norm": 9.903665941983448, + "kl": 0.12060546875, + "learning_rate": 4.133520238303837e-07, + "loss": 0.004, + "reward": 1.5, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward_stage2": 0.515625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3349 + }, + { + "completion_length": 6.234375, + "epoch": 0.5869984229893114, + "grad_norm": 18.693980258556312, + "kl": 0.119140625, + "learning_rate": 4.1317680042053617e-07, + "loss": 0.0034, + "reward": 1.6041667461395264, + "reward_std": 0.2158295214176178, + "rewards/accuracy_reward_stage2": 0.6197916865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3350 + }, + { + "completion_length": 10.703125, + "epoch": 0.5871736463991589, + "grad_norm": 22.560774928412627, + "kl": 0.169921875, + "learning_rate": 4.130015770106886e-07, + "loss": 0.068, + "reward": 1.4317594766616821, + "reward_std": 0.2960038483142853, + "rewards/accuracy_reward_stage2": 0.5567594766616821, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3351 + }, + { + "completion_length": 13.90625, + "epoch": 0.5873488698090065, + "grad_norm": 26.03373808729502, + "kl": 0.3046875, + "learning_rate": 4.1282635360084105e-07, + "loss": 0.0931, + "reward": 1.2304809093475342, + "reward_std": 0.17618829011917114, + "rewards/accuracy_reward_stage2": 0.4961059093475342, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3352 + }, + { + "completion_length": 9.171875, + "epoch": 0.587524093218854, + "grad_norm": 15.91817488703617, + "kl": 0.1259765625, + "learning_rate": 4.126511301909935e-07, + "loss": 0.0062, + "reward": 1.702805757522583, + "reward_std": 0.28147825598716736, + "rewards/accuracy_reward_stage2": 0.718430757522583, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3353 + }, + { + "completion_length": 10.328125, + "epoch": 0.5876993166287016, + "grad_norm": 18.887598806244405, + "kl": 0.2294921875, + "learning_rate": 4.12475906781146e-07, + "loss": 0.0067, + "reward": 1.3210906982421875, + "reward_std": 0.26162979006767273, + "rewards/accuracy_reward_stage2": 0.4773406982421875, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3354 + }, + { + "completion_length": 12.0625, + "epoch": 0.5878745400385491, + "grad_norm": 19.96975439666355, + "kl": 0.072265625, + "learning_rate": 4.1230068337129837e-07, + "loss": 0.029, + "reward": 1.5819294452667236, + "reward_std": 0.3019178509712219, + "rewards/accuracy_reward_stage2": 0.7069293260574341, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3355 + }, + { + "completion_length": 10.046875, + "epoch": 0.5880497634483967, + "grad_norm": 19.25697606660207, + "kl": 0.1376953125, + "learning_rate": 4.121254599614508e-07, + "loss": -0.0123, + "reward": 1.608581304550171, + "reward_std": 0.2221478521823883, + "rewards/accuracy_reward_stage2": 0.6398313641548157, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3356 + }, + { + "completion_length": 9.671875, + "epoch": 0.5882249868582443, + "grad_norm": 21.210323483932303, + "kl": 0.11767578125, + "learning_rate": 4.1195023655160325e-07, + "loss": 0.0471, + "reward": 1.3723440170288086, + "reward_std": 0.18884873390197754, + "rewards/accuracy_reward_stage2": 0.4973440170288086, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3357 + }, + { + "completion_length": 8.859375, + "epoch": 0.5884002102680919, + "grad_norm": 26.110479544755435, + "kl": 0.16015625, + "learning_rate": 4.1177501314175574e-07, + "loss": 0.0199, + "reward": 1.52445387840271, + "reward_std": 0.2836562991142273, + "rewards/accuracy_reward_stage2": 0.6650788187980652, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3358 + }, + { + "completion_length": 7.421875, + "epoch": 0.5885754336779394, + "grad_norm": 14.95239738422591, + "kl": 0.0859375, + "learning_rate": 4.115997897319082e-07, + "loss": -0.0099, + "reward": 1.7336453199386597, + "reward_std": 0.18917325139045715, + "rewards/accuracy_reward_stage2": 0.7492703199386597, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3359 + }, + { + "completion_length": 10.0, + "epoch": 0.588750657087787, + "grad_norm": 10.26649766816332, + "kl": 0.06201171875, + "learning_rate": 4.114245663220606e-07, + "loss": 0.0249, + "reward": 1.59375, + "reward_std": 0.16675157845020294, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3360 + }, + { + "completion_length": 19.609375, + "epoch": 0.5889258804976345, + "grad_norm": 19.542471997914472, + "kl": 0.0301513671875, + "learning_rate": 4.11249342912213e-07, + "loss": 0.0121, + "reward": 1.4741116762161255, + "reward_std": 0.1889735460281372, + "rewards/accuracy_reward_stage2": 0.4741116762161255, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3361 + }, + { + "completion_length": 9.859375, + "epoch": 0.589101103907482, + "grad_norm": 15.580185778605106, + "kl": 0.1396484375, + "learning_rate": 4.110741195023655e-07, + "loss": -0.006, + "reward": 1.5328525304794312, + "reward_std": 0.24839192628860474, + "rewards/accuracy_reward_stage2": 0.5641025304794312, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3362 + }, + { + "completion_length": 10.8125, + "epoch": 0.5892763273173296, + "grad_norm": 27.01172403275803, + "kl": 0.0654296875, + "learning_rate": 4.1089889609251794e-07, + "loss": 0.0262, + "reward": 1.5872222185134888, + "reward_std": 0.27706378698349, + "rewards/accuracy_reward_stage2": 0.7122222185134888, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3363 + }, + { + "completion_length": 19.6875, + "epoch": 0.5894515507271771, + "grad_norm": 18.45051149058737, + "kl": 0.109375, + "learning_rate": 4.107236726826704e-07, + "loss": 0.0437, + "reward": 1.0697689056396484, + "reward_std": 0.15089532732963562, + "rewards/accuracy_reward_stage2": 0.31976890563964844, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3364 + }, + { + "completion_length": 10.375, + "epoch": 0.5896267741370247, + "grad_norm": 21.58470674598541, + "kl": 0.1376953125, + "learning_rate": 4.105484492728228e-07, + "loss": -0.0107, + "reward": 1.2813940048217773, + "reward_std": 0.26054540276527405, + "rewards/accuracy_reward_stage2": 0.43764400482177734, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3365 + }, + { + "completion_length": 4.265625, + "epoch": 0.5898019975468722, + "grad_norm": 11.649879587538361, + "kl": 0.1630859375, + "learning_rate": 4.103732258629753e-07, + "loss": -0.0623, + "reward": 1.796875, + "reward_std": 0.18139132857322693, + "rewards/accuracy_reward_stage2": 0.84375, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3366 + }, + { + "completion_length": 8.046875, + "epoch": 0.5899772209567198, + "grad_norm": 23.81306888391832, + "kl": 0.1279296875, + "learning_rate": 4.1019800245312775e-07, + "loss": 0.0511, + "reward": 1.5414574146270752, + "reward_std": 0.25763386487960815, + "rewards/accuracy_reward_stage2": 0.5414573550224304, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3367 + }, + { + "completion_length": 7.0625, + "epoch": 0.5901524443665673, + "grad_norm": 19.94454738981174, + "kl": 0.08837890625, + "learning_rate": 4.1002277904328014e-07, + "loss": 0.0354, + "reward": 1.7906076908111572, + "reward_std": 0.20431801676750183, + "rewards/accuracy_reward_stage2": 0.7906076908111572, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3368 + }, + { + "completion_length": 13.78125, + "epoch": 0.5903276677764149, + "grad_norm": 14.935642544111568, + "kl": 0.2294921875, + "learning_rate": 4.098475556334326e-07, + "loss": 0.0476, + "reward": 1.0843511819839478, + "reward_std": 0.10877098143100739, + "rewards/accuracy_reward_stage2": 0.34997621178627014, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3369 + }, + { + "completion_length": 8.484375, + "epoch": 0.5905028911862625, + "grad_norm": 16.50548252361066, + "kl": 0.142578125, + "learning_rate": 4.0967233222358507e-07, + "loss": 0.0571, + "reward": 1.4899513721466064, + "reward_std": 0.09331192076206207, + "rewards/accuracy_reward_stage2": 0.6149513721466064, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3370 + }, + { + "completion_length": 8.4375, + "epoch": 0.5906781145961101, + "grad_norm": 23.60007625752545, + "kl": 0.310546875, + "learning_rate": 4.094971088137375e-07, + "loss": 0.0226, + "reward": 1.373981237411499, + "reward_std": 0.37158650159835815, + "rewards/accuracy_reward_stage2": 0.545856237411499, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3371 + }, + { + "completion_length": 7.046875, + "epoch": 0.5908533380059576, + "grad_norm": 17.13091303333977, + "kl": 0.16015625, + "learning_rate": 4.0932188540388995e-07, + "loss": 0.0052, + "reward": 1.7232661247253418, + "reward_std": 0.2743530869483948, + "rewards/accuracy_reward_stage2": 0.7545161843299866, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3372 + }, + { + "completion_length": 7.6875, + "epoch": 0.5910285614158052, + "grad_norm": 12.522108130668892, + "kl": 0.126953125, + "learning_rate": 4.091466619940424e-07, + "loss": 0.011, + "reward": 1.3854740858078003, + "reward_std": 0.2172052413225174, + "rewards/accuracy_reward_stage2": 0.5260991454124451, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3373 + }, + { + "completion_length": 15.59375, + "epoch": 0.5912037848256527, + "grad_norm": 23.58891641696791, + "kl": 0.09375, + "learning_rate": 4.0897143858419483e-07, + "loss": 0.0375, + "reward": 1.3126802444458008, + "reward_std": 0.27276840806007385, + "rewards/accuracy_reward_stage2": 0.43768009543418884, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3374 + }, + { + "completion_length": 13.34375, + "epoch": 0.5913790082355003, + "grad_norm": 16.012524695728487, + "kl": 0.1005859375, + "learning_rate": 4.0879621517434727e-07, + "loss": 0.0026, + "reward": 1.7399613857269287, + "reward_std": 0.14063388109207153, + "rewards/accuracy_reward_stage2": 0.7712113261222839, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3375 + }, + { + "completion_length": 8.53125, + "epoch": 0.5915542316453478, + "grad_norm": 19.110171898355755, + "kl": 0.06201171875, + "learning_rate": 4.086209917644997e-07, + "loss": 0.0249, + "reward": 1.3935902118682861, + "reward_std": 0.20389771461486816, + "rewards/accuracy_reward_stage2": 0.39359015226364136, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3376 + }, + { + "completion_length": 9.015625, + "epoch": 0.5917294550551954, + "grad_norm": 18.917191334580906, + "kl": 0.3125, + "learning_rate": 4.0844576835465215e-07, + "loss": 0.0808, + "reward": 1.6681314706802368, + "reward_std": 0.2374303936958313, + "rewards/accuracy_reward_stage2": 0.6837565898895264, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3377 + }, + { + "completion_length": 37.875, + "epoch": 0.5919046784650429, + "grad_norm": 20.295157453686457, + "kl": 0.140625, + "learning_rate": 4.0827054494480464e-07, + "loss": -0.0315, + "reward": 1.2237355709075928, + "reward_std": 0.28009819984436035, + "rewards/accuracy_reward_stage2": 0.3799855411052704, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3378 + }, + { + "completion_length": 21.140625, + "epoch": 0.5920799018748905, + "grad_norm": 18.80728202640302, + "kl": 0.08837890625, + "learning_rate": 4.080953215349571e-07, + "loss": -0.0089, + "reward": 1.4754610061645508, + "reward_std": 0.16696128249168396, + "rewards/accuracy_reward_stage2": 0.4910860061645508, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3379 + }, + { + "completion_length": 9.828125, + "epoch": 0.592255125284738, + "grad_norm": 16.59325732582251, + "kl": 0.2470703125, + "learning_rate": 4.0792009812510947e-07, + "loss": 0.0359, + "reward": 1.3921682834625244, + "reward_std": 0.28522390127182007, + "rewards/accuracy_reward_stage2": 0.548418402671814, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3380 + }, + { + "completion_length": 6.8125, + "epoch": 0.5924303486945856, + "grad_norm": 13.811814446211153, + "kl": 0.2158203125, + "learning_rate": 4.077448747152619e-07, + "loss": 0.0424, + "reward": 1.3314099311828613, + "reward_std": 0.16977867484092712, + "rewards/accuracy_reward_stage2": 0.47203493118286133, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3381 + }, + { + "completion_length": 8.203125, + "epoch": 0.5926055721044331, + "grad_norm": 19.216188862686714, + "kl": 0.1435546875, + "learning_rate": 4.075696513054144e-07, + "loss": 0.0161, + "reward": 1.4336891174316406, + "reward_std": 0.3022252917289734, + "rewards/accuracy_reward_stage2": 0.5743141174316406, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3382 + }, + { + "completion_length": 9.890625, + "epoch": 0.5927807955142808, + "grad_norm": 17.67517194094775, + "kl": 0.1416015625, + "learning_rate": 4.0739442789556684e-07, + "loss": 0.0568, + "reward": 1.6875131130218506, + "reward_std": 0.202718585729599, + "rewards/accuracy_reward_stage2": 0.6875130534172058, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3383 + }, + { + "completion_length": 11.28125, + "epoch": 0.5929560189241283, + "grad_norm": 19.006467821007, + "kl": 0.2431640625, + "learning_rate": 4.072192044857193e-07, + "loss": 0.0185, + "reward": 1.2602907419204712, + "reward_std": 0.26582372188568115, + "rewards/accuracy_reward_stage2": 0.541540801525116, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3384 + }, + { + "completion_length": 19.546875, + "epoch": 0.5931312423339758, + "grad_norm": 17.659396295554313, + "kl": 0.16015625, + "learning_rate": 4.070439810758717e-07, + "loss": -0.0393, + "reward": 1.3948569297790527, + "reward_std": 0.3265957236289978, + "rewards/accuracy_reward_stage2": 0.44173192977905273, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3385 + }, + { + "completion_length": 11.25, + "epoch": 0.5933064657438234, + "grad_norm": 16.915472791830478, + "kl": 0.050048828125, + "learning_rate": 4.068687576660242e-07, + "loss": 0.02, + "reward": 1.7250510454177856, + "reward_std": 0.19554750621318817, + "rewards/accuracy_reward_stage2": 0.7250510454177856, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3386 + }, + { + "completion_length": 12.046875, + "epoch": 0.5934816891536709, + "grad_norm": 24.37913620423697, + "kl": 0.267578125, + "learning_rate": 4.066935342561766e-07, + "loss": -0.0634, + "reward": 1.6817896366119385, + "reward_std": 0.34547320008277893, + "rewards/accuracy_reward_stage2": 0.744289755821228, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3387 + }, + { + "completion_length": 7.4375, + "epoch": 0.5936569125635185, + "grad_norm": 14.314208272923713, + "kl": 0.10546875, + "learning_rate": 4.0651831084632904e-07, + "loss": -0.0021, + "reward": 1.3413195610046387, + "reward_std": 0.2445610910654068, + "rewards/accuracy_reward_stage2": 0.4819444417953491, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3388 + }, + { + "completion_length": 8.9375, + "epoch": 0.593832135973366, + "grad_norm": 12.188262249976322, + "kl": 0.06787109375, + "learning_rate": 4.063430874364815e-07, + "loss": -0.017, + "reward": 1.6598129272460938, + "reward_std": 0.13758717477321625, + "rewards/accuracy_reward_stage2": 0.6754379868507385, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3389 + }, + { + "completion_length": 9.09375, + "epoch": 0.5940073593832136, + "grad_norm": 20.565537812672964, + "kl": 0.189453125, + "learning_rate": 4.0616786402663397e-07, + "loss": 0.0332, + "reward": 1.424170732498169, + "reward_std": 0.27712106704711914, + "rewards/accuracy_reward_stage2": 0.4397958219051361, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3390 + }, + { + "completion_length": 10.5625, + "epoch": 0.5941825827930611, + "grad_norm": 18.537166743695696, + "kl": 0.2080078125, + "learning_rate": 4.059926406167864e-07, + "loss": -0.0532, + "reward": 1.3873106241226196, + "reward_std": 0.18024834990501404, + "rewards/accuracy_reward_stage2": 0.44981059432029724, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3391 + }, + { + "completion_length": 13.046875, + "epoch": 0.5943578062029087, + "grad_norm": 17.6139555607113, + "kl": 0.05029296875, + "learning_rate": 4.0581741720693885e-07, + "loss": 0.0201, + "reward": 1.756882905960083, + "reward_std": 0.1282881498336792, + "rewards/accuracy_reward_stage2": 0.7568830251693726, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3392 + }, + { + "completion_length": 9.609375, + "epoch": 0.5945330296127562, + "grad_norm": 22.452716228857593, + "kl": 0.1572265625, + "learning_rate": 4.0564219379709124e-07, + "loss": -0.0039, + "reward": 1.4564459323883057, + "reward_std": 0.3075176477432251, + "rewards/accuracy_reward_stage2": 0.4876958727836609, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3393 + }, + { + "completion_length": 8.78125, + "epoch": 0.5947082530226038, + "grad_norm": 28.37169774199766, + "kl": 0.2177734375, + "learning_rate": 4.054669703872437e-07, + "loss": 0.0546, + "reward": 1.4617120027542114, + "reward_std": 0.26479586958885193, + "rewards/accuracy_reward_stage2": 0.6023369431495667, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3394 + }, + { + "completion_length": 6.921875, + "epoch": 0.5948834764324514, + "grad_norm": 13.656636345638992, + "kl": 0.1591796875, + "learning_rate": 4.0529174697739617e-07, + "loss": 0.0197, + "reward": 1.4355125427246094, + "reward_std": 0.15464989840984344, + "rewards/accuracy_reward_stage2": 0.45113757252693176, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3395 + }, + { + "completion_length": 10.921875, + "epoch": 0.595058699842299, + "grad_norm": 21.538672452109356, + "kl": 0.0673828125, + "learning_rate": 4.051165235675486e-07, + "loss": 0.0271, + "reward": 1.7184603214263916, + "reward_std": 0.15963563323020935, + "rewards/accuracy_reward_stage2": 0.7184603214263916, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3396 + }, + { + "completion_length": 8.359375, + "epoch": 0.5952339232521465, + "grad_norm": 12.307626427997151, + "kl": 0.111328125, + "learning_rate": 4.0494130015770105e-07, + "loss": 0.0103, + "reward": 1.8679943084716797, + "reward_std": 0.11993659287691116, + "rewards/accuracy_reward_stage2": 0.8836191892623901, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3397 + }, + { + "completion_length": 7.5625, + "epoch": 0.5954091466619941, + "grad_norm": 13.598249244566306, + "kl": 0.10546875, + "learning_rate": 4.0476607674785354e-07, + "loss": 0.0207, + "reward": 1.5885417461395264, + "reward_std": 0.1236192062497139, + "rewards/accuracy_reward_stage2": 0.6041666865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3398 + }, + { + "completion_length": 11.296875, + "epoch": 0.5955843700718416, + "grad_norm": 19.143868313764987, + "kl": 0.1611328125, + "learning_rate": 4.04590853338006e-07, + "loss": 0.0646, + "reward": 1.5587762594223022, + "reward_std": 0.16119365394115448, + "rewards/accuracy_reward_stage2": 0.5587762594223022, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3399 + }, + { + "completion_length": 11.578125, + "epoch": 0.5957595934816892, + "grad_norm": 19.578673146528867, + "kl": 0.59765625, + "learning_rate": 4.0441562992815837e-07, + "loss": 0.0771, + "reward": 1.5015919208526611, + "reward_std": 0.26523348689079285, + "rewards/accuracy_reward_stage2": 0.6890919804573059, + "rewards/format_reward_stage1_pointerpad": 0.8125, + "scores/accuracy_reward_stage2": 0.8125, + "step": 3400 + }, + { + "completion_length": 8.625, + "epoch": 0.5959348168915367, + "grad_norm": 19.872808171518628, + "kl": 0.2314453125, + "learning_rate": 4.042404065183108e-07, + "loss": 0.0123, + "reward": 1.413339614868164, + "reward_std": 0.29223260283470154, + "rewards/accuracy_reward_stage2": 0.5695896148681641, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3401 + }, + { + "completion_length": 10.859375, + "epoch": 0.5961100403013843, + "grad_norm": 20.536292079434425, + "kl": 0.1328125, + "learning_rate": 4.0406518310846324e-07, + "loss": 0.0242, + "reward": 1.3782663345336914, + "reward_std": 0.283265620470047, + "rewards/accuracy_reward_stage2": 0.5188913345336914, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3402 + }, + { + "completion_length": 11.59375, + "epoch": 0.5962852637112318, + "grad_norm": 14.543339866793787, + "kl": 0.138671875, + "learning_rate": 4.0388995969861574e-07, + "loss": 0.0138, + "reward": 1.5714879035949707, + "reward_std": 0.12914830446243286, + "rewards/accuracy_reward_stage2": 0.7121127843856812, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3403 + }, + { + "completion_length": 13.34375, + "epoch": 0.5964604871210794, + "grad_norm": 17.874908256904774, + "kl": 0.06787109375, + "learning_rate": 4.037147362887682e-07, + "loss": 0.0271, + "reward": 1.4587076902389526, + "reward_std": 0.17502731084823608, + "rewards/accuracy_reward_stage2": 0.45870766043663025, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3404 + }, + { + "completion_length": 7.3125, + "epoch": 0.5966357105309269, + "grad_norm": 22.047570139658138, + "kl": 0.0673828125, + "learning_rate": 4.035395128789206e-07, + "loss": -0.0172, + "reward": 1.726590871810913, + "reward_std": 0.14708679914474487, + "rewards/accuracy_reward_stage2": 0.7422158122062683, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3405 + }, + { + "completion_length": 10.234375, + "epoch": 0.5968109339407744, + "grad_norm": 17.266053412726244, + "kl": 0.08203125, + "learning_rate": 4.03364289469073e-07, + "loss": -0.0113, + "reward": 1.6297082901000977, + "reward_std": 0.13242757320404053, + "rewards/accuracy_reward_stage2": 0.7703334093093872, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3406 + }, + { + "completion_length": 9.59375, + "epoch": 0.596986157350622, + "grad_norm": 16.885928043424666, + "kl": 0.078125, + "learning_rate": 4.031890660592255e-07, + "loss": -0.013, + "reward": 1.5568372011184692, + "reward_std": 0.15844221413135529, + "rewards/accuracy_reward_stage2": 0.572462260723114, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3407 + }, + { + "completion_length": 12.84375, + "epoch": 0.5971613807604697, + "grad_norm": 20.659760081222462, + "kl": 0.16015625, + "learning_rate": 4.0301384264937794e-07, + "loss": 0.032, + "reward": 1.6425063610076904, + "reward_std": 0.22482311725616455, + "rewards/accuracy_reward_stage2": 0.6581313610076904, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3408 + }, + { + "completion_length": 12.28125, + "epoch": 0.5973366041703172, + "grad_norm": 10.630078981944184, + "kl": 0.076171875, + "learning_rate": 4.028386192395304e-07, + "loss": -0.0079, + "reward": 1.7965457439422607, + "reward_std": 0.07948299497365952, + "rewards/accuracy_reward_stage2": 0.9371707439422607, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3409 + }, + { + "completion_length": 10.546875, + "epoch": 0.5975118275801647, + "grad_norm": 26.564388093945244, + "kl": 0.1611328125, + "learning_rate": 4.026633958296828e-07, + "loss": 0.0267, + "reward": 1.467179775238037, + "reward_std": 0.24725881218910217, + "rewards/accuracy_reward_stage2": 0.48280471563339233, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3410 + }, + { + "completion_length": 9.015625, + "epoch": 0.5976870509900123, + "grad_norm": 18.764011624529996, + "kl": 0.1337890625, + "learning_rate": 4.024881724198353e-07, + "loss": -0.0347, + "reward": 1.767315149307251, + "reward_std": 0.1556333750486374, + "rewards/accuracy_reward_stage2": 0.7985650897026062, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3411 + }, + { + "completion_length": 8.5625, + "epoch": 0.5978622743998598, + "grad_norm": 12.278633061947433, + "kl": 0.007537841796875, + "learning_rate": 4.023129490099877e-07, + "loss": 0.003, + "reward": 1.734375, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.734375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3412 + }, + { + "completion_length": 10.078125, + "epoch": 0.5980374978097074, + "grad_norm": 17.42639047551563, + "kl": 0.061767578125, + "learning_rate": 4.0213772560014013e-07, + "loss": 0.0247, + "reward": 1.8013005256652832, + "reward_std": 0.18322604894638062, + "rewards/accuracy_reward_stage2": 0.8013004660606384, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3413 + }, + { + "completion_length": 10.28125, + "epoch": 0.5982127212195549, + "grad_norm": 20.745494626089844, + "kl": 0.1240234375, + "learning_rate": 4.0196250219029257e-07, + "loss": 0.0079, + "reward": 1.5774965286254883, + "reward_std": 0.30587121844291687, + "rewards/accuracy_reward_stage2": 0.5931214094161987, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3414 + }, + { + "completion_length": 11.828125, + "epoch": 0.5983879446294025, + "grad_norm": 22.787095634560245, + "kl": 0.1875, + "learning_rate": 4.0178727878044507e-07, + "loss": 0.0394, + "reward": 1.4983285665512085, + "reward_std": 0.19265775382518768, + "rewards/accuracy_reward_stage2": 0.5139535665512085, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3415 + }, + { + "completion_length": 8.390625, + "epoch": 0.59856316803925, + "grad_norm": 20.242710381195522, + "kl": 0.1318359375, + "learning_rate": 4.016120553705975e-07, + "loss": 0.0362, + "reward": 1.6618952751159668, + "reward_std": 0.21545787155628204, + "rewards/accuracy_reward_stage2": 0.6775202751159668, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3416 + }, + { + "completion_length": 13.03125, + "epoch": 0.5987383914490976, + "grad_norm": 15.263042608528119, + "kl": 0.09228515625, + "learning_rate": 4.0143683196074995e-07, + "loss": -0.0072, + "reward": 1.6668956279754639, + "reward_std": 0.192345529794693, + "rewards/accuracy_reward_stage2": 0.6825206279754639, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3417 + }, + { + "completion_length": 12.59375, + "epoch": 0.5989136148589451, + "grad_norm": 19.471023785017675, + "kl": 0.08154296875, + "learning_rate": 4.012616085509024e-07, + "loss": -0.0115, + "reward": 1.7478539943695068, + "reward_std": 0.2811315953731537, + "rewards/accuracy_reward_stage2": 0.7634790539741516, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3418 + }, + { + "completion_length": 7.359375, + "epoch": 0.5990888382687927, + "grad_norm": 19.587244365848665, + "kl": 0.146484375, + "learning_rate": 4.010863851410548e-07, + "loss": 0.0587, + "reward": 1.6780874729156494, + "reward_std": 0.19293718039989471, + "rewards/accuracy_reward_stage2": 0.6780875325202942, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3419 + }, + { + "completion_length": 20.625, + "epoch": 0.5992640616786402, + "grad_norm": 20.90755133142386, + "kl": 0.1015625, + "learning_rate": 4.0091116173120726e-07, + "loss": 0.0405, + "reward": 1.59331214427948, + "reward_std": 0.1857033371925354, + "rewards/accuracy_reward_stage2": 0.59331214427948, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3420 + }, + { + "completion_length": 9.875, + "epoch": 0.5994392850884879, + "grad_norm": 13.415275248514398, + "kl": 0.027099609375, + "learning_rate": 4.007359383213597e-07, + "loss": 0.0108, + "reward": 1.7339122295379639, + "reward_std": 0.15513893961906433, + "rewards/accuracy_reward_stage2": 0.7339121699333191, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3421 + }, + { + "completion_length": 10.09375, + "epoch": 0.5996145084983354, + "grad_norm": 17.46796887441957, + "kl": 0.1279296875, + "learning_rate": 4.0056071491151214e-07, + "loss": 0.0121, + "reward": 1.5939295291900635, + "reward_std": 0.18742156028747559, + "rewards/accuracy_reward_stage2": 0.6251795291900635, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3422 + }, + { + "completion_length": 21.328125, + "epoch": 0.599789731908183, + "grad_norm": 15.744692972576852, + "kl": 0.11669921875, + "learning_rate": 4.0038549150166464e-07, + "loss": 0.0467, + "reward": 1.5629197359085083, + "reward_std": 0.21470427513122559, + "rewards/accuracy_reward_stage2": 0.5629197955131531, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3423 + }, + { + "completion_length": 13.1875, + "epoch": 0.5999649553180305, + "grad_norm": 16.44038857892878, + "kl": 0.345703125, + "learning_rate": 4.002102680918171e-07, + "loss": 0.0582, + "reward": 1.5216660499572754, + "reward_std": 0.2286083847284317, + "rewards/accuracy_reward_stage2": 0.8029160499572754, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3424 + }, + { + "completion_length": 9.5, + "epoch": 0.6001401787278781, + "grad_norm": 10.97712282230342, + "kl": 0.0283203125, + "learning_rate": 4.0003504468196946e-07, + "loss": 0.0113, + "reward": 1.6370192766189575, + "reward_std": 0.08915039896965027, + "rewards/accuracy_reward_stage2": 0.6370192170143127, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3425 + }, + { + "completion_length": 9.546875, + "epoch": 0.6003154021377256, + "grad_norm": 17.1651898145542, + "kl": 0.1328125, + "learning_rate": 3.998598212721219e-07, + "loss": 0.0121, + "reward": 1.5052459239959717, + "reward_std": 0.2685433626174927, + "rewards/accuracy_reward_stage2": 0.6458709239959717, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3426 + }, + { + "completion_length": 11.71875, + "epoch": 0.6004906255475732, + "grad_norm": 18.557896383066335, + "kl": 0.169921875, + "learning_rate": 3.996845978622744e-07, + "loss": 0.0263, + "reward": 1.2808881998062134, + "reward_std": 0.19097641110420227, + "rewards/accuracy_reward_stage2": 0.6558881998062134, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 3427 + }, + { + "completion_length": 9.453125, + "epoch": 0.6006658489574207, + "grad_norm": 25.268672223120202, + "kl": 0.154296875, + "learning_rate": 3.9950937445242683e-07, + "loss": 0.0175, + "reward": 1.5014958381652832, + "reward_std": 0.2355181872844696, + "rewards/accuracy_reward_stage2": 0.5171208381652832, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3428 + }, + { + "completion_length": 17.28125, + "epoch": 0.6008410723672682, + "grad_norm": 27.698931415889906, + "kl": 0.34765625, + "learning_rate": 3.993341510425793e-07, + "loss": 0.0096, + "reward": 1.4189836978912354, + "reward_std": 0.3754510283470154, + "rewards/accuracy_reward_stage2": 0.48148372769355774, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3429 + }, + { + "completion_length": 13.078125, + "epoch": 0.6010162957771158, + "grad_norm": 14.964534700377712, + "kl": 0.025634765625, + "learning_rate": 3.991589276327317e-07, + "loss": 0.0102, + "reward": 1.8493903875350952, + "reward_std": 0.08784636110067368, + "rewards/accuracy_reward_stage2": 0.8493903875350952, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3430 + }, + { + "completion_length": 8.671875, + "epoch": 0.6011915191869633, + "grad_norm": 18.686887863480006, + "kl": 0.1904296875, + "learning_rate": 3.9898370422288415e-07, + "loss": -0.0013, + "reward": 1.4473824501037598, + "reward_std": 0.30469417572021484, + "rewards/accuracy_reward_stage2": 0.7286325097084045, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3431 + }, + { + "completion_length": 6.84375, + "epoch": 0.6013667425968109, + "grad_norm": 23.05137679558983, + "kl": 0.08642578125, + "learning_rate": 3.988084808130366e-07, + "loss": 0.0345, + "reward": 1.5047707557678223, + "reward_std": 0.23167124390602112, + "rewards/accuracy_reward_stage2": 0.5047707557678223, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3432 + }, + { + "completion_length": 24.25, + "epoch": 0.6015419660066584, + "grad_norm": 23.424671897950013, + "kl": 0.140625, + "learning_rate": 3.9863325740318903e-07, + "loss": -0.0319, + "reward": 1.5699257850646973, + "reward_std": 0.35112351179122925, + "rewards/accuracy_reward_stage2": 0.601175844669342, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3433 + }, + { + "completion_length": 8.9375, + "epoch": 0.6017171894165061, + "grad_norm": 20.79507527827949, + "kl": 0.06689453125, + "learning_rate": 3.9845803399334147e-07, + "loss": -0.0022, + "reward": 1.6292564868927002, + "reward_std": 0.1802724003791809, + "rewards/accuracy_reward_stage2": 0.6448814868927002, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3434 + }, + { + "completion_length": 7.578125, + "epoch": 0.6018924128263536, + "grad_norm": 15.589011966550148, + "kl": 0.0927734375, + "learning_rate": 3.9828281058349396e-07, + "loss": -0.0071, + "reward": 1.65333092212677, + "reward_std": 0.16445782780647278, + "rewards/accuracy_reward_stage2": 0.66895592212677, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3435 + }, + { + "completion_length": 8.953125, + "epoch": 0.6020676362362012, + "grad_norm": 34.07184555789485, + "kl": 0.11767578125, + "learning_rate": 3.981075871736464e-07, + "loss": 0.003, + "reward": 1.6030738353729248, + "reward_std": 0.29539844393730164, + "rewards/accuracy_reward_stage2": 0.6186988353729248, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3436 + }, + { + "completion_length": 7.953125, + "epoch": 0.6022428596460487, + "grad_norm": 16.218031005827555, + "kl": 0.16015625, + "learning_rate": 3.9793236376379884e-07, + "loss": 0.0638, + "reward": 0.9500302076339722, + "reward_std": 0.1618354320526123, + "rewards/accuracy_reward_stage2": 0.45003020763397217, + "rewards/format_reward_stage1_pointerpad": 0.5, + "scores/accuracy_reward_stage2": 0.5, + "step": 3437 + }, + { + "completion_length": 8.453125, + "epoch": 0.6024180830558963, + "grad_norm": 16.409700138315014, + "kl": 0.08984375, + "learning_rate": 3.9775714035395123e-07, + "loss": -0.0084, + "reward": 1.54030442237854, + "reward_std": 0.1720879077911377, + "rewards/accuracy_reward_stage2": 0.5559294819831848, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3438 + }, + { + "completion_length": 8.234375, + "epoch": 0.6025933064657438, + "grad_norm": 32.155513926011665, + "kl": 0.380859375, + "learning_rate": 3.975819169441037e-07, + "loss": 0.0541, + "reward": 1.3794606924057007, + "reward_std": 0.2892468273639679, + "rewards/accuracy_reward_stage2": 0.6763357520103455, + "rewards/format_reward_stage1_pointerpad": 0.703125, + "scores/accuracy_reward_stage2": 0.703125, + "step": 3439 + }, + { + "completion_length": 9.75, + "epoch": 0.6027685298755914, + "grad_norm": 50.064485559731196, + "kl": 0.314453125, + "learning_rate": 3.9740669353425616e-07, + "loss": 0.1261, + "reward": 1.6192705631256104, + "reward_std": 0.15657562017440796, + "rewards/accuracy_reward_stage2": 0.7442706823348999, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3440 + }, + { + "completion_length": 14.25, + "epoch": 0.6029437532854389, + "grad_norm": 22.465520921403584, + "kl": 0.294921875, + "learning_rate": 3.972314701244086e-07, + "loss": 0.0255, + "reward": 1.5082569122314453, + "reward_std": 0.3638462722301483, + "rewards/accuracy_reward_stage2": 0.5551318526268005, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3441 + }, + { + "completion_length": 9.46875, + "epoch": 0.6031189766952865, + "grad_norm": 15.780525676682956, + "kl": 0.11572265625, + "learning_rate": 3.9705624671456104e-07, + "loss": 0.0021, + "reward": 1.501037359237671, + "reward_std": 0.10796771943569183, + "rewards/accuracy_reward_stage2": 0.5166622996330261, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3442 + }, + { + "completion_length": 29.609375, + "epoch": 0.603294200105134, + "grad_norm": 23.820338114426182, + "kl": 0.318359375, + "learning_rate": 3.9688102330471353e-07, + "loss": 0.0451, + "reward": 1.6326594352722168, + "reward_std": 0.2520653307437897, + "rewards/accuracy_reward_stage2": 0.6795344948768616, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3443 + }, + { + "completion_length": 9.53125, + "epoch": 0.6034694235149816, + "grad_norm": 16.184072771911413, + "kl": 0.08349609375, + "learning_rate": 3.967057998948659e-07, + "loss": -0.0069, + "reward": 1.723325252532959, + "reward_std": 0.14421439170837402, + "rewards/accuracy_reward_stage2": 0.7389503121376038, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3444 + }, + { + "completion_length": 9.1875, + "epoch": 0.6036446469248291, + "grad_norm": 19.017645766699943, + "kl": 0.06591796875, + "learning_rate": 3.9653057648501836e-07, + "loss": 0.0264, + "reward": 1.687445044517517, + "reward_std": 0.20073054730892181, + "rewards/accuracy_reward_stage2": 0.6874449253082275, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3445 + }, + { + "completion_length": 11.46875, + "epoch": 0.6038198703346768, + "grad_norm": 16.926725366578847, + "kl": 0.06591796875, + "learning_rate": 3.963553530751708e-07, + "loss": 0.0058, + "reward": 1.7377595901489258, + "reward_std": 0.13752949237823486, + "rewards/accuracy_reward_stage2": 0.7533845901489258, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3446 + }, + { + "completion_length": 11.3125, + "epoch": 0.6039950937445243, + "grad_norm": 13.770783596052299, + "kl": 0.10107421875, + "learning_rate": 3.961801296653233e-07, + "loss": -0.0004, + "reward": 1.466865062713623, + "reward_std": 0.2192572057247162, + "rewards/accuracy_reward_stage2": 0.48249009251594543, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3447 + }, + { + "completion_length": 8.40625, + "epoch": 0.6041703171543719, + "grad_norm": 17.04847051503246, + "kl": 0.1455078125, + "learning_rate": 3.9600490625547573e-07, + "loss": -0.0238, + "reward": 1.7534458637237549, + "reward_std": 0.2810730040073395, + "rewards/accuracy_reward_stage2": 0.7846959233283997, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3448 + }, + { + "completion_length": 10.078125, + "epoch": 0.6043455405642194, + "grad_norm": 17.546032869691548, + "kl": 0.10791015625, + "learning_rate": 3.9582968284562817e-07, + "loss": 0.0431, + "reward": 1.533093810081482, + "reward_std": 0.11157439649105072, + "rewards/accuracy_reward_stage2": 0.6580938100814819, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3449 + }, + { + "completion_length": 11.40625, + "epoch": 0.604520763974067, + "grad_norm": 19.356403016722467, + "kl": 0.17578125, + "learning_rate": 3.956544594357806e-07, + "loss": 0.0486, + "reward": 1.4633104801177979, + "reward_std": 0.19928845763206482, + "rewards/accuracy_reward_stage2": 0.47893548011779785, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3450 + }, + { + "completion_length": 12.8125, + "epoch": 0.6046959873839145, + "grad_norm": 16.01905817760592, + "kl": 0.134765625, + "learning_rate": 3.9547923602593305e-07, + "loss": 0.0098, + "reward": 1.3782914876937866, + "reward_std": 0.20190542936325073, + "rewards/accuracy_reward_stage2": 0.5189164876937866, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3451 + }, + { + "completion_length": 10.0, + "epoch": 0.604871210793762, + "grad_norm": 24.06284766509744, + "kl": 0.1533203125, + "learning_rate": 3.953040126160855e-07, + "loss": -0.0162, + "reward": 1.6933026313781738, + "reward_std": 0.24311666190624237, + "rewards/accuracy_reward_stage2": 0.7245526909828186, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3452 + }, + { + "completion_length": 13.671875, + "epoch": 0.6050464342036096, + "grad_norm": 16.836031995387494, + "kl": 0.10205078125, + "learning_rate": 3.9512878920623793e-07, + "loss": 0.0194, + "reward": 1.3007614612579346, + "reward_std": 0.2282509207725525, + "rewards/accuracy_reward_stage2": 0.5507614016532898, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3453 + }, + { + "completion_length": 10.9375, + "epoch": 0.6052216576134571, + "grad_norm": 28.102604670492564, + "kl": 0.068359375, + "learning_rate": 3.9495356579639037e-07, + "loss": 0.0273, + "reward": 1.6019675731658936, + "reward_std": 0.1741998940706253, + "rewards/accuracy_reward_stage2": 0.6019675731658936, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3454 + }, + { + "completion_length": 8.65625, + "epoch": 0.6053968810233047, + "grad_norm": 18.0939366145864, + "kl": 0.0732421875, + "learning_rate": 3.9477834238654286e-07, + "loss": 0.0293, + "reward": 1.3374698162078857, + "reward_std": 0.23530396819114685, + "rewards/accuracy_reward_stage2": 0.46246981620788574, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3455 + }, + { + "completion_length": 8.03125, + "epoch": 0.6055721044331522, + "grad_norm": 14.97976553552256, + "kl": 0.035400390625, + "learning_rate": 3.946031189766953e-07, + "loss": 0.0141, + "reward": 1.6035091876983643, + "reward_std": 0.13654939830303192, + "rewards/accuracy_reward_stage2": 0.6035091280937195, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3456 + }, + { + "completion_length": 20.328125, + "epoch": 0.6057473278429998, + "grad_norm": 19.07249570104564, + "kl": 0.24609375, + "learning_rate": 3.944278955668477e-07, + "loss": 0.0104, + "reward": 1.4960546493530273, + "reward_std": 0.2673536539077759, + "rewards/accuracy_reward_stage2": 0.6523047089576721, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3457 + }, + { + "completion_length": 9.4375, + "epoch": 0.6059225512528473, + "grad_norm": 12.936590524802542, + "kl": 0.06884765625, + "learning_rate": 3.9425267215700013e-07, + "loss": -0.0119, + "reward": 1.7319378852844238, + "reward_std": 0.13649073243141174, + "rewards/accuracy_reward_stage2": 0.7475628852844238, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3458 + }, + { + "completion_length": 11.71875, + "epoch": 0.606097774662695, + "grad_norm": 19.371274993588596, + "kl": 0.09765625, + "learning_rate": 3.940774487471526e-07, + "loss": 0.0391, + "reward": 1.417302131652832, + "reward_std": 0.23798641562461853, + "rewards/accuracy_reward_stage2": 0.542302131652832, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3459 + }, + { + "completion_length": 13.5, + "epoch": 0.6062729980725425, + "grad_norm": 19.240788759913016, + "kl": 0.2216796875, + "learning_rate": 3.9390222533730506e-07, + "loss": 0.0446, + "reward": 1.4564974308013916, + "reward_std": 0.19151608645915985, + "rewards/accuracy_reward_stage2": 0.5971223711967468, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3460 + }, + { + "completion_length": 11.9375, + "epoch": 0.6064482214823901, + "grad_norm": 24.204213129637967, + "kl": 0.232421875, + "learning_rate": 3.937270019274575e-07, + "loss": 0.022, + "reward": 1.5358624458312988, + "reward_std": 0.320328950881958, + "rewards/accuracy_reward_stage2": 0.6921124458312988, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3461 + }, + { + "completion_length": 9.171875, + "epoch": 0.6066234448922376, + "grad_norm": 20.175706890970673, + "kl": 0.07080078125, + "learning_rate": 3.9355177851760994e-07, + "loss": 0.0282, + "reward": 1.6413211822509766, + "reward_std": 0.1602167785167694, + "rewards/accuracy_reward_stage2": 0.641321063041687, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3462 + }, + { + "completion_length": 11.71875, + "epoch": 0.6067986683020852, + "grad_norm": 20.092099705100154, + "kl": 0.1689453125, + "learning_rate": 3.9337655510776233e-07, + "loss": 0.0678, + "reward": 1.5993422269821167, + "reward_std": 0.22892138361930847, + "rewards/accuracy_reward_stage2": 0.5993422269821167, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3463 + }, + { + "completion_length": 8.609375, + "epoch": 0.6069738917119327, + "grad_norm": 18.65820843756049, + "kl": 0.265625, + "learning_rate": 3.932013316979148e-07, + "loss": -0.0445, + "reward": 1.675358772277832, + "reward_std": 0.3607245981693268, + "rewards/accuracy_reward_stage2": 0.737858772277832, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3464 + }, + { + "completion_length": 9.84375, + "epoch": 0.6071491151217803, + "grad_norm": 21.585310238332063, + "kl": 0.02685546875, + "learning_rate": 3.9302610828806726e-07, + "loss": 0.0107, + "reward": 1.3237862586975098, + "reward_std": 0.17188136279582977, + "rewards/accuracy_reward_stage2": 0.32378625869750977, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3465 + }, + { + "completion_length": 9.734375, + "epoch": 0.6073243385316278, + "grad_norm": 17.21550996098564, + "kl": 0.4453125, + "learning_rate": 3.928508848782197e-07, + "loss": 0.0607, + "reward": 1.5191543102264404, + "reward_std": 0.3503054976463318, + "rewards/accuracy_reward_stage2": 0.6910292506217957, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3466 + }, + { + "completion_length": 7.25, + "epoch": 0.6074995619414754, + "grad_norm": 22.309264652599936, + "kl": 0.1884765625, + "learning_rate": 3.926756614683722e-07, + "loss": 0.0033, + "reward": 1.319457769393921, + "reward_std": 0.2027011513710022, + "rewards/accuracy_reward_stage2": 0.6007077693939209, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3467 + }, + { + "completion_length": 9.171875, + "epoch": 0.6076747853513229, + "grad_norm": 27.423521988934635, + "kl": 0.1669921875, + "learning_rate": 3.9250043805852463e-07, + "loss": 0.0668, + "reward": 1.6609394550323486, + "reward_std": 0.23472878336906433, + "rewards/accuracy_reward_stage2": 0.7859394550323486, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3468 + }, + { + "completion_length": 10.6875, + "epoch": 0.6078500087611705, + "grad_norm": 13.57082931790605, + "kl": 0.2490234375, + "learning_rate": 3.9232521464867707e-07, + "loss": 0.0493, + "reward": 1.3972986936569214, + "reward_std": 0.22089111804962158, + "rewards/accuracy_reward_stage2": 0.4285487234592438, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3469 + }, + { + "completion_length": 6.78125, + "epoch": 0.608025232171018, + "grad_norm": 15.588786781173503, + "kl": 0.07177734375, + "learning_rate": 3.9214999123882946e-07, + "loss": 0.0287, + "reward": 1.6718885898590088, + "reward_std": 0.18329882621765137, + "rewards/accuracy_reward_stage2": 0.6718885898590088, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3470 + }, + { + "completion_length": 7.84375, + "epoch": 0.6082004555808656, + "grad_norm": 17.22290028822834, + "kl": 0.06396484375, + "learning_rate": 3.919747678289819e-07, + "loss": 0.0256, + "reward": 1.6375272274017334, + "reward_std": 0.25864362716674805, + "rewards/accuracy_reward_stage2": 0.6375272870063782, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3471 + }, + { + "completion_length": 10.203125, + "epoch": 0.6083756789907132, + "grad_norm": 23.322277944614704, + "kl": 0.26953125, + "learning_rate": 3.917995444191344e-07, + "loss": 0.0542, + "reward": 1.6182160377502441, + "reward_std": 0.30346912145614624, + "rewards/accuracy_reward_stage2": 0.7744660973548889, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3472 + }, + { + "completion_length": 9.359375, + "epoch": 0.6085509024005608, + "grad_norm": 18.548762768660698, + "kl": 0.15234375, + "learning_rate": 3.9162432100928683e-07, + "loss": 0.019, + "reward": 1.5547361373901367, + "reward_std": 0.23848986625671387, + "rewards/accuracy_reward_stage2": 0.6953611373901367, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3473 + }, + { + "completion_length": 19.109375, + "epoch": 0.6087261258104083, + "grad_norm": 17.897521594424298, + "kl": 0.0927734375, + "learning_rate": 3.9144909759943927e-07, + "loss": -0.0018, + "reward": 1.5199809074401855, + "reward_std": 0.22173704206943512, + "rewards/accuracy_reward_stage2": 0.6606058478355408, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3474 + }, + { + "completion_length": 9.421875, + "epoch": 0.6089013492202558, + "grad_norm": 21.38906924678852, + "kl": 0.1708984375, + "learning_rate": 3.912738741895917e-07, + "loss": 0.0242, + "reward": 1.5925832986831665, + "reward_std": 0.31878191232681274, + "rewards/accuracy_reward_stage2": 0.7332083582878113, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3475 + }, + { + "completion_length": 8.15625, + "epoch": 0.6090765726301034, + "grad_norm": 14.91514219628995, + "kl": 0.1494140625, + "learning_rate": 3.9109865077974415e-07, + "loss": -0.0178, + "reward": 1.5367786884307861, + "reward_std": 0.22990933060646057, + "rewards/accuracy_reward_stage2": 0.5680287480354309, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3476 + }, + { + "completion_length": 6.0, + "epoch": 0.6092517960399509, + "grad_norm": 17.288656981091965, + "kl": 0.150390625, + "learning_rate": 3.909234273698966e-07, + "loss": 0.0182, + "reward": 1.673478364944458, + "reward_std": 0.20205412805080414, + "rewards/accuracy_reward_stage2": 0.8141033053398132, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3477 + }, + { + "completion_length": 13.78125, + "epoch": 0.6094270194497985, + "grad_norm": 19.521573383820197, + "kl": 0.1376953125, + "learning_rate": 3.9074820396004903e-07, + "loss": 0.0611, + "reward": 1.3979265689849854, + "reward_std": 0.1486382782459259, + "rewards/accuracy_reward_stage2": 0.5229264497756958, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3478 + }, + { + "completion_length": 10.015625, + "epoch": 0.609602242859646, + "grad_norm": 19.318238937602164, + "kl": 0.1689453125, + "learning_rate": 3.9057298055020147e-07, + "loss": 0.0254, + "reward": 1.523573637008667, + "reward_std": 0.18456262350082397, + "rewards/accuracy_reward_stage2": 0.5391986966133118, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3479 + }, + { + "completion_length": 9.34375, + "epoch": 0.6097774662694936, + "grad_norm": 27.932749619056537, + "kl": 0.3828125, + "learning_rate": 3.9039775714035396e-07, + "loss": -0.017, + "reward": 1.5458049774169922, + "reward_std": 0.387906551361084, + "rewards/accuracy_reward_stage2": 0.6239298582077026, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3480 + }, + { + "completion_length": 10.859375, + "epoch": 0.6099526896793411, + "grad_norm": 26.924752886684438, + "kl": 0.271484375, + "learning_rate": 3.902225337305064e-07, + "loss": 0.0641, + "reward": 1.078125, + "reward_std": 0.19939783215522766, + "rewards/accuracy_reward_stage2": 0.21875, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3481 + }, + { + "completion_length": 10.984375, + "epoch": 0.6101279130891887, + "grad_norm": 22.92304205333242, + "kl": 0.2294921875, + "learning_rate": 3.900473103206588e-07, + "loss": 0.0088, + "reward": 1.5345546007156372, + "reward_std": 0.23967134952545166, + "rewards/accuracy_reward_stage2": 0.7064296007156372, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3482 + }, + { + "completion_length": 7.625, + "epoch": 0.6103031364990362, + "grad_norm": 16.23284608235178, + "kl": 0.1845703125, + "learning_rate": 3.898720869108112e-07, + "loss": -0.0396, + "reward": 1.6320732831954956, + "reward_std": 0.23797453939914703, + "rewards/accuracy_reward_stage2": 0.6789483428001404, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3483 + }, + { + "completion_length": 39.6875, + "epoch": 0.6104783599088838, + "grad_norm": 20.037190503078875, + "kl": 0.189453125, + "learning_rate": 3.896968635009637e-07, + "loss": 0.005, + "reward": 1.2335355281829834, + "reward_std": 0.31477969884872437, + "rewards/accuracy_reward_stage2": 0.3897854685783386, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3484 + }, + { + "completion_length": 13.421875, + "epoch": 0.6106535833187314, + "grad_norm": 17.271775369446903, + "kl": 0.09521484375, + "learning_rate": 3.8952164009111616e-07, + "loss": -0.0503, + "reward": 1.5214704275131226, + "reward_std": 0.18515348434448242, + "rewards/accuracy_reward_stage2": 0.5527204275131226, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3485 + }, + { + "completion_length": 7.359375, + "epoch": 0.610828806728579, + "grad_norm": 19.87792405646192, + "kl": 0.0615234375, + "learning_rate": 3.893464166812686e-07, + "loss": 0.0084, + "reward": 1.799248218536377, + "reward_std": 0.1444234848022461, + "rewards/accuracy_reward_stage2": 0.8148731589317322, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3486 + }, + { + "completion_length": 11.0, + "epoch": 0.6110040301384265, + "grad_norm": 13.932438148758836, + "kl": 0.049560546875, + "learning_rate": 3.8917119327142104e-07, + "loss": 0.0198, + "reward": 1.4347407817840576, + "reward_std": 0.11921636015176773, + "rewards/accuracy_reward_stage2": 0.43474066257476807, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3487 + }, + { + "completion_length": 16.546875, + "epoch": 0.6111792535482741, + "grad_norm": 20.81131020095072, + "kl": 0.154296875, + "learning_rate": 3.8899596986157353e-07, + "loss": 0.0781, + "reward": 1.1274425983428955, + "reward_std": 0.18740572035312653, + "rewards/accuracy_reward_stage2": 0.3774426579475403, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3488 + }, + { + "completion_length": 10.140625, + "epoch": 0.6113544769581216, + "grad_norm": 20.707528478298844, + "kl": 0.1005859375, + "learning_rate": 3.888207464517259e-07, + "loss": -0.004, + "reward": 1.4854450225830078, + "reward_std": 0.1825050711631775, + "rewards/accuracy_reward_stage2": 0.626069962978363, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3489 + }, + { + "completion_length": 8.453125, + "epoch": 0.6115297003679692, + "grad_norm": 21.871090043391153, + "kl": 0.1669921875, + "learning_rate": 3.8864552304187836e-07, + "loss": 0.0042, + "reward": 1.433689832687378, + "reward_std": 0.22336667776107788, + "rewards/accuracy_reward_stage2": 0.5899399518966675, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3490 + }, + { + "completion_length": 9.109375, + "epoch": 0.6117049237778167, + "grad_norm": 39.55167830738417, + "kl": 0.326171875, + "learning_rate": 3.884702996320308e-07, + "loss": 0.0546, + "reward": 1.3851423263549805, + "reward_std": 0.32784217596054077, + "rewards/accuracy_reward_stage2": 0.5413922071456909, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3491 + }, + { + "completion_length": 9.578125, + "epoch": 0.6118801471876643, + "grad_norm": 16.106440954832575, + "kl": 0.162109375, + "learning_rate": 3.882950762221833e-07, + "loss": -0.018, + "reward": 1.7969837188720703, + "reward_std": 0.22703614830970764, + "rewards/accuracy_reward_stage2": 0.8282337784767151, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3492 + }, + { + "completion_length": 10.578125, + "epoch": 0.6120553705975118, + "grad_norm": 16.4256066595653, + "kl": 0.1669921875, + "learning_rate": 3.8811985281233573e-07, + "loss": -0.0015, + "reward": 1.6016194820404053, + "reward_std": 0.18486538529396057, + "rewards/accuracy_reward_stage2": 0.6328696012496948, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3493 + }, + { + "completion_length": 8.4375, + "epoch": 0.6122305940073594, + "grad_norm": 17.446112304916678, + "kl": 0.1494140625, + "learning_rate": 3.8794462940248817e-07, + "loss": 0.0196, + "reward": 1.3984107971191406, + "reward_std": 0.2758486866950989, + "rewards/accuracy_reward_stage2": 0.5390357971191406, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3494 + }, + { + "completion_length": 6.140625, + "epoch": 0.6124058174172069, + "grad_norm": 18.34918591608612, + "kl": 0.19140625, + "learning_rate": 3.8776940599264055e-07, + "loss": -0.0011, + "reward": 1.5423123836517334, + "reward_std": 0.31714749336242676, + "rewards/accuracy_reward_stage2": 0.5735623240470886, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3495 + }, + { + "completion_length": 8.78125, + "epoch": 0.6125810408270544, + "grad_norm": 18.564979588961563, + "kl": 0.1279296875, + "learning_rate": 3.8759418258279305e-07, + "loss": 0.007, + "reward": 1.682100534439087, + "reward_std": 0.24820661544799805, + "rewards/accuracy_reward_stage2": 0.6977255344390869, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3496 + }, + { + "completion_length": 11.265625, + "epoch": 0.6127562642369021, + "grad_norm": 18.11383222850136, + "kl": 0.2392578125, + "learning_rate": 3.874189591729455e-07, + "loss": -0.0278, + "reward": 1.5729455947875977, + "reward_std": 0.3092484474182129, + "rewards/accuracy_reward_stage2": 0.6354456543922424, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3497 + }, + { + "completion_length": 13.03125, + "epoch": 0.6129314876467497, + "grad_norm": 21.12746990447389, + "kl": 0.119140625, + "learning_rate": 3.872437357630979e-07, + "loss": 0.0033, + "reward": 1.6204090118408203, + "reward_std": 0.20224544405937195, + "rewards/accuracy_reward_stage2": 0.6360338926315308, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3498 + }, + { + "completion_length": 8.828125, + "epoch": 0.6131067110565972, + "grad_norm": 15.40110651828435, + "kl": 0.078125, + "learning_rate": 3.8706851235325037e-07, + "loss": 0.0313, + "reward": 1.8171948194503784, + "reward_std": 0.137511745095253, + "rewards/accuracy_reward_stage2": 0.8171948194503784, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3499 + }, + { + "completion_length": 9.640625, + "epoch": 0.6132819344664447, + "grad_norm": 17.78726650450441, + "kl": 0.03955078125, + "learning_rate": 3.8689328894340286e-07, + "loss": 0.0158, + "reward": 1.6292483806610107, + "reward_std": 0.2666996121406555, + "rewards/accuracy_reward_stage2": 0.7542483806610107, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3500 + }, + { + "completion_length": 15.484375, + "epoch": 0.6134571578762923, + "grad_norm": 17.83496345339617, + "kl": 0.12158203125, + "learning_rate": 3.8671806553355524e-07, + "loss": -0.0372, + "reward": 1.5006381273269653, + "reward_std": 0.20013336837291718, + "rewards/accuracy_reward_stage2": 0.6568880677223206, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3501 + }, + { + "completion_length": 10.25, + "epoch": 0.6136323812861398, + "grad_norm": 18.81555321500434, + "kl": 0.17578125, + "learning_rate": 3.865428421237077e-07, + "loss": -0.0392, + "reward": 1.5049701929092407, + "reward_std": 0.40888711810112, + "rewards/accuracy_reward_stage2": 0.551845133304596, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3502 + }, + { + "completion_length": 8.9375, + "epoch": 0.6138076046959874, + "grad_norm": 15.004712159632119, + "kl": 0.07470703125, + "learning_rate": 3.863676187138601e-07, + "loss": 0.0083, + "reward": 1.730872631072998, + "reward_std": 0.15501649677753448, + "rewards/accuracy_reward_stage2": 0.7464977502822876, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3503 + }, + { + "completion_length": 7.609375, + "epoch": 0.6139828281058349, + "grad_norm": 17.480040288539744, + "kl": 0.267578125, + "learning_rate": 3.861923953040126e-07, + "loss": -0.0607, + "reward": 1.6104066371917725, + "reward_std": 0.35269561409950256, + "rewards/accuracy_reward_stage2": 0.6729066371917725, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3504 + }, + { + "completion_length": 11.171875, + "epoch": 0.6141580515156825, + "grad_norm": 21.1540640328661, + "kl": 0.345703125, + "learning_rate": 3.8601717189416506e-07, + "loss": 0.1385, + "reward": 1.5312515497207642, + "reward_std": 0.2088533341884613, + "rewards/accuracy_reward_stage2": 0.7812516093254089, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3505 + }, + { + "completion_length": 7.3125, + "epoch": 0.61433327492553, + "grad_norm": 19.042647399369816, + "kl": 0.185546875, + "learning_rate": 3.858419484843175e-07, + "loss": 0.0121, + "reward": 1.5548827648162842, + "reward_std": 0.23560212552547455, + "rewards/accuracy_reward_stage2": 0.5861326456069946, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3506 + }, + { + "completion_length": 7.171875, + "epoch": 0.6145084983353776, + "grad_norm": 18.408079381299036, + "kl": 0.193359375, + "learning_rate": 3.8566672507446994e-07, + "loss": -0.0061, + "reward": 1.2742847204208374, + "reward_std": 0.34039878845214844, + "rewards/accuracy_reward_stage2": 0.4305347204208374, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3507 + }, + { + "completion_length": 11.90625, + "epoch": 0.6146837217452251, + "grad_norm": 18.173494814858987, + "kl": 0.22265625, + "learning_rate": 3.854915016646224e-07, + "loss": -0.0637, + "reward": 1.5908520221710205, + "reward_std": 0.35630887746810913, + "rewards/accuracy_reward_stage2": 0.6533519625663757, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3508 + }, + { + "completion_length": 9.5625, + "epoch": 0.6148589451550727, + "grad_norm": 15.490980096963543, + "kl": 0.236328125, + "learning_rate": 3.853162782547748e-07, + "loss": 0.0217, + "reward": 1.8262255191802979, + "reward_std": 0.23086966574192047, + "rewards/accuracy_reward_stage2": 0.8574756383895874, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3509 + }, + { + "completion_length": 15.5, + "epoch": 0.6150341685649203, + "grad_norm": 15.170598682243257, + "kl": 0.1708984375, + "learning_rate": 3.8514105484492725e-07, + "loss": 0.0009, + "reward": 1.4647321701049805, + "reward_std": 0.23844221234321594, + "rewards/accuracy_reward_stage2": 0.6209821701049805, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3510 + }, + { + "completion_length": 9.859375, + "epoch": 0.6152093919747679, + "grad_norm": 98.77143590844044, + "kl": 0.7109375, + "learning_rate": 3.849658314350797e-07, + "loss": 0.1333, + "reward": 1.40625, + "reward_std": 0.1523548662662506, + "rewards/accuracy_reward_stage2": 0.609375, + "rewards/format_reward_stage1_pointerpad": 0.796875, + "scores/accuracy_reward_stage2": 0.796875, + "step": 3511 + }, + { + "completion_length": 4.359375, + "epoch": 0.6153846153846154, + "grad_norm": 28.602906237292615, + "kl": 0.1484375, + "learning_rate": 3.847906080252322e-07, + "loss": 0.0594, + "reward": 1.734375, + "reward_std": 0.2198973000049591, + "rewards/accuracy_reward_stage2": 0.734375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3512 + }, + { + "completion_length": 8.671875, + "epoch": 0.615559838794463, + "grad_norm": 18.623178861985995, + "kl": 0.1669921875, + "learning_rate": 3.8461538461538463e-07, + "loss": -0.0431, + "reward": 1.6280450820922852, + "reward_std": 0.33348989486694336, + "rewards/accuracy_reward_stage2": 0.6749200224876404, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3513 + }, + { + "completion_length": 4.90625, + "epoch": 0.6157350622043105, + "grad_norm": 11.466048889383227, + "kl": 0.04150390625, + "learning_rate": 3.84440161205537e-07, + "loss": -0.0276, + "reward": 1.5, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3514 + }, + { + "completion_length": 13.015625, + "epoch": 0.6159102856141581, + "grad_norm": 22.1078229298361, + "kl": 0.04052734375, + "learning_rate": 3.8426493779568945e-07, + "loss": 0.0162, + "reward": 1.3823972940444946, + "reward_std": 0.2699623703956604, + "rewards/accuracy_reward_stage2": 0.5073972940444946, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3515 + }, + { + "completion_length": 12.078125, + "epoch": 0.6160855090240056, + "grad_norm": 26.06034416396247, + "kl": 0.11181640625, + "learning_rate": 3.8408971438584195e-07, + "loss": 0.0447, + "reward": 1.6742892265319824, + "reward_std": 0.2498525083065033, + "rewards/accuracy_reward_stage2": 0.6742891073226929, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3516 + }, + { + "completion_length": 13.328125, + "epoch": 0.6162607324338532, + "grad_norm": 9.834568489716801, + "kl": 0.087890625, + "learning_rate": 3.839144909759944e-07, + "loss": -0.0089, + "reward": 1.5580108165740967, + "reward_std": 0.09554215520620346, + "rewards/accuracy_reward_stage2": 0.5736356973648071, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3517 + }, + { + "completion_length": 6.546875, + "epoch": 0.6164359558437007, + "grad_norm": 16.192663583818383, + "kl": 0.1513671875, + "learning_rate": 3.837392675661468e-07, + "loss": 0.0391, + "reward": 1.5547268390655518, + "reward_std": 0.12057439982891083, + "rewards/accuracy_reward_stage2": 0.5703518986701965, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3518 + }, + { + "completion_length": 9.40625, + "epoch": 0.6166111792535482, + "grad_norm": 16.394920671685842, + "kl": 0.1796875, + "learning_rate": 3.8356404415629926e-07, + "loss": 0.0042, + "reward": 1.59375, + "reward_std": 0.19149437546730042, + "rewards/accuracy_reward_stage2": 0.75, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3519 + }, + { + "completion_length": 10.890625, + "epoch": 0.6167864026633958, + "grad_norm": 17.074565592754322, + "kl": 0.103515625, + "learning_rate": 3.8338882074645176e-07, + "loss": 0.0108, + "reward": 1.405001163482666, + "reward_std": 0.2863079905509949, + "rewards/accuracy_reward_stage2": 0.545626163482666, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3520 + }, + { + "completion_length": 8.75, + "epoch": 0.6169616260732433, + "grad_norm": 13.41627549720222, + "kl": 0.2041015625, + "learning_rate": 3.8321359733660414e-07, + "loss": -0.0509, + "reward": 1.6864609718322754, + "reward_std": 0.182004913687706, + "rewards/accuracy_reward_stage2": 0.7333359122276306, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3521 + }, + { + "completion_length": 11.15625, + "epoch": 0.6171368494830909, + "grad_norm": 25.295103218557728, + "kl": 0.224609375, + "learning_rate": 3.830383739267566e-07, + "loss": 0.0146, + "reward": 1.5392603874206543, + "reward_std": 0.2501576840877533, + "rewards/accuracy_reward_stage2": 0.6955103874206543, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3522 + }, + { + "completion_length": 10.515625, + "epoch": 0.6173120728929385, + "grad_norm": 17.035368352296782, + "kl": 0.466796875, + "learning_rate": 3.82863150516909e-07, + "loss": 0.1139, + "reward": 1.4368441104888916, + "reward_std": 0.2833743095397949, + "rewards/accuracy_reward_stage2": 0.5930941104888916, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3523 + }, + { + "completion_length": 10.25, + "epoch": 0.6174872963027861, + "grad_norm": 16.55324156650657, + "kl": 0.1953125, + "learning_rate": 3.826879271070615e-07, + "loss": -0.0034, + "reward": 1.6485011577606201, + "reward_std": 0.3175305128097534, + "rewards/accuracy_reward_stage2": 0.6797511577606201, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3524 + }, + { + "completion_length": 7.0625, + "epoch": 0.6176625197126336, + "grad_norm": 20.007918759937983, + "kl": 0.111328125, + "learning_rate": 3.8251270369721396e-07, + "loss": 0.0445, + "reward": 1.440403938293457, + "reward_std": 0.22194069623947144, + "rewards/accuracy_reward_stage2": 0.5654039978981018, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3525 + }, + { + "completion_length": 11.09375, + "epoch": 0.6178377431224812, + "grad_norm": 17.33559494877162, + "kl": 0.193359375, + "learning_rate": 3.823374802873664e-07, + "loss": 0.0333, + "reward": 1.4666603803634644, + "reward_std": 0.137288898229599, + "rewards/accuracy_reward_stage2": 0.6072853803634644, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3526 + }, + { + "completion_length": 10.40625, + "epoch": 0.6180129665323287, + "grad_norm": 10.617484295557372, + "kl": 0.1630859375, + "learning_rate": 3.821622568775188e-07, + "loss": 0.0208, + "reward": 1.38825261592865, + "reward_std": 0.13479651510715485, + "rewards/accuracy_reward_stage2": 0.5288775563240051, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3527 + }, + { + "completion_length": 10.671875, + "epoch": 0.6181881899421763, + "grad_norm": 17.033987129441545, + "kl": 0.21875, + "learning_rate": 3.819870334676713e-07, + "loss": 0.0434, + "reward": 1.446754813194275, + "reward_std": 0.23349672555923462, + "rewards/accuracy_reward_stage2": 0.5873798131942749, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3528 + }, + { + "completion_length": 8.71875, + "epoch": 0.6183634133520238, + "grad_norm": 15.973752398998958, + "kl": 0.193359375, + "learning_rate": 3.818118100578237e-07, + "loss": 0.044, + "reward": 1.4383138418197632, + "reward_std": 0.2546946406364441, + "rewards/accuracy_reward_stage2": 0.5789388418197632, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3529 + }, + { + "completion_length": 9.28125, + "epoch": 0.6185386367618714, + "grad_norm": 17.20861765125097, + "kl": 0.2138671875, + "learning_rate": 3.8163658664797615e-07, + "loss": -0.0312, + "reward": 1.8370803594589233, + "reward_std": 0.21137744188308716, + "rewards/accuracy_reward_stage2": 0.8839553594589233, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3530 + }, + { + "completion_length": 8.046875, + "epoch": 0.6187138601717189, + "grad_norm": 16.159578362111386, + "kl": 0.2138671875, + "learning_rate": 3.814613632381286e-07, + "loss": -0.0363, + "reward": 1.539421796798706, + "reward_std": 0.29987043142318726, + "rewards/accuracy_reward_stage2": 0.5862968564033508, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3531 + }, + { + "completion_length": 10.03125, + "epoch": 0.6188890835815665, + "grad_norm": 21.416384338002235, + "kl": 0.1396484375, + "learning_rate": 3.812861398282811e-07, + "loss": 0.0271, + "reward": 1.5553689002990723, + "reward_std": 0.26087063550949097, + "rewards/accuracy_reward_stage2": 0.5709939002990723, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3532 + }, + { + "completion_length": 10.484375, + "epoch": 0.619064306991414, + "grad_norm": 24.629877674014825, + "kl": 0.10498046875, + "learning_rate": 3.8111091641843347e-07, + "loss": 0.042, + "reward": 1.4916770458221436, + "reward_std": 0.2873051166534424, + "rewards/accuracy_reward_stage2": 0.49167704582214355, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3533 + }, + { + "completion_length": 8.59375, + "epoch": 0.6192395304012616, + "grad_norm": 17.835054328540952, + "kl": 0.1416015625, + "learning_rate": 3.809356930085859e-07, + "loss": -0.0317, + "reward": 1.408469796180725, + "reward_std": 0.17870807647705078, + "rewards/accuracy_reward_stage2": 0.4397197961807251, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3534 + }, + { + "completion_length": 10.0625, + "epoch": 0.6194147538111091, + "grad_norm": 18.655373426418993, + "kl": 0.1533203125, + "learning_rate": 3.8076046959873835e-07, + "loss": 0.0171, + "reward": 1.6664772033691406, + "reward_std": 0.2817244529724121, + "rewards/accuracy_reward_stage2": 0.6821021437644958, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3535 + }, + { + "completion_length": 9.859375, + "epoch": 0.6195899772209568, + "grad_norm": 18.72390316211415, + "kl": 0.37109375, + "learning_rate": 3.8058524618889084e-07, + "loss": -0.0262, + "reward": 1.4409722089767456, + "reward_std": 0.2888485789299011, + "rewards/accuracy_reward_stage2": 0.5190972089767456, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3536 + }, + { + "completion_length": 8.734375, + "epoch": 0.6197652006308043, + "grad_norm": 21.898155977185976, + "kl": 0.1328125, + "learning_rate": 3.804100227790433e-07, + "loss": 0.0403, + "reward": 1.5529836416244507, + "reward_std": 0.26357075572013855, + "rewards/accuracy_reward_stage2": 0.6936086416244507, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3537 + }, + { + "completion_length": 6.734375, + "epoch": 0.6199404240406519, + "grad_norm": 21.017998866038578, + "kl": 0.26953125, + "learning_rate": 3.802347993691957e-07, + "loss": -0.0248, + "reward": 1.6154170036315918, + "reward_std": 0.27425822615623474, + "rewards/accuracy_reward_stage2": 0.662291944026947, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3538 + }, + { + "completion_length": 9.65625, + "epoch": 0.6201156474504994, + "grad_norm": 24.37689125398419, + "kl": 0.390625, + "learning_rate": 3.8005957595934816e-07, + "loss": 0.1059, + "reward": 1.3345599174499512, + "reward_std": 0.24926936626434326, + "rewards/accuracy_reward_stage2": 0.6158099174499512, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3539 + }, + { + "completion_length": 10.078125, + "epoch": 0.620290870860347, + "grad_norm": 20.6787583059043, + "kl": 0.1259765625, + "learning_rate": 3.7988435254950055e-07, + "loss": 0.0503, + "reward": 1.629636526107788, + "reward_std": 0.24759991466999054, + "rewards/accuracy_reward_stage2": 0.6296364665031433, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3540 + }, + { + "completion_length": 10.65625, + "epoch": 0.6204660942701945, + "grad_norm": 14.225070278546896, + "kl": 0.08544921875, + "learning_rate": 3.7970912913965304e-07, + "loss": -0.0485, + "reward": 1.6456576585769653, + "reward_std": 0.1932823807001114, + "rewards/accuracy_reward_stage2": 0.6769076585769653, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3541 + }, + { + "completion_length": 8.875, + "epoch": 0.620641317680042, + "grad_norm": 17.902675872340364, + "kl": 0.1611328125, + "learning_rate": 3.795339057298055e-07, + "loss": 0.0205, + "reward": 1.6116595268249512, + "reward_std": 0.21022561192512512, + "rewards/accuracy_reward_stage2": 0.6272845268249512, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3542 + }, + { + "completion_length": 12.09375, + "epoch": 0.6208165410898896, + "grad_norm": 20.440861631124083, + "kl": 0.291015625, + "learning_rate": 3.793586823199579e-07, + "loss": -0.0145, + "reward": 1.5649584531784058, + "reward_std": 0.3023066520690918, + "rewards/accuracy_reward_stage2": 0.6274584531784058, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3543 + }, + { + "completion_length": 17.28125, + "epoch": 0.6209917644997371, + "grad_norm": 14.588766645869493, + "kl": 0.1474609375, + "learning_rate": 3.791834589101104e-07, + "loss": -0.0292, + "reward": 1.483196496963501, + "reward_std": 0.20009317994117737, + "rewards/accuracy_reward_stage2": 0.6394466161727905, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3544 + }, + { + "completion_length": 28.796875, + "epoch": 0.6211669879095847, + "grad_norm": 21.009877199345457, + "kl": 0.1103515625, + "learning_rate": 3.7900823550026285e-07, + "loss": 0.0, + "reward": 1.5804895162582397, + "reward_std": 0.1696164608001709, + "rewards/accuracy_reward_stage2": 0.5961145162582397, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3545 + }, + { + "completion_length": 11.9375, + "epoch": 0.6213422113194322, + "grad_norm": 14.908152840058406, + "kl": 0.060546875, + "learning_rate": 3.7883301209041524e-07, + "loss": 0.0242, + "reward": 1.5572917461395264, + "reward_std": 0.1236191987991333, + "rewards/accuracy_reward_stage2": 0.5572916865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3546 + }, + { + "completion_length": 9.21875, + "epoch": 0.6215174347292798, + "grad_norm": 16.157423263961284, + "kl": 0.083984375, + "learning_rate": 3.786577886805677e-07, + "loss": 0.0239, + "reward": 1.5393553972244263, + "reward_std": 0.1936211735010147, + "rewards/accuracy_reward_stage2": 0.5549803972244263, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3547 + }, + { + "completion_length": 7.578125, + "epoch": 0.6216926581391274, + "grad_norm": 15.889175938523843, + "kl": 0.05908203125, + "learning_rate": 3.784825652707201e-07, + "loss": -0.0206, + "reward": 1.8656994104385376, + "reward_std": 0.17255815863609314, + "rewards/accuracy_reward_stage2": 0.8813244104385376, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3548 + }, + { + "completion_length": 10.671875, + "epoch": 0.621867881548975, + "grad_norm": 19.960541953323727, + "kl": 0.1220703125, + "learning_rate": 3.783073418608726e-07, + "loss": 0.0488, + "reward": 1.6864854097366333, + "reward_std": 0.23788943886756897, + "rewards/accuracy_reward_stage2": 0.8114853501319885, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3549 + }, + { + "completion_length": 9.15625, + "epoch": 0.6220431049588225, + "grad_norm": 16.951175313083127, + "kl": 0.11279296875, + "learning_rate": 3.7813211845102505e-07, + "loss": 0.0008, + "reward": 1.6322424411773682, + "reward_std": 0.2910325527191162, + "rewards/accuracy_reward_stage2": 0.7728673815727234, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3550 + }, + { + "completion_length": 8.59375, + "epoch": 0.6222183283686701, + "grad_norm": 15.89765695432175, + "kl": 0.05078125, + "learning_rate": 3.779568950411775e-07, + "loss": 0.0204, + "reward": 1.7006888389587402, + "reward_std": 0.10197651386260986, + "rewards/accuracy_reward_stage2": 0.7006887197494507, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3551 + }, + { + "completion_length": 12.203125, + "epoch": 0.6223935517785176, + "grad_norm": 20.326690153829755, + "kl": 0.318359375, + "learning_rate": 3.777816716313299e-07, + "loss": 0.0028, + "reward": 1.4471937417984009, + "reward_std": 0.2985219359397888, + "rewards/accuracy_reward_stage2": 0.4940687119960785, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3552 + }, + { + "completion_length": 7.390625, + "epoch": 0.6225687751883652, + "grad_norm": 15.092191451627144, + "kl": 0.2373046875, + "learning_rate": 3.7760644822148237e-07, + "loss": -0.0871, + "reward": 1.6521281003952026, + "reward_std": 0.2880202829837799, + "rewards/accuracy_reward_stage2": 0.7302531003952026, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3553 + }, + { + "completion_length": 8.65625, + "epoch": 0.6227439985982127, + "grad_norm": 17.99898596971007, + "kl": 0.11767578125, + "learning_rate": 3.774312248116348e-07, + "loss": -0.0413, + "reward": 1.7212051153182983, + "reward_std": 0.29289162158966064, + "rewards/accuracy_reward_stage2": 0.7524551153182983, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3554 + }, + { + "completion_length": 6.859375, + "epoch": 0.6229192220080603, + "grad_norm": 16.940170418921618, + "kl": 0.1708984375, + "learning_rate": 3.7725600140178725e-07, + "loss": 0.0244, + "reward": 1.7089645862579346, + "reward_std": 0.11606550216674805, + "rewards/accuracy_reward_stage2": 0.8495896458625793, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3555 + }, + { + "completion_length": 5.90625, + "epoch": 0.6230944454179078, + "grad_norm": 17.429080589459772, + "kl": 0.09765625, + "learning_rate": 3.770807779919397e-07, + "loss": 0.0391, + "reward": 1.4759080410003662, + "reward_std": 0.18674036860466003, + "rewards/accuracy_reward_stage2": 0.600908100605011, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3556 + }, + { + "completion_length": 9.375, + "epoch": 0.6232696688277554, + "grad_norm": 20.458137533190115, + "kl": 0.130859375, + "learning_rate": 3.769055545820922e-07, + "loss": 0.0194, + "reward": 1.657368540763855, + "reward_std": 0.31080591678619385, + "rewards/accuracy_reward_stage2": 0.672993540763855, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3557 + }, + { + "completion_length": 8.40625, + "epoch": 0.6234448922376029, + "grad_norm": 13.461876617629324, + "kl": 0.10498046875, + "learning_rate": 3.767303311722446e-07, + "loss": 0.042, + "reward": 1.8790143728256226, + "reward_std": 0.12620574235916138, + "rewards/accuracy_reward_stage2": 0.8790143728256226, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3558 + }, + { + "completion_length": 10.3125, + "epoch": 0.6236201156474505, + "grad_norm": 21.25344102951621, + "kl": 0.2470703125, + "learning_rate": 3.76555107762397e-07, + "loss": 0.022, + "reward": 1.06238853931427, + "reward_std": 0.3604205250740051, + "rewards/accuracy_reward_stage2": 0.59363853931427, + "rewards/format_reward_stage1_pointerpad": 0.46875, + "scores/accuracy_reward_stage2": 0.46875, + "step": 3559 + }, + { + "completion_length": 13.28125, + "epoch": 0.623795339057298, + "grad_norm": 17.725282912474576, + "kl": 0.095703125, + "learning_rate": 3.7637988435254945e-07, + "loss": 0.0382, + "reward": 1.5480883121490479, + "reward_std": 0.2119196355342865, + "rewards/accuracy_reward_stage2": 0.5480883121490479, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3560 + }, + { + "completion_length": 13.328125, + "epoch": 0.6239705624671457, + "grad_norm": 23.053761024253866, + "kl": 0.11279296875, + "learning_rate": 3.7620466094270194e-07, + "loss": 0.045, + "reward": 1.4253368377685547, + "reward_std": 0.22176837921142578, + "rewards/accuracy_reward_stage2": 0.6753367185592651, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3561 + }, + { + "completion_length": 11.25, + "epoch": 0.6241457858769932, + "grad_norm": 264.2225226480569, + "kl": 1.2265625, + "learning_rate": 3.760294375328544e-07, + "loss": 0.3575, + "reward": 1.4801325798034668, + "reward_std": 0.20424708724021912, + "rewards/accuracy_reward_stage2": 0.5426324605941772, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3562 + }, + { + "completion_length": 9.015625, + "epoch": 0.6243210092868408, + "grad_norm": 18.901975442316548, + "kl": 0.11279296875, + "learning_rate": 3.758542141230068e-07, + "loss": 0.0453, + "reward": 1.456575632095337, + "reward_std": 0.15833720564842224, + "rewards/accuracy_reward_stage2": 0.4565756320953369, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3563 + }, + { + "completion_length": 9.390625, + "epoch": 0.6244962326966883, + "grad_norm": 17.899991222909144, + "kl": 0.1005859375, + "learning_rate": 3.7567899071315926e-07, + "loss": 0.0191, + "reward": 1.4772353172302246, + "reward_std": 0.21017813682556152, + "rewards/accuracy_reward_stage2": 0.49286025762557983, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3564 + }, + { + "completion_length": 8.765625, + "epoch": 0.6246714561065358, + "grad_norm": 11.97891870659925, + "kl": 0.020263671875, + "learning_rate": 3.755037673033117e-07, + "loss": 0.0081, + "reward": 1.6755764484405518, + "reward_std": 0.14607882499694824, + "rewards/accuracy_reward_stage2": 0.6755764484405518, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3565 + }, + { + "completion_length": 15.734375, + "epoch": 0.6248466795163834, + "grad_norm": 15.662134755135996, + "kl": 0.08349609375, + "learning_rate": 3.7532854389346414e-07, + "loss": -0.0102, + "reward": 1.5459709167480469, + "reward_std": 0.23149898648262024, + "rewards/accuracy_reward_stage2": 0.6865959167480469, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3566 + }, + { + "completion_length": 9.671875, + "epoch": 0.6250219029262309, + "grad_norm": 17.250786101493915, + "kl": 0.12060546875, + "learning_rate": 3.751533204836166e-07, + "loss": 0.0482, + "reward": 1.4936150312423706, + "reward_std": 0.16181641817092896, + "rewards/accuracy_reward_stage2": 0.6186150312423706, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3567 + }, + { + "completion_length": 8.859375, + "epoch": 0.6251971263360785, + "grad_norm": 21.18063832472489, + "kl": 0.1298828125, + "learning_rate": 3.74978097073769e-07, + "loss": -0.0081, + "reward": 1.6339540481567383, + "reward_std": 0.20301076769828796, + "rewards/accuracy_reward_stage2": 0.6652040481567383, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3568 + }, + { + "completion_length": 9.171875, + "epoch": 0.625372349745926, + "grad_norm": 16.632840171437252, + "kl": 0.1533203125, + "learning_rate": 3.748028736639215e-07, + "loss": 0.0169, + "reward": 1.4851830005645752, + "reward_std": 0.1846974790096283, + "rewards/accuracy_reward_stage2": 0.5008080005645752, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3569 + }, + { + "completion_length": 24.921875, + "epoch": 0.6255475731557736, + "grad_norm": 19.56369931496008, + "kl": 0.11376953125, + "learning_rate": 3.7462765025407395e-07, + "loss": 0.0119, + "reward": 1.5117580890655518, + "reward_std": 0.24193823337554932, + "rewards/accuracy_reward_stage2": 0.5273829698562622, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3570 + }, + { + "completion_length": 17.390625, + "epoch": 0.6257227965656211, + "grad_norm": 31.59243233511247, + "kl": 0.29296875, + "learning_rate": 3.744524268442264e-07, + "loss": 0.0291, + "reward": 1.5098161697387695, + "reward_std": 0.29780054092407227, + "rewards/accuracy_reward_stage2": 0.5410662889480591, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3571 + }, + { + "completion_length": 9.875, + "epoch": 0.6258980199754687, + "grad_norm": 16.968677555978577, + "kl": 0.1552734375, + "learning_rate": 3.742772034343788e-07, + "loss": -0.0247, + "reward": 1.6412062644958496, + "reward_std": 0.2023872286081314, + "rewards/accuracy_reward_stage2": 0.7974562048912048, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3572 + }, + { + "completion_length": 13.25, + "epoch": 0.6260732433853162, + "grad_norm": 23.035582765314654, + "kl": 0.09326171875, + "learning_rate": 3.7410198002453127e-07, + "loss": 0.0374, + "reward": 1.68117094039917, + "reward_std": 0.23287498950958252, + "rewards/accuracy_reward_stage2": 0.6811710000038147, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3573 + }, + { + "completion_length": 8.796875, + "epoch": 0.6262484667951639, + "grad_norm": 17.890803563885793, + "kl": 0.1318359375, + "learning_rate": 3.739267566146837e-07, + "loss": 0.0155, + "reward": 1.4391281604766846, + "reward_std": 0.18224698305130005, + "rewards/accuracy_reward_stage2": 0.5797532200813293, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3574 + }, + { + "completion_length": 11.78125, + "epoch": 0.6264236902050114, + "grad_norm": 18.294483074014767, + "kl": 0.10791015625, + "learning_rate": 3.7375153320483615e-07, + "loss": 0.043, + "reward": 1.550438642501831, + "reward_std": 0.13361681997776031, + "rewards/accuracy_reward_stage2": 0.6754387617111206, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3575 + }, + { + "completion_length": 9.46875, + "epoch": 0.626598913614859, + "grad_norm": 19.645375528005765, + "kl": 0.11474609375, + "learning_rate": 3.735763097949886e-07, + "loss": -0.0072, + "reward": 1.514617681503296, + "reward_std": 0.25796785950660706, + "rewards/accuracy_reward_stage2": 0.5458677411079407, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3576 + }, + { + "completion_length": 11.203125, + "epoch": 0.6267741370247065, + "grad_norm": 24.349105508571775, + "kl": 0.3671875, + "learning_rate": 3.734010863851411e-07, + "loss": 0.0613, + "reward": 1.490251898765564, + "reward_std": 0.15121126174926758, + "rewards/accuracy_reward_stage2": 0.521501898765564, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3577 + }, + { + "completion_length": 7.453125, + "epoch": 0.6269493604345541, + "grad_norm": 20.045855738595996, + "kl": 0.19140625, + "learning_rate": 3.7322586297529347e-07, + "loss": 0.0449, + "reward": 1.5475776195526123, + "reward_std": 0.2927602231502533, + "rewards/accuracy_reward_stage2": 0.5632026195526123, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3578 + }, + { + "completion_length": 7.65625, + "epoch": 0.6271245838444016, + "grad_norm": 18.788063060436127, + "kl": 0.14453125, + "learning_rate": 3.730506395654459e-07, + "loss": 0.0194, + "reward": 1.6328332424163818, + "reward_std": 0.1912064403295517, + "rewards/accuracy_reward_stage2": 0.6484582424163818, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3579 + }, + { + "completion_length": 12.84375, + "epoch": 0.6272998072542492, + "grad_norm": 18.013975028757724, + "kl": 0.1884765625, + "learning_rate": 3.7287541615559835e-07, + "loss": -0.0024, + "reward": 1.3137900829315186, + "reward_std": 0.30171117186546326, + "rewards/accuracy_reward_stage2": 0.3450400233268738, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3580 + }, + { + "completion_length": 8.84375, + "epoch": 0.6274750306640967, + "grad_norm": 20.12558351071326, + "kl": 0.036376953125, + "learning_rate": 3.7270019274575084e-07, + "loss": 0.0146, + "reward": 1.6001933813095093, + "reward_std": 0.14768247306346893, + "rewards/accuracy_reward_stage2": 0.600193440914154, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3581 + }, + { + "completion_length": 9.390625, + "epoch": 0.6276502540739443, + "grad_norm": 20.643454936736447, + "kl": 0.1689453125, + "learning_rate": 3.725249693359033e-07, + "loss": 0.0235, + "reward": 1.3290756940841675, + "reward_std": 0.14735127985477448, + "rewards/accuracy_reward_stage2": 0.46970072388648987, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3582 + }, + { + "completion_length": 9.625, + "epoch": 0.6278254774837918, + "grad_norm": 22.35769949056351, + "kl": 0.2294921875, + "learning_rate": 3.723497459260557e-07, + "loss": 0.0223, + "reward": 1.5896830558776855, + "reward_std": 0.32479965686798096, + "rewards/accuracy_reward_stage2": 0.620932936668396, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3583 + }, + { + "completion_length": 14.984375, + "epoch": 0.6280007008936394, + "grad_norm": 14.017040088597355, + "kl": 0.05712890625, + "learning_rate": 3.721745225162081e-07, + "loss": -0.0207, + "reward": 1.4739978313446045, + "reward_std": 0.16686731576919556, + "rewards/accuracy_reward_stage2": 0.48962289094924927, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3584 + }, + { + "completion_length": 7.78125, + "epoch": 0.6281759243034869, + "grad_norm": 22.618567158406375, + "kl": 0.11865234375, + "learning_rate": 3.719992991063606e-07, + "loss": 0.0474, + "reward": 1.9034717082977295, + "reward_std": 0.1325203776359558, + "rewards/accuracy_reward_stage2": 0.9034717082977295, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3585 + }, + { + "completion_length": 12.0, + "epoch": 0.6283511477133346, + "grad_norm": 19.571473796459003, + "kl": 0.06494140625, + "learning_rate": 3.7182407569651304e-07, + "loss": 0.0259, + "reward": 1.6627893447875977, + "reward_std": 0.3550585210323334, + "rewards/accuracy_reward_stage2": 0.6627893447875977, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3586 + }, + { + "completion_length": 11.125, + "epoch": 0.6285263711231821, + "grad_norm": 20.24306501516782, + "kl": 0.205078125, + "learning_rate": 3.716488522866655e-07, + "loss": 0.0378, + "reward": 1.3590320348739624, + "reward_std": 0.2013445496559143, + "rewards/accuracy_reward_stage2": 0.4996569752693176, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3587 + }, + { + "completion_length": 11.1875, + "epoch": 0.6287015945330297, + "grad_norm": 19.460810861658555, + "kl": 0.197265625, + "learning_rate": 3.714736288768179e-07, + "loss": 0.013, + "reward": 1.4457964897155762, + "reward_std": 0.19221317768096924, + "rewards/accuracy_reward_stage2": 0.602046549320221, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3588 + }, + { + "completion_length": 7.53125, + "epoch": 0.6288768179428772, + "grad_norm": 23.264413559011846, + "kl": 0.10546875, + "learning_rate": 3.712984054669704e-07, + "loss": 0.042, + "reward": 1.7394332885742188, + "reward_std": 0.20407696068286896, + "rewards/accuracy_reward_stage2": 0.7394333481788635, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3589 + }, + { + "completion_length": 11.015625, + "epoch": 0.6290520413527247, + "grad_norm": 22.753318894193853, + "kl": 0.1005859375, + "learning_rate": 3.7112318205712285e-07, + "loss": 0.0402, + "reward": 1.489595651626587, + "reward_std": 0.2987571656703949, + "rewards/accuracy_reward_stage2": 0.48959559202194214, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3590 + }, + { + "completion_length": 11.0625, + "epoch": 0.6292272647625723, + "grad_norm": 19.31772370933449, + "kl": 0.02783203125, + "learning_rate": 3.7094795864727524e-07, + "loss": 0.0111, + "reward": 1.5470237731933594, + "reward_std": 0.22848649322986603, + "rewards/accuracy_reward_stage2": 0.5470237731933594, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3591 + }, + { + "completion_length": 7.984375, + "epoch": 0.6294024881724198, + "grad_norm": 19.530075409106047, + "kl": 0.1259765625, + "learning_rate": 3.707727352374277e-07, + "loss": -0.036, + "reward": 1.4901680946350098, + "reward_std": 0.2815321087837219, + "rewards/accuracy_reward_stage2": 0.6464180946350098, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3592 + }, + { + "completion_length": 10.90625, + "epoch": 0.6295777115822674, + "grad_norm": 18.345966770518224, + "kl": 0.197265625, + "learning_rate": 3.7059751182758017e-07, + "loss": -0.0039, + "reward": 1.6659647226333618, + "reward_std": 0.2305700182914734, + "rewards/accuracy_reward_stage2": 0.6972147226333618, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3593 + }, + { + "completion_length": 11.03125, + "epoch": 0.6297529349921149, + "grad_norm": 21.17499941283475, + "kl": 0.22265625, + "learning_rate": 3.704222884177326e-07, + "loss": -0.0711, + "reward": 1.5663416385650635, + "reward_std": 0.3491690754890442, + "rewards/accuracy_reward_stage2": 0.6444666385650635, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3594 + }, + { + "completion_length": 9.671875, + "epoch": 0.6299281584019625, + "grad_norm": 14.065463779336932, + "kl": 0.146484375, + "learning_rate": 3.7024706500788505e-07, + "loss": -0.0295, + "reward": 1.5848331451416016, + "reward_std": 0.18891380727291107, + "rewards/accuracy_reward_stage2": 0.6160831451416016, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3595 + }, + { + "completion_length": 7.15625, + "epoch": 0.63010338181181, + "grad_norm": 21.11957075148225, + "kl": 0.1328125, + "learning_rate": 3.700718415980375e-07, + "loss": 0.0242, + "reward": 1.5459332466125488, + "reward_std": 0.25915971398353577, + "rewards/accuracy_reward_stage2": 0.5615583658218384, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3596 + }, + { + "completion_length": 12.671875, + "epoch": 0.6302786052216576, + "grad_norm": 19.24932292881549, + "kl": 0.263671875, + "learning_rate": 3.698966181881899e-07, + "loss": 0.0387, + "reward": 1.4856727123260498, + "reward_std": 0.3313537538051605, + "rewards/accuracy_reward_stage2": 0.6419227719306946, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3597 + }, + { + "completion_length": 12.65625, + "epoch": 0.6304538286315051, + "grad_norm": 15.765998728489336, + "kl": 0.1015625, + "learning_rate": 3.6972139477834237e-07, + "loss": 0.0408, + "reward": 1.6163502931594849, + "reward_std": 0.09977184236049652, + "rewards/accuracy_reward_stage2": 0.6163503527641296, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3598 + }, + { + "completion_length": 9.46875, + "epoch": 0.6306290520413528, + "grad_norm": 6.389241941568242, + "kl": 0.05810546875, + "learning_rate": 3.695461713684948e-07, + "loss": -0.0102, + "reward": 1.5, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3599 + }, + { + "completion_length": 10.453125, + "epoch": 0.6308042754512003, + "grad_norm": 18.805567669260828, + "kl": 0.150390625, + "learning_rate": 3.6937094795864725e-07, + "loss": 0.016, + "reward": 1.7800894975662231, + "reward_std": 0.1909758448600769, + "rewards/accuracy_reward_stage2": 0.7957144975662231, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3600 + }, + { + "completion_length": 11.125, + "epoch": 0.6309794988610479, + "grad_norm": 19.18921620561482, + "kl": 0.1376953125, + "learning_rate": 3.6919572454879974e-07, + "loss": -0.0322, + "reward": 1.4479596614837646, + "reward_std": 0.2607450485229492, + "rewards/accuracy_reward_stage2": 0.4948346018791199, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3601 + }, + { + "completion_length": 11.25, + "epoch": 0.6311547222708954, + "grad_norm": 16.58693480436364, + "kl": 0.04638671875, + "learning_rate": 3.690205011389522e-07, + "loss": 0.0185, + "reward": 1.5900869369506836, + "reward_std": 0.13465861976146698, + "rewards/accuracy_reward_stage2": 0.5900869369506836, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3602 + }, + { + "completion_length": 12.09375, + "epoch": 0.631329945680743, + "grad_norm": 14.017275262683858, + "kl": 0.1162109375, + "learning_rate": 3.6884527772910456e-07, + "loss": 0.0109, + "reward": 1.3748043775558472, + "reward_std": 0.25310125946998596, + "rewards/accuracy_reward_stage2": 0.6404293179512024, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3603 + }, + { + "completion_length": 7.203125, + "epoch": 0.6315051690905905, + "grad_norm": 17.0023380519842, + "kl": 0.15625, + "learning_rate": 3.68670054319257e-07, + "loss": 0.0182, + "reward": 1.296875, + "reward_std": 0.25726157426834106, + "rewards/accuracy_reward_stage2": 0.5625, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3604 + }, + { + "completion_length": 9.078125, + "epoch": 0.6316803925004381, + "grad_norm": 17.37072153704894, + "kl": 0.07421875, + "learning_rate": 3.684948309094095e-07, + "loss": 0.0298, + "reward": 1.6908620595932007, + "reward_std": 0.15819929540157318, + "rewards/accuracy_reward_stage2": 0.6908620595932007, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3605 + }, + { + "completion_length": 8.4375, + "epoch": 0.6318556159102856, + "grad_norm": 17.15164377499678, + "kl": 0.1572265625, + "learning_rate": 3.6831960749956194e-07, + "loss": 0.0433, + "reward": 1.407257318496704, + "reward_std": 0.17434148490428925, + "rewards/accuracy_reward_stage2": 0.4385073184967041, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3606 + }, + { + "completion_length": 8.375, + "epoch": 0.6320308393201332, + "grad_norm": 13.219982651126623, + "kl": 0.1044921875, + "learning_rate": 3.681443840897144e-07, + "loss": 0.0047, + "reward": 1.6666977405548096, + "reward_std": 0.082484170794487, + "rewards/accuracy_reward_stage2": 0.8073228597640991, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3607 + }, + { + "completion_length": 33.09375, + "epoch": 0.6322060627299807, + "grad_norm": 20.00487017872797, + "kl": 0.123046875, + "learning_rate": 3.679691606798668e-07, + "loss": 0.0052, + "reward": 1.6811567544937134, + "reward_std": 0.2361474633216858, + "rewards/accuracy_reward_stage2": 0.6967816352844238, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3608 + }, + { + "completion_length": 10.28125, + "epoch": 0.6323812861398282, + "grad_norm": 20.55657196512188, + "kl": 0.045166015625, + "learning_rate": 3.677939372700193e-07, + "loss": 0.0181, + "reward": 1.3361544609069824, + "reward_std": 0.2871898114681244, + "rewards/accuracy_reward_stage2": 0.33615443110466003, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3609 + }, + { + "completion_length": 7.140625, + "epoch": 0.6325565095496758, + "grad_norm": 16.762029088279338, + "kl": 0.1005859375, + "learning_rate": 3.676187138601717e-07, + "loss": 0.0402, + "reward": 1.59446382522583, + "reward_std": 0.15326127409934998, + "rewards/accuracy_reward_stage2": 0.5944638848304749, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3610 + }, + { + "completion_length": 7.515625, + "epoch": 0.6327317329595233, + "grad_norm": 23.174955130615718, + "kl": 0.1904296875, + "learning_rate": 3.6744349045032413e-07, + "loss": 0.01, + "reward": 1.806498408317566, + "reward_std": 0.23249441385269165, + "rewards/accuracy_reward_stage2": 0.8377484083175659, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3611 + }, + { + "completion_length": 8.21875, + "epoch": 0.632906956369371, + "grad_norm": 20.866554282875175, + "kl": 0.103515625, + "learning_rate": 3.672682670404766e-07, + "loss": 0.0144, + "reward": 1.630176067352295, + "reward_std": 0.23904339969158173, + "rewards/accuracy_reward_stage2": 0.6458011269569397, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3612 + }, + { + "completion_length": 11.828125, + "epoch": 0.6330821797792185, + "grad_norm": 18.307211353396795, + "kl": 0.169921875, + "learning_rate": 3.6709304363062907e-07, + "loss": -0.0082, + "reward": 1.4270917177200317, + "reward_std": 0.200740247964859, + "rewards/accuracy_reward_stage2": 0.5833417177200317, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3613 + }, + { + "completion_length": 10.9375, + "epoch": 0.6332574031890661, + "grad_norm": 18.065419136464577, + "kl": 0.11376953125, + "learning_rate": 3.669178202207815e-07, + "loss": 0.024, + "reward": 1.4780011177062988, + "reward_std": 0.2074006348848343, + "rewards/accuracy_reward_stage2": 0.4936261773109436, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3614 + }, + { + "completion_length": 7.546875, + "epoch": 0.6334326265989136, + "grad_norm": 16.572006317014594, + "kl": 0.048828125, + "learning_rate": 3.6674259681093395e-07, + "loss": 0.0196, + "reward": 1.697108507156372, + "reward_std": 0.10851763188838959, + "rewards/accuracy_reward_stage2": 0.6971083879470825, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3615 + }, + { + "completion_length": 9.78125, + "epoch": 0.6336078500087612, + "grad_norm": 14.499401917716359, + "kl": 0.1533203125, + "learning_rate": 3.6656737340108633e-07, + "loss": -0.0012, + "reward": 1.6847833395004272, + "reward_std": 0.1449342668056488, + "rewards/accuracy_reward_stage2": 0.7160332798957825, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3616 + }, + { + "completion_length": 8.75, + "epoch": 0.6337830734186087, + "grad_norm": 17.256413793076284, + "kl": 0.1787109375, + "learning_rate": 3.6639214999123877e-07, + "loss": -0.0611, + "reward": 1.5829840898513794, + "reward_std": 0.28531354665756226, + "rewards/accuracy_reward_stage2": 0.6298590302467346, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3617 + }, + { + "completion_length": 9.15625, + "epoch": 0.6339582968284563, + "grad_norm": 16.62546792326556, + "kl": 0.1396484375, + "learning_rate": 3.6621692658139126e-07, + "loss": 0.056, + "reward": 1.4859604835510254, + "reward_std": 0.13596408069133759, + "rewards/accuracy_reward_stage2": 0.6109604835510254, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3618 + }, + { + "completion_length": 8.171875, + "epoch": 0.6341335202383038, + "grad_norm": 21.861338130303448, + "kl": 0.049072265625, + "learning_rate": 3.660417031715437e-07, + "loss": 0.0196, + "reward": 1.789048671722412, + "reward_std": 0.15189215540885925, + "rewards/accuracy_reward_stage2": 0.7890486121177673, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3619 + }, + { + "completion_length": 10.609375, + "epoch": 0.6343087436481514, + "grad_norm": 18.131394365573538, + "kl": 0.046875, + "learning_rate": 3.6586647976169614e-07, + "loss": 0.0188, + "reward": 1.332848310470581, + "reward_std": 0.19504520297050476, + "rewards/accuracy_reward_stage2": 0.33284837007522583, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3620 + }, + { + "completion_length": 6.203125, + "epoch": 0.6344839670579989, + "grad_norm": 24.954239353297936, + "kl": 0.1064453125, + "learning_rate": 3.656912563518486e-07, + "loss": 0.0425, + "reward": 1.6431493759155273, + "reward_std": 0.28684133291244507, + "rewards/accuracy_reward_stage2": 0.6431494355201721, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3621 + }, + { + "completion_length": 14.90625, + "epoch": 0.6346591904678465, + "grad_norm": 20.359486450098274, + "kl": 0.1220703125, + "learning_rate": 3.655160329420011e-07, + "loss": 0.0102, + "reward": 1.1454540491104126, + "reward_std": 0.30089348554611206, + "rewards/accuracy_reward_stage2": 0.4110791087150574, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3622 + }, + { + "completion_length": 17.796875, + "epoch": 0.634834413877694, + "grad_norm": 23.201390222878917, + "kl": 0.1455078125, + "learning_rate": 3.6534080953215346e-07, + "loss": 0.0142, + "reward": 1.383394718170166, + "reward_std": 0.2742749750614166, + "rewards/accuracy_reward_stage2": 0.5240197777748108, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3623 + }, + { + "completion_length": 10.0625, + "epoch": 0.6350096372875416, + "grad_norm": 14.074983708968212, + "kl": 0.11376953125, + "learning_rate": 3.651655861223059e-07, + "loss": -0.0303, + "reward": 1.8645833730697632, + "reward_std": 0.1918574422597885, + "rewards/accuracy_reward_stage2": 0.8958333730697632, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3624 + }, + { + "completion_length": 13.515625, + "epoch": 0.6351848606973892, + "grad_norm": 35.487603494256746, + "kl": 0.109375, + "learning_rate": 3.6499036271245834e-07, + "loss": 0.0437, + "reward": 1.4434072971343994, + "reward_std": 0.3074309527873993, + "rewards/accuracy_reward_stage2": 0.5684072971343994, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3625 + }, + { + "completion_length": 10.453125, + "epoch": 0.6353600841072368, + "grad_norm": 17.94171065619323, + "kl": 0.2177734375, + "learning_rate": 3.6481513930261083e-07, + "loss": 0.0453, + "reward": 1.6170685291290283, + "reward_std": 0.19411921501159668, + "rewards/accuracy_reward_stage2": 0.7576935291290283, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3626 + }, + { + "completion_length": 7.171875, + "epoch": 0.6355353075170843, + "grad_norm": 18.337787479534004, + "kl": 0.1455078125, + "learning_rate": 3.646399158927633e-07, + "loss": -0.0252, + "reward": 1.666426181793213, + "reward_std": 0.22929048538208008, + "rewards/accuracy_reward_stage2": 0.6976761817932129, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3627 + }, + { + "completion_length": 4.671875, + "epoch": 0.6357105309269319, + "grad_norm": 11.80166842802175, + "kl": 0.041015625, + "learning_rate": 3.644646924829157e-07, + "loss": 0.0164, + "reward": 1.5, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward_stage2": 0.5, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3628 + }, + { + "completion_length": 9.6875, + "epoch": 0.6358857543367794, + "grad_norm": 14.412459917876111, + "kl": 0.099609375, + "learning_rate": 3.642894690730681e-07, + "loss": 0.0399, + "reward": 1.7028069496154785, + "reward_std": 0.1426396518945694, + "rewards/accuracy_reward_stage2": 0.7028070092201233, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3629 + }, + { + "completion_length": 10.75, + "epoch": 0.636060977746627, + "grad_norm": 21.876442571206635, + "kl": 0.064453125, + "learning_rate": 3.641142456632206e-07, + "loss": 0.0257, + "reward": 1.6312143802642822, + "reward_std": 0.364946186542511, + "rewards/accuracy_reward_stage2": 0.6312142610549927, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3630 + }, + { + "completion_length": 7.453125, + "epoch": 0.6362362011564745, + "grad_norm": 22.977326990070033, + "kl": 0.2119140625, + "learning_rate": 3.6393902225337303e-07, + "loss": -0.0167, + "reward": 1.5634841918945312, + "reward_std": 0.19824695587158203, + "rewards/accuracy_reward_stage2": 0.610359251499176, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3631 + }, + { + "completion_length": 12.09375, + "epoch": 0.636411424566322, + "grad_norm": 18.88546650132665, + "kl": 0.10009765625, + "learning_rate": 3.6376379884352547e-07, + "loss": 0.0401, + "reward": 1.6160304546356201, + "reward_std": 0.26513588428497314, + "rewards/accuracy_reward_stage2": 0.6160303354263306, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3632 + }, + { + "completion_length": 9.203125, + "epoch": 0.6365866479761696, + "grad_norm": 17.150632416002303, + "kl": 0.0634765625, + "learning_rate": 3.635885754336779e-07, + "loss": 0.0254, + "reward": 1.4632712602615356, + "reward_std": 0.1401752084493637, + "rewards/accuracy_reward_stage2": 0.5882712602615356, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3633 + }, + { + "completion_length": 15.359375, + "epoch": 0.6367618713860171, + "grad_norm": 23.459706095482968, + "kl": 0.15234375, + "learning_rate": 3.634133520238304e-07, + "loss": -0.0093, + "reward": 1.4876947402954102, + "reward_std": 0.20976431667804718, + "rewards/accuracy_reward_stage2": 0.6439447402954102, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3634 + }, + { + "completion_length": 8.765625, + "epoch": 0.6369370947958647, + "grad_norm": 16.70197187367129, + "kl": 0.25, + "learning_rate": 3.632381286139828e-07, + "loss": 0.0206, + "reward": 1.3416086435317993, + "reward_std": 0.246791273355484, + "rewards/accuracy_reward_stage2": 0.5134836435317993, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3635 + }, + { + "completion_length": 9.3125, + "epoch": 0.6371123182057122, + "grad_norm": 22.943919776948466, + "kl": 0.1201171875, + "learning_rate": 3.6306290520413523e-07, + "loss": 0.048, + "reward": 1.6054027080535889, + "reward_std": 0.30763548612594604, + "rewards/accuracy_reward_stage2": 0.6054026484489441, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3636 + }, + { + "completion_length": 8.71875, + "epoch": 0.6372875416155599, + "grad_norm": 34.82726050972871, + "kl": 0.1650390625, + "learning_rate": 3.6288768179428767e-07, + "loss": 0.0659, + "reward": 1.390625, + "reward_std": 0.30721208453178406, + "rewards/accuracy_reward_stage2": 0.515625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3637 + }, + { + "completion_length": 14.46875, + "epoch": 0.6374627650254074, + "grad_norm": 18.081020575075673, + "kl": 0.05078125, + "learning_rate": 3.6271245838444016e-07, + "loss": 0.0203, + "reward": 1.5074986219406128, + "reward_std": 0.14961406588554382, + "rewards/accuracy_reward_stage2": 0.5074986219406128, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3638 + }, + { + "completion_length": 15.0625, + "epoch": 0.637637988435255, + "grad_norm": 19.973849226311064, + "kl": 0.173828125, + "learning_rate": 3.625372349745926e-07, + "loss": 0.0565, + "reward": 1.2945737838745117, + "reward_std": 0.2766297161579132, + "rewards/accuracy_reward_stage2": 0.31019875407218933, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3639 + }, + { + "completion_length": 9.65625, + "epoch": 0.6378132118451025, + "grad_norm": 23.292542316914666, + "kl": 0.103515625, + "learning_rate": 3.6236201156474504e-07, + "loss": 0.0414, + "reward": 1.5685745477676392, + "reward_std": 0.2816503643989563, + "rewards/accuracy_reward_stage2": 0.5685745477676392, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3640 + }, + { + "completion_length": 8.40625, + "epoch": 0.6379884352549501, + "grad_norm": 13.730848138316576, + "kl": 0.09716796875, + "learning_rate": 3.621867881548975e-07, + "loss": 0.0388, + "reward": 1.6940895318984985, + "reward_std": 0.16377386450767517, + "rewards/accuracy_reward_stage2": 0.8190895318984985, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3641 + }, + { + "completion_length": 13.484375, + "epoch": 0.6381636586647976, + "grad_norm": 19.131087121818375, + "kl": 0.107421875, + "learning_rate": 3.620115647450499e-07, + "loss": -0.0454, + "reward": 1.4821650981903076, + "reward_std": 0.2997850179672241, + "rewards/accuracy_reward_stage2": 0.5134150981903076, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3642 + }, + { + "completion_length": 10.328125, + "epoch": 0.6383388820746452, + "grad_norm": 13.516975395098173, + "kl": 0.046875, + "learning_rate": 3.6183634133520236e-07, + "loss": -0.0254, + "reward": 1.5661125183105469, + "reward_std": 0.21004100143909454, + "rewards/accuracy_reward_stage2": 0.7067373991012573, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3643 + }, + { + "completion_length": 10.171875, + "epoch": 0.6385141054844927, + "grad_norm": 15.89953798492155, + "kl": 0.14453125, + "learning_rate": 3.616611179253548e-07, + "loss": 0.0187, + "reward": 1.6142473220825195, + "reward_std": 0.1787492334842682, + "rewards/accuracy_reward_stage2": 0.7548723220825195, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3644 + }, + { + "completion_length": 12.953125, + "epoch": 0.6386893288943403, + "grad_norm": 17.22223205879188, + "kl": 0.25390625, + "learning_rate": 3.6148589451550724e-07, + "loss": 0.0136, + "reward": 1.6069388389587402, + "reward_std": 0.340746134519577, + "rewards/accuracy_reward_stage2": 0.6381888389587402, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3645 + }, + { + "completion_length": 13.46875, + "epoch": 0.6388645523041878, + "grad_norm": 25.223753101461224, + "kl": 0.04833984375, + "learning_rate": 3.6131067110565973e-07, + "loss": 0.0193, + "reward": 1.7002465724945068, + "reward_std": 0.28130415081977844, + "rewards/accuracy_reward_stage2": 0.7002465128898621, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3646 + }, + { + "completion_length": 20.359375, + "epoch": 0.6390397757140354, + "grad_norm": 21.772746332061597, + "kl": 0.11376953125, + "learning_rate": 3.6113544769581217e-07, + "loss": -0.0353, + "reward": 1.5794328451156616, + "reward_std": 0.22146174311637878, + "rewards/accuracy_reward_stage2": 0.6106828451156616, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3647 + }, + { + "completion_length": 31.984375, + "epoch": 0.6392149991238829, + "grad_norm": 9.787206170137264, + "kl": 0.0284423828125, + "learning_rate": 3.6096022428596456e-07, + "loss": 0.0114, + "reward": 1.4354877471923828, + "reward_std": 0.09317904710769653, + "rewards/accuracy_reward_stage2": 0.4354877173900604, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3648 + }, + { + "completion_length": 18.84375, + "epoch": 0.6393902225337305, + "grad_norm": 18.024907451498215, + "kl": 0.1708984375, + "learning_rate": 3.60785000876117e-07, + "loss": 0.0284, + "reward": 1.301550269126892, + "reward_std": 0.17121167480945587, + "rewards/accuracy_reward_stage2": 0.4421752393245697, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3649 + }, + { + "completion_length": 7.515625, + "epoch": 0.6395654459435781, + "grad_norm": 33.71627384871996, + "kl": 0.119140625, + "learning_rate": 3.606097774662695e-07, + "loss": -0.0067, + "reward": 1.144614577293396, + "reward_std": 0.19346420466899872, + "rewards/accuracy_reward_stage2": 0.300864577293396, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3650 + }, + { + "completion_length": 9.65625, + "epoch": 0.6397406693534257, + "grad_norm": 16.026084326842717, + "kl": 0.099609375, + "learning_rate": 3.6043455405642193e-07, + "loss": 0.0397, + "reward": 1.447823166847229, + "reward_std": 0.18850518763065338, + "rewards/accuracy_reward_stage2": 0.5728232264518738, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3651 + }, + { + "completion_length": 10.765625, + "epoch": 0.6399158927632732, + "grad_norm": 19.035674298729102, + "kl": 0.1494140625, + "learning_rate": 3.6025933064657437e-07, + "loss": -0.0211, + "reward": 1.4274215698242188, + "reward_std": 0.2881200909614563, + "rewards/accuracy_reward_stage2": 0.45867156982421875, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3652 + }, + { + "completion_length": 11.515625, + "epoch": 0.6400911161731208, + "grad_norm": 16.483873807580256, + "kl": 0.1640625, + "learning_rate": 3.600841072367268e-07, + "loss": -0.0587, + "reward": 1.6582226753234863, + "reward_std": 0.2703021466732025, + "rewards/accuracy_reward_stage2": 0.7050977349281311, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3653 + }, + { + "completion_length": 4.8125, + "epoch": 0.6402663395829683, + "grad_norm": 13.76375064303439, + "kl": 0.0196533203125, + "learning_rate": 3.5990888382687925e-07, + "loss": 0.0078, + "reward": 1.9034197330474854, + "reward_std": 0.05980297550559044, + "rewards/accuracy_reward_stage2": 0.9034197330474854, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3654 + }, + { + "completion_length": 13.21875, + "epoch": 0.6404415629928158, + "grad_norm": 18.985032534329388, + "kl": 0.0810546875, + "learning_rate": 3.597336604170317e-07, + "loss": 0.0325, + "reward": 1.4299408197402954, + "reward_std": 0.20411565899848938, + "rewards/accuracy_reward_stage2": 0.5549408197402954, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3655 + }, + { + "completion_length": 10.40625, + "epoch": 0.6406167864026634, + "grad_norm": 13.552495749076439, + "kl": 0.0308837890625, + "learning_rate": 3.5955843700718413e-07, + "loss": 0.0123, + "reward": 1.6765105724334717, + "reward_std": 0.1349574625492096, + "rewards/accuracy_reward_stage2": 0.6765106916427612, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3656 + }, + { + "completion_length": 9.953125, + "epoch": 0.6407920098125109, + "grad_norm": 23.11657657631506, + "kl": 0.1044921875, + "learning_rate": 3.5938321359733657e-07, + "loss": 0.0418, + "reward": 1.3100368976593018, + "reward_std": 0.3031858801841736, + "rewards/accuracy_reward_stage2": 0.435036838054657, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3657 + }, + { + "completion_length": 8.953125, + "epoch": 0.6409672332223585, + "grad_norm": 15.631149227925766, + "kl": 0.1826171875, + "learning_rate": 3.5920799018748906e-07, + "loss": -0.0026, + "reward": 1.4643263816833496, + "reward_std": 0.3079353868961334, + "rewards/accuracy_reward_stage2": 0.6205763816833496, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3658 + }, + { + "completion_length": 10.421875, + "epoch": 0.641142456632206, + "grad_norm": 28.95809032526528, + "kl": 0.2314453125, + "learning_rate": 3.590327667776415e-07, + "loss": 0.061, + "reward": 1.4065438508987427, + "reward_std": 0.3178490400314331, + "rewards/accuracy_reward_stage2": 0.5471689105033875, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3659 + }, + { + "completion_length": 10.1875, + "epoch": 0.6413176800420536, + "grad_norm": 12.313051803100334, + "kl": 0.1728515625, + "learning_rate": 3.5885754336779394e-07, + "loss": -0.0482, + "reward": 1.7476816177368164, + "reward_std": 0.24349580705165863, + "rewards/accuracy_reward_stage2": 0.7945566177368164, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3660 + }, + { + "completion_length": 9.484375, + "epoch": 0.6414929034519011, + "grad_norm": 13.941713403383208, + "kl": 0.1162109375, + "learning_rate": 3.5868231995794633e-07, + "loss": -0.0419, + "reward": 1.8125, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward_stage2": 0.84375, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3661 + }, + { + "completion_length": 14.203125, + "epoch": 0.6416681268617487, + "grad_norm": 20.011229627655002, + "kl": 0.2255859375, + "learning_rate": 3.585070965480988e-07, + "loss": -0.0426, + "reward": 1.6828477382659912, + "reward_std": 0.2681369185447693, + "rewards/accuracy_reward_stage2": 0.7453478574752808, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3662 + }, + { + "completion_length": 15.46875, + "epoch": 0.6418433502715963, + "grad_norm": 16.98900105517778, + "kl": 0.0238037109375, + "learning_rate": 3.5833187313825126e-07, + "loss": 0.0095, + "reward": 1.6922528743743896, + "reward_std": 0.09544496238231659, + "rewards/accuracy_reward_stage2": 0.6922527551651001, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3663 + }, + { + "completion_length": 12.71875, + "epoch": 0.6420185736814439, + "grad_norm": 15.895188337342285, + "kl": 0.1064453125, + "learning_rate": 3.581566497284037e-07, + "loss": 0.0426, + "reward": 1.7854351997375488, + "reward_std": 0.12646767497062683, + "rewards/accuracy_reward_stage2": 0.910435140132904, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3664 + }, + { + "completion_length": 8.5625, + "epoch": 0.6421937970912914, + "grad_norm": 14.632761302952792, + "kl": 0.0986328125, + "learning_rate": 3.5798142631855614e-07, + "loss": -0.0048, + "reward": 1.78125, + "reward_std": 0.1462521106004715, + "rewards/accuracy_reward_stage2": 0.796875, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3665 + }, + { + "completion_length": 10.109375, + "epoch": 0.642369020501139, + "grad_norm": 19.81936363509419, + "kl": 0.142578125, + "learning_rate": 3.5780620290870863e-07, + "loss": -0.0106, + "reward": 1.6112961769104004, + "reward_std": 0.2645382881164551, + "rewards/accuracy_reward_stage2": 0.6425461769104004, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3666 + }, + { + "completion_length": 10.28125, + "epoch": 0.6425442439109865, + "grad_norm": 14.471599410569507, + "kl": 0.1826171875, + "learning_rate": 3.57630979498861e-07, + "loss": -0.0099, + "reward": 1.6287720203399658, + "reward_std": 0.21368272602558136, + "rewards/accuracy_reward_stage2": 0.660021960735321, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3667 + }, + { + "completion_length": 13.078125, + "epoch": 0.6427194673208341, + "grad_norm": 13.667389974234641, + "kl": 0.09765625, + "learning_rate": 3.5745575608901346e-07, + "loss": -0.0051, + "reward": 1.296875, + "reward_std": 0.1530819833278656, + "rewards/accuracy_reward_stage2": 0.3125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3668 + }, + { + "completion_length": 9.71875, + "epoch": 0.6428946907306816, + "grad_norm": 18.914206371370327, + "kl": 0.10205078125, + "learning_rate": 3.572805326791659e-07, + "loss": 0.0077, + "reward": 1.6069194078445435, + "reward_std": 0.2152298241853714, + "rewards/accuracy_reward_stage2": 0.6225443482398987, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3669 + }, + { + "completion_length": 19.4375, + "epoch": 0.6430699141405292, + "grad_norm": 15.692748843568294, + "kl": 0.154296875, + "learning_rate": 3.571053092693184e-07, + "loss": -0.0261, + "reward": 1.3200486898422241, + "reward_std": 0.2600979506969452, + "rewards/accuracy_reward_stage2": 0.35129863023757935, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3670 + }, + { + "completion_length": 10.84375, + "epoch": 0.6432451375503767, + "grad_norm": 20.713042140015347, + "kl": 0.154296875, + "learning_rate": 3.5693008585947083e-07, + "loss": 0.0615, + "reward": 1.6474779844284058, + "reward_std": 0.19449205696582794, + "rewards/accuracy_reward_stage2": 0.7724780440330505, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3671 + }, + { + "completion_length": 7.90625, + "epoch": 0.6434203609602243, + "grad_norm": 18.939598476065225, + "kl": 0.09423828125, + "learning_rate": 3.5675486244962327e-07, + "loss": 0.0013, + "reward": 1.4366912841796875, + "reward_std": 0.11813464760780334, + "rewards/accuracy_reward_stage2": 0.45231637358665466, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3672 + }, + { + "completion_length": 8.59375, + "epoch": 0.6435955843700718, + "grad_norm": 22.560695552345376, + "kl": 0.09375, + "learning_rate": 3.5657963903977566e-07, + "loss": 0.0208, + "reward": 1.534196138381958, + "reward_std": 0.25116610527038574, + "rewards/accuracy_reward_stage2": 0.5498210787773132, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3673 + }, + { + "completion_length": 15.40625, + "epoch": 0.6437708077799194, + "grad_norm": 22.839026411280702, + "kl": 0.091796875, + "learning_rate": 3.5640441562992815e-07, + "loss": 0.0366, + "reward": 1.703125, + "reward_std": 0.2633790373802185, + "rewards/accuracy_reward_stage2": 0.828125, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3674 + }, + { + "completion_length": 13.484375, + "epoch": 0.6439460311897669, + "grad_norm": 30.310752237469682, + "kl": 0.130859375, + "learning_rate": 3.562291922200806e-07, + "loss": 0.0081, + "reward": 1.406123161315918, + "reward_std": 0.16176798939704895, + "rewards/accuracy_reward_stage2": 0.546748161315918, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3675 + }, + { + "completion_length": 13.109375, + "epoch": 0.6441212545996146, + "grad_norm": 17.7024621132698, + "kl": 0.15234375, + "learning_rate": 3.5605396881023303e-07, + "loss": 0.0167, + "reward": 1.5824267864227295, + "reward_std": 0.2084314227104187, + "rewards/accuracy_reward_stage2": 0.5980518460273743, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3676 + }, + { + "completion_length": 13.15625, + "epoch": 0.6442964780094621, + "grad_norm": 17.017036528647274, + "kl": 0.11572265625, + "learning_rate": 3.5587874540038547e-07, + "loss": 0.0462, + "reward": 1.4068691730499268, + "reward_std": 0.1746881902217865, + "rewards/accuracy_reward_stage2": 0.6568692326545715, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3677 + }, + { + "completion_length": 9.6875, + "epoch": 0.6444717014193097, + "grad_norm": 19.01532393777833, + "kl": 0.25390625, + "learning_rate": 3.5570352199053796e-07, + "loss": 0.0139, + "reward": 1.7672874927520752, + "reward_std": 0.3108041286468506, + "rewards/accuracy_reward_stage2": 0.8141624927520752, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3678 + }, + { + "completion_length": 12.640625, + "epoch": 0.6446469248291572, + "grad_norm": 22.816178841710844, + "kl": 0.1806640625, + "learning_rate": 3.555282985806904e-07, + "loss": 0.0281, + "reward": 1.6114469766616821, + "reward_std": 0.24365541338920593, + "rewards/accuracy_reward_stage2": 0.6270719766616821, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3679 + }, + { + "completion_length": 8.921875, + "epoch": 0.6448221482390047, + "grad_norm": 19.43465114242342, + "kl": 0.26171875, + "learning_rate": 3.553530751708428e-07, + "loss": 0.0547, + "reward": 1.5898686647415161, + "reward_std": 0.3100131154060364, + "rewards/accuracy_reward_stage2": 0.6211186647415161, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3680 + }, + { + "completion_length": 11.578125, + "epoch": 0.6449973716488523, + "grad_norm": 21.807378310853615, + "kl": 0.21484375, + "learning_rate": 3.551778517609952e-07, + "loss": 0.0047, + "reward": 1.3561477661132812, + "reward_std": 0.2760908603668213, + "rewards/accuracy_reward_stage2": 0.4967726767063141, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3681 + }, + { + "completion_length": 7.71875, + "epoch": 0.6451725950586998, + "grad_norm": 23.159563899251044, + "kl": 0.1298828125, + "learning_rate": 3.550026283511477e-07, + "loss": 0.0078, + "reward": 1.6781278848648071, + "reward_std": 0.25165823101997375, + "rewards/accuracy_reward_stage2": 0.6937528848648071, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3682 + }, + { + "completion_length": 7.03125, + "epoch": 0.6453478184685474, + "grad_norm": 15.508065266492686, + "kl": 0.10107421875, + "learning_rate": 3.5482740494130016e-07, + "loss": 0.0403, + "reward": 1.6627414226531982, + "reward_std": 0.17357571423053741, + "rewards/accuracy_reward_stage2": 0.7877414226531982, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3683 + }, + { + "completion_length": 9.953125, + "epoch": 0.6455230418783949, + "grad_norm": 20.11035846039172, + "kl": 0.1669921875, + "learning_rate": 3.546521815314526e-07, + "loss": 0.0064, + "reward": 1.4375, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward_stage2": 0.46875, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3684 + }, + { + "completion_length": 35.859375, + "epoch": 0.6456982652882425, + "grad_norm": 47.35011743804844, + "kl": 0.1123046875, + "learning_rate": 3.5447695812160504e-07, + "loss": 0.0009, + "reward": 1.5811420679092407, + "reward_std": 0.26867377758026123, + "rewards/accuracy_reward_stage2": 0.5967670679092407, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3685 + }, + { + "completion_length": 8.5625, + "epoch": 0.64587348869809, + "grad_norm": 19.582694344476494, + "kl": 0.083984375, + "learning_rate": 3.543017347117574e-07, + "loss": 0.0337, + "reward": 1.487224817276001, + "reward_std": 0.2638910114765167, + "rewards/accuracy_reward_stage2": 0.48722487688064575, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3686 + }, + { + "completion_length": 10.796875, + "epoch": 0.6460487121079376, + "grad_norm": 17.312223258225703, + "kl": 0.0830078125, + "learning_rate": 3.541265113019099e-07, + "loss": 0.0333, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward_stage2": 0.5625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3687 + }, + { + "completion_length": 9.15625, + "epoch": 0.6462239355177852, + "grad_norm": 22.85228117476544, + "kl": 0.1728515625, + "learning_rate": 3.5395128789206236e-07, + "loss": 0.034, + "reward": 1.21493399143219, + "reward_std": 0.31954365968704224, + "rewards/accuracy_reward_stage2": 0.48055899143218994, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3688 + }, + { + "completion_length": 9.5, + "epoch": 0.6463991589276328, + "grad_norm": 16.64890158145795, + "kl": 0.30859375, + "learning_rate": 3.537760644822148e-07, + "loss": 0.035, + "reward": 1.5417678356170654, + "reward_std": 0.23739787936210632, + "rewards/accuracy_reward_stage2": 0.6980177164077759, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3689 + }, + { + "completion_length": 14.671875, + "epoch": 0.6465743823374803, + "grad_norm": 18.750404329398975, + "kl": 0.150390625, + "learning_rate": 3.5360084107236724e-07, + "loss": -0.0516, + "reward": 1.405958890914917, + "reward_std": 0.1552903652191162, + "rewards/accuracy_reward_stage2": 0.7028338313102722, + "rewards/format_reward_stage1_pointerpad": 0.703125, + "scores/accuracy_reward_stage2": 0.703125, + "step": 3690 + }, + { + "completion_length": 12.125, + "epoch": 0.6467496057473279, + "grad_norm": 17.833589733517734, + "kl": 0.16796875, + "learning_rate": 3.5342561766251973e-07, + "loss": 0.0671, + "reward": 1.3854167461395264, + "reward_std": 0.2868278920650482, + "rewards/accuracy_reward_stage2": 0.6354166865348816, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3691 + }, + { + "completion_length": 18.65625, + "epoch": 0.6469248291571754, + "grad_norm": 30.050526122991396, + "kl": 0.322265625, + "learning_rate": 3.5325039425267217e-07, + "loss": 0.0414, + "reward": 1.224075198173523, + "reward_std": 0.26675575971603394, + "rewards/accuracy_reward_stage2": 0.27095019817352295, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3692 + }, + { + "completion_length": 9.203125, + "epoch": 0.647100052567023, + "grad_norm": 17.85586035935803, + "kl": 0.1611328125, + "learning_rate": 3.5307517084282455e-07, + "loss": 0.0002, + "reward": 1.3594422340393066, + "reward_std": 0.22799652814865112, + "rewards/accuracy_reward_stage2": 0.39069223403930664, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3693 + }, + { + "completion_length": 15.890625, + "epoch": 0.6472752759768705, + "grad_norm": 17.995748941269156, + "kl": 0.1533203125, + "learning_rate": 3.52899947432977e-07, + "loss": -0.0501, + "reward": 1.5937397480010986, + "reward_std": 0.12785354256629944, + "rewards/accuracy_reward_stage2": 0.6406147480010986, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3694 + }, + { + "completion_length": 12.734375, + "epoch": 0.6474504993867181, + "grad_norm": 17.172686057454797, + "kl": 0.0751953125, + "learning_rate": 3.527247240231295e-07, + "loss": 0.03, + "reward": 1.4971519708633423, + "reward_std": 0.17167343199253082, + "rewards/accuracy_reward_stage2": 0.4971519112586975, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3695 + }, + { + "completion_length": 7.9375, + "epoch": 0.6476257227965656, + "grad_norm": 13.774901205039306, + "kl": 0.12109375, + "learning_rate": 3.5254950061328193e-07, + "loss": 0.0068, + "reward": 1.6474002599716187, + "reward_std": 0.1538010835647583, + "rewards/accuracy_reward_stage2": 0.6630252599716187, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3696 + }, + { + "completion_length": 10.328125, + "epoch": 0.6478009462064132, + "grad_norm": 17.499256588420277, + "kl": 0.125, + "learning_rate": 3.5237427720343437e-07, + "loss": -0.0003, + "reward": 1.6396540403366089, + "reward_std": 0.19111916422843933, + "rewards/accuracy_reward_stage2": 0.6709039807319641, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3697 + }, + { + "completion_length": 13.6875, + "epoch": 0.6479761696162607, + "grad_norm": 55.37711546967327, + "kl": 0.349609375, + "learning_rate": 3.521990537935868e-07, + "loss": 0.0996, + "reward": 1.0965315103530884, + "reward_std": 0.27788692712783813, + "rewards/accuracy_reward_stage2": 0.37778154015541077, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3698 + }, + { + "completion_length": 12.09375, + "epoch": 0.6481513930261082, + "grad_norm": 21.027429870961353, + "kl": 0.047607421875, + "learning_rate": 3.5202383038373925e-07, + "loss": 0.019, + "reward": 1.5767583847045898, + "reward_std": 0.2807679772377014, + "rewards/accuracy_reward_stage2": 0.5767583250999451, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3699 + }, + { + "completion_length": 11.515625, + "epoch": 0.6483266164359558, + "grad_norm": 20.49667679086188, + "kl": 0.1328125, + "learning_rate": 3.518486069738917e-07, + "loss": 0.0614, + "reward": 1.5157562494277954, + "reward_std": 0.2441159188747406, + "rewards/accuracy_reward_stage2": 0.6407562494277954, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3700 + }, + { + "completion_length": 7.171875, + "epoch": 0.6485018398458035, + "grad_norm": 18.043544748787948, + "kl": 0.0791015625, + "learning_rate": 3.516733835640441e-07, + "loss": 0.0018, + "reward": 1.5832839012145996, + "reward_std": 0.2336646169424057, + "rewards/accuracy_reward_stage2": 0.7239089012145996, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3701 + }, + { + "completion_length": 7.140625, + "epoch": 0.648677063255651, + "grad_norm": 16.274809205252975, + "kl": 0.06396484375, + "learning_rate": 3.5149816015419656e-07, + "loss": 0.0255, + "reward": 1.7552083730697632, + "reward_std": 0.17123916745185852, + "rewards/accuracy_reward_stage2": 0.7552083730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3702 + }, + { + "completion_length": 9.265625, + "epoch": 0.6488522866654985, + "grad_norm": 19.514395198585934, + "kl": 0.09033203125, + "learning_rate": 3.5132293674434906e-07, + "loss": 0.0056, + "reward": 1.402266502380371, + "reward_std": 0.28506118059158325, + "rewards/accuracy_reward_stage2": 0.4178914427757263, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3703 + }, + { + "completion_length": 10.25, + "epoch": 0.6490275100753461, + "grad_norm": 21.652850702390413, + "kl": 0.0947265625, + "learning_rate": 3.511477133345015e-07, + "loss": -0.0063, + "reward": 1.5164008140563965, + "reward_std": 0.23606063425540924, + "rewards/accuracy_reward_stage2": 0.6570256948471069, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3704 + }, + { + "completion_length": 12.375, + "epoch": 0.6492027334851936, + "grad_norm": 19.437959017734133, + "kl": 0.09130859375, + "learning_rate": 3.509724899246539e-07, + "loss": 0.0022, + "reward": 1.1945466995239258, + "reward_std": 0.23989106714725494, + "rewards/accuracy_reward_stage2": 0.33517172932624817, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3705 + }, + { + "completion_length": 11.890625, + "epoch": 0.6493779568950412, + "grad_norm": 27.20928171381491, + "kl": 0.1328125, + "learning_rate": 3.507972665148063e-07, + "loss": 0.0533, + "reward": 1.1246411800384521, + "reward_std": 0.16782069206237793, + "rewards/accuracy_reward_stage2": 0.49964118003845215, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 3706 + }, + { + "completion_length": 7.953125, + "epoch": 0.6495531803048887, + "grad_norm": 15.36476070824679, + "kl": 0.11572265625, + "learning_rate": 3.506220431049588e-07, + "loss": 0.0147, + "reward": 1.591088056564331, + "reward_std": 0.14347587525844574, + "rewards/accuracy_reward_stage2": 0.6067129969596863, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3707 + }, + { + "completion_length": 12.0, + "epoch": 0.6497284037147363, + "grad_norm": 21.447431886758366, + "kl": 0.2353515625, + "learning_rate": 3.5044681969511126e-07, + "loss": -0.1017, + "reward": 1.5135695934295654, + "reward_std": 0.3412063717842102, + "rewards/accuracy_reward_stage2": 0.7010695934295654, + "rewards/format_reward_stage1_pointerpad": 0.8125, + "scores/accuracy_reward_stage2": 0.8125, + "step": 3708 + }, + { + "completion_length": 8.890625, + "epoch": 0.6499036271245838, + "grad_norm": 16.476302891145476, + "kl": 0.130859375, + "learning_rate": 3.502715962852637e-07, + "loss": 0.0523, + "reward": 1.5784977674484253, + "reward_std": 0.16375833749771118, + "rewards/accuracy_reward_stage2": 0.7034977674484253, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3709 + }, + { + "completion_length": 12.0, + "epoch": 0.6500788505344314, + "grad_norm": 18.689905011866053, + "kl": 0.12109375, + "learning_rate": 3.5009637287541613e-07, + "loss": 0.0075, + "reward": 1.5490682125091553, + "reward_std": 0.19050264358520508, + "rewards/accuracy_reward_stage2": 0.6896932125091553, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3710 + }, + { + "completion_length": 12.484375, + "epoch": 0.6502540739442789, + "grad_norm": 28.57532795015385, + "kl": 0.29296875, + "learning_rate": 3.4992114946556863e-07, + "loss": 0.0133, + "reward": 1.4129630327224731, + "reward_std": 0.27624937891960144, + "rewards/accuracy_reward_stage2": 0.5848380923271179, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3711 + }, + { + "completion_length": 11.421875, + "epoch": 0.6504292973541265, + "grad_norm": 18.780457042870765, + "kl": 0.1083984375, + "learning_rate": 3.49745926055721e-07, + "loss": 0.0433, + "reward": 1.4363348484039307, + "reward_std": 0.20736932754516602, + "rewards/accuracy_reward_stage2": 0.6863349676132202, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3712 + }, + { + "completion_length": 8.859375, + "epoch": 0.650604520763974, + "grad_norm": 16.69906655424036, + "kl": 0.06298828125, + "learning_rate": 3.4957070264587345e-07, + "loss": -0.0178, + "reward": 1.658174753189087, + "reward_std": 0.19843123853206635, + "rewards/accuracy_reward_stage2": 0.6737997531890869, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3713 + }, + { + "completion_length": 7.3125, + "epoch": 0.6507797441738217, + "grad_norm": 19.927210600002145, + "kl": 0.1552734375, + "learning_rate": 3.493954792360259e-07, + "loss": -0.0635, + "reward": 1.625319242477417, + "reward_std": 0.23469506204128265, + "rewards/accuracy_reward_stage2": 0.6721941828727722, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3714 + }, + { + "completion_length": 10.96875, + "epoch": 0.6509549675836692, + "grad_norm": 17.435068228575098, + "kl": 0.28515625, + "learning_rate": 3.492202558261784e-07, + "loss": 0.1136, + "reward": 1.1564494371414185, + "reward_std": 0.11023418605327606, + "rewards/accuracy_reward_stage2": 0.40644940733909607, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3715 + }, + { + "completion_length": 8.578125, + "epoch": 0.6511301909935168, + "grad_norm": 24.593109335212954, + "kl": 0.259765625, + "learning_rate": 3.490450324163308e-07, + "loss": 0.1039, + "reward": 1.4121159315109253, + "reward_std": 0.2062818557024002, + "rewards/accuracy_reward_stage2": 0.6621158719062805, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3716 + }, + { + "completion_length": 10.703125, + "epoch": 0.6513054144033643, + "grad_norm": 18.10507496549295, + "kl": 0.1884765625, + "learning_rate": 3.4886980900648326e-07, + "loss": 0.0111, + "reward": 1.7756869792938232, + "reward_std": 0.2656467854976654, + "rewards/accuracy_reward_stage2": 0.806937038898468, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3717 + }, + { + "completion_length": 9.46875, + "epoch": 0.6514806378132119, + "grad_norm": 19.52925181539543, + "kl": 0.2236328125, + "learning_rate": 3.4869458559663565e-07, + "loss": 0.0524, + "reward": 1.5218735933303833, + "reward_std": 0.19578316807746887, + "rewards/accuracy_reward_stage2": 0.5374986529350281, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3718 + }, + { + "completion_length": 8.9375, + "epoch": 0.6516558612230594, + "grad_norm": 20.85909176443267, + "kl": 0.13671875, + "learning_rate": 3.4851936218678814e-07, + "loss": -0.0004, + "reward": 1.602414846420288, + "reward_std": 0.20141032338142395, + "rewards/accuracy_reward_stage2": 0.6336649656295776, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3719 + }, + { + "completion_length": 7.578125, + "epoch": 0.651831084632907, + "grad_norm": 15.77366126598909, + "kl": 0.1982421875, + "learning_rate": 3.483441387769406e-07, + "loss": -0.0254, + "reward": 1.6306451559066772, + "reward_std": 0.20585371553897858, + "rewards/accuracy_reward_stage2": 0.677520215511322, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3720 + }, + { + "completion_length": 10.0625, + "epoch": 0.6520063080427545, + "grad_norm": 34.95961364568875, + "kl": 0.0947265625, + "learning_rate": 3.48168915367093e-07, + "loss": 0.0378, + "reward": 1.5415239334106445, + "reward_std": 0.2722419202327728, + "rewards/accuracy_reward_stage2": 0.5415239930152893, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3721 + }, + { + "completion_length": 7.28125, + "epoch": 0.652181531452602, + "grad_norm": 21.899136357172818, + "kl": 0.11376953125, + "learning_rate": 3.4799369195724546e-07, + "loss": 0.0028, + "reward": 1.6860486268997192, + "reward_std": 0.21623259782791138, + "rewards/accuracy_reward_stage2": 0.7016735076904297, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3722 + }, + { + "completion_length": 13.6875, + "epoch": 0.6523567548624496, + "grad_norm": 13.149917081596058, + "kl": 0.08935546875, + "learning_rate": 3.4781846854739796e-07, + "loss": -0.0147, + "reward": 1.4375, + "reward_std": 0.2041158676147461, + "rewards/accuracy_reward_stage2": 0.703125, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3723 + }, + { + "completion_length": 10.9375, + "epoch": 0.6525319782722971, + "grad_norm": 17.47616705957636, + "kl": 0.1826171875, + "learning_rate": 3.4764324513755034e-07, + "loss": 0.0343, + "reward": 1.4866011142730713, + "reward_std": 0.19845804572105408, + "rewards/accuracy_reward_stage2": 0.5022260546684265, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3724 + }, + { + "completion_length": 9.28125, + "epoch": 0.6527072016821447, + "grad_norm": 22.11286584635779, + "kl": 0.34765625, + "learning_rate": 3.474680217277028e-07, + "loss": -0.0108, + "reward": 1.545560598373413, + "reward_std": 0.3116529583930969, + "rewards/accuracy_reward_stage2": 0.6080605983734131, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3725 + }, + { + "completion_length": 7.59375, + "epoch": 0.6528824250919922, + "grad_norm": 18.219007326739145, + "kl": 0.32421875, + "learning_rate": 3.472927983178552e-07, + "loss": -0.0423, + "reward": 1.8254756927490234, + "reward_std": 0.3147159516811371, + "rewards/accuracy_reward_stage2": 0.8879756331443787, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3726 + }, + { + "completion_length": 11.09375, + "epoch": 0.6530576485018399, + "grad_norm": 39.50198001524787, + "kl": 0.384765625, + "learning_rate": 3.471175749080077e-07, + "loss": 0.1193, + "reward": 1.3389873504638672, + "reward_std": 0.3174276351928711, + "rewards/accuracy_reward_stage2": 0.6046122908592224, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3727 + }, + { + "completion_length": 11.65625, + "epoch": 0.6532328719116874, + "grad_norm": 28.740440477499103, + "kl": 0.2412109375, + "learning_rate": 3.4694235149816015e-07, + "loss": 0.0521, + "reward": 1.3430554866790771, + "reward_std": 0.3203160762786865, + "rewards/accuracy_reward_stage2": 0.4836805462837219, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3728 + }, + { + "completion_length": 10.125, + "epoch": 0.653408095321535, + "grad_norm": 13.062985034306717, + "kl": 0.1708984375, + "learning_rate": 3.467671280883126e-07, + "loss": 0.0373, + "reward": 1.4132962226867676, + "reward_std": 0.11710938811302185, + "rewards/accuracy_reward_stage2": 0.5539212822914124, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3729 + }, + { + "completion_length": 12.078125, + "epoch": 0.6535833187313825, + "grad_norm": 18.584620786462324, + "kl": 0.1611328125, + "learning_rate": 3.4659190467846503e-07, + "loss": -0.0143, + "reward": 1.328352451324463, + "reward_std": 0.3078764081001282, + "rewards/accuracy_reward_stage2": 0.48460254073143005, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3730 + }, + { + "completion_length": 8.625, + "epoch": 0.6537585421412301, + "grad_norm": 27.202464955173326, + "kl": 0.1484375, + "learning_rate": 3.4641668126861747e-07, + "loss": -0.0044, + "reward": 1.463038682937622, + "reward_std": 0.2783457338809967, + "rewards/accuracy_reward_stage2": 0.49428868293762207, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3731 + }, + { + "completion_length": 7.0625, + "epoch": 0.6539337655510776, + "grad_norm": 13.408109916174384, + "kl": 0.09228515625, + "learning_rate": 3.462414578587699e-07, + "loss": 0.0369, + "reward": 1.9514180421829224, + "reward_std": 0.09767099469900131, + "rewards/accuracy_reward_stage2": 0.9514180421829224, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3732 + }, + { + "completion_length": 15.65625, + "epoch": 0.6541089889609252, + "grad_norm": 12.027256239529795, + "kl": 0.14453125, + "learning_rate": 3.4606623444892235e-07, + "loss": 0.014, + "reward": 1.1358861923217773, + "reward_std": 0.16388946771621704, + "rewards/accuracy_reward_stage2": 0.27651113271713257, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3733 + }, + { + "completion_length": 9.265625, + "epoch": 0.6542842123707727, + "grad_norm": 18.95864959955592, + "kl": 0.154296875, + "learning_rate": 3.458910110390748e-07, + "loss": 0.033, + "reward": 1.50836181640625, + "reward_std": 0.25778210163116455, + "rewards/accuracy_reward_stage2": 0.64898681640625, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3734 + }, + { + "completion_length": 10.59375, + "epoch": 0.6544594357806203, + "grad_norm": 21.764154397501247, + "kl": 0.146484375, + "learning_rate": 3.457157876292273e-07, + "loss": 0.0274, + "reward": 1.6949567794799805, + "reward_std": 0.2524658441543579, + "rewards/accuracy_reward_stage2": 0.7105817794799805, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3735 + }, + { + "completion_length": 9.375, + "epoch": 0.6546346591904678, + "grad_norm": 16.90160251412798, + "kl": 0.11376953125, + "learning_rate": 3.455405642193797e-07, + "loss": 0.0097, + "reward": 1.4191895723342896, + "reward_std": 0.16770751774311066, + "rewards/accuracy_reward_stage2": 0.4348146319389343, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3736 + }, + { + "completion_length": 14.53125, + "epoch": 0.6548098826003154, + "grad_norm": 18.18403251798853, + "kl": 0.056884765625, + "learning_rate": 3.453653408095321e-07, + "loss": 0.0228, + "reward": 1.6315476894378662, + "reward_std": 0.11001887172460556, + "rewards/accuracy_reward_stage2": 0.631547749042511, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3737 + }, + { + "completion_length": 16.34375, + "epoch": 0.6549851060101629, + "grad_norm": 23.436874690011468, + "kl": 0.216796875, + "learning_rate": 3.4519011739968455e-07, + "loss": 0.0622, + "reward": 1.3637266159057617, + "reward_std": 0.2299090176820755, + "rewards/accuracy_reward_stage2": 0.5043515563011169, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3738 + }, + { + "completion_length": 11.015625, + "epoch": 0.6551603294200106, + "grad_norm": 26.50739060775904, + "kl": 0.240234375, + "learning_rate": 3.4501489398983704e-07, + "loss": 0.0391, + "reward": 1.4403434991836548, + "reward_std": 0.3019851744174957, + "rewards/accuracy_reward_stage2": 0.61221843957901, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3739 + }, + { + "completion_length": 21.625, + "epoch": 0.6553355528298581, + "grad_norm": 18.513512395425863, + "kl": 0.3046875, + "learning_rate": 3.448396705799895e-07, + "loss": -0.0277, + "reward": 1.639461874961853, + "reward_std": 0.25874900817871094, + "rewards/accuracy_reward_stage2": 0.7019618153572083, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3740 + }, + { + "completion_length": 9.203125, + "epoch": 0.6555107762397057, + "grad_norm": 23.17622655740721, + "kl": 0.09228515625, + "learning_rate": 3.446644471701419e-07, + "loss": 0.0369, + "reward": 1.3237862586975098, + "reward_std": 0.2052987515926361, + "rewards/accuracy_reward_stage2": 0.5737862586975098, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3741 + }, + { + "completion_length": 8.890625, + "epoch": 0.6556859996495532, + "grad_norm": 13.285559996644166, + "kl": 0.1259765625, + "learning_rate": 3.4448922376029436e-07, + "loss": -0.0148, + "reward": 1.7263647317886353, + "reward_std": 0.15753847360610962, + "rewards/accuracy_reward_stage2": 0.7576147317886353, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3742 + }, + { + "completion_length": 11.96875, + "epoch": 0.6558612230594008, + "grad_norm": 14.989393310438379, + "kl": 0.0869140625, + "learning_rate": 3.4431400035044685e-07, + "loss": 0.0347, + "reward": 1.555059552192688, + "reward_std": 0.1714351773262024, + "rewards/accuracy_reward_stage2": 0.6800594925880432, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3743 + }, + { + "completion_length": 11.84375, + "epoch": 0.6560364464692483, + "grad_norm": 14.933381466929175, + "kl": 0.0908203125, + "learning_rate": 3.4413877694059924e-07, + "loss": -0.0015, + "reward": 1.4837230443954468, + "reward_std": 0.14175119996070862, + "rewards/accuracy_reward_stage2": 0.499347984790802, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3744 + }, + { + "completion_length": 9.5, + "epoch": 0.6562116698790958, + "grad_norm": 15.504366600818704, + "kl": 0.12158203125, + "learning_rate": 3.439635535307517e-07, + "loss": -0.0152, + "reward": 1.7632322311401367, + "reward_std": 0.1770913302898407, + "rewards/accuracy_reward_stage2": 0.7944821715354919, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3745 + }, + { + "completion_length": 10.90625, + "epoch": 0.6563868932889434, + "grad_norm": 19.398149361266665, + "kl": 0.2109375, + "learning_rate": 3.437883301209041e-07, + "loss": 0.032, + "reward": 1.597571849822998, + "reward_std": 0.2011193186044693, + "rewards/accuracy_reward_stage2": 0.753821849822998, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3746 + }, + { + "completion_length": 10.453125, + "epoch": 0.6565621166987909, + "grad_norm": 20.833322425182583, + "kl": 0.1201171875, + "learning_rate": 3.436131067110566e-07, + "loss": -0.0305, + "reward": 1.4291150569915771, + "reward_std": 0.2782094180583954, + "rewards/accuracy_reward_stage2": 0.4603649973869324, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3747 + }, + { + "completion_length": 17.34375, + "epoch": 0.6567373401086385, + "grad_norm": 17.334135646348553, + "kl": 0.06591796875, + "learning_rate": 3.4343788330120905e-07, + "loss": 0.0263, + "reward": 1.3547989130020142, + "reward_std": 0.1030765026807785, + "rewards/accuracy_reward_stage2": 0.47979891300201416, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3748 + }, + { + "completion_length": 8.421875, + "epoch": 0.656912563518486, + "grad_norm": 19.427073166610853, + "kl": 0.1748046875, + "learning_rate": 3.432626598913615e-07, + "loss": -0.006, + "reward": 1.499578595161438, + "reward_std": 0.27636945247650146, + "rewards/accuracy_reward_stage2": 0.530828595161438, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3749 + }, + { + "completion_length": 8.125, + "epoch": 0.6570877869283336, + "grad_norm": 14.00868203881481, + "kl": 0.140625, + "learning_rate": 3.430874364815139e-07, + "loss": 0.0119, + "reward": 1.8125, + "reward_std": 0.1462521106004715, + "rewards/accuracy_reward_stage2": 0.953125, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3750 + }, + { + "completion_length": 9.09375, + "epoch": 0.6572630103381811, + "grad_norm": 16.193634064377, + "kl": 0.058837890625, + "learning_rate": 3.4291221307166637e-07, + "loss": 0.0235, + "reward": 1.3800715208053589, + "reward_std": 0.15051256120204926, + "rewards/accuracy_reward_stage2": 0.3800715208053589, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3751 + }, + { + "completion_length": 21.34375, + "epoch": 0.6574382337480288, + "grad_norm": 22.072825583902596, + "kl": 0.0869140625, + "learning_rate": 3.427369896618188e-07, + "loss": -0.0095, + "reward": 1.7406294345855713, + "reward_std": 0.2594420909881592, + "rewards/accuracy_reward_stage2": 0.7562545537948608, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3752 + }, + { + "completion_length": 6.921875, + "epoch": 0.6576134571578763, + "grad_norm": 16.058494707079756, + "kl": 0.031005859375, + "learning_rate": 3.4256176625197125e-07, + "loss": 0.0124, + "reward": 1.695550560951233, + "reward_std": 0.1407569944858551, + "rewards/accuracy_reward_stage2": 0.8205506801605225, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3753 + }, + { + "completion_length": 10.65625, + "epoch": 0.6577886805677239, + "grad_norm": 22.634665908325573, + "kl": 0.146484375, + "learning_rate": 3.423865428421237e-07, + "loss": -0.0046, + "reward": 1.516603708267212, + "reward_std": 0.3547920286655426, + "rewards/accuracy_reward_stage2": 0.5478537082672119, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3754 + }, + { + "completion_length": 8.265625, + "epoch": 0.6579639039775714, + "grad_norm": 13.65605117039202, + "kl": 0.1728515625, + "learning_rate": 3.422113194322762e-07, + "loss": 0.0112, + "reward": 1.6286708116531372, + "reward_std": 0.14669831097126007, + "rewards/accuracy_reward_stage2": 0.6599206924438477, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3755 + }, + { + "completion_length": 5.40625, + "epoch": 0.658139127387419, + "grad_norm": 13.280947295786133, + "kl": 0.068359375, + "learning_rate": 3.4203609602242857e-07, + "loss": -0.0168, + "reward": 1.9375, + "reward_std": 0.1462520956993103, + "rewards/accuracy_reward_stage2": 0.953125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3756 + }, + { + "completion_length": 8.6875, + "epoch": 0.6583143507972665, + "grad_norm": 17.47601377460947, + "kl": 0.1171875, + "learning_rate": 3.41860872612581e-07, + "loss": 0.0153, + "reward": 1.4678363800048828, + "reward_std": 0.2227524071931839, + "rewards/accuracy_reward_stage2": 0.4834613502025604, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3757 + }, + { + "completion_length": 13.046875, + "epoch": 0.6584895742071141, + "grad_norm": 18.490909376699747, + "kl": 0.177734375, + "learning_rate": 3.4168564920273345e-07, + "loss": -0.0174, + "reward": 1.4714080095291138, + "reward_std": 0.32452845573425293, + "rewards/accuracy_reward_stage2": 0.5026580691337585, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3758 + }, + { + "completion_length": 6.5, + "epoch": 0.6586647976169616, + "grad_norm": 23.224734562018334, + "kl": 0.1962890625, + "learning_rate": 3.415104257928859e-07, + "loss": -0.026, + "reward": 1.6008846759796143, + "reward_std": 0.32852935791015625, + "rewards/accuracy_reward_stage2": 0.6477595567703247, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3759 + }, + { + "completion_length": 11.125, + "epoch": 0.6588400210268092, + "grad_norm": 18.760357237054855, + "kl": 0.2041015625, + "learning_rate": 3.413352023830384e-07, + "loss": -0.0059, + "reward": 1.5383474826812744, + "reward_std": 0.3135913610458374, + "rewards/accuracy_reward_stage2": 0.6945973634719849, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3760 + }, + { + "completion_length": 12.59375, + "epoch": 0.6590152444366567, + "grad_norm": 16.016991487601743, + "kl": 0.09326171875, + "learning_rate": 3.411599789731908e-07, + "loss": -0.0021, + "reward": 1.5600254535675049, + "reward_std": 0.23971496522426605, + "rewards/accuracy_reward_stage2": 0.5756504535675049, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3761 + }, + { + "completion_length": 10.359375, + "epoch": 0.6591904678465043, + "grad_norm": 19.832552947194255, + "kl": 0.267578125, + "learning_rate": 3.4098475556334326e-07, + "loss": 0.0016, + "reward": 1.6911460161209106, + "reward_std": 0.3048381209373474, + "rewards/accuracy_reward_stage2": 0.7380210161209106, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3762 + }, + { + "completion_length": 9.84375, + "epoch": 0.6593656912563518, + "grad_norm": 18.808604800352317, + "kl": 0.189453125, + "learning_rate": 3.4080953215349565e-07, + "loss": -0.0427, + "reward": 1.629578948020935, + "reward_std": 0.28050941228866577, + "rewards/accuracy_reward_stage2": 0.8014539480209351, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3763 + }, + { + "completion_length": 8.078125, + "epoch": 0.6595409146661994, + "grad_norm": 18.145882072947025, + "kl": 0.21484375, + "learning_rate": 3.4063430874364814e-07, + "loss": -0.042, + "reward": 1.5573110580444336, + "reward_std": 0.20979392528533936, + "rewards/accuracy_reward_stage2": 0.6198111176490784, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3764 + }, + { + "completion_length": 7.40625, + "epoch": 0.659716138076047, + "grad_norm": 15.642888931193294, + "kl": 0.1015625, + "learning_rate": 3.404590853338006e-07, + "loss": -0.0034, + "reward": 1.6968038082122803, + "reward_std": 0.19613364338874817, + "rewards/accuracy_reward_stage2": 0.8374287486076355, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3765 + }, + { + "completion_length": 10.25, + "epoch": 0.6598913614858946, + "grad_norm": 23.440768685123555, + "kl": 0.09912109375, + "learning_rate": 3.40283861923953e-07, + "loss": -0.0435, + "reward": 1.463047981262207, + "reward_std": 0.37215834856033325, + "rewards/accuracy_reward_stage2": 0.4942980110645294, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3766 + }, + { + "completion_length": 11.171875, + "epoch": 0.6600665848957421, + "grad_norm": 18.89718950407704, + "kl": 0.1845703125, + "learning_rate": 3.4010863851410546e-07, + "loss": 0.0667, + "reward": 1.4453023672103882, + "reward_std": 0.29632043838500977, + "rewards/accuracy_reward_stage2": 0.6953023672103882, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3767 + }, + { + "completion_length": 9.734375, + "epoch": 0.6602418083055897, + "grad_norm": 18.261741322305895, + "kl": 0.15234375, + "learning_rate": 3.3993341510425795e-07, + "loss": 0.0607, + "reward": 1.595839023590088, + "reward_std": 0.18660318851470947, + "rewards/accuracy_reward_stage2": 0.7208389639854431, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3768 + }, + { + "completion_length": 20.8125, + "epoch": 0.6604170317154372, + "grad_norm": 20.39858492028997, + "kl": 0.1357421875, + "learning_rate": 3.3975819169441034e-07, + "loss": -0.0171, + "reward": 1.3963425159454346, + "reward_std": 0.27449262142181396, + "rewards/accuracy_reward_stage2": 0.42759257555007935, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3769 + }, + { + "completion_length": 13.09375, + "epoch": 0.6605922551252847, + "grad_norm": 17.313030395092323, + "kl": 0.1484375, + "learning_rate": 3.395829682845628e-07, + "loss": 0.0209, + "reward": 1.204599142074585, + "reward_std": 0.18435396254062653, + "rewards/accuracy_reward_stage2": 0.34522414207458496, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3770 + }, + { + "completion_length": 7.796875, + "epoch": 0.6607674785351323, + "grad_norm": 23.390768820207825, + "kl": 0.06494140625, + "learning_rate": 3.394077448747152e-07, + "loss": 0.026, + "reward": 1.7073495388031006, + "reward_std": 0.2253035306930542, + "rewards/accuracy_reward_stage2": 0.7073495984077454, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3771 + }, + { + "completion_length": 9.046875, + "epoch": 0.6609427019449798, + "grad_norm": 17.356307836688707, + "kl": 0.1259765625, + "learning_rate": 3.392325214648677e-07, + "loss": 0.0127, + "reward": 1.6848533153533936, + "reward_std": 0.18406951427459717, + "rewards/accuracy_reward_stage2": 0.7004783153533936, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3772 + }, + { + "completion_length": 10.75, + "epoch": 0.6611179253548274, + "grad_norm": 19.927632990776967, + "kl": 0.2431640625, + "learning_rate": 3.3905729805502015e-07, + "loss": -0.0305, + "reward": 1.4557608366012573, + "reward_std": 0.36149799823760986, + "rewards/accuracy_reward_stage2": 0.5182607769966125, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3773 + }, + { + "completion_length": 24.546875, + "epoch": 0.6612931487646749, + "grad_norm": 17.501135465012446, + "kl": 0.0830078125, + "learning_rate": 3.388820746451726e-07, + "loss": -0.0332, + "reward": 1.3385493755340576, + "reward_std": 0.12510152161121368, + "rewards/accuracy_reward_stage2": 0.3697994351387024, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3774 + }, + { + "completion_length": 6.609375, + "epoch": 0.6614683721745225, + "grad_norm": 17.16426784437837, + "kl": 0.1767578125, + "learning_rate": 3.38706851235325e-07, + "loss": 0.0083, + "reward": 1.2884865999221802, + "reward_std": 0.25939449667930603, + "rewards/accuracy_reward_stage2": 0.3197365701198578, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3775 + }, + { + "completion_length": 6.46875, + "epoch": 0.66164359558437, + "grad_norm": 18.955321642768602, + "kl": 0.26953125, + "learning_rate": 3.3853162782547747e-07, + "loss": 0.0137, + "reward": 1.5619122982025146, + "reward_std": 0.25723767280578613, + "rewards/accuracy_reward_stage2": 0.6087872982025146, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3776 + }, + { + "completion_length": 12.453125, + "epoch": 0.6618188189942176, + "grad_norm": 18.881526029270855, + "kl": 0.11376953125, + "learning_rate": 3.383564044156299e-07, + "loss": 0.0455, + "reward": 1.4165642261505127, + "reward_std": 0.30341148376464844, + "rewards/accuracy_reward_stage2": 0.4165641665458679, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3777 + }, + { + "completion_length": 13.515625, + "epoch": 0.6619940424040652, + "grad_norm": 20.948955131923597, + "kl": 0.138671875, + "learning_rate": 3.3818118100578235e-07, + "loss": 0.046, + "reward": 1.5264365673065186, + "reward_std": 0.30605074763298035, + "rewards/accuracy_reward_stage2": 0.5420615077018738, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3778 + }, + { + "completion_length": 8.734375, + "epoch": 0.6621692658139128, + "grad_norm": 14.31361978247023, + "kl": 0.234375, + "learning_rate": 3.380059575959348e-07, + "loss": 0.0496, + "reward": 1.4558091163635254, + "reward_std": 0.15434116125106812, + "rewards/accuracy_reward_stage2": 0.5964341163635254, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3779 + }, + { + "completion_length": 11.734375, + "epoch": 0.6623444892237603, + "grad_norm": 15.118890139919, + "kl": 0.1123046875, + "learning_rate": 3.378307341860873e-07, + "loss": 0.0048, + "reward": 1.7935502529144287, + "reward_std": 0.21152012050151825, + "rewards/accuracy_reward_stage2": 0.8091753125190735, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3780 + }, + { + "completion_length": 6.75, + "epoch": 0.6625197126336079, + "grad_norm": 19.08976605237901, + "kl": 0.408203125, + "learning_rate": 3.376555107762397e-07, + "loss": 0.0432, + "reward": 1.4881014823913574, + "reward_std": 0.3304804861545563, + "rewards/accuracy_reward_stage2": 0.5506014227867126, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3781 + }, + { + "completion_length": 11.5, + "epoch": 0.6626949360434554, + "grad_norm": 30.88448000309724, + "kl": 0.177734375, + "learning_rate": 3.374802873663921e-07, + "loss": 0.0773, + "reward": 1.3979463577270508, + "reward_std": 0.21971935033798218, + "rewards/accuracy_reward_stage2": 0.6479463577270508, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3782 + }, + { + "completion_length": 8.421875, + "epoch": 0.662870159453303, + "grad_norm": 14.383039345570571, + "kl": 0.1767578125, + "learning_rate": 3.3730506395654455e-07, + "loss": 0.0097, + "reward": 1.3854167461395264, + "reward_std": 0.19974718987941742, + "rewards/accuracy_reward_stage2": 0.4166666865348816, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3783 + }, + { + "completion_length": 11.6875, + "epoch": 0.6630453828631505, + "grad_norm": 32.26407359455346, + "kl": 0.1572265625, + "learning_rate": 3.3712984054669704e-07, + "loss": 0.0188, + "reward": 1.6351269483566284, + "reward_std": 0.12910021841526031, + "rewards/accuracy_reward_stage2": 0.9007519483566284, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3784 + }, + { + "completion_length": 9.609375, + "epoch": 0.6632206062729981, + "grad_norm": 18.12670377480947, + "kl": 0.142578125, + "learning_rate": 3.369546171368495e-07, + "loss": 0.0208, + "reward": 1.6540381908416748, + "reward_std": 0.2671361565589905, + "rewards/accuracy_reward_stage2": 0.66966313123703, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3785 + }, + { + "completion_length": 10.171875, + "epoch": 0.6633958296828456, + "grad_norm": 19.31629462658784, + "kl": 0.119140625, + "learning_rate": 3.367793937270019e-07, + "loss": 0.0034, + "reward": 1.6815602779388428, + "reward_std": 0.27131250500679016, + "rewards/accuracy_reward_stage2": 0.6971853971481323, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3786 + }, + { + "completion_length": 11.0625, + "epoch": 0.6635710530926932, + "grad_norm": 19.81498896949635, + "kl": 0.154296875, + "learning_rate": 3.3660417031715436e-07, + "loss": -0.0112, + "reward": 1.601413607597351, + "reward_std": 0.37887099385261536, + "rewards/accuracy_reward_stage2": 0.6326636075973511, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3787 + }, + { + "completion_length": 8.828125, + "epoch": 0.6637462765025407, + "grad_norm": 21.39361118233043, + "kl": 0.267578125, + "learning_rate": 3.364289469073068e-07, + "loss": 0.0186, + "reward": 1.3068628311157227, + "reward_std": 0.2723737359046936, + "rewards/accuracy_reward_stage2": 0.4631127715110779, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3788 + }, + { + "completion_length": 7.375, + "epoch": 0.6639214999123882, + "grad_norm": 16.641332895006535, + "kl": 0.11767578125, + "learning_rate": 3.3625372349745924e-07, + "loss": -0.0136, + "reward": 1.646165132522583, + "reward_std": 0.17670243978500366, + "rewards/accuracy_reward_stage2": 0.8024150729179382, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3789 + }, + { + "completion_length": 11.125, + "epoch": 0.6640967233222359, + "grad_norm": 12.84345404563453, + "kl": 0.031494140625, + "learning_rate": 3.360785000876117e-07, + "loss": 0.0126, + "reward": 1.646390438079834, + "reward_std": 0.09573078155517578, + "rewards/accuracy_reward_stage2": 0.646390438079834, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3790 + }, + { + "completion_length": 10.984375, + "epoch": 0.6642719467320835, + "grad_norm": 17.804293261151166, + "kl": 0.154296875, + "learning_rate": 3.359032766777641e-07, + "loss": 0.0173, + "reward": 1.6868422031402588, + "reward_std": 0.26586437225341797, + "rewards/accuracy_reward_stage2": 0.7024672031402588, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3791 + }, + { + "completion_length": 11.109375, + "epoch": 0.664447170141931, + "grad_norm": 22.70462929976115, + "kl": 0.11962890625, + "learning_rate": 3.357280532679166e-07, + "loss": 0.0169, + "reward": 1.4475001096725464, + "reward_std": 0.358273446559906, + "rewards/accuracy_reward_stage2": 0.4631251394748688, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3792 + }, + { + "completion_length": 8.984375, + "epoch": 0.6646223935517785, + "grad_norm": 13.1589326041502, + "kl": 0.19140625, + "learning_rate": 3.3555282985806905e-07, + "loss": -0.0119, + "reward": 1.8759760856628418, + "reward_std": 0.19113053381443024, + "rewards/accuracy_reward_stage2": 0.9072260856628418, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3793 + }, + { + "completion_length": 9.375, + "epoch": 0.6647976169616261, + "grad_norm": 16.2208180979175, + "kl": 0.1962890625, + "learning_rate": 3.3537760644822143e-07, + "loss": -0.0445, + "reward": 1.6830174922943115, + "reward_std": 0.301523357629776, + "rewards/accuracy_reward_stage2": 0.7298924326896667, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3794 + }, + { + "completion_length": 10.59375, + "epoch": 0.6649728403714736, + "grad_norm": 22.2857496134525, + "kl": 0.1591796875, + "learning_rate": 3.352023830383739e-07, + "loss": 0.0197, + "reward": 1.7518019676208496, + "reward_std": 0.22051015496253967, + "rewards/accuracy_reward_stage2": 0.8924268484115601, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3795 + }, + { + "completion_length": 10.78125, + "epoch": 0.6651480637813212, + "grad_norm": 11.82391521934736, + "kl": 0.1083984375, + "learning_rate": 3.3502715962852637e-07, + "loss": -0.0009, + "reward": 1.7660496234893799, + "reward_std": 0.17463505268096924, + "rewards/accuracy_reward_stage2": 0.7816746234893799, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3796 + }, + { + "completion_length": 11.796875, + "epoch": 0.6653232871911687, + "grad_norm": 19.721619897430543, + "kl": 0.0966796875, + "learning_rate": 3.348519362186788e-07, + "loss": -0.0012, + "reward": 1.1196482181549072, + "reward_std": 0.23411786556243896, + "rewards/accuracy_reward_stage2": 0.2602732479572296, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3797 + }, + { + "completion_length": 5.296875, + "epoch": 0.6654985106010163, + "grad_norm": 11.758927680436287, + "kl": 0.28125, + "learning_rate": 3.3467671280883125e-07, + "loss": 0.0241, + "reward": 1.6875, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward_stage2": 0.84375, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3798 + }, + { + "completion_length": 9.265625, + "epoch": 0.6656737340108638, + "grad_norm": 16.046421595628004, + "kl": 0.345703125, + "learning_rate": 3.345014893989837e-07, + "loss": -0.043, + "reward": 1.5293065309524536, + "reward_std": 0.28512483835220337, + "rewards/accuracy_reward_stage2": 0.7324315309524536, + "rewards/format_reward_stage1_pointerpad": 0.796875, + "scores/accuracy_reward_stage2": 0.796875, + "step": 3799 + }, + { + "completion_length": 13.265625, + "epoch": 0.6658489574207114, + "grad_norm": 18.830911554311168, + "kl": 0.017333984375, + "learning_rate": 3.343262659891362e-07, + "loss": 0.0069, + "reward": 1.7495684623718262, + "reward_std": 0.20599254965782166, + "rewards/accuracy_reward_stage2": 0.7495684027671814, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3800 + }, + { + "completion_length": 11.84375, + "epoch": 0.6660241808305589, + "grad_norm": 12.271733362216043, + "kl": 0.05517578125, + "learning_rate": 3.3415104257928856e-07, + "loss": 0.0221, + "reward": 1.5179245471954346, + "reward_std": 0.1245698481798172, + "rewards/accuracy_reward_stage2": 0.5179246068000793, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3801 + }, + { + "completion_length": 11.203125, + "epoch": 0.6661994042404065, + "grad_norm": 21.78953904690612, + "kl": 0.2412109375, + "learning_rate": 3.33975819169441e-07, + "loss": 0.0256, + "reward": 1.5020623207092285, + "reward_std": 0.2805957794189453, + "rewards/accuracy_reward_stage2": 0.5333123207092285, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3802 + }, + { + "completion_length": 11.203125, + "epoch": 0.6663746276502541, + "grad_norm": 20.18266093377564, + "kl": 0.326171875, + "learning_rate": 3.3380059575959344e-07, + "loss": -0.0134, + "reward": 1.2390034198760986, + "reward_std": 0.2471129447221756, + "rewards/accuracy_reward_stage2": 0.4265034794807434, + "rewards/format_reward_stage1_pointerpad": 0.8125, + "scores/accuracy_reward_stage2": 0.8125, + "step": 3803 + }, + { + "completion_length": 14.59375, + "epoch": 0.6665498510601017, + "grad_norm": 18.078311944493244, + "kl": 0.1728515625, + "learning_rate": 3.3362537234974594e-07, + "loss": -0.0209, + "reward": 1.5762239694595337, + "reward_std": 0.277314692735672, + "rewards/accuracy_reward_stage2": 0.6230989098548889, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3804 + }, + { + "completion_length": 9.703125, + "epoch": 0.6667250744699492, + "grad_norm": 17.04188443236299, + "kl": 0.1328125, + "learning_rate": 3.334501489398984e-07, + "loss": -0.0306, + "reward": 1.7449889183044434, + "reward_std": 0.25133174657821655, + "rewards/accuracy_reward_stage2": 0.7762388586997986, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3805 + }, + { + "completion_length": 11.203125, + "epoch": 0.6669002978797968, + "grad_norm": 49.30679087926395, + "kl": 0.5625, + "learning_rate": 3.332749255300508e-07, + "loss": 0.1652, + "reward": 1.2607142925262451, + "reward_std": 0.21106266975402832, + "rewards/accuracy_reward_stage2": 0.41696426272392273, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3806 + }, + { + "completion_length": 9.53125, + "epoch": 0.6670755212896443, + "grad_norm": 20.956408970849377, + "kl": 0.259765625, + "learning_rate": 3.330997021202032e-07, + "loss": 0.0249, + "reward": 1.5182335376739502, + "reward_std": 0.2621886730194092, + "rewards/accuracy_reward_stage2": 0.549483597278595, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3807 + }, + { + "completion_length": 11.140625, + "epoch": 0.6672507446994919, + "grad_norm": 28.610511173855173, + "kl": 0.1904296875, + "learning_rate": 3.329244787103557e-07, + "loss": 0.0973, + "reward": 1.5078845024108887, + "reward_std": 0.20054185390472412, + "rewards/accuracy_reward_stage2": 0.6328844428062439, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3808 + }, + { + "completion_length": 5.421875, + "epoch": 0.6674259681093394, + "grad_norm": 15.317171215419528, + "kl": 0.1533203125, + "learning_rate": 3.3274925530050813e-07, + "loss": -0.0258, + "reward": 1.824300765991211, + "reward_std": 0.26174771785736084, + "rewards/accuracy_reward_stage2": 0.8555507063865662, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3809 + }, + { + "completion_length": 12.40625, + "epoch": 0.667601191519187, + "grad_norm": 23.217493178402837, + "kl": 0.07177734375, + "learning_rate": 3.325740318906606e-07, + "loss": 0.0323, + "reward": 1.3106931447982788, + "reward_std": 0.28896403312683105, + "rewards/accuracy_reward_stage2": 0.4356931447982788, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3810 + }, + { + "completion_length": 10.28125, + "epoch": 0.6677764149290345, + "grad_norm": 26.477477101239245, + "kl": 0.1142578125, + "learning_rate": 3.32398808480813e-07, + "loss": 0.0168, + "reward": 1.642218828201294, + "reward_std": 0.3020542860031128, + "rewards/accuracy_reward_stage2": 0.6578439474105835, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3811 + }, + { + "completion_length": 11.296875, + "epoch": 0.667951638338882, + "grad_norm": 23.32254696210251, + "kl": 0.318359375, + "learning_rate": 3.322235850709655e-07, + "loss": 0.052, + "reward": 1.5385891199111938, + "reward_std": 0.324771523475647, + "rewards/accuracy_reward_stage2": 0.7104641199111938, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3812 + }, + { + "completion_length": 11.34375, + "epoch": 0.6681268617487296, + "grad_norm": 19.10950592222298, + "kl": 0.18359375, + "learning_rate": 3.3204836166111795e-07, + "loss": 0.0445, + "reward": 1.59661066532135, + "reward_std": 0.25543177127838135, + "rewards/accuracy_reward_stage2": 0.6122356653213501, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3813 + }, + { + "completion_length": 20.6875, + "epoch": 0.6683020851585771, + "grad_norm": 19.02191313355224, + "kl": 0.1259765625, + "learning_rate": 3.3187313825127033e-07, + "loss": 0.0062, + "reward": 1.5268758535385132, + "reward_std": 0.18108849227428436, + "rewards/accuracy_reward_stage2": 0.5425008535385132, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3814 + }, + { + "completion_length": 16.15625, + "epoch": 0.6684773085684247, + "grad_norm": 18.578214590711934, + "kl": 0.1015625, + "learning_rate": 3.3169791484142277e-07, + "loss": -0.0034, + "reward": 1.443403959274292, + "reward_std": 0.22445048391819, + "rewards/accuracy_reward_stage2": 0.45902884006500244, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3815 + }, + { + "completion_length": 11.703125, + "epoch": 0.6686525319782723, + "grad_norm": 22.83780116713671, + "kl": 0.1845703125, + "learning_rate": 3.3152269143157527e-07, + "loss": 0.0297, + "reward": 1.686450719833374, + "reward_std": 0.24080437421798706, + "rewards/accuracy_reward_stage2": 0.827075719833374, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3816 + }, + { + "completion_length": 13.875, + "epoch": 0.6688277553881199, + "grad_norm": 16.72810642640274, + "kl": 0.1328125, + "learning_rate": 3.313474680217277e-07, + "loss": 0.012, + "reward": 1.5293313264846802, + "reward_std": 0.1489149034023285, + "rewards/accuracy_reward_stage2": 0.5449563264846802, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3817 + }, + { + "completion_length": 10.625, + "epoch": 0.6690029787979674, + "grad_norm": 19.73170627422111, + "kl": 0.2138671875, + "learning_rate": 3.3117224461188014e-07, + "loss": 0.0133, + "reward": 1.5520833730697632, + "reward_std": 0.3328608274459839, + "rewards/accuracy_reward_stage2": 0.7083333134651184, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3818 + }, + { + "completion_length": 17.078125, + "epoch": 0.669178202207815, + "grad_norm": 13.387912599799588, + "kl": 0.255859375, + "learning_rate": 3.309970212020326e-07, + "loss": 0.0244, + "reward": 1.3956577777862549, + "reward_std": 0.20234528183937073, + "rewards/accuracy_reward_stage2": 0.4269077777862549, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3819 + }, + { + "completion_length": 11.0625, + "epoch": 0.6693534256176625, + "grad_norm": 16.666920323225234, + "kl": 0.07861328125, + "learning_rate": 3.30821797792185e-07, + "loss": 0.0313, + "reward": 1.6344341039657593, + "reward_std": 0.12134034186601639, + "rewards/accuracy_reward_stage2": 0.759434163570404, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3820 + }, + { + "completion_length": 8.453125, + "epoch": 0.6695286490275101, + "grad_norm": 22.67417343158387, + "kl": 0.07470703125, + "learning_rate": 3.3064657438233746e-07, + "loss": 0.03, + "reward": 1.5556546449661255, + "reward_std": 0.19017130136489868, + "rewards/accuracy_reward_stage2": 0.5556546449661255, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3821 + }, + { + "completion_length": 19.84375, + "epoch": 0.6697038724373576, + "grad_norm": 16.46627356736991, + "kl": 0.036376953125, + "learning_rate": 3.304713509724899e-07, + "loss": 0.0146, + "reward": 1.5332281589508057, + "reward_std": 0.07025317847728729, + "rewards/accuracy_reward_stage2": 0.6582280397415161, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3822 + }, + { + "completion_length": 9.234375, + "epoch": 0.6698790958472052, + "grad_norm": 26.80407941462116, + "kl": 0.291015625, + "learning_rate": 3.3029612756264234e-07, + "loss": 0.038, + "reward": 1.3822216987609863, + "reward_std": 0.2539028525352478, + "rewards/accuracy_reward_stage2": 0.6634716987609863, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3823 + }, + { + "completion_length": 6.8125, + "epoch": 0.6700543192570527, + "grad_norm": 8.311960186174872, + "kl": 0.045166015625, + "learning_rate": 3.3012090415279484e-07, + "loss": 0.0181, + "reward": 1.71875, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward_stage2": 0.71875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3824 + }, + { + "completion_length": 7.15625, + "epoch": 0.6702295426669003, + "grad_norm": 17.01314817220025, + "kl": 0.1689453125, + "learning_rate": 3.299456807429473e-07, + "loss": -0.0386, + "reward": 1.4015306234359741, + "reward_std": 0.24766838550567627, + "rewards/accuracy_reward_stage2": 0.5734056234359741, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3825 + }, + { + "completion_length": 11.09375, + "epoch": 0.6704047660767478, + "grad_norm": 14.522585950078515, + "kl": 0.1298828125, + "learning_rate": 3.2977045733309966e-07, + "loss": -0.0286, + "reward": 1.3737890720367432, + "reward_std": 0.1715371012687683, + "rewards/accuracy_reward_stage2": 0.4050390124320984, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3826 + }, + { + "completion_length": 11.515625, + "epoch": 0.6705799894865954, + "grad_norm": 19.10620552915964, + "kl": 0.09375, + "learning_rate": 3.295952339232521e-07, + "loss": -0.021, + "reward": 1.4859750270843506, + "reward_std": 0.20169678330421448, + "rewards/accuracy_reward_stage2": 0.5172249674797058, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3827 + }, + { + "completion_length": 10.203125, + "epoch": 0.6707552128964429, + "grad_norm": 26.683339127329393, + "kl": 0.1962890625, + "learning_rate": 3.2942001051340454e-07, + "loss": 0.0337, + "reward": 1.3072201013565063, + "reward_std": 0.2676845192909241, + "rewards/accuracy_reward_stage2": 0.44784507155418396, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3828 + }, + { + "completion_length": 10.953125, + "epoch": 0.6709304363062906, + "grad_norm": 17.501991848539717, + "kl": 0.07275390625, + "learning_rate": 3.2924478710355703e-07, + "loss": -0.0151, + "reward": 1.5807785987854004, + "reward_std": 0.1519029289484024, + "rewards/accuracy_reward_stage2": 0.5964034795761108, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3829 + }, + { + "completion_length": 10.65625, + "epoch": 0.6711056597161381, + "grad_norm": 16.364446061626836, + "kl": 0.16796875, + "learning_rate": 3.2906956369370947e-07, + "loss": -0.05, + "reward": 1.3926641941070557, + "reward_std": 0.30592912435531616, + "rewards/accuracy_reward_stage2": 0.4395391345024109, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3830 + }, + { + "completion_length": 8.59375, + "epoch": 0.6712808831259857, + "grad_norm": 14.233024889294107, + "kl": 0.17578125, + "learning_rate": 3.288943402838619e-07, + "loss": 0.0304, + "reward": 1.721284031867981, + "reward_std": 0.20616403222084045, + "rewards/accuracy_reward_stage2": 0.736909031867981, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3831 + }, + { + "completion_length": 10.84375, + "epoch": 0.6714561065358332, + "grad_norm": 19.467527727246598, + "kl": 0.31640625, + "learning_rate": 3.287191168740144e-07, + "loss": -0.0196, + "reward": 1.2939127683639526, + "reward_std": 0.3219314515590668, + "rewards/accuracy_reward_stage2": 0.37203776836395264, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3832 + }, + { + "completion_length": 13.296875, + "epoch": 0.6716313299456808, + "grad_norm": 19.922162130122, + "kl": 0.1416015625, + "learning_rate": 3.285438934641668e-07, + "loss": -0.0715, + "reward": 1.633192777633667, + "reward_std": 0.23791098594665527, + "rewards/accuracy_reward_stage2": 0.6800678372383118, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3833 + }, + { + "completion_length": 9.796875, + "epoch": 0.6718065533555283, + "grad_norm": 18.082579453339836, + "kl": 0.07470703125, + "learning_rate": 3.2836867005431923e-07, + "loss": -0.0143, + "reward": 1.5907870531082153, + "reward_std": 0.1954219490289688, + "rewards/accuracy_reward_stage2": 0.6064120531082153, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3834 + }, + { + "completion_length": 8.859375, + "epoch": 0.6719817767653758, + "grad_norm": 16.720129476556014, + "kl": 0.031982421875, + "learning_rate": 3.2819344664447167e-07, + "loss": 0.0128, + "reward": 1.455439805984497, + "reward_std": 0.23492306470870972, + "rewards/accuracy_reward_stage2": 0.45543980598449707, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3835 + }, + { + "completion_length": 8.34375, + "epoch": 0.6721570001752234, + "grad_norm": 32.88435394651061, + "kl": 0.26171875, + "learning_rate": 3.280182232346241e-07, + "loss": 0.0663, + "reward": 1.4930814504623413, + "reward_std": 0.18936826288700104, + "rewards/accuracy_reward_stage2": 0.6337064504623413, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3836 + }, + { + "completion_length": 12.78125, + "epoch": 0.6723322235850709, + "grad_norm": 17.80399433349402, + "kl": 0.130859375, + "learning_rate": 3.278429998247766e-07, + "loss": -0.0312, + "reward": 1.5512590408325195, + "reward_std": 0.29897886514663696, + "rewards/accuracy_reward_stage2": 0.5825091004371643, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3837 + }, + { + "completion_length": 9.21875, + "epoch": 0.6725074469949185, + "grad_norm": 13.305832739926364, + "kl": 0.177734375, + "learning_rate": 3.2766777641492904e-07, + "loss": -0.0165, + "reward": 1.3768309354782104, + "reward_std": 0.16466762125492096, + "rewards/accuracy_reward_stage2": 0.40808090567588806, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3838 + }, + { + "completion_length": 6.578125, + "epoch": 0.672682670404766, + "grad_norm": 19.53413402892302, + "kl": 0.0517578125, + "learning_rate": 3.2749255300508143e-07, + "loss": 0.0207, + "reward": 1.587104082107544, + "reward_std": 0.13184921443462372, + "rewards/accuracy_reward_stage2": 0.5871041417121887, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3839 + }, + { + "completion_length": 7.0625, + "epoch": 0.6728578938146136, + "grad_norm": 14.381656777284563, + "kl": 0.130859375, + "learning_rate": 3.2731732959523387e-07, + "loss": -0.021, + "reward": 1.6133270263671875, + "reward_std": 0.206298828125, + "rewards/accuracy_reward_stage2": 0.6445769667625427, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3840 + }, + { + "completion_length": 9.90625, + "epoch": 0.6730331172244612, + "grad_norm": 18.426939341494425, + "kl": 0.140625, + "learning_rate": 3.2714210618538636e-07, + "loss": 0.0211, + "reward": 1.5109727382659912, + "reward_std": 0.1667131632566452, + "rewards/accuracy_reward_stage2": 0.5265976786613464, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3841 + }, + { + "completion_length": 9.140625, + "epoch": 0.6732083406343088, + "grad_norm": 17.32055192337897, + "kl": 0.1826171875, + "learning_rate": 3.269668827755388e-07, + "loss": -0.0152, + "reward": 1.6525171995162964, + "reward_std": 0.357276976108551, + "rewards/accuracy_reward_stage2": 0.6837671399116516, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3842 + }, + { + "completion_length": 10.21875, + "epoch": 0.6733835640441563, + "grad_norm": 23.93115293151798, + "kl": 0.07373046875, + "learning_rate": 3.2679165936569124e-07, + "loss": -0.0037, + "reward": 1.4673311710357666, + "reward_std": 0.38846614956855774, + "rewards/accuracy_reward_stage2": 0.48295605182647705, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3843 + }, + { + "completion_length": 8.78125, + "epoch": 0.6735587874540039, + "grad_norm": 22.95815368304367, + "kl": 0.203125, + "learning_rate": 3.266164359558437e-07, + "loss": 0.0459, + "reward": 1.4174572229385376, + "reward_std": 0.27246612310409546, + "rewards/accuracy_reward_stage2": 0.5580822229385376, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3844 + }, + { + "completion_length": 9.40625, + "epoch": 0.6737340108638514, + "grad_norm": 20.06182804507912, + "kl": 0.08056640625, + "learning_rate": 3.264412125459961e-07, + "loss": -0.0119, + "reward": 1.3923089504241943, + "reward_std": 0.26168495416641235, + "rewards/accuracy_reward_stage2": 0.40793395042419434, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3845 + }, + { + "completion_length": 11.703125, + "epoch": 0.673909234273699, + "grad_norm": 18.47541852117159, + "kl": 0.030517578125, + "learning_rate": 3.2626598913614856e-07, + "loss": 0.0122, + "reward": 1.5444600582122803, + "reward_std": 0.1621989905834198, + "rewards/accuracy_reward_stage2": 0.544460117816925, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3846 + }, + { + "completion_length": 9.25, + "epoch": 0.6740844576835465, + "grad_norm": 18.265132964504144, + "kl": 0.185546875, + "learning_rate": 3.26090765726301e-07, + "loss": -0.0559, + "reward": 1.390181541442871, + "reward_std": 0.19728723168373108, + "rewards/accuracy_reward_stage2": 0.4526815414428711, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3847 + }, + { + "completion_length": 7.5625, + "epoch": 0.6742596810933941, + "grad_norm": 23.277133642814135, + "kl": 0.0625, + "learning_rate": 3.2591554231645344e-07, + "loss": 0.0251, + "reward": 1.5694992542266846, + "reward_std": 0.3129950165748596, + "rewards/accuracy_reward_stage2": 0.5694993138313293, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3848 + }, + { + "completion_length": 8.875, + "epoch": 0.6744349045032416, + "grad_norm": 17.61604651957286, + "kl": 0.0252685546875, + "learning_rate": 3.2574031890660593e-07, + "loss": 0.0101, + "reward": 1.8145318031311035, + "reward_std": 0.18421462178230286, + "rewards/accuracy_reward_stage2": 0.8145317435264587, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3849 + }, + { + "completion_length": 14.28125, + "epoch": 0.6746101279130892, + "grad_norm": 14.546044174712485, + "kl": 0.0291748046875, + "learning_rate": 3.2556509549675837e-07, + "loss": 0.0117, + "reward": 1.296875, + "reward_std": 0.23144522309303284, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3850 + }, + { + "completion_length": 8.390625, + "epoch": 0.6747853513229367, + "grad_norm": 18.886644065465404, + "kl": 0.251953125, + "learning_rate": 3.253898720869108e-07, + "loss": 0.0403, + "reward": 1.5611273050308228, + "reward_std": 0.24465563893318176, + "rewards/accuracy_reward_stage2": 0.5923773050308228, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3851 + }, + { + "completion_length": 11.625, + "epoch": 0.6749605747327843, + "grad_norm": 15.88017314047495, + "kl": 0.09326171875, + "learning_rate": 3.252146486770632e-07, + "loss": 0.004, + "reward": 1.6079230308532715, + "reward_std": 0.19661104679107666, + "rewards/accuracy_reward_stage2": 0.6235479712486267, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3852 + }, + { + "completion_length": 7.0625, + "epoch": 0.6751357981426318, + "grad_norm": 14.757801964566244, + "kl": 0.142578125, + "learning_rate": 3.250394252672157e-07, + "loss": 0.0128, + "reward": 1.6863281726837158, + "reward_std": 0.18864640593528748, + "rewards/accuracy_reward_stage2": 0.7019531726837158, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3853 + }, + { + "completion_length": 8.640625, + "epoch": 0.6753110215524795, + "grad_norm": 16.493447509624776, + "kl": 0.271484375, + "learning_rate": 3.2486420185736813e-07, + "loss": -0.0021, + "reward": 1.39857017993927, + "reward_std": 0.2185536026954651, + "rewards/accuracy_reward_stage2": 0.5704452395439148, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3854 + }, + { + "completion_length": 15.359375, + "epoch": 0.675486244962327, + "grad_norm": 14.82144590858429, + "kl": 0.0712890625, + "learning_rate": 3.2468897844752057e-07, + "loss": 0.0285, + "reward": 1.5174546241760254, + "reward_std": 0.10479126870632172, + "rewards/accuracy_reward_stage2": 0.6424546837806702, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3855 + }, + { + "completion_length": 12.03125, + "epoch": 0.6756614683721746, + "grad_norm": 20.812155729239077, + "kl": 0.140625, + "learning_rate": 3.24513755037673e-07, + "loss": 0.0563, + "reward": 1.5602467060089111, + "reward_std": 0.19248421490192413, + "rewards/accuracy_reward_stage2": 0.6852467656135559, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3856 + }, + { + "completion_length": 7.15625, + "epoch": 0.6758366917820221, + "grad_norm": 11.617742248311606, + "kl": 0.07421875, + "learning_rate": 3.243385316278255e-07, + "loss": 0.0298, + "reward": 1.7109254598617554, + "reward_std": 0.06991486251354218, + "rewards/accuracy_reward_stage2": 0.7109254598617554, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3857 + }, + { + "completion_length": 12.5625, + "epoch": 0.6760119151918697, + "grad_norm": 27.784956148030506, + "kl": 0.25, + "learning_rate": 3.241633082179779e-07, + "loss": 0.0623, + "reward": 1.5572917461395264, + "reward_std": 0.3274396061897278, + "rewards/accuracy_reward_stage2": 0.6979166865348816, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3858 + }, + { + "completion_length": 8.734375, + "epoch": 0.6761871386017172, + "grad_norm": 25.093731646428207, + "kl": 0.08056640625, + "learning_rate": 3.2398808480813033e-07, + "loss": 0.0323, + "reward": 1.471142053604126, + "reward_std": 0.3559247851371765, + "rewards/accuracy_reward_stage2": 0.596142053604126, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3859 + }, + { + "completion_length": 14.96875, + "epoch": 0.6763623620115647, + "grad_norm": 19.326438345530867, + "kl": 0.0986328125, + "learning_rate": 3.2381286139828277e-07, + "loss": 0.0393, + "reward": 1.6141562461853027, + "reward_std": 0.1951741874217987, + "rewards/accuracy_reward_stage2": 0.6141563057899475, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3860 + }, + { + "completion_length": 9.03125, + "epoch": 0.6765375854214123, + "grad_norm": 23.327624059918623, + "kl": 0.212890625, + "learning_rate": 3.2363763798843526e-07, + "loss": 0.0177, + "reward": 1.4712347984313965, + "reward_std": 0.34458082914352417, + "rewards/accuracy_reward_stage2": 0.5024847984313965, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3861 + }, + { + "completion_length": 19.65625, + "epoch": 0.6767128088312598, + "grad_norm": 16.849083072806163, + "kl": 0.13671875, + "learning_rate": 3.234624145785877e-07, + "loss": -0.0213, + "reward": 1.222360372543335, + "reward_std": 0.2840573191642761, + "rewards/accuracy_reward_stage2": 0.37861043214797974, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3862 + }, + { + "completion_length": 11.234375, + "epoch": 0.6768880322411074, + "grad_norm": 17.208273637782654, + "kl": 0.2578125, + "learning_rate": 3.2328719116874014e-07, + "loss": 0.0356, + "reward": 1.5516068935394287, + "reward_std": 0.2445361167192459, + "rewards/accuracy_reward_stage2": 0.7078569531440735, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3863 + }, + { + "completion_length": 15.890625, + "epoch": 0.6770632556509549, + "grad_norm": 18.015938285221996, + "kl": 0.1787109375, + "learning_rate": 3.231119677588926e-07, + "loss": -0.0118, + "reward": 1.4646062850952148, + "reward_std": 0.33400917053222656, + "rewards/accuracy_reward_stage2": 0.4958563446998596, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3864 + }, + { + "completion_length": 14.921875, + "epoch": 0.6772384790608025, + "grad_norm": 18.196032916563585, + "kl": 0.126953125, + "learning_rate": 3.22936744349045e-07, + "loss": 0.0115, + "reward": 1.5467426776885986, + "reward_std": 0.22806578874588013, + "rewards/accuracy_reward_stage2": 0.5623677372932434, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3865 + }, + { + "completion_length": 8.765625, + "epoch": 0.67741370247065, + "grad_norm": 16.52686949186202, + "kl": 0.1728515625, + "learning_rate": 3.2276152093919746e-07, + "loss": -0.0469, + "reward": 1.771558403968811, + "reward_std": 0.23040996491909027, + "rewards/accuracy_reward_stage2": 0.818433403968811, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3866 + }, + { + "completion_length": 11.6875, + "epoch": 0.6775889258804977, + "grad_norm": 19.275094089668283, + "kl": 0.177734375, + "learning_rate": 3.225862975293499e-07, + "loss": 0.0131, + "reward": 1.3561549186706543, + "reward_std": 0.2779897451400757, + "rewards/accuracy_reward_stage2": 0.4967798888683319, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3867 + }, + { + "completion_length": 8.28125, + "epoch": 0.6777641492903452, + "grad_norm": 15.92255726519388, + "kl": 0.10888671875, + "learning_rate": 3.2241107411950234e-07, + "loss": -0.0284, + "reward": 1.4668264389038086, + "reward_std": 0.1967121809720993, + "rewards/accuracy_reward_stage2": 0.6074514389038086, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3868 + }, + { + "completion_length": 6.046875, + "epoch": 0.6779393727001928, + "grad_norm": 11.782657382689344, + "kl": 0.107421875, + "learning_rate": 3.2223585070965483e-07, + "loss": -0.0013, + "reward": 1.78125, + "reward_std": 0.16675157845020294, + "rewards/accuracy_reward_stage2": 0.796875, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3869 + }, + { + "completion_length": 7.09375, + "epoch": 0.6781145961100403, + "grad_norm": 15.969784495706426, + "kl": 0.058837890625, + "learning_rate": 3.2206062729980727e-07, + "loss": 0.0236, + "reward": 1.6320271492004395, + "reward_std": 0.2036266028881073, + "rewards/accuracy_reward_stage2": 0.6320271492004395, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3870 + }, + { + "completion_length": 10.390625, + "epoch": 0.6782898195198879, + "grad_norm": 77.97322350800017, + "kl": 0.625, + "learning_rate": 3.2188540388995966e-07, + "loss": 0.1614, + "reward": 1.2760417461395264, + "reward_std": 0.25043365359306335, + "rewards/accuracy_reward_stage2": 0.4322916567325592, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3871 + }, + { + "completion_length": 6.25, + "epoch": 0.6784650429297354, + "grad_norm": 21.606935492858284, + "kl": 0.400390625, + "learning_rate": 3.217101804801121e-07, + "loss": 0.0334, + "reward": 1.6798467636108398, + "reward_std": 0.38453322649002075, + "rewards/accuracy_reward_stage2": 0.7423468232154846, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3872 + }, + { + "completion_length": 8.3125, + "epoch": 0.678640266339583, + "grad_norm": 17.220294287482876, + "kl": 0.2197265625, + "learning_rate": 3.215349570702646e-07, + "loss": -0.0175, + "reward": 1.521311640739441, + "reward_std": 0.19599059224128723, + "rewards/accuracy_reward_stage2": 0.5681866407394409, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3873 + }, + { + "completion_length": 9.109375, + "epoch": 0.6788154897494305, + "grad_norm": 17.94561290455226, + "kl": 0.18359375, + "learning_rate": 3.2135973366041703e-07, + "loss": 0.0733, + "reward": 1.521716594696045, + "reward_std": 0.17642980813980103, + "rewards/accuracy_reward_stage2": 0.5217165946960449, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3874 + }, + { + "completion_length": 7.296875, + "epoch": 0.6789907131592781, + "grad_norm": 17.68863830864549, + "kl": 0.07666015625, + "learning_rate": 3.2118451025056947e-07, + "loss": 0.0307, + "reward": 1.7461333274841309, + "reward_std": 0.14099617302417755, + "rewards/accuracy_reward_stage2": 0.7461333870887756, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3875 + }, + { + "completion_length": 6.859375, + "epoch": 0.6791659365691256, + "grad_norm": 20.19153973213064, + "kl": 0.1689453125, + "learning_rate": 3.210092868407219e-07, + "loss": 0.0069, + "reward": 1.3125, + "reward_std": 0.2845909297466278, + "rewards/accuracy_reward_stage2": 0.46875, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3876 + }, + { + "completion_length": 8.828125, + "epoch": 0.6793411599789732, + "grad_norm": 18.381875937743548, + "kl": 0.11328125, + "learning_rate": 3.2083406343087435e-07, + "loss": 0.0453, + "reward": 1.6288864612579346, + "reward_std": 0.2961333096027374, + "rewards/accuracy_reward_stage2": 0.6288865208625793, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3877 + }, + { + "completion_length": 7.546875, + "epoch": 0.6795163833888207, + "grad_norm": 20.716311767333977, + "kl": 0.2294921875, + "learning_rate": 3.206588400210268e-07, + "loss": 0.011, + "reward": 1.4247196912765503, + "reward_std": 0.23277902603149414, + "rewards/accuracy_reward_stage2": 0.4559696614742279, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3878 + }, + { + "completion_length": 10.1875, + "epoch": 0.6796916067986684, + "grad_norm": 16.013344269281035, + "kl": 0.08447265625, + "learning_rate": 3.2048361661117923e-07, + "loss": -0.0045, + "reward": 1.6661726236343384, + "reward_std": 0.16137085855007172, + "rewards/accuracy_reward_stage2": 0.6817976236343384, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3879 + }, + { + "completion_length": 10.765625, + "epoch": 0.6798668302085159, + "grad_norm": 17.385526364163912, + "kl": 0.09228515625, + "learning_rate": 3.2030839320133167e-07, + "loss": -0.0072, + "reward": 1.4617502689361572, + "reward_std": 0.17313295602798462, + "rewards/accuracy_reward_stage2": 0.47737520933151245, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3880 + }, + { + "completion_length": 11.078125, + "epoch": 0.6800420536183635, + "grad_norm": 20.93387541319987, + "kl": 0.16015625, + "learning_rate": 3.2013316979148416e-07, + "loss": 0.0201, + "reward": 1.5362987518310547, + "reward_std": 0.2890404462814331, + "rewards/accuracy_reward_stage2": 0.5519237518310547, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3881 + }, + { + "completion_length": 7.625, + "epoch": 0.680217277028211, + "grad_norm": 29.31166403311481, + "kl": 0.41796875, + "learning_rate": 3.199579463816366e-07, + "loss": 0.0073, + "reward": 1.5820767879486084, + "reward_std": 0.3647039234638214, + "rewards/accuracy_reward_stage2": 0.7695767879486084, + "rewards/format_reward_stage1_pointerpad": 0.8125, + "scores/accuracy_reward_stage2": 0.8125, + "step": 3882 + }, + { + "completion_length": 14.9375, + "epoch": 0.6803925004380585, + "grad_norm": 21.82332493151507, + "kl": 0.08642578125, + "learning_rate": 3.1978272297178904e-07, + "loss": -0.0042, + "reward": 1.495707392692566, + "reward_std": 0.266140878200531, + "rewards/accuracy_reward_stage2": 0.6363324522972107, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3883 + }, + { + "completion_length": 12.09375, + "epoch": 0.6805677238479061, + "grad_norm": 18.232970730025155, + "kl": 0.2177734375, + "learning_rate": 3.196074995619414e-07, + "loss": 0.0871, + "reward": 1.3191421031951904, + "reward_std": 0.26958224177360535, + "rewards/accuracy_reward_stage2": 0.44414204359054565, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3884 + }, + { + "completion_length": 12.265625, + "epoch": 0.6807429472577536, + "grad_norm": 19.870742781236203, + "kl": 0.19140625, + "learning_rate": 3.194322761520939e-07, + "loss": 0.0428, + "reward": 1.4023401737213135, + "reward_std": 0.19982674717903137, + "rewards/accuracy_reward_stage2": 0.5273402333259583, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3885 + }, + { + "completion_length": 9.0625, + "epoch": 0.6809181706676012, + "grad_norm": 16.40488727080269, + "kl": 0.283203125, + "learning_rate": 3.1925705274224636e-07, + "loss": 0.0197, + "reward": 1.160224199295044, + "reward_std": 0.1855008602142334, + "rewards/accuracy_reward_stage2": 0.4414742588996887, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3886 + }, + { + "completion_length": 21.421875, + "epoch": 0.6810933940774487, + "grad_norm": 13.552039941001802, + "kl": 0.10986328125, + "learning_rate": 3.190818293323988e-07, + "loss": -0.0005, + "reward": 1.639201045036316, + "reward_std": 0.21364232897758484, + "rewards/accuracy_reward_stage2": 0.6704509854316711, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3887 + }, + { + "completion_length": 12.140625, + "epoch": 0.6812686174872963, + "grad_norm": 19.831530405238684, + "kl": 0.19921875, + "learning_rate": 3.1890660592255124e-07, + "loss": -0.0619, + "reward": 1.6471765041351318, + "reward_std": 0.25535136461257935, + "rewards/accuracy_reward_stage2": 0.7096765637397766, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3888 + }, + { + "completion_length": 17.640625, + "epoch": 0.6814438408971438, + "grad_norm": 24.320226089228928, + "kl": 0.0306396484375, + "learning_rate": 3.1873138251270373e-07, + "loss": 0.0122, + "reward": 1.5085279941558838, + "reward_std": 0.2112589329481125, + "rewards/accuracy_reward_stage2": 0.5085281133651733, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3889 + }, + { + "completion_length": 15.625, + "epoch": 0.6816190643069914, + "grad_norm": 17.509304200429426, + "kl": 0.0673828125, + "learning_rate": 3.185561591028561e-07, + "loss": 0.027, + "reward": 1.5866674184799194, + "reward_std": 0.13666321337223053, + "rewards/accuracy_reward_stage2": 0.5866674184799194, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3890 + }, + { + "completion_length": 8.0, + "epoch": 0.6817942877168389, + "grad_norm": 17.129707485149034, + "kl": 0.076171875, + "learning_rate": 3.1838093569300856e-07, + "loss": 0.0303, + "reward": 1.4643492698669434, + "reward_std": 0.195224791765213, + "rewards/accuracy_reward_stage2": 0.5893491506576538, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3891 + }, + { + "completion_length": 10.25, + "epoch": 0.6819695111266866, + "grad_norm": 19.25229769183077, + "kl": 0.27734375, + "learning_rate": 3.18205712283161e-07, + "loss": 0.0266, + "reward": 1.320204496383667, + "reward_std": 0.3462868928909302, + "rewards/accuracy_reward_stage2": 0.6170794367790222, + "rewards/format_reward_stage1_pointerpad": 0.703125, + "scores/accuracy_reward_stage2": 0.703125, + "step": 3892 + }, + { + "completion_length": 9.109375, + "epoch": 0.6821447345365341, + "grad_norm": 18.117867087783885, + "kl": 0.10400390625, + "learning_rate": 3.180304888733135e-07, + "loss": -0.0014, + "reward": 1.7592573165893555, + "reward_std": 0.22831569612026215, + "rewards/accuracy_reward_stage2": 0.7748823165893555, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3893 + }, + { + "completion_length": 9.328125, + "epoch": 0.6823199579463817, + "grad_norm": 19.771147903621497, + "kl": 0.050537109375, + "learning_rate": 3.1785526546346593e-07, + "loss": 0.0202, + "reward": 1.2551724910736084, + "reward_std": 0.30035412311553955, + "rewards/accuracy_reward_stage2": 0.3801724910736084, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3894 + }, + { + "completion_length": 12.71875, + "epoch": 0.6824951813562292, + "grad_norm": 20.155780290967037, + "kl": 0.123046875, + "learning_rate": 3.1768004205361837e-07, + "loss": 0.005, + "reward": 1.3723958730697632, + "reward_std": 0.3768148124217987, + "rewards/accuracy_reward_stage2": 0.3880208730697632, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3895 + }, + { + "completion_length": 7.296875, + "epoch": 0.6826704047660768, + "grad_norm": 20.313263656604374, + "kl": 0.30859375, + "learning_rate": 3.1750481864377075e-07, + "loss": 0.0369, + "reward": 1.384493350982666, + "reward_std": 0.3127593398094177, + "rewards/accuracy_reward_stage2": 0.5563682317733765, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3896 + }, + { + "completion_length": 12.890625, + "epoch": 0.6828456281759243, + "grad_norm": 23.104763723478158, + "kl": 0.125, + "learning_rate": 3.1732959523392325e-07, + "loss": 0.0501, + "reward": 1.526780128479004, + "reward_std": 0.22299879789352417, + "rewards/accuracy_reward_stage2": 0.5267801284790039, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3897 + }, + { + "completion_length": 11.625, + "epoch": 0.6830208515857719, + "grad_norm": 20.52548099394388, + "kl": 0.2490234375, + "learning_rate": 3.171543718240757e-07, + "loss": -0.0513, + "reward": 1.517575979232788, + "reward_std": 0.3787464499473572, + "rewards/accuracy_reward_stage2": 0.5957009792327881, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3898 + }, + { + "completion_length": 7.1875, + "epoch": 0.6831960749956194, + "grad_norm": 13.175094925157342, + "kl": 0.162109375, + "learning_rate": 3.169791484142281e-07, + "loss": 0.0209, + "reward": 1.5124504566192627, + "reward_std": 0.14380380511283875, + "rewards/accuracy_reward_stage2": 0.5280753970146179, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3899 + }, + { + "completion_length": 7.1875, + "epoch": 0.683371298405467, + "grad_norm": 16.721112796537913, + "kl": 0.055908203125, + "learning_rate": 3.1680392500438056e-07, + "loss": 0.0223, + "reward": 1.6026625633239746, + "reward_std": 0.1928408145904541, + "rewards/accuracy_reward_stage2": 0.6026625037193298, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3900 + }, + { + "completion_length": 10.59375, + "epoch": 0.6835465218153145, + "grad_norm": 19.18941387849459, + "kl": 0.0947265625, + "learning_rate": 3.1662870159453306e-07, + "loss": 0.0, + "reward": 1.66532564163208, + "reward_std": 0.23487676680088043, + "rewards/accuracy_reward_stage2": 0.6809506416320801, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3901 + }, + { + "completion_length": 10.296875, + "epoch": 0.683721745225162, + "grad_norm": 18.117473878505333, + "kl": 0.2197265625, + "learning_rate": 3.164534781846855e-07, + "loss": -0.0005, + "reward": 1.5684740543365479, + "reward_std": 0.21021094918251038, + "rewards/accuracy_reward_stage2": 0.8497240543365479, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3902 + }, + { + "completion_length": 20.421875, + "epoch": 0.6838969686350096, + "grad_norm": 19.232278453453848, + "kl": 0.048583984375, + "learning_rate": 3.162782547748379e-07, + "loss": 0.0194, + "reward": 1.5420706272125244, + "reward_std": 0.17480739951133728, + "rewards/accuracy_reward_stage2": 0.5420706868171692, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3903 + }, + { + "completion_length": 8.421875, + "epoch": 0.6840721920448571, + "grad_norm": 16.92202065403525, + "kl": 0.2177734375, + "learning_rate": 3.161030313649903e-07, + "loss": 0.0555, + "reward": 1.4832316637039185, + "reward_std": 0.22372540831565857, + "rewards/accuracy_reward_stage2": 0.6238567233085632, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3904 + }, + { + "completion_length": 8.328125, + "epoch": 0.6842474154547048, + "grad_norm": 20.351729004974555, + "kl": 0.12109375, + "learning_rate": 3.1592780795514276e-07, + "loss": -0.033, + "reward": 1.6287168264389038, + "reward_std": 0.20220136642456055, + "rewards/accuracy_reward_stage2": 0.6755918860435486, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3905 + }, + { + "completion_length": 12.875, + "epoch": 0.6844226388645523, + "grad_norm": 17.18872343506988, + "kl": 0.043701171875, + "learning_rate": 3.1575258454529526e-07, + "loss": 0.0175, + "reward": 1.587280511856079, + "reward_std": 0.20979759097099304, + "rewards/accuracy_reward_stage2": 0.5872805714607239, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3906 + }, + { + "completion_length": 9.28125, + "epoch": 0.6845978622743999, + "grad_norm": 15.510287644855367, + "kl": 0.06494140625, + "learning_rate": 3.155773611354477e-07, + "loss": 0.026, + "reward": 1.2447917461395264, + "reward_std": 0.19727420806884766, + "rewards/accuracy_reward_stage2": 0.4947916567325592, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3907 + }, + { + "completion_length": 11.8125, + "epoch": 0.6847730856842474, + "grad_norm": 10.721890124991495, + "kl": 0.08935546875, + "learning_rate": 3.1540213772560013e-07, + "loss": -0.0078, + "reward": 1.529199481010437, + "reward_std": 0.1329018622636795, + "rewards/accuracy_reward_stage2": 0.544824481010437, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3908 + }, + { + "completion_length": 10.625, + "epoch": 0.684948309094095, + "grad_norm": 16.564565836577522, + "kl": 0.10400390625, + "learning_rate": 3.152269143157525e-07, + "loss": 0.0249, + "reward": 1.642519474029541, + "reward_std": 0.14972180128097534, + "rewards/accuracy_reward_stage2": 0.658144474029541, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3909 + }, + { + "completion_length": 7.265625, + "epoch": 0.6851235325039425, + "grad_norm": 23.065037716747465, + "kl": 0.1845703125, + "learning_rate": 3.15051690905905e-07, + "loss": 0.0301, + "reward": 1.551328420639038, + "reward_std": 0.3457961678504944, + "rewards/accuracy_reward_stage2": 0.5669534206390381, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3910 + }, + { + "completion_length": 8.703125, + "epoch": 0.6852987559137901, + "grad_norm": 17.467165082127643, + "kl": 0.0986328125, + "learning_rate": 3.1487646749605745e-07, + "loss": -0.0019, + "reward": 1.4705981016159058, + "reward_std": 0.24642279744148254, + "rewards/accuracy_reward_stage2": 0.6112231016159058, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3911 + }, + { + "completion_length": 9.65625, + "epoch": 0.6854739793236376, + "grad_norm": 23.083623670273536, + "kl": 0.1455078125, + "learning_rate": 3.147012440862099e-07, + "loss": 0.0583, + "reward": 1.770646095275879, + "reward_std": 0.15404048562049866, + "rewards/accuracy_reward_stage2": 0.7706459760665894, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3912 + }, + { + "completion_length": 13.796875, + "epoch": 0.6856492027334852, + "grad_norm": 18.169266900451227, + "kl": 0.08935546875, + "learning_rate": 3.1452602067636233e-07, + "loss": -0.0083, + "reward": 1.6190228462219238, + "reward_std": 0.23725715279579163, + "rewards/accuracy_reward_stage2": 0.6346479058265686, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3913 + }, + { + "completion_length": 23.46875, + "epoch": 0.6858244261433327, + "grad_norm": 17.97592684350171, + "kl": 0.07666015625, + "learning_rate": 3.143507972665148e-07, + "loss": 0.0306, + "reward": 1.5842111110687256, + "reward_std": 0.13952378928661346, + "rewards/accuracy_reward_stage2": 0.584210991859436, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3914 + }, + { + "completion_length": 10.15625, + "epoch": 0.6859996495531803, + "grad_norm": 16.534376820236705, + "kl": 0.0732421875, + "learning_rate": 3.1417557385666727e-07, + "loss": 0.0293, + "reward": 1.6091794967651367, + "reward_std": 0.19951403141021729, + "rewards/accuracy_reward_stage2": 0.6091794371604919, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3915 + }, + { + "completion_length": 12.546875, + "epoch": 0.6861748729630278, + "grad_norm": 11.26361851638084, + "kl": 0.0703125, + "learning_rate": 3.1400035044681965e-07, + "loss": -0.0161, + "reward": 1.7864649295806885, + "reward_std": 0.1284366399049759, + "rewards/accuracy_reward_stage2": 0.8020899891853333, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3916 + }, + { + "completion_length": 10.4375, + "epoch": 0.6863500963728754, + "grad_norm": 25.026176432044366, + "kl": 0.416015625, + "learning_rate": 3.138251270369721e-07, + "loss": 0.0294, + "reward": 1.6315479278564453, + "reward_std": 0.36400657892227173, + "rewards/accuracy_reward_stage2": 0.6940478086471558, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3917 + }, + { + "completion_length": 25.125, + "epoch": 0.686525319782723, + "grad_norm": 12.195645963577283, + "kl": 0.21484375, + "learning_rate": 3.136499036271246e-07, + "loss": -0.0283, + "reward": 1.365476131439209, + "reward_std": 0.2613718509674072, + "rewards/accuracy_reward_stage2": 0.5373511910438538, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 3918 + }, + { + "completion_length": 11.78125, + "epoch": 0.6867005431925706, + "grad_norm": 23.591685414442438, + "kl": 0.283203125, + "learning_rate": 3.13474680217277e-07, + "loss": -0.0139, + "reward": 1.604864478111267, + "reward_std": 0.33831846714019775, + "rewards/accuracy_reward_stage2": 0.6673645377159119, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3919 + }, + { + "completion_length": 8.765625, + "epoch": 0.6868757666024181, + "grad_norm": 12.687983316888051, + "kl": 0.06494140625, + "learning_rate": 3.1329945680742946e-07, + "loss": 0.026, + "reward": 1.7347902059555054, + "reward_std": 0.18083685636520386, + "rewards/accuracy_reward_stage2": 0.7347902059555054, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3920 + }, + { + "completion_length": 12.515625, + "epoch": 0.6870509900122657, + "grad_norm": 35.26355604712914, + "kl": 0.2373046875, + "learning_rate": 3.131242333975819e-07, + "loss": 0.0171, + "reward": 1.3750255107879639, + "reward_std": 0.27525320649147034, + "rewards/accuracy_reward_stage2": 0.5312755703926086, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3921 + }, + { + "completion_length": 9.046875, + "epoch": 0.6872262134221132, + "grad_norm": 17.562680111645903, + "kl": 0.1357421875, + "learning_rate": 3.1294900998773434e-07, + "loss": 0.0169, + "reward": 1.6735283136367798, + "reward_std": 0.17841938138008118, + "rewards/accuracy_reward_stage2": 0.6891533732414246, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3922 + }, + { + "completion_length": 8.78125, + "epoch": 0.6874014368319608, + "grad_norm": 11.789574838636355, + "kl": 0.080078125, + "learning_rate": 3.127737865778868e-07, + "loss": -0.0122, + "reward": 1.745296597480774, + "reward_std": 0.09620551019906998, + "rewards/accuracy_reward_stage2": 0.7609216570854187, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3923 + }, + { + "completion_length": 10.5, + "epoch": 0.6875766602418083, + "grad_norm": 17.03907545161025, + "kl": 0.11865234375, + "learning_rate": 3.125985631680392e-07, + "loss": 0.02, + "reward": 1.565126657485962, + "reward_std": 0.22917722165584564, + "rewards/accuracy_reward_stage2": 0.5807517170906067, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3924 + }, + { + "completion_length": 9.1875, + "epoch": 0.6877518836516558, + "grad_norm": 19.054671720664587, + "kl": 0.2275390625, + "learning_rate": 3.1242333975819166e-07, + "loss": 0.0302, + "reward": 1.586341142654419, + "reward_std": 0.25396132469177246, + "rewards/accuracy_reward_stage2": 0.617591142654419, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3925 + }, + { + "completion_length": 5.671875, + "epoch": 0.6879271070615034, + "grad_norm": 21.235964599476986, + "kl": 0.08349609375, + "learning_rate": 3.1224811634834415e-07, + "loss": 0.0333, + "reward": 1.5080132484436035, + "reward_std": 0.18352213501930237, + "rewards/accuracy_reward_stage2": 0.5080131888389587, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3926 + }, + { + "completion_length": 9.21875, + "epoch": 0.6881023304713509, + "grad_norm": 19.730500202294046, + "kl": 0.0791015625, + "learning_rate": 3.120728929384966e-07, + "loss": 0.0317, + "reward": 1.629475712776184, + "reward_std": 0.20445698499679565, + "rewards/accuracy_reward_stage2": 0.6294757127761841, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3927 + }, + { + "completion_length": 6.9375, + "epoch": 0.6882775538811985, + "grad_norm": 18.653319443582067, + "kl": 0.302734375, + "learning_rate": 3.11897669528649e-07, + "loss": 0.0434, + "reward": 1.310603141784668, + "reward_std": 0.3170985281467438, + "rewards/accuracy_reward_stage2": 0.5918530821800232, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 3928 + }, + { + "completion_length": 10.609375, + "epoch": 0.688452777291046, + "grad_norm": 20.47469033235731, + "kl": 0.1328125, + "learning_rate": 3.117224461188014e-07, + "loss": 0.0533, + "reward": 1.6717946529388428, + "reward_std": 0.16436317563056946, + "rewards/accuracy_reward_stage2": 0.6717947721481323, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3929 + }, + { + "completion_length": 20.15625, + "epoch": 0.6886280007008937, + "grad_norm": 19.376279526164858, + "kl": 0.2216796875, + "learning_rate": 3.115472227089539e-07, + "loss": 0.0228, + "reward": 1.332094430923462, + "reward_std": 0.16052697598934174, + "rewards/accuracy_reward_stage2": 0.48834434151649475, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3930 + }, + { + "completion_length": 7.5625, + "epoch": 0.6888032241107412, + "grad_norm": 19.79598180371833, + "kl": 0.271484375, + "learning_rate": 3.1137199929910635e-07, + "loss": 0.0384, + "reward": 1.54587721824646, + "reward_std": 0.3018754720687866, + "rewards/accuracy_reward_stage2": 0.7021272778511047, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3931 + }, + { + "completion_length": 16.921875, + "epoch": 0.6889784475205888, + "grad_norm": 13.835350571440522, + "kl": 0.1494140625, + "learning_rate": 3.111967758892588e-07, + "loss": 0.016, + "reward": 1.5475966930389404, + "reward_std": 0.15770044922828674, + "rewards/accuracy_reward_stage2": 0.8132216930389404, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 3932 + }, + { + "completion_length": 8.734375, + "epoch": 0.6891536709304363, + "grad_norm": 20.491047232136104, + "kl": 0.185546875, + "learning_rate": 3.1102155247941123e-07, + "loss": 0.074, + "reward": 1.6471889019012451, + "reward_std": 0.18342958390712738, + "rewards/accuracy_reward_stage2": 0.7721887826919556, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3933 + }, + { + "completion_length": 14.265625, + "epoch": 0.6893288943402839, + "grad_norm": 14.746120338904305, + "kl": 0.171875, + "learning_rate": 3.108463290695637e-07, + "loss": -0.0042, + "reward": 1.4174625873565674, + "reward_std": 0.30715838074684143, + "rewards/accuracy_reward_stage2": 0.5737125873565674, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3934 + }, + { + "completion_length": 13.953125, + "epoch": 0.6895041177501314, + "grad_norm": 15.878902923650905, + "kl": 0.1328125, + "learning_rate": 3.106711056597161e-07, + "loss": -0.0028, + "reward": 1.6495393514633179, + "reward_std": 0.2503609359264374, + "rewards/accuracy_reward_stage2": 0.6807893514633179, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3935 + }, + { + "completion_length": 10.796875, + "epoch": 0.689679341159979, + "grad_norm": 18.115861963025267, + "kl": 0.09814453125, + "learning_rate": 3.1049588224986855e-07, + "loss": 0.0394, + "reward": 1.5655012130737305, + "reward_std": 0.3471730351448059, + "rewards/accuracy_reward_stage2": 0.5655011534690857, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3936 + }, + { + "completion_length": 7.40625, + "epoch": 0.6898545645698265, + "grad_norm": 17.574990338644515, + "kl": 0.181640625, + "learning_rate": 3.10320658840021e-07, + "loss": -0.0157, + "reward": 1.5906562805175781, + "reward_std": 0.34753066301345825, + "rewards/accuracy_reward_stage2": 0.6219062209129333, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3937 + }, + { + "completion_length": 7.0, + "epoch": 0.6900297879796741, + "grad_norm": 19.48735881575915, + "kl": 0.1064453125, + "learning_rate": 3.101454354301735e-07, + "loss": -0.0014, + "reward": 1.5208333730697632, + "reward_std": 0.24168574810028076, + "rewards/accuracy_reward_stage2": 0.5364583730697632, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3938 + }, + { + "completion_length": 10.65625, + "epoch": 0.6902050113895216, + "grad_norm": 17.762705769766715, + "kl": 0.055908203125, + "learning_rate": 3.099702120203259e-07, + "loss": 0.0223, + "reward": 1.5739235877990723, + "reward_std": 0.20269808173179626, + "rewards/accuracy_reward_stage2": 0.5739235877990723, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3939 + }, + { + "completion_length": 10.6875, + "epoch": 0.6903802347993692, + "grad_norm": 20.91996892422379, + "kl": 0.25390625, + "learning_rate": 3.0979498861047836e-07, + "loss": 0.0024, + "reward": 1.3960648775100708, + "reward_std": 0.30436623096466064, + "rewards/accuracy_reward_stage2": 0.4429398775100708, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3940 + }, + { + "completion_length": 12.375, + "epoch": 0.6905554582092167, + "grad_norm": 26.349980975779463, + "kl": 0.08740234375, + "learning_rate": 3.0961976520063075e-07, + "loss": -0.0004, + "reward": 1.816171646118164, + "reward_std": 0.23692914843559265, + "rewards/accuracy_reward_stage2": 0.8317966461181641, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3941 + }, + { + "completion_length": 9.34375, + "epoch": 0.6907306816190643, + "grad_norm": 19.432604856174976, + "kl": 0.06787109375, + "learning_rate": 3.0944454179078324e-07, + "loss": 0.0271, + "reward": 1.5397982597351074, + "reward_std": 0.2259722501039505, + "rewards/accuracy_reward_stage2": 0.5397982001304626, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3942 + }, + { + "completion_length": 12.109375, + "epoch": 0.6909059050289119, + "grad_norm": 22.676757328029613, + "kl": 0.1728515625, + "learning_rate": 3.092693183809357e-07, + "loss": 0.0284, + "reward": 1.4862951040267944, + "reward_std": 0.28705430030822754, + "rewards/accuracy_reward_stage2": 0.6269201040267944, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3943 + }, + { + "completion_length": 28.703125, + "epoch": 0.6910811284387595, + "grad_norm": 14.315674866282164, + "kl": 0.09130859375, + "learning_rate": 3.090940949710881e-07, + "loss": -0.0077, + "reward": 1.6648551225662231, + "reward_std": 0.13941612839698792, + "rewards/accuracy_reward_stage2": 0.6804801225662231, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3944 + }, + { + "completion_length": 11.734375, + "epoch": 0.691256351848607, + "grad_norm": 18.54880608586595, + "kl": 0.1591796875, + "learning_rate": 3.0891887156124056e-07, + "loss": 0.0196, + "reward": 1.5700504779815674, + "reward_std": 0.18866734206676483, + "rewards/accuracy_reward_stage2": 0.5856754779815674, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3945 + }, + { + "completion_length": 10.40625, + "epoch": 0.6914315752584546, + "grad_norm": 20.708798753313303, + "kl": 0.255859375, + "learning_rate": 3.0874364815139305e-07, + "loss": -0.045, + "reward": 1.4974335432052612, + "reward_std": 0.3221808075904846, + "rewards/accuracy_reward_stage2": 0.5599335432052612, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 3946 + }, + { + "completion_length": 11.921875, + "epoch": 0.6916067986683021, + "grad_norm": 17.309036654287304, + "kl": 0.11328125, + "learning_rate": 3.0856842474154544e-07, + "loss": 0.0454, + "reward": 1.5039632320404053, + "reward_std": 0.2883744239807129, + "rewards/accuracy_reward_stage2": 0.6289632320404053, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3947 + }, + { + "completion_length": 7.96875, + "epoch": 0.6917820220781496, + "grad_norm": 20.163204338819533, + "kl": 0.185546875, + "learning_rate": 3.083932013316979e-07, + "loss": 0.0742, + "reward": 1.5174355506896973, + "reward_std": 0.1934971660375595, + "rewards/accuracy_reward_stage2": 0.6424355506896973, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3948 + }, + { + "completion_length": 8.484375, + "epoch": 0.6919572454879972, + "grad_norm": 23.202422932170183, + "kl": 0.1689453125, + "learning_rate": 3.082179779218503e-07, + "loss": -0.0156, + "reward": 1.4502408504486084, + "reward_std": 0.2823672890663147, + "rewards/accuracy_reward_stage2": 0.4971158802509308, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3949 + }, + { + "completion_length": 9.375, + "epoch": 0.6921324688978447, + "grad_norm": 18.667771850263488, + "kl": 0.2158203125, + "learning_rate": 3.080427545120028e-07, + "loss": -0.0125, + "reward": 1.7800071239471436, + "reward_std": 0.2526581585407257, + "rewards/accuracy_reward_stage2": 0.8268821835517883, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3950 + }, + { + "completion_length": 12.234375, + "epoch": 0.6923076923076923, + "grad_norm": 21.12215208307982, + "kl": 0.1025390625, + "learning_rate": 3.0786753110215525e-07, + "loss": 0.0409, + "reward": 1.3765630722045898, + "reward_std": 0.2770848870277405, + "rewards/accuracy_reward_stage2": 0.37656310200691223, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3951 + }, + { + "completion_length": 6.953125, + "epoch": 0.6924829157175398, + "grad_norm": 21.502918541623767, + "kl": 0.1767578125, + "learning_rate": 3.076923076923077e-07, + "loss": 0.0267, + "reward": 1.59840989112854, + "reward_std": 0.3457435965538025, + "rewards/accuracy_reward_stage2": 0.6140349507331848, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3952 + }, + { + "completion_length": 6.59375, + "epoch": 0.6926581391273874, + "grad_norm": 20.384638028256372, + "kl": 0.134765625, + "learning_rate": 3.0751708428246013e-07, + "loss": 0.0121, + "reward": 1.7731083631515503, + "reward_std": 0.2765722870826721, + "rewards/accuracy_reward_stage2": 0.8043583631515503, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3953 + }, + { + "completion_length": 6.421875, + "epoch": 0.6928333625372349, + "grad_norm": 12.88207861312547, + "kl": 0.07373046875, + "learning_rate": 3.0734186087261257e-07, + "loss": -0.0027, + "reward": 1.7528049945831299, + "reward_std": 0.12491665780544281, + "rewards/accuracy_reward_stage2": 0.7684298753738403, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3954 + }, + { + "completion_length": 9.921875, + "epoch": 0.6930085859470825, + "grad_norm": 18.404849601417556, + "kl": 0.1630859375, + "learning_rate": 3.07166637462765e-07, + "loss": 0.008, + "reward": 1.631026029586792, + "reward_std": 0.31937503814697266, + "rewards/accuracy_reward_stage2": 0.662276029586792, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3955 + }, + { + "completion_length": 11.984375, + "epoch": 0.6931838093569301, + "grad_norm": 18.07342096506507, + "kl": 0.166015625, + "learning_rate": 3.0699141405291745e-07, + "loss": 0.0023, + "reward": 1.658907413482666, + "reward_std": 0.27549779415130615, + "rewards/accuracy_reward_stage2": 0.6901572942733765, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3956 + }, + { + "completion_length": 8.78125, + "epoch": 0.6933590327667777, + "grad_norm": 15.03471222250481, + "kl": 0.1240234375, + "learning_rate": 3.068161906430699e-07, + "loss": -0.035, + "reward": 1.6224809885025024, + "reward_std": 0.209599107503891, + "rewards/accuracy_reward_stage2": 0.6537309885025024, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3957 + }, + { + "completion_length": 14.84375, + "epoch": 0.6935342561766252, + "grad_norm": 17.05975382067552, + "kl": 0.03662109375, + "learning_rate": 3.066409672332224e-07, + "loss": -0.0187, + "reward": 1.6875, + "reward_std": 0.213067427277565, + "rewards/accuracy_reward_stage2": 0.703125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3958 + }, + { + "completion_length": 9.53125, + "epoch": 0.6937094795864728, + "grad_norm": 19.50720645745515, + "kl": 0.07275390625, + "learning_rate": 3.064657438233748e-07, + "loss": 0.0291, + "reward": 1.7259900569915771, + "reward_std": 0.3334110379219055, + "rewards/accuracy_reward_stage2": 0.7259901165962219, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3959 + }, + { + "completion_length": 12.0625, + "epoch": 0.6938847029963203, + "grad_norm": 15.904502289401634, + "kl": 0.09521484375, + "learning_rate": 3.062905204135272e-07, + "loss": -0.0039, + "reward": 1.599704623222351, + "reward_std": 0.1589922308921814, + "rewards/accuracy_reward_stage2": 0.6153296232223511, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3960 + }, + { + "completion_length": 9.203125, + "epoch": 0.6940599264061679, + "grad_norm": 21.180187694098507, + "kl": 0.2412109375, + "learning_rate": 3.0611529700367965e-07, + "loss": 0.0627, + "reward": 1.4935078620910645, + "reward_std": 0.22299349308013916, + "rewards/accuracy_reward_stage2": 0.6341328620910645, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3961 + }, + { + "completion_length": 7.71875, + "epoch": 0.6942351498160154, + "grad_norm": 13.024524663270084, + "kl": 0.1484375, + "learning_rate": 3.0594007359383214e-07, + "loss": -0.0521, + "reward": 1.5260417461395264, + "reward_std": 0.2162405401468277, + "rewards/accuracy_reward_stage2": 0.5729166865348816, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3962 + }, + { + "completion_length": 6.5, + "epoch": 0.694410373225863, + "grad_norm": 12.515771796910617, + "kl": 0.1103515625, + "learning_rate": 3.057648501839846e-07, + "loss": -0.0185, + "reward": 1.783489465713501, + "reward_std": 0.09697789698839188, + "rewards/accuracy_reward_stage2": 0.8147395849227905, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3963 + }, + { + "completion_length": 9.34375, + "epoch": 0.6945855966357105, + "grad_norm": 16.101834277694934, + "kl": 0.1943359375, + "learning_rate": 3.05589626774137e-07, + "loss": -0.0135, + "reward": 1.6511366367340088, + "reward_std": 0.21041421592235565, + "rewards/accuracy_reward_stage2": 0.698011577129364, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3964 + }, + { + "completion_length": 11.703125, + "epoch": 0.6947608200455581, + "grad_norm": 20.639096960569315, + "kl": 0.1552734375, + "learning_rate": 3.0541440336428946e-07, + "loss": 0.0404, + "reward": 1.6338541507720947, + "reward_std": 0.2505960464477539, + "rewards/accuracy_reward_stage2": 0.7588541507720947, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3965 + }, + { + "completion_length": 7.921875, + "epoch": 0.6949360434554056, + "grad_norm": 25.032954546058857, + "kl": 0.2490234375, + "learning_rate": 3.052391799544419e-07, + "loss": -0.037, + "reward": 1.5356470346450806, + "reward_std": 0.3018898367881775, + "rewards/accuracy_reward_stage2": 0.6137720346450806, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3966 + }, + { + "completion_length": 9.125, + "epoch": 0.6951112668652532, + "grad_norm": 29.25886863913636, + "kl": 0.08154296875, + "learning_rate": 3.0506395654459434e-07, + "loss": 0.0327, + "reward": 1.6015243530273438, + "reward_std": 0.2491273283958435, + "rewards/accuracy_reward_stage2": 0.6015242338180542, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3967 + }, + { + "completion_length": 7.265625, + "epoch": 0.6952864902751007, + "grad_norm": 28.75762040222275, + "kl": 0.138671875, + "learning_rate": 3.048887331347468e-07, + "loss": -0.0252, + "reward": 1.357391595840454, + "reward_std": 0.20943915843963623, + "rewards/accuracy_reward_stage2": 0.6542665362358093, + "rewards/format_reward_stage1_pointerpad": 0.703125, + "scores/accuracy_reward_stage2": 0.703125, + "step": 3968 + }, + { + "completion_length": 11.0, + "epoch": 0.6954617136849484, + "grad_norm": 17.793314482250434, + "kl": 0.056396484375, + "learning_rate": 3.047135097248992e-07, + "loss": 0.0226, + "reward": 1.714925765991211, + "reward_std": 0.18759344518184662, + "rewards/accuracy_reward_stage2": 0.7149257063865662, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3969 + }, + { + "completion_length": 11.890625, + "epoch": 0.6956369370947959, + "grad_norm": 10.884958540757461, + "kl": 0.162109375, + "learning_rate": 3.045382863150517e-07, + "loss": 0.0361, + "reward": 1.59375, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward_stage2": 0.734375, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3970 + }, + { + "completion_length": 10.609375, + "epoch": 0.6958121605046435, + "grad_norm": 19.301987640237314, + "kl": 0.1025390625, + "learning_rate": 3.0436306290520415e-07, + "loss": 0.0115, + "reward": 1.6933300495147705, + "reward_std": 0.17335617542266846, + "rewards/accuracy_reward_stage2": 0.7089550495147705, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3971 + }, + { + "completion_length": 9.375, + "epoch": 0.695987383914491, + "grad_norm": 17.800448563142357, + "kl": 0.1787109375, + "learning_rate": 3.041878394953566e-07, + "loss": -0.0171, + "reward": 1.5350027084350586, + "reward_std": 0.30478453636169434, + "rewards/accuracy_reward_stage2": 0.5818777084350586, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3972 + }, + { + "completion_length": 8.171875, + "epoch": 0.6961626073243385, + "grad_norm": 17.08748803743485, + "kl": 0.1298828125, + "learning_rate": 3.04012616085509e-07, + "loss": 0.0278, + "reward": 1.4267973899841309, + "reward_std": 0.2559486925601959, + "rewards/accuracy_reward_stage2": 0.5674223899841309, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3973 + }, + { + "completion_length": 10.5625, + "epoch": 0.6963378307341861, + "grad_norm": 17.388386742225734, + "kl": 0.287109375, + "learning_rate": 3.038373926756614e-07, + "loss": 0.1151, + "reward": 1.4651490449905396, + "reward_std": 0.23490217328071594, + "rewards/accuracy_reward_stage2": 0.7151491641998291, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 3974 + }, + { + "completion_length": 11.765625, + "epoch": 0.6965130541440336, + "grad_norm": 14.604925955241491, + "kl": 0.027587890625, + "learning_rate": 3.036621692658139e-07, + "loss": 0.011, + "reward": 1.4230644702911377, + "reward_std": 0.17332643270492554, + "rewards/accuracy_reward_stage2": 0.4230644702911377, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3975 + }, + { + "completion_length": 8.34375, + "epoch": 0.6966882775538812, + "grad_norm": 19.10335138910953, + "kl": 0.06884765625, + "learning_rate": 3.0348694585596635e-07, + "loss": -0.0135, + "reward": 1.392409324645996, + "reward_std": 0.24867044389247894, + "rewards/accuracy_reward_stage2": 0.40803423523902893, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3976 + }, + { + "completion_length": 8.59375, + "epoch": 0.6968635009637287, + "grad_norm": 13.149312735503395, + "kl": 0.09423828125, + "learning_rate": 3.033117224461188e-07, + "loss": -0.0378, + "reward": 1.6302083730697632, + "reward_std": 0.24144160747528076, + "rewards/accuracy_reward_stage2": 0.6614583730697632, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3977 + }, + { + "completion_length": 10.9375, + "epoch": 0.6970387243735763, + "grad_norm": 15.296151278621773, + "kl": 0.1728515625, + "learning_rate": 3.031364990362713e-07, + "loss": -0.0421, + "reward": 1.8066771030426025, + "reward_std": 0.2580149173736572, + "rewards/accuracy_reward_stage2": 0.8535521030426025, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3978 + }, + { + "completion_length": 8.265625, + "epoch": 0.6972139477834238, + "grad_norm": 14.792175216235439, + "kl": 0.130859375, + "learning_rate": 3.0296127562642367e-07, + "loss": 0.0522, + "reward": 1.4834437370300293, + "reward_std": 0.093760646879673, + "rewards/accuracy_reward_stage2": 0.6084437966346741, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3979 + }, + { + "completion_length": 9.546875, + "epoch": 0.6973891711932714, + "grad_norm": 15.189126749746107, + "kl": 0.09326171875, + "learning_rate": 3.027860522165761e-07, + "loss": -0.0068, + "reward": 1.5334163904190063, + "reward_std": 0.1791066825389862, + "rewards/accuracy_reward_stage2": 0.5490414500236511, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3980 + }, + { + "completion_length": 11.203125, + "epoch": 0.697564394603119, + "grad_norm": 16.051310339535767, + "kl": 0.16796875, + "learning_rate": 3.0261082880672855e-07, + "loss": -0.0004, + "reward": 1.7741138935089111, + "reward_std": 0.11385900527238846, + "rewards/accuracy_reward_stage2": 0.8209889531135559, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3981 + }, + { + "completion_length": 10.859375, + "epoch": 0.6977396180129666, + "grad_norm": 18.411013346263637, + "kl": 0.259765625, + "learning_rate": 3.02435605396881e-07, + "loss": -0.0625, + "reward": 1.678377389907837, + "reward_std": 0.2788037061691284, + "rewards/accuracy_reward_stage2": 0.7565024495124817, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3982 + }, + { + "completion_length": 14.15625, + "epoch": 0.6979148414228141, + "grad_norm": 18.396572064637514, + "kl": 0.06298828125, + "learning_rate": 3.022603819870335e-07, + "loss": -0.0166, + "reward": 1.6920890808105469, + "reward_std": 0.2676393389701843, + "rewards/accuracy_reward_stage2": 0.7077139616012573, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3983 + }, + { + "completion_length": 9.015625, + "epoch": 0.6980900648326617, + "grad_norm": 15.892165582034131, + "kl": 0.07373046875, + "learning_rate": 3.020851585771859e-07, + "loss": 0.0296, + "reward": 1.5303912162780762, + "reward_std": 0.1921355128288269, + "rewards/accuracy_reward_stage2": 0.5303913354873657, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3984 + }, + { + "completion_length": 10.0625, + "epoch": 0.6982652882425092, + "grad_norm": 22.291782886833108, + "kl": 0.1728515625, + "learning_rate": 3.0190993516733836e-07, + "loss": 0.0251, + "reward": 1.3862011432647705, + "reward_std": 0.3098811209201813, + "rewards/accuracy_reward_stage2": 0.5268262028694153, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3985 + }, + { + "completion_length": 9.640625, + "epoch": 0.6984405116523568, + "grad_norm": 16.805373534048258, + "kl": 0.08984375, + "learning_rate": 3.0173471175749074e-07, + "loss": 0.036, + "reward": 1.4264421463012695, + "reward_std": 0.14173433184623718, + "rewards/accuracy_reward_stage2": 0.4264422655105591, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3986 + }, + { + "completion_length": 11.671875, + "epoch": 0.6986157350622043, + "grad_norm": 30.59496168350246, + "kl": 0.2109375, + "learning_rate": 3.0155948834764324e-07, + "loss": -0.0151, + "reward": 1.6473720073699951, + "reward_std": 0.28604885935783386, + "rewards/accuracy_reward_stage2": 0.6942470669746399, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 3987 + }, + { + "completion_length": 7.6875, + "epoch": 0.6987909584720519, + "grad_norm": 20.906417103919768, + "kl": 0.09619140625, + "learning_rate": 3.013842649377957e-07, + "loss": 0.0108, + "reward": 1.443253517150879, + "reward_std": 0.20594848692417145, + "rewards/accuracy_reward_stage2": 0.45887845754623413, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3988 + }, + { + "completion_length": 32.0625, + "epoch": 0.6989661818818994, + "grad_norm": 19.969257114079817, + "kl": 0.0380859375, + "learning_rate": 3.012090415279481e-07, + "loss": 0.0152, + "reward": 1.5706324577331543, + "reward_std": 0.09524843841791153, + "rewards/accuracy_reward_stage2": 0.5706325769424438, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3989 + }, + { + "completion_length": 13.515625, + "epoch": 0.699141405291747, + "grad_norm": 16.937478051824815, + "kl": 0.0791015625, + "learning_rate": 3.0103381811810056e-07, + "loss": -0.0126, + "reward": 1.5109663009643555, + "reward_std": 0.1787455677986145, + "rewards/accuracy_reward_stage2": 0.5265913605690002, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3990 + }, + { + "completion_length": 11.8125, + "epoch": 0.6993166287015945, + "grad_norm": 17.999730551940655, + "kl": 0.10302734375, + "learning_rate": 3.0085859470825305e-07, + "loss": -0.0447, + "reward": 1.6739494800567627, + "reward_std": 0.30125704407691956, + "rewards/accuracy_reward_stage2": 0.8301993608474731, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3991 + }, + { + "completion_length": 8.78125, + "epoch": 0.699491852111442, + "grad_norm": 20.101459352683804, + "kl": 0.251953125, + "learning_rate": 3.0068337129840543e-07, + "loss": 0.0563, + "reward": 1.437328577041626, + "reward_std": 0.2033752202987671, + "rewards/accuracy_reward_stage2": 0.577953577041626, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3992 + }, + { + "completion_length": 8.015625, + "epoch": 0.6996670755212896, + "grad_norm": 16.72200093482387, + "kl": 0.296875, + "learning_rate": 3.005081478885579e-07, + "loss": -0.0461, + "reward": 1.5274182558059692, + "reward_std": 0.2606680989265442, + "rewards/accuracy_reward_stage2": 0.6055432558059692, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 3993 + }, + { + "completion_length": 8.140625, + "epoch": 0.6998422989311373, + "grad_norm": 15.525598036631733, + "kl": 0.09423828125, + "learning_rate": 3.003329244787103e-07, + "loss": 0.0168, + "reward": 1.5954310894012451, + "reward_std": 0.21913869678974152, + "rewards/accuracy_reward_stage2": 0.6110560297966003, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3994 + }, + { + "completion_length": 13.515625, + "epoch": 0.7000175223409848, + "grad_norm": 24.49869160495708, + "kl": 0.1826171875, + "learning_rate": 3.001577010688628e-07, + "loss": -0.007, + "reward": 1.475322961807251, + "reward_std": 0.29399049282073975, + "rewards/accuracy_reward_stage2": 0.5065730214118958, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 3995 + }, + { + "completion_length": 8.765625, + "epoch": 0.7001927457508323, + "grad_norm": 21.277096836973257, + "kl": 0.16015625, + "learning_rate": 2.9998247765901525e-07, + "loss": 0.0458, + "reward": 1.388684630393982, + "reward_std": 0.29958435893058777, + "rewards/accuracy_reward_stage2": 0.5136846303939819, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 3996 + }, + { + "completion_length": 11.875, + "epoch": 0.7003679691606799, + "grad_norm": 17.36410670374809, + "kl": 0.171875, + "learning_rate": 2.998072542491677e-07, + "loss": 0.0297, + "reward": 1.2950433492660522, + "reward_std": 0.21796312928199768, + "rewards/accuracy_reward_stage2": 0.43566828966140747, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 3997 + }, + { + "completion_length": 10.671875, + "epoch": 0.7005431925705274, + "grad_norm": 19.323163806534783, + "kl": 0.21875, + "learning_rate": 2.9963203083932007e-07, + "loss": 0.0144, + "reward": 1.4738521575927734, + "reward_std": 0.3393661379814148, + "rewards/accuracy_reward_stage2": 0.6301021575927734, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 3998 + }, + { + "completion_length": 8.890625, + "epoch": 0.700718415980375, + "grad_norm": 20.288072612266532, + "kl": 0.11328125, + "learning_rate": 2.9945680742947257e-07, + "loss": 0.0009, + "reward": 1.7728909254074097, + "reward_std": 0.1919422149658203, + "rewards/accuracy_reward_stage2": 0.7885159254074097, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 3999 + }, + { + "completion_length": 10.140625, + "epoch": 0.7008936393902225, + "grad_norm": 19.459808071686343, + "kl": 0.09619140625, + "learning_rate": 2.99281584019625e-07, + "loss": -0.0037, + "reward": 1.4524281024932861, + "reward_std": 0.24864047765731812, + "rewards/accuracy_reward_stage2": 0.46805307269096375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 4000 + } + ], + "logging_steps": 1.0, + "max_steps": 5707, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}