diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,50433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.75830078125, + "eval_steps": 500, + "global_step": 3600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 423.6328125, + "epoch": 0.00048828125, + "grad_norm": 1.6967331082339958, + "kl": 0.0, + "learning_rate": 9.998779296875e-07, + "loss": -0.0, + "reward": 1.3786234855651855, + "reward_std": 0.4677655100822449, + "rewards/format_reward": 0.8671875, + "rewards/ocr_reward": 0.5114360153675079, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.5859375, + "epoch": 0.0009765625, + "grad_norm": 3.2468303824500158, + "kl": 0.0004444122314453125, + "learning_rate": 9.99755859375e-07, + "loss": 0.0, + "reward": 1.34132719039917, + "reward_std": 0.22886180132627487, + "rewards/format_reward": 0.8671875, + "rewards/ocr_reward": 0.4741397053003311, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.9609375, + "epoch": 0.00146484375, + "grad_norm": 4.297129503903299, + "kl": 0.0007305145263671875, + "learning_rate": 9.996337890625e-07, + "loss": 0.0, + "reward": 1.3343781232833862, + "reward_std": 0.3735136389732361, + "rewards/format_reward": 0.890625, + "rewards/ocr_reward": 0.44375310838222504, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.828125, + "epoch": 0.001953125, + "grad_norm": 3.657808420164072, + "kl": 0.00101470947265625, + "learning_rate": 9.995117187499999e-07, + "loss": 0.0, + "reward": 1.2803430557250977, + "reward_std": 0.3147875517606735, + "rewards/format_reward": 0.828125, + "rewards/ocr_reward": 0.4522180110216141, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.578125, + "epoch": 0.00244140625, + "grad_norm": 9.54822062112943, + "kl": 0.001285552978515625, + "learning_rate": 9.993896484375e-07, + "loss": 0.0001, + "reward": 1.4602121710777283, + "reward_std": 0.26758695393800735, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.49927467107772827, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.953125, + "epoch": 0.0029296875, + "grad_norm": 3.269030427449002, + "kl": 0.001796722412109375, + "learning_rate": 9.992675781249999e-07, + "loss": 0.0001, + "reward": 1.3741803765296936, + "reward_std": 0.25756245851516724, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.4523053914308548, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.5234375, + "epoch": 0.00341796875, + "grad_norm": 3.1536786445417637, + "kl": 0.00457763671875, + "learning_rate": 9.991455078125e-07, + "loss": 0.0002, + "reward": 1.4508002400398254, + "reward_std": 0.21975237131118774, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.49767518043518066, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.3515625, + "epoch": 0.00390625, + "grad_norm": 3.829024827560807, + "kl": 0.00382232666015625, + "learning_rate": 9.990234375e-07, + "loss": 0.0002, + "reward": 1.437036395072937, + "reward_std": 0.16978412866592407, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.4604738652706146, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.71875, + "epoch": 0.00439453125, + "grad_norm": 3.0867259887356244, + "kl": 0.00495147705078125, + "learning_rate": 9.989013671875e-07, + "loss": 0.0002, + "reward": 1.474764347076416, + "reward_std": 0.22620604932308197, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.498201847076416, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.453125, + "epoch": 0.0048828125, + "grad_norm": 3.2629945996638403, + "kl": 0.0056915283203125, + "learning_rate": 9.98779296875e-07, + "loss": 0.0002, + "reward": 1.5907155871391296, + "reward_std": 0.14950328320264816, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.590715616941452, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.390625, + "epoch": 0.00537109375, + "grad_norm": 1.6117640951539305, + "kl": 0.0078887939453125, + "learning_rate": 9.986572265624999e-07, + "loss": 0.0003, + "reward": 1.427464485168457, + "reward_std": 0.19085168838500977, + "rewards/format_reward": 0.9140625, + "rewards/ocr_reward": 0.5134019106626511, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.671875, + "epoch": 0.005859375, + "grad_norm": 9.963892714501458, + "kl": 0.006866455078125, + "learning_rate": 9.9853515625e-07, + "loss": 0.0003, + "reward": 1.506593108177185, + "reward_std": 0.21911517158150673, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5378430485725403, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.1484375, + "epoch": 0.00634765625, + "grad_norm": 1.198296114175371, + "kl": 0.00494384765625, + "learning_rate": 9.984130859374999e-07, + "loss": 0.0002, + "reward": 1.4732499718666077, + "reward_std": 0.16561511158943176, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.5513749718666077, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.5234375, + "epoch": 0.0068359375, + "grad_norm": 2.2044476834833233, + "kl": 0.00946044921875, + "learning_rate": 9.98291015625e-07, + "loss": 0.0004, + "reward": 1.3112062215805054, + "reward_std": 0.2594592794775963, + "rewards/format_reward": 0.8828125, + "rewards/ocr_reward": 0.4283936768770218, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.5078125, + "epoch": 0.00732421875, + "grad_norm": 6.042854996633053, + "kl": 0.0079193115234375, + "learning_rate": 9.981689453125e-07, + "loss": 0.0003, + "reward": 1.4512476921081543, + "reward_std": 0.15800564736127853, + "rewards/format_reward": 0.890625, + "rewards/ocr_reward": 0.5606226921081543, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.2734375, + "epoch": 0.0078125, + "grad_norm": 5.388723086801915, + "kl": 0.008453369140625, + "learning_rate": 9.98046875e-07, + "loss": 0.0003, + "reward": 1.4160526990890503, + "reward_std": 0.19370869547128677, + "rewards/format_reward": 0.890625, + "rewards/ocr_reward": 0.5254276692867279, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.1796875, + "epoch": 0.00830078125, + "grad_norm": 8.561270421888638, + "kl": 0.008331298828125, + "learning_rate": 9.979248046875e-07, + "loss": 0.0003, + "reward": 1.5414886474609375, + "reward_std": 0.24305763095617294, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5571136474609375, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.53125, + "epoch": 0.0087890625, + "grad_norm": 2.3271079058180053, + "kl": 0.0077972412109375, + "learning_rate": 9.978027343749999e-07, + "loss": 0.0003, + "reward": 1.4583409428596497, + "reward_std": 0.23799628019332886, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.48959091305732727, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.6953125, + "epoch": 0.00927734375, + "grad_norm": 5.347116097400923, + "kl": 0.014739990234375, + "learning_rate": 9.976806640625e-07, + "loss": 0.0006, + "reward": 1.4719247817993164, + "reward_std": 0.24416528642177582, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.5422372817993164, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.8203125, + "epoch": 0.009765625, + "grad_norm": 2.1593789937228456, + "kl": 0.0086669921875, + "learning_rate": 9.9755859375e-07, + "loss": 0.0003, + "reward": 1.5712983012199402, + "reward_std": 0.20670025050640106, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5791108012199402, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.53125, + "epoch": 0.01025390625, + "grad_norm": 5.174610588681323, + "kl": 0.010955810546875, + "learning_rate": 9.974365234375e-07, + "loss": 0.0004, + "reward": 1.6467618942260742, + "reward_std": 0.17008116841316223, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.670199453830719, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.9921875, + "epoch": 0.0107421875, + "grad_norm": 2.2196458415428073, + "kl": 0.0082244873046875, + "learning_rate": 9.97314453125e-07, + "loss": 0.0003, + "reward": 1.5177651643753052, + "reward_std": 0.183644600212574, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.5802651941776276, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.9140625, + "epoch": 0.01123046875, + "grad_norm": 2.1272235668522725, + "kl": 0.009613037109375, + "learning_rate": 9.971923828125e-07, + "loss": 0.0004, + "reward": 1.6449316143989563, + "reward_std": 0.11167065799236298, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6449315845966339, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.109375, + "epoch": 0.01171875, + "grad_norm": 2.695370319521806, + "kl": 0.0121002197265625, + "learning_rate": 9.970703125e-07, + "loss": 0.0005, + "reward": 1.7102810740470886, + "reward_std": 0.19407786428928375, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7180935442447662, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.171875, + "epoch": 0.01220703125, + "grad_norm": 2.4674039388994022, + "kl": 0.010650634765625, + "learning_rate": 9.969482421874999e-07, + "loss": 0.0004, + "reward": 1.4299457669258118, + "reward_std": 0.20515850186347961, + "rewards/format_reward": 0.8359375, + "rewards/ocr_reward": 0.5940082669258118, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.2265625, + "epoch": 0.0126953125, + "grad_norm": 1.7112292524853188, + "kl": 0.013671875, + "learning_rate": 9.96826171875e-07, + "loss": 0.0005, + "reward": 1.5269352197647095, + "reward_std": 0.12535615265369415, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5425602197647095, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.71875, + "epoch": 0.01318359375, + "grad_norm": 4.748056883738088, + "kl": 0.0106964111328125, + "learning_rate": 9.967041015625e-07, + "loss": 0.0004, + "reward": 1.4135064482688904, + "reward_std": 0.3039677292108536, + "rewards/format_reward": 0.890625, + "rewards/ocr_reward": 0.5228813886642456, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.1953125, + "epoch": 0.013671875, + "grad_norm": 2.4024436743008613, + "kl": 0.008575439453125, + "learning_rate": 9.9658203125e-07, + "loss": 0.0003, + "reward": 1.4704246520996094, + "reward_std": 0.14263245463371277, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.5251121670007706, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.078125, + "epoch": 0.01416015625, + "grad_norm": 2.4271553949012716, + "kl": 0.0111083984375, + "learning_rate": 9.964599609375e-07, + "loss": 0.0004, + "reward": 1.679746925830841, + "reward_std": 0.17487338185310364, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6953718960285187, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.3671875, + "epoch": 0.0146484375, + "grad_norm": 1.7256261444170102, + "kl": 0.01251220703125, + "learning_rate": 9.963378906249999e-07, + "loss": 0.0005, + "reward": 1.3718626499176025, + "reward_std": 0.15719684958457947, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.44998762011528015, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.625, + "epoch": 0.01513671875, + "grad_norm": 3.1473342645539613, + "kl": 0.015167236328125, + "learning_rate": 9.962158203125e-07, + "loss": 0.0006, + "reward": 1.5455162525177002, + "reward_std": 0.09274030476808548, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6080162525177002, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.2265625, + "epoch": 0.015625, + "grad_norm": 2.719177732364667, + "kl": 0.01239013671875, + "learning_rate": 9.960937499999999e-07, + "loss": 0.0005, + "reward": 1.3972212672233582, + "reward_std": 0.2669922858476639, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.47534629702568054, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.8203125, + "epoch": 0.01611328125, + "grad_norm": 3.6252731496314583, + "kl": 0.014862060546875, + "learning_rate": 9.959716796875e-07, + "loss": 0.0006, + "reward": 1.3229502439498901, + "reward_std": 0.20802345871925354, + "rewards/format_reward": 0.90625, + "rewards/ocr_reward": 0.41670016944408417, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.5390625, + "epoch": 0.0166015625, + "grad_norm": 5.151875445266958, + "kl": 0.017364501953125, + "learning_rate": 9.95849609375e-07, + "loss": 0.0007, + "reward": 1.5725292563438416, + "reward_std": 0.18037345260381699, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5803417265415192, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.4296875, + "epoch": 0.01708984375, + "grad_norm": 1.6068668302465103, + "kl": 0.01483154296875, + "learning_rate": 9.957275390625e-07, + "loss": 0.0006, + "reward": 1.6186823844909668, + "reward_std": 0.20612449198961258, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6499324142932892, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.3828125, + "epoch": 0.017578125, + "grad_norm": 5.289657901378204, + "kl": 0.07623291015625, + "learning_rate": 9.9560546875e-07, + "loss": 0.0031, + "reward": 1.7034948468208313, + "reward_std": 0.10497300326824188, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7034948468208313, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.1875, + "epoch": 0.01806640625, + "grad_norm": 2.217914980304441, + "kl": 0.008026123046875, + "learning_rate": 9.954833984374999e-07, + "loss": 0.0003, + "reward": 1.5534625053405762, + "reward_std": 0.15290548652410507, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.6237750053405762, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.5234375, + "epoch": 0.0185546875, + "grad_norm": 2.4752008410956976, + "kl": 0.01519775390625, + "learning_rate": 9.95361328125e-07, + "loss": 0.0006, + "reward": 1.5298476219177246, + "reward_std": 0.11099112778902054, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5298476219177246, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.5078125, + "epoch": 0.01904296875, + "grad_norm": 4.386993810362648, + "kl": 0.017669677734375, + "learning_rate": 9.952392578124999e-07, + "loss": 0.0007, + "reward": 1.3395265936851501, + "reward_std": 0.2638590559363365, + "rewards/format_reward": 0.8671875, + "rewards/ocr_reward": 0.47233910858631134, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.3203125, + "epoch": 0.01953125, + "grad_norm": 4.4842670885571865, + "kl": 0.014923095703125, + "learning_rate": 9.951171875e-07, + "loss": 0.0006, + "reward": 1.5370002388954163, + "reward_std": 0.2090120166540146, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.591687798500061, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.3359375, + "epoch": 0.02001953125, + "grad_norm": 8.295700223804772, + "kl": 0.0208740234375, + "learning_rate": 9.949951171875e-07, + "loss": 0.0008, + "reward": 1.5621129274368286, + "reward_std": 0.14411582052707672, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.562112957239151, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.6953125, + "epoch": 0.0205078125, + "grad_norm": 5.14140191685903, + "kl": 0.019622802734375, + "learning_rate": 9.94873046875e-07, + "loss": 0.0008, + "reward": 1.5798521041870117, + "reward_std": 0.27509623765945435, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6111020445823669, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.2734375, + "epoch": 0.02099609375, + "grad_norm": 6.525096801696474, + "kl": 0.03765869140625, + "learning_rate": 9.947509765625e-07, + "loss": 0.0015, + "reward": 1.5670145750045776, + "reward_std": 0.07265551388263702, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6295144557952881, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.953125, + "epoch": 0.021484375, + "grad_norm": 2.1028130721169855, + "kl": 0.04144287109375, + "learning_rate": 9.946289062499999e-07, + "loss": 0.0017, + "reward": 1.4101728200912476, + "reward_std": 0.20591440051794052, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.48829779028892517, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.484375, + "epoch": 0.02197265625, + "grad_norm": 3.8367388215242437, + "kl": 0.0299072265625, + "learning_rate": 9.945068359375e-07, + "loss": 0.0012, + "reward": 1.4227579236030579, + "reward_std": 0.262872114777565, + "rewards/format_reward": 0.8984375, + "rewards/ocr_reward": 0.5243203639984131, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.453125, + "epoch": 0.0224609375, + "grad_norm": 7.321597730270083, + "kl": 0.03070068359375, + "learning_rate": 9.94384765625e-07, + "loss": 0.0012, + "reward": 1.377393662929535, + "reward_std": 0.22495906800031662, + "rewards/format_reward": 0.828125, + "rewards/ocr_reward": 0.5492686927318573, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.2578125, + "epoch": 0.02294921875, + "grad_norm": 3.0974827964713625, + "kl": 0.0308837890625, + "learning_rate": 9.942626953125e-07, + "loss": 0.0012, + "reward": 1.6084083914756775, + "reward_std": 0.09188483282923698, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6084084212779999, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.6796875, + "epoch": 0.0234375, + "grad_norm": 2.5869776097386885, + "kl": 0.02886962890625, + "learning_rate": 9.94140625e-07, + "loss": 0.0012, + "reward": 1.6284254789352417, + "reward_std": 0.10064487159252167, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6284254491329193, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.3359375, + "epoch": 0.02392578125, + "grad_norm": 2.157246364997451, + "kl": 0.03350830078125, + "learning_rate": 9.940185546875e-07, + "loss": 0.0013, + "reward": 1.672927439212799, + "reward_std": 0.2006322741508484, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7198024392127991, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.8359375, + "epoch": 0.0244140625, + "grad_norm": 3.073205929906485, + "kl": 0.0428466796875, + "learning_rate": 9.93896484375e-07, + "loss": 0.0017, + "reward": 1.679724395275116, + "reward_std": 0.1169515885412693, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7109744250774384, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.2734375, + "epoch": 0.02490234375, + "grad_norm": 1.2830008863309907, + "kl": 0.027587890625, + "learning_rate": 9.937744140624999e-07, + "loss": 0.0011, + "reward": 1.4477837085723877, + "reward_std": 0.17552587389945984, + "rewards/format_reward": 0.9140625, + "rewards/ocr_reward": 0.5337212085723877, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.7265625, + "epoch": 0.025390625, + "grad_norm": 2.6436926651819443, + "kl": 0.03045654296875, + "learning_rate": 9.9365234375e-07, + "loss": 0.0012, + "reward": 1.6024810075759888, + "reward_std": 0.1249840036034584, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6102935671806335, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.2109375, + "epoch": 0.02587890625, + "grad_norm": 2.4855128606318893, + "kl": 0.02911376953125, + "learning_rate": 9.935302734375e-07, + "loss": 0.0012, + "reward": 1.487706184387207, + "reward_std": 0.1580093577504158, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.542393684387207, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.6171875, + "epoch": 0.0263671875, + "grad_norm": 3.8492649462150337, + "kl": 0.02069091796875, + "learning_rate": 9.93408203125e-07, + "loss": 0.0008, + "reward": 1.4205285906791687, + "reward_std": 0.3576083779335022, + "rewards/format_reward": 0.8984375, + "rewards/ocr_reward": 0.5220911204814911, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.2109375, + "epoch": 0.02685546875, + "grad_norm": 3.3352616770122663, + "kl": 0.02423095703125, + "learning_rate": 9.932861328125e-07, + "loss": 0.001, + "reward": 1.4812852144241333, + "reward_std": 0.21911777555942535, + "rewards/format_reward": 0.9140625, + "rewards/ocr_reward": 0.5672226548194885, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.9453125, + "epoch": 0.02734375, + "grad_norm": 2.599784945706524, + "kl": 0.0146484375, + "learning_rate": 9.931640625e-07, + "loss": 0.0006, + "reward": 1.5850829482078552, + "reward_std": 0.17178751900792122, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6085204482078552, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.875, + "epoch": 0.02783203125, + "grad_norm": 4.155986577192788, + "kl": 0.01739501953125, + "learning_rate": 9.930419921875e-07, + "loss": 0.0007, + "reward": 1.448303461074829, + "reward_std": 0.38380755484104156, + "rewards/format_reward": 0.875, + "rewards/ocr_reward": 0.5733034014701843, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.765625, + "epoch": 0.0283203125, + "grad_norm": 1.736596362026434, + "kl": 0.0234375, + "learning_rate": 9.929199218749999e-07, + "loss": 0.0009, + "reward": 1.5162723660469055, + "reward_std": 0.2991267442703247, + "rewards/format_reward": 0.9140625, + "rewards/ocr_reward": 0.6022098660469055, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.3203125, + "epoch": 0.02880859375, + "grad_norm": 4.494090341930586, + "kl": 0.03082275390625, + "learning_rate": 9.927978515625e-07, + "loss": 0.0012, + "reward": 1.378541350364685, + "reward_std": 0.35002946853637695, + "rewards/format_reward": 0.9140625, + "rewards/ocr_reward": 0.46447885036468506, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.2109375, + "epoch": 0.029296875, + "grad_norm": 6.756047598527613, + "kl": 0.02886962890625, + "learning_rate": 9.9267578125e-07, + "loss": 0.0012, + "reward": 1.3606464862823486, + "reward_std": 0.36894528567790985, + "rewards/format_reward": 0.890625, + "rewards/ocr_reward": 0.47002144157886505, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.421875, + "epoch": 0.02978515625, + "grad_norm": 4.731387630408789, + "kl": 0.028076171875, + "learning_rate": 9.925537109375e-07, + "loss": 0.0011, + "reward": 1.6455896496772766, + "reward_std": 0.27443696558475494, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6924646198749542, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.0, + "epoch": 0.0302734375, + "grad_norm": 1.9490370706301865, + "kl": 0.0244140625, + "learning_rate": 9.92431640625e-07, + "loss": 0.001, + "reward": 1.5852088928222656, + "reward_std": 0.3096665292978287, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.6555215120315552, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.6875, + "epoch": 0.03076171875, + "grad_norm": 1.9868062360035326, + "kl": 0.01654052734375, + "learning_rate": 9.923095703124999e-07, + "loss": 0.0007, + "reward": 1.5640851855278015, + "reward_std": 0.33458730578422546, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6187726855278015, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.8046875, + "epoch": 0.03125, + "grad_norm": 3.1565682997286975, + "kl": 0.014373779296875, + "learning_rate": 9.921875e-07, + "loss": 0.0006, + "reward": 1.451416552066803, + "reward_std": 0.22569319605827332, + "rewards/format_reward": 0.875, + "rewards/ocr_reward": 0.576416552066803, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.4140625, + "epoch": 0.03173828125, + "grad_norm": 4.987882156519405, + "kl": 0.0198974609375, + "learning_rate": 9.920654296874999e-07, + "loss": 0.0008, + "reward": 1.3604365587234497, + "reward_std": 0.3875332325696945, + "rewards/format_reward": 0.8359375, + "rewards/ocr_reward": 0.5244990885257721, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.3359375, + "epoch": 0.0322265625, + "grad_norm": 2.1756835153062988, + "kl": 0.01727294921875, + "learning_rate": 9.91943359375e-07, + "loss": 0.0007, + "reward": 1.476547658443451, + "reward_std": 0.2438819855451584, + "rewards/format_reward": 0.8984375, + "rewards/ocr_reward": 0.5781101584434509, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.875, + "epoch": 0.03271484375, + "grad_norm": 2.9742812230395614, + "kl": 0.01806640625, + "learning_rate": 9.918212890625e-07, + "loss": 0.0007, + "reward": 1.575055181980133, + "reward_std": 0.09458094835281372, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5750551819801331, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.28125, + "epoch": 0.033203125, + "grad_norm": 5.874510923979259, + "kl": 0.01611328125, + "learning_rate": 9.9169921875e-07, + "loss": 0.0006, + "reward": 1.5294025540351868, + "reward_std": 0.14596965909004211, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.552839994430542, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.8203125, + "epoch": 0.03369140625, + "grad_norm": 9.28303459781024, + "kl": 0.021240234375, + "learning_rate": 9.915771484375e-07, + "loss": 0.0008, + "reward": 1.534590721130371, + "reward_std": 0.15341224521398544, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.5736532807350159, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.8828125, + "epoch": 0.0341796875, + "grad_norm": 10.443387554099303, + "kl": 0.02099609375, + "learning_rate": 9.914550781249999e-07, + "loss": 0.0008, + "reward": 1.741838276386261, + "reward_std": 0.12638744711875916, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7652758061885834, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.9140625, + "epoch": 0.03466796875, + "grad_norm": 6.002627107380703, + "kl": 0.02801513671875, + "learning_rate": 9.913330078125e-07, + "loss": 0.0011, + "reward": 1.5784024596214294, + "reward_std": 0.19862286746501923, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6018398702144623, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.9921875, + "epoch": 0.03515625, + "grad_norm": 3.901896848624282, + "kl": 0.013671875, + "learning_rate": 9.912109375e-07, + "loss": 0.0005, + "reward": 1.4875227212905884, + "reward_std": 0.12161608785390854, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.4953352212905884, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.0390625, + "epoch": 0.03564453125, + "grad_norm": 3.5931169826306353, + "kl": 0.02130126953125, + "learning_rate": 9.910888671875e-07, + "loss": 0.0009, + "reward": 1.7116557955741882, + "reward_std": 0.11958565562963486, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.711655855178833, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.4609375, + "epoch": 0.0361328125, + "grad_norm": 2.716899809253411, + "kl": 0.017120361328125, + "learning_rate": 9.90966796875e-07, + "loss": 0.0007, + "reward": 1.5722922682762146, + "reward_std": 0.13994912058115005, + "rewards/format_reward": 0.890625, + "rewards/ocr_reward": 0.6816672682762146, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.40625, + "epoch": 0.03662109375, + "grad_norm": 3.1440498309793634, + "kl": 0.0177001953125, + "learning_rate": 9.908447265625e-07, + "loss": 0.0007, + "reward": 1.3587397933006287, + "reward_std": 0.1976253017783165, + "rewards/format_reward": 0.8359375, + "rewards/ocr_reward": 0.5228022933006287, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.5703125, + "epoch": 0.037109375, + "grad_norm": 4.514322351312445, + "kl": 0.010009765625, + "learning_rate": 9.9072265625e-07, + "loss": 0.0004, + "reward": 1.4641498923301697, + "reward_std": 0.22810623794794083, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.5344623029232025, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.765625, + "epoch": 0.03759765625, + "grad_norm": 1.8710146074895642, + "kl": 0.0203857421875, + "learning_rate": 9.906005859374999e-07, + "loss": 0.0008, + "reward": 1.6418211460113525, + "reward_std": 0.13721346855163574, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6574461758136749, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.859375, + "epoch": 0.0380859375, + "grad_norm": 2.1667842777691373, + "kl": 0.018096923828125, + "learning_rate": 9.90478515625e-07, + "loss": 0.0007, + "reward": 1.5189919471740723, + "reward_std": 0.10774445161223412, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.5971169471740723, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.0703125, + "epoch": 0.03857421875, + "grad_norm": 4.797612672867945, + "kl": 0.01641845703125, + "learning_rate": 9.903564453125e-07, + "loss": 0.0007, + "reward": 1.6151621341705322, + "reward_std": 0.059841278940439224, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6151621639728546, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.375, + "epoch": 0.0390625, + "grad_norm": 3.385092526026789, + "kl": 0.01837158203125, + "learning_rate": 9.90234375e-07, + "loss": 0.0007, + "reward": 1.7148075699806213, + "reward_std": 0.13570959120988846, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7226200103759766, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.6171875, + "epoch": 0.03955078125, + "grad_norm": 2.6814045791672685, + "kl": 0.014251708984375, + "learning_rate": 9.901123046875e-07, + "loss": 0.0006, + "reward": 1.515661358833313, + "reward_std": 0.15792688727378845, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.5703488886356354, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.6953125, + "epoch": 0.0400390625, + "grad_norm": 6.013775861153405, + "kl": 0.02032470703125, + "learning_rate": 9.89990234375e-07, + "loss": 0.0008, + "reward": 1.5584400296211243, + "reward_std": 0.12772930040955544, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5818775296211243, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.4140625, + "epoch": 0.04052734375, + "grad_norm": 1.72500127507919, + "kl": 0.0257568359375, + "learning_rate": 9.898681640625e-07, + "loss": 0.001, + "reward": 1.4484447836875916, + "reward_std": 0.1641346886754036, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.49531984329223633, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.09375, + "epoch": 0.041015625, + "grad_norm": 2.2558835120683733, + "kl": 0.0225830078125, + "learning_rate": 9.897460937499999e-07, + "loss": 0.0009, + "reward": 1.5773499011993408, + "reward_std": 0.13273335248231888, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6007874011993408, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.1953125, + "epoch": 0.04150390625, + "grad_norm": 2.729303138391425, + "kl": 0.011932373046875, + "learning_rate": 9.896240234375e-07, + "loss": 0.0005, + "reward": 1.5902302265167236, + "reward_std": 0.20242100954055786, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6214802265167236, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.453125, + "epoch": 0.0419921875, + "grad_norm": 1.6659116590542917, + "kl": 0.013641357421875, + "learning_rate": 9.89501953125e-07, + "loss": 0.0005, + "reward": 1.5991840958595276, + "reward_std": 0.08300643041729927, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5991840660572052, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.265625, + "epoch": 0.04248046875, + "grad_norm": 6.5083912720618455, + "kl": 0.02520751953125, + "learning_rate": 9.893798828125e-07, + "loss": 0.001, + "reward": 1.5490674376487732, + "reward_std": 0.1682056337594986, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5490674078464508, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3984375, + "epoch": 0.04296875, + "grad_norm": 1.855137877800116, + "kl": 0.01483154296875, + "learning_rate": 9.892578125e-07, + "loss": 0.0006, + "reward": 1.6930819749832153, + "reward_std": 0.08091134577989578, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6930819451808929, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.734375, + "epoch": 0.04345703125, + "grad_norm": 3.0628004168463323, + "kl": 0.0181884765625, + "learning_rate": 9.891357421874999e-07, + "loss": 0.0007, + "reward": 1.5072910785675049, + "reward_std": 0.10918539017438889, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5072910487651825, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.328125, + "epoch": 0.0439453125, + "grad_norm": 2.4591507009268003, + "kl": 0.0205078125, + "learning_rate": 9.89013671875e-07, + "loss": 0.0008, + "reward": 1.730940043926239, + "reward_std": 0.10830854251980782, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7309400737285614, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.8984375, + "epoch": 0.04443359375, + "grad_norm": 1.387374494016842, + "kl": 0.015838623046875, + "learning_rate": 9.888916015624999e-07, + "loss": 0.0006, + "reward": 1.575575053691864, + "reward_std": 0.13328294083476067, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.583387479186058, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.703125, + "epoch": 0.044921875, + "grad_norm": 3.0271895365123136, + "kl": 0.0208740234375, + "learning_rate": 9.8876953125e-07, + "loss": 0.0008, + "reward": 1.5377304553985596, + "reward_std": 0.1667354628443718, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.6080429553985596, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.578125, + "epoch": 0.04541015625, + "grad_norm": 2.6921828266080015, + "kl": 0.017822265625, + "learning_rate": 9.886474609375e-07, + "loss": 0.0007, + "reward": 1.6924698948860168, + "reward_std": 0.1142515130341053, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6924698948860168, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.109375, + "epoch": 0.0458984375, + "grad_norm": 3.472336958554876, + "kl": 0.01885986328125, + "learning_rate": 9.88525390625e-07, + "loss": 0.0008, + "reward": 1.5990204811096191, + "reward_std": 0.10428202897310257, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6146455407142639, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.703125, + "epoch": 0.04638671875, + "grad_norm": 2.6702059682407917, + "kl": 0.02020263671875, + "learning_rate": 9.884033203125e-07, + "loss": 0.0008, + "reward": 1.6923083066940308, + "reward_std": 0.181168332695961, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.715745747089386, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.0546875, + "epoch": 0.046875, + "grad_norm": 3.236307882884475, + "kl": 0.01593017578125, + "learning_rate": 9.882812499999999e-07, + "loss": 0.0006, + "reward": 1.4725679755210876, + "reward_std": 0.18862508982419968, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.49600549042224884, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.5078125, + "epoch": 0.04736328125, + "grad_norm": 3.1662840188332004, + "kl": 0.020263671875, + "learning_rate": 9.881591796875e-07, + "loss": 0.0008, + "reward": 1.525817096233368, + "reward_std": 0.11687836796045303, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.5805045962333679, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.625, + "epoch": 0.0478515625, + "grad_norm": 2.623130324904824, + "kl": 0.01812744140625, + "learning_rate": 9.88037109375e-07, + "loss": 0.0007, + "reward": 1.7077008485794067, + "reward_std": 0.15604694932699203, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7077008485794067, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.859375, + "epoch": 0.04833984375, + "grad_norm": 1.591617609181676, + "kl": 0.019775390625, + "learning_rate": 9.879150390625e-07, + "loss": 0.0008, + "reward": 1.5951241254806519, + "reward_std": 0.14091318100690842, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6029366254806519, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.453125, + "epoch": 0.048828125, + "grad_norm": 4.771714723016947, + "kl": 0.01776123046875, + "learning_rate": 9.8779296875e-07, + "loss": 0.0007, + "reward": 1.5604987144470215, + "reward_std": 0.19173409044742584, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6229987442493439, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.0546875, + "epoch": 0.04931640625, + "grad_norm": 9.104596570687864, + "kl": 0.013702392578125, + "learning_rate": 9.876708984375e-07, + "loss": 0.0005, + "reward": 1.6190659403800964, + "reward_std": 0.12071932479739189, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6190659999847412, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.5390625, + "epoch": 0.0498046875, + "grad_norm": 2.1594646925560386, + "kl": 0.01641845703125, + "learning_rate": 9.87548828125e-07, + "loss": 0.0007, + "reward": 1.651434302330017, + "reward_std": 0.19432562589645386, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6592467725276947, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.6171875, + "epoch": 0.05029296875, + "grad_norm": 3.3906829108536165, + "kl": 0.01910400390625, + "learning_rate": 9.874267578124999e-07, + "loss": 0.0008, + "reward": 1.711862325668335, + "reward_std": 0.17600611969828606, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7431123554706573, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.109375, + "epoch": 0.05078125, + "grad_norm": 5.335142373058523, + "kl": 0.012725830078125, + "learning_rate": 9.873046875e-07, + "loss": 0.0005, + "reward": 1.5398271083831787, + "reward_std": 0.13578901067376137, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.5867020785808563, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.7421875, + "epoch": 0.05126953125, + "grad_norm": 2.246739195857841, + "kl": 0.016357421875, + "learning_rate": 9.871826171875e-07, + "loss": 0.0007, + "reward": 1.7209094762802124, + "reward_std": 0.09800073876976967, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7287219762802124, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.34375, + "epoch": 0.0517578125, + "grad_norm": 3.430606877922797, + "kl": 0.02001953125, + "learning_rate": 9.87060546875e-07, + "loss": 0.0008, + "reward": 1.63734370470047, + "reward_std": 0.12101611867547035, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6373437345027924, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.1171875, + "epoch": 0.05224609375, + "grad_norm": 6.621962333726358, + "kl": 0.01654052734375, + "learning_rate": 9.869384765625e-07, + "loss": 0.0007, + "reward": 1.5627512335777283, + "reward_std": 0.20160631090402603, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.6408762633800507, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.3046875, + "epoch": 0.052734375, + "grad_norm": 3.714058389222369, + "kl": 0.01226806640625, + "learning_rate": 9.8681640625e-07, + "loss": 0.0005, + "reward": 1.6258893013000488, + "reward_std": 0.2035977840423584, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.664951741695404, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.0546875, + "epoch": 0.05322265625, + "grad_norm": 2.225619976721532, + "kl": 0.01641845703125, + "learning_rate": 9.866943359375e-07, + "loss": 0.0007, + "reward": 1.5322623252868652, + "reward_std": 0.12227768450975418, + "rewards/format_reward": 0.9140625, + "rewards/ocr_reward": 0.61819988489151, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.03125, + "epoch": 0.0537109375, + "grad_norm": 3.398354345358393, + "kl": 0.01690673828125, + "learning_rate": 9.865722656249999e-07, + "loss": 0.0007, + "reward": 1.6276288628578186, + "reward_std": 0.07827305793762207, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6745038628578186, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.5625, + "epoch": 0.05419921875, + "grad_norm": 5.939409220904329, + "kl": 0.021087646484375, + "learning_rate": 9.864501953125e-07, + "loss": 0.0008, + "reward": 1.574878215789795, + "reward_std": 0.08811075612902641, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6373782455921173, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.9765625, + "epoch": 0.0546875, + "grad_norm": 2.2502789319358554, + "kl": 0.01654052734375, + "learning_rate": 9.86328125e-07, + "loss": 0.0007, + "reward": 1.6835005283355713, + "reward_std": 0.14054467901587486, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6913129687309265, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.515625, + "epoch": 0.05517578125, + "grad_norm": 1.0034153882792571, + "kl": 0.0185546875, + "learning_rate": 9.862060546875e-07, + "loss": 0.0007, + "reward": 1.8294273614883423, + "reward_std": 0.11719358898699284, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8528648614883423, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.7890625, + "epoch": 0.0556640625, + "grad_norm": 16.17730319934069, + "kl": 0.01983642578125, + "learning_rate": 9.86083984375e-07, + "loss": 0.0008, + "reward": 1.5960276126861572, + "reward_std": 0.14195309579372406, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6272775828838348, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.2578125, + "epoch": 0.05615234375, + "grad_norm": 2.541506995594959, + "kl": 0.024169921875, + "learning_rate": 9.859619140624999e-07, + "loss": 0.001, + "reward": 1.544093132019043, + "reward_std": 0.206620991230011, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6065930724143982, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.28125, + "epoch": 0.056640625, + "grad_norm": 2.7963737233551176, + "kl": 0.019287109375, + "learning_rate": 9.8583984375e-07, + "loss": 0.0008, + "reward": 1.6467041969299316, + "reward_std": 0.07854663208127022, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6545166969299316, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.8671875, + "epoch": 0.05712890625, + "grad_norm": 1.692239110975536, + "kl": 0.02508544921875, + "learning_rate": 9.857177734374999e-07, + "loss": 0.001, + "reward": 1.7825125455856323, + "reward_std": 0.08674684725701809, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7825126051902771, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.9609375, + "epoch": 0.0576171875, + "grad_norm": 2.115673596589236, + "kl": 0.03924560546875, + "learning_rate": 9.85595703125e-07, + "loss": 0.0016, + "reward": 1.5380715131759644, + "reward_std": 0.13539821282029152, + "rewards/format_reward": 0.9140625, + "rewards/ocr_reward": 0.6240090429782867, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.109375, + "epoch": 0.05810546875, + "grad_norm": 15.946926979376824, + "kl": 0.0296630859375, + "learning_rate": 9.854736328125e-07, + "loss": 0.0012, + "reward": 1.5573410987854004, + "reward_std": 0.11809306219220161, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6198410987854004, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.3203125, + "epoch": 0.05859375, + "grad_norm": 4.256018644115217, + "kl": 0.0277099609375, + "learning_rate": 9.853515625e-07, + "loss": 0.0011, + "reward": 1.7416256666183472, + "reward_std": 0.11583732068538666, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7494381666183472, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.0703125, + "epoch": 0.05908203125, + "grad_norm": 2.603585855956455, + "kl": 0.020050048828125, + "learning_rate": 9.852294921875e-07, + "loss": 0.0008, + "reward": 1.4661349058151245, + "reward_std": 0.1207830049097538, + "rewards/format_reward": 0.859375, + "rewards/ocr_reward": 0.6067598760128021, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.03125, + "epoch": 0.0595703125, + "grad_norm": 4.267397613222153, + "kl": 0.0263671875, + "learning_rate": 9.851074218749999e-07, + "loss": 0.0011, + "reward": 1.7082802057266235, + "reward_std": 0.1457432433962822, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7395302057266235, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.5390625, + "epoch": 0.06005859375, + "grad_norm": 5.592358687960914, + "kl": 0.02459716796875, + "learning_rate": 9.849853515625e-07, + "loss": 0.001, + "reward": 1.653491497039795, + "reward_std": 0.1756032481789589, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6613039970397949, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.34375, + "epoch": 0.060546875, + "grad_norm": 1.9342219325316152, + "kl": 0.0211181640625, + "learning_rate": 9.848632812499999e-07, + "loss": 0.0008, + "reward": 1.6969304084777832, + "reward_std": 0.09714720770716667, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.704742968082428, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.8671875, + "epoch": 0.06103515625, + "grad_norm": 3.5352626846601334, + "kl": 0.0206298828125, + "learning_rate": 9.847412109375e-07, + "loss": 0.0008, + "reward": 1.5322385430335999, + "reward_std": 0.17621152848005295, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.6103635132312775, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 475.5859375, + "epoch": 0.0615234375, + "grad_norm": 1.1126647607262938, + "kl": 0.01446533203125, + "learning_rate": 9.84619140625e-07, + "loss": 0.0006, + "reward": 1.5217909812927246, + "reward_std": 0.14773621410131454, + "rewards/format_reward": 0.859375, + "rewards/ocr_reward": 0.6624160408973694, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.2265625, + "epoch": 0.06201171875, + "grad_norm": 16.170587561342735, + "kl": 0.0191650390625, + "learning_rate": 9.844970703125e-07, + "loss": 0.0008, + "reward": 1.5967344641685486, + "reward_std": 0.14839724078774452, + "rewards/format_reward": 0.90625, + "rewards/ocr_reward": 0.690484493970871, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.7265625, + "epoch": 0.0625, + "grad_norm": 1.7043882185778825, + "kl": 0.01751708984375, + "learning_rate": 9.84375e-07, + "loss": 0.0007, + "reward": 1.5539951920509338, + "reward_std": 0.12984895333647728, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5696201622486115, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.984375, + "epoch": 0.06298828125, + "grad_norm": 1.3531844603958978, + "kl": 0.02239990234375, + "learning_rate": 9.842529296874999e-07, + "loss": 0.0009, + "reward": 1.5346065759658813, + "reward_std": 0.08880486711859703, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.5736691057682037, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.2890625, + "epoch": 0.0634765625, + "grad_norm": 4.715371554402326, + "kl": 0.022705078125, + "learning_rate": 9.84130859375e-07, + "loss": 0.0009, + "reward": 1.62563157081604, + "reward_std": 0.12431228160858154, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6412566304206848, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.578125, + "epoch": 0.06396484375, + "grad_norm": 3.5849539827861907, + "kl": 0.02325439453125, + "learning_rate": 9.840087890625e-07, + "loss": 0.0009, + "reward": 1.7144591212272644, + "reward_std": 0.13489311560988426, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.730084091424942, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.484375, + "epoch": 0.064453125, + "grad_norm": 1.4393273984880173, + "kl": 0.0228271484375, + "learning_rate": 9.8388671875e-07, + "loss": 0.0009, + "reward": 1.6248722076416016, + "reward_std": 0.13557805679738522, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6561221778392792, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.5390625, + "epoch": 0.06494140625, + "grad_norm": 8.4663670507505, + "kl": 0.01715087890625, + "learning_rate": 9.837646484375e-07, + "loss": 0.0007, + "reward": 1.613499641418457, + "reward_std": 0.18968282639980316, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6213121712207794, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.625, + "epoch": 0.0654296875, + "grad_norm": 5.360156728372636, + "kl": 0.02471923828125, + "learning_rate": 9.83642578125e-07, + "loss": 0.001, + "reward": 1.6982364058494568, + "reward_std": 0.21003302931785583, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7216738760471344, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.7890625, + "epoch": 0.06591796875, + "grad_norm": 2.4693839555178103, + "kl": 0.02178955078125, + "learning_rate": 9.835205078125e-07, + "loss": 0.0009, + "reward": 1.6611779928207397, + "reward_std": 0.15551955252885818, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6768029928207397, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.125, + "epoch": 0.06640625, + "grad_norm": 2.8329079084560735, + "kl": 0.02130126953125, + "learning_rate": 9.833984374999999e-07, + "loss": 0.0009, + "reward": 1.5032538771629333, + "reward_std": 0.1515774130821228, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.5579414367675781, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.4140625, + "epoch": 0.06689453125, + "grad_norm": 1.882217780602691, + "kl": 0.019775390625, + "learning_rate": 9.832763671875e-07, + "loss": 0.0008, + "reward": 1.484582245349884, + "reward_std": 0.11968936026096344, + "rewards/format_reward": 0.8828125, + "rewards/ocr_reward": 0.6017696857452393, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.3046875, + "epoch": 0.0673828125, + "grad_norm": 3.046463172685156, + "kl": 0.0228271484375, + "learning_rate": 9.83154296875e-07, + "loss": 0.0009, + "reward": 1.5993627905845642, + "reward_std": 0.20090486854314804, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6462377905845642, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.8984375, + "epoch": 0.06787109375, + "grad_norm": 3.0503455953592784, + "kl": 0.02288818359375, + "learning_rate": 9.830322265625e-07, + "loss": 0.0009, + "reward": 1.5906482934951782, + "reward_std": 0.15873637050390244, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6140858232975006, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.5625, + "epoch": 0.068359375, + "grad_norm": 2.120223798059756, + "kl": 0.02313232421875, + "learning_rate": 9.8291015625e-07, + "loss": 0.0009, + "reward": 1.6196279525756836, + "reward_std": 0.11341691762208939, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.619627982378006, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.2578125, + "epoch": 0.06884765625, + "grad_norm": 7.285340344072795, + "kl": 0.022216796875, + "learning_rate": 9.827880859374999e-07, + "loss": 0.0009, + "reward": 1.6983768343925476, + "reward_std": 0.12035223841667175, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6983768343925476, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.0703125, + "epoch": 0.0693359375, + "grad_norm": 5.9889935265796685, + "kl": 0.01953125, + "learning_rate": 9.82666015625e-07, + "loss": 0.0008, + "reward": 1.4262371063232422, + "reward_std": 0.17644815146923065, + "rewards/format_reward": 0.890625, + "rewards/ocr_reward": 0.5356121361255646, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.7890625, + "epoch": 0.06982421875, + "grad_norm": 3.990164163837389, + "kl": 0.0240478515625, + "learning_rate": 9.825439453124999e-07, + "loss": 0.001, + "reward": 1.5707527995109558, + "reward_std": 0.11035867407917976, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5707527995109558, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.375, + "epoch": 0.0703125, + "grad_norm": 0.8573518157012104, + "kl": 0.017333984375, + "learning_rate": 9.82421875e-07, + "loss": 0.0007, + "reward": 1.71940678358078, + "reward_std": 0.14653569110669196, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.7819067537784576, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.625, + "epoch": 0.07080078125, + "grad_norm": 3.6713857102420615, + "kl": 0.02471923828125, + "learning_rate": 9.822998046875e-07, + "loss": 0.001, + "reward": 1.5883715152740479, + "reward_std": 0.04609652329236269, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5883715152740479, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.859375, + "epoch": 0.0712890625, + "grad_norm": 2.2594904451340994, + "kl": 0.02471923828125, + "learning_rate": 9.82177734375e-07, + "loss": 0.001, + "reward": 1.5935519933700562, + "reward_std": 0.12184244394302368, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6013644337654114, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.078125, + "epoch": 0.07177734375, + "grad_norm": 2.5367376087547227, + "kl": 0.020416259765625, + "learning_rate": 9.820556640625e-07, + "loss": 0.0008, + "reward": 1.6828487515449524, + "reward_std": 0.09689129143953323, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.68284872174263, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.0078125, + "epoch": 0.072265625, + "grad_norm": 24.91393756517066, + "kl": 0.029541015625, + "learning_rate": 9.819335937499999e-07, + "loss": 0.0012, + "reward": 1.576207160949707, + "reward_std": 0.1735726036131382, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.623082160949707, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.96875, + "epoch": 0.07275390625, + "grad_norm": 1.7405264781938403, + "kl": 0.0450439453125, + "learning_rate": 9.818115234375e-07, + "loss": 0.0018, + "reward": 1.6206218600273132, + "reward_std": 0.19472770392894745, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6674968600273132, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.7578125, + "epoch": 0.0732421875, + "grad_norm": 2.391591985194695, + "kl": 0.036865234375, + "learning_rate": 9.816894531249999e-07, + "loss": 0.0015, + "reward": 1.6796503067016602, + "reward_std": 0.05935625545680523, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6796503067016602, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.5859375, + "epoch": 0.07373046875, + "grad_norm": 2.9623351499295425, + "kl": 0.0338134765625, + "learning_rate": 9.815673828125e-07, + "loss": 0.0014, + "reward": 1.6772570610046387, + "reward_std": 0.14248831570148468, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7319445908069611, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.3125, + "epoch": 0.07421875, + "grad_norm": 3.546506424634404, + "kl": 0.030517578125, + "learning_rate": 9.814453125e-07, + "loss": 0.0012, + "reward": 1.5874695181846619, + "reward_std": 0.13588757812976837, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5874694883823395, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.953125, + "epoch": 0.07470703125, + "grad_norm": 6.170642074931075, + "kl": 0.0421142578125, + "learning_rate": 9.813232421875e-07, + "loss": 0.0017, + "reward": 1.6709791421890259, + "reward_std": 0.08850692212581635, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6709791421890259, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.4296875, + "epoch": 0.0751953125, + "grad_norm": 2.4910705552225747, + "kl": 0.03271484375, + "learning_rate": 9.81201171875e-07, + "loss": 0.0013, + "reward": 1.6164610385894775, + "reward_std": 0.1261097490787506, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6633360981941223, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.140625, + "epoch": 0.07568359375, + "grad_norm": 1.337539431613464, + "kl": 0.0283203125, + "learning_rate": 9.810791015624999e-07, + "loss": 0.0011, + "reward": 1.5733261704444885, + "reward_std": 0.23978520929813385, + "rewards/format_reward": 0.875, + "rewards/ocr_reward": 0.6983261406421661, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.8671875, + "epoch": 0.076171875, + "grad_norm": 3.2537345038786407, + "kl": 0.0272216796875, + "learning_rate": 9.8095703125e-07, + "loss": 0.0011, + "reward": 1.5053273439407349, + "reward_std": 0.13704759627580643, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5365773737430573, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.21875, + "epoch": 0.07666015625, + "grad_norm": 14.554632484007437, + "kl": 0.0223388671875, + "learning_rate": 9.808349609375e-07, + "loss": 0.0009, + "reward": 1.6272760033607483, + "reward_std": 0.14983859658241272, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6741509735584259, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.359375, + "epoch": 0.0771484375, + "grad_norm": 2.334403175128852, + "kl": 0.0277099609375, + "learning_rate": 9.80712890625e-07, + "loss": 0.0011, + "reward": 1.5797749757766724, + "reward_std": 0.08415070176124573, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5797749757766724, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.1953125, + "epoch": 0.07763671875, + "grad_norm": 3.0249691058933306, + "kl": 0.03240966796875, + "learning_rate": 9.805908203125e-07, + "loss": 0.0013, + "reward": 1.5711604952812195, + "reward_std": 0.08924713358283043, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5867854952812195, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.1328125, + "epoch": 0.078125, + "grad_norm": 6.8155628419050185, + "kl": 0.02862548828125, + "learning_rate": 9.8046875e-07, + "loss": 0.0011, + "reward": 1.635881781578064, + "reward_std": 0.17367641627788544, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6827567219734192, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.8359375, + "epoch": 0.07861328125, + "grad_norm": 3.0299661554177377, + "kl": 0.0274658203125, + "learning_rate": 9.803466796875e-07, + "loss": 0.0011, + "reward": 1.6225927472114563, + "reward_std": 0.15056072175502777, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6382177472114563, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.4296875, + "epoch": 0.0791015625, + "grad_norm": 1.7406985012161398, + "kl": 0.03253173828125, + "learning_rate": 9.802246093749999e-07, + "loss": 0.0013, + "reward": 1.6942219734191895, + "reward_std": 0.0540752187371254, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6942219436168671, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.96875, + "epoch": 0.07958984375, + "grad_norm": 4.477055997348827, + "kl": 0.031494140625, + "learning_rate": 9.801025390625e-07, + "loss": 0.0013, + "reward": 1.5531994700431824, + "reward_std": 0.16673196852207184, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5844494700431824, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.96875, + "epoch": 0.080078125, + "grad_norm": 1.86554014043665, + "kl": 0.0247802734375, + "learning_rate": 9.7998046875e-07, + "loss": 0.001, + "reward": 1.5419456362724304, + "reward_std": 0.16365046054124832, + "rewards/format_reward": 0.8984375, + "rewards/ocr_reward": 0.643508106470108, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.3125, + "epoch": 0.08056640625, + "grad_norm": 6.446669102096267, + "kl": 0.03466796875, + "learning_rate": 9.798583984375e-07, + "loss": 0.0014, + "reward": 1.6253865957260132, + "reward_std": 0.13813912868499756, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6722615659236908, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.359375, + "epoch": 0.0810546875, + "grad_norm": 2.3945943551584623, + "kl": 0.0513916015625, + "learning_rate": 9.79736328125e-07, + "loss": 0.0021, + "reward": 1.5900596380233765, + "reward_std": 0.17274170368909836, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6213095486164093, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.1640625, + "epoch": 0.08154296875, + "grad_norm": 4.935892148171108, + "kl": 0.0283203125, + "learning_rate": 9.796142578125e-07, + "loss": 0.0011, + "reward": 1.5856729745864868, + "reward_std": 0.19583696871995926, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.609110414981842, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.7109375, + "epoch": 0.08203125, + "grad_norm": 2.759482806908666, + "kl": 0.0263671875, + "learning_rate": 9.794921875e-07, + "loss": 0.0011, + "reward": 1.6779165267944336, + "reward_std": 0.1504085585474968, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7326040267944336, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.921875, + "epoch": 0.08251953125, + "grad_norm": 2.0205610250383703, + "kl": 0.03204345703125, + "learning_rate": 9.793701171874999e-07, + "loss": 0.0013, + "reward": 1.6056262850761414, + "reward_std": 0.12054416164755821, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6446887850761414, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.140625, + "epoch": 0.0830078125, + "grad_norm": 2.9051619908357025, + "kl": 0.029052734375, + "learning_rate": 9.79248046875e-07, + "loss": 0.0012, + "reward": 1.693081021308899, + "reward_std": 0.17445684224367142, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7165184915065765, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.359375, + "epoch": 0.08349609375, + "grad_norm": 1.815375912478969, + "kl": 0.02777099609375, + "learning_rate": 9.791259765625e-07, + "loss": 0.0011, + "reward": 1.6882360577583313, + "reward_std": 0.13848505914211273, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7351110577583313, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.953125, + "epoch": 0.083984375, + "grad_norm": 2.2996318418317734, + "kl": 0.029541015625, + "learning_rate": 9.7900390625e-07, + "loss": 0.0012, + "reward": 1.4833272099494934, + "reward_std": 0.10671622306108475, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.483327180147171, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.0, + "epoch": 0.08447265625, + "grad_norm": 1.398704102180127, + "kl": 0.0325927734375, + "learning_rate": 9.788818359375e-07, + "loss": 0.0013, + "reward": 1.6148168444633484, + "reward_std": 0.07767279259860516, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6148169040679932, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.6484375, + "epoch": 0.0849609375, + "grad_norm": 3.4750762750830453, + "kl": 0.0400390625, + "learning_rate": 9.787597656249999e-07, + "loss": 0.0016, + "reward": 1.6466941833496094, + "reward_std": 0.18221855908632278, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6779442429542542, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.7265625, + "epoch": 0.08544921875, + "grad_norm": 2.3793365576087435, + "kl": 0.034912109375, + "learning_rate": 9.786376953125e-07, + "loss": 0.0014, + "reward": 1.702051043510437, + "reward_std": 0.17722390592098236, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7098636031150818, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.421875, + "epoch": 0.0859375, + "grad_norm": 5.929036755770266, + "kl": 0.0335693359375, + "learning_rate": 9.785156249999999e-07, + "loss": 0.0013, + "reward": 1.702830970287323, + "reward_std": 0.10274038091301918, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.702830970287323, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.71875, + "epoch": 0.08642578125, + "grad_norm": 7.4527179001348856, + "kl": 0.02398681640625, + "learning_rate": 9.783935546875e-07, + "loss": 0.001, + "reward": 1.5519742965698242, + "reward_std": 0.1975010707974434, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6144742965698242, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.8515625, + "epoch": 0.0869140625, + "grad_norm": 4.553092566985503, + "kl": 0.03271484375, + "learning_rate": 9.78271484375e-07, + "loss": 0.0013, + "reward": 1.7317935228347778, + "reward_std": 0.1101585403084755, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7396060526371002, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.828125, + "epoch": 0.08740234375, + "grad_norm": 1.9568946330604422, + "kl": 0.0372314453125, + "learning_rate": 9.781494140625e-07, + "loss": 0.0015, + "reward": 1.4185363054275513, + "reward_std": 0.2236497402191162, + "rewards/format_reward": 0.796875, + "rewards/ocr_reward": 0.6216612756252289, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.2421875, + "epoch": 0.087890625, + "grad_norm": 5.658211765236265, + "kl": 0.0313720703125, + "learning_rate": 9.7802734375e-07, + "loss": 0.0013, + "reward": 1.7431809902191162, + "reward_std": 0.04815910384058952, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7431809306144714, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.2890625, + "epoch": 0.08837890625, + "grad_norm": 1.3083032198586613, + "kl": 0.0234375, + "learning_rate": 9.779052734374999e-07, + "loss": 0.0009, + "reward": 1.8097354173660278, + "reward_std": 0.0659454632550478, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8097354471683502, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.796875, + "epoch": 0.0888671875, + "grad_norm": 4.109670831162618, + "kl": 0.02972412109375, + "learning_rate": 9.77783203125e-07, + "loss": 0.0012, + "reward": 1.6616966128349304, + "reward_std": 0.11840381100773811, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6773216724395752, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.4140625, + "epoch": 0.08935546875, + "grad_norm": 3.291287735681801, + "kl": 0.0384521484375, + "learning_rate": 9.776611328125e-07, + "loss": 0.0015, + "reward": 1.6017380952835083, + "reward_std": 0.18959469348192215, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6251756846904755, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.703125, + "epoch": 0.08984375, + "grad_norm": 1.3054821132450145, + "kl": 0.02685546875, + "learning_rate": 9.775390625e-07, + "loss": 0.0011, + "reward": 1.7018551230430603, + "reward_std": 0.06901280581951141, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7018550932407379, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.2421875, + "epoch": 0.09033203125, + "grad_norm": 3.269405991674511, + "kl": 0.0325927734375, + "learning_rate": 9.774169921875e-07, + "loss": 0.0013, + "reward": 1.5782784819602966, + "reward_std": 0.18268048018217087, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6329659819602966, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.5546875, + "epoch": 0.0908203125, + "grad_norm": 3.8655457222519, + "kl": 0.0316162109375, + "learning_rate": 9.77294921875e-07, + "loss": 0.0013, + "reward": 1.626326560974121, + "reward_std": 0.10836686193943024, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6263265609741211, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.9921875, + "epoch": 0.09130859375, + "grad_norm": 1.5866438731376145, + "kl": 0.032958984375, + "learning_rate": 9.771728515625e-07, + "loss": 0.0013, + "reward": 1.719668209552765, + "reward_std": 0.08064623922109604, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7274806201457977, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.21875, + "epoch": 0.091796875, + "grad_norm": 5.9177668434469775, + "kl": 0.03240966796875, + "learning_rate": 9.770507812499999e-07, + "loss": 0.0013, + "reward": 1.6977457404136658, + "reward_std": 0.15341190993785858, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7211832702159882, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.1953125, + "epoch": 0.09228515625, + "grad_norm": 2.0723613037564426, + "kl": 0.0316162109375, + "learning_rate": 9.769287109375e-07, + "loss": 0.0013, + "reward": 1.6144706010818481, + "reward_std": 0.15216557681560516, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6144706010818481, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.046875, + "epoch": 0.0927734375, + "grad_norm": 8.245892693245946, + "kl": 0.02520751953125, + "learning_rate": 9.76806640625e-07, + "loss": 0.001, + "reward": 1.5022258758544922, + "reward_std": 0.1600368544459343, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.5491008907556534, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.828125, + "epoch": 0.09326171875, + "grad_norm": 3.030844823693362, + "kl": 0.02996826171875, + "learning_rate": 9.766845703125e-07, + "loss": 0.0012, + "reward": 1.6883333325386047, + "reward_std": 0.22212432324886322, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7195833027362823, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.0625, + "epoch": 0.09375, + "grad_norm": 1.8859191428821602, + "kl": 0.0260009765625, + "learning_rate": 9.765625e-07, + "loss": 0.001, + "reward": 1.5929180979728699, + "reward_std": 0.1562328040599823, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6319805383682251, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.875, + "epoch": 0.09423828125, + "grad_norm": 1.8480988076195144, + "kl": 0.02642822265625, + "learning_rate": 9.764404296875e-07, + "loss": 0.0011, + "reward": 1.6515385508537292, + "reward_std": 0.13265355303883553, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6827885508537292, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.6015625, + "epoch": 0.0947265625, + "grad_norm": 1.9325979913798101, + "kl": 0.03607177734375, + "learning_rate": 9.76318359375e-07, + "loss": 0.0014, + "reward": 1.761966586112976, + "reward_std": 0.06584762595593929, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7619665265083313, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.2109375, + "epoch": 0.09521484375, + "grad_norm": 1.9358707417121381, + "kl": 0.03466796875, + "learning_rate": 9.761962890624999e-07, + "loss": 0.0014, + "reward": 1.7281653881072998, + "reward_std": 0.16422076523303986, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7672278881072998, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.328125, + "epoch": 0.095703125, + "grad_norm": 3.023440850843629, + "kl": 0.024658203125, + "learning_rate": 9.7607421875e-07, + "loss": 0.001, + "reward": 1.7193759679794312, + "reward_std": 0.17258312553167343, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7428134679794312, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.890625, + "epoch": 0.09619140625, + "grad_norm": 2.595712818329617, + "kl": 0.038818359375, + "learning_rate": 9.759521484375e-07, + "loss": 0.0016, + "reward": 1.7148744463920593, + "reward_std": 0.1324017532169819, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7226869761943817, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.1640625, + "epoch": 0.0966796875, + "grad_norm": 1.3533967735031216, + "kl": 0.02520751953125, + "learning_rate": 9.75830078125e-07, + "loss": 0.001, + "reward": 1.6926743984222412, + "reward_std": 0.16136356070637703, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.716111958026886, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.78125, + "epoch": 0.09716796875, + "grad_norm": 1.7402462326884431, + "kl": 0.02752685546875, + "learning_rate": 9.757080078125e-07, + "loss": 0.0011, + "reward": 1.608244240283966, + "reward_std": 0.1039048321545124, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6082442104816437, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.0859375, + "epoch": 0.09765625, + "grad_norm": 2.518073418544392, + "kl": 0.034423828125, + "learning_rate": 9.755859374999999e-07, + "loss": 0.0014, + "reward": 1.5175416469573975, + "reward_std": 0.24317501485347748, + "rewards/format_reward": 0.8984375, + "rewards/ocr_reward": 0.6191041469573975, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.9453125, + "epoch": 0.09814453125, + "grad_norm": 6.096098416889449, + "kl": 0.0357666015625, + "learning_rate": 9.754638671875e-07, + "loss": 0.0014, + "reward": 1.6129422783851624, + "reward_std": 0.11736492812633514, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6129422634840012, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.2734375, + "epoch": 0.0986328125, + "grad_norm": 3.5514643689891483, + "kl": 0.038330078125, + "learning_rate": 9.753417968749999e-07, + "loss": 0.0015, + "reward": 1.7121334075927734, + "reward_std": 0.15303652733564377, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7590084075927734, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.0, + "epoch": 0.09912109375, + "grad_norm": 2.7929295597358172, + "kl": 0.03955078125, + "learning_rate": 9.752197265625e-07, + "loss": 0.0016, + "reward": 1.5464635491371155, + "reward_std": 0.12700794637203217, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5542759895324707, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.234375, + "epoch": 0.099609375, + "grad_norm": 2.0494012473671406, + "kl": 0.02825927734375, + "learning_rate": 9.7509765625e-07, + "loss": 0.0011, + "reward": 1.6630714535713196, + "reward_std": 0.10232871398329735, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6630714535713196, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.4140625, + "epoch": 0.10009765625, + "grad_norm": 7.6058284423417115, + "kl": 0.03277587890625, + "learning_rate": 9.749755859375e-07, + "loss": 0.0013, + "reward": 1.718002438545227, + "reward_std": 0.10656377673149109, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.718002438545227, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.984375, + "epoch": 0.1005859375, + "grad_norm": 4.022969206760325, + "kl": 0.0401611328125, + "learning_rate": 9.74853515625e-07, + "loss": 0.0016, + "reward": 1.7202000617980957, + "reward_std": 0.15844309329986572, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7358251512050629, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.8125, + "epoch": 0.10107421875, + "grad_norm": 1.9668753587480723, + "kl": 0.040283203125, + "learning_rate": 9.747314453124999e-07, + "loss": 0.0016, + "reward": 1.4545677304267883, + "reward_std": 0.25500622391700745, + "rewards/format_reward": 0.90625, + "rewards/ocr_reward": 0.5483177602291107, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.6796875, + "epoch": 0.1015625, + "grad_norm": 4.526438674345018, + "kl": 0.0428466796875, + "learning_rate": 9.74609375e-07, + "loss": 0.0017, + "reward": 1.6247982382774353, + "reward_std": 0.07701070234179497, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6247982978820801, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.1875, + "epoch": 0.10205078125, + "grad_norm": 2.0398164468164968, + "kl": 0.0357666015625, + "learning_rate": 9.744873046874999e-07, + "loss": 0.0014, + "reward": 1.587377667427063, + "reward_std": 0.21580906957387924, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.642065167427063, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.7734375, + "epoch": 0.1025390625, + "grad_norm": 1.7111018400786906, + "kl": 0.03631591796875, + "learning_rate": 9.74365234375e-07, + "loss": 0.0015, + "reward": 1.5716455578804016, + "reward_std": 0.14876239746809006, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6185204684734344, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.4609375, + "epoch": 0.10302734375, + "grad_norm": 1.43762144541971, + "kl": 0.036865234375, + "learning_rate": 9.742431640625e-07, + "loss": 0.0015, + "reward": 1.7287642359733582, + "reward_std": 0.1475791335105896, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7443892359733582, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.3203125, + "epoch": 0.103515625, + "grad_norm": 1.2895212391486286, + "kl": 0.03558349609375, + "learning_rate": 9.7412109375e-07, + "loss": 0.0014, + "reward": 1.5150426030158997, + "reward_std": 0.18442986905574799, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5462925732135773, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.0, + "epoch": 0.10400390625, + "grad_norm": 15.302170515429042, + "kl": 0.04736328125, + "learning_rate": 9.739990234375e-07, + "loss": 0.0019, + "reward": 1.492401361465454, + "reward_std": 0.22428305447101593, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5158388316631317, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.21875, + "epoch": 0.1044921875, + "grad_norm": 3.158380258746703, + "kl": 0.041748046875, + "learning_rate": 9.738769531249999e-07, + "loss": 0.0017, + "reward": 1.7838861346244812, + "reward_std": 0.1373641975224018, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7838861048221588, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.8515625, + "epoch": 0.10498046875, + "grad_norm": 1.9522656170518902, + "kl": 0.0390625, + "learning_rate": 9.737548828125e-07, + "loss": 0.0016, + "reward": 1.7652413845062256, + "reward_std": 0.1776389330625534, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7964914739131927, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.9140625, + "epoch": 0.10546875, + "grad_norm": 1.1465475583145053, + "kl": 0.0390625, + "learning_rate": 9.736328125e-07, + "loss": 0.0016, + "reward": 1.6162505149841309, + "reward_std": 0.17765599489212036, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6553130149841309, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.8984375, + "epoch": 0.10595703125, + "grad_norm": 2.8690359668413126, + "kl": 0.0386962890625, + "learning_rate": 9.735107421875e-07, + "loss": 0.0015, + "reward": 1.6361339688301086, + "reward_std": 0.1363746039569378, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.690821498632431, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.28125, + "epoch": 0.1064453125, + "grad_norm": 3.731840752505406, + "kl": 0.04071044921875, + "learning_rate": 9.73388671875e-07, + "loss": 0.0016, + "reward": 1.7846105098724365, + "reward_std": 0.11500228941440582, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7846105098724365, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.15625, + "epoch": 0.10693359375, + "grad_norm": 1.3127699847264673, + "kl": 0.04736328125, + "learning_rate": 9.732666015625e-07, + "loss": 0.0019, + "reward": 1.7865891456604004, + "reward_std": 0.11259111389517784, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7944017052650452, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.640625, + "epoch": 0.107421875, + "grad_norm": 2.5515744642251033, + "kl": 0.0418701171875, + "learning_rate": 9.7314453125e-07, + "loss": 0.0017, + "reward": 1.5939872860908508, + "reward_std": 0.11360449716448784, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6096123307943344, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.9609375, + "epoch": 0.10791015625, + "grad_norm": 5.193842375036834, + "kl": 0.03350830078125, + "learning_rate": 9.730224609374999e-07, + "loss": 0.0013, + "reward": 1.6299118399620056, + "reward_std": 0.18084490299224854, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6533493101596832, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.28125, + "epoch": 0.1083984375, + "grad_norm": 3.180517710779925, + "kl": 0.03668212890625, + "learning_rate": 9.72900390625e-07, + "loss": 0.0015, + "reward": 1.7199169397354126, + "reward_std": 0.15011364966630936, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7511670291423798, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.828125, + "epoch": 0.10888671875, + "grad_norm": 8.771676797515104, + "kl": 0.03662109375, + "learning_rate": 9.727783203125e-07, + "loss": 0.0015, + "reward": 1.6318160891532898, + "reward_std": 0.060743046924471855, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6318160891532898, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.328125, + "epoch": 0.109375, + "grad_norm": 4.507274379774449, + "kl": 0.045166015625, + "learning_rate": 9.7265625e-07, + "loss": 0.0018, + "reward": 1.6135406494140625, + "reward_std": 0.11964382976293564, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6369781494140625, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.65625, + "epoch": 0.10986328125, + "grad_norm": 0.7984406651823597, + "kl": 0.040771484375, + "learning_rate": 9.725341796875e-07, + "loss": 0.0016, + "reward": 1.5897437930107117, + "reward_std": 0.09517102688550949, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6288062930107117, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.6015625, + "epoch": 0.1103515625, + "grad_norm": 3.691513509948555, + "kl": 0.02752685546875, + "learning_rate": 9.724121093749999e-07, + "loss": 0.0011, + "reward": 1.5694403648376465, + "reward_std": 0.2135012000799179, + "rewards/format_reward": 0.90625, + "rewards/ocr_reward": 0.6631903648376465, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.109375, + "epoch": 0.11083984375, + "grad_norm": 1.9308963288087744, + "kl": 0.031982421875, + "learning_rate": 9.722900390625e-07, + "loss": 0.0013, + "reward": 1.846408486366272, + "reward_std": 0.11861564591526985, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8464084565639496, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.75, + "epoch": 0.111328125, + "grad_norm": 2.2747259722712303, + "kl": 0.03173828125, + "learning_rate": 9.721679687499999e-07, + "loss": 0.0013, + "reward": 1.3386054635047913, + "reward_std": 0.20506983995437622, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.40110543370246887, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.1953125, + "epoch": 0.11181640625, + "grad_norm": 3.898177359401665, + "kl": 0.02996826171875, + "learning_rate": 9.720458984375e-07, + "loss": 0.0012, + "reward": 1.562267780303955, + "reward_std": 0.21021173894405365, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6091427206993103, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.4375, + "epoch": 0.1123046875, + "grad_norm": 4.223589957552283, + "kl": 0.026611328125, + "learning_rate": 9.71923828125e-07, + "loss": 0.0011, + "reward": 1.6675159335136414, + "reward_std": 0.16610606014728546, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7065784335136414, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.71875, + "epoch": 0.11279296875, + "grad_norm": 1.379228938803975, + "kl": 0.0284423828125, + "learning_rate": 9.718017578125e-07, + "loss": 0.0011, + "reward": 1.5755912065505981, + "reward_std": 0.16047358512878418, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6068412065505981, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.90625, + "epoch": 0.11328125, + "grad_norm": 2.347155683071109, + "kl": 0.03155517578125, + "learning_rate": 9.716796875e-07, + "loss": 0.0013, + "reward": 1.6779637932777405, + "reward_std": 0.13747821748256683, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6857762336730957, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.453125, + "epoch": 0.11376953125, + "grad_norm": 1.9265679656994268, + "kl": 0.0267333984375, + "learning_rate": 9.715576171874999e-07, + "loss": 0.0011, + "reward": 1.6846604943275452, + "reward_std": 0.11376481875777245, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6846604943275452, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.4140625, + "epoch": 0.1142578125, + "grad_norm": 2.2869527266899348, + "kl": 0.02642822265625, + "learning_rate": 9.71435546875e-07, + "loss": 0.0011, + "reward": 1.5628395676612854, + "reward_std": 0.1172020323574543, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6175270974636078, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.7578125, + "epoch": 0.11474609375, + "grad_norm": 2.244893512292209, + "kl": 0.02557373046875, + "learning_rate": 9.713134765624999e-07, + "loss": 0.001, + "reward": 1.6671748161315918, + "reward_std": 0.12159543856978416, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.682799756526947, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.390625, + "epoch": 0.115234375, + "grad_norm": 2.953265867643204, + "kl": 0.0301513671875, + "learning_rate": 9.7119140625e-07, + "loss": 0.0012, + "reward": 1.4195521473884583, + "reward_std": 0.08454703539609909, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.42736467719078064, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.6171875, + "epoch": 0.11572265625, + "grad_norm": 5.848251432569352, + "kl": 0.0361328125, + "learning_rate": 9.710693359375e-07, + "loss": 0.0014, + "reward": 1.5740194916725159, + "reward_std": 0.21067717671394348, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5974570214748383, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.0546875, + "epoch": 0.1162109375, + "grad_norm": 3.4952892235581126, + "kl": 0.02520751953125, + "learning_rate": 9.70947265625e-07, + "loss": 0.001, + "reward": 1.545112133026123, + "reward_std": 0.19367430359125137, + "rewards/format_reward": 0.8984375, + "rewards/ocr_reward": 0.646674633026123, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.25, + "epoch": 0.11669921875, + "grad_norm": 6.611274101328795, + "kl": 0.0341796875, + "learning_rate": 9.708251953125e-07, + "loss": 0.0014, + "reward": 1.7572271823883057, + "reward_std": 0.13795867562294006, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7650396823883057, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.53125, + "epoch": 0.1171875, + "grad_norm": 3.1286325507021466, + "kl": 0.0322265625, + "learning_rate": 9.707031249999999e-07, + "loss": 0.0013, + "reward": 1.6395533084869385, + "reward_std": 0.09670542925596237, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6395533084869385, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.703125, + "epoch": 0.11767578125, + "grad_norm": 1.627275271283892, + "kl": 0.038330078125, + "learning_rate": 9.705810546875e-07, + "loss": 0.0015, + "reward": 1.644788920879364, + "reward_std": 0.04493547976016998, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6447888910770416, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.7890625, + "epoch": 0.1181640625, + "grad_norm": 6.14755791950804, + "kl": 0.03076171875, + "learning_rate": 9.70458984375e-07, + "loss": 0.0012, + "reward": 1.6965675354003906, + "reward_std": 0.12545205652713776, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7043800354003906, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.296875, + "epoch": 0.11865234375, + "grad_norm": 4.504232771428641, + "kl": 0.0355224609375, + "learning_rate": 9.703369140625e-07, + "loss": 0.0014, + "reward": 1.6774699091911316, + "reward_std": 0.10419408231973648, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6930948793888092, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.484375, + "epoch": 0.119140625, + "grad_norm": 1.7486968845761774, + "kl": 0.03228759765625, + "learning_rate": 9.7021484375e-07, + "loss": 0.0013, + "reward": 1.8203869462013245, + "reward_std": 0.08420379087328911, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8203868865966797, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.53125, + "epoch": 0.11962890625, + "grad_norm": 2.2888106990202304, + "kl": 0.031982421875, + "learning_rate": 9.700927734375e-07, + "loss": 0.0013, + "reward": 1.5356090068817139, + "reward_std": 0.18412478268146515, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.5746715664863586, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.40625, + "epoch": 0.1201171875, + "grad_norm": 4.440936065671452, + "kl": 0.02557373046875, + "learning_rate": 9.69970703125e-07, + "loss": 0.001, + "reward": 1.6864354610443115, + "reward_std": 0.16976945102214813, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7176855206489563, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.296875, + "epoch": 0.12060546875, + "grad_norm": 1.6696154402341608, + "kl": 0.03326416015625, + "learning_rate": 9.698486328124999e-07, + "loss": 0.0013, + "reward": 1.6419482827186584, + "reward_std": 0.141361266374588, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6810107231140137, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.453125, + "epoch": 0.12109375, + "grad_norm": 5.609952930401551, + "kl": 0.03277587890625, + "learning_rate": 9.697265625e-07, + "loss": 0.0013, + "reward": 1.6594606637954712, + "reward_std": 0.09116644039750099, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6594606637954712, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.3125, + "epoch": 0.12158203125, + "grad_norm": 1.6010400179727664, + "kl": 0.028076171875, + "learning_rate": 9.696044921875e-07, + "loss": 0.0011, + "reward": 1.548350989818573, + "reward_std": 0.11171835660934448, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5639760047197342, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.5078125, + "epoch": 0.1220703125, + "grad_norm": 8.329224730907894, + "kl": 0.0340576171875, + "learning_rate": 9.69482421875e-07, + "loss": 0.0014, + "reward": 1.4914612770080566, + "reward_std": 0.21047968417406082, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.569586306810379, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.4296875, + "epoch": 0.12255859375, + "grad_norm": 3.6020583542087117, + "kl": 0.030029296875, + "learning_rate": 9.693603515625e-07, + "loss": 0.0012, + "reward": 1.811837911605835, + "reward_std": 0.045381875708699226, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8118377923965454, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.6171875, + "epoch": 0.123046875, + "grad_norm": 2.8891967781818044, + "kl": 0.02545166015625, + "learning_rate": 9.6923828125e-07, + "loss": 0.001, + "reward": 1.5089460015296936, + "reward_std": 0.3081662133336067, + "rewards/format_reward": 0.90625, + "rewards/ocr_reward": 0.602696031332016, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.8046875, + "epoch": 0.12353515625, + "grad_norm": 3.246272295841944, + "kl": 0.04095458984375, + "learning_rate": 9.691162109375e-07, + "loss": 0.0016, + "reward": 1.7035585045814514, + "reward_std": 0.11963363364338875, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7035585343837738, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.7109375, + "epoch": 0.1240234375, + "grad_norm": 3.151160447979875, + "kl": 0.0423583984375, + "learning_rate": 9.689941406249999e-07, + "loss": 0.0017, + "reward": 1.6344158053398132, + "reward_std": 0.18827372789382935, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6656658351421356, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.65625, + "epoch": 0.12451171875, + "grad_norm": 6.070686166424575, + "kl": 0.03094482421875, + "learning_rate": 9.688720703125e-07, + "loss": 0.0012, + "reward": 1.71599280834198, + "reward_std": 0.15431293100118637, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7394302487373352, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.0234375, + "epoch": 0.125, + "grad_norm": 0.9304227502971951, + "kl": 0.03057861328125, + "learning_rate": 9.6875e-07, + "loss": 0.0012, + "reward": 1.7525382041931152, + "reward_std": 0.09551074542105198, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7759757041931152, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.71875, + "epoch": 0.12548828125, + "grad_norm": 3.7527081977389494, + "kl": 0.0892333984375, + "learning_rate": 9.686279296875e-07, + "loss": 0.0036, + "reward": 1.8059654235839844, + "reward_std": 0.11925885081291199, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8137778639793396, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.890625, + "epoch": 0.1259765625, + "grad_norm": 3.2980897081154468, + "kl": 0.027099609375, + "learning_rate": 9.68505859375e-07, + "loss": 0.0011, + "reward": 1.7137970328330994, + "reward_std": 0.13171366602182388, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7294220626354218, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.953125, + "epoch": 0.12646484375, + "grad_norm": 4.3680814156942285, + "kl": 0.055419921875, + "learning_rate": 9.683837890624999e-07, + "loss": 0.0022, + "reward": 1.719020664691925, + "reward_std": 0.10069620236754417, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7268331944942474, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.1484375, + "epoch": 0.126953125, + "grad_norm": 1.7575782040816468, + "kl": 0.032470703125, + "learning_rate": 9.6826171875e-07, + "loss": 0.0013, + "reward": 1.756038784980774, + "reward_std": 0.1373431235551834, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7560386955738068, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.1640625, + "epoch": 0.12744140625, + "grad_norm": 2.2674231639337674, + "kl": 0.0382080078125, + "learning_rate": 9.681396484374999e-07, + "loss": 0.0015, + "reward": 1.6681320667266846, + "reward_std": 0.08800495602190495, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6681320667266846, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.0390625, + "epoch": 0.1279296875, + "grad_norm": 2.133067633460261, + "kl": 0.032470703125, + "learning_rate": 9.68017578125e-07, + "loss": 0.0013, + "reward": 1.7665959596633911, + "reward_std": 0.11527542397379875, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7744084894657135, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.7734375, + "epoch": 0.12841796875, + "grad_norm": 2.9133910121332476, + "kl": 0.0313720703125, + "learning_rate": 9.678955078125e-07, + "loss": 0.0013, + "reward": 1.622836172580719, + "reward_std": 0.08527448028326035, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6228361874818802, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.0, + "epoch": 0.12890625, + "grad_norm": 2.595596598906522, + "kl": 0.0341796875, + "learning_rate": 9.677734375e-07, + "loss": 0.0014, + "reward": 1.753430426120758, + "reward_std": 0.06863740459084511, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7534304261207581, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.4609375, + "epoch": 0.12939453125, + "grad_norm": 1.9387469073690122, + "kl": 0.0390625, + "learning_rate": 9.676513671875e-07, + "loss": 0.0016, + "reward": 1.6287448406219482, + "reward_std": 0.15640820562839508, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6443698704242706, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.453125, + "epoch": 0.1298828125, + "grad_norm": 13.216425880817694, + "kl": 0.0343017578125, + "learning_rate": 9.675292968749999e-07, + "loss": 0.0014, + "reward": 1.731309413909912, + "reward_std": 0.08267020061612129, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.731309324502945, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.59375, + "epoch": 0.13037109375, + "grad_norm": 4.862426726552091, + "kl": 0.0419921875, + "learning_rate": 9.674072265625e-07, + "loss": 0.0017, + "reward": 1.648730993270874, + "reward_std": 0.07836638763546944, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6487309336662292, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.9375, + "epoch": 0.130859375, + "grad_norm": 21.763882065889554, + "kl": 0.0308837890625, + "learning_rate": 9.6728515625e-07, + "loss": 0.0012, + "reward": 1.6850923895835876, + "reward_std": 0.10728929005563259, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7007173895835876, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.265625, + "epoch": 0.13134765625, + "grad_norm": 2.082497501815107, + "kl": 0.0380859375, + "learning_rate": 9.671630859375e-07, + "loss": 0.0015, + "reward": 1.667827844619751, + "reward_std": 0.09125854074954987, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6678277850151062, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.4140625, + "epoch": 0.1318359375, + "grad_norm": 3.1061731600297717, + "kl": 0.0426025390625, + "learning_rate": 9.67041015625e-07, + "loss": 0.0017, + "reward": 1.6657472848892212, + "reward_std": 0.10530559718608856, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.665747344493866, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.1328125, + "epoch": 0.13232421875, + "grad_norm": 1.6926718726678105, + "kl": 0.03216552734375, + "learning_rate": 9.669189453125e-07, + "loss": 0.0013, + "reward": 1.6102675795555115, + "reward_std": 0.20465338230133057, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6415176093578339, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.875, + "epoch": 0.1328125, + "grad_norm": 1.5814783438080073, + "kl": 0.0360107421875, + "learning_rate": 9.66796875e-07, + "loss": 0.0014, + "reward": 1.6680699586868286, + "reward_std": 0.0880473144352436, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6680700480937958, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.203125, + "epoch": 0.13330078125, + "grad_norm": 2.340261215855065, + "kl": 0.0401611328125, + "learning_rate": 9.666748046874999e-07, + "loss": 0.0016, + "reward": 1.7241803407669067, + "reward_std": 0.1692553162574768, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7476178705692291, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.7578125, + "epoch": 0.1337890625, + "grad_norm": 4.11544829128727, + "kl": 0.0391845703125, + "learning_rate": 9.66552734375e-07, + "loss": 0.0016, + "reward": 1.781490683555603, + "reward_std": 0.13933787494897842, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7971156537532806, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.2578125, + "epoch": 0.13427734375, + "grad_norm": 1.4526251271367776, + "kl": 0.0401611328125, + "learning_rate": 9.664306640625e-07, + "loss": 0.0016, + "reward": 1.6937137246131897, + "reward_std": 0.1856069192290306, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7327762842178345, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.578125, + "epoch": 0.134765625, + "grad_norm": 10.727511575491055, + "kl": 0.0391845703125, + "learning_rate": 9.6630859375e-07, + "loss": 0.0016, + "reward": 1.5141828656196594, + "reward_std": 0.12065092846751213, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5219953954219818, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.8671875, + "epoch": 0.13525390625, + "grad_norm": 1.9833864945403907, + "kl": 0.03375244140625, + "learning_rate": 9.661865234375e-07, + "loss": 0.0013, + "reward": 1.536266803741455, + "reward_std": 0.21020027250051498, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.6143918633460999, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.4140625, + "epoch": 0.1357421875, + "grad_norm": 6.895919430163141, + "kl": 0.029541015625, + "learning_rate": 9.66064453125e-07, + "loss": 0.0012, + "reward": 1.6948537826538086, + "reward_std": 0.11981324478983879, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.702666312456131, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.328125, + "epoch": 0.13623046875, + "grad_norm": 2.2534331703734067, + "kl": 0.03424072265625, + "learning_rate": 9.659423828125e-07, + "loss": 0.0014, + "reward": 1.6411468386650085, + "reward_std": 0.08064734004437923, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6411468386650085, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.625, + "epoch": 0.13671875, + "grad_norm": 3.863896913907151, + "kl": 0.04150390625, + "learning_rate": 9.658203124999999e-07, + "loss": 0.0017, + "reward": 1.6285604238510132, + "reward_std": 0.12783172726631165, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6754354536533356, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.609375, + "epoch": 0.13720703125, + "grad_norm": 2.933038355393098, + "kl": 0.02764892578125, + "learning_rate": 9.656982421875e-07, + "loss": 0.0011, + "reward": 1.7419158220291138, + "reward_std": 0.14980874210596085, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7653533220291138, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.703125, + "epoch": 0.1376953125, + "grad_norm": 9.395073865019247, + "kl": 0.03985595703125, + "learning_rate": 9.65576171875e-07, + "loss": 0.0016, + "reward": 1.6920581459999084, + "reward_std": 0.12204625830054283, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6920581459999084, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.3984375, + "epoch": 0.13818359375, + "grad_norm": 3.37707988325681, + "kl": 0.035400390625, + "learning_rate": 9.654541015625e-07, + "loss": 0.0014, + "reward": 1.497445821762085, + "reward_std": 0.1840338483452797, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.5365082919597626, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.3046875, + "epoch": 0.138671875, + "grad_norm": 2.0075105685871426, + "kl": 0.032958984375, + "learning_rate": 9.6533203125e-07, + "loss": 0.0013, + "reward": 1.6478480100631714, + "reward_std": 0.11625828593969345, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6634730100631714, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.578125, + "epoch": 0.13916015625, + "grad_norm": 6.271628293640765, + "kl": 0.03070068359375, + "learning_rate": 9.652099609374999e-07, + "loss": 0.0012, + "reward": 1.5877465605735779, + "reward_std": 0.18424838036298752, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6268090903759003, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.734375, + "epoch": 0.1396484375, + "grad_norm": 2.765333625422615, + "kl": 0.039306640625, + "learning_rate": 9.65087890625e-07, + "loss": 0.0016, + "reward": 1.6684794425964355, + "reward_std": 0.21452812105417252, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.699729323387146, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.046875, + "epoch": 0.14013671875, + "grad_norm": 2.836462248192525, + "kl": 0.0313720703125, + "learning_rate": 9.649658203124999e-07, + "loss": 0.0013, + "reward": 1.7276391983032227, + "reward_std": 0.19272325932979584, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7667016685009003, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.953125, + "epoch": 0.140625, + "grad_norm": 1.5790291022053742, + "kl": 0.02911376953125, + "learning_rate": 9.6484375e-07, + "loss": 0.0012, + "reward": 1.6144769787788391, + "reward_std": 0.1834145449101925, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6535394489765167, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.28125, + "epoch": 0.14111328125, + "grad_norm": 1.1042158738010264, + "kl": 0.0274658203125, + "learning_rate": 9.647216796875e-07, + "loss": 0.0011, + "reward": 1.6875471472740173, + "reward_std": 0.1275060921907425, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7031721770763397, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.125, + "epoch": 0.1416015625, + "grad_norm": 2.283777941321073, + "kl": 0.0245361328125, + "learning_rate": 9.64599609375e-07, + "loss": 0.001, + "reward": 1.6654972434043884, + "reward_std": 0.1885884590446949, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7045597434043884, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.46875, + "epoch": 0.14208984375, + "grad_norm": 1.5795066656688896, + "kl": 0.0257568359375, + "learning_rate": 9.644775390625e-07, + "loss": 0.001, + "reward": 1.7022438049316406, + "reward_std": 0.1274988241493702, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7256813049316406, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.4921875, + "epoch": 0.142578125, + "grad_norm": 2.172103990155339, + "kl": 0.024658203125, + "learning_rate": 9.643554687499999e-07, + "loss": 0.001, + "reward": 1.608510136604309, + "reward_std": 0.11927095800638199, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6475726366043091, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.3125, + "epoch": 0.14306640625, + "grad_norm": 3.281405859635334, + "kl": 0.038330078125, + "learning_rate": 9.642333984375e-07, + "loss": 0.0015, + "reward": 1.5347102880477905, + "reward_std": 0.10195699892938137, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5659602731466293, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.8046875, + "epoch": 0.1435546875, + "grad_norm": 1.4610945064230194, + "kl": 0.02362060546875, + "learning_rate": 9.64111328125e-07, + "loss": 0.0009, + "reward": 1.6751810312271118, + "reward_std": 0.1327841766178608, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6986185312271118, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.4375, + "epoch": 0.14404296875, + "grad_norm": 2.083770286674266, + "kl": 0.03302001953125, + "learning_rate": 9.639892578125e-07, + "loss": 0.0013, + "reward": 1.7501333951950073, + "reward_std": 0.09881668537855148, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7579458951950073, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.078125, + "epoch": 0.14453125, + "grad_norm": 3.9481991676001216, + "kl": 0.0372314453125, + "learning_rate": 9.638671875e-07, + "loss": 0.0015, + "reward": 1.6677301526069641, + "reward_std": 0.07496082410216331, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6677302122116089, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.6484375, + "epoch": 0.14501953125, + "grad_norm": 7.04024659813487, + "kl": 0.0308837890625, + "learning_rate": 9.637451171875e-07, + "loss": 0.0012, + "reward": 1.7570677399635315, + "reward_std": 0.09553324803709984, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7570676803588867, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.3203125, + "epoch": 0.1455078125, + "grad_norm": 4.161631659200875, + "kl": 0.0411376953125, + "learning_rate": 9.63623046875e-07, + "loss": 0.0016, + "reward": 1.5669713020324707, + "reward_std": 0.17707626521587372, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5904087424278259, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.9375, + "epoch": 0.14599609375, + "grad_norm": 2.9329021992243134, + "kl": 0.0474853515625, + "learning_rate": 9.635009765624999e-07, + "loss": 0.0019, + "reward": 1.6684596538543701, + "reward_std": 0.1166144497692585, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6997096538543701, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.0390625, + "epoch": 0.146484375, + "grad_norm": 4.806487359363194, + "kl": 0.033935546875, + "learning_rate": 9.6337890625e-07, + "loss": 0.0014, + "reward": 1.83830726146698, + "reward_std": 0.04638480953872204, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8383072018623352, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.6875, + "epoch": 0.14697265625, + "grad_norm": 1.8400115433509951, + "kl": 0.03753662109375, + "learning_rate": 9.632568359375e-07, + "loss": 0.0015, + "reward": 1.6406999826431274, + "reward_std": 0.05689780414104462, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.640699952840805, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.078125, + "epoch": 0.1474609375, + "grad_norm": 1.859129267311832, + "kl": 0.03448486328125, + "learning_rate": 9.63134765625e-07, + "loss": 0.0014, + "reward": 1.6312952637672424, + "reward_std": 0.08252920210361481, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6312953531742096, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.59375, + "epoch": 0.14794921875, + "grad_norm": 1.5493611450032359, + "kl": 0.02734375, + "learning_rate": 9.630126953125e-07, + "loss": 0.0011, + "reward": 1.7147611379623413, + "reward_std": 0.08944166824221611, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7225736379623413, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.2421875, + "epoch": 0.1484375, + "grad_norm": 1.8976423626253172, + "kl": 0.03570556640625, + "learning_rate": 9.62890625e-07, + "loss": 0.0014, + "reward": 1.6145520210266113, + "reward_std": 0.18844667822122574, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6614269018173218, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.640625, + "epoch": 0.14892578125, + "grad_norm": 7.425501960614286, + "kl": 0.047607421875, + "learning_rate": 9.627685546875e-07, + "loss": 0.0019, + "reward": 1.7208858728408813, + "reward_std": 0.1330663561820984, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7208858132362366, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.6484375, + "epoch": 0.1494140625, + "grad_norm": 2.769163068983383, + "kl": 0.03759765625, + "learning_rate": 9.626464843749999e-07, + "loss": 0.0015, + "reward": 1.5909721851348877, + "reward_std": 0.21000181138515472, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6144096851348877, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.984375, + "epoch": 0.14990234375, + "grad_norm": 5.7395657037691326, + "kl": 0.03375244140625, + "learning_rate": 9.625244140625e-07, + "loss": 0.0013, + "reward": 1.6622443199157715, + "reward_std": 0.03923766687512398, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6622443348169327, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.1328125, + "epoch": 0.150390625, + "grad_norm": 2.6717724620758663, + "kl": 0.02557373046875, + "learning_rate": 9.6240234375e-07, + "loss": 0.001, + "reward": 1.5063217282295227, + "reward_std": 0.20277608931064606, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.5531966686248779, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.0234375, + "epoch": 0.15087890625, + "grad_norm": 2.18014699586722, + "kl": 0.041748046875, + "learning_rate": 9.622802734375e-07, + "loss": 0.0017, + "reward": 1.6508269906044006, + "reward_std": 0.13892033696174622, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.697702020406723, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.9765625, + "epoch": 0.1513671875, + "grad_norm": 1.717478656404003, + "kl": 0.0283203125, + "learning_rate": 9.62158203125e-07, + "loss": 0.0011, + "reward": 1.6870404481887817, + "reward_std": 0.06977767683565617, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6870404779911041, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.4765625, + "epoch": 0.15185546875, + "grad_norm": 1.7043876842099035, + "kl": 0.0340576171875, + "learning_rate": 9.620361328124999e-07, + "loss": 0.0014, + "reward": 1.5702768564224243, + "reward_std": 0.15926361829042435, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5859018266201019, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.2109375, + "epoch": 0.15234375, + "grad_norm": 2.125053862773254, + "kl": 0.028076171875, + "learning_rate": 9.619140625e-07, + "loss": 0.0011, + "reward": 1.6080606579780579, + "reward_std": 0.14491120725870132, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6471231281757355, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.90625, + "epoch": 0.15283203125, + "grad_norm": 3.1096892974164425, + "kl": 0.036865234375, + "learning_rate": 9.617919921874999e-07, + "loss": 0.0015, + "reward": 1.5795653462409973, + "reward_std": 0.11042843386530876, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5795653164386749, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.59375, + "epoch": 0.1533203125, + "grad_norm": 3.5995680052886527, + "kl": 0.0377197265625, + "learning_rate": 9.61669921875e-07, + "loss": 0.0015, + "reward": 1.646964430809021, + "reward_std": 0.12394942343235016, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.646964430809021, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.0390625, + "epoch": 0.15380859375, + "grad_norm": 1.956255504508322, + "kl": 0.0301513671875, + "learning_rate": 9.615478515625e-07, + "loss": 0.0012, + "reward": 1.7696388363838196, + "reward_std": 0.05953131802380085, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7696388363838196, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.515625, + "epoch": 0.154296875, + "grad_norm": 5.191246910967392, + "kl": 0.03155517578125, + "learning_rate": 9.6142578125e-07, + "loss": 0.0013, + "reward": 1.5658961534500122, + "reward_std": 0.12328368425369263, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6205836087465286, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.1796875, + "epoch": 0.15478515625, + "grad_norm": 4.500450512155949, + "kl": 0.03509521484375, + "learning_rate": 9.613037109375e-07, + "loss": 0.0014, + "reward": 1.6759998798370361, + "reward_std": 0.100888442248106, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6759998500347137, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.234375, + "epoch": 0.1552734375, + "grad_norm": 3.2796857668648842, + "kl": 0.0269775390625, + "learning_rate": 9.611816406249999e-07, + "loss": 0.0011, + "reward": 1.6215779781341553, + "reward_std": 0.1620483510196209, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6684529185295105, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.2578125, + "epoch": 0.15576171875, + "grad_norm": 2.7662804735100517, + "kl": 0.03497314453125, + "learning_rate": 9.610595703125e-07, + "loss": 0.0014, + "reward": 1.6629568934440613, + "reward_std": 0.14340640604496002, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.670769453048706, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.8671875, + "epoch": 0.15625, + "grad_norm": 4.542442828253781, + "kl": 0.0322265625, + "learning_rate": 9.609374999999999e-07, + "loss": 0.0013, + "reward": 1.711995244026184, + "reward_std": 0.19287973642349243, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7510578036308289, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.2265625, + "epoch": 0.15673828125, + "grad_norm": 4.052768372311868, + "kl": 0.0286865234375, + "learning_rate": 9.608154296875e-07, + "loss": 0.0011, + "reward": 1.6291555762290955, + "reward_std": 0.11671308055520058, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6525930762290955, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.5390625, + "epoch": 0.1572265625, + "grad_norm": 2.162537030435017, + "kl": 0.0362548828125, + "learning_rate": 9.60693359375e-07, + "loss": 0.0014, + "reward": 1.6343209147453308, + "reward_std": 0.16108915954828262, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6499459147453308, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.09375, + "epoch": 0.15771484375, + "grad_norm": 1.856325185116223, + "kl": 0.0341796875, + "learning_rate": 9.605712890625e-07, + "loss": 0.0014, + "reward": 1.7311798930168152, + "reward_std": 0.06938901171088219, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7311798632144928, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.4296875, + "epoch": 0.158203125, + "grad_norm": 5.005867500611158, + "kl": 0.0361328125, + "learning_rate": 9.6044921875e-07, + "loss": 0.0014, + "reward": 1.526106595993042, + "reward_std": 0.20226696878671646, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.5886066257953644, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.2109375, + "epoch": 0.15869140625, + "grad_norm": 1.6834488335758562, + "kl": 0.0377197265625, + "learning_rate": 9.603271484374999e-07, + "loss": 0.0015, + "reward": 1.7446966171264648, + "reward_std": 0.09506701678037643, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7446966171264648, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.0234375, + "epoch": 0.1591796875, + "grad_norm": 34.72900439154182, + "kl": 0.02752685546875, + "learning_rate": 9.60205078125e-07, + "loss": 0.0011, + "reward": 1.6595964431762695, + "reward_std": 0.16007909923791885, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6752214133739471, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.4609375, + "epoch": 0.15966796875, + "grad_norm": 4.142586916976736, + "kl": 0.0384521484375, + "learning_rate": 9.600830078125e-07, + "loss": 0.0015, + "reward": 1.7855232954025269, + "reward_std": 0.13429051637649536, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7933357656002045, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.6796875, + "epoch": 0.16015625, + "grad_norm": 2.0417286955446574, + "kl": 0.0391845703125, + "learning_rate": 9.599609375e-07, + "loss": 0.0016, + "reward": 1.7793214321136475, + "reward_std": 0.05697181820869446, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7793213725090027, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.84375, + "epoch": 0.16064453125, + "grad_norm": 1.4353302978671976, + "kl": 0.0302734375, + "learning_rate": 9.598388671875e-07, + "loss": 0.0012, + "reward": 1.6387850642204285, + "reward_std": 0.37150806188583374, + "rewards/format_reward": 0.90625, + "rewards/ocr_reward": 0.7325350046157837, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.1328125, + "epoch": 0.1611328125, + "grad_norm": 3.1073764344455337, + "kl": 0.0394287109375, + "learning_rate": 9.59716796875e-07, + "loss": 0.0016, + "reward": 1.5094847083091736, + "reward_std": 0.13999176025390625, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.5719846189022064, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.53125, + "epoch": 0.16162109375, + "grad_norm": 1.3469828802315056, + "kl": 0.02386474609375, + "learning_rate": 9.595947265625e-07, + "loss": 0.001, + "reward": 1.7278481125831604, + "reward_std": 0.1803218349814415, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7747230529785156, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.1328125, + "epoch": 0.162109375, + "grad_norm": 2.4061717119122776, + "kl": 0.03167724609375, + "learning_rate": 9.594726562499999e-07, + "loss": 0.0013, + "reward": 1.7532138228416443, + "reward_std": 0.13944057375192642, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7610263526439667, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.578125, + "epoch": 0.16259765625, + "grad_norm": 1.5063714645040478, + "kl": 0.03363037109375, + "learning_rate": 9.593505859375e-07, + "loss": 0.0013, + "reward": 1.6756377220153809, + "reward_std": 0.06455008871853352, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6756377518177032, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.3125, + "epoch": 0.1630859375, + "grad_norm": 1.3347945227728137, + "kl": 0.02984619140625, + "learning_rate": 9.59228515625e-07, + "loss": 0.0012, + "reward": 1.7514132857322693, + "reward_std": 0.09506377205252647, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7592257857322693, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.109375, + "epoch": 0.16357421875, + "grad_norm": 3.723407278968701, + "kl": 0.0455322265625, + "learning_rate": 9.591064453125e-07, + "loss": 0.0018, + "reward": 1.6376798152923584, + "reward_std": 0.15414723008871078, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6767423748970032, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.53125, + "epoch": 0.1640625, + "grad_norm": 3.9448517340622655, + "kl": 0.031494140625, + "learning_rate": 9.58984375e-07, + "loss": 0.0013, + "reward": 1.668643057346344, + "reward_std": 0.07998159155249596, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6686430275440216, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.0390625, + "epoch": 0.16455078125, + "grad_norm": 11.966779228153586, + "kl": 0.0382080078125, + "learning_rate": 9.588623046875e-07, + "loss": 0.0015, + "reward": 1.6435166597366333, + "reward_std": 0.13468455523252487, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6903917491436005, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.390625, + "epoch": 0.1650390625, + "grad_norm": 1.3026863061956178, + "kl": 0.042724609375, + "learning_rate": 9.58740234375e-07, + "loss": 0.0017, + "reward": 1.6170286536216736, + "reward_std": 0.03771189600229263, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6170286238193512, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.6484375, + "epoch": 0.16552734375, + "grad_norm": 1.7776160609392315, + "kl": 0.029541015625, + "learning_rate": 9.586181640624999e-07, + "loss": 0.0012, + "reward": 1.6082661151885986, + "reward_std": 0.16181888803839684, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6238911151885986, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.3515625, + "epoch": 0.166015625, + "grad_norm": 3.4344192664071636, + "kl": 0.0374755859375, + "learning_rate": 9.5849609375e-07, + "loss": 0.0015, + "reward": 1.6772453784942627, + "reward_std": 0.09790786355733871, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7241203486919403, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.125, + "epoch": 0.16650390625, + "grad_norm": 3.4863206189382785, + "kl": 0.038330078125, + "learning_rate": 9.583740234375e-07, + "loss": 0.0015, + "reward": 1.7005472779273987, + "reward_std": 0.09716508537530899, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7083597481250763, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.265625, + "epoch": 0.1669921875, + "grad_norm": 16.588569687572924, + "kl": 0.03466796875, + "learning_rate": 9.58251953125e-07, + "loss": 0.0014, + "reward": 1.6174096465110779, + "reward_std": 0.11772006377577782, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6330346167087555, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.96875, + "epoch": 0.16748046875, + "grad_norm": 3.315958682715951, + "kl": 0.03314208984375, + "learning_rate": 9.581298828125e-07, + "loss": 0.0013, + "reward": 1.5178037285804749, + "reward_std": 0.1745915710926056, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5490537583827972, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.5, + "epoch": 0.16796875, + "grad_norm": 2.2349202653555365, + "kl": 0.0390625, + "learning_rate": 9.580078124999999e-07, + "loss": 0.0016, + "reward": 1.6745514273643494, + "reward_std": 0.1728959158062935, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.721426397562027, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.84375, + "epoch": 0.16845703125, + "grad_norm": 2.8256080294771637, + "kl": 0.04638671875, + "learning_rate": 9.578857421875e-07, + "loss": 0.0019, + "reward": 1.7369277477264404, + "reward_std": 0.05663881450891495, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7369276583194733, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.5703125, + "epoch": 0.1689453125, + "grad_norm": 2.627531421994624, + "kl": 0.0396728515625, + "learning_rate": 9.577636718749999e-07, + "loss": 0.0016, + "reward": 1.5510008335113525, + "reward_std": 0.17610786110162735, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5822509080171585, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.6015625, + "epoch": 0.16943359375, + "grad_norm": 3.1216112029414482, + "kl": 0.03594970703125, + "learning_rate": 9.576416015625e-07, + "loss": 0.0014, + "reward": 1.7094944715499878, + "reward_std": 0.08010836690664291, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7094944417476654, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.359375, + "epoch": 0.169921875, + "grad_norm": 2.0161100389850617, + "kl": 0.0460205078125, + "learning_rate": 9.5751953125e-07, + "loss": 0.0018, + "reward": 1.769561767578125, + "reward_std": 0.03940633311867714, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.769561767578125, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.3203125, + "epoch": 0.17041015625, + "grad_norm": 1.678953605120237, + "kl": 0.02923583984375, + "learning_rate": 9.573974609375e-07, + "loss": 0.0012, + "reward": 1.7712068557739258, + "reward_std": 0.1021023616194725, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7946443557739258, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.421875, + "epoch": 0.1708984375, + "grad_norm": 3.911611150477222, + "kl": 0.0369873046875, + "learning_rate": 9.57275390625e-07, + "loss": 0.0015, + "reward": 1.5145609378814697, + "reward_std": 0.2088497430086136, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.5614359080791473, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.7109375, + "epoch": 0.17138671875, + "grad_norm": 6.385932586698964, + "kl": 0.035400390625, + "learning_rate": 9.571533203124999e-07, + "loss": 0.0014, + "reward": 1.5890177488327026, + "reward_std": 0.0945354737341404, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5890178084373474, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.5625, + "epoch": 0.171875, + "grad_norm": 1.6280503492839655, + "kl": 0.037353515625, + "learning_rate": 9.5703125e-07, + "loss": 0.0015, + "reward": 1.6745615005493164, + "reward_std": 0.10443703085184097, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6823740601539612, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.9921875, + "epoch": 0.17236328125, + "grad_norm": 1.5117691402769504, + "kl": 0.03424072265625, + "learning_rate": 9.569091796875e-07, + "loss": 0.0014, + "reward": 1.6382949948310852, + "reward_std": 0.20788590610027313, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.66173255443573, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.6171875, + "epoch": 0.1728515625, + "grad_norm": 3.7063712081293416, + "kl": 0.0482177734375, + "learning_rate": 9.56787109375e-07, + "loss": 0.0019, + "reward": 1.6848008632659912, + "reward_std": 0.13139459863305092, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.684800922870636, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.5625, + "epoch": 0.17333984375, + "grad_norm": 1.5391314099083317, + "kl": 0.040283203125, + "learning_rate": 9.566650390625e-07, + "loss": 0.0016, + "reward": 1.6223503947257996, + "reward_std": 0.1576566994190216, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6301629543304443, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.78125, + "epoch": 0.173828125, + "grad_norm": 2.7058681092420795, + "kl": 0.0372314453125, + "learning_rate": 9.5654296875e-07, + "loss": 0.0015, + "reward": 1.525748610496521, + "reward_std": 0.19213548302650452, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5491860806941986, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.0390625, + "epoch": 0.17431640625, + "grad_norm": 2.0154711074208773, + "kl": 0.0439453125, + "learning_rate": 9.564208984375e-07, + "loss": 0.0018, + "reward": 1.7262452840805054, + "reward_std": 0.11205626837909222, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.726245254278183, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.6328125, + "epoch": 0.1748046875, + "grad_norm": 3.2566800818643813, + "kl": 0.036865234375, + "learning_rate": 9.562988281249999e-07, + "loss": 0.0015, + "reward": 1.5986173748970032, + "reward_std": 0.17809632420539856, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6142423450946808, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.9765625, + "epoch": 0.17529296875, + "grad_norm": 4.6715149905690545, + "kl": 0.0439453125, + "learning_rate": 9.561767578125e-07, + "loss": 0.0018, + "reward": 1.685244619846344, + "reward_std": 0.07497452571988106, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.685244619846344, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.46875, + "epoch": 0.17578125, + "grad_norm": 2.105222847919174, + "kl": 0.0450439453125, + "learning_rate": 9.560546875e-07, + "loss": 0.0018, + "reward": 1.6982702612876892, + "reward_std": 0.17531277611851692, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7138952612876892, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.984375, + "epoch": 0.17626953125, + "grad_norm": 5.232570191497886, + "kl": 0.0435791015625, + "learning_rate": 9.559326171875e-07, + "loss": 0.0017, + "reward": 1.7132031321525574, + "reward_std": 0.1074238047003746, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7132031321525574, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.71875, + "epoch": 0.1767578125, + "grad_norm": 2.1497345593947985, + "kl": 0.0511474609375, + "learning_rate": 9.55810546875e-07, + "loss": 0.002, + "reward": 1.4040643572807312, + "reward_std": 0.08128705434501171, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.4040642976760864, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.515625, + "epoch": 0.17724609375, + "grad_norm": 8.38599033866768, + "kl": 0.04931640625, + "learning_rate": 9.556884765625e-07, + "loss": 0.002, + "reward": 1.6172441244125366, + "reward_std": 0.10681581497192383, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6172442138195038, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.625, + "epoch": 0.177734375, + "grad_norm": 1.767290932124583, + "kl": 0.0501708984375, + "learning_rate": 9.5556640625e-07, + "loss": 0.002, + "reward": 1.6871461868286133, + "reward_std": 0.060712188482284546, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6871461272239685, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.125, + "epoch": 0.17822265625, + "grad_norm": 2.9068478143017344, + "kl": 0.0465087890625, + "learning_rate": 9.554443359374999e-07, + "loss": 0.0019, + "reward": 1.721463680267334, + "reward_std": 0.0778956264257431, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7214637100696564, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.359375, + "epoch": 0.1787109375, + "grad_norm": 5.5157852848407245, + "kl": 0.0372314453125, + "learning_rate": 9.55322265625e-07, + "loss": 0.0015, + "reward": 1.7536060810089111, + "reward_std": 0.10080629587173462, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7536060810089111, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.875, + "epoch": 0.17919921875, + "grad_norm": 2.984431123069507, + "kl": 0.0540771484375, + "learning_rate": 9.552001953125e-07, + "loss": 0.0022, + "reward": 1.6196495294570923, + "reward_std": 0.10086812451481819, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6196494698524475, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.8046875, + "epoch": 0.1796875, + "grad_norm": 3.157195401410498, + "kl": 0.063720703125, + "learning_rate": 9.55078125e-07, + "loss": 0.0025, + "reward": 1.715992033481598, + "reward_std": 0.1297970972955227, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7238045334815979, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.09375, + "epoch": 0.18017578125, + "grad_norm": 2.324085860520846, + "kl": 0.070556640625, + "learning_rate": 9.549560546875e-07, + "loss": 0.0028, + "reward": 1.700922667980194, + "reward_std": 0.08325351774692535, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7009226679801941, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.234375, + "epoch": 0.1806640625, + "grad_norm": 2.004151009626354, + "kl": 0.0498046875, + "learning_rate": 9.548339843749999e-07, + "loss": 0.002, + "reward": 1.6857663989067078, + "reward_std": 0.1576274000108242, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6935788691043854, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.8359375, + "epoch": 0.18115234375, + "grad_norm": 2.404587038530015, + "kl": 0.046875, + "learning_rate": 9.547119140625e-07, + "loss": 0.0019, + "reward": 1.6597256660461426, + "reward_std": 0.13613457418978214, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6831631660461426, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.0703125, + "epoch": 0.181640625, + "grad_norm": 2.834868853183062, + "kl": 0.0577392578125, + "learning_rate": 9.545898437499999e-07, + "loss": 0.0023, + "reward": 1.5693495869636536, + "reward_std": 0.14352120459079742, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5771620869636536, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.40625, + "epoch": 0.18212890625, + "grad_norm": 9.040737722489206, + "kl": 0.044921875, + "learning_rate": 9.544677734375e-07, + "loss": 0.0018, + "reward": 1.801272690296173, + "reward_std": 0.05337041616439819, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8012726902961731, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.078125, + "epoch": 0.1826171875, + "grad_norm": 3.5600976807232554, + "kl": 0.0465087890625, + "learning_rate": 9.54345703125e-07, + "loss": 0.0019, + "reward": 1.453054428100586, + "reward_std": 0.1263410821557045, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.4608669579029083, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.171875, + "epoch": 0.18310546875, + "grad_norm": 2.002398563178058, + "kl": 0.0484619140625, + "learning_rate": 9.542236328125e-07, + "loss": 0.0019, + "reward": 1.6627238988876343, + "reward_std": 0.07443033531308174, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6705364286899567, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.7890625, + "epoch": 0.18359375, + "grad_norm": 2.757249827207943, + "kl": 0.050537109375, + "learning_rate": 9.541015625e-07, + "loss": 0.002, + "reward": 1.6889954805374146, + "reward_std": 0.08430779352784157, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6889954209327698, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.2265625, + "epoch": 0.18408203125, + "grad_norm": 4.5073105256951775, + "kl": 0.0504150390625, + "learning_rate": 9.539794921874999e-07, + "loss": 0.002, + "reward": 1.5870369672775269, + "reward_std": 0.10734122432768345, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6182869672775269, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.421875, + "epoch": 0.1845703125, + "grad_norm": 7.196357548568271, + "kl": 0.037109375, + "learning_rate": 9.53857421875e-07, + "loss": 0.0015, + "reward": 1.6069696545600891, + "reward_std": 0.2218686118721962, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.6772821247577667, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.28125, + "epoch": 0.18505859375, + "grad_norm": 4.45308500198343, + "kl": 0.058837890625, + "learning_rate": 9.537353515625e-07, + "loss": 0.0024, + "reward": 1.7543954253196716, + "reward_std": 0.06126508302986622, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7543954253196716, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.6171875, + "epoch": 0.185546875, + "grad_norm": 1.5466264285633915, + "kl": 0.046142578125, + "learning_rate": 9.536132812499999e-07, + "loss": 0.0018, + "reward": 1.6366318464279175, + "reward_std": 0.1529180034995079, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6600694358348846, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.171875, + "epoch": 0.18603515625, + "grad_norm": 1.5074271098619745, + "kl": 0.0447998046875, + "learning_rate": 9.534912109374999e-07, + "loss": 0.0018, + "reward": 1.6499249935150146, + "reward_std": 0.15157188847661018, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6889875531196594, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.7890625, + "epoch": 0.1865234375, + "grad_norm": 2.7697954920464434, + "kl": 0.051513671875, + "learning_rate": 9.533691406249999e-07, + "loss": 0.0021, + "reward": 1.5470696091651917, + "reward_std": 0.2721578925848007, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.593944638967514, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.8046875, + "epoch": 0.18701171875, + "grad_norm": 3.8213692120277054, + "kl": 0.06005859375, + "learning_rate": 9.532470703125e-07, + "loss": 0.0024, + "reward": 1.6255079507827759, + "reward_std": 0.21495968848466873, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6567580103874207, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.8671875, + "epoch": 0.1875, + "grad_norm": 4.4450768164862335, + "kl": 0.046630859375, + "learning_rate": 9.53125e-07, + "loss": 0.0019, + "reward": 1.6321772336959839, + "reward_std": 0.2580869309604168, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.7103022634983063, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.0703125, + "epoch": 0.18798828125, + "grad_norm": 2.000719653156199, + "kl": 0.0751953125, + "learning_rate": 9.530029296875e-07, + "loss": 0.003, + "reward": 1.6403818130493164, + "reward_std": 0.18112845346331596, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6794443130493164, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.9375, + "epoch": 0.1884765625, + "grad_norm": 1.2925552797273454, + "kl": 0.0565185546875, + "learning_rate": 9.52880859375e-07, + "loss": 0.0023, + "reward": 1.7097843885421753, + "reward_std": 0.08790682628750801, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7332218289375305, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.125, + "epoch": 0.18896484375, + "grad_norm": 4.476596386367477, + "kl": 0.0555419921875, + "learning_rate": 9.527587890624999e-07, + "loss": 0.0022, + "reward": 1.6251919269561768, + "reward_std": 0.1653646007180214, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6408169269561768, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.25, + "epoch": 0.189453125, + "grad_norm": 1.981834575248848, + "kl": 0.0638427734375, + "learning_rate": 9.526367187499999e-07, + "loss": 0.0026, + "reward": 1.7350217700004578, + "reward_std": 0.08974255621433258, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7428342998027802, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.265625, + "epoch": 0.18994140625, + "grad_norm": 3.714264090994581, + "kl": 0.0496826171875, + "learning_rate": 9.525146484375e-07, + "loss": 0.002, + "reward": 1.5587335228919983, + "reward_std": 0.1307937055826187, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5743584930896759, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.8828125, + "epoch": 0.1904296875, + "grad_norm": 0.7910658209647898, + "kl": 0.0504150390625, + "learning_rate": 9.52392578125e-07, + "loss": 0.002, + "reward": 1.738577127456665, + "reward_std": 0.05491543561220169, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7463896572589874, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.625, + "epoch": 0.19091796875, + "grad_norm": 1.412501157568577, + "kl": 0.0592041015625, + "learning_rate": 9.522705078125e-07, + "loss": 0.0024, + "reward": 1.6673744916915894, + "reward_std": 0.04720168560743332, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6673744320869446, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.765625, + "epoch": 0.19140625, + "grad_norm": 2.091929522012718, + "kl": 0.04931640625, + "learning_rate": 9.521484375e-07, + "loss": 0.002, + "reward": 1.6497448682785034, + "reward_std": 0.17027802020311356, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6731823682785034, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.2265625, + "epoch": 0.19189453125, + "grad_norm": 1.1250339673162928, + "kl": 0.0487060546875, + "learning_rate": 9.520263671874999e-07, + "loss": 0.0019, + "reward": 1.6197129487991333, + "reward_std": 0.15943622216582298, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6587753891944885, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.3515625, + "epoch": 0.1923828125, + "grad_norm": 2.6862482142648543, + "kl": 0.041015625, + "learning_rate": 9.519042968749999e-07, + "loss": 0.0016, + "reward": 1.7228458523750305, + "reward_std": 0.06821495667099953, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7306584417819977, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.046875, + "epoch": 0.19287109375, + "grad_norm": 2.295224742756099, + "kl": 0.0504150390625, + "learning_rate": 9.517822265624999e-07, + "loss": 0.002, + "reward": 1.712727427482605, + "reward_std": 0.13450950384140015, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.720539927482605, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.515625, + "epoch": 0.193359375, + "grad_norm": 1.949548832486549, + "kl": 0.0498046875, + "learning_rate": 9.5166015625e-07, + "loss": 0.002, + "reward": 1.5369553565979004, + "reward_std": 0.21717742085456848, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.607267826795578, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.125, + "epoch": 0.19384765625, + "grad_norm": 3.751089819487828, + "kl": 0.064697265625, + "learning_rate": 9.515380859375e-07, + "loss": 0.0026, + "reward": 1.7010602951049805, + "reward_std": 0.1037181131541729, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7010602951049805, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.109375, + "epoch": 0.1943359375, + "grad_norm": 2.0876288807152616, + "kl": 0.041748046875, + "learning_rate": 9.51416015625e-07, + "loss": 0.0017, + "reward": 1.6480942964553833, + "reward_std": 0.1627689152956009, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6871567964553833, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.9453125, + "epoch": 0.19482421875, + "grad_norm": 3.7142502808168616, + "kl": 0.0523681640625, + "learning_rate": 9.512939453125e-07, + "loss": 0.0021, + "reward": 1.6217145919799805, + "reward_std": 0.06836835853755474, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6217146515846252, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.140625, + "epoch": 0.1953125, + "grad_norm": 3.234762690222564, + "kl": 0.056640625, + "learning_rate": 9.511718749999999e-07, + "loss": 0.0023, + "reward": 1.7640219926834106, + "reward_std": 0.12531143426895142, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7718344628810883, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.7109375, + "epoch": 0.19580078125, + "grad_norm": 2.2580241394701956, + "kl": 0.05419921875, + "learning_rate": 9.510498046874999e-07, + "loss": 0.0022, + "reward": 1.6812456250190735, + "reward_std": 0.11042129248380661, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6890580952167511, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.1015625, + "epoch": 0.1962890625, + "grad_norm": 2.113374918135095, + "kl": 0.0408935546875, + "learning_rate": 9.50927734375e-07, + "loss": 0.0016, + "reward": 1.7110391855239868, + "reward_std": 0.08690160885453224, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7110391855239868, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.28125, + "epoch": 0.19677734375, + "grad_norm": 2.854663005537301, + "kl": 0.0523681640625, + "learning_rate": 9.508056640625e-07, + "loss": 0.0021, + "reward": 1.698850393295288, + "reward_std": 0.16052530705928802, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7144753634929657, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.90625, + "epoch": 0.197265625, + "grad_norm": 3.566015688964678, + "kl": 0.05322265625, + "learning_rate": 9.5068359375e-07, + "loss": 0.0021, + "reward": 1.611766278743744, + "reward_std": 0.1618807651102543, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6352038085460663, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.7109375, + "epoch": 0.19775390625, + "grad_norm": 27.087108641742997, + "kl": 0.0535888671875, + "learning_rate": 9.505615234375e-07, + "loss": 0.0021, + "reward": 1.5200156569480896, + "reward_std": 0.19639131426811218, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5512656569480896, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.78125, + "epoch": 0.1982421875, + "grad_norm": 2.142097886894366, + "kl": 0.0496826171875, + "learning_rate": 9.504394531249999e-07, + "loss": 0.002, + "reward": 1.6612927317619324, + "reward_std": 0.15089121460914612, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6925427317619324, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.6484375, + "epoch": 0.19873046875, + "grad_norm": 4.366971614094934, + "kl": 0.0616455078125, + "learning_rate": 9.503173828124999e-07, + "loss": 0.0025, + "reward": 1.6004191040992737, + "reward_std": 0.17288047075271606, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6160440444946289, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.625, + "epoch": 0.19921875, + "grad_norm": 1.7604823561903244, + "kl": 0.058837890625, + "learning_rate": 9.501953124999999e-07, + "loss": 0.0024, + "reward": 1.811613917350769, + "reward_std": 0.10434301942586899, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.819426417350769, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.203125, + "epoch": 0.19970703125, + "grad_norm": 9.878985501778747, + "kl": 0.041015625, + "learning_rate": 9.500732421875e-07, + "loss": 0.0016, + "reward": 1.7147305607795715, + "reward_std": 0.14838684350252151, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7225430309772491, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.2421875, + "epoch": 0.2001953125, + "grad_norm": 1.1168993817133859, + "kl": 0.0477294921875, + "learning_rate": 9.49951171875e-07, + "loss": 0.0019, + "reward": 1.6048610210418701, + "reward_std": 0.06566739082336426, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6048609614372253, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.0703125, + "epoch": 0.20068359375, + "grad_norm": 2.4027814049067366, + "kl": 0.0513916015625, + "learning_rate": 9.498291015625e-07, + "loss": 0.0021, + "reward": 1.5927820801734924, + "reward_std": 0.11115045472979546, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6318445801734924, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.1796875, + "epoch": 0.201171875, + "grad_norm": 2.2150494756825045, + "kl": 0.0562744140625, + "learning_rate": 9.4970703125e-07, + "loss": 0.0022, + "reward": 1.6925803422927856, + "reward_std": 0.12557360157370567, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7082052826881409, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.4140625, + "epoch": 0.20166015625, + "grad_norm": 10.234390592196027, + "kl": 0.0421142578125, + "learning_rate": 9.495849609374999e-07, + "loss": 0.0017, + "reward": 1.625154733657837, + "reward_std": 0.1099303588271141, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6485922932624817, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.53125, + "epoch": 0.2021484375, + "grad_norm": 1.8803371479808038, + "kl": 0.0467529296875, + "learning_rate": 9.494628906249999e-07, + "loss": 0.0019, + "reward": 1.7836476564407349, + "reward_std": 0.0976153276860714, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7914601564407349, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.1484375, + "epoch": 0.20263671875, + "grad_norm": 2.530860829474323, + "kl": 0.0577392578125, + "learning_rate": 9.493408203125e-07, + "loss": 0.0023, + "reward": 1.6098762154579163, + "reward_std": 0.23471946269273758, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.6880012154579163, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.828125, + "epoch": 0.203125, + "grad_norm": 4.285692563206467, + "kl": 0.0509033203125, + "learning_rate": 9.4921875e-07, + "loss": 0.002, + "reward": 1.609405517578125, + "reward_std": 0.10902727395296097, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6094054579734802, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.2890625, + "epoch": 0.20361328125, + "grad_norm": 1.8103328712037055, + "kl": 0.049560546875, + "learning_rate": 9.490966796875e-07, + "loss": 0.002, + "reward": 1.5358877182006836, + "reward_std": 0.08052598685026169, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5437001585960388, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.4609375, + "epoch": 0.2041015625, + "grad_norm": 2.3123346346362452, + "kl": 0.0615234375, + "learning_rate": 9.48974609375e-07, + "loss": 0.0025, + "reward": 1.5540345907211304, + "reward_std": 0.11965020000934601, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5540346205234528, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.8515625, + "epoch": 0.20458984375, + "grad_norm": 2.17850912465939, + "kl": 0.054443359375, + "learning_rate": 9.488525390624999e-07, + "loss": 0.0022, + "reward": 1.7468233108520508, + "reward_std": 0.07044094800949097, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7468233108520508, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.2578125, + "epoch": 0.205078125, + "grad_norm": 2.351303220308625, + "kl": 0.0440673828125, + "learning_rate": 9.487304687499999e-07, + "loss": 0.0018, + "reward": 1.6957443952560425, + "reward_std": 0.04969111829996109, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6957444846630096, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.0546875, + "epoch": 0.20556640625, + "grad_norm": 1.667715869495618, + "kl": 0.05908203125, + "learning_rate": 9.486083984374999e-07, + "loss": 0.0024, + "reward": 1.681714653968811, + "reward_std": 0.14231722056865692, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6973395347595215, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.75, + "epoch": 0.2060546875, + "grad_norm": 6.464882822483457, + "kl": 0.0556640625, + "learning_rate": 9.48486328125e-07, + "loss": 0.0022, + "reward": 1.6321836113929749, + "reward_std": 0.13830295950174332, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6399961411952972, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.7265625, + "epoch": 0.20654296875, + "grad_norm": 3.295724952577691, + "kl": 0.0535888671875, + "learning_rate": 9.483642578125e-07, + "loss": 0.0021, + "reward": 1.707470715045929, + "reward_std": 0.1862129084765911, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.730908215045929, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.15625, + "epoch": 0.20703125, + "grad_norm": 2.167637361364238, + "kl": 0.0465087890625, + "learning_rate": 9.482421875e-07, + "loss": 0.0019, + "reward": 1.6849753856658936, + "reward_std": 0.12426239252090454, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7162253856658936, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.203125, + "epoch": 0.20751953125, + "grad_norm": 2.0849894304046916, + "kl": 0.04150390625, + "learning_rate": 9.481201171875e-07, + "loss": 0.0017, + "reward": 1.6421186923980713, + "reward_std": 0.13160578161478043, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6655561625957489, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.25, + "epoch": 0.2080078125, + "grad_norm": 2.0360917274571073, + "kl": 0.062255859375, + "learning_rate": 9.479980468749999e-07, + "loss": 0.0025, + "reward": 1.8281482458114624, + "reward_std": 0.0754449162632227, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.82814821600914, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.7578125, + "epoch": 0.20849609375, + "grad_norm": 2.2356720906958594, + "kl": 0.0491943359375, + "learning_rate": 9.478759765624999e-07, + "loss": 0.002, + "reward": 1.7281526327133179, + "reward_std": 0.08592578768730164, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7281526327133179, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.8203125, + "epoch": 0.208984375, + "grad_norm": 2.2370765591210873, + "kl": 0.0584716796875, + "learning_rate": 9.4775390625e-07, + "loss": 0.0023, + "reward": 1.6686657667160034, + "reward_std": 0.12740540876984596, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6842906475067139, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.578125, + "epoch": 0.20947265625, + "grad_norm": 1.6150656197994475, + "kl": 0.0616455078125, + "learning_rate": 9.476318359375e-07, + "loss": 0.0025, + "reward": 1.591238021850586, + "reward_std": 0.06698063388466835, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5912379920482635, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.9296875, + "epoch": 0.2099609375, + "grad_norm": 1.5061424458539174, + "kl": 0.035888671875, + "learning_rate": 9.47509765625e-07, + "loss": 0.0014, + "reward": 1.71708744764328, + "reward_std": 0.06687924265861511, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7170874178409576, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.96875, + "epoch": 0.21044921875, + "grad_norm": 4.580041027491469, + "kl": 0.0445556640625, + "learning_rate": 9.473876953125e-07, + "loss": 0.0018, + "reward": 1.7206860780715942, + "reward_std": 0.14960038661956787, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7441235482692719, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.1171875, + "epoch": 0.2109375, + "grad_norm": 3.9447569556870925, + "kl": 0.05615234375, + "learning_rate": 9.472656249999999e-07, + "loss": 0.0022, + "reward": 1.6832043528556824, + "reward_std": 0.08023593947291374, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6832043826580048, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.6953125, + "epoch": 0.21142578125, + "grad_norm": 2.547549175278579, + "kl": 0.0560302734375, + "learning_rate": 9.471435546874999e-07, + "loss": 0.0022, + "reward": 1.7101504802703857, + "reward_std": 0.1703593209385872, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7257755398750305, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.125, + "epoch": 0.2119140625, + "grad_norm": 1.6164994536934667, + "kl": 0.0482177734375, + "learning_rate": 9.470214843749999e-07, + "loss": 0.0019, + "reward": 1.6205175518989563, + "reward_std": 0.08653150871396065, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6283301115036011, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.953125, + "epoch": 0.21240234375, + "grad_norm": 2.1871293922270514, + "kl": 0.0439453125, + "learning_rate": 9.468994140625e-07, + "loss": 0.0018, + "reward": 1.7288724780082703, + "reward_std": 0.12284732609987259, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7444974780082703, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.5, + "epoch": 0.212890625, + "grad_norm": 1.1688720954359133, + "kl": 0.0484619140625, + "learning_rate": 9.4677734375e-07, + "loss": 0.0019, + "reward": 1.769058644771576, + "reward_std": 0.07042321562767029, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7768711447715759, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.4921875, + "epoch": 0.21337890625, + "grad_norm": 2.1010102368536674, + "kl": 0.0406494140625, + "learning_rate": 9.466552734375e-07, + "loss": 0.0016, + "reward": 1.750071406364441, + "reward_std": 0.05832614004611969, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7500714361667633, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.3984375, + "epoch": 0.2138671875, + "grad_norm": 1.482992322234722, + "kl": 0.041748046875, + "learning_rate": 9.46533203125e-07, + "loss": 0.0017, + "reward": 1.7221877574920654, + "reward_std": 0.04050422087311745, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.722187727689743, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.75, + "epoch": 0.21435546875, + "grad_norm": 3.5088817243048176, + "kl": 0.0543212890625, + "learning_rate": 9.464111328124999e-07, + "loss": 0.0022, + "reward": 1.7622933983802795, + "reward_std": 0.10088678449392319, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7701059281826019, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.7578125, + "epoch": 0.21484375, + "grad_norm": 5.788688495019036, + "kl": 0.0518798828125, + "learning_rate": 9.462890624999999e-07, + "loss": 0.0021, + "reward": 1.5054885149002075, + "reward_std": 0.09682680293917656, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5133009850978851, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.8984375, + "epoch": 0.21533203125, + "grad_norm": 2.1131575200131487, + "kl": 0.04443359375, + "learning_rate": 9.461669921875e-07, + "loss": 0.0018, + "reward": 1.6255145072937012, + "reward_std": 0.11543078348040581, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6411395072937012, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.9140625, + "epoch": 0.2158203125, + "grad_norm": 2.3429544954821306, + "kl": 0.044189453125, + "learning_rate": 9.46044921875e-07, + "loss": 0.0018, + "reward": 1.8504613637924194, + "reward_std": 0.06752173975110054, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8504613637924194, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.8125, + "epoch": 0.21630859375, + "grad_norm": 7.976580512227821, + "kl": 0.0391845703125, + "learning_rate": 9.459228515625e-07, + "loss": 0.0016, + "reward": 1.6511912941932678, + "reward_std": 0.14625184237957, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6824413239955902, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.3203125, + "epoch": 0.216796875, + "grad_norm": 4.95319644301388, + "kl": 0.0517578125, + "learning_rate": 9.4580078125e-07, + "loss": 0.0021, + "reward": 1.7569758296012878, + "reward_std": 0.1250363327562809, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7647883296012878, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.1875, + "epoch": 0.21728515625, + "grad_norm": 1.8993972344614596, + "kl": 0.0367431640625, + "learning_rate": 9.456787109374999e-07, + "loss": 0.0015, + "reward": 1.693526804447174, + "reward_std": 0.11432855390012264, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7325893044471741, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.2734375, + "epoch": 0.2177734375, + "grad_norm": 7.777889583760453, + "kl": 0.0439453125, + "learning_rate": 9.455566406249999e-07, + "loss": 0.0018, + "reward": 1.6474227905273438, + "reward_std": 0.11190011724829674, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6474227905273438, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.953125, + "epoch": 0.21826171875, + "grad_norm": 2.009296377838892, + "kl": 0.03955078125, + "learning_rate": 9.454345703124999e-07, + "loss": 0.0016, + "reward": 1.654877483844757, + "reward_std": 0.11765236407518387, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6705024838447571, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.390625, + "epoch": 0.21875, + "grad_norm": 2.925176137880303, + "kl": 0.0401611328125, + "learning_rate": 9.453125e-07, + "loss": 0.0016, + "reward": 1.738932490348816, + "reward_std": 0.06926981918513775, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7389324307441711, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.828125, + "epoch": 0.21923828125, + "grad_norm": 4.340531405543576, + "kl": 0.0577392578125, + "learning_rate": 9.451904296875e-07, + "loss": 0.0023, + "reward": 1.683348834514618, + "reward_std": 0.0711992010474205, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6833488345146179, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.421875, + "epoch": 0.2197265625, + "grad_norm": 2.518125948148069, + "kl": 0.04833984375, + "learning_rate": 9.45068359375e-07, + "loss": 0.0019, + "reward": 1.6158209443092346, + "reward_std": 0.11858320608735085, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6314459443092346, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.7265625, + "epoch": 0.22021484375, + "grad_norm": 3.5363930541538355, + "kl": 0.044921875, + "learning_rate": 9.449462890625e-07, + "loss": 0.0018, + "reward": 1.5984613299369812, + "reward_std": 0.13294245302677155, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.60627381503582, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.515625, + "epoch": 0.220703125, + "grad_norm": 3.149130598742103, + "kl": 0.0435791015625, + "learning_rate": 9.448242187499999e-07, + "loss": 0.0017, + "reward": 1.6226680278778076, + "reward_std": 0.112908735871315, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6226680278778076, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.9453125, + "epoch": 0.22119140625, + "grad_norm": 2.4935032422406125, + "kl": 0.0389404296875, + "learning_rate": 9.447021484374999e-07, + "loss": 0.0016, + "reward": 1.6436303853988647, + "reward_std": 0.06870114244520664, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6436303853988647, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.171875, + "epoch": 0.2216796875, + "grad_norm": 3.6116771357404747, + "kl": 0.042236328125, + "learning_rate": 9.445800781249999e-07, + "loss": 0.0017, + "reward": 1.7960193157196045, + "reward_std": 0.03523706644773483, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7960193157196045, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.28125, + "epoch": 0.22216796875, + "grad_norm": 1.8643747959172932, + "kl": 0.0467529296875, + "learning_rate": 9.444580078125e-07, + "loss": 0.0019, + "reward": 1.6899807453155518, + "reward_std": 0.061588347889482975, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6899808049201965, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.3359375, + "epoch": 0.22265625, + "grad_norm": 2.378293437154049, + "kl": 0.0528564453125, + "learning_rate": 9.443359375e-07, + "loss": 0.0021, + "reward": 1.6194549202919006, + "reward_std": 0.07477627880871296, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6272674798965454, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.7734375, + "epoch": 0.22314453125, + "grad_norm": 1.7628549042900312, + "kl": 0.050537109375, + "learning_rate": 9.442138671875e-07, + "loss": 0.002, + "reward": 1.6203702688217163, + "reward_std": 0.06756623834371567, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6203702390193939, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.234375, + "epoch": 0.2236328125, + "grad_norm": 3.212039118876721, + "kl": 0.0477294921875, + "learning_rate": 9.440917968749999e-07, + "loss": 0.0019, + "reward": 1.7326794862747192, + "reward_std": 0.15392906218767166, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7404920160770416, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.9609375, + "epoch": 0.22412109375, + "grad_norm": 2.20882934467843, + "kl": 0.0362548828125, + "learning_rate": 9.439697265624999e-07, + "loss": 0.0015, + "reward": 1.669608473777771, + "reward_std": 0.1226998120546341, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6696084141731262, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.7109375, + "epoch": 0.224609375, + "grad_norm": 3.919575507206736, + "kl": 0.037841796875, + "learning_rate": 9.438476562499999e-07, + "loss": 0.0015, + "reward": 1.8161649107933044, + "reward_std": 0.08291263319551945, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8161648809909821, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.1796875, + "epoch": 0.22509765625, + "grad_norm": 1.8813163144677836, + "kl": 0.0386962890625, + "learning_rate": 9.437255859375e-07, + "loss": 0.0015, + "reward": 1.5858674049377441, + "reward_std": 0.10385648906230927, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5858674198389053, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.9921875, + "epoch": 0.2255859375, + "grad_norm": 3.0446438454409304, + "kl": 0.04052734375, + "learning_rate": 9.43603515625e-07, + "loss": 0.0016, + "reward": 1.6508355736732483, + "reward_std": 0.13546227663755417, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6586481332778931, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.765625, + "epoch": 0.22607421875, + "grad_norm": 5.145994036718485, + "kl": 0.0416259765625, + "learning_rate": 9.434814453125e-07, + "loss": 0.0017, + "reward": 1.7009857892990112, + "reward_std": 0.11110249161720276, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7009858191013336, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.203125, + "epoch": 0.2265625, + "grad_norm": 1.222277497239498, + "kl": 0.052978515625, + "learning_rate": 9.43359375e-07, + "loss": 0.0021, + "reward": 1.6271523833274841, + "reward_std": 0.0367429880425334, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6271523833274841, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.828125, + "epoch": 0.22705078125, + "grad_norm": 1.372718735073741, + "kl": 0.0382080078125, + "learning_rate": 9.432373046874999e-07, + "loss": 0.0015, + "reward": 1.7885666489601135, + "reward_std": 0.040253955870866776, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7885666787624359, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.0625, + "epoch": 0.2275390625, + "grad_norm": 1.7337186238761468, + "kl": 0.057861328125, + "learning_rate": 9.431152343749999e-07, + "loss": 0.0023, + "reward": 1.628583014011383, + "reward_std": 0.12626324221491814, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6442080438137054, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.8203125, + "epoch": 0.22802734375, + "grad_norm": 2.1322772710330575, + "kl": 0.042236328125, + "learning_rate": 9.429931640624999e-07, + "loss": 0.0017, + "reward": 1.6795091032981873, + "reward_std": 0.10694251582026482, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6951341331005096, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.109375, + "epoch": 0.228515625, + "grad_norm": 1.083227516677285, + "kl": 0.039794921875, + "learning_rate": 9.4287109375e-07, + "loss": 0.0016, + "reward": 1.5487976670265198, + "reward_std": 0.12695813924074173, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5800476670265198, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.0078125, + "epoch": 0.22900390625, + "grad_norm": 2.134927472325478, + "kl": 0.046875, + "learning_rate": 9.427490234375e-07, + "loss": 0.0019, + "reward": 1.7062729597091675, + "reward_std": 0.12723471224308014, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7140854597091675, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.5078125, + "epoch": 0.2294921875, + "grad_norm": 1.7045575082525455, + "kl": 0.033935546875, + "learning_rate": 9.42626953125e-07, + "loss": 0.0014, + "reward": 1.693819522857666, + "reward_std": 0.07639718800783157, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.693819522857666, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.546875, + "epoch": 0.22998046875, + "grad_norm": 1.4199964051941236, + "kl": 0.053466796875, + "learning_rate": 9.425048828124999e-07, + "loss": 0.0021, + "reward": 1.7025293707847595, + "reward_std": 0.03216167027130723, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7025294005870819, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.671875, + "epoch": 0.23046875, + "grad_norm": 2.2853712078997583, + "kl": 0.040283203125, + "learning_rate": 9.423828124999999e-07, + "loss": 0.0016, + "reward": 1.633117914199829, + "reward_std": 0.1547449231147766, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6565554141998291, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.4375, + "epoch": 0.23095703125, + "grad_norm": 2.9606587430145748, + "kl": 0.05859375, + "learning_rate": 9.422607421874999e-07, + "loss": 0.0023, + "reward": 1.7061492204666138, + "reward_std": 0.11958225071430206, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7139617204666138, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.4140625, + "epoch": 0.2314453125, + "grad_norm": 4.284919438683781, + "kl": 0.0533447265625, + "learning_rate": 9.42138671875e-07, + "loss": 0.0021, + "reward": 1.6417680978775024, + "reward_std": 0.10428282991051674, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.65739306807518, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.8828125, + "epoch": 0.23193359375, + "grad_norm": 3.9023448946857533, + "kl": 0.04266357421875, + "learning_rate": 9.420166015625e-07, + "loss": 0.0017, + "reward": 1.7126132249832153, + "reward_std": 0.09138727188110352, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7126132547855377, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.0859375, + "epoch": 0.232421875, + "grad_norm": 8.45607280183371, + "kl": 0.052734375, + "learning_rate": 9.4189453125e-07, + "loss": 0.0021, + "reward": 1.6888734102249146, + "reward_std": 0.08186532184481621, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6888734102249146, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.390625, + "epoch": 0.23291015625, + "grad_norm": 3.7100837878450594, + "kl": 0.04296875, + "learning_rate": 9.417724609375e-07, + "loss": 0.0017, + "reward": 1.7216296195983887, + "reward_std": 0.170655507594347, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7372545599937439, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.140625, + "epoch": 0.2333984375, + "grad_norm": 1.8396296874789484, + "kl": 0.041748046875, + "learning_rate": 9.416503906249999e-07, + "loss": 0.0017, + "reward": 1.6159728169441223, + "reward_std": 0.19198870658874512, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6706602573394775, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.1015625, + "epoch": 0.23388671875, + "grad_norm": 3.539797062671013, + "kl": 0.048583984375, + "learning_rate": 9.415283203124999e-07, + "loss": 0.0019, + "reward": 1.7715952396392822, + "reward_std": 0.07343994826078415, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7715952396392822, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.09375, + "epoch": 0.234375, + "grad_norm": 2.857203091318844, + "kl": 0.0416259765625, + "learning_rate": 9.414062499999999e-07, + "loss": 0.0017, + "reward": 1.706653356552124, + "reward_std": 0.09830702841281891, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7066532969474792, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.140625, + "epoch": 0.23486328125, + "grad_norm": 10.240794373354676, + "kl": 0.04052734375, + "learning_rate": 9.412841796875e-07, + "loss": 0.0016, + "reward": 1.755756914615631, + "reward_std": 0.07962564006447792, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7557569742202759, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.9921875, + "epoch": 0.2353515625, + "grad_norm": 4.440714955855732, + "kl": 0.046630859375, + "learning_rate": 9.41162109375e-07, + "loss": 0.0019, + "reward": 1.588149607181549, + "reward_std": 0.07681831158697605, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5959621071815491, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.28125, + "epoch": 0.23583984375, + "grad_norm": 2.0909539375914736, + "kl": 0.03387451171875, + "learning_rate": 9.410400390625e-07, + "loss": 0.0014, + "reward": 1.768026053905487, + "reward_std": 0.1362891048192978, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7836510539054871, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.625, + "epoch": 0.236328125, + "grad_norm": 4.002760763893457, + "kl": 0.0521240234375, + "learning_rate": 9.4091796875e-07, + "loss": 0.0021, + "reward": 1.7147894501686096, + "reward_std": 0.08333645388484001, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.714789479970932, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.8984375, + "epoch": 0.23681640625, + "grad_norm": 2.2903477892017756, + "kl": 0.046875, + "learning_rate": 9.407958984374999e-07, + "loss": 0.0019, + "reward": 1.7409107685089111, + "reward_std": 0.04808063432574272, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7409107685089111, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.4140625, + "epoch": 0.2373046875, + "grad_norm": 2.3093773994918347, + "kl": 0.0355224609375, + "learning_rate": 9.406738281249999e-07, + "loss": 0.0014, + "reward": 1.6987740993499756, + "reward_std": 0.1341363899409771, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7143990695476532, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.5078125, + "epoch": 0.23779296875, + "grad_norm": 2.720150565834579, + "kl": 0.0426025390625, + "learning_rate": 9.405517578125e-07, + "loss": 0.0017, + "reward": 1.6762725114822388, + "reward_std": 0.09658115357160568, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6840850114822388, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.7734375, + "epoch": 0.23828125, + "grad_norm": 3.4864904794597886, + "kl": 0.0489501953125, + "learning_rate": 9.404296875e-07, + "loss": 0.002, + "reward": 1.6006226539611816, + "reward_std": 0.13109473884105682, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6162476539611816, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.515625, + "epoch": 0.23876953125, + "grad_norm": 2.6873145138368666, + "kl": 0.0526123046875, + "learning_rate": 9.403076171875e-07, + "loss": 0.0021, + "reward": 1.6052095890045166, + "reward_std": 0.17836012691259384, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6520847082138062, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.6796875, + "epoch": 0.2392578125, + "grad_norm": 3.62466135309983, + "kl": 0.05224609375, + "learning_rate": 9.40185546875e-07, + "loss": 0.0021, + "reward": 1.5602875351905823, + "reward_std": 0.09847164526581764, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5602875351905823, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.734375, + "epoch": 0.23974609375, + "grad_norm": 3.8234332031826432, + "kl": 0.0458984375, + "learning_rate": 9.400634765624999e-07, + "loss": 0.0018, + "reward": 1.6198760867118835, + "reward_std": 0.10435886308550835, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6198760569095612, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.84375, + "epoch": 0.240234375, + "grad_norm": 1.643266133894865, + "kl": 0.0531005859375, + "learning_rate": 9.399414062499999e-07, + "loss": 0.0021, + "reward": 1.6920706629753113, + "reward_std": 0.13326343521475792, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6920706927776337, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.40625, + "epoch": 0.24072265625, + "grad_norm": 3.0050999042930053, + "kl": 0.04736328125, + "learning_rate": 9.398193359374999e-07, + "loss": 0.0019, + "reward": 1.6835868954658508, + "reward_std": 0.06352141872048378, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6835868954658508, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.0234375, + "epoch": 0.2412109375, + "grad_norm": 1.982897008783888, + "kl": 0.0460205078125, + "learning_rate": 9.39697265625e-07, + "loss": 0.0018, + "reward": 1.7277058362960815, + "reward_std": 0.029988901689648628, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7277058362960815, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.3671875, + "epoch": 0.24169921875, + "grad_norm": 3.432562466521374, + "kl": 0.0494384765625, + "learning_rate": 9.395751953125e-07, + "loss": 0.002, + "reward": 1.630252718925476, + "reward_std": 0.10348817706108093, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6302527189254761, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.3203125, + "epoch": 0.2421875, + "grad_norm": 1.938316540344301, + "kl": 0.061279296875, + "learning_rate": 9.39453125e-07, + "loss": 0.0024, + "reward": 1.5763072967529297, + "reward_std": 0.1704563107341528, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5997447669506073, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.46875, + "epoch": 0.24267578125, + "grad_norm": 1.5475227992685436, + "kl": 0.0443115234375, + "learning_rate": 9.393310546875e-07, + "loss": 0.0018, + "reward": 1.796213448047638, + "reward_std": 0.08575894869863987, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8040259480476379, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.8046875, + "epoch": 0.2431640625, + "grad_norm": 6.194121349015061, + "kl": 0.0538330078125, + "learning_rate": 9.392089843749999e-07, + "loss": 0.0022, + "reward": 1.7806832194328308, + "reward_std": 0.04140526428818703, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7806831896305084, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.234375, + "epoch": 0.24365234375, + "grad_norm": 4.146851999613055, + "kl": 0.0482177734375, + "learning_rate": 9.390869140624999e-07, + "loss": 0.0019, + "reward": 1.6361583471298218, + "reward_std": 0.12326683104038239, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6517833769321442, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.875, + "epoch": 0.244140625, + "grad_norm": 2.3192212317561327, + "kl": 0.0556640625, + "learning_rate": 9.3896484375e-07, + "loss": 0.0022, + "reward": 1.7082719802856445, + "reward_std": 0.11271853744983673, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7160845100879669, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.09375, + "epoch": 0.24462890625, + "grad_norm": 2.530658345316516, + "kl": 0.0523681640625, + "learning_rate": 9.388427734375e-07, + "loss": 0.0021, + "reward": 1.6015813946723938, + "reward_std": 0.11473493091762066, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6406438946723938, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.4296875, + "epoch": 0.2451171875, + "grad_norm": 3.374686115356342, + "kl": 0.0780029296875, + "learning_rate": 9.38720703125e-07, + "loss": 0.0031, + "reward": 1.670573353767395, + "reward_std": 0.19502687454223633, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6940109133720398, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.9609375, + "epoch": 0.24560546875, + "grad_norm": 3.975204177665374, + "kl": 0.0465087890625, + "learning_rate": 9.385986328125e-07, + "loss": 0.0019, + "reward": 1.7460113763809204, + "reward_std": 0.06656001135706902, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7460113167762756, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.1171875, + "epoch": 0.24609375, + "grad_norm": 2.3164265465647267, + "kl": 0.0430908203125, + "learning_rate": 9.384765624999999e-07, + "loss": 0.0017, + "reward": 1.789831519126892, + "reward_std": 0.06405875086784363, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7898315191268921, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.4765625, + "epoch": 0.24658203125, + "grad_norm": 3.6736958175167618, + "kl": 0.048095703125, + "learning_rate": 9.383544921874999e-07, + "loss": 0.0019, + "reward": 1.7674906253814697, + "reward_std": 0.04135966673493385, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7674906253814697, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.5859375, + "epoch": 0.2470703125, + "grad_norm": 2.9483608938527186, + "kl": 0.0535888671875, + "learning_rate": 9.382324218749999e-07, + "loss": 0.0021, + "reward": 1.6795161962509155, + "reward_std": 0.1258496269583702, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6873287260532379, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.4609375, + "epoch": 0.24755859375, + "grad_norm": 10.00278448029272, + "kl": 0.04833984375, + "learning_rate": 9.381103515625e-07, + "loss": 0.0019, + "reward": 1.7105889916419983, + "reward_std": 0.06833425909280777, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7105889916419983, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.2734375, + "epoch": 0.248046875, + "grad_norm": 22.4048260879023, + "kl": 0.0421142578125, + "learning_rate": 9.3798828125e-07, + "loss": 0.0017, + "reward": 1.7356719970703125, + "reward_std": 0.14341094344854355, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7512970268726349, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.1328125, + "epoch": 0.24853515625, + "grad_norm": 3.690996101063405, + "kl": 0.0587158203125, + "learning_rate": 9.378662109375e-07, + "loss": 0.0023, + "reward": 1.5387169122695923, + "reward_std": 0.15988320112228394, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.5934043824672699, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.171875, + "epoch": 0.2490234375, + "grad_norm": 1.760028110863856, + "kl": 0.060302734375, + "learning_rate": 9.37744140625e-07, + "loss": 0.0024, + "reward": 1.6823578476905823, + "reward_std": 0.13578759506344795, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.690170407295227, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.75, + "epoch": 0.24951171875, + "grad_norm": 1.4416400099241444, + "kl": 0.0535888671875, + "learning_rate": 9.376220703124999e-07, + "loss": 0.0021, + "reward": 1.654783546924591, + "reward_std": 0.10625720396637917, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6625960469245911, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.4140625, + "epoch": 0.25, + "grad_norm": 1.168208336958592, + "kl": 0.05078125, + "learning_rate": 9.374999999999999e-07, + "loss": 0.002, + "reward": 1.5635674595832825, + "reward_std": 0.10676468908786774, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5713800489902496, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.8359375, + "epoch": 0.25048828125, + "grad_norm": 1.2959350822555695, + "kl": 0.049560546875, + "learning_rate": 9.373779296875e-07, + "loss": 0.002, + "reward": 1.6148346662521362, + "reward_std": 0.05857887305319309, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6148346662521362, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.2734375, + "epoch": 0.2509765625, + "grad_norm": 2.230807942706403, + "kl": 0.0537109375, + "learning_rate": 9.37255859375e-07, + "loss": 0.0021, + "reward": 1.7130588293075562, + "reward_std": 0.099614929407835, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7130588293075562, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.7109375, + "epoch": 0.25146484375, + "grad_norm": 1.3346796140624877, + "kl": 0.0501708984375, + "learning_rate": 9.371337890625e-07, + "loss": 0.002, + "reward": 1.6742193698883057, + "reward_std": 0.10047866404056549, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6898443400859833, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.84375, + "epoch": 0.251953125, + "grad_norm": 1.5965524786083076, + "kl": 0.0374755859375, + "learning_rate": 9.3701171875e-07, + "loss": 0.0015, + "reward": 1.7269166707992554, + "reward_std": 0.09766197204589844, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7347292006015778, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.8828125, + "epoch": 0.25244140625, + "grad_norm": 2.5203178337483747, + "kl": 0.040771484375, + "learning_rate": 9.368896484374999e-07, + "loss": 0.0016, + "reward": 1.6974033117294312, + "reward_std": 0.03398803994059563, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6974032521247864, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.484375, + "epoch": 0.2529296875, + "grad_norm": 15.724284771000056, + "kl": 0.0438232421875, + "learning_rate": 9.367675781249999e-07, + "loss": 0.0018, + "reward": 1.769521713256836, + "reward_std": 0.07974059507250786, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7773342132568359, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.6484375, + "epoch": 0.25341796875, + "grad_norm": 12.294650487546035, + "kl": 0.041259765625, + "learning_rate": 9.366455078124999e-07, + "loss": 0.0017, + "reward": 1.7424204349517822, + "reward_std": 0.07815677672624588, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7424204349517822, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.1796875, + "epoch": 0.25390625, + "grad_norm": 1.9577664086872122, + "kl": 0.0418701171875, + "learning_rate": 9.365234375e-07, + "loss": 0.0017, + "reward": 1.5817663669586182, + "reward_std": 0.10854971595108509, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5895789265632629, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.546875, + "epoch": 0.25439453125, + "grad_norm": 1.9802073044830337, + "kl": 0.041259765625, + "learning_rate": 9.364013671875e-07, + "loss": 0.0017, + "reward": 1.775498867034912, + "reward_std": 0.06388038024306297, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7754988670349121, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.640625, + "epoch": 0.2548828125, + "grad_norm": 6.132107488020418, + "kl": 0.054931640625, + "learning_rate": 9.36279296875e-07, + "loss": 0.0022, + "reward": 1.8140791654586792, + "reward_std": 0.060922037810087204, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8140791952610016, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.2109375, + "epoch": 0.25537109375, + "grad_norm": 2.228119855943763, + "kl": 0.0501708984375, + "learning_rate": 9.361572265625e-07, + "loss": 0.002, + "reward": 1.6406488418579102, + "reward_std": 0.0903150886297226, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6406488716602325, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.828125, + "epoch": 0.255859375, + "grad_norm": 3.8874443170507753, + "kl": 0.041748046875, + "learning_rate": 9.360351562499999e-07, + "loss": 0.0017, + "reward": 1.7140488624572754, + "reward_std": 0.09602710604667664, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7140487730503082, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.203125, + "epoch": 0.25634765625, + "grad_norm": 12.272163041477432, + "kl": 0.0487060546875, + "learning_rate": 9.359130859374999e-07, + "loss": 0.0019, + "reward": 1.7117294073104858, + "reward_std": 0.10148574784398079, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7195418775081635, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.734375, + "epoch": 0.2568359375, + "grad_norm": 6.216510978293266, + "kl": 0.0477294921875, + "learning_rate": 9.35791015625e-07, + "loss": 0.0019, + "reward": 1.6478030681610107, + "reward_std": 0.13338213600218296, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6556155979633331, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.2421875, + "epoch": 0.25732421875, + "grad_norm": 1.1994853174094693, + "kl": 0.04541015625, + "learning_rate": 9.356689453125e-07, + "loss": 0.0018, + "reward": 1.741489827632904, + "reward_std": 0.0871284119784832, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.749302327632904, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.640625, + "epoch": 0.2578125, + "grad_norm": 2.7844412848504008, + "kl": 0.0419921875, + "learning_rate": 9.35546875e-07, + "loss": 0.0017, + "reward": 1.6761191487312317, + "reward_std": 0.06489459797739983, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6761191487312317, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.9921875, + "epoch": 0.25830078125, + "grad_norm": 2.445482124975617, + "kl": 0.05517578125, + "learning_rate": 9.354248046875e-07, + "loss": 0.0022, + "reward": 1.649292767047882, + "reward_std": 0.09409752860665321, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6492927670478821, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.5234375, + "epoch": 0.2587890625, + "grad_norm": 3.1389050778274794, + "kl": 0.0455322265625, + "learning_rate": 9.353027343749999e-07, + "loss": 0.0018, + "reward": 1.7897993326187134, + "reward_std": 0.06516874581575394, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7897992730140686, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.453125, + "epoch": 0.25927734375, + "grad_norm": 5.7428193650994075, + "kl": 0.0457763671875, + "learning_rate": 9.351806640624999e-07, + "loss": 0.0018, + "reward": 1.7966364622116089, + "reward_std": 0.10276348143815994, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7966364324092865, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.8984375, + "epoch": 0.259765625, + "grad_norm": 2.1752945927652134, + "kl": 0.0531005859375, + "learning_rate": 9.350585937499999e-07, + "loss": 0.0021, + "reward": 1.6889333724975586, + "reward_std": 0.06486545503139496, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6889333724975586, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.84375, + "epoch": 0.26025390625, + "grad_norm": 1.3878938520937938, + "kl": 0.044189453125, + "learning_rate": 9.349365234375e-07, + "loss": 0.0018, + "reward": 1.8459346890449524, + "reward_std": 0.03464473132044077, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8459346890449524, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.1015625, + "epoch": 0.2607421875, + "grad_norm": 2.9082227869155868, + "kl": 0.03704833984375, + "learning_rate": 9.34814453125e-07, + "loss": 0.0015, + "reward": 1.7827296257019043, + "reward_std": 0.05504240095615387, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7827296257019043, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.3203125, + "epoch": 0.26123046875, + "grad_norm": 2.5055735390456455, + "kl": 0.046630859375, + "learning_rate": 9.346923828125e-07, + "loss": 0.0019, + "reward": 1.6670472025871277, + "reward_std": 0.14498621970415115, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6826722323894501, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.328125, + "epoch": 0.26171875, + "grad_norm": 2.1695056878186287, + "kl": 0.05078125, + "learning_rate": 9.345703125e-07, + "loss": 0.002, + "reward": 1.7323461771011353, + "reward_std": 0.06361747533082962, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7323460876941681, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.4609375, + "epoch": 0.26220703125, + "grad_norm": 3.3136437260627503, + "kl": 0.04931640625, + "learning_rate": 9.344482421874999e-07, + "loss": 0.002, + "reward": 1.7232381105422974, + "reward_std": 0.12078379839658737, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7310506999492645, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.3515625, + "epoch": 0.2626953125, + "grad_norm": 2.456324374339407, + "kl": 0.04345703125, + "learning_rate": 9.343261718749999e-07, + "loss": 0.0017, + "reward": 1.7056252360343933, + "reward_std": 0.09498313069343567, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7056251764297485, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.734375, + "epoch": 0.26318359375, + "grad_norm": 4.079059533712564, + "kl": 0.039794921875, + "learning_rate": 9.342041015625e-07, + "loss": 0.0016, + "reward": 1.4682893753051758, + "reward_std": 0.1072283387184143, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.530789390206337, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.2265625, + "epoch": 0.263671875, + "grad_norm": 10.45654393994142, + "kl": 0.0357666015625, + "learning_rate": 9.3408203125e-07, + "loss": 0.0014, + "reward": 1.7631608843803406, + "reward_std": 0.13105908036231995, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7787858843803406, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.0, + "epoch": 0.26416015625, + "grad_norm": 1.7102279475438495, + "kl": 0.0360107421875, + "learning_rate": 9.339599609375e-07, + "loss": 0.0014, + "reward": 1.6794561743736267, + "reward_std": 0.06675281748175621, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6794561147689819, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.3828125, + "epoch": 0.2646484375, + "grad_norm": 4.522944699128447, + "kl": 0.04443359375, + "learning_rate": 9.33837890625e-07, + "loss": 0.0018, + "reward": 1.7426947355270386, + "reward_std": 0.08230987191200256, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7426947355270386, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.65625, + "epoch": 0.26513671875, + "grad_norm": 2.6650826688792857, + "kl": 0.0450439453125, + "learning_rate": 9.337158203124999e-07, + "loss": 0.0018, + "reward": 1.7618120908737183, + "reward_std": 0.06165020540356636, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7618121206760406, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.5078125, + "epoch": 0.265625, + "grad_norm": 2.037006291879131, + "kl": 0.051025390625, + "learning_rate": 9.335937499999999e-07, + "loss": 0.002, + "reward": 1.646928310394287, + "reward_std": 0.17904935777187347, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6469283103942871, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.9140625, + "epoch": 0.26611328125, + "grad_norm": 2.297298122509907, + "kl": 0.039306640625, + "learning_rate": 9.334716796874999e-07, + "loss": 0.0016, + "reward": 1.6567280888557434, + "reward_std": 0.10316119715571404, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6567280292510986, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.3125, + "epoch": 0.2666015625, + "grad_norm": 2.660291456006694, + "kl": 0.039306640625, + "learning_rate": 9.33349609375e-07, + "loss": 0.0016, + "reward": 1.823375165462494, + "reward_std": 0.05490726791322231, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8233751654624939, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.1640625, + "epoch": 0.26708984375, + "grad_norm": 6.13270694396882, + "kl": 0.0443115234375, + "learning_rate": 9.332275390625e-07, + "loss": 0.0018, + "reward": 1.6771809458732605, + "reward_std": 0.0902215950191021, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6771808862686157, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.15625, + "epoch": 0.267578125, + "grad_norm": 2.2633016069892777, + "kl": 0.0455322265625, + "learning_rate": 9.3310546875e-07, + "loss": 0.0018, + "reward": 1.6595528721809387, + "reward_std": 0.12929360568523407, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6673653721809387, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.203125, + "epoch": 0.26806640625, + "grad_norm": 2.805590510649819, + "kl": 0.0498046875, + "learning_rate": 9.329833984375e-07, + "loss": 0.002, + "reward": 1.7251054048538208, + "reward_std": 0.10590995103120804, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7407303750514984, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.640625, + "epoch": 0.2685546875, + "grad_norm": 1.5868774720575003, + "kl": 0.045166015625, + "learning_rate": 9.328613281249999e-07, + "loss": 0.0018, + "reward": 1.5312697887420654, + "reward_std": 0.10218230821192265, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5390822738409042, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.6171875, + "epoch": 0.26904296875, + "grad_norm": 5.217204405936032, + "kl": 0.0523681640625, + "learning_rate": 9.327392578124999e-07, + "loss": 0.0021, + "reward": 1.6690084338188171, + "reward_std": 0.11925657838582993, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7002584338188171, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.0625, + "epoch": 0.26953125, + "grad_norm": 2.332959009144248, + "kl": 0.05322265625, + "learning_rate": 9.326171874999999e-07, + "loss": 0.0021, + "reward": 1.7313638925552368, + "reward_std": 0.06417965516448021, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7313639223575592, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.2421875, + "epoch": 0.27001953125, + "grad_norm": 2.6193153669407203, + "kl": 0.0447998046875, + "learning_rate": 9.324951171875e-07, + "loss": 0.0018, + "reward": 1.7203855514526367, + "reward_std": 0.07075966894626617, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7203856408596039, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.3515625, + "epoch": 0.2705078125, + "grad_norm": 2.5899560028614697, + "kl": 0.0489501953125, + "learning_rate": 9.32373046875e-07, + "loss": 0.002, + "reward": 1.6837428212165833, + "reward_std": 0.11842398717999458, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7071803212165833, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.625, + "epoch": 0.27099609375, + "grad_norm": 2.498447085669372, + "kl": 0.0595703125, + "learning_rate": 9.322509765625e-07, + "loss": 0.0024, + "reward": 1.6304301023483276, + "reward_std": 0.07888209074735641, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6304300427436829, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.5, + "epoch": 0.271484375, + "grad_norm": 4.097199567462612, + "kl": 0.072021484375, + "learning_rate": 9.321289062499999e-07, + "loss": 0.0029, + "reward": 1.775884211063385, + "reward_std": 0.07060272060334682, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7758842408657074, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.3046875, + "epoch": 0.27197265625, + "grad_norm": 4.5439033784171095, + "kl": 0.0517578125, + "learning_rate": 9.320068359374999e-07, + "loss": 0.0021, + "reward": 1.6852914690971375, + "reward_std": 0.1067960262298584, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6852914988994598, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.8046875, + "epoch": 0.2724609375, + "grad_norm": 1.7251350694603302, + "kl": 0.049560546875, + "learning_rate": 9.318847656249999e-07, + "loss": 0.002, + "reward": 1.6576202511787415, + "reward_std": 0.09488710761070251, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6654327809810638, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.03125, + "epoch": 0.27294921875, + "grad_norm": 3.92150434058649, + "kl": 0.041015625, + "learning_rate": 9.317626953125e-07, + "loss": 0.0016, + "reward": 1.5969886183738708, + "reward_std": 0.12209014222025871, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6126136183738708, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.3359375, + "epoch": 0.2734375, + "grad_norm": 2.585612743662862, + "kl": 0.0447998046875, + "learning_rate": 9.31640625e-07, + "loss": 0.0018, + "reward": 1.6855441331863403, + "reward_std": 0.11337171494960785, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6855441033840179, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.21875, + "epoch": 0.27392578125, + "grad_norm": 5.285724233905254, + "kl": 0.03778076171875, + "learning_rate": 9.315185546875e-07, + "loss": 0.0015, + "reward": 1.64777010679245, + "reward_std": 0.22076285630464554, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.69464510679245, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.671875, + "epoch": 0.2744140625, + "grad_norm": 1.8126430914250469, + "kl": 0.0450439453125, + "learning_rate": 9.31396484375e-07, + "loss": 0.0018, + "reward": 1.7356610298156738, + "reward_std": 0.10725349560379982, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7356610596179962, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.71875, + "epoch": 0.27490234375, + "grad_norm": 1.9656321676605797, + "kl": 0.05322265625, + "learning_rate": 9.312744140624999e-07, + "loss": 0.0021, + "reward": 1.6761003732681274, + "reward_std": 0.07711060158908367, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6761003732681274, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.0390625, + "epoch": 0.275390625, + "grad_norm": 3.2377061229845836, + "kl": 0.0592041015625, + "learning_rate": 9.311523437499999e-07, + "loss": 0.0024, + "reward": 1.6362444162368774, + "reward_std": 0.10095639899373055, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6518694162368774, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.8671875, + "epoch": 0.27587890625, + "grad_norm": 25.909912449399112, + "kl": 0.05029296875, + "learning_rate": 9.310302734374999e-07, + "loss": 0.002, + "reward": 1.686921238899231, + "reward_std": 0.07121211476624012, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.686921238899231, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.859375, + "epoch": 0.2763671875, + "grad_norm": 2.2331716793343084, + "kl": 0.0450439453125, + "learning_rate": 9.30908203125e-07, + "loss": 0.0018, + "reward": 1.7039056420326233, + "reward_std": 0.06212746538221836, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7039056420326233, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.9453125, + "epoch": 0.27685546875, + "grad_norm": 2.2291085862491617, + "kl": 0.04150390625, + "learning_rate": 9.307861328125e-07, + "loss": 0.0017, + "reward": 1.7227251529693604, + "reward_std": 0.1121636014431715, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7383500933647156, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.796875, + "epoch": 0.27734375, + "grad_norm": 1.3421158309646601, + "kl": 0.0401611328125, + "learning_rate": 9.306640625e-07, + "loss": 0.0016, + "reward": 1.570694386959076, + "reward_std": 0.1121312715113163, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6019443571567535, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.0625, + "epoch": 0.27783203125, + "grad_norm": 1.5020919887284745, + "kl": 0.0501708984375, + "learning_rate": 9.305419921875e-07, + "loss": 0.002, + "reward": 1.8749535083770752, + "reward_std": 0.025433492846786976, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8749535381793976, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.2734375, + "epoch": 0.2783203125, + "grad_norm": 2.342753166145787, + "kl": 0.0411376953125, + "learning_rate": 9.304199218749999e-07, + "loss": 0.0016, + "reward": 1.8102790713310242, + "reward_std": 0.09545211121439934, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8180915713310242, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.03125, + "epoch": 0.27880859375, + "grad_norm": 2.2742980387573652, + "kl": 0.0556640625, + "learning_rate": 9.302978515624999e-07, + "loss": 0.0022, + "reward": 1.8116941452026367, + "reward_std": 0.09244917519390583, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8116941154003143, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.5859375, + "epoch": 0.279296875, + "grad_norm": 9.595557459930381, + "kl": 0.0496826171875, + "learning_rate": 9.3017578125e-07, + "loss": 0.002, + "reward": 1.6191758513450623, + "reward_std": 0.09628532081842422, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6348008215427399, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.3671875, + "epoch": 0.27978515625, + "grad_norm": 18.896318107676453, + "kl": 0.068115234375, + "learning_rate": 9.300537109375e-07, + "loss": 0.0027, + "reward": 1.7079237699508667, + "reward_std": 0.1069163903594017, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7079237401485443, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.5234375, + "epoch": 0.2802734375, + "grad_norm": 0.8129066081542312, + "kl": 0.0462646484375, + "learning_rate": 9.29931640625e-07, + "loss": 0.0018, + "reward": 1.6988362073898315, + "reward_std": 0.011203366797417402, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6988362371921539, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.3984375, + "epoch": 0.28076171875, + "grad_norm": 0.993947791093584, + "kl": 0.06201171875, + "learning_rate": 9.298095703125e-07, + "loss": 0.0025, + "reward": 1.794031023979187, + "reward_std": 0.08120781742036343, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8174684643745422, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.640625, + "epoch": 0.28125, + "grad_norm": 6.676572461323387, + "kl": 0.040283203125, + "learning_rate": 9.296874999999999e-07, + "loss": 0.0016, + "reward": 1.726797878742218, + "reward_std": 0.08194676041603088, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.726797878742218, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.9609375, + "epoch": 0.28173828125, + "grad_norm": 1.5079255512272232, + "kl": 0.060791015625, + "learning_rate": 9.295654296874999e-07, + "loss": 0.0024, + "reward": 1.7783808708190918, + "reward_std": 0.049073804169893265, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7783808708190918, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.9296875, + "epoch": 0.2822265625, + "grad_norm": 1.3798475015598377, + "kl": 0.0418701171875, + "learning_rate": 9.294433593749999e-07, + "loss": 0.0017, + "reward": 1.7986710667610168, + "reward_std": 0.03962104860693216, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7986710667610168, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.7578125, + "epoch": 0.28271484375, + "grad_norm": 2.505394028512915, + "kl": 0.0567626953125, + "learning_rate": 9.293212890625e-07, + "loss": 0.0023, + "reward": 1.6889582872390747, + "reward_std": 0.07442482188344002, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6889582574367523, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.0390625, + "epoch": 0.283203125, + "grad_norm": 1.8850278687560447, + "kl": 0.0411376953125, + "learning_rate": 9.2919921875e-07, + "loss": 0.0016, + "reward": 1.6971967816352844, + "reward_std": 0.09730785340070724, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6971967816352844, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.4609375, + "epoch": 0.28369140625, + "grad_norm": 1.7641909702416805, + "kl": 0.0494384765625, + "learning_rate": 9.290771484375e-07, + "loss": 0.002, + "reward": 1.7254841923713684, + "reward_std": 0.09819715097546577, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7332966923713684, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.7109375, + "epoch": 0.2841796875, + "grad_norm": 1.5034625672902855, + "kl": 0.044677734375, + "learning_rate": 9.28955078125e-07, + "loss": 0.0018, + "reward": 1.6941693425178528, + "reward_std": 0.11884243786334991, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6941693425178528, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.484375, + "epoch": 0.28466796875, + "grad_norm": 1.8739175117936375, + "kl": 0.056640625, + "learning_rate": 9.288330078124999e-07, + "loss": 0.0023, + "reward": 1.7098997831344604, + "reward_std": 0.13007067143917084, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7489623129367828, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.8828125, + "epoch": 0.28515625, + "grad_norm": 3.302249530616915, + "kl": 0.0550537109375, + "learning_rate": 9.287109374999999e-07, + "loss": 0.0022, + "reward": 1.8262977600097656, + "reward_std": 0.07570694014430046, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8262978196144104, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.34375, + "epoch": 0.28564453125, + "grad_norm": 2.3247621756543406, + "kl": 0.039794921875, + "learning_rate": 9.285888671875e-07, + "loss": 0.0016, + "reward": 1.691820740699768, + "reward_std": 0.10432455316185951, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6918207406997681, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.078125, + "epoch": 0.2861328125, + "grad_norm": 2.919815872077742, + "kl": 0.0537109375, + "learning_rate": 9.28466796875e-07, + "loss": 0.0021, + "reward": 1.8132377862930298, + "reward_std": 0.0450489092618227, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8132377862930298, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.9453125, + "epoch": 0.28662109375, + "grad_norm": 2.315450631479818, + "kl": 0.0491943359375, + "learning_rate": 9.283447265625e-07, + "loss": 0.002, + "reward": 1.567336082458496, + "reward_std": 0.04566051810979843, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5673360526561737, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.9453125, + "epoch": 0.287109375, + "grad_norm": 4.866170468108119, + "kl": 0.0443115234375, + "learning_rate": 9.2822265625e-07, + "loss": 0.0018, + "reward": 1.7104328870773315, + "reward_std": 0.047424353659152985, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7104328274726868, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.1484375, + "epoch": 0.28759765625, + "grad_norm": 2.464877601750045, + "kl": 0.0465087890625, + "learning_rate": 9.281005859374999e-07, + "loss": 0.0019, + "reward": 1.6946337819099426, + "reward_std": 0.13272637128829956, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7258838415145874, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.453125, + "epoch": 0.2880859375, + "grad_norm": 2.8034456562750654, + "kl": 0.037841796875, + "learning_rate": 9.279785156249999e-07, + "loss": 0.0015, + "reward": 1.7395535707473755, + "reward_std": 0.10018676891922951, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7473660111427307, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.546875, + "epoch": 0.28857421875, + "grad_norm": 7.447215195080596, + "kl": 0.043701171875, + "learning_rate": 9.278564453124999e-07, + "loss": 0.0017, + "reward": 1.69717139005661, + "reward_std": 0.09286946710199118, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7518589496612549, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.8125, + "epoch": 0.2890625, + "grad_norm": 1.4897448677701148, + "kl": 0.0584716796875, + "learning_rate": 9.27734375e-07, + "loss": 0.0023, + "reward": 1.7233901023864746, + "reward_std": 0.04082547128200531, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7233900725841522, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.1953125, + "epoch": 0.28955078125, + "grad_norm": 3.7207228740501317, + "kl": 0.0531005859375, + "learning_rate": 9.276123046875e-07, + "loss": 0.0021, + "reward": 1.641968011856079, + "reward_std": 0.1139497272670269, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6575929820537567, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.921875, + "epoch": 0.2900390625, + "grad_norm": 2.237099552618115, + "kl": 0.03662109375, + "learning_rate": 9.27490234375e-07, + "loss": 0.0015, + "reward": 1.646517038345337, + "reward_std": 0.28582026064395905, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.7090170085430145, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.109375, + "epoch": 0.29052734375, + "grad_norm": 3.3208620909986246, + "kl": 0.0455322265625, + "learning_rate": 9.273681640625e-07, + "loss": 0.0018, + "reward": 1.6810371279716492, + "reward_std": 0.08739523217082024, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6810370683670044, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.03125, + "epoch": 0.291015625, + "grad_norm": 0.9791867105353927, + "kl": 0.04736328125, + "learning_rate": 9.272460937499999e-07, + "loss": 0.0019, + "reward": 1.8734647035598755, + "reward_std": 0.031122705899178982, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8734646737575531, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.09375, + "epoch": 0.29150390625, + "grad_norm": 2.1825079397965843, + "kl": 0.0369873046875, + "learning_rate": 9.271240234374999e-07, + "loss": 0.0015, + "reward": 1.8181806802749634, + "reward_std": 0.06168582662940025, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8181805908679962, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.9296875, + "epoch": 0.2919921875, + "grad_norm": 2.584490232315663, + "kl": 0.044677734375, + "learning_rate": 9.27001953125e-07, + "loss": 0.0018, + "reward": 1.6417737007141113, + "reward_std": 0.03647463582456112, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6417737007141113, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.6953125, + "epoch": 0.29248046875, + "grad_norm": 2.4277242597465607, + "kl": 0.0513916015625, + "learning_rate": 9.268798828125e-07, + "loss": 0.0021, + "reward": 1.7535077929496765, + "reward_std": 0.08582048118114471, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7535077333450317, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.9609375, + "epoch": 0.29296875, + "grad_norm": 12.486389397389315, + "kl": 0.0555419921875, + "learning_rate": 9.267578125e-07, + "loss": 0.0022, + "reward": 1.7247052192687988, + "reward_std": 0.06530194543302059, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.724705159664154, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.46875, + "epoch": 0.29345703125, + "grad_norm": 2.0579889539520035, + "kl": 0.04736328125, + "learning_rate": 9.266357421875e-07, + "loss": 0.0019, + "reward": 1.7377859354019165, + "reward_std": 0.08668200299143791, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7377859652042389, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.4296875, + "epoch": 0.2939453125, + "grad_norm": 1.2811733254368138, + "kl": 0.035400390625, + "learning_rate": 9.265136718749999e-07, + "loss": 0.0014, + "reward": 1.6522246599197388, + "reward_std": 0.10386989638209343, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.660037100315094, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.015625, + "epoch": 0.29443359375, + "grad_norm": 2.525240888001395, + "kl": 0.0458984375, + "learning_rate": 9.263916015624999e-07, + "loss": 0.0018, + "reward": 1.655815601348877, + "reward_std": 0.11304668337106705, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6636281311511993, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.46875, + "epoch": 0.294921875, + "grad_norm": 10.03655987535627, + "kl": 0.0621337890625, + "learning_rate": 9.262695312499999e-07, + "loss": 0.0025, + "reward": 1.736948013305664, + "reward_std": 0.16118024289608002, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7681980729103088, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.9765625, + "epoch": 0.29541015625, + "grad_norm": 3.25054352753453, + "kl": 0.04638671875, + "learning_rate": 9.261474609375e-07, + "loss": 0.0019, + "reward": 1.6929279565811157, + "reward_std": 0.08746526017785072, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7007405161857605, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.3359375, + "epoch": 0.2958984375, + "grad_norm": 1.9216710744456194, + "kl": 0.0396728515625, + "learning_rate": 9.26025390625e-07, + "loss": 0.0016, + "reward": 1.7309820652008057, + "reward_std": 0.08170492202043533, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7309820353984833, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.8359375, + "epoch": 0.29638671875, + "grad_norm": 1.833893246452737, + "kl": 0.0567626953125, + "learning_rate": 9.259033203125e-07, + "loss": 0.0023, + "reward": 1.6237656474113464, + "reward_std": 0.07675194926559925, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6237656772136688, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.375, + "epoch": 0.296875, + "grad_norm": 5.031728369153867, + "kl": 0.052734375, + "learning_rate": 9.2578125e-07, + "loss": 0.0021, + "reward": 1.7372384667396545, + "reward_std": 0.07356595061719418, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7450509369373322, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.8359375, + "epoch": 0.29736328125, + "grad_norm": 2.248212440247843, + "kl": 0.05078125, + "learning_rate": 9.256591796874999e-07, + "loss": 0.002, + "reward": 1.7162050604820251, + "reward_std": 0.0456718523055315, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7162051498889923, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.515625, + "epoch": 0.2978515625, + "grad_norm": 1.7714015306015924, + "kl": 0.0457763671875, + "learning_rate": 9.255371093749999e-07, + "loss": 0.0018, + "reward": 1.6449219584465027, + "reward_std": 0.04260050132870674, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6449219286441803, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.484375, + "epoch": 0.29833984375, + "grad_norm": 2.296362418962897, + "kl": 0.0465087890625, + "learning_rate": 9.254150390625e-07, + "loss": 0.0019, + "reward": 1.755751132965088, + "reward_std": 0.11303677409887314, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7557511329650879, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.6171875, + "epoch": 0.298828125, + "grad_norm": 2.6229221995817738, + "kl": 0.047607421875, + "learning_rate": 9.2529296875e-07, + "loss": 0.0019, + "reward": 1.6748383045196533, + "reward_std": 0.08769623376429081, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6826508045196533, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.8203125, + "epoch": 0.29931640625, + "grad_norm": 2.472905090535034, + "kl": 0.0384521484375, + "learning_rate": 9.251708984375e-07, + "loss": 0.0015, + "reward": 1.675347626209259, + "reward_std": 0.06301023997366428, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6753476560115814, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.7578125, + "epoch": 0.2998046875, + "grad_norm": 1.968304299505306, + "kl": 0.060791015625, + "learning_rate": 9.25048828125e-07, + "loss": 0.0024, + "reward": 1.7971341013908386, + "reward_std": 0.04744567163288593, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7971341013908386, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.796875, + "epoch": 0.30029296875, + "grad_norm": 4.384171526185067, + "kl": 0.051025390625, + "learning_rate": 9.249267578124999e-07, + "loss": 0.002, + "reward": 1.5328530669212341, + "reward_std": 0.06077993102371693, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5328530073165894, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.578125, + "epoch": 0.30078125, + "grad_norm": 9.232583830494594, + "kl": 0.0443115234375, + "learning_rate": 9.248046874999999e-07, + "loss": 0.0018, + "reward": 1.686236560344696, + "reward_std": 0.046148573979735374, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6862365305423737, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.75, + "epoch": 0.30126953125, + "grad_norm": 1.5236429587824718, + "kl": 0.0550537109375, + "learning_rate": 9.246826171874999e-07, + "loss": 0.0022, + "reward": 1.7975013256072998, + "reward_std": 0.045856970362365246, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7975013852119446, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.796875, + "epoch": 0.3017578125, + "grad_norm": 4.24143148058318, + "kl": 0.0596923828125, + "learning_rate": 9.24560546875e-07, + "loss": 0.0024, + "reward": 1.7057366967201233, + "reward_std": 0.09794734045863152, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7057366371154785, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.40625, + "epoch": 0.30224609375, + "grad_norm": 9.882057657700463, + "kl": 0.055908203125, + "learning_rate": 9.244384765625e-07, + "loss": 0.0022, + "reward": 1.6871796250343323, + "reward_std": 0.0694831982254982, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6871796250343323, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.8359375, + "epoch": 0.302734375, + "grad_norm": 1.9889738924594182, + "kl": 0.0484619140625, + "learning_rate": 9.2431640625e-07, + "loss": 0.0019, + "reward": 1.784572958946228, + "reward_std": 0.05175241082906723, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7845728695392609, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.1484375, + "epoch": 0.30322265625, + "grad_norm": 2.316432033456783, + "kl": 0.0416259765625, + "learning_rate": 9.241943359375e-07, + "loss": 0.0017, + "reward": 1.8488008379936218, + "reward_std": 0.06617510505020618, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8488008677959442, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.6953125, + "epoch": 0.3037109375, + "grad_norm": 2.9435316736306847, + "kl": 0.05517578125, + "learning_rate": 9.240722656249999e-07, + "loss": 0.0022, + "reward": 1.6083208918571472, + "reward_std": 0.15882696211338043, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6239458322525024, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.2734375, + "epoch": 0.30419921875, + "grad_norm": 2.620065207948406, + "kl": 0.05712890625, + "learning_rate": 9.239501953124999e-07, + "loss": 0.0023, + "reward": 1.5121939182281494, + "reward_std": 0.08841052651405334, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5121939033269882, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.5, + "epoch": 0.3046875, + "grad_norm": 1.706973036482709, + "kl": 0.0498046875, + "learning_rate": 9.23828125e-07, + "loss": 0.002, + "reward": 1.7409059405326843, + "reward_std": 0.14900105446577072, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7487184107303619, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.3359375, + "epoch": 0.30517578125, + "grad_norm": 1.697511564265202, + "kl": 0.0692138671875, + "learning_rate": 9.237060546875e-07, + "loss": 0.0028, + "reward": 1.623015284538269, + "reward_std": 0.08251120336353779, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6230152547359467, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.28125, + "epoch": 0.3056640625, + "grad_norm": 1.6225110716919982, + "kl": 0.0494384765625, + "learning_rate": 9.23583984375e-07, + "loss": 0.002, + "reward": 1.8155426383018494, + "reward_std": 0.040754200890660286, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8155426383018494, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.4296875, + "epoch": 0.30615234375, + "grad_norm": 5.245194521239568, + "kl": 0.0635986328125, + "learning_rate": 9.234619140625e-07, + "loss": 0.0025, + "reward": 1.7196524143218994, + "reward_std": 0.16773709654808044, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7274648249149323, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.7109375, + "epoch": 0.306640625, + "grad_norm": 2.495211042283095, + "kl": 0.041748046875, + "learning_rate": 9.233398437499999e-07, + "loss": 0.0017, + "reward": 1.7418628334999084, + "reward_std": 0.06394334509968758, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7418628334999084, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.515625, + "epoch": 0.30712890625, + "grad_norm": 1.074724101016986, + "kl": 0.0457763671875, + "learning_rate": 9.232177734374999e-07, + "loss": 0.0018, + "reward": 1.6888737678527832, + "reward_std": 0.15227380208671093, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6966862678527832, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.9765625, + "epoch": 0.3076171875, + "grad_norm": 2.138600967420769, + "kl": 0.05224609375, + "learning_rate": 9.230957031249999e-07, + "loss": 0.0021, + "reward": 1.528347134590149, + "reward_std": 0.06925049610435963, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5283471196889877, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.2109375, + "epoch": 0.30810546875, + "grad_norm": 2.2724969029807904, + "kl": 0.050048828125, + "learning_rate": 9.229736328125e-07, + "loss": 0.002, + "reward": 1.6192744374275208, + "reward_std": 0.10097651556134224, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6270869076251984, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.6796875, + "epoch": 0.30859375, + "grad_norm": 4.734654446629603, + "kl": 0.0465087890625, + "learning_rate": 9.228515625e-07, + "loss": 0.0019, + "reward": 1.5857577323913574, + "reward_std": 0.14710739254951477, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5935702323913574, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.828125, + "epoch": 0.30908203125, + "grad_norm": 1.3342057084147274, + "kl": 0.0533447265625, + "learning_rate": 9.227294921875e-07, + "loss": 0.0021, + "reward": 1.7136409878730774, + "reward_std": 0.10766054317355156, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7292659878730774, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.0703125, + "epoch": 0.3095703125, + "grad_norm": 1.5671664183297946, + "kl": 0.0543212890625, + "learning_rate": 9.22607421875e-07, + "loss": 0.0022, + "reward": 1.7468852996826172, + "reward_std": 0.1310337483882904, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.76251021027565, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.53125, + "epoch": 0.31005859375, + "grad_norm": 1.1794567116723214, + "kl": 0.0552978515625, + "learning_rate": 9.224853515624999e-07, + "loss": 0.0022, + "reward": 1.6757075786590576, + "reward_std": 0.07465480640530586, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6757076382637024, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.1328125, + "epoch": 0.310546875, + "grad_norm": 1.1485463519752817, + "kl": 0.0455322265625, + "learning_rate": 9.223632812499999e-07, + "loss": 0.0018, + "reward": 1.8436731696128845, + "reward_std": 0.05472866632044315, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8436731696128845, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.2265625, + "epoch": 0.31103515625, + "grad_norm": 1.6053099530571169, + "kl": 0.04443359375, + "learning_rate": 9.222412109375e-07, + "loss": 0.0018, + "reward": 1.828328251838684, + "reward_std": 0.07346354052424431, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8283282518386841, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.484375, + "epoch": 0.3115234375, + "grad_norm": 2.3253920128947945, + "kl": 0.053466796875, + "learning_rate": 9.22119140625e-07, + "loss": 0.0021, + "reward": 1.528764247894287, + "reward_std": 0.16795818135142326, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.5678267776966095, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.125, + "epoch": 0.31201171875, + "grad_norm": 6.0280239889864164, + "kl": 0.058837890625, + "learning_rate": 9.219970703125e-07, + "loss": 0.0024, + "reward": 1.783986210823059, + "reward_std": 0.06189366802573204, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7917985916137695, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.7109375, + "epoch": 0.3125, + "grad_norm": 2.344773690623761, + "kl": 0.04766845703125, + "learning_rate": 9.21875e-07, + "loss": 0.0019, + "reward": 1.6732546091079712, + "reward_std": 0.20847465842962265, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7279421091079712, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.515625, + "epoch": 0.31298828125, + "grad_norm": 7.004317625286649, + "kl": 0.041259765625, + "learning_rate": 9.217529296874999e-07, + "loss": 0.0016, + "reward": 1.760383129119873, + "reward_std": 0.04730805940926075, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7603830993175507, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.4140625, + "epoch": 0.3134765625, + "grad_norm": 3.3601207436539684, + "kl": 0.0477294921875, + "learning_rate": 9.216308593749999e-07, + "loss": 0.0019, + "reward": 1.750933825969696, + "reward_std": 0.04815097339451313, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7509337961673737, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.2265625, + "epoch": 0.31396484375, + "grad_norm": 1.5510742673980848, + "kl": 0.059814453125, + "learning_rate": 9.215087890624999e-07, + "loss": 0.0024, + "reward": 1.720008671283722, + "reward_std": 0.04548669047653675, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7200086712837219, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.296875, + "epoch": 0.314453125, + "grad_norm": 1.4934380093652202, + "kl": 0.055419921875, + "learning_rate": 9.2138671875e-07, + "loss": 0.0022, + "reward": 1.7194246053695679, + "reward_std": 0.11597402952611446, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7350495755672455, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.40625, + "epoch": 0.31494140625, + "grad_norm": 2.2139528602712155, + "kl": 0.0509033203125, + "learning_rate": 9.212646484375e-07, + "loss": 0.002, + "reward": 1.7015312910079956, + "reward_std": 0.03155198786407709, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7015312910079956, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.7578125, + "epoch": 0.3154296875, + "grad_norm": 1.1749981752079521, + "kl": 0.0499267578125, + "learning_rate": 9.21142578125e-07, + "loss": 0.002, + "reward": 1.6304461359977722, + "reward_std": 0.12662875652313232, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6538836359977722, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.6484375, + "epoch": 0.31591796875, + "grad_norm": 1.9319087488376623, + "kl": 0.0384521484375, + "learning_rate": 9.210205078125e-07, + "loss": 0.0015, + "reward": 1.594020962715149, + "reward_std": 0.20690031349658966, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6487084329128265, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.0625, + "epoch": 0.31640625, + "grad_norm": 4.887517766882272, + "kl": 0.0421142578125, + "learning_rate": 9.208984374999999e-07, + "loss": 0.0017, + "reward": 1.6909980773925781, + "reward_std": 0.045924630016088486, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6909981518983841, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.6640625, + "epoch": 0.31689453125, + "grad_norm": 5.793761071887375, + "kl": 0.0443115234375, + "learning_rate": 9.207763671874999e-07, + "loss": 0.0018, + "reward": 1.7228538393974304, + "reward_std": 0.11347953602671623, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7228538393974304, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.6640625, + "epoch": 0.3173828125, + "grad_norm": 1.974099723600187, + "kl": 0.0489501953125, + "learning_rate": 9.206542968749999e-07, + "loss": 0.002, + "reward": 1.7972348928451538, + "reward_std": 0.08608914166688919, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7972348928451538, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.0546875, + "epoch": 0.31787109375, + "grad_norm": 1.0500416763251224, + "kl": 0.05810546875, + "learning_rate": 9.205322265625e-07, + "loss": 0.0023, + "reward": 1.6828134655952454, + "reward_std": 0.04013410210609436, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.682813435792923, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.4375, + "epoch": 0.318359375, + "grad_norm": 2.2876063816784025, + "kl": 0.0438232421875, + "learning_rate": 9.2041015625e-07, + "loss": 0.0018, + "reward": 1.6353506445884705, + "reward_std": 0.06418109219521284, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6353506445884705, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.3984375, + "epoch": 0.31884765625, + "grad_norm": 4.503532269564074, + "kl": 0.04638671875, + "learning_rate": 9.202880859375e-07, + "loss": 0.0019, + "reward": 1.6205086708068848, + "reward_std": 0.06150331161916256, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6205087304115295, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.3125, + "epoch": 0.3193359375, + "grad_norm": 2.2645762992688674, + "kl": 0.0555419921875, + "learning_rate": 9.201660156249999e-07, + "loss": 0.0022, + "reward": 1.7717258930206299, + "reward_std": 0.11558713018894196, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7873509228229523, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.8046875, + "epoch": 0.31982421875, + "grad_norm": 2.5450846485367165, + "kl": 0.067626953125, + "learning_rate": 9.200439453124999e-07, + "loss": 0.0027, + "reward": 1.62615168094635, + "reward_std": 0.08513330668210983, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6261517405509949, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.3046875, + "epoch": 0.3203125, + "grad_norm": 1.3612683167143704, + "kl": 0.044921875, + "learning_rate": 9.199218749999999e-07, + "loss": 0.0018, + "reward": 1.7125912308692932, + "reward_std": 0.028453302569687366, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.712591290473938, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.6953125, + "epoch": 0.32080078125, + "grad_norm": 1.485856746527386, + "kl": 0.0552978515625, + "learning_rate": 9.197998046875e-07, + "loss": 0.0022, + "reward": 1.7644490003585815, + "reward_std": 0.05199288483709097, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7722615003585815, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.5703125, + "epoch": 0.3212890625, + "grad_norm": 2.4798625266117824, + "kl": 0.0465087890625, + "learning_rate": 9.19677734375e-07, + "loss": 0.0019, + "reward": 1.6799516081809998, + "reward_std": 0.09173119999468327, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6799516975879669, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.4609375, + "epoch": 0.32177734375, + "grad_norm": 1.7859241168898383, + "kl": 0.0496826171875, + "learning_rate": 9.195556640625e-07, + "loss": 0.002, + "reward": 1.6372400522232056, + "reward_std": 0.06941110268235207, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6450526714324951, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.21875, + "epoch": 0.322265625, + "grad_norm": 9.488602048011655, + "kl": 0.05810546875, + "learning_rate": 9.1943359375e-07, + "loss": 0.0023, + "reward": 1.744973599910736, + "reward_std": 0.07817739248275757, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7449735701084137, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.84375, + "epoch": 0.32275390625, + "grad_norm": 1.4547620498319576, + "kl": 0.047119140625, + "learning_rate": 9.193115234374999e-07, + "loss": 0.0019, + "reward": 1.713306725025177, + "reward_std": 0.050102658569812775, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7133066952228546, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.6171875, + "epoch": 0.3232421875, + "grad_norm": 1.5620710613142204, + "kl": 0.053466796875, + "learning_rate": 9.191894531249999e-07, + "loss": 0.0021, + "reward": 1.6234807968139648, + "reward_std": 0.10660821199417114, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6391057670116425, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.1484375, + "epoch": 0.32373046875, + "grad_norm": 2.8851167506317354, + "kl": 0.0604248046875, + "learning_rate": 9.190673828124999e-07, + "loss": 0.0024, + "reward": 1.7698102593421936, + "reward_std": 0.09677816927433014, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7698102295398712, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.7421875, + "epoch": 0.32421875, + "grad_norm": 0.7451981174641832, + "kl": 0.0447998046875, + "learning_rate": 9.189453125e-07, + "loss": 0.0018, + "reward": 1.6291025876998901, + "reward_std": 0.08335762098431587, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6291025280952454, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.9453125, + "epoch": 0.32470703125, + "grad_norm": 12.413290735105186, + "kl": 0.053955078125, + "learning_rate": 9.188232421875e-07, + "loss": 0.0022, + "reward": 1.7409712076187134, + "reward_std": 0.06279715150594711, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7409711480140686, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.6875, + "epoch": 0.3251953125, + "grad_norm": 2.01957679366167, + "kl": 0.0726318359375, + "learning_rate": 9.18701171875e-07, + "loss": 0.0029, + "reward": 1.8215317130088806, + "reward_std": 0.035058433189988136, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8215316832065582, + "step": 666 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.796875, + "epoch": 0.32568359375, + "grad_norm": 1.526764750037048, + "kl": 0.0648193359375, + "learning_rate": 9.185791015625e-07, + "loss": 0.0026, + "reward": 1.5901724696159363, + "reward_std": 0.1056349128484726, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5979849547147751, + "step": 667 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.4609375, + "epoch": 0.326171875, + "grad_norm": 2.608301473030279, + "kl": 0.055419921875, + "learning_rate": 9.184570312499999e-07, + "loss": 0.0022, + "reward": 1.6940342783927917, + "reward_std": 0.14149951934814453, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7174717485904694, + "step": 668 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.4921875, + "epoch": 0.32666015625, + "grad_norm": 2.87920366371091, + "kl": 0.0506591796875, + "learning_rate": 9.183349609374999e-07, + "loss": 0.002, + "reward": 1.698991298675537, + "reward_std": 0.14888149499893188, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7302412986755371, + "step": 669 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.3828125, + "epoch": 0.3271484375, + "grad_norm": 2.6608957988350466, + "kl": 0.05029296875, + "learning_rate": 9.18212890625e-07, + "loss": 0.002, + "reward": 1.738844633102417, + "reward_std": 0.10035024397075176, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7466571033000946, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.953125, + "epoch": 0.32763671875, + "grad_norm": 1.0057918255069163, + "kl": 0.0528564453125, + "learning_rate": 9.180908203125e-07, + "loss": 0.0021, + "reward": 1.6742581129074097, + "reward_std": 0.09272240474820137, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6898830831050873, + "step": 671 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.078125, + "epoch": 0.328125, + "grad_norm": 2.8489826502257394, + "kl": 0.0623779296875, + "learning_rate": 9.1796875e-07, + "loss": 0.0025, + "reward": 1.7578362226486206, + "reward_std": 0.1680883914232254, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7890861630439758, + "step": 672 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.671875, + "epoch": 0.32861328125, + "grad_norm": 9.232271074394879, + "kl": 0.0506591796875, + "learning_rate": 9.178466796875e-07, + "loss": 0.002, + "reward": 1.664437174797058, + "reward_std": 0.11197399348020554, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6956871449947357, + "step": 673 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.578125, + "epoch": 0.3291015625, + "grad_norm": 1.317088116181306, + "kl": 0.0521240234375, + "learning_rate": 9.177246093749999e-07, + "loss": 0.0021, + "reward": 1.7777928113937378, + "reward_std": 0.06812568381428719, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7777928411960602, + "step": 674 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.6796875, + "epoch": 0.32958984375, + "grad_norm": 8.928077235738657, + "kl": 0.0562744140625, + "learning_rate": 9.176025390624999e-07, + "loss": 0.0023, + "reward": 1.752385139465332, + "reward_std": 0.08091514930129051, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7523851096630096, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.4375, + "epoch": 0.330078125, + "grad_norm": 3.3096762270610833, + "kl": 0.055419921875, + "learning_rate": 9.174804687499999e-07, + "loss": 0.0022, + "reward": 1.693404495716095, + "reward_std": 0.09680695086717606, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6934045851230621, + "step": 676 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.6796875, + "epoch": 0.33056640625, + "grad_norm": 3.731533977246003, + "kl": 0.09912109375, + "learning_rate": 9.173583984375e-07, + "loss": 0.004, + "reward": 1.6468342542648315, + "reward_std": 0.06382020935416222, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6468342244625092, + "step": 677 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.1328125, + "epoch": 0.3310546875, + "grad_norm": 3.732612140042579, + "kl": 0.05029296875, + "learning_rate": 9.17236328125e-07, + "loss": 0.002, + "reward": 1.7785282731056213, + "reward_std": 0.14171504974365234, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7785282731056213, + "step": 678 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.671875, + "epoch": 0.33154296875, + "grad_norm": 18.476333671843648, + "kl": 0.0657958984375, + "learning_rate": 9.171142578125e-07, + "loss": 0.0026, + "reward": 1.6994601488113403, + "reward_std": 0.054649246856570244, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6994601488113403, + "step": 679 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.6953125, + "epoch": 0.33203125, + "grad_norm": 3.4222567750636594, + "kl": 0.0540771484375, + "learning_rate": 9.169921875e-07, + "loss": 0.0022, + "reward": 1.7493921518325806, + "reward_std": 0.036320459097623825, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7493922114372253, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.046875, + "epoch": 0.33251953125, + "grad_norm": 7.10011144738821, + "kl": 0.0543212890625, + "learning_rate": 9.168701171874999e-07, + "loss": 0.0022, + "reward": 1.6045172810554504, + "reward_std": 0.06905798241496086, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.604517251253128, + "step": 681 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.96875, + "epoch": 0.3330078125, + "grad_norm": 2.739310274689876, + "kl": 0.053955078125, + "learning_rate": 9.167480468749999e-07, + "loss": 0.0022, + "reward": 1.7031362056732178, + "reward_std": 0.0952284187078476, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7109486758708954, + "step": 682 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.96875, + "epoch": 0.33349609375, + "grad_norm": 1.8552315668461727, + "kl": 0.0484619140625, + "learning_rate": 9.166259765625e-07, + "loss": 0.0019, + "reward": 1.6751747131347656, + "reward_std": 0.03803575597703457, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6751746535301208, + "step": 683 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.1953125, + "epoch": 0.333984375, + "grad_norm": 3.66299552441634, + "kl": 0.064208984375, + "learning_rate": 9.1650390625e-07, + "loss": 0.0026, + "reward": 1.6129703521728516, + "reward_std": 0.06539808213710785, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6129703521728516, + "step": 684 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.4453125, + "epoch": 0.33447265625, + "grad_norm": 1.7148741325377912, + "kl": 0.052978515625, + "learning_rate": 9.163818359375e-07, + "loss": 0.0021, + "reward": 1.6778011322021484, + "reward_std": 0.16429652273654938, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7168635725975037, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.640625, + "epoch": 0.3349609375, + "grad_norm": 2.4636169099366687, + "kl": 0.052001953125, + "learning_rate": 9.16259765625e-07, + "loss": 0.0021, + "reward": 1.7410337924957275, + "reward_std": 0.10263085551559925, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7566587924957275, + "step": 686 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.3125, + "epoch": 0.33544921875, + "grad_norm": 2.3867093626757043, + "kl": 0.0496826171875, + "learning_rate": 9.161376953124999e-07, + "loss": 0.002, + "reward": 1.7729321718215942, + "reward_std": 0.04040984623134136, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7729321420192719, + "step": 687 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.40625, + "epoch": 0.3359375, + "grad_norm": 2.7157119941800283, + "kl": 0.059326171875, + "learning_rate": 9.160156249999999e-07, + "loss": 0.0024, + "reward": 1.644744098186493, + "reward_std": 0.08117583952844143, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6447441577911377, + "step": 688 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.765625, + "epoch": 0.33642578125, + "grad_norm": 3.1337083108597916, + "kl": 0.057861328125, + "learning_rate": 9.158935546874999e-07, + "loss": 0.0023, + "reward": 1.7134816646575928, + "reward_std": 0.10441340506076813, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7134817242622375, + "step": 689 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.7734375, + "epoch": 0.3369140625, + "grad_norm": 3.76318572842791, + "kl": 0.05908203125, + "learning_rate": 9.15771484375e-07, + "loss": 0.0024, + "reward": 1.673618733882904, + "reward_std": 0.055534401908516884, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6736188232898712, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.1875, + "epoch": 0.33740234375, + "grad_norm": 3.0202608593298432, + "kl": 0.0506591796875, + "learning_rate": 9.156494140625e-07, + "loss": 0.002, + "reward": 1.7747870087623596, + "reward_std": 0.10330940037965775, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7747870087623596, + "step": 691 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.5078125, + "epoch": 0.337890625, + "grad_norm": 2.0419796805050003, + "kl": 0.05224609375, + "learning_rate": 9.1552734375e-07, + "loss": 0.0021, + "reward": 1.6370163559913635, + "reward_std": 0.09477332793176174, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6526413559913635, + "step": 692 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.140625, + "epoch": 0.33837890625, + "grad_norm": 1.6907575692886263, + "kl": 0.0462646484375, + "learning_rate": 9.154052734375e-07, + "loss": 0.0019, + "reward": 1.7515615820884705, + "reward_std": 0.1177232563495636, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7671865224838257, + "step": 693 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.28125, + "epoch": 0.3388671875, + "grad_norm": 2.8360519678229092, + "kl": 0.0552978515625, + "learning_rate": 9.152832031249999e-07, + "loss": 0.0022, + "reward": 1.6726796627044678, + "reward_std": 0.11348319053649902, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6883046329021454, + "step": 694 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.8515625, + "epoch": 0.33935546875, + "grad_norm": 2.3989655228404994, + "kl": 0.0654296875, + "learning_rate": 9.151611328124999e-07, + "loss": 0.0026, + "reward": 1.5922715663909912, + "reward_std": 0.09356234222650528, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6391465961933136, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.9296875, + "epoch": 0.33984375, + "grad_norm": 2.653042005534019, + "kl": 0.053955078125, + "learning_rate": 9.150390625e-07, + "loss": 0.0022, + "reward": 1.656396508216858, + "reward_std": 0.09683545306324959, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6642089486122131, + "step": 696 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.328125, + "epoch": 0.34033203125, + "grad_norm": 1.7573427123722254, + "kl": 0.06591796875, + "learning_rate": 9.149169921875e-07, + "loss": 0.0026, + "reward": 1.6544893980026245, + "reward_std": 0.0390294985845685, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6544894278049469, + "step": 697 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.8984375, + "epoch": 0.3408203125, + "grad_norm": 1.3553270738396268, + "kl": 0.05078125, + "learning_rate": 9.14794921875e-07, + "loss": 0.002, + "reward": 1.7640778422355652, + "reward_std": 0.07602404989302158, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7718902826309204, + "step": 698 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.078125, + "epoch": 0.34130859375, + "grad_norm": 3.683035859736295, + "kl": 0.0499267578125, + "learning_rate": 9.146728515625e-07, + "loss": 0.002, + "reward": 1.7418290376663208, + "reward_std": 0.1612987220287323, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7652665674686432, + "step": 699 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.75, + "epoch": 0.341796875, + "grad_norm": 3.298949549269414, + "kl": 0.0498046875, + "learning_rate": 9.145507812499999e-07, + "loss": 0.002, + "reward": 1.6274558901786804, + "reward_std": 0.15485302917659283, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6665183305740356, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.75, + "epoch": 0.34228515625, + "grad_norm": 7.338680707895053, + "kl": 0.0615234375, + "learning_rate": 9.144287109374999e-07, + "loss": 0.0025, + "reward": 1.7153563499450684, + "reward_std": 0.13466084748506546, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7387937903404236, + "step": 701 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.375, + "epoch": 0.3427734375, + "grad_norm": 1.4740042030073908, + "kl": 0.0465087890625, + "learning_rate": 9.143066406249999e-07, + "loss": 0.0019, + "reward": 1.674963891506195, + "reward_std": 0.04929056763648987, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6749638915061951, + "step": 702 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.7578125, + "epoch": 0.34326171875, + "grad_norm": 2.2370564002640707, + "kl": 0.0498046875, + "learning_rate": 9.141845703125e-07, + "loss": 0.002, + "reward": 1.6746094226837158, + "reward_std": 0.1111318301409483, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.705859363079071, + "step": 703 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.7578125, + "epoch": 0.34375, + "grad_norm": 3.652860145128934, + "kl": 0.0562744140625, + "learning_rate": 9.140625e-07, + "loss": 0.0022, + "reward": 1.8176313638687134, + "reward_std": 0.10441552102565765, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.817631334066391, + "step": 704 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.59375, + "epoch": 0.34423828125, + "grad_norm": 1.9373839999803215, + "kl": 0.052734375, + "learning_rate": 9.139404296875e-07, + "loss": 0.0021, + "reward": 1.6220948100090027, + "reward_std": 0.12009907513856888, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.6924073398113251, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.734375, + "epoch": 0.3447265625, + "grad_norm": 10.496349482097362, + "kl": 0.05029296875, + "learning_rate": 9.13818359375e-07, + "loss": 0.002, + "reward": 1.7156425714492798, + "reward_std": 0.10354878753423691, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7468925714492798, + "step": 706 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.6796875, + "epoch": 0.34521484375, + "grad_norm": 2.189163149145361, + "kl": 0.043701171875, + "learning_rate": 9.136962890624999e-07, + "loss": 0.0017, + "reward": 1.7362082600593567, + "reward_std": 0.1519409567117691, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7596457600593567, + "step": 707 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.6640625, + "epoch": 0.345703125, + "grad_norm": 3.342079279569258, + "kl": 0.0635986328125, + "learning_rate": 9.135742187499999e-07, + "loss": 0.0025, + "reward": 1.8435426950454712, + "reward_std": 0.03380415961146355, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8435426652431488, + "step": 708 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.875, + "epoch": 0.34619140625, + "grad_norm": 2.1844352528050766, + "kl": 0.048583984375, + "learning_rate": 9.134521484375e-07, + "loss": 0.0019, + "reward": 1.6824636459350586, + "reward_std": 0.10241992585361004, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7059011459350586, + "step": 709 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.828125, + "epoch": 0.3466796875, + "grad_norm": 23.655131129237514, + "kl": 0.0670166015625, + "learning_rate": 9.13330078125e-07, + "loss": 0.0027, + "reward": 1.7630151510238647, + "reward_std": 0.10404928401112556, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7630151808261871, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.203125, + "epoch": 0.34716796875, + "grad_norm": 3.1467811559428855, + "kl": 0.039794921875, + "learning_rate": 9.132080078125e-07, + "loss": 0.0016, + "reward": 1.6210336685180664, + "reward_std": 0.10416779294610023, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6210336685180664, + "step": 711 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.421875, + "epoch": 0.34765625, + "grad_norm": 2.5883626552063275, + "kl": 0.0665283203125, + "learning_rate": 9.130859375e-07, + "loss": 0.0027, + "reward": 1.679788887500763, + "reward_std": 0.08418247289955616, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6797888875007629, + "step": 712 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.5546875, + "epoch": 0.34814453125, + "grad_norm": 4.799696463307074, + "kl": 0.0499267578125, + "learning_rate": 9.129638671874999e-07, + "loss": 0.002, + "reward": 1.6627951860427856, + "reward_std": 0.21561793982982635, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7174826860427856, + "step": 713 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.8125, + "epoch": 0.3486328125, + "grad_norm": 5.690753458769312, + "kl": 0.055908203125, + "learning_rate": 9.128417968749999e-07, + "loss": 0.0022, + "reward": 1.596695363521576, + "reward_std": 0.08007179386913776, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5966953039169312, + "step": 714 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.34375, + "epoch": 0.34912109375, + "grad_norm": 1.9981939045151302, + "kl": 0.0426025390625, + "learning_rate": 9.127197265624999e-07, + "loss": 0.0017, + "reward": 1.65777987241745, + "reward_std": 0.07475204393267632, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6577799022197723, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.1171875, + "epoch": 0.349609375, + "grad_norm": 1.0717553143893126, + "kl": 0.039306640625, + "learning_rate": 9.1259765625e-07, + "loss": 0.0016, + "reward": 1.5546205639839172, + "reward_std": 0.13567753694951534, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6014955639839172, + "step": 716 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.1015625, + "epoch": 0.35009765625, + "grad_norm": 3.8384389147264586, + "kl": 0.046142578125, + "learning_rate": 9.124755859375e-07, + "loss": 0.0018, + "reward": 1.665941596031189, + "reward_std": 0.16234686970710754, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7128165364265442, + "step": 717 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.4296875, + "epoch": 0.3505859375, + "grad_norm": 5.987053658956865, + "kl": 0.0538330078125, + "learning_rate": 9.12353515625e-07, + "loss": 0.0022, + "reward": 1.6448271870613098, + "reward_std": 0.04302874393761158, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6448271870613098, + "step": 718 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.46875, + "epoch": 0.35107421875, + "grad_norm": 1.5749796860637693, + "kl": 0.0450439453125, + "learning_rate": 9.122314453125e-07, + "loss": 0.0018, + "reward": 1.682013988494873, + "reward_std": 0.16238265484571457, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6976390182971954, + "step": 719 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.234375, + "epoch": 0.3515625, + "grad_norm": 1.321614461326094, + "kl": 0.052978515625, + "learning_rate": 9.121093749999999e-07, + "loss": 0.0021, + "reward": 1.6903913617134094, + "reward_std": 0.06298444792628288, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6903913915157318, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.5390625, + "epoch": 0.35205078125, + "grad_norm": 1.521983159109129, + "kl": 0.05126953125, + "learning_rate": 9.119873046874999e-07, + "loss": 0.002, + "reward": 1.662535011768341, + "reward_std": 0.09836256504058838, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6703475117683411, + "step": 721 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.8046875, + "epoch": 0.3525390625, + "grad_norm": 5.592012602896816, + "kl": 0.0384521484375, + "learning_rate": 9.11865234375e-07, + "loss": 0.0015, + "reward": 1.6349376440048218, + "reward_std": 0.2715572118759155, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6974376440048218, + "step": 722 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.828125, + "epoch": 0.35302734375, + "grad_norm": 7.607326995892132, + "kl": 0.0521240234375, + "learning_rate": 9.117431640625e-07, + "loss": 0.0021, + "reward": 1.7890739440917969, + "reward_std": 0.09455129504203796, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7890739738941193, + "step": 723 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.28125, + "epoch": 0.353515625, + "grad_norm": 2.0564996524853925, + "kl": 0.048095703125, + "learning_rate": 9.1162109375e-07, + "loss": 0.0019, + "reward": 1.75455242395401, + "reward_std": 0.05431245639920235, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.75455242395401, + "step": 724 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.390625, + "epoch": 0.35400390625, + "grad_norm": 1.891536592962387, + "kl": 0.0587158203125, + "learning_rate": 9.114990234375e-07, + "loss": 0.0023, + "reward": 1.6842128038406372, + "reward_std": 0.0393197163939476, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6842128038406372, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.7109375, + "epoch": 0.3544921875, + "grad_norm": 2.6735308908162017, + "kl": 0.050537109375, + "learning_rate": 9.113769531249999e-07, + "loss": 0.002, + "reward": 1.677639126777649, + "reward_std": 0.04795477353036404, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6776390969753265, + "step": 726 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.1953125, + "epoch": 0.35498046875, + "grad_norm": 4.067919313994053, + "kl": 0.0478515625, + "learning_rate": 9.112548828124999e-07, + "loss": 0.0019, + "reward": 1.759274661540985, + "reward_std": 0.16164502501487732, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7670871615409851, + "step": 727 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.03125, + "epoch": 0.35546875, + "grad_norm": 1.4554737342898283, + "kl": 0.05859375, + "learning_rate": 9.111328124999999e-07, + "loss": 0.0023, + "reward": 1.6114553213119507, + "reward_std": 0.07473801448941231, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6192677319049835, + "step": 728 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.0078125, + "epoch": 0.35595703125, + "grad_norm": 5.937471894743553, + "kl": 0.0450439453125, + "learning_rate": 9.110107421875e-07, + "loss": 0.0018, + "reward": 1.8060181140899658, + "reward_std": 0.06944678723812103, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8060181140899658, + "step": 729 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.1640625, + "epoch": 0.3564453125, + "grad_norm": 2.892417832234497, + "kl": 0.046875, + "learning_rate": 9.10888671875e-07, + "loss": 0.0019, + "reward": 1.699116826057434, + "reward_std": 0.030583031941205263, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6991168558597565, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.78125, + "epoch": 0.35693359375, + "grad_norm": 4.291871230697797, + "kl": 0.05224609375, + "learning_rate": 9.107666015625e-07, + "loss": 0.0021, + "reward": 1.6441110372543335, + "reward_std": 0.06929890811443329, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6441109776496887, + "step": 731 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.1640625, + "epoch": 0.357421875, + "grad_norm": 5.6899357611370975, + "kl": 0.0576171875, + "learning_rate": 9.1064453125e-07, + "loss": 0.0023, + "reward": 1.7429944276809692, + "reward_std": 0.05181153491139412, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7429944574832916, + "step": 732 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.9765625, + "epoch": 0.35791015625, + "grad_norm": 2.0271310342750906, + "kl": 0.045654296875, + "learning_rate": 9.105224609374999e-07, + "loss": 0.0018, + "reward": 1.7651514410972595, + "reward_std": 0.05858886428177357, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7651514708995819, + "step": 733 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.46875, + "epoch": 0.3583984375, + "grad_norm": 4.625382081677046, + "kl": 0.056640625, + "learning_rate": 9.104003906249999e-07, + "loss": 0.0023, + "reward": 1.7296615242958069, + "reward_std": 0.18934501707553864, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7374739944934845, + "step": 734 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.25, + "epoch": 0.35888671875, + "grad_norm": 2.106702124053642, + "kl": 0.0462646484375, + "learning_rate": 9.102783203125e-07, + "loss": 0.0019, + "reward": 1.6757431626319885, + "reward_std": 0.03387642838060856, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6757431626319885, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.421875, + "epoch": 0.359375, + "grad_norm": 2.2254393181907464, + "kl": 0.056884765625, + "learning_rate": 9.1015625e-07, + "loss": 0.0023, + "reward": 1.672728419303894, + "reward_std": 0.057467855513095856, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6727283895015717, + "step": 736 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.921875, + "epoch": 0.35986328125, + "grad_norm": 2.8705551314702804, + "kl": 0.047607421875, + "learning_rate": 9.100341796875e-07, + "loss": 0.0019, + "reward": 1.7556483745574951, + "reward_std": 0.12223165854811668, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7634609639644623, + "step": 737 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.859375, + "epoch": 0.3603515625, + "grad_norm": 1.9718644163448455, + "kl": 0.064208984375, + "learning_rate": 9.09912109375e-07, + "loss": 0.0026, + "reward": 1.5958901643753052, + "reward_std": 0.1189500167965889, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6037026047706604, + "step": 738 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.140625, + "epoch": 0.36083984375, + "grad_norm": 1.1551556849612885, + "kl": 0.053466796875, + "learning_rate": 9.097900390624999e-07, + "loss": 0.0021, + "reward": 1.6770064234733582, + "reward_std": 0.06842825934290886, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6770065128803253, + "step": 739 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.5, + "epoch": 0.361328125, + "grad_norm": 1.611938756026289, + "kl": 0.0496826171875, + "learning_rate": 9.096679687499999e-07, + "loss": 0.002, + "reward": 1.871577262878418, + "reward_std": 0.08976521715521812, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8793897330760956, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.3046875, + "epoch": 0.36181640625, + "grad_norm": 2.2338765528780593, + "kl": 0.06201171875, + "learning_rate": 9.095458984374999e-07, + "loss": 0.0025, + "reward": 1.7528924942016602, + "reward_std": 0.06052309833467007, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7528924942016602, + "step": 741 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.9609375, + "epoch": 0.3623046875, + "grad_norm": 2.2843239877131483, + "kl": 0.04931640625, + "learning_rate": 9.09423828125e-07, + "loss": 0.002, + "reward": 1.7976442575454712, + "reward_std": 0.07765695080161095, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7976443469524384, + "step": 742 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.171875, + "epoch": 0.36279296875, + "grad_norm": 2.5837045435100285, + "kl": 0.054443359375, + "learning_rate": 9.093017578125e-07, + "loss": 0.0022, + "reward": 1.7078853845596313, + "reward_std": 0.08771786838769913, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7156979441642761, + "step": 743 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.65625, + "epoch": 0.36328125, + "grad_norm": 14.063807731932368, + "kl": 0.05712890625, + "learning_rate": 9.091796875e-07, + "loss": 0.0023, + "reward": 1.705945611000061, + "reward_std": 0.08700169250369072, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7059455513954163, + "step": 744 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.1796875, + "epoch": 0.36376953125, + "grad_norm": 88.25210889413222, + "kl": 0.0533447265625, + "learning_rate": 9.090576171875e-07, + "loss": 0.0021, + "reward": 1.6465200781822205, + "reward_std": 0.10988815873861313, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6465200483798981, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.8203125, + "epoch": 0.3642578125, + "grad_norm": 7.516718824656179, + "kl": 0.051513671875, + "learning_rate": 9.089355468749999e-07, + "loss": 0.0021, + "reward": 1.7468097805976868, + "reward_std": 0.06035367026925087, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7468098104000092, + "step": 746 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.4765625, + "epoch": 0.36474609375, + "grad_norm": 2.90080670052233, + "kl": 0.082275390625, + "learning_rate": 9.088134765624999e-07, + "loss": 0.0033, + "reward": 1.7886146306991577, + "reward_std": 0.053633132949471474, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7886146605014801, + "step": 747 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.3984375, + "epoch": 0.365234375, + "grad_norm": 2.4070521070996658, + "kl": 0.0474853515625, + "learning_rate": 9.0869140625e-07, + "loss": 0.0019, + "reward": 1.7509766221046448, + "reward_std": 0.07239764928817749, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7509766519069672, + "step": 748 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.3203125, + "epoch": 0.36572265625, + "grad_norm": 2.3297506701189885, + "kl": 0.0506591796875, + "learning_rate": 9.085693359375e-07, + "loss": 0.002, + "reward": 1.7397636771202087, + "reward_std": 0.07227146998047829, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7397636771202087, + "step": 749 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.75, + "epoch": 0.3662109375, + "grad_norm": 2.4906923300155035, + "kl": 0.049560546875, + "learning_rate": 9.08447265625e-07, + "loss": 0.002, + "reward": 1.7281315326690674, + "reward_std": 0.09530112892389297, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7281315326690674, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.296875, + "epoch": 0.36669921875, + "grad_norm": 2.095615441296317, + "kl": 0.083740234375, + "learning_rate": 9.083251953125e-07, + "loss": 0.0033, + "reward": 1.8014087677001953, + "reward_std": 0.1527663916349411, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8014088273048401, + "step": 751 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.421875, + "epoch": 0.3671875, + "grad_norm": 2.023204451439646, + "kl": 0.05126953125, + "learning_rate": 9.082031249999999e-07, + "loss": 0.0021, + "reward": 1.5856056809425354, + "reward_std": 0.12917165458202362, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6090432107448578, + "step": 752 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.1953125, + "epoch": 0.36767578125, + "grad_norm": 4.535857246107816, + "kl": 0.05615234375, + "learning_rate": 9.080810546874999e-07, + "loss": 0.0022, + "reward": 1.551247239112854, + "reward_std": 0.18378467857837677, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5746847093105316, + "step": 753 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.421875, + "epoch": 0.3681640625, + "grad_norm": 10.79203763732622, + "kl": 0.1080322265625, + "learning_rate": 9.079589843749999e-07, + "loss": 0.0043, + "reward": 1.7111627459526062, + "reward_std": 0.11090904846787453, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7111627459526062, + "step": 754 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.5859375, + "epoch": 0.36865234375, + "grad_norm": 4.366419277780804, + "kl": 0.0460205078125, + "learning_rate": 9.078369140625e-07, + "loss": 0.0018, + "reward": 1.5475510954856873, + "reward_std": 0.1516926810145378, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5788010954856873, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.40625, + "epoch": 0.369140625, + "grad_norm": 2.2502977034316816, + "kl": 0.0596923828125, + "learning_rate": 9.0771484375e-07, + "loss": 0.0024, + "reward": 1.6759621500968933, + "reward_std": 0.08574535697698593, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6837746500968933, + "step": 756 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.6953125, + "epoch": 0.36962890625, + "grad_norm": 3.0394652562883557, + "kl": 0.045166015625, + "learning_rate": 9.075927734375e-07, + "loss": 0.0018, + "reward": 1.6703930497169495, + "reward_std": 0.10942208580672741, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6782055497169495, + "step": 757 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.7109375, + "epoch": 0.3701171875, + "grad_norm": 3.209213137923711, + "kl": 0.06884765625, + "learning_rate": 9.07470703125e-07, + "loss": 0.0028, + "reward": 1.6818158030509949, + "reward_std": 0.06707624718546867, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6818158030509949, + "step": 758 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.671875, + "epoch": 0.37060546875, + "grad_norm": 6.314452162326053, + "kl": 0.0498046875, + "learning_rate": 9.073486328124999e-07, + "loss": 0.002, + "reward": 1.7439817786216736, + "reward_std": 0.06930938735604286, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7439817786216736, + "step": 759 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.3359375, + "epoch": 0.37109375, + "grad_norm": 1.6588947555909435, + "kl": 0.052490234375, + "learning_rate": 9.072265624999999e-07, + "loss": 0.0021, + "reward": 1.7676947116851807, + "reward_std": 0.06368311867117882, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7676947116851807, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.0625, + "epoch": 0.37158203125, + "grad_norm": 2.056420743637371, + "kl": 0.0501708984375, + "learning_rate": 9.071044921874999e-07, + "loss": 0.002, + "reward": 1.5630112886428833, + "reward_std": 0.12552234530448914, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5864488482475281, + "step": 761 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.34375, + "epoch": 0.3720703125, + "grad_norm": 2.008715753441554, + "kl": 0.062255859375, + "learning_rate": 9.06982421875e-07, + "loss": 0.0025, + "reward": 1.591386616230011, + "reward_std": 0.11909160390496254, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.607011616230011, + "step": 762 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.421875, + "epoch": 0.37255859375, + "grad_norm": 6.277949214463844, + "kl": 0.05859375, + "learning_rate": 9.068603515625e-07, + "loss": 0.0023, + "reward": 1.6171656847000122, + "reward_std": 0.048231493681669235, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6171657145023346, + "step": 763 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.671875, + "epoch": 0.373046875, + "grad_norm": 5.012112695502121, + "kl": 0.0579833984375, + "learning_rate": 9.0673828125e-07, + "loss": 0.0023, + "reward": 1.7434178590774536, + "reward_std": 0.06155427545309067, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7434178590774536, + "step": 764 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.7578125, + "epoch": 0.37353515625, + "grad_norm": 9.599500758693086, + "kl": 0.0579833984375, + "learning_rate": 9.066162109375e-07, + "loss": 0.0023, + "reward": 1.5466606616973877, + "reward_std": 0.09597665816545486, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5544731467962265, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.09375, + "epoch": 0.3740234375, + "grad_norm": 7.306759003859089, + "kl": 0.0556640625, + "learning_rate": 9.064941406249999e-07, + "loss": 0.0022, + "reward": 1.763745129108429, + "reward_std": 0.08324461057782173, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7793701589107513, + "step": 766 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.3828125, + "epoch": 0.37451171875, + "grad_norm": 1.8312574638865227, + "kl": 0.0537109375, + "learning_rate": 9.063720703124999e-07, + "loss": 0.0021, + "reward": 1.7746469378471375, + "reward_std": 0.04006502404808998, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7746469378471375, + "step": 767 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.1953125, + "epoch": 0.375, + "grad_norm": 1.2098375787612152, + "kl": 0.04638671875, + "learning_rate": 9.0625e-07, + "loss": 0.0019, + "reward": 1.7166728377342224, + "reward_std": 0.05403112433850765, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7166728675365448, + "step": 768 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.984375, + "epoch": 0.37548828125, + "grad_norm": 8.565656666405385, + "kl": 0.050537109375, + "learning_rate": 9.061279296875e-07, + "loss": 0.002, + "reward": 1.6393229365348816, + "reward_std": 0.08363521099090576, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6393230259418488, + "step": 769 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.203125, + "epoch": 0.3759765625, + "grad_norm": 1.638069098109109, + "kl": 0.0606689453125, + "learning_rate": 9.06005859375e-07, + "loss": 0.0024, + "reward": 1.7592090964317322, + "reward_std": 0.11197786778211594, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7670215368270874, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.5078125, + "epoch": 0.37646484375, + "grad_norm": 5.669521003197282, + "kl": 0.046875, + "learning_rate": 9.058837890625e-07, + "loss": 0.0019, + "reward": 1.7937055826187134, + "reward_std": 0.04042255226522684, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7937055230140686, + "step": 771 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.2421875, + "epoch": 0.376953125, + "grad_norm": 2.4131694725255524, + "kl": 0.04736328125, + "learning_rate": 9.057617187499999e-07, + "loss": 0.0019, + "reward": 1.754515528678894, + "reward_std": 0.08119422942399979, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.762328028678894, + "step": 772 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.125, + "epoch": 0.37744140625, + "grad_norm": 1.2158435987969012, + "kl": 0.0474853515625, + "learning_rate": 9.056396484374999e-07, + "loss": 0.0019, + "reward": 1.589455008506775, + "reward_std": 0.15698669105768204, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6128924638032913, + "step": 773 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.1875, + "epoch": 0.3779296875, + "grad_norm": 3.511854371924245, + "kl": 0.064208984375, + "learning_rate": 9.055175781249999e-07, + "loss": 0.0026, + "reward": 1.747837781906128, + "reward_std": 0.047062634490430355, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7478377819061279, + "step": 774 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.6875, + "epoch": 0.37841796875, + "grad_norm": 2.035616546962937, + "kl": 0.0478515625, + "learning_rate": 9.053955078125e-07, + "loss": 0.0019, + "reward": 1.7458081245422363, + "reward_std": 0.1068628765642643, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7536205947399139, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.9296875, + "epoch": 0.37890625, + "grad_norm": 9.725323832534293, + "kl": 0.052490234375, + "learning_rate": 9.052734375e-07, + "loss": 0.0021, + "reward": 1.630423367023468, + "reward_std": 0.16277416795492172, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.7085483372211456, + "step": 776 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.5390625, + "epoch": 0.37939453125, + "grad_norm": 2.746445302407796, + "kl": 0.056640625, + "learning_rate": 9.051513671875e-07, + "loss": 0.0023, + "reward": 1.6005674600601196, + "reward_std": 0.1270945593714714, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6083799302577972, + "step": 777 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.7265625, + "epoch": 0.3798828125, + "grad_norm": 2.171791972938971, + "kl": 0.0400390625, + "learning_rate": 9.05029296875e-07, + "loss": 0.0016, + "reward": 1.71004056930542, + "reward_std": 0.09835747629404068, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7100405395030975, + "step": 778 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.8984375, + "epoch": 0.38037109375, + "grad_norm": 1.4601597679507083, + "kl": 0.0455322265625, + "learning_rate": 9.049072265624999e-07, + "loss": 0.0018, + "reward": 1.6127532720565796, + "reward_std": 0.08331700228154659, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6127532124519348, + "step": 779 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.890625, + "epoch": 0.380859375, + "grad_norm": 1.245741857334542, + "kl": 0.0604248046875, + "learning_rate": 9.047851562499999e-07, + "loss": 0.0024, + "reward": 1.7605129480361938, + "reward_std": 0.0731951892375946, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7605129182338715, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.7578125, + "epoch": 0.38134765625, + "grad_norm": 1.9107655030172803, + "kl": 0.042724609375, + "learning_rate": 9.046630859375e-07, + "loss": 0.0017, + "reward": 1.7617830038070679, + "reward_std": 0.1986825242638588, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7930330038070679, + "step": 781 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.421875, + "epoch": 0.3818359375, + "grad_norm": 1.6052928141201563, + "kl": 0.0528564453125, + "learning_rate": 9.04541015625e-07, + "loss": 0.0021, + "reward": 1.6557468175888062, + "reward_std": 0.1146387904882431, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6713717877864838, + "step": 782 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.234375, + "epoch": 0.38232421875, + "grad_norm": 3.5152924375718095, + "kl": 0.0445556640625, + "learning_rate": 9.044189453125e-07, + "loss": 0.0018, + "reward": 1.7099875807762146, + "reward_std": 0.10192125290632248, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7412375807762146, + "step": 783 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.75, + "epoch": 0.3828125, + "grad_norm": 1.513013153822231, + "kl": 0.059814453125, + "learning_rate": 9.04296875e-07, + "loss": 0.0024, + "reward": 1.7009785175323486, + "reward_std": 0.07997079566121101, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7087909579277039, + "step": 784 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.765625, + "epoch": 0.38330078125, + "grad_norm": 1.1283536516038533, + "kl": 0.0535888671875, + "learning_rate": 9.041748046874999e-07, + "loss": 0.0021, + "reward": 1.795514464378357, + "reward_std": 0.03493136540055275, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7955144643783569, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.125, + "epoch": 0.3837890625, + "grad_norm": 1.7312441195961588, + "kl": 0.0504150390625, + "learning_rate": 9.040527343749999e-07, + "loss": 0.002, + "reward": 1.606820523738861, + "reward_std": 0.12857604026794434, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6224455237388611, + "step": 786 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.40625, + "epoch": 0.38427734375, + "grad_norm": 2.4139628304327583, + "kl": 0.0506591796875, + "learning_rate": 9.039306640624999e-07, + "loss": 0.002, + "reward": 1.6258866786956787, + "reward_std": 0.22364450991153717, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6883866786956787, + "step": 787 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.34375, + "epoch": 0.384765625, + "grad_norm": 1.0442715242086436, + "kl": 0.0576171875, + "learning_rate": 9.0380859375e-07, + "loss": 0.0023, + "reward": 1.6823328137397766, + "reward_std": 0.033012090250849724, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6823328137397766, + "step": 788 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.1796875, + "epoch": 0.38525390625, + "grad_norm": 3.6998708553756336, + "kl": 0.051025390625, + "learning_rate": 9.036865234375e-07, + "loss": 0.002, + "reward": 1.6819748878479004, + "reward_std": 0.10724844038486481, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.689787358045578, + "step": 789 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.703125, + "epoch": 0.3857421875, + "grad_norm": 2.743911349386069, + "kl": 0.0635986328125, + "learning_rate": 9.03564453125e-07, + "loss": 0.0025, + "reward": 1.6518617272377014, + "reward_std": 0.08327071741223335, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.651861697435379, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.3359375, + "epoch": 0.38623046875, + "grad_norm": 2.558950840214557, + "kl": 0.06005859375, + "learning_rate": 9.034423828125e-07, + "loss": 0.0024, + "reward": 1.8231335282325745, + "reward_std": 0.0741860456764698, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8231335282325745, + "step": 791 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.84375, + "epoch": 0.38671875, + "grad_norm": 2.495274964010672, + "kl": 0.0565185546875, + "learning_rate": 9.033203124999999e-07, + "loss": 0.0023, + "reward": 1.7564613819122314, + "reward_std": 0.12410943582654, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7642738223075867, + "step": 792 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.9453125, + "epoch": 0.38720703125, + "grad_norm": 2.7224758525530466, + "kl": 0.0693359375, + "learning_rate": 9.031982421874999e-07, + "loss": 0.0028, + "reward": 1.6794022917747498, + "reward_std": 0.0662822276353836, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.679402232170105, + "step": 793 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.828125, + "epoch": 0.3876953125, + "grad_norm": 2.008117882656302, + "kl": 0.047119140625, + "learning_rate": 9.03076171875e-07, + "loss": 0.0019, + "reward": 1.7753472328186035, + "reward_std": 0.06647790595889091, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7753472328186035, + "step": 794 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.9765625, + "epoch": 0.38818359375, + "grad_norm": 1.7981907802676043, + "kl": 0.0562744140625, + "learning_rate": 9.029541015625e-07, + "loss": 0.0022, + "reward": 1.6716668605804443, + "reward_std": 0.10441191494464874, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6794794797897339, + "step": 795 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.2109375, + "epoch": 0.388671875, + "grad_norm": 2.811941047744118, + "kl": 0.0623779296875, + "learning_rate": 9.0283203125e-07, + "loss": 0.0025, + "reward": 1.7318763136863708, + "reward_std": 0.13041818886995316, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7396888434886932, + "step": 796 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.8515625, + "epoch": 0.38916015625, + "grad_norm": 1.5360755449383108, + "kl": 0.0634765625, + "learning_rate": 9.027099609375e-07, + "loss": 0.0025, + "reward": 1.62107652425766, + "reward_std": 0.11381476372480392, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6367015540599823, + "step": 797 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.1484375, + "epoch": 0.3896484375, + "grad_norm": 2.2214500756443547, + "kl": 0.06005859375, + "learning_rate": 9.025878906249999e-07, + "loss": 0.0024, + "reward": 1.75680810213089, + "reward_std": 0.12856251932680607, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7724330723285675, + "step": 798 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.2734375, + "epoch": 0.39013671875, + "grad_norm": 6.470318010002288, + "kl": 0.0697021484375, + "learning_rate": 9.024658203124999e-07, + "loss": 0.0028, + "reward": 1.7349693775177002, + "reward_std": 0.05125601589679718, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7349693477153778, + "step": 799 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.96875, + "epoch": 0.390625, + "grad_norm": 4.746876315115609, + "kl": 0.0447998046875, + "learning_rate": 9.023437499999999e-07, + "loss": 0.0018, + "reward": 1.6792126893997192, + "reward_std": 0.10291677340865135, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6948377192020416, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.828125, + "epoch": 0.39111328125, + "grad_norm": 5.930786243054409, + "kl": 0.069091796875, + "learning_rate": 9.022216796875e-07, + "loss": 0.0028, + "reward": 1.64633446931839, + "reward_std": 0.03413202054798603, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6463344395160675, + "step": 801 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.28125, + "epoch": 0.3916015625, + "grad_norm": 2.7295402764397503, + "kl": 0.0416259765625, + "learning_rate": 9.02099609375e-07, + "loss": 0.0017, + "reward": 1.7840456366539001, + "reward_std": 0.11326225847005844, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7918581366539001, + "step": 802 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.3828125, + "epoch": 0.39208984375, + "grad_norm": 2.061940610036514, + "kl": 0.060302734375, + "learning_rate": 9.019775390625e-07, + "loss": 0.0024, + "reward": 1.684591829776764, + "reward_std": 0.039285000413656235, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6845918297767639, + "step": 803 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.90625, + "epoch": 0.392578125, + "grad_norm": 2.6902536765952516, + "kl": 0.0595703125, + "learning_rate": 9.0185546875e-07, + "loss": 0.0024, + "reward": 1.6753877997398376, + "reward_std": 0.0330571923404932, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6753877997398376, + "step": 804 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.28125, + "epoch": 0.39306640625, + "grad_norm": 1.8411761165738731, + "kl": 0.0643310546875, + "learning_rate": 9.017333984374999e-07, + "loss": 0.0026, + "reward": 1.7131445407867432, + "reward_std": 0.043391112238168716, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7131445109844208, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.6640625, + "epoch": 0.3935546875, + "grad_norm": 2.7482643969953817, + "kl": 0.05810546875, + "learning_rate": 9.016113281249999e-07, + "loss": 0.0023, + "reward": 1.7672026753425598, + "reward_std": 0.14146682620048523, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7672027349472046, + "step": 806 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.8984375, + "epoch": 0.39404296875, + "grad_norm": 5.7000530933748434, + "kl": 0.0579833984375, + "learning_rate": 9.014892578125e-07, + "loss": 0.0023, + "reward": 1.7555674314498901, + "reward_std": 0.0818701907992363, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7555674016475677, + "step": 807 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.0234375, + "epoch": 0.39453125, + "grad_norm": 2.716019769835689, + "kl": 0.0574951171875, + "learning_rate": 9.013671875e-07, + "loss": 0.0023, + "reward": 1.7889222502708435, + "reward_std": 0.07327684760093689, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7889222204685211, + "step": 808 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.015625, + "epoch": 0.39501953125, + "grad_norm": 1.0051393363186003, + "kl": 0.046142578125, + "learning_rate": 9.012451171875e-07, + "loss": 0.0018, + "reward": 1.823382318019867, + "reward_std": 0.08756531029939651, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8311948478221893, + "step": 809 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.9609375, + "epoch": 0.3955078125, + "grad_norm": 5.605975136552543, + "kl": 0.0535888671875, + "learning_rate": 9.01123046875e-07, + "loss": 0.0021, + "reward": 1.6305594444274902, + "reward_std": 0.18838153779506683, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6774344146251678, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.0390625, + "epoch": 0.39599609375, + "grad_norm": 2.347124532736606, + "kl": 0.067626953125, + "learning_rate": 9.010009765624999e-07, + "loss": 0.0027, + "reward": 1.6546881794929504, + "reward_std": 0.043467432260513306, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6546881794929504, + "step": 811 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.421875, + "epoch": 0.396484375, + "grad_norm": 5.529194590040078, + "kl": 0.0498046875, + "learning_rate": 9.008789062499999e-07, + "loss": 0.002, + "reward": 1.9438674449920654, + "reward_std": 0.1620844528079033, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.9516799449920654, + "step": 812 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.53125, + "epoch": 0.39697265625, + "grad_norm": 4.064564658886712, + "kl": 0.0567626953125, + "learning_rate": 9.007568359374999e-07, + "loss": 0.0023, + "reward": 1.7015685439109802, + "reward_std": 0.19768846035003662, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7250060141086578, + "step": 813 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.6015625, + "epoch": 0.3974609375, + "grad_norm": 3.3533767553179086, + "kl": 0.0438232421875, + "learning_rate": 9.00634765625e-07, + "loss": 0.0018, + "reward": 1.708820104598999, + "reward_std": 0.19447695463895798, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7400700449943542, + "step": 814 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.9765625, + "epoch": 0.39794921875, + "grad_norm": 2.6790906488328483, + "kl": 0.044921875, + "learning_rate": 9.005126953125e-07, + "loss": 0.0018, + "reward": 1.721143901348114, + "reward_std": 0.06836184859275818, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.721143901348114, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.3203125, + "epoch": 0.3984375, + "grad_norm": 1.1009725102471428, + "kl": 0.0548095703125, + "learning_rate": 9.00390625e-07, + "loss": 0.0022, + "reward": 1.8940476775169373, + "reward_std": 0.03376696538180113, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8940476477146149, + "step": 816 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.0, + "epoch": 0.39892578125, + "grad_norm": 14.243353602718932, + "kl": 0.05517578125, + "learning_rate": 9.002685546875e-07, + "loss": 0.0022, + "reward": 1.7576437592506409, + "reward_std": 0.08673252165317535, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7576436996459961, + "step": 817 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.03125, + "epoch": 0.3994140625, + "grad_norm": 4.958911702356191, + "kl": 0.0601806640625, + "learning_rate": 9.001464843749999e-07, + "loss": 0.0024, + "reward": 1.7922693490982056, + "reward_std": 0.07825984340161085, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8000818490982056, + "step": 818 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.109375, + "epoch": 0.39990234375, + "grad_norm": 1.3155145245659365, + "kl": 0.050537109375, + "learning_rate": 9.000244140624999e-07, + "loss": 0.002, + "reward": 1.6974967122077942, + "reward_std": 0.11296156048774719, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7053092420101166, + "step": 819 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.65625, + "epoch": 0.400390625, + "grad_norm": 2.13339913891259, + "kl": 0.055419921875, + "learning_rate": 8.9990234375e-07, + "loss": 0.0022, + "reward": 1.707070529460907, + "reward_std": 0.12426425144076347, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7148829698562622, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.984375, + "epoch": 0.40087890625, + "grad_norm": 3.546816241737904, + "kl": 0.063232421875, + "learning_rate": 8.997802734375e-07, + "loss": 0.0025, + "reward": 1.7018118500709534, + "reward_std": 0.15557154268026352, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.717436820268631, + "step": 821 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.2421875, + "epoch": 0.4013671875, + "grad_norm": 2.7078843795022296, + "kl": 0.05029296875, + "learning_rate": 8.99658203125e-07, + "loss": 0.002, + "reward": 1.6248914003372192, + "reward_std": 0.09886835888028145, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.624891385436058, + "step": 822 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.65625, + "epoch": 0.40185546875, + "grad_norm": 1.1173934118480147, + "kl": 0.056396484375, + "learning_rate": 8.995361328125e-07, + "loss": 0.0023, + "reward": 1.670366883277893, + "reward_std": 0.022474923171103, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6703668832778931, + "step": 823 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.5546875, + "epoch": 0.40234375, + "grad_norm": 1.1487567312767635, + "kl": 0.0540771484375, + "learning_rate": 8.994140624999999e-07, + "loss": 0.0022, + "reward": 1.6762340068817139, + "reward_std": 0.06469432264566422, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6762339472770691, + "step": 824 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.2578125, + "epoch": 0.40283203125, + "grad_norm": 0.47294641259340225, + "kl": 0.041015625, + "learning_rate": 8.992919921874999e-07, + "loss": 0.0016, + "reward": 1.9129234552383423, + "reward_std": 0.011541639920324087, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9129234850406647, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.625, + "epoch": 0.4033203125, + "grad_norm": 2.0908075256979757, + "kl": 0.0526123046875, + "learning_rate": 8.991699218749999e-07, + "loss": 0.0021, + "reward": 1.5521747469902039, + "reward_std": 0.1392434984445572, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.5990498065948486, + "step": 826 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.71875, + "epoch": 0.40380859375, + "grad_norm": 1.0870502036918956, + "kl": 0.0484619140625, + "learning_rate": 8.990478515625e-07, + "loss": 0.0019, + "reward": 1.6554855108261108, + "reward_std": 0.15189751982688904, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6867355108261108, + "step": 827 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.6640625, + "epoch": 0.404296875, + "grad_norm": 1.5086234778630288, + "kl": 0.0494384765625, + "learning_rate": 8.9892578125e-07, + "loss": 0.002, + "reward": 1.5656054019927979, + "reward_std": 0.13260553404688835, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5890428274869919, + "step": 828 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.6328125, + "epoch": 0.40478515625, + "grad_norm": 3.1905532448295366, + "kl": 0.054931640625, + "learning_rate": 8.988037109375e-07, + "loss": 0.0022, + "reward": 1.792038083076477, + "reward_std": 0.08523118868470192, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7920379936695099, + "step": 829 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.6484375, + "epoch": 0.4052734375, + "grad_norm": 1.4703480432855132, + "kl": 0.052978515625, + "learning_rate": 8.98681640625e-07, + "loss": 0.0021, + "reward": 1.7558820843696594, + "reward_std": 0.04787625931203365, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7558820843696594, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.6484375, + "epoch": 0.40576171875, + "grad_norm": 7.185703738083162, + "kl": 0.04248046875, + "learning_rate": 8.985595703124999e-07, + "loss": 0.0017, + "reward": 1.8221890926361084, + "reward_std": 0.05866616778075695, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8221890926361084, + "step": 831 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.2890625, + "epoch": 0.40625, + "grad_norm": 2.145560956292342, + "kl": 0.063232421875, + "learning_rate": 8.984374999999999e-07, + "loss": 0.0025, + "reward": 1.685830295085907, + "reward_std": 0.036165340803563595, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6858302354812622, + "step": 832 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.8515625, + "epoch": 0.40673828125, + "grad_norm": 3.391928796703693, + "kl": 0.0562744140625, + "learning_rate": 8.983154296875e-07, + "loss": 0.0023, + "reward": 1.6803425550460815, + "reward_std": 0.10987947136163712, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6803425848484039, + "step": 833 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.0703125, + "epoch": 0.4072265625, + "grad_norm": 2.2044595955803206, + "kl": 0.043701171875, + "learning_rate": 8.98193359375e-07, + "loss": 0.0017, + "reward": 1.7496721744537354, + "reward_std": 0.12323963642120361, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.757484644651413, + "step": 834 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.0546875, + "epoch": 0.40771484375, + "grad_norm": 3.3613379939285344, + "kl": 0.0556640625, + "learning_rate": 8.980712890625e-07, + "loss": 0.0022, + "reward": 1.6558890342712402, + "reward_std": 0.05606374144554138, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.655889093875885, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.203125, + "epoch": 0.408203125, + "grad_norm": 1.3604816411152902, + "kl": 0.0491943359375, + "learning_rate": 8.9794921875e-07, + "loss": 0.002, + "reward": 1.8900187015533447, + "reward_std": 0.06370699405670166, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8900187015533447, + "step": 836 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.5625, + "epoch": 0.40869140625, + "grad_norm": 2.3095956319983113, + "kl": 0.0531005859375, + "learning_rate": 8.978271484374999e-07, + "loss": 0.0021, + "reward": 1.5155598521232605, + "reward_std": 0.11974064260721207, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.5546222925186157, + "step": 837 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.6015625, + "epoch": 0.4091796875, + "grad_norm": 1.6895406595858493, + "kl": 0.05511474609375, + "learning_rate": 8.977050781249999e-07, + "loss": 0.0022, + "reward": 1.7988107204437256, + "reward_std": 0.04724998027086258, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7988106906414032, + "step": 838 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.4609375, + "epoch": 0.40966796875, + "grad_norm": 38.657918876895465, + "kl": 0.05810546875, + "learning_rate": 8.975830078124999e-07, + "loss": 0.0023, + "reward": 1.707718014717102, + "reward_std": 0.056786952540278435, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.707718014717102, + "step": 839 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.0546875, + "epoch": 0.41015625, + "grad_norm": 3.54984380280518, + "kl": 0.043701171875, + "learning_rate": 8.974609375e-07, + "loss": 0.0017, + "reward": 1.6362690329551697, + "reward_std": 0.046866053715348244, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6362690329551697, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.84375, + "epoch": 0.41064453125, + "grad_norm": 4.880454547688104, + "kl": 0.0343017578125, + "learning_rate": 8.973388671875e-07, + "loss": 0.0014, + "reward": 1.6713696718215942, + "reward_std": 0.07554645650088787, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6791820526123047, + "step": 841 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.75, + "epoch": 0.4111328125, + "grad_norm": 3.5422070331713558, + "kl": 0.047607421875, + "learning_rate": 8.97216796875e-07, + "loss": 0.0019, + "reward": 1.76535165309906, + "reward_std": 0.07970313355326653, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7731641829013824, + "step": 842 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.625, + "epoch": 0.41162109375, + "grad_norm": 1.6668469246862412, + "kl": 0.0472412109375, + "learning_rate": 8.970947265625e-07, + "loss": 0.0019, + "reward": 1.647118866443634, + "reward_std": 0.15605220571160316, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6861813068389893, + "step": 843 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.8125, + "epoch": 0.412109375, + "grad_norm": 1.4480209975960932, + "kl": 0.036865234375, + "learning_rate": 8.969726562499999e-07, + "loss": 0.0015, + "reward": 1.7060487270355225, + "reward_std": 0.19439689815044403, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7372986376285553, + "step": 844 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.375, + "epoch": 0.41259765625, + "grad_norm": 2.6692780266475387, + "kl": 0.05126953125, + "learning_rate": 8.968505859374999e-07, + "loss": 0.0021, + "reward": 1.7914454340934753, + "reward_std": 0.07013567723333836, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7914454638957977, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.234375, + "epoch": 0.4130859375, + "grad_norm": 1.4785225765927579, + "kl": 0.0511474609375, + "learning_rate": 8.96728515625e-07, + "loss": 0.002, + "reward": 1.6806397438049316, + "reward_std": 0.09639265388250351, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6884523034095764, + "step": 846 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.46875, + "epoch": 0.41357421875, + "grad_norm": 0.978438992258654, + "kl": 0.0433349609375, + "learning_rate": 8.966064453125e-07, + "loss": 0.0017, + "reward": 1.7947252988815308, + "reward_std": 0.12237262353301048, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8103502690792084, + "step": 847 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.859375, + "epoch": 0.4140625, + "grad_norm": 2.8553868780442206, + "kl": 0.053466796875, + "learning_rate": 8.96484375e-07, + "loss": 0.0021, + "reward": 1.7657200694084167, + "reward_std": 0.12218839675188065, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7891575396060944, + "step": 848 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.046875, + "epoch": 0.41455078125, + "grad_norm": 1.2336624642234275, + "kl": 0.06689453125, + "learning_rate": 8.963623046875e-07, + "loss": 0.0027, + "reward": 1.6515643000602722, + "reward_std": 0.09395093843340874, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.675001859664917, + "step": 849 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.1015625, + "epoch": 0.4150390625, + "grad_norm": 1.6078249545004046, + "kl": 0.058837890625, + "learning_rate": 8.96240234375e-07, + "loss": 0.0024, + "reward": 1.6375129222869873, + "reward_std": 0.09645092487335205, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6375128775835037, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.734375, + "epoch": 0.41552734375, + "grad_norm": 1.7476214032090949, + "kl": 0.0460205078125, + "learning_rate": 8.961181640624999e-07, + "loss": 0.0018, + "reward": 1.8477584719657898, + "reward_std": 0.020351408515125513, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8477585017681122, + "step": 851 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.203125, + "epoch": 0.416015625, + "grad_norm": 1.538198841160422, + "kl": 0.0655517578125, + "learning_rate": 8.959960937499999e-07, + "loss": 0.0026, + "reward": 1.731358528137207, + "reward_std": 0.09056920558214188, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.731358528137207, + "step": 852 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.8671875, + "epoch": 0.41650390625, + "grad_norm": 5.895599134478584, + "kl": 0.069091796875, + "learning_rate": 8.958740234375e-07, + "loss": 0.0028, + "reward": 1.6020338535308838, + "reward_std": 0.1456664614379406, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.617658793926239, + "step": 853 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.546875, + "epoch": 0.4169921875, + "grad_norm": 9.469721145412421, + "kl": 0.056396484375, + "learning_rate": 8.95751953125e-07, + "loss": 0.0023, + "reward": 1.7787832021713257, + "reward_std": 0.025636928156018257, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7787831723690033, + "step": 854 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.7265625, + "epoch": 0.41748046875, + "grad_norm": 1.667838887256776, + "kl": 0.0594482421875, + "learning_rate": 8.956298828125e-07, + "loss": 0.0024, + "reward": 1.8459165692329407, + "reward_std": 0.11014392226934433, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8537290096282959, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.84375, + "epoch": 0.41796875, + "grad_norm": 4.431806930390354, + "kl": 0.0511474609375, + "learning_rate": 8.955078125e-07, + "loss": 0.002, + "reward": 1.7043548822402954, + "reward_std": 0.10635066404938698, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.712167501449585, + "step": 856 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.4921875, + "epoch": 0.41845703125, + "grad_norm": 2.770248233041983, + "kl": 0.0433349609375, + "learning_rate": 8.953857421874999e-07, + "loss": 0.0017, + "reward": 1.7795735597610474, + "reward_std": 0.12282518297433853, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.795198529958725, + "step": 857 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.84375, + "epoch": 0.4189453125, + "grad_norm": 2.12159812965168, + "kl": 0.064697265625, + "learning_rate": 8.952636718749999e-07, + "loss": 0.0026, + "reward": 1.587533950805664, + "reward_std": 0.1150995921343565, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6344089508056641, + "step": 858 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.8828125, + "epoch": 0.41943359375, + "grad_norm": 1.8214435810008032, + "kl": 0.0404052734375, + "learning_rate": 8.951416015624999e-07, + "loss": 0.0016, + "reward": 1.7444366216659546, + "reward_std": 0.11982932686805725, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7600616216659546, + "step": 859 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.28125, + "epoch": 0.419921875, + "grad_norm": 1.745129886806224, + "kl": 0.0435791015625, + "learning_rate": 8.9501953125e-07, + "loss": 0.0017, + "reward": 1.84866863489151, + "reward_std": 0.039832524955272675, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8486685752868652, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.9609375, + "epoch": 0.42041015625, + "grad_norm": 2.169553869726486, + "kl": 0.0478515625, + "learning_rate": 8.948974609375e-07, + "loss": 0.0019, + "reward": 1.6781877279281616, + "reward_std": 0.11633714661002159, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7094376981258392, + "step": 861 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.546875, + "epoch": 0.4208984375, + "grad_norm": 2.890081333787121, + "kl": 0.05615234375, + "learning_rate": 8.94775390625e-07, + "loss": 0.0022, + "reward": 1.7455247640609741, + "reward_std": 0.09918822348117828, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7455247640609741, + "step": 862 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.1796875, + "epoch": 0.42138671875, + "grad_norm": 1.885623336317658, + "kl": 0.054443359375, + "learning_rate": 8.946533203125e-07, + "loss": 0.0022, + "reward": 1.7564916610717773, + "reward_std": 0.12973085790872574, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7721166908740997, + "step": 863 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.8359375, + "epoch": 0.421875, + "grad_norm": 10.017435172797613, + "kl": 0.1231689453125, + "learning_rate": 8.945312499999999e-07, + "loss": 0.0049, + "reward": 1.6419134140014648, + "reward_std": 0.15503490716218948, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6653508841991425, + "step": 864 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.3359375, + "epoch": 0.42236328125, + "grad_norm": 5.291462812901847, + "kl": 0.0546875, + "learning_rate": 8.944091796874999e-07, + "loss": 0.0022, + "reward": 1.619509994983673, + "reward_std": 0.1739499308168888, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6429474651813507, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.203125, + "epoch": 0.4228515625, + "grad_norm": 3.9430144083478615, + "kl": 0.0474853515625, + "learning_rate": 8.94287109375e-07, + "loss": 0.0019, + "reward": 1.7385854721069336, + "reward_std": 0.03805091604590416, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7385854721069336, + "step": 866 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.59375, + "epoch": 0.42333984375, + "grad_norm": 1.6092457486819296, + "kl": 0.04736328125, + "learning_rate": 8.941650390625e-07, + "loss": 0.0019, + "reward": 1.8048319220542908, + "reward_std": 0.06229471415281296, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8048319518566132, + "step": 867 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.0546875, + "epoch": 0.423828125, + "grad_norm": 1.9104697019990664, + "kl": 0.0382080078125, + "learning_rate": 8.9404296875e-07, + "loss": 0.0015, + "reward": 1.6721869707107544, + "reward_std": 0.2335866540670395, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7268744707107544, + "step": 868 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.390625, + "epoch": 0.42431640625, + "grad_norm": 1.4769192488084104, + "kl": 0.0577392578125, + "learning_rate": 8.939208984375e-07, + "loss": 0.0023, + "reward": 1.7876529693603516, + "reward_std": 0.12172145396471024, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8032780587673187, + "step": 869 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.2265625, + "epoch": 0.4248046875, + "grad_norm": 2.4912707719685816, + "kl": 0.063232421875, + "learning_rate": 8.937988281249999e-07, + "loss": 0.0025, + "reward": 1.7117069959640503, + "reward_std": 0.10087519139051437, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7195195257663727, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.890625, + "epoch": 0.42529296875, + "grad_norm": 2.904180822248156, + "kl": 0.0533447265625, + "learning_rate": 8.936767578124999e-07, + "loss": 0.0021, + "reward": 1.6977179646492004, + "reward_std": 0.061658382415771484, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.697717934846878, + "step": 871 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.3203125, + "epoch": 0.42578125, + "grad_norm": 3.9810543136057706, + "kl": 0.05419921875, + "learning_rate": 8.935546874999999e-07, + "loss": 0.0022, + "reward": 1.778558611869812, + "reward_std": 0.13054338097572327, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7941837012767792, + "step": 872 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.5, + "epoch": 0.42626953125, + "grad_norm": 2.638694510235747, + "kl": 0.0634765625, + "learning_rate": 8.934326171875e-07, + "loss": 0.0025, + "reward": 1.684062123298645, + "reward_std": 0.1073136255145073, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6918745934963226, + "step": 873 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.8359375, + "epoch": 0.4267578125, + "grad_norm": 1.0328136935198113, + "kl": 0.040283203125, + "learning_rate": 8.93310546875e-07, + "loss": 0.0016, + "reward": 1.7540948987007141, + "reward_std": 0.057166170328855515, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7540949583053589, + "step": 874 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.1953125, + "epoch": 0.42724609375, + "grad_norm": 7.360158040184964, + "kl": 0.0521240234375, + "learning_rate": 8.931884765625e-07, + "loss": 0.0021, + "reward": 1.7804943919181824, + "reward_std": 0.02563006430864334, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7804943323135376, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.265625, + "epoch": 0.427734375, + "grad_norm": 2.550928083120494, + "kl": 0.070556640625, + "learning_rate": 8.9306640625e-07, + "loss": 0.0028, + "reward": 1.721437394618988, + "reward_std": 0.09169731847941875, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.721437394618988, + "step": 876 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.578125, + "epoch": 0.42822265625, + "grad_norm": 4.314294285455361, + "kl": 0.0531005859375, + "learning_rate": 8.929443359374999e-07, + "loss": 0.0021, + "reward": 1.7071447968482971, + "reward_std": 0.12683348171412945, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7227697968482971, + "step": 877 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.53125, + "epoch": 0.4287109375, + "grad_norm": 7.896509785870648, + "kl": 0.0496826171875, + "learning_rate": 8.928222656249999e-07, + "loss": 0.002, + "reward": 1.698940396308899, + "reward_std": 0.06823573168367147, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6989404261112213, + "step": 878 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.1953125, + "epoch": 0.42919921875, + "grad_norm": 4.419894880080495, + "kl": 0.0576171875, + "learning_rate": 8.927001953125e-07, + "loss": 0.0023, + "reward": 1.6944845914840698, + "reward_std": 0.13580431789159775, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7022970914840698, + "step": 879 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.640625, + "epoch": 0.4296875, + "grad_norm": 3.1624664713185777, + "kl": 0.0535888671875, + "learning_rate": 8.92578125e-07, + "loss": 0.0021, + "reward": 1.669293999671936, + "reward_std": 0.1342175379395485, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.669293999671936, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.4453125, + "epoch": 0.43017578125, + "grad_norm": 2.394405821979668, + "kl": 0.0677490234375, + "learning_rate": 8.924560546875e-07, + "loss": 0.0027, + "reward": 1.7102238535881042, + "reward_std": 0.07026012241840363, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7180363833904266, + "step": 881 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.7578125, + "epoch": 0.4306640625, + "grad_norm": 1.9581328307353232, + "kl": 0.0467529296875, + "learning_rate": 8.92333984375e-07, + "loss": 0.0019, + "reward": 1.7540261149406433, + "reward_std": 0.10421252250671387, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7618384957313538, + "step": 882 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.375, + "epoch": 0.43115234375, + "grad_norm": 2.6355277149440064, + "kl": 0.0487060546875, + "learning_rate": 8.922119140624999e-07, + "loss": 0.0019, + "reward": 1.5595695972442627, + "reward_std": 0.058571480214595795, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5595695376396179, + "step": 883 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.453125, + "epoch": 0.431640625, + "grad_norm": 2.4619908418967618, + "kl": 0.0504150390625, + "learning_rate": 8.920898437499999e-07, + "loss": 0.002, + "reward": 1.718446969985962, + "reward_std": 0.15201827883720398, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7496969699859619, + "step": 884 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.796875, + "epoch": 0.43212890625, + "grad_norm": 1.9043119997675124, + "kl": 0.0589599609375, + "learning_rate": 8.919677734374999e-07, + "loss": 0.0024, + "reward": 1.6135079860687256, + "reward_std": 0.0632172767072916, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6135080456733704, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.9453125, + "epoch": 0.4326171875, + "grad_norm": 2.828936256887094, + "kl": 0.0567626953125, + "learning_rate": 8.91845703125e-07, + "loss": 0.0023, + "reward": 1.8562658429145813, + "reward_std": 0.043327707797288895, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8562657833099365, + "step": 886 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.5703125, + "epoch": 0.43310546875, + "grad_norm": 3.794004727125142, + "kl": 0.061767578125, + "learning_rate": 8.917236328125e-07, + "loss": 0.0025, + "reward": 1.6978505849838257, + "reward_std": 0.09938319772481918, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6978505551815033, + "step": 887 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.9375, + "epoch": 0.43359375, + "grad_norm": 1.7545127538372571, + "kl": 0.050048828125, + "learning_rate": 8.916015625e-07, + "loss": 0.002, + "reward": 1.7462196350097656, + "reward_std": 0.08687572181224823, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7540321350097656, + "step": 888 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.984375, + "epoch": 0.43408203125, + "grad_norm": 4.59557739433223, + "kl": 0.053466796875, + "learning_rate": 8.914794921875e-07, + "loss": 0.0021, + "reward": 1.664870023727417, + "reward_std": 0.13343672454357147, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.680495023727417, + "step": 889 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.6796875, + "epoch": 0.4345703125, + "grad_norm": 3.0527472360411747, + "kl": 0.0570068359375, + "learning_rate": 8.913574218749999e-07, + "loss": 0.0023, + "reward": 1.633752703666687, + "reward_std": 0.2018553614616394, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.657190203666687, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.8671875, + "epoch": 0.43505859375, + "grad_norm": 5.7762219418155025, + "kl": 0.05517578125, + "learning_rate": 8.912353515624999e-07, + "loss": 0.0022, + "reward": 1.684194028377533, + "reward_std": 0.09600569307804108, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6841940879821777, + "step": 891 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.671875, + "epoch": 0.435546875, + "grad_norm": 3.3065383732267257, + "kl": 0.0523681640625, + "learning_rate": 8.9111328125e-07, + "loss": 0.0021, + "reward": 1.6547590494155884, + "reward_std": 0.19414672255516052, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7094466388225555, + "step": 892 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.4921875, + "epoch": 0.43603515625, + "grad_norm": 1.4003093807212736, + "kl": 0.063720703125, + "learning_rate": 8.909912109375e-07, + "loss": 0.0025, + "reward": 1.7039158940315247, + "reward_std": 0.11482829600572586, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7117283940315247, + "step": 893 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.6640625, + "epoch": 0.4365234375, + "grad_norm": 1.3160510217088606, + "kl": 0.0574951171875, + "learning_rate": 8.90869140625e-07, + "loss": 0.0023, + "reward": 1.833851397037506, + "reward_std": 0.026355454698204994, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8338513970375061, + "step": 894 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.1171875, + "epoch": 0.43701171875, + "grad_norm": 1.099488871794088, + "kl": 0.0526123046875, + "learning_rate": 8.907470703125e-07, + "loss": 0.0021, + "reward": 1.735145926475525, + "reward_std": 0.050868917256593704, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7351458072662354, + "step": 895 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.3515625, + "epoch": 0.4375, + "grad_norm": 10.520352255307145, + "kl": 0.049560546875, + "learning_rate": 8.906249999999999e-07, + "loss": 0.002, + "reward": 1.5713690519332886, + "reward_std": 0.17703481018543243, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5869940519332886, + "step": 896 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.0, + "epoch": 0.43798828125, + "grad_norm": 4.9102977822696525, + "kl": 0.059814453125, + "learning_rate": 8.905029296874999e-07, + "loss": 0.0024, + "reward": 1.6745886206626892, + "reward_std": 0.056372467428445816, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6745886504650116, + "step": 897 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.609375, + "epoch": 0.4384765625, + "grad_norm": 2.850983593865307, + "kl": 0.04638671875, + "learning_rate": 8.903808593749999e-07, + "loss": 0.0019, + "reward": 1.7233811616897583, + "reward_std": 0.07739730924367905, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7311936020851135, + "step": 898 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.828125, + "epoch": 0.43896484375, + "grad_norm": 2.0785423808690977, + "kl": 0.0546875, + "learning_rate": 8.902587890625e-07, + "loss": 0.0022, + "reward": 1.764865517616272, + "reward_std": 0.06689143739640713, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7648655772209167, + "step": 899 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.4765625, + "epoch": 0.439453125, + "grad_norm": 3.4689071084431946, + "kl": 0.0462646484375, + "learning_rate": 8.9013671875e-07, + "loss": 0.0018, + "reward": 1.6617870926856995, + "reward_std": 0.1315966732800007, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6852246224880219, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.421875, + "epoch": 0.43994140625, + "grad_norm": 1.5582536488441514, + "kl": 0.0484619140625, + "learning_rate": 8.900146484375e-07, + "loss": 0.0019, + "reward": 1.6939795017242432, + "reward_std": 0.1498698815703392, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7252295911312103, + "step": 901 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.890625, + "epoch": 0.4404296875, + "grad_norm": 3.2999043304034026, + "kl": 0.0615234375, + "learning_rate": 8.89892578125e-07, + "loss": 0.0025, + "reward": 1.6509242057800293, + "reward_std": 0.10151878371834755, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6509242355823517, + "step": 902 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.03125, + "epoch": 0.44091796875, + "grad_norm": 16.23235566949286, + "kl": 0.0509033203125, + "learning_rate": 8.897705078124999e-07, + "loss": 0.002, + "reward": 1.7497307658195496, + "reward_std": 0.06852127611637115, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7497306764125824, + "step": 903 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.1875, + "epoch": 0.44140625, + "grad_norm": 2.539616901726096, + "kl": 0.0562744140625, + "learning_rate": 8.896484374999999e-07, + "loss": 0.0023, + "reward": 1.6935822367668152, + "reward_std": 0.14617926999926567, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7170197069644928, + "step": 904 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.1015625, + "epoch": 0.44189453125, + "grad_norm": 1.9069482845643857, + "kl": 0.050048828125, + "learning_rate": 8.895263671875e-07, + "loss": 0.002, + "reward": 1.6976945996284485, + "reward_std": 0.10638157278299332, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7055070698261261, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.7734375, + "epoch": 0.4423828125, + "grad_norm": 3.849069538906998, + "kl": 0.0516357421875, + "learning_rate": 8.89404296875e-07, + "loss": 0.0021, + "reward": 1.7132678627967834, + "reward_std": 0.138364490121603, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7132679224014282, + "step": 906 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.0390625, + "epoch": 0.44287109375, + "grad_norm": 2.305802120564667, + "kl": 0.0523681640625, + "learning_rate": 8.892822265625e-07, + "loss": 0.0021, + "reward": 1.7869673371315002, + "reward_std": 0.07257464155554771, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7869673371315002, + "step": 907 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.8671875, + "epoch": 0.443359375, + "grad_norm": 26.058746364247334, + "kl": 0.055908203125, + "learning_rate": 8.8916015625e-07, + "loss": 0.0022, + "reward": 1.628940463066101, + "reward_std": 0.13616503030061722, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6445655226707458, + "step": 908 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.578125, + "epoch": 0.44384765625, + "grad_norm": 2.122149008743692, + "kl": 0.048095703125, + "learning_rate": 8.890380859374999e-07, + "loss": 0.0019, + "reward": 1.7516308426856995, + "reward_std": 0.0674322908744216, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7594433426856995, + "step": 909 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.0546875, + "epoch": 0.4443359375, + "grad_norm": 4.580725380018095, + "kl": 0.0489501953125, + "learning_rate": 8.889160156249999e-07, + "loss": 0.002, + "reward": 1.730670690536499, + "reward_std": 0.11633214727044106, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7462956309318542, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.3828125, + "epoch": 0.44482421875, + "grad_norm": 1.320650252057893, + "kl": 0.0423583984375, + "learning_rate": 8.887939453124999e-07, + "loss": 0.0017, + "reward": 1.7391434907913208, + "reward_std": 0.19686751067638397, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7547684013843536, + "step": 911 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.6328125, + "epoch": 0.4453125, + "grad_norm": 1.4523590967417215, + "kl": 0.0565185546875, + "learning_rate": 8.88671875e-07, + "loss": 0.0023, + "reward": 1.5424267649650574, + "reward_std": 0.07458284497261047, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.550239235162735, + "step": 912 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.3515625, + "epoch": 0.44580078125, + "grad_norm": 1.719086973866284, + "kl": 0.052734375, + "learning_rate": 8.885498046875e-07, + "loss": 0.0021, + "reward": 1.642267882823944, + "reward_std": 0.12693988159298897, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6657053828239441, + "step": 913 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.3359375, + "epoch": 0.4462890625, + "grad_norm": 3.2810083273339217, + "kl": 0.0501708984375, + "learning_rate": 8.88427734375e-07, + "loss": 0.002, + "reward": 1.6578189134597778, + "reward_std": 0.14238969795405865, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6890688836574554, + "step": 914 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.0546875, + "epoch": 0.44677734375, + "grad_norm": 2.701614835256679, + "kl": 0.0416259765625, + "learning_rate": 8.883056640625e-07, + "loss": 0.0017, + "reward": 1.8191250562667847, + "reward_std": 0.08679736405611038, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8191250264644623, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.2890625, + "epoch": 0.447265625, + "grad_norm": 4.414560910867886, + "kl": 0.0560302734375, + "learning_rate": 8.881835937499999e-07, + "loss": 0.0022, + "reward": 1.700971245765686, + "reward_std": 0.0659741573035717, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7009712755680084, + "step": 916 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.6953125, + "epoch": 0.44775390625, + "grad_norm": 1.7777346701649772, + "kl": 0.041259765625, + "learning_rate": 8.880615234374999e-07, + "loss": 0.0017, + "reward": 1.7923877239227295, + "reward_std": 0.052391206845641136, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7923877835273743, + "step": 917 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.9921875, + "epoch": 0.4482421875, + "grad_norm": 0.9486222650729225, + "kl": 0.0435791015625, + "learning_rate": 8.87939453125e-07, + "loss": 0.0017, + "reward": 1.6959292888641357, + "reward_std": 0.02422085404396057, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6959293782711029, + "step": 918 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.7734375, + "epoch": 0.44873046875, + "grad_norm": 1.6490253942008113, + "kl": 0.052490234375, + "learning_rate": 8.878173828125e-07, + "loss": 0.0021, + "reward": 1.5897186398506165, + "reward_std": 0.13733144104480743, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5975310802459717, + "step": 919 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.1015625, + "epoch": 0.44921875, + "grad_norm": 1.4659485502265033, + "kl": 0.0418701171875, + "learning_rate": 8.876953125e-07, + "loss": 0.0017, + "reward": 1.6723748445510864, + "reward_std": 0.09266382362693548, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6723748296499252, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.0234375, + "epoch": 0.44970703125, + "grad_norm": 2.0567450128282423, + "kl": 0.0418701171875, + "learning_rate": 8.875732421875e-07, + "loss": 0.0017, + "reward": 1.635401725769043, + "reward_std": 0.11173927411437035, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6432141959667206, + "step": 921 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.125, + "epoch": 0.4501953125, + "grad_norm": 6.406156350449351, + "kl": 0.041748046875, + "learning_rate": 8.874511718749999e-07, + "loss": 0.0017, + "reward": 1.7291421294212341, + "reward_std": 0.051562756299972534, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7291421294212341, + "step": 922 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.4765625, + "epoch": 0.45068359375, + "grad_norm": 2.9529489375072915, + "kl": 0.0458984375, + "learning_rate": 8.873291015624999e-07, + "loss": 0.0018, + "reward": 1.7142540216445923, + "reward_std": 0.157925084233284, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7298789620399475, + "step": 923 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.0546875, + "epoch": 0.451171875, + "grad_norm": 0.9225384580987055, + "kl": 0.059326171875, + "learning_rate": 8.872070312499999e-07, + "loss": 0.0024, + "reward": 1.7622966170310974, + "reward_std": 0.06992994248867035, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7701090574264526, + "step": 924 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.0703125, + "epoch": 0.45166015625, + "grad_norm": 0.8543291431386741, + "kl": 0.0465087890625, + "learning_rate": 8.870849609375e-07, + "loss": 0.0019, + "reward": 1.664566159248352, + "reward_std": 0.04729248210787773, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.664566159248352, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.671875, + "epoch": 0.4521484375, + "grad_norm": 4.361104873897493, + "kl": 0.051025390625, + "learning_rate": 8.86962890625e-07, + "loss": 0.002, + "reward": 1.6637241840362549, + "reward_std": 0.06688250973820686, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6637241840362549, + "step": 926 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.9765625, + "epoch": 0.45263671875, + "grad_norm": 1.321086242390277, + "kl": 0.0428466796875, + "learning_rate": 8.868408203125e-07, + "loss": 0.0017, + "reward": 1.609758734703064, + "reward_std": 0.1022709459066391, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6331963092088699, + "step": 927 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.4140625, + "epoch": 0.453125, + "grad_norm": 4.639254611969434, + "kl": 0.0499267578125, + "learning_rate": 8.8671875e-07, + "loss": 0.002, + "reward": 1.7151271104812622, + "reward_std": 0.056340851821005344, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7151271104812622, + "step": 928 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.8671875, + "epoch": 0.45361328125, + "grad_norm": 9.59117348810988, + "kl": 0.044921875, + "learning_rate": 8.865966796874999e-07, + "loss": 0.0018, + "reward": 1.7646648287773132, + "reward_std": 0.08957374095916748, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7724774181842804, + "step": 929 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.25, + "epoch": 0.4541015625, + "grad_norm": 1.774508659785477, + "kl": 0.0496826171875, + "learning_rate": 8.864746093749999e-07, + "loss": 0.002, + "reward": 1.6190925240516663, + "reward_std": 0.1272077076137066, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6269050240516663, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.6953125, + "epoch": 0.45458984375, + "grad_norm": 1.9603229874947314, + "kl": 0.0439453125, + "learning_rate": 8.863525390625e-07, + "loss": 0.0018, + "reward": 1.7904832363128662, + "reward_std": 0.0836594682186842, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7982957363128662, + "step": 931 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.421875, + "epoch": 0.455078125, + "grad_norm": 3.3693206055984968, + "kl": 0.0513916015625, + "learning_rate": 8.8623046875e-07, + "loss": 0.0021, + "reward": 1.662086844444275, + "reward_std": 0.10136513970792294, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6698993444442749, + "step": 932 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.234375, + "epoch": 0.45556640625, + "grad_norm": 2.867780998149891, + "kl": 0.0440673828125, + "learning_rate": 8.861083984375e-07, + "loss": 0.0018, + "reward": 1.7205028533935547, + "reward_std": 0.11783993989229202, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7205029428005219, + "step": 933 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.390625, + "epoch": 0.4560546875, + "grad_norm": 3.335016088362262, + "kl": 0.0521240234375, + "learning_rate": 8.85986328125e-07, + "loss": 0.0021, + "reward": 1.6430580615997314, + "reward_std": 0.1293087601661682, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6664955615997314, + "step": 934 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.234375, + "epoch": 0.45654296875, + "grad_norm": 3.2388328242370283, + "kl": 0.0621337890625, + "learning_rate": 8.858642578124999e-07, + "loss": 0.0025, + "reward": 1.7366413474082947, + "reward_std": 0.06143258325755596, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7366413474082947, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.390625, + "epoch": 0.45703125, + "grad_norm": 6.307742284245101, + "kl": 0.044921875, + "learning_rate": 8.857421874999999e-07, + "loss": 0.0018, + "reward": 1.8411588072776794, + "reward_std": 0.09207257255911827, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8411588072776794, + "step": 936 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.828125, + "epoch": 0.45751953125, + "grad_norm": 1.8548370757528454, + "kl": 0.0482177734375, + "learning_rate": 8.856201171874999e-07, + "loss": 0.0019, + "reward": 1.673714518547058, + "reward_std": 0.12045683711767197, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6737145185470581, + "step": 937 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.4453125, + "epoch": 0.4580078125, + "grad_norm": 2.9790179157150005, + "kl": 0.0499267578125, + "learning_rate": 8.85498046875e-07, + "loss": 0.002, + "reward": 1.758617639541626, + "reward_std": 0.06485863775014877, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7586176097393036, + "step": 938 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.4375, + "epoch": 0.45849609375, + "grad_norm": 2.1066399082293747, + "kl": 0.0489501953125, + "learning_rate": 8.853759765625e-07, + "loss": 0.002, + "reward": 1.6640775203704834, + "reward_std": 0.08030284568667412, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6640775203704834, + "step": 939 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.6640625, + "epoch": 0.458984375, + "grad_norm": 29.994085492419032, + "kl": 0.0548095703125, + "learning_rate": 8.8525390625e-07, + "loss": 0.0022, + "reward": 1.742246389389038, + "reward_std": 0.11800673604011536, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7500588893890381, + "step": 940 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.8203125, + "epoch": 0.45947265625, + "grad_norm": 2.0008366557399246, + "kl": 0.057861328125, + "learning_rate": 8.851318359375e-07, + "loss": 0.0023, + "reward": 1.8033297061920166, + "reward_std": 0.15194324404001236, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8267672061920166, + "step": 941 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.8125, + "epoch": 0.4599609375, + "grad_norm": 1.242202564610936, + "kl": 0.04638671875, + "learning_rate": 8.850097656249999e-07, + "loss": 0.0019, + "reward": 1.5236690640449524, + "reward_std": 0.09883632883429527, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5471065491437912, + "step": 942 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.1484375, + "epoch": 0.46044921875, + "grad_norm": 2.723081001967419, + "kl": 0.0477294921875, + "learning_rate": 8.848876953124999e-07, + "loss": 0.0019, + "reward": 1.7074534893035889, + "reward_std": 0.08867547661066055, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7152659893035889, + "step": 943 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.8203125, + "epoch": 0.4609375, + "grad_norm": 2.9268520095350783, + "kl": 0.0587158203125, + "learning_rate": 8.84765625e-07, + "loss": 0.0023, + "reward": 1.7078036665916443, + "reward_std": 0.04931185767054558, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7078036367893219, + "step": 944 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.390625, + "epoch": 0.46142578125, + "grad_norm": 2.101619140746079, + "kl": 0.044921875, + "learning_rate": 8.846435546875e-07, + "loss": 0.0018, + "reward": 1.7718039155006409, + "reward_std": 0.051894426345825195, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7718039155006409, + "step": 945 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.5546875, + "epoch": 0.4619140625, + "grad_norm": 0.8404176478040042, + "kl": 0.0435791015625, + "learning_rate": 8.84521484375e-07, + "loss": 0.0017, + "reward": 1.7640173435211182, + "reward_std": 0.07772124605253339, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7718298435211182, + "step": 946 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.078125, + "epoch": 0.46240234375, + "grad_norm": 1.1816393651871933, + "kl": 0.058837890625, + "learning_rate": 8.843994140625e-07, + "loss": 0.0024, + "reward": 1.8239731788635254, + "reward_std": 0.0738510899245739, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8317857086658478, + "step": 947 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.6796875, + "epoch": 0.462890625, + "grad_norm": 2.4437099072575146, + "kl": 0.0467529296875, + "learning_rate": 8.8427734375e-07, + "loss": 0.0019, + "reward": 1.8321685194969177, + "reward_std": 0.06024608574807644, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8321685194969177, + "step": 948 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.265625, + "epoch": 0.46337890625, + "grad_norm": 1.6390280058599591, + "kl": 0.064697265625, + "learning_rate": 8.841552734374999e-07, + "loss": 0.0026, + "reward": 1.795024573802948, + "reward_std": 0.08007996901869774, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7950246036052704, + "step": 949 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.3671875, + "epoch": 0.4638671875, + "grad_norm": 7.952462151127041, + "kl": 0.0531005859375, + "learning_rate": 8.840332031249999e-07, + "loss": 0.0021, + "reward": 1.6938014030456543, + "reward_std": 0.0921289250254631, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6938014030456543, + "step": 950 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.28125, + "epoch": 0.46435546875, + "grad_norm": 2.6547911721421067, + "kl": 0.0528564453125, + "learning_rate": 8.839111328125e-07, + "loss": 0.0021, + "reward": 1.7591851353645325, + "reward_std": 0.14495818316936493, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7826226055622101, + "step": 951 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.2265625, + "epoch": 0.46484375, + "grad_norm": 35.96771698306349, + "kl": 0.1925048828125, + "learning_rate": 8.837890625e-07, + "loss": 0.0077, + "reward": 1.7612931728363037, + "reward_std": 0.1344320885837078, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7847306728363037, + "step": 952 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.1015625, + "epoch": 0.46533203125, + "grad_norm": 15.788401904321928, + "kl": 0.038330078125, + "learning_rate": 8.836669921875e-07, + "loss": 0.0015, + "reward": 1.7165034413337708, + "reward_std": 0.08753632940351963, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.724315881729126, + "step": 953 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.203125, + "epoch": 0.4658203125, + "grad_norm": 1.8975472946536276, + "kl": 0.0462646484375, + "learning_rate": 8.83544921875e-07, + "loss": 0.0018, + "reward": 1.653282880783081, + "reward_std": 0.14220409467816353, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.668907880783081, + "step": 954 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.84375, + "epoch": 0.46630859375, + "grad_norm": 0.9088916050596484, + "kl": 0.0372314453125, + "learning_rate": 8.834228515624999e-07, + "loss": 0.0015, + "reward": 1.8434149026870728, + "reward_std": 0.028221886605024338, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8434148728847504, + "step": 955 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.3203125, + "epoch": 0.466796875, + "grad_norm": 2.81285479890254, + "kl": 0.060546875, + "learning_rate": 8.833007812499999e-07, + "loss": 0.0024, + "reward": 1.6376739144325256, + "reward_std": 0.12609807774424553, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6454864144325256, + "step": 956 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.2421875, + "epoch": 0.46728515625, + "grad_norm": 0.7538661853407663, + "kl": 0.0445556640625, + "learning_rate": 8.831787109374999e-07, + "loss": 0.0018, + "reward": 1.6063887476921082, + "reward_std": 0.09519611299037933, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6376387178897858, + "step": 957 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.9453125, + "epoch": 0.4677734375, + "grad_norm": 1.6060344661519697, + "kl": 0.0537109375, + "learning_rate": 8.83056640625e-07, + "loss": 0.0021, + "reward": 1.572835922241211, + "reward_std": 0.2594187408685684, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.6431483775377274, + "step": 958 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.53125, + "epoch": 0.46826171875, + "grad_norm": 3.1560722562556003, + "kl": 0.049072265625, + "learning_rate": 8.829345703125e-07, + "loss": 0.002, + "reward": 1.6071619987487793, + "reward_std": 0.10899307206273079, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6227870583534241, + "step": 959 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.1015625, + "epoch": 0.46875, + "grad_norm": 2.778150778773273, + "kl": 0.0599365234375, + "learning_rate": 8.828125e-07, + "loss": 0.0024, + "reward": 1.7072476148605347, + "reward_std": 0.03206057846546173, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7072476148605347, + "step": 960 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.4453125, + "epoch": 0.46923828125, + "grad_norm": 2.505049331757069, + "kl": 0.0565185546875, + "learning_rate": 8.826904296875e-07, + "loss": 0.0023, + "reward": 1.7165246605873108, + "reward_std": 0.11047841422259808, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7477746307849884, + "step": 961 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.125, + "epoch": 0.4697265625, + "grad_norm": 2.3133113705274453, + "kl": 0.058349609375, + "learning_rate": 8.825683593749999e-07, + "loss": 0.0023, + "reward": 1.663506269454956, + "reward_std": 0.14057481661438942, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6947563290596008, + "step": 962 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.5078125, + "epoch": 0.47021484375, + "grad_norm": 1.694135344638883, + "kl": 0.05712890625, + "learning_rate": 8.824462890624999e-07, + "loss": 0.0023, + "reward": 1.8244240880012512, + "reward_std": 0.05446392297744751, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8244240880012512, + "step": 963 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.609375, + "epoch": 0.470703125, + "grad_norm": 2.6371365345290045, + "kl": 0.0440673828125, + "learning_rate": 8.8232421875e-07, + "loss": 0.0018, + "reward": 1.7416203618049622, + "reward_std": 0.12130639143288136, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7650578618049622, + "step": 964 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.6484375, + "epoch": 0.47119140625, + "grad_norm": 2.218591135498477, + "kl": 0.04278564453125, + "learning_rate": 8.822021484375e-07, + "loss": 0.0017, + "reward": 1.6755053400993347, + "reward_std": 0.14992902055382729, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.7458178997039795, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.3359375, + "epoch": 0.4716796875, + "grad_norm": 1.303009090371895, + "kl": 0.061279296875, + "learning_rate": 8.82080078125e-07, + "loss": 0.0025, + "reward": 1.6562331914901733, + "reward_std": 0.08586933836340904, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6718582212924957, + "step": 966 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.40625, + "epoch": 0.47216796875, + "grad_norm": 9.095150959885645, + "kl": 0.0653076171875, + "learning_rate": 8.819580078125e-07, + "loss": 0.0026, + "reward": 1.7505657076835632, + "reward_std": 0.04661328159272671, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.750565767288208, + "step": 967 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.2890625, + "epoch": 0.47265625, + "grad_norm": 2.6818988160026658, + "kl": 0.05078125, + "learning_rate": 8.818359374999999e-07, + "loss": 0.002, + "reward": 1.7254713773727417, + "reward_std": 0.2372339516878128, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7645338177680969, + "step": 968 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.75, + "epoch": 0.47314453125, + "grad_norm": 2.2365369221179696, + "kl": 0.062744140625, + "learning_rate": 8.817138671874999e-07, + "loss": 0.0025, + "reward": 1.7773956656455994, + "reward_std": 0.0699392519891262, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7773956060409546, + "step": 969 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.4921875, + "epoch": 0.4736328125, + "grad_norm": 3.807273507290948, + "kl": 0.0625, + "learning_rate": 8.815917968749999e-07, + "loss": 0.0025, + "reward": 1.6723923683166504, + "reward_std": 0.1734137311577797, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.680204838514328, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.5625, + "epoch": 0.47412109375, + "grad_norm": 0.8599955165133477, + "kl": 0.0482177734375, + "learning_rate": 8.814697265625e-07, + "loss": 0.0019, + "reward": 1.854150652885437, + "reward_std": 0.06166762858629227, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.854150652885437, + "step": 971 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.28125, + "epoch": 0.474609375, + "grad_norm": 4.899318801004212, + "kl": 0.043701171875, + "learning_rate": 8.8134765625e-07, + "loss": 0.0017, + "reward": 1.817629873752594, + "reward_std": 0.09293503686785698, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8254423439502716, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.8203125, + "epoch": 0.47509765625, + "grad_norm": 4.869813713992535, + "kl": 0.0604248046875, + "learning_rate": 8.812255859375e-07, + "loss": 0.0024, + "reward": 1.7895490527153015, + "reward_std": 0.06284810416400433, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7895489931106567, + "step": 973 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.2265625, + "epoch": 0.4755859375, + "grad_norm": 2.71150507007993, + "kl": 0.0546875, + "learning_rate": 8.81103515625e-07, + "loss": 0.0022, + "reward": 1.8050071597099304, + "reward_std": 0.11932638473808765, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8206321597099304, + "step": 974 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.890625, + "epoch": 0.47607421875, + "grad_norm": 2.495906498649002, + "kl": 0.0635986328125, + "learning_rate": 8.809814453124999e-07, + "loss": 0.0025, + "reward": 1.6766229271888733, + "reward_std": 0.06276751309633255, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6766228377819061, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.2890625, + "epoch": 0.4765625, + "grad_norm": 2.0928693525491595, + "kl": 0.048828125, + "learning_rate": 8.808593749999999e-07, + "loss": 0.002, + "reward": 1.6983801126480103, + "reward_std": 0.11064053699374199, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7140050232410431, + "step": 976 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.6796875, + "epoch": 0.47705078125, + "grad_norm": 1.9579863122715575, + "kl": 0.0552978515625, + "learning_rate": 8.807373046875e-07, + "loss": 0.0022, + "reward": 1.6679657697677612, + "reward_std": 0.05069480650126934, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6679657995700836, + "step": 977 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.921875, + "epoch": 0.4775390625, + "grad_norm": 1.0533835681539812, + "kl": 0.0489501953125, + "learning_rate": 8.80615234375e-07, + "loss": 0.002, + "reward": 1.7244895100593567, + "reward_std": 0.11370780691504478, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7401144802570343, + "step": 978 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.9609375, + "epoch": 0.47802734375, + "grad_norm": 0.9675050638605003, + "kl": 0.038330078125, + "learning_rate": 8.804931640625e-07, + "loss": 0.0015, + "reward": 1.7015687227249146, + "reward_std": 0.1075466200709343, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7171937227249146, + "step": 979 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.4140625, + "epoch": 0.478515625, + "grad_norm": 1.5989887143012946, + "kl": 0.0556640625, + "learning_rate": 8.8037109375e-07, + "loss": 0.0022, + "reward": 1.647861123085022, + "reward_std": 0.08899911493062973, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.647861123085022, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.03125, + "epoch": 0.47900390625, + "grad_norm": 3.564295998020266, + "kl": 0.047119140625, + "learning_rate": 8.802490234374999e-07, + "loss": 0.0019, + "reward": 1.699999213218689, + "reward_std": 0.20631136745214462, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.723436713218689, + "step": 981 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.4765625, + "epoch": 0.4794921875, + "grad_norm": 16.58975911359156, + "kl": 0.0689697265625, + "learning_rate": 8.801269531249999e-07, + "loss": 0.0028, + "reward": 1.6581519842147827, + "reward_std": 0.06756994873285294, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6581519246101379, + "step": 982 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.953125, + "epoch": 0.47998046875, + "grad_norm": 2.7300942691577283, + "kl": 0.05859375, + "learning_rate": 8.800048828124999e-07, + "loss": 0.0023, + "reward": 1.7354393601417542, + "reward_std": 0.050481900572776794, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7354393303394318, + "step": 983 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.8671875, + "epoch": 0.48046875, + "grad_norm": 4.98249531747594, + "kl": 0.0531005859375, + "learning_rate": 8.798828125e-07, + "loss": 0.0021, + "reward": 1.7789223194122314, + "reward_std": 0.10021020472049713, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7789223790168762, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.734375, + "epoch": 0.48095703125, + "grad_norm": 2.6662511183569135, + "kl": 0.0576171875, + "learning_rate": 8.797607421875e-07, + "loss": 0.0023, + "reward": 1.7876138091087341, + "reward_std": 0.09649738110601902, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7954262793064117, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.53125, + "epoch": 0.4814453125, + "grad_norm": 1.910422280476843, + "kl": 0.0419921875, + "learning_rate": 8.79638671875e-07, + "loss": 0.0017, + "reward": 1.7141448259353638, + "reward_std": 0.129779651761055, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7297699153423309, + "step": 986 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.3359375, + "epoch": 0.48193359375, + "grad_norm": 4.7292076134081364, + "kl": 0.058837890625, + "learning_rate": 8.795166015625e-07, + "loss": 0.0023, + "reward": 1.5576480627059937, + "reward_std": 0.08005259186029434, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5576481074094772, + "step": 987 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.90625, + "epoch": 0.482421875, + "grad_norm": 88.09217055784015, + "kl": 0.0555419921875, + "learning_rate": 8.793945312499999e-07, + "loss": 0.0022, + "reward": 1.7375428676605225, + "reward_std": 0.055084478110075, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7375428080558777, + "step": 988 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.5390625, + "epoch": 0.48291015625, + "grad_norm": 2.034132921180751, + "kl": 0.04638671875, + "learning_rate": 8.792724609374999e-07, + "loss": 0.0019, + "reward": 1.5709097981452942, + "reward_std": 0.16416695713996887, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.602159857749939, + "step": 989 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.984375, + "epoch": 0.4833984375, + "grad_norm": 2.0802002056866566, + "kl": 0.0771484375, + "learning_rate": 8.79150390625e-07, + "loss": 0.0031, + "reward": 1.762831211090088, + "reward_std": 0.13553397357463837, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7862686514854431, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.3203125, + "epoch": 0.48388671875, + "grad_norm": 2.7772153375663637, + "kl": 0.091552734375, + "learning_rate": 8.790283203125e-07, + "loss": 0.0037, + "reward": 1.6474461555480957, + "reward_std": 0.05931936576962471, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6474461555480957, + "step": 991 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.5703125, + "epoch": 0.484375, + "grad_norm": 6.876528890436644, + "kl": 0.072265625, + "learning_rate": 8.7890625e-07, + "loss": 0.0029, + "reward": 1.698776364326477, + "reward_std": 0.12324061989784241, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.714401364326477, + "step": 992 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.3359375, + "epoch": 0.48486328125, + "grad_norm": 1.9851363650800362, + "kl": 0.0592041015625, + "learning_rate": 8.787841796875e-07, + "loss": 0.0024, + "reward": 1.696526050567627, + "reward_std": 0.10686031728982925, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7199635207653046, + "step": 993 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.546875, + "epoch": 0.4853515625, + "grad_norm": 0.6005806822321215, + "kl": 0.039794921875, + "learning_rate": 8.786621093749999e-07, + "loss": 0.0016, + "reward": 1.7581510543823242, + "reward_std": 0.02178693562746048, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7581509947776794, + "step": 994 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.90625, + "epoch": 0.48583984375, + "grad_norm": 1.2977429398294698, + "kl": 0.0615234375, + "learning_rate": 8.785400390624999e-07, + "loss": 0.0025, + "reward": 1.6917137503623962, + "reward_std": 0.0859937984496355, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6995262205600739, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.3984375, + "epoch": 0.486328125, + "grad_norm": 5.4475694378610235, + "kl": 0.044921875, + "learning_rate": 8.784179687499999e-07, + "loss": 0.0018, + "reward": 1.7169365882873535, + "reward_std": 0.12018711119890213, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7325615584850311, + "step": 996 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.0703125, + "epoch": 0.48681640625, + "grad_norm": 1.8640936228979166, + "kl": 0.0543212890625, + "learning_rate": 8.782958984375e-07, + "loss": 0.0022, + "reward": 1.7613821029663086, + "reward_std": 0.09000418707728386, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7613821029663086, + "step": 997 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.7734375, + "epoch": 0.4873046875, + "grad_norm": 3.3218342595257826, + "kl": 0.064453125, + "learning_rate": 8.78173828125e-07, + "loss": 0.0026, + "reward": 1.7463974952697754, + "reward_std": 0.06820238195359707, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7463975548744202, + "step": 998 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.9765625, + "epoch": 0.48779296875, + "grad_norm": 2.845066128105519, + "kl": 0.045654296875, + "learning_rate": 8.780517578125e-07, + "loss": 0.0018, + "reward": 1.7427108883857727, + "reward_std": 0.05628257617354393, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7427108585834503, + "step": 999 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.265625, + "epoch": 0.48828125, + "grad_norm": 1.9373093856251078, + "kl": 0.0430908203125, + "learning_rate": 8.779296875e-07, + "loss": 0.0017, + "reward": 1.673474371433258, + "reward_std": 0.06652860343456268, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6734744310379028, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.2109375, + "epoch": 0.48876953125, + "grad_norm": 1.0767612289001378, + "kl": 0.05712890625, + "learning_rate": 8.778076171874999e-07, + "loss": 0.0023, + "reward": 1.7587948441505432, + "reward_std": 0.07515900582075119, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7666072845458984, + "step": 1001 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.5625, + "epoch": 0.4892578125, + "grad_norm": 3.2104609137717213, + "kl": 0.0506591796875, + "learning_rate": 8.776855468749999e-07, + "loss": 0.002, + "reward": 1.6220356822013855, + "reward_std": 0.17616816610097885, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6376607120037079, + "step": 1002 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.5078125, + "epoch": 0.48974609375, + "grad_norm": 3.146050014446822, + "kl": 0.0667724609375, + "learning_rate": 8.775634765625e-07, + "loss": 0.0027, + "reward": 1.7525697350502014, + "reward_std": 0.05434555187821388, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.752569705247879, + "step": 1003 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.6484375, + "epoch": 0.490234375, + "grad_norm": 1.5821218198575908, + "kl": 0.059814453125, + "learning_rate": 8.7744140625e-07, + "loss": 0.0024, + "reward": 1.7353711128234863, + "reward_std": 0.10459060035645962, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7353711128234863, + "step": 1004 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5703125, + "epoch": 0.49072265625, + "grad_norm": 2.4207497788820755, + "kl": 0.076171875, + "learning_rate": 8.773193359375e-07, + "loss": 0.0031, + "reward": 1.7326418161392212, + "reward_std": 0.1155674196779728, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7326418459415436, + "step": 1005 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.421875, + "epoch": 0.4912109375, + "grad_norm": 1.288491196574845, + "kl": 0.0489501953125, + "learning_rate": 8.77197265625e-07, + "loss": 0.002, + "reward": 1.7518397569656372, + "reward_std": 0.12005745619535446, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7674647867679596, + "step": 1006 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.6015625, + "epoch": 0.49169921875, + "grad_norm": 2.17017216108325, + "kl": 0.05712890625, + "learning_rate": 8.770751953124999e-07, + "loss": 0.0023, + "reward": 1.7059745788574219, + "reward_std": 0.09743463061749935, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7137870192527771, + "step": 1007 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.2578125, + "epoch": 0.4921875, + "grad_norm": 2.2836236753098285, + "kl": 0.04638671875, + "learning_rate": 8.769531249999999e-07, + "loss": 0.0019, + "reward": 1.8077695965766907, + "reward_std": 0.07913680747151375, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8077695965766907, + "step": 1008 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.1796875, + "epoch": 0.49267578125, + "grad_norm": 2.175372559341028, + "kl": 0.057861328125, + "learning_rate": 8.768310546874999e-07, + "loss": 0.0023, + "reward": 1.5813584327697754, + "reward_std": 0.1646919883787632, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.620420902967453, + "step": 1009 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.109375, + "epoch": 0.4931640625, + "grad_norm": 4.3957866299990584, + "kl": 0.0538330078125, + "learning_rate": 8.76708984375e-07, + "loss": 0.0022, + "reward": 1.8353816866874695, + "reward_std": 0.08745286241173744, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8353817164897919, + "step": 1010 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.359375, + "epoch": 0.49365234375, + "grad_norm": 2.6582112897234027, + "kl": 0.049560546875, + "learning_rate": 8.765869140625e-07, + "loss": 0.002, + "reward": 1.784598708152771, + "reward_std": 0.0864131823182106, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7924111783504486, + "step": 1011 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.03125, + "epoch": 0.494140625, + "grad_norm": 1.71535782271039, + "kl": 0.04443359375, + "learning_rate": 8.7646484375e-07, + "loss": 0.0018, + "reward": 1.7793409824371338, + "reward_std": 0.08340132981538773, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7949659824371338, + "step": 1012 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.046875, + "epoch": 0.49462890625, + "grad_norm": 1.5058958043333488, + "kl": 0.0576171875, + "learning_rate": 8.763427734375e-07, + "loss": 0.0023, + "reward": 1.7034955024719238, + "reward_std": 0.054896529763936996, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7034954726696014, + "step": 1013 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.375, + "epoch": 0.4951171875, + "grad_norm": 2.2521914639437264, + "kl": 0.041748046875, + "learning_rate": 8.762207031249999e-07, + "loss": 0.0017, + "reward": 1.7619941234588623, + "reward_std": 0.05280686542391777, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7619940936565399, + "step": 1014 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.5078125, + "epoch": 0.49560546875, + "grad_norm": 2.6329502608180997, + "kl": 0.046630859375, + "learning_rate": 8.760986328124999e-07, + "loss": 0.0019, + "reward": 1.652110517024994, + "reward_std": 0.0947786420583725, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6833605170249939, + "step": 1015 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.859375, + "epoch": 0.49609375, + "grad_norm": 9.694014679211683, + "kl": 0.0516357421875, + "learning_rate": 8.759765625e-07, + "loss": 0.0021, + "reward": 1.6479641199111938, + "reward_std": 0.15935315564274788, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6870265901088715, + "step": 1016 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.9921875, + "epoch": 0.49658203125, + "grad_norm": 3.105492193769874, + "kl": 0.0606689453125, + "learning_rate": 8.758544921875e-07, + "loss": 0.0024, + "reward": 1.6694360971450806, + "reward_std": 0.15797552838921547, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6928735375404358, + "step": 1017 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.890625, + "epoch": 0.4970703125, + "grad_norm": 1.937880245962877, + "kl": 0.06005859375, + "learning_rate": 8.75732421875e-07, + "loss": 0.0024, + "reward": 1.6215600371360779, + "reward_std": 0.17177317291498184, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6293725073337555, + "step": 1018 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.0703125, + "epoch": 0.49755859375, + "grad_norm": 14.494394764668058, + "kl": 0.0552978515625, + "learning_rate": 8.756103515625e-07, + "loss": 0.0022, + "reward": 1.6862713098526, + "reward_std": 0.08157765120267868, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6862713694572449, + "step": 1019 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.6171875, + "epoch": 0.498046875, + "grad_norm": 1.0520254258307151, + "kl": 0.0523681640625, + "learning_rate": 8.754882812499999e-07, + "loss": 0.0021, + "reward": 1.700273334980011, + "reward_std": 0.10829027369618416, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7158983647823334, + "step": 1020 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.84375, + "epoch": 0.49853515625, + "grad_norm": 4.450345699769664, + "kl": 0.0531005859375, + "learning_rate": 8.753662109374999e-07, + "loss": 0.0021, + "reward": 1.7399001717567444, + "reward_std": 0.07643388211727142, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7477126717567444, + "step": 1021 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.078125, + "epoch": 0.4990234375, + "grad_norm": 1.7817445597680743, + "kl": 0.0594482421875, + "learning_rate": 8.752441406249999e-07, + "loss": 0.0024, + "reward": 1.7631664872169495, + "reward_std": 0.10064388811588287, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7787915170192719, + "step": 1022 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.2890625, + "epoch": 0.49951171875, + "grad_norm": 0.7791878426138603, + "kl": 0.0509033203125, + "learning_rate": 8.751220703125e-07, + "loss": 0.002, + "reward": 1.8146781921386719, + "reward_std": 0.040183124132454395, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8146781921386719, + "step": 1023 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.7734375, + "epoch": 0.5, + "grad_norm": 7.200748288611341, + "kl": 0.0555419921875, + "learning_rate": 8.75e-07, + "loss": 0.0022, + "reward": 1.6464455127716064, + "reward_std": 0.11078909412026405, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6620705127716064, + "step": 1024 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.5625, + "epoch": 0.50048828125, + "grad_norm": 1.3967289593260515, + "kl": 0.0596923828125, + "learning_rate": 8.748779296875e-07, + "loss": 0.0024, + "reward": 1.7132092714309692, + "reward_std": 0.06034594029188156, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7132093012332916, + "step": 1025 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.6328125, + "epoch": 0.5009765625, + "grad_norm": 1.143532694929036, + "kl": 0.0494384765625, + "learning_rate": 8.74755859375e-07, + "loss": 0.002, + "reward": 1.7486969828605652, + "reward_std": 0.08863399224355817, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7643219530582428, + "step": 1026 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.1875, + "epoch": 0.50146484375, + "grad_norm": 2.946621458474271, + "kl": 0.0443115234375, + "learning_rate": 8.746337890624999e-07, + "loss": 0.0018, + "reward": 1.7483346462249756, + "reward_std": 0.08586347103118896, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.756147176027298, + "step": 1027 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.0859375, + "epoch": 0.501953125, + "grad_norm": 3.4713510932561054, + "kl": 0.05322265625, + "learning_rate": 8.745117187499999e-07, + "loss": 0.0021, + "reward": 1.6632013320922852, + "reward_std": 0.17927244305610657, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6866387724876404, + "step": 1028 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.5546875, + "epoch": 0.50244140625, + "grad_norm": 1.6093239601152483, + "kl": 0.0498046875, + "learning_rate": 8.743896484375e-07, + "loss": 0.002, + "reward": 1.6798649430274963, + "reward_std": 0.12823793105781078, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7111149281263351, + "step": 1029 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.953125, + "epoch": 0.5029296875, + "grad_norm": 2.576957508996577, + "kl": 0.063720703125, + "learning_rate": 8.74267578125e-07, + "loss": 0.0025, + "reward": 1.7268919944763184, + "reward_std": 0.11807430163025856, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7347044944763184, + "step": 1030 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.0625, + "epoch": 0.50341796875, + "grad_norm": 1.3370534115868566, + "kl": 0.051025390625, + "learning_rate": 8.741455078125e-07, + "loss": 0.002, + "reward": 1.7975549697875977, + "reward_std": 0.048098307102918625, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7975549101829529, + "step": 1031 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.7265625, + "epoch": 0.50390625, + "grad_norm": 1.3836237431764185, + "kl": 0.0504150390625, + "learning_rate": 8.740234375e-07, + "loss": 0.002, + "reward": 1.634350836277008, + "reward_std": 0.098635109141469, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6421633064746857, + "step": 1032 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.8125, + "epoch": 0.50439453125, + "grad_norm": 2.5285386551064795, + "kl": 0.0504150390625, + "learning_rate": 8.739013671874999e-07, + "loss": 0.002, + "reward": 1.6886343359947205, + "reward_std": 0.1606372781097889, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7120718657970428, + "step": 1033 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.265625, + "epoch": 0.5048828125, + "grad_norm": 3.1872891481166574, + "kl": 0.0452880859375, + "learning_rate": 8.737792968749999e-07, + "loss": 0.0018, + "reward": 1.6827728748321533, + "reward_std": 0.12052744254469872, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6827729046344757, + "step": 1034 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.5234375, + "epoch": 0.50537109375, + "grad_norm": 1.4142083245543213, + "kl": 0.0538330078125, + "learning_rate": 8.736572265624999e-07, + "loss": 0.0022, + "reward": 1.706916630268097, + "reward_std": 0.13945464044809341, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7225416600704193, + "step": 1035 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.6953125, + "epoch": 0.505859375, + "grad_norm": 1.1813868455912004, + "kl": 0.0445556640625, + "learning_rate": 8.7353515625e-07, + "loss": 0.0018, + "reward": 1.8250656127929688, + "reward_std": 0.059078922495245934, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8250656723976135, + "step": 1036 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.765625, + "epoch": 0.50634765625, + "grad_norm": 1.9538556939291407, + "kl": 0.0528564453125, + "learning_rate": 8.734130859375e-07, + "loss": 0.0021, + "reward": 1.6294710636138916, + "reward_std": 0.12247138097882271, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6294711232185364, + "step": 1037 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.9296875, + "epoch": 0.5068359375, + "grad_norm": 1.4987364875717888, + "kl": 0.042236328125, + "learning_rate": 8.73291015625e-07, + "loss": 0.0017, + "reward": 1.797271490097046, + "reward_std": 0.05235948599874973, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7972714900970459, + "step": 1038 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.0, + "epoch": 0.50732421875, + "grad_norm": 2.833930667181314, + "kl": 0.04833984375, + "learning_rate": 8.731689453125e-07, + "loss": 0.0019, + "reward": 1.7178579568862915, + "reward_std": 0.09247782826423645, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7256704568862915, + "step": 1039 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.8203125, + "epoch": 0.5078125, + "grad_norm": 1.467240282257085, + "kl": 0.0572509765625, + "learning_rate": 8.730468749999999e-07, + "loss": 0.0023, + "reward": 1.7496626377105713, + "reward_std": 0.02964417589828372, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7496626377105713, + "step": 1040 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.703125, + "epoch": 0.50830078125, + "grad_norm": 1.108726705203492, + "kl": 0.052001953125, + "learning_rate": 8.729248046874999e-07, + "loss": 0.0021, + "reward": 1.7599297761917114, + "reward_std": 0.04675179207697511, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.759929746389389, + "step": 1041 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.34375, + "epoch": 0.5087890625, + "grad_norm": 1.5788272962847125, + "kl": 0.055908203125, + "learning_rate": 8.72802734375e-07, + "loss": 0.0022, + "reward": 1.8183218836784363, + "reward_std": 0.04584968835115433, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.818321943283081, + "step": 1042 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.1953125, + "epoch": 0.50927734375, + "grad_norm": 4.386222680431177, + "kl": 0.074951171875, + "learning_rate": 8.726806640625e-07, + "loss": 0.003, + "reward": 1.7009983658790588, + "reward_std": 0.1038425974547863, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7009983360767365, + "step": 1043 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.171875, + "epoch": 0.509765625, + "grad_norm": 2.3821930491899934, + "kl": 0.0462646484375, + "learning_rate": 8.7255859375e-07, + "loss": 0.0018, + "reward": 1.7736052870750427, + "reward_std": 0.10071777179837227, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.781417727470398, + "step": 1044 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.5078125, + "epoch": 0.51025390625, + "grad_norm": 1.549376759535485, + "kl": 0.052734375, + "learning_rate": 8.724365234375e-07, + "loss": 0.0021, + "reward": 1.7131693959236145, + "reward_std": 0.07255561649799347, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7131693959236145, + "step": 1045 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.6015625, + "epoch": 0.5107421875, + "grad_norm": 6.712788585227676, + "kl": 0.1573486328125, + "learning_rate": 8.72314453125e-07, + "loss": 0.0063, + "reward": 1.6211495399475098, + "reward_std": 0.13179854303598404, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6602120995521545, + "step": 1046 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.1171875, + "epoch": 0.51123046875, + "grad_norm": 3.0574803857364286, + "kl": 0.04736328125, + "learning_rate": 8.721923828124999e-07, + "loss": 0.0019, + "reward": 1.7683227062225342, + "reward_std": 0.08765990659594536, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7761352360248566, + "step": 1047 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.8046875, + "epoch": 0.51171875, + "grad_norm": 2.023235694735734, + "kl": 0.065185546875, + "learning_rate": 8.720703124999999e-07, + "loss": 0.0026, + "reward": 1.6004237532615662, + "reward_std": 0.05043849162757397, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6004238128662109, + "step": 1048 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.609375, + "epoch": 0.51220703125, + "grad_norm": 1.880850379976763, + "kl": 0.058837890625, + "learning_rate": 8.719482421875e-07, + "loss": 0.0024, + "reward": 1.828034520149231, + "reward_std": 0.05495187267661095, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.828034520149231, + "step": 1049 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.7421875, + "epoch": 0.5126953125, + "grad_norm": 1.737728691407687, + "kl": 0.05517578125, + "learning_rate": 8.71826171875e-07, + "loss": 0.0022, + "reward": 1.560300588607788, + "reward_std": 0.07925301790237427, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5681131184101105, + "step": 1050 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.6328125, + "epoch": 0.51318359375, + "grad_norm": 2.886774795094905, + "kl": 0.0634765625, + "learning_rate": 8.717041015625e-07, + "loss": 0.0025, + "reward": 1.6774318218231201, + "reward_std": 0.06801902502775192, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6852443218231201, + "step": 1051 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.0859375, + "epoch": 0.513671875, + "grad_norm": 1.0122422402580602, + "kl": 0.0460205078125, + "learning_rate": 8.7158203125e-07, + "loss": 0.0018, + "reward": 1.7169759273529053, + "reward_std": 0.05733257718384266, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7169758975505829, + "step": 1052 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.25, + "epoch": 0.51416015625, + "grad_norm": 1.0407203824158495, + "kl": 0.0599365234375, + "learning_rate": 8.714599609374999e-07, + "loss": 0.0024, + "reward": 1.7637624740600586, + "reward_std": 0.09369587153196335, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7793874442577362, + "step": 1053 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.25, + "epoch": 0.5146484375, + "grad_norm": 44.02991363992667, + "kl": 0.0645751953125, + "learning_rate": 8.713378906249999e-07, + "loss": 0.0026, + "reward": 1.6559126377105713, + "reward_std": 0.057641902938485146, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6559126079082489, + "step": 1054 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.546875, + "epoch": 0.51513671875, + "grad_norm": 4.093271765716937, + "kl": 0.055908203125, + "learning_rate": 8.712158203124999e-07, + "loss": 0.0022, + "reward": 1.6613022685050964, + "reward_std": 0.11586426943540573, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6769272983074188, + "step": 1055 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.09375, + "epoch": 0.515625, + "grad_norm": 3.3041055571935196, + "kl": 0.046875, + "learning_rate": 8.7109375e-07, + "loss": 0.0019, + "reward": 1.7097191214561462, + "reward_std": 0.09788389131426811, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7253441214561462, + "step": 1056 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.1953125, + "epoch": 0.51611328125, + "grad_norm": 1.380486793481445, + "kl": 0.053466796875, + "learning_rate": 8.709716796875e-07, + "loss": 0.0021, + "reward": 1.7574412822723389, + "reward_std": 0.058644311502575874, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7574412822723389, + "step": 1057 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.3515625, + "epoch": 0.5166015625, + "grad_norm": 5.591919284669544, + "kl": 0.060302734375, + "learning_rate": 8.70849609375e-07, + "loss": 0.0024, + "reward": 1.8293917179107666, + "reward_std": 0.09176983684301376, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8293918073177338, + "step": 1058 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.9140625, + "epoch": 0.51708984375, + "grad_norm": 1.2725887032541328, + "kl": 0.066650390625, + "learning_rate": 8.707275390625e-07, + "loss": 0.0027, + "reward": 1.706933856010437, + "reward_std": 0.07794651389122009, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7069338262081146, + "step": 1059 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.875, + "epoch": 0.517578125, + "grad_norm": 3.145191103123939, + "kl": 0.077392578125, + "learning_rate": 8.706054687499999e-07, + "loss": 0.0031, + "reward": 1.6468673944473267, + "reward_std": 0.06843332573771477, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6468673646450043, + "step": 1060 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.234375, + "epoch": 0.51806640625, + "grad_norm": 2.5048711249337625, + "kl": 0.057861328125, + "learning_rate": 8.704833984374999e-07, + "loss": 0.0023, + "reward": 1.7483791708946228, + "reward_std": 0.10295334830880165, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7483791410923004, + "step": 1061 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.453125, + "epoch": 0.5185546875, + "grad_norm": 3.1332272679114657, + "kl": 0.0693359375, + "learning_rate": 8.70361328125e-07, + "loss": 0.0028, + "reward": 1.7026050090789795, + "reward_std": 0.07863837853074074, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7104175388813019, + "step": 1062 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.25, + "epoch": 0.51904296875, + "grad_norm": 2.9349932273222374, + "kl": 0.0595703125, + "learning_rate": 8.702392578125e-07, + "loss": 0.0024, + "reward": 1.8661960363388062, + "reward_std": 0.03855743817985058, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8661959767341614, + "step": 1063 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.734375, + "epoch": 0.51953125, + "grad_norm": 4.005208537757595, + "kl": 0.0533447265625, + "learning_rate": 8.701171875e-07, + "loss": 0.0021, + "reward": 1.6547017097473145, + "reward_std": 0.08106643706560135, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6547016203403473, + "step": 1064 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.265625, + "epoch": 0.52001953125, + "grad_norm": 14.385556488276366, + "kl": 0.056396484375, + "learning_rate": 8.699951171875e-07, + "loss": 0.0023, + "reward": 1.7611234784126282, + "reward_std": 0.06275673396885395, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7611234188079834, + "step": 1065 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.8203125, + "epoch": 0.5205078125, + "grad_norm": 2.572037094829945, + "kl": 0.055908203125, + "learning_rate": 8.698730468749999e-07, + "loss": 0.0022, + "reward": 1.8109251260757446, + "reward_std": 0.0447577740997076, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8109250664710999, + "step": 1066 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.0703125, + "epoch": 0.52099609375, + "grad_norm": 1.5312642359752389, + "kl": 0.054443359375, + "learning_rate": 8.697509765624999e-07, + "loss": 0.0022, + "reward": 1.7536611557006836, + "reward_std": 0.06718228757381439, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7536611258983612, + "step": 1067 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.5703125, + "epoch": 0.521484375, + "grad_norm": 1.3523042468821218, + "kl": 0.055419921875, + "learning_rate": 8.696289062499999e-07, + "loss": 0.0022, + "reward": 1.644126534461975, + "reward_std": 0.06054982542991638, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6441265642642975, + "step": 1068 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.2734375, + "epoch": 0.52197265625, + "grad_norm": 1.9861791425441333, + "kl": 0.0662841796875, + "learning_rate": 8.695068359375e-07, + "loss": 0.0027, + "reward": 1.6830092668533325, + "reward_std": 0.0935671292245388, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6830093264579773, + "step": 1069 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.7890625, + "epoch": 0.5224609375, + "grad_norm": 7.973684507916818, + "kl": 0.0648193359375, + "learning_rate": 8.69384765625e-07, + "loss": 0.0026, + "reward": 1.6281793117523193, + "reward_std": 0.15630166232585907, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6359919011592865, + "step": 1070 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.984375, + "epoch": 0.52294921875, + "grad_norm": 2.766158319030079, + "kl": 0.0518798828125, + "learning_rate": 8.692626953125e-07, + "loss": 0.0021, + "reward": 1.7701700925827026, + "reward_std": 0.22449339926242828, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.8170450925827026, + "step": 1071 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.3203125, + "epoch": 0.5234375, + "grad_norm": 2.20414905944699, + "kl": 0.0506591796875, + "learning_rate": 8.69140625e-07, + "loss": 0.002, + "reward": 1.783662736415863, + "reward_std": 0.07873168960213661, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7914752662181854, + "step": 1072 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.203125, + "epoch": 0.52392578125, + "grad_norm": 4.143705460739188, + "kl": 0.0576171875, + "learning_rate": 8.690185546874999e-07, + "loss": 0.0023, + "reward": 1.6684030294418335, + "reward_std": 0.07677320018410683, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6684030294418335, + "step": 1073 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.8203125, + "epoch": 0.5244140625, + "grad_norm": 2.2380443410195157, + "kl": 0.062255859375, + "learning_rate": 8.688964843749999e-07, + "loss": 0.0025, + "reward": 1.8161095976829529, + "reward_std": 0.054756371304392815, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8161095678806305, + "step": 1074 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.46875, + "epoch": 0.52490234375, + "grad_norm": 2.096163231473889, + "kl": 0.06787109375, + "learning_rate": 8.687744140625e-07, + "loss": 0.0027, + "reward": 1.8008560538291931, + "reward_std": 0.09001643769443035, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8164810538291931, + "step": 1075 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.9375, + "epoch": 0.525390625, + "grad_norm": 9.937730648389389, + "kl": 0.0526123046875, + "learning_rate": 8.6865234375e-07, + "loss": 0.0021, + "reward": 1.7251918315887451, + "reward_std": 0.11758009344339371, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7486292719841003, + "step": 1076 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.7421875, + "epoch": 0.52587890625, + "grad_norm": 9.17932500787084, + "kl": 0.0606689453125, + "learning_rate": 8.685302734375e-07, + "loss": 0.0024, + "reward": 1.7291359901428223, + "reward_std": 0.07484306022524834, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.729136049747467, + "step": 1077 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.453125, + "epoch": 0.5263671875, + "grad_norm": 1.9438836015934406, + "kl": 0.058349609375, + "learning_rate": 8.68408203125e-07, + "loss": 0.0023, + "reward": 1.6683382987976074, + "reward_std": 0.07380038499832153, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6683382987976074, + "step": 1078 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.3984375, + "epoch": 0.52685546875, + "grad_norm": 5.847630596731743, + "kl": 0.07080078125, + "learning_rate": 8.682861328124999e-07, + "loss": 0.0028, + "reward": 1.7424799799919128, + "reward_std": 0.04051386937499046, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7424799799919128, + "step": 1079 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.6328125, + "epoch": 0.52734375, + "grad_norm": 4.09131971663552, + "kl": 0.060302734375, + "learning_rate": 8.681640624999999e-07, + "loss": 0.0024, + "reward": 1.725938320159912, + "reward_std": 0.07212316989898682, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7259383201599121, + "step": 1080 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.328125, + "epoch": 0.52783203125, + "grad_norm": 3.5388236065556327, + "kl": 0.0601806640625, + "learning_rate": 8.680419921874999e-07, + "loss": 0.0024, + "reward": 1.73984694480896, + "reward_std": 0.07245020382106304, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7398469150066376, + "step": 1081 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.59375, + "epoch": 0.5283203125, + "grad_norm": 3.757069955864066, + "kl": 0.0584716796875, + "learning_rate": 8.67919921875e-07, + "loss": 0.0023, + "reward": 1.7384961247444153, + "reward_std": 0.1039031371474266, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7384961247444153, + "step": 1082 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.2421875, + "epoch": 0.52880859375, + "grad_norm": 6.03160828595789, + "kl": 0.0555419921875, + "learning_rate": 8.677978515625e-07, + "loss": 0.0022, + "reward": 1.748001754283905, + "reward_std": 0.06500514224171638, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.748001754283905, + "step": 1083 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.5625, + "epoch": 0.529296875, + "grad_norm": 2.655696243747116, + "kl": 0.0491943359375, + "learning_rate": 8.6767578125e-07, + "loss": 0.002, + "reward": 1.8072885274887085, + "reward_std": 0.09110748954117298, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8307260870933533, + "step": 1084 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.0078125, + "epoch": 0.52978515625, + "grad_norm": 3.464007827161993, + "kl": 0.054443359375, + "learning_rate": 8.675537109375e-07, + "loss": 0.0022, + "reward": 1.8391188979148865, + "reward_std": 0.06963248923420906, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8391189575195312, + "step": 1085 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.0, + "epoch": 0.5302734375, + "grad_norm": 5.126535621984238, + "kl": 0.0596923828125, + "learning_rate": 8.674316406249999e-07, + "loss": 0.0024, + "reward": 1.6497421264648438, + "reward_std": 0.07221582159399986, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6497421860694885, + "step": 1086 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.15625, + "epoch": 0.53076171875, + "grad_norm": 1.2442388532768656, + "kl": 0.055908203125, + "learning_rate": 8.673095703124999e-07, + "loss": 0.0022, + "reward": 1.7321175932884216, + "reward_std": 0.10106639470905066, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.739930123090744, + "step": 1087 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.734375, + "epoch": 0.53125, + "grad_norm": 2.995348861215367, + "kl": 0.0584716796875, + "learning_rate": 8.671875e-07, + "loss": 0.0023, + "reward": 1.7458354234695435, + "reward_std": 0.037329770624637604, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7458354234695435, + "step": 1088 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.6796875, + "epoch": 0.53173828125, + "grad_norm": 1.167096873944655, + "kl": 0.0460205078125, + "learning_rate": 8.670654296875e-07, + "loss": 0.0018, + "reward": 1.7276933789253235, + "reward_std": 0.136086568236351, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7589433491230011, + "step": 1089 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.84375, + "epoch": 0.5322265625, + "grad_norm": 2.968643105816318, + "kl": 0.0660400390625, + "learning_rate": 8.66943359375e-07, + "loss": 0.0026, + "reward": 1.7143974304199219, + "reward_std": 0.11442429013550282, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7222099602222443, + "step": 1090 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.90625, + "epoch": 0.53271484375, + "grad_norm": 14.99214457614237, + "kl": 0.0484619140625, + "learning_rate": 8.668212890625e-07, + "loss": 0.0019, + "reward": 1.7458195090293884, + "reward_std": 0.05967376381158829, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7458195388317108, + "step": 1091 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.953125, + "epoch": 0.533203125, + "grad_norm": 1.261439455958631, + "kl": 0.0469970703125, + "learning_rate": 8.666992187499999e-07, + "loss": 0.0019, + "reward": 1.7515225410461426, + "reward_std": 0.05775933898985386, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7515226006507874, + "step": 1092 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.2890625, + "epoch": 0.53369140625, + "grad_norm": 2.9498487401383953, + "kl": 0.068603515625, + "learning_rate": 8.665771484374999e-07, + "loss": 0.0027, + "reward": 1.7203047275543213, + "reward_std": 0.08913041837513447, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7203047573566437, + "step": 1093 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.765625, + "epoch": 0.5341796875, + "grad_norm": 7.1364746418285545, + "kl": 0.0576171875, + "learning_rate": 8.664550781249999e-07, + "loss": 0.0023, + "reward": 1.652907907962799, + "reward_std": 0.13294285163283348, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6607204079627991, + "step": 1094 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.078125, + "epoch": 0.53466796875, + "grad_norm": 1.8817117462815578, + "kl": 0.0555419921875, + "learning_rate": 8.663330078125e-07, + "loss": 0.0022, + "reward": 1.721267819404602, + "reward_std": 0.030888373032212257, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7212677896022797, + "step": 1095 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.53125, + "epoch": 0.53515625, + "grad_norm": 3.087003895884515, + "kl": 0.0443115234375, + "learning_rate": 8.662109375e-07, + "loss": 0.0018, + "reward": 1.7337496876716614, + "reward_std": 0.08758010156452656, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7571871876716614, + "step": 1096 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.8515625, + "epoch": 0.53564453125, + "grad_norm": 1.6271168573011427, + "kl": 0.054931640625, + "learning_rate": 8.660888671875e-07, + "loss": 0.0022, + "reward": 1.7751588225364685, + "reward_std": 0.03671477176249027, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7751587927341461, + "step": 1097 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.546875, + "epoch": 0.5361328125, + "grad_norm": 1.7985380156894293, + "kl": 0.040283203125, + "learning_rate": 8.65966796875e-07, + "loss": 0.0016, + "reward": 1.6898677945137024, + "reward_std": 0.09911376610398293, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6976803243160248, + "step": 1098 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.21875, + "epoch": 0.53662109375, + "grad_norm": 1.7440656859976609, + "kl": 0.050048828125, + "learning_rate": 8.658447265624999e-07, + "loss": 0.002, + "reward": 1.7902414798736572, + "reward_std": 0.07932448014616966, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7902414798736572, + "step": 1099 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.546875, + "epoch": 0.537109375, + "grad_norm": 0.8393827425684771, + "kl": 0.0535888671875, + "learning_rate": 8.657226562499999e-07, + "loss": 0.0021, + "reward": 1.746444582939148, + "reward_std": 0.04779106751084328, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7464446127414703, + "step": 1100 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.546875, + "epoch": 0.53759765625, + "grad_norm": 2.5411055906974207, + "kl": 0.0362548828125, + "learning_rate": 8.656005859375e-07, + "loss": 0.0014, + "reward": 1.7083318829536438, + "reward_std": 0.10737061686813831, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7239568829536438, + "step": 1101 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.9765625, + "epoch": 0.5380859375, + "grad_norm": 0.8904683187080978, + "kl": 0.0406494140625, + "learning_rate": 8.65478515625e-07, + "loss": 0.0016, + "reward": 1.6586171388626099, + "reward_std": 0.1240275464951992, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6820546984672546, + "step": 1102 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.546875, + "epoch": 0.53857421875, + "grad_norm": 2.333691146894965, + "kl": 0.043212890625, + "learning_rate": 8.653564453125e-07, + "loss": 0.0017, + "reward": 1.6594500541687012, + "reward_std": 0.07179497927427292, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6594500541687012, + "step": 1103 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.2890625, + "epoch": 0.5390625, + "grad_norm": 2.851343254553436, + "kl": 0.0518798828125, + "learning_rate": 8.65234375e-07, + "loss": 0.0021, + "reward": 1.8387314081192017, + "reward_std": 0.09944453835487366, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8387314081192017, + "step": 1104 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.1796875, + "epoch": 0.53955078125, + "grad_norm": 1.9310397948323363, + "kl": 0.04345703125, + "learning_rate": 8.651123046874999e-07, + "loss": 0.0017, + "reward": 1.8558620810508728, + "reward_std": 0.10813725739717484, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8714870512485504, + "step": 1105 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.3359375, + "epoch": 0.5400390625, + "grad_norm": 2.7147615717436575, + "kl": 0.0428466796875, + "learning_rate": 8.649902343749999e-07, + "loss": 0.0017, + "reward": 1.696751356124878, + "reward_std": 0.13834229856729507, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7280014157295227, + "step": 1106 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.7734375, + "epoch": 0.54052734375, + "grad_norm": 1.3847207971513456, + "kl": 0.0501708984375, + "learning_rate": 8.648681640624999e-07, + "loss": 0.002, + "reward": 1.6468215584754944, + "reward_std": 0.08769709430634975, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6624464988708496, + "step": 1107 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.703125, + "epoch": 0.541015625, + "grad_norm": 2.361071789324421, + "kl": 0.0533447265625, + "learning_rate": 8.6474609375e-07, + "loss": 0.0021, + "reward": 1.7735760807991028, + "reward_std": 0.12644799798727036, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7970135807991028, + "step": 1108 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.15625, + "epoch": 0.54150390625, + "grad_norm": 2.3227503298020578, + "kl": 0.0482177734375, + "learning_rate": 8.646240234375e-07, + "loss": 0.0019, + "reward": 1.6930591464042664, + "reward_std": 0.10155784152448177, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7164965569972992, + "step": 1109 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.6875, + "epoch": 0.5419921875, + "grad_norm": 3.0123794250756006, + "kl": 0.0523681640625, + "learning_rate": 8.64501953125e-07, + "loss": 0.0021, + "reward": 1.7508153915405273, + "reward_std": 0.04635917954146862, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7508153319358826, + "step": 1110 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.0546875, + "epoch": 0.54248046875, + "grad_norm": 0.5261264496982065, + "kl": 0.037109375, + "learning_rate": 8.643798828125e-07, + "loss": 0.0015, + "reward": 1.5648311376571655, + "reward_std": 0.14709511492401361, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6273311078548431, + "step": 1111 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.0625, + "epoch": 0.54296875, + "grad_norm": 1.5640578960440386, + "kl": 0.0469970703125, + "learning_rate": 8.642578124999999e-07, + "loss": 0.0019, + "reward": 1.8022651076316833, + "reward_std": 0.04896317049860954, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8022651076316833, + "step": 1112 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.1171875, + "epoch": 0.54345703125, + "grad_norm": 2.2857686368129295, + "kl": 0.042724609375, + "learning_rate": 8.641357421874999e-07, + "loss": 0.0017, + "reward": 1.7114735841751099, + "reward_std": 0.17102890089154243, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7505361139774323, + "step": 1113 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.453125, + "epoch": 0.5439453125, + "grad_norm": 2.4536320336668997, + "kl": 0.0411376953125, + "learning_rate": 8.64013671875e-07, + "loss": 0.0016, + "reward": 1.6378782987594604, + "reward_std": 0.08712486177682877, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6691283285617828, + "step": 1114 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.21875, + "epoch": 0.54443359375, + "grad_norm": 1.739157825797545, + "kl": 0.0460205078125, + "learning_rate": 8.638916015625e-07, + "loss": 0.0018, + "reward": 1.755341649055481, + "reward_std": 0.04750080406665802, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.755341649055481, + "step": 1115 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.7734375, + "epoch": 0.544921875, + "grad_norm": 2.4358654204024375, + "kl": 0.0413818359375, + "learning_rate": 8.6376953125e-07, + "loss": 0.0017, + "reward": 1.7550008893013, + "reward_std": 0.04658900573849678, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7550008594989777, + "step": 1116 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.9375, + "epoch": 0.54541015625, + "grad_norm": 1.9727736242781981, + "kl": 0.0411376953125, + "learning_rate": 8.636474609375e-07, + "loss": 0.0016, + "reward": 1.8357142806053162, + "reward_std": 0.043268971145153046, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8357143700122833, + "step": 1117 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.1484375, + "epoch": 0.5458984375, + "grad_norm": 1.4259508975204152, + "kl": 0.0421142578125, + "learning_rate": 8.635253906249999e-07, + "loss": 0.0017, + "reward": 1.7081193327903748, + "reward_std": 0.04622589237987995, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7081193625926971, + "step": 1118 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.2109375, + "epoch": 0.54638671875, + "grad_norm": 21.16092755028298, + "kl": 0.0484619140625, + "learning_rate": 8.634033203124999e-07, + "loss": 0.0019, + "reward": 1.7775554656982422, + "reward_std": 0.06883841939270496, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7775554060935974, + "step": 1119 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.1328125, + "epoch": 0.546875, + "grad_norm": 1.8238098647122996, + "kl": 0.071044921875, + "learning_rate": 8.632812499999999e-07, + "loss": 0.0028, + "reward": 1.782721757888794, + "reward_std": 0.03212573006749153, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.782721757888794, + "step": 1120 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.3125, + "epoch": 0.54736328125, + "grad_norm": 1.316049654009593, + "kl": 0.0457763671875, + "learning_rate": 8.631591796875e-07, + "loss": 0.0018, + "reward": 1.8062964081764221, + "reward_std": 0.053434714674949646, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8062964081764221, + "step": 1121 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.375, + "epoch": 0.5478515625, + "grad_norm": 2.177144782177925, + "kl": 0.0462646484375, + "learning_rate": 8.63037109375e-07, + "loss": 0.0019, + "reward": 1.6950576901435852, + "reward_std": 0.17912092059850693, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7497451901435852, + "step": 1122 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.453125, + "epoch": 0.54833984375, + "grad_norm": 1.8973596678524318, + "kl": 0.0445556640625, + "learning_rate": 8.629150390625e-07, + "loss": 0.0018, + "reward": 1.6502639651298523, + "reward_std": 0.042238444089889526, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6502639055252075, + "step": 1123 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.46875, + "epoch": 0.548828125, + "grad_norm": 1.620960824121817, + "kl": 0.067138671875, + "learning_rate": 8.6279296875e-07, + "loss": 0.0027, + "reward": 1.682121753692627, + "reward_std": 0.18632768094539642, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.736809253692627, + "step": 1124 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.09375, + "epoch": 0.54931640625, + "grad_norm": 1.8774955491831953, + "kl": 0.0504150390625, + "learning_rate": 8.626708984374999e-07, + "loss": 0.002, + "reward": 1.661731779575348, + "reward_std": 0.06446626409888268, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6617318093776703, + "step": 1125 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.8515625, + "epoch": 0.5498046875, + "grad_norm": 1.2653808730315304, + "kl": 0.0457763671875, + "learning_rate": 8.625488281249999e-07, + "loss": 0.0018, + "reward": 1.7855925559997559, + "reward_std": 0.08860567212104797, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7855925559997559, + "step": 1126 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.34375, + "epoch": 0.55029296875, + "grad_norm": 2.4810925421537715, + "kl": 0.05078125, + "learning_rate": 8.624267578125e-07, + "loss": 0.002, + "reward": 1.8083539009094238, + "reward_std": 0.07869580388069153, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8083539605140686, + "step": 1127 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.9453125, + "epoch": 0.55078125, + "grad_norm": 1.170883156742559, + "kl": 0.052001953125, + "learning_rate": 8.623046875e-07, + "loss": 0.0021, + "reward": 1.82011079788208, + "reward_std": 0.025017164181917906, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8201107978820801, + "step": 1128 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.84375, + "epoch": 0.55126953125, + "grad_norm": 0.7270591612895813, + "kl": 0.050048828125, + "learning_rate": 8.621826171875e-07, + "loss": 0.002, + "reward": 1.7170042395591736, + "reward_std": 0.0804044771939516, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7248167097568512, + "step": 1129 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.328125, + "epoch": 0.5517578125, + "grad_norm": 2.0226104624847783, + "kl": 0.0504150390625, + "learning_rate": 8.62060546875e-07, + "loss": 0.002, + "reward": 1.6977909207344055, + "reward_std": 0.07109425030648708, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6977909505367279, + "step": 1130 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.484375, + "epoch": 0.55224609375, + "grad_norm": 1.0935418228016645, + "kl": 0.046875, + "learning_rate": 8.619384765625e-07, + "loss": 0.0019, + "reward": 1.7301682233810425, + "reward_std": 0.0707546304911375, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7379806041717529, + "step": 1131 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.953125, + "epoch": 0.552734375, + "grad_norm": 2.088569309643272, + "kl": 0.0562744140625, + "learning_rate": 8.618164062499999e-07, + "loss": 0.0022, + "reward": 1.7825233340263367, + "reward_std": 0.06269277073442936, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7825233638286591, + "step": 1132 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.640625, + "epoch": 0.55322265625, + "grad_norm": 1.6778708166777785, + "kl": 0.0460205078125, + "learning_rate": 8.616943359374999e-07, + "loss": 0.0018, + "reward": 1.6941341757774353, + "reward_std": 0.03843311499804258, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6941341161727905, + "step": 1133 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.390625, + "epoch": 0.5537109375, + "grad_norm": 2.8756770323876784, + "kl": 0.0501708984375, + "learning_rate": 8.61572265625e-07, + "loss": 0.002, + "reward": 1.7191390991210938, + "reward_std": 0.05277089774608612, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7191390693187714, + "step": 1134 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.8671875, + "epoch": 0.55419921875, + "grad_norm": 1.7093735784110484, + "kl": 0.0504150390625, + "learning_rate": 8.614501953125e-07, + "loss": 0.002, + "reward": 1.875854730606079, + "reward_std": 0.03535257466137409, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8758547604084015, + "step": 1135 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.7109375, + "epoch": 0.5546875, + "grad_norm": 2.1214394895752355, + "kl": 0.05419921875, + "learning_rate": 8.61328125e-07, + "loss": 0.0022, + "reward": 1.693172812461853, + "reward_std": 0.10498131066560745, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6931727230548859, + "step": 1136 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.921875, + "epoch": 0.55517578125, + "grad_norm": 2.1311745798315873, + "kl": 0.048583984375, + "learning_rate": 8.612060546875e-07, + "loss": 0.0019, + "reward": 1.6516863703727722, + "reward_std": 0.06436803564429283, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6516863703727722, + "step": 1137 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.7890625, + "epoch": 0.5556640625, + "grad_norm": 2.5661871345952236, + "kl": 0.0592041015625, + "learning_rate": 8.610839843749999e-07, + "loss": 0.0024, + "reward": 1.5657876133918762, + "reward_std": 0.1475287228822708, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5814126431941986, + "step": 1138 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.8984375, + "epoch": 0.55615234375, + "grad_norm": 1.7584895844002197, + "kl": 0.0433349609375, + "learning_rate": 8.609619140624999e-07, + "loss": 0.0017, + "reward": 1.763173222541809, + "reward_std": 0.09580406174063683, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7787982225418091, + "step": 1139 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.6953125, + "epoch": 0.556640625, + "grad_norm": 1.5770899012379078, + "kl": 0.0501708984375, + "learning_rate": 8.6083984375e-07, + "loss": 0.002, + "reward": 1.785545527935028, + "reward_std": 0.1521715521812439, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8089830279350281, + "step": 1140 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.4375, + "epoch": 0.55712890625, + "grad_norm": 2.0376418633131888, + "kl": 0.0655517578125, + "learning_rate": 8.607177734375e-07, + "loss": 0.0026, + "reward": 1.6794911623001099, + "reward_std": 0.19039485603570938, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7029286623001099, + "step": 1141 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.203125, + "epoch": 0.5576171875, + "grad_norm": 4.0042765130772375, + "kl": 0.0548095703125, + "learning_rate": 8.60595703125e-07, + "loss": 0.0022, + "reward": 1.7438197135925293, + "reward_std": 0.1175292357802391, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7516322731971741, + "step": 1142 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.703125, + "epoch": 0.55810546875, + "grad_norm": 1.8931830827167573, + "kl": 0.04833984375, + "learning_rate": 8.604736328125e-07, + "loss": 0.0019, + "reward": 1.6692324876785278, + "reward_std": 0.10438600182533264, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6770449876785278, + "step": 1143 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.4140625, + "epoch": 0.55859375, + "grad_norm": 1.5525001439122468, + "kl": 0.0489501953125, + "learning_rate": 8.603515625e-07, + "loss": 0.002, + "reward": 1.722908079624176, + "reward_std": 0.10220470279455185, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.738533079624176, + "step": 1144 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.34375, + "epoch": 0.55908203125, + "grad_norm": 1.3304876430870214, + "kl": 0.051025390625, + "learning_rate": 8.602294921874999e-07, + "loss": 0.002, + "reward": 1.792259931564331, + "reward_std": 0.0483635775744915, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7922599613666534, + "step": 1145 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.015625, + "epoch": 0.5595703125, + "grad_norm": 3.74826680648072, + "kl": 0.04736328125, + "learning_rate": 8.601074218749999e-07, + "loss": 0.0019, + "reward": 1.6866852045059204, + "reward_std": 0.0666123665869236, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.686685174703598, + "step": 1146 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.296875, + "epoch": 0.56005859375, + "grad_norm": 1.506824333758443, + "kl": 0.0487060546875, + "learning_rate": 8.599853515625e-07, + "loss": 0.0019, + "reward": 1.7012399435043335, + "reward_std": 0.11691510677337646, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7246775031089783, + "step": 1147 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3203125, + "epoch": 0.560546875, + "grad_norm": 1.229486203330246, + "kl": 0.048583984375, + "learning_rate": 8.5986328125e-07, + "loss": 0.0019, + "reward": 1.7387210130691528, + "reward_std": 0.1014990508556366, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7465335130691528, + "step": 1148 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.984375, + "epoch": 0.56103515625, + "grad_norm": 2.293838234033414, + "kl": 0.0511474609375, + "learning_rate": 8.597412109375e-07, + "loss": 0.002, + "reward": 1.7903647422790527, + "reward_std": 0.07567498832941055, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8138022422790527, + "step": 1149 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.15625, + "epoch": 0.5615234375, + "grad_norm": 11.895148102191571, + "kl": 0.0526123046875, + "learning_rate": 8.59619140625e-07, + "loss": 0.0021, + "reward": 1.7975794076919556, + "reward_std": 0.09720364585518837, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7975793480873108, + "step": 1150 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.8828125, + "epoch": 0.56201171875, + "grad_norm": 2.3961074495667747, + "kl": 0.0467529296875, + "learning_rate": 8.594970703124999e-07, + "loss": 0.0019, + "reward": 1.6959925889968872, + "reward_std": 0.1357579454779625, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7038050889968872, + "step": 1151 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.46875, + "epoch": 0.5625, + "grad_norm": 3.689726718036065, + "kl": 0.0567626953125, + "learning_rate": 8.593749999999999e-07, + "loss": 0.0023, + "reward": 1.7538256645202637, + "reward_std": 0.07644342631101608, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7538256645202637, + "step": 1152 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.9921875, + "epoch": 0.56298828125, + "grad_norm": 2.075275971718928, + "kl": 0.0587158203125, + "learning_rate": 8.592529296874999e-07, + "loss": 0.0023, + "reward": 1.788736641407013, + "reward_std": 0.047330291010439396, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7887366712093353, + "step": 1153 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.765625, + "epoch": 0.5634765625, + "grad_norm": 2.5741019637692033, + "kl": 0.0557861328125, + "learning_rate": 8.59130859375e-07, + "loss": 0.0022, + "reward": 1.5938833951950073, + "reward_std": 0.11518048122525215, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6251333653926849, + "step": 1154 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.2734375, + "epoch": 0.56396484375, + "grad_norm": 2.3696482124699134, + "kl": 0.048095703125, + "learning_rate": 8.590087890625e-07, + "loss": 0.0019, + "reward": 1.690087914466858, + "reward_std": 0.106233149766922, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6979004740715027, + "step": 1155 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.296875, + "epoch": 0.564453125, + "grad_norm": 2.4027424722018864, + "kl": 0.041015625, + "learning_rate": 8.5888671875e-07, + "loss": 0.0016, + "reward": 1.812968671321869, + "reward_std": 0.044363994151353836, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8129686117172241, + "step": 1156 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.6484375, + "epoch": 0.56494140625, + "grad_norm": 2.829388054231261, + "kl": 0.052001953125, + "learning_rate": 8.587646484375e-07, + "loss": 0.0021, + "reward": 1.6969123482704163, + "reward_std": 0.12178021855652332, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7359748482704163, + "step": 1157 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.4375, + "epoch": 0.5654296875, + "grad_norm": 19.7758891461679, + "kl": 0.052734375, + "learning_rate": 8.586425781249999e-07, + "loss": 0.0021, + "reward": 1.7072020769119263, + "reward_std": 0.0726108830422163, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.722827136516571, + "step": 1158 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.9296875, + "epoch": 0.56591796875, + "grad_norm": 8.959080922512666, + "kl": 0.05517578125, + "learning_rate": 8.585205078124999e-07, + "loss": 0.0022, + "reward": 1.684401273727417, + "reward_std": 0.08838875964283943, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.684401273727417, + "step": 1159 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.0859375, + "epoch": 0.56640625, + "grad_norm": 2.362357041373118, + "kl": 0.039306640625, + "learning_rate": 8.583984375e-07, + "loss": 0.0016, + "reward": 1.6522272229194641, + "reward_std": 0.11733454465866089, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6834772229194641, + "step": 1160 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.859375, + "epoch": 0.56689453125, + "grad_norm": 11.949152567538816, + "kl": 0.046630859375, + "learning_rate": 8.582763671875e-07, + "loss": 0.0019, + "reward": 1.7947958111763, + "reward_std": 0.08994543924927711, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8026082813739777, + "step": 1161 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.59375, + "epoch": 0.5673828125, + "grad_norm": 1.9920746576348605, + "kl": 0.045166015625, + "learning_rate": 8.58154296875e-07, + "loss": 0.0018, + "reward": 1.666768193244934, + "reward_std": 0.09091871604323387, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6745807230472565, + "step": 1162 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.375, + "epoch": 0.56787109375, + "grad_norm": 2.2336338961392954, + "kl": 0.0648193359375, + "learning_rate": 8.580322265625e-07, + "loss": 0.0026, + "reward": 1.5089243650436401, + "reward_std": 0.1571076586842537, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5245493352413177, + "step": 1163 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.21875, + "epoch": 0.568359375, + "grad_norm": 1.6665465636302936, + "kl": 0.046142578125, + "learning_rate": 8.579101562499999e-07, + "loss": 0.0018, + "reward": 1.6286611557006836, + "reward_std": 0.09474155679345131, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6442860960960388, + "step": 1164 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.5234375, + "epoch": 0.56884765625, + "grad_norm": 2.6701091262123073, + "kl": 0.051025390625, + "learning_rate": 8.577880859374999e-07, + "loss": 0.002, + "reward": 1.7830933332443237, + "reward_std": 0.11209750175476074, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.790905773639679, + "step": 1165 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.9921875, + "epoch": 0.5693359375, + "grad_norm": 1.289301287486059, + "kl": 0.03955078125, + "learning_rate": 8.576660156249999e-07, + "loss": 0.0016, + "reward": 1.768127977848053, + "reward_std": 0.11993209552019835, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.783752977848053, + "step": 1166 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.171875, + "epoch": 0.56982421875, + "grad_norm": 1.9298565392126688, + "kl": 0.04833984375, + "learning_rate": 8.575439453125e-07, + "loss": 0.0019, + "reward": 1.7926509380340576, + "reward_std": 0.08671310544013977, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8004634380340576, + "step": 1167 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.203125, + "epoch": 0.5703125, + "grad_norm": 2.3663116967309654, + "kl": 0.0501708984375, + "learning_rate": 8.57421875e-07, + "loss": 0.002, + "reward": 1.712095022201538, + "reward_std": 0.13929815590381622, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7199075222015381, + "step": 1168 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.5078125, + "epoch": 0.57080078125, + "grad_norm": 1.7141572985706985, + "kl": 0.0439453125, + "learning_rate": 8.572998046875e-07, + "loss": 0.0018, + "reward": 1.6946678757667542, + "reward_std": 0.0815641526132822, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7024803757667542, + "step": 1169 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.078125, + "epoch": 0.5712890625, + "grad_norm": 3.523924490731831, + "kl": 0.0640869140625, + "learning_rate": 8.57177734375e-07, + "loss": 0.0026, + "reward": 1.6033125519752502, + "reward_std": 0.16777217388153076, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6423750221729279, + "step": 1170 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.8359375, + "epoch": 0.57177734375, + "grad_norm": 1.2742692467315286, + "kl": 0.0611572265625, + "learning_rate": 8.570556640624999e-07, + "loss": 0.0024, + "reward": 1.6901500225067139, + "reward_std": 0.07790947519242764, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6901499927043915, + "step": 1171 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.875, + "epoch": 0.572265625, + "grad_norm": 1.1420114365574225, + "kl": 0.05029296875, + "learning_rate": 8.569335937499999e-07, + "loss": 0.002, + "reward": 1.7600011825561523, + "reward_std": 0.04347742348909378, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7600011825561523, + "step": 1172 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.640625, + "epoch": 0.57275390625, + "grad_norm": 1.9539279563519465, + "kl": 0.052978515625, + "learning_rate": 8.568115234375e-07, + "loss": 0.0021, + "reward": 1.744015395641327, + "reward_std": 0.04951014555990696, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7440153956413269, + "step": 1173 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.8359375, + "epoch": 0.5732421875, + "grad_norm": 2.247065979267746, + "kl": 0.0506591796875, + "learning_rate": 8.56689453125e-07, + "loss": 0.002, + "reward": 1.649504840373993, + "reward_std": 0.12423533946275711, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6495048403739929, + "step": 1174 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.0, + "epoch": 0.57373046875, + "grad_norm": 1.111441732375308, + "kl": 0.0504150390625, + "learning_rate": 8.565673828125e-07, + "loss": 0.002, + "reward": 1.743337869644165, + "reward_std": 0.04013508930802345, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7433378994464874, + "step": 1175 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.9609375, + "epoch": 0.57421875, + "grad_norm": 2.1368401123680476, + "kl": 0.0535888671875, + "learning_rate": 8.564453125e-07, + "loss": 0.0021, + "reward": 1.6390271186828613, + "reward_std": 0.16274896264076233, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6624647080898285, + "step": 1176 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.6953125, + "epoch": 0.57470703125, + "grad_norm": 4.624399047687805, + "kl": 0.111328125, + "learning_rate": 8.563232421874999e-07, + "loss": 0.0045, + "reward": 1.5596601963043213, + "reward_std": 0.0850033089518547, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5596601665019989, + "step": 1177 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.0546875, + "epoch": 0.5751953125, + "grad_norm": 2.2574279851451604, + "kl": 0.0426025390625, + "learning_rate": 8.562011718749999e-07, + "loss": 0.0017, + "reward": 1.732949137687683, + "reward_std": 0.036547823809087276, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7329491078853607, + "step": 1178 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.4453125, + "epoch": 0.57568359375, + "grad_norm": 3.7525805144535487, + "kl": 0.0596923828125, + "learning_rate": 8.560791015624999e-07, + "loss": 0.0024, + "reward": 1.653287649154663, + "reward_std": 0.11773675680160522, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6532876789569855, + "step": 1179 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.1015625, + "epoch": 0.576171875, + "grad_norm": 5.2466468575854215, + "kl": 0.0731201171875, + "learning_rate": 8.5595703125e-07, + "loss": 0.0029, + "reward": 1.6203824877738953, + "reward_std": 0.12023291178047657, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6594450175762177, + "step": 1180 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.7890625, + "epoch": 0.57666015625, + "grad_norm": 2.0042003028636453, + "kl": 0.0506591796875, + "learning_rate": 8.558349609375e-07, + "loss": 0.002, + "reward": 1.6139835119247437, + "reward_std": 0.12955578044056892, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.621796041727066, + "step": 1181 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.6328125, + "epoch": 0.5771484375, + "grad_norm": 1.827852163834602, + "kl": 0.0418701171875, + "learning_rate": 8.55712890625e-07, + "loss": 0.0017, + "reward": 1.7956476211547852, + "reward_std": 0.07434218749403954, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8034601211547852, + "step": 1182 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.1875, + "epoch": 0.57763671875, + "grad_norm": 1.4216545847402544, + "kl": 0.04736328125, + "learning_rate": 8.555908203125e-07, + "loss": 0.0019, + "reward": 1.7386606931686401, + "reward_std": 0.06302103772759438, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7386606633663177, + "step": 1183 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.140625, + "epoch": 0.578125, + "grad_norm": 1.436370223265228, + "kl": 0.077880859375, + "learning_rate": 8.554687499999999e-07, + "loss": 0.0031, + "reward": 1.8124098181724548, + "reward_std": 0.04216676577925682, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8124098777770996, + "step": 1184 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.890625, + "epoch": 0.57861328125, + "grad_norm": 1.0587591031913912, + "kl": 0.060546875, + "learning_rate": 8.553466796874999e-07, + "loss": 0.0024, + "reward": 1.7749245166778564, + "reward_std": 0.07318684877827764, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7827369868755341, + "step": 1185 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.8125, + "epoch": 0.5791015625, + "grad_norm": 1.7255970036733406, + "kl": 0.0589599609375, + "learning_rate": 8.55224609375e-07, + "loss": 0.0024, + "reward": 1.7037720680236816, + "reward_std": 0.05063655413687229, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7115845680236816, + "step": 1186 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.875, + "epoch": 0.57958984375, + "grad_norm": 2.1334824778608232, + "kl": 0.061767578125, + "learning_rate": 8.551025390625e-07, + "loss": 0.0025, + "reward": 1.7053207755088806, + "reward_std": 0.20426107943058014, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7209457159042358, + "step": 1187 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.46875, + "epoch": 0.580078125, + "grad_norm": 24.364302338249043, + "kl": 0.05615234375, + "learning_rate": 8.5498046875e-07, + "loss": 0.0022, + "reward": 1.665935754776001, + "reward_std": 0.116399385035038, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6737483143806458, + "step": 1188 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.0390625, + "epoch": 0.58056640625, + "grad_norm": 2.7033766379682644, + "kl": 0.0628662109375, + "learning_rate": 8.548583984375e-07, + "loss": 0.0025, + "reward": 1.8331878185272217, + "reward_std": 0.05261234473437071, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8331877589225769, + "step": 1189 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.7265625, + "epoch": 0.5810546875, + "grad_norm": 17.90905176018613, + "kl": 0.0579833984375, + "learning_rate": 8.547363281249999e-07, + "loss": 0.0023, + "reward": 1.6796801686286926, + "reward_std": 0.09237649664282799, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6874926686286926, + "step": 1190 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.078125, + "epoch": 0.58154296875, + "grad_norm": 2.1262717212577313, + "kl": 0.0618896484375, + "learning_rate": 8.546142578124999e-07, + "loss": 0.0025, + "reward": 1.6501364707946777, + "reward_std": 0.0826064795255661, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6501363515853882, + "step": 1191 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.3125, + "epoch": 0.58203125, + "grad_norm": 1.4136272078115328, + "kl": 0.047607421875, + "learning_rate": 8.544921874999999e-07, + "loss": 0.0019, + "reward": 1.8067971467971802, + "reward_std": 0.03229185566306114, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8067971765995026, + "step": 1192 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.8046875, + "epoch": 0.58251953125, + "grad_norm": 2.047535780569528, + "kl": 0.0599365234375, + "learning_rate": 8.543701171875e-07, + "loss": 0.0024, + "reward": 1.6728439927101135, + "reward_std": 0.1287621632218361, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6884690225124359, + "step": 1193 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.59375, + "epoch": 0.5830078125, + "grad_norm": 1.557581460070875, + "kl": 0.07373046875, + "learning_rate": 8.54248046875e-07, + "loss": 0.0029, + "reward": 1.7042672038078308, + "reward_std": 0.13340860605239868, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7198922336101532, + "step": 1194 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.5078125, + "epoch": 0.58349609375, + "grad_norm": 2.4518824995762207, + "kl": 0.05712890625, + "learning_rate": 8.541259765625e-07, + "loss": 0.0023, + "reward": 1.6440874338150024, + "reward_std": 0.14590797573328018, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6987749636173248, + "step": 1195 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.21875, + "epoch": 0.583984375, + "grad_norm": 7.367664763550004, + "kl": 0.048095703125, + "learning_rate": 8.5400390625e-07, + "loss": 0.0019, + "reward": 1.708031952381134, + "reward_std": 0.13708262518048286, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.723656952381134, + "step": 1196 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.921875, + "epoch": 0.58447265625, + "grad_norm": 5.15694588705896, + "kl": 0.056884765625, + "learning_rate": 8.538818359374999e-07, + "loss": 0.0023, + "reward": 1.557603120803833, + "reward_std": 0.18934200704097748, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5888532102108002, + "step": 1197 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.828125, + "epoch": 0.5849609375, + "grad_norm": 8.59471617688442, + "kl": 0.06298828125, + "learning_rate": 8.537597656249999e-07, + "loss": 0.0025, + "reward": 1.7653963565826416, + "reward_std": 0.06373865529894829, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7653963565826416, + "step": 1198 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.328125, + "epoch": 0.58544921875, + "grad_norm": 1.5188060960183658, + "kl": 0.0498046875, + "learning_rate": 8.536376953125e-07, + "loss": 0.002, + "reward": 1.5980682969093323, + "reward_std": 0.14940915256738663, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6371308267116547, + "step": 1199 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.234375, + "epoch": 0.5859375, + "grad_norm": 3.9345332601410505, + "kl": 0.0758056640625, + "learning_rate": 8.53515625e-07, + "loss": 0.003, + "reward": 1.6427003741264343, + "reward_std": 0.07084774971008301, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6427003443241119, + "step": 1200 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.0390625, + "epoch": 0.58642578125, + "grad_norm": 1.6826818386421434, + "kl": 0.07373046875, + "learning_rate": 8.533935546875e-07, + "loss": 0.003, + "reward": 1.664411723613739, + "reward_std": 0.16828951984643936, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6878492832183838, + "step": 1201 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.9765625, + "epoch": 0.5869140625, + "grad_norm": 3.836908468134239, + "kl": 0.05517578125, + "learning_rate": 8.53271484375e-07, + "loss": 0.0022, + "reward": 1.614501714706421, + "reward_std": 0.20206372626125813, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6535641849040985, + "step": 1202 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.9453125, + "epoch": 0.58740234375, + "grad_norm": 2.9305798211200287, + "kl": 0.0498046875, + "learning_rate": 8.531494140624999e-07, + "loss": 0.002, + "reward": 1.7278985977172852, + "reward_std": 0.11820728331804276, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7435235381126404, + "step": 1203 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.671875, + "epoch": 0.587890625, + "grad_norm": 2.5378734213626157, + "kl": 0.056640625, + "learning_rate": 8.530273437499999e-07, + "loss": 0.0023, + "reward": 1.7331331968307495, + "reward_std": 0.12400734424591064, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7409456968307495, + "step": 1204 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.3984375, + "epoch": 0.58837890625, + "grad_norm": 2.6673697152137237, + "kl": 0.0645751953125, + "learning_rate": 8.529052734374999e-07, + "loss": 0.0026, + "reward": 1.7612251043319702, + "reward_std": 0.07167639397084713, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.761225014925003, + "step": 1205 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.84375, + "epoch": 0.5888671875, + "grad_norm": 1.282713456349074, + "kl": 0.06201171875, + "learning_rate": 8.52783203125e-07, + "loss": 0.0025, + "reward": 1.7675436735153198, + "reward_std": 0.06219838559627533, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7675436437129974, + "step": 1206 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.609375, + "epoch": 0.58935546875, + "grad_norm": 1.6085902533879708, + "kl": 0.073486328125, + "learning_rate": 8.526611328125e-07, + "loss": 0.0029, + "reward": 1.7036328315734863, + "reward_std": 0.07522736862301826, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7114453315734863, + "step": 1207 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.15625, + "epoch": 0.58984375, + "grad_norm": 1.4976152138206247, + "kl": 0.0673828125, + "learning_rate": 8.525390625e-07, + "loss": 0.0027, + "reward": 1.6411904096603394, + "reward_std": 0.09779999405145645, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.649002879858017, + "step": 1208 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.6875, + "epoch": 0.59033203125, + "grad_norm": 1.346538056309748, + "kl": 0.0423583984375, + "learning_rate": 8.524169921875e-07, + "loss": 0.0017, + "reward": 1.7001066207885742, + "reward_std": 0.1550520807504654, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7235440611839294, + "step": 1209 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.0, + "epoch": 0.5908203125, + "grad_norm": 1.7030655116642164, + "kl": 0.05322265625, + "learning_rate": 8.522949218749999e-07, + "loss": 0.0021, + "reward": 1.6805211901664734, + "reward_std": 0.15443892404437065, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7195836007595062, + "step": 1210 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.15625, + "epoch": 0.59130859375, + "grad_norm": 3.015177679321377, + "kl": 0.0623779296875, + "learning_rate": 8.521728515624999e-07, + "loss": 0.0025, + "reward": 1.6988528370857239, + "reward_std": 0.1044110469520092, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7144778072834015, + "step": 1211 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.34375, + "epoch": 0.591796875, + "grad_norm": 1.183423081882842, + "kl": 0.06494140625, + "learning_rate": 8.5205078125e-07, + "loss": 0.0026, + "reward": 1.6010417938232422, + "reward_std": 0.14411456137895584, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6244792938232422, + "step": 1212 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.4921875, + "epoch": 0.59228515625, + "grad_norm": 3.816793739656266, + "kl": 0.117919921875, + "learning_rate": 8.519287109375e-07, + "loss": 0.0047, + "reward": 1.7807039022445679, + "reward_std": 0.11940962262451649, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7885164320468903, + "step": 1213 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.96875, + "epoch": 0.5927734375, + "grad_norm": 1.2559013257637506, + "kl": 0.05029296875, + "learning_rate": 8.51806640625e-07, + "loss": 0.002, + "reward": 1.7532151341438293, + "reward_std": 0.09881580621004105, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7688401639461517, + "step": 1214 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.6171875, + "epoch": 0.59326171875, + "grad_norm": 3.6468537542322106, + "kl": 0.072509765625, + "learning_rate": 8.516845703125e-07, + "loss": 0.0029, + "reward": 1.6600202918052673, + "reward_std": 0.061623964458703995, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6600202918052673, + "step": 1215 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.5859375, + "epoch": 0.59375, + "grad_norm": 10.981884504101897, + "kl": 0.05712890625, + "learning_rate": 8.515624999999999e-07, + "loss": 0.0023, + "reward": 1.701629638671875, + "reward_std": 0.0891575813293457, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7016295790672302, + "step": 1216 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.890625, + "epoch": 0.59423828125, + "grad_norm": 1.4336654046512236, + "kl": 0.0540771484375, + "learning_rate": 8.514404296874999e-07, + "loss": 0.0022, + "reward": 1.7428684830665588, + "reward_std": 0.08639609813690186, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7506809830665588, + "step": 1217 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.171875, + "epoch": 0.5947265625, + "grad_norm": 2.776622750108565, + "kl": 0.0618896484375, + "learning_rate": 8.513183593749999e-07, + "loss": 0.0025, + "reward": 1.8936978578567505, + "reward_std": 0.03348179440945387, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8936978578567505, + "step": 1218 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.7734375, + "epoch": 0.59521484375, + "grad_norm": 1.8814077810205045, + "kl": 0.0614013671875, + "learning_rate": 8.511962890625e-07, + "loss": 0.0025, + "reward": 1.7222504615783691, + "reward_std": 0.10825235769152641, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7378754019737244, + "step": 1219 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.734375, + "epoch": 0.595703125, + "grad_norm": 4.9487866605632975, + "kl": 0.050048828125, + "learning_rate": 8.5107421875e-07, + "loss": 0.002, + "reward": 1.6460736989974976, + "reward_std": 0.09841511398553848, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6538861691951752, + "step": 1220 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.9765625, + "epoch": 0.59619140625, + "grad_norm": 3.162686127759236, + "kl": 0.065673828125, + "learning_rate": 8.509521484375e-07, + "loss": 0.0026, + "reward": 1.6684702634811401, + "reward_std": 0.10185368359088898, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.676282674074173, + "step": 1221 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.0078125, + "epoch": 0.5966796875, + "grad_norm": 1.9616464372870683, + "kl": 0.0635986328125, + "learning_rate": 8.50830078125e-07, + "loss": 0.0025, + "reward": 1.640607476234436, + "reward_std": 0.14903107285499573, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6562323570251465, + "step": 1222 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.4921875, + "epoch": 0.59716796875, + "grad_norm": 4.351483415683568, + "kl": 0.056396484375, + "learning_rate": 8.507080078124999e-07, + "loss": 0.0023, + "reward": 1.6847114562988281, + "reward_std": 0.060587236657738686, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6847114562988281, + "step": 1223 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.6875, + "epoch": 0.59765625, + "grad_norm": 1.0863930931712735, + "kl": 0.0416259765625, + "learning_rate": 8.505859374999999e-07, + "loss": 0.0017, + "reward": 1.7114101648330688, + "reward_std": 0.1884886771440506, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7348476648330688, + "step": 1224 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.0234375, + "epoch": 0.59814453125, + "grad_norm": 4.182483948402709, + "kl": 0.060791015625, + "learning_rate": 8.504638671875e-07, + "loss": 0.0024, + "reward": 1.8124624490737915, + "reward_std": 0.06934082508087158, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8124624788761139, + "step": 1225 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.5703125, + "epoch": 0.5986328125, + "grad_norm": 1.8143814977652135, + "kl": 0.05712890625, + "learning_rate": 8.50341796875e-07, + "loss": 0.0023, + "reward": 1.7989888787269592, + "reward_std": 0.06971035525202751, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7989888489246368, + "step": 1226 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.265625, + "epoch": 0.59912109375, + "grad_norm": 1.3565201503680722, + "kl": 0.0521240234375, + "learning_rate": 8.502197265625e-07, + "loss": 0.0021, + "reward": 1.723208248615265, + "reward_std": 0.07503095269203186, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7310207486152649, + "step": 1227 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.375, + "epoch": 0.599609375, + "grad_norm": 2.093422094136838, + "kl": 0.0616455078125, + "learning_rate": 8.5009765625e-07, + "loss": 0.0025, + "reward": 1.780647873878479, + "reward_std": 0.05587127059698105, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7806479036808014, + "step": 1228 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.6328125, + "epoch": 0.60009765625, + "grad_norm": 0.9638564944473897, + "kl": 0.04931640625, + "learning_rate": 8.499755859375e-07, + "loss": 0.002, + "reward": 1.8005688786506653, + "reward_std": 0.034580922685563564, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8005689084529877, + "step": 1229 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.4609375, + "epoch": 0.6005859375, + "grad_norm": 1.7588410237791303, + "kl": 0.0498046875, + "learning_rate": 8.498535156249999e-07, + "loss": 0.002, + "reward": 1.7482208609580994, + "reward_std": 0.11355694010853767, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7560333609580994, + "step": 1230 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.7109375, + "epoch": 0.60107421875, + "grad_norm": 5.4986353498353235, + "kl": 0.0703125, + "learning_rate": 8.497314453124999e-07, + "loss": 0.0028, + "reward": 1.8072319626808167, + "reward_std": 0.09327958524227142, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.822856992483139, + "step": 1231 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.9609375, + "epoch": 0.6015625, + "grad_norm": 1.5860451056769687, + "kl": 0.05078125, + "learning_rate": 8.49609375e-07, + "loss": 0.002, + "reward": 1.631383240222931, + "reward_std": 0.1514478251338005, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6548207402229309, + "step": 1232 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.2109375, + "epoch": 0.60205078125, + "grad_norm": 5.160310522295723, + "kl": 0.064697265625, + "learning_rate": 8.494873046875e-07, + "loss": 0.0026, + "reward": 1.824979543685913, + "reward_std": 0.06196466274559498, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8249796032905579, + "step": 1233 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.578125, + "epoch": 0.6025390625, + "grad_norm": 3.0896098116019375, + "kl": 0.068603515625, + "learning_rate": 8.49365234375e-07, + "loss": 0.0027, + "reward": 1.6698785424232483, + "reward_std": 0.17456145584583282, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6776910722255707, + "step": 1234 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.078125, + "epoch": 0.60302734375, + "grad_norm": 2.3461866135262275, + "kl": 0.0556640625, + "learning_rate": 8.492431640625e-07, + "loss": 0.0022, + "reward": 1.726996123790741, + "reward_std": 0.07286924868822098, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7269961833953857, + "step": 1235 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.4765625, + "epoch": 0.603515625, + "grad_norm": 3.2008131777902302, + "kl": 0.076171875, + "learning_rate": 8.491210937499999e-07, + "loss": 0.0031, + "reward": 1.5925570726394653, + "reward_std": 0.13758273422718048, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6159945428371429, + "step": 1236 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.09375, + "epoch": 0.60400390625, + "grad_norm": 1.0869290753732679, + "kl": 0.0523681640625, + "learning_rate": 8.489990234374999e-07, + "loss": 0.0021, + "reward": 1.6631884574890137, + "reward_std": 0.1325419619679451, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6944384574890137, + "step": 1237 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.390625, + "epoch": 0.6044921875, + "grad_norm": 1.043664689306585, + "kl": 0.0596923828125, + "learning_rate": 8.48876953125e-07, + "loss": 0.0024, + "reward": 1.7231544256210327, + "reward_std": 0.029385648667812347, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7231544256210327, + "step": 1238 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.046875, + "epoch": 0.60498046875, + "grad_norm": 1.9882566923730944, + "kl": 0.0679931640625, + "learning_rate": 8.487548828125e-07, + "loss": 0.0027, + "reward": 1.755352258682251, + "reward_std": 0.050556398928165436, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.755352258682251, + "step": 1239 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.1875, + "epoch": 0.60546875, + "grad_norm": 0.9718933967102993, + "kl": 0.0498046875, + "learning_rate": 8.486328125e-07, + "loss": 0.002, + "reward": 1.7807682752609253, + "reward_std": 0.05785749014467001, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7807681560516357, + "step": 1240 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.15625, + "epoch": 0.60595703125, + "grad_norm": 1.0864992081357003, + "kl": 0.0439453125, + "learning_rate": 8.485107421875e-07, + "loss": 0.0018, + "reward": 1.8006258606910706, + "reward_std": 0.10033701360225677, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8162508606910706, + "step": 1241 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.28125, + "epoch": 0.6064453125, + "grad_norm": 0.72379267103834, + "kl": 0.0555419921875, + "learning_rate": 8.48388671875e-07, + "loss": 0.0022, + "reward": 1.7179449796676636, + "reward_std": 0.10580763639882207, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7491949796676636, + "step": 1242 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.1953125, + "epoch": 0.60693359375, + "grad_norm": 6.134340243573936, + "kl": 0.056640625, + "learning_rate": 8.482666015624999e-07, + "loss": 0.0023, + "reward": 1.6223798394203186, + "reward_std": 0.08309117332100868, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6223797798156738, + "step": 1243 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.03125, + "epoch": 0.607421875, + "grad_norm": 2.981575045817795, + "kl": 0.06689453125, + "learning_rate": 8.481445312499999e-07, + "loss": 0.0027, + "reward": 1.5559495091438293, + "reward_std": 0.12914244830608368, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.571574479341507, + "step": 1244 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.5703125, + "epoch": 0.60791015625, + "grad_norm": 1.167173976520968, + "kl": 0.0479736328125, + "learning_rate": 8.480224609375e-07, + "loss": 0.0019, + "reward": 1.8290737867355347, + "reward_std": 0.09445438906550407, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8446987867355347, + "step": 1245 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.4921875, + "epoch": 0.6083984375, + "grad_norm": 2.3473086338411733, + "kl": 0.041259765625, + "learning_rate": 8.47900390625e-07, + "loss": 0.0017, + "reward": 1.7552416920661926, + "reward_std": 0.0686273779720068, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7552417516708374, + "step": 1246 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.984375, + "epoch": 0.60888671875, + "grad_norm": 1.989690821238528, + "kl": 0.0418701171875, + "learning_rate": 8.477783203125e-07, + "loss": 0.0017, + "reward": 1.7533529996871948, + "reward_std": 0.08260135725140572, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7533529996871948, + "step": 1247 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.46875, + "epoch": 0.609375, + "grad_norm": 4.174383689747545, + "kl": 0.0361328125, + "learning_rate": 8.4765625e-07, + "loss": 0.0014, + "reward": 1.7376724481582642, + "reward_std": 0.12889265269041061, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7532974779605865, + "step": 1248 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.9375, + "epoch": 0.60986328125, + "grad_norm": 1.41606671516539, + "kl": 0.0467529296875, + "learning_rate": 8.475341796874999e-07, + "loss": 0.0019, + "reward": 1.7911608219146729, + "reward_std": 0.05567508563399315, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7911607921123505, + "step": 1249 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.8046875, + "epoch": 0.6103515625, + "grad_norm": 1.3841122351592616, + "kl": 0.0531005859375, + "learning_rate": 8.474121093749999e-07, + "loss": 0.0021, + "reward": 1.8352203965187073, + "reward_std": 0.06681127846240997, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8352203369140625, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.4140625, + "epoch": 0.61083984375, + "grad_norm": 1.6245690807772388, + "kl": 0.054443359375, + "learning_rate": 8.472900390624999e-07, + "loss": 0.0022, + "reward": 1.83678537607193, + "reward_std": 0.0678851343691349, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8367853164672852, + "step": 1251 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.6015625, + "epoch": 0.611328125, + "grad_norm": 1.6847918430394793, + "kl": 0.0633544921875, + "learning_rate": 8.4716796875e-07, + "loss": 0.0025, + "reward": 1.6871824860572815, + "reward_std": 0.12895482033491135, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6949949860572815, + "step": 1252 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.8671875, + "epoch": 0.61181640625, + "grad_norm": 1.9808612056878117, + "kl": 0.047119140625, + "learning_rate": 8.470458984375e-07, + "loss": 0.0019, + "reward": 1.8585364818572998, + "reward_std": 0.05595720373094082, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8585363328456879, + "step": 1253 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.1953125, + "epoch": 0.6123046875, + "grad_norm": 1.023636015557283, + "kl": 0.0531005859375, + "learning_rate": 8.46923828125e-07, + "loss": 0.0021, + "reward": 1.7553237080574036, + "reward_std": 0.017358798999339342, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7553236782550812, + "step": 1254 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.390625, + "epoch": 0.61279296875, + "grad_norm": 2.129666626613869, + "kl": 0.0482177734375, + "learning_rate": 8.468017578125e-07, + "loss": 0.0019, + "reward": 1.830765187740326, + "reward_std": 0.06046690791845322, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8307652175426483, + "step": 1255 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.875, + "epoch": 0.61328125, + "grad_norm": 5.092569077503816, + "kl": 0.072021484375, + "learning_rate": 8.466796874999999e-07, + "loss": 0.0029, + "reward": 1.7521468997001648, + "reward_std": 0.11612342670559883, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.75995934009552, + "step": 1256 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.5859375, + "epoch": 0.61376953125, + "grad_norm": 2.542365050667952, + "kl": 0.06787109375, + "learning_rate": 8.465576171874999e-07, + "loss": 0.0027, + "reward": 1.8093348741531372, + "reward_std": 0.05134081654250622, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8093349635601044, + "step": 1257 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.34375, + "epoch": 0.6142578125, + "grad_norm": 2.0593288502085576, + "kl": 0.0782470703125, + "learning_rate": 8.46435546875e-07, + "loss": 0.0031, + "reward": 1.7897635102272034, + "reward_std": 0.08628207445144653, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7975760698318481, + "step": 1258 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.0390625, + "epoch": 0.61474609375, + "grad_norm": 1.1846536082668164, + "kl": 0.0538330078125, + "learning_rate": 8.463134765625e-07, + "loss": 0.0022, + "reward": 1.6509920954704285, + "reward_std": 0.08322879299521446, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6588045656681061, + "step": 1259 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.6796875, + "epoch": 0.615234375, + "grad_norm": 9.144569025352508, + "kl": 0.0601806640625, + "learning_rate": 8.4619140625e-07, + "loss": 0.0024, + "reward": 1.7124608755111694, + "reward_std": 0.06501621380448341, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7124608755111694, + "step": 1260 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.5546875, + "epoch": 0.61572265625, + "grad_norm": 10.55754939249481, + "kl": 0.060302734375, + "learning_rate": 8.460693359375e-07, + "loss": 0.0024, + "reward": 1.7932811379432678, + "reward_std": 0.07128015346825123, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7932811081409454, + "step": 1261 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.984375, + "epoch": 0.6162109375, + "grad_norm": 71.34719881454178, + "kl": 0.06298828125, + "learning_rate": 8.459472656249999e-07, + "loss": 0.0025, + "reward": 1.7777682542800903, + "reward_std": 0.025185417383909225, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7777682244777679, + "step": 1262 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.7578125, + "epoch": 0.61669921875, + "grad_norm": 1.978131656908925, + "kl": 0.061767578125, + "learning_rate": 8.458251953124999e-07, + "loss": 0.0025, + "reward": 1.6713601350784302, + "reward_std": 0.12586339935660362, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6869851052761078, + "step": 1263 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.6796875, + "epoch": 0.6171875, + "grad_norm": 1.5988305057821919, + "kl": 0.0751953125, + "learning_rate": 8.457031249999999e-07, + "loss": 0.003, + "reward": 1.75057852268219, + "reward_std": 0.0573820099234581, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7505785524845123, + "step": 1264 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.5546875, + "epoch": 0.61767578125, + "grad_norm": 1.4597428566631678, + "kl": 0.0667724609375, + "learning_rate": 8.455810546875e-07, + "loss": 0.0027, + "reward": 1.759526014328003, + "reward_std": 0.08681388199329376, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7673385143280029, + "step": 1265 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.53125, + "epoch": 0.6181640625, + "grad_norm": 3.422813642948489, + "kl": 0.08837890625, + "learning_rate": 8.45458984375e-07, + "loss": 0.0035, + "reward": 1.8277055025100708, + "reward_std": 0.08797085843980312, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8355179727077484, + "step": 1266 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.109375, + "epoch": 0.61865234375, + "grad_norm": 1.3015209293279653, + "kl": 0.0609130859375, + "learning_rate": 8.453369140625e-07, + "loss": 0.0024, + "reward": 1.7601238489151, + "reward_std": 0.03359607141464949, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7601238191127777, + "step": 1267 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.8828125, + "epoch": 0.619140625, + "grad_norm": 3.1082145060006856, + "kl": 0.0732421875, + "learning_rate": 8.4521484375e-07, + "loss": 0.0029, + "reward": 1.5394993424415588, + "reward_std": 0.0939161665737629, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5394993126392365, + "step": 1268 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.5078125, + "epoch": 0.61962890625, + "grad_norm": 1.281242601050256, + "kl": 0.0672607421875, + "learning_rate": 8.450927734374999e-07, + "loss": 0.0027, + "reward": 1.7604122757911682, + "reward_std": 0.08974438905715942, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7604122757911682, + "step": 1269 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.515625, + "epoch": 0.6201171875, + "grad_norm": 3.739110086504897, + "kl": 0.05712890625, + "learning_rate": 8.449707031249999e-07, + "loss": 0.0023, + "reward": 1.8326544761657715, + "reward_std": 0.05452083423733711, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8326544463634491, + "step": 1270 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.171875, + "epoch": 0.62060546875, + "grad_norm": 2.0306719948385865, + "kl": 0.06689453125, + "learning_rate": 8.448486328125e-07, + "loss": 0.0027, + "reward": 1.6453559398651123, + "reward_std": 0.06446324661374092, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6453558802604675, + "step": 1271 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.484375, + "epoch": 0.62109375, + "grad_norm": 2.743793220567594, + "kl": 0.0662841796875, + "learning_rate": 8.447265625e-07, + "loss": 0.0026, + "reward": 1.7001383304595947, + "reward_std": 0.09162449836730957, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7001383602619171, + "step": 1272 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.734375, + "epoch": 0.62158203125, + "grad_norm": 2.553544875125356, + "kl": 0.0584716796875, + "learning_rate": 8.446044921875e-07, + "loss": 0.0023, + "reward": 1.5885206460952759, + "reward_std": 0.047395724803209305, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5885206460952759, + "step": 1273 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.375, + "epoch": 0.6220703125, + "grad_norm": 0.8661435940898954, + "kl": 0.057373046875, + "learning_rate": 8.44482421875e-07, + "loss": 0.0023, + "reward": 1.8218601942062378, + "reward_std": 0.047537509351968765, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.821860134601593, + "step": 1274 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.9140625, + "epoch": 0.62255859375, + "grad_norm": 1.8229102343241281, + "kl": 0.0611572265625, + "learning_rate": 8.443603515624999e-07, + "loss": 0.0024, + "reward": 1.7554479241371155, + "reward_std": 0.06095794588327408, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7554478943347931, + "step": 1275 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.875, + "epoch": 0.623046875, + "grad_norm": 5.054715442884952, + "kl": 0.056640625, + "learning_rate": 8.442382812499999e-07, + "loss": 0.0023, + "reward": 1.698002815246582, + "reward_std": 0.1263159103691578, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.705815315246582, + "step": 1276 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.109375, + "epoch": 0.62353515625, + "grad_norm": 1.3577851009475093, + "kl": 0.05859375, + "learning_rate": 8.441162109374999e-07, + "loss": 0.0023, + "reward": 1.7819681763648987, + "reward_std": 0.05448628589510918, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7819681465625763, + "step": 1277 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.9609375, + "epoch": 0.6240234375, + "grad_norm": 1.9557945914662318, + "kl": 0.0556640625, + "learning_rate": 8.43994140625e-07, + "loss": 0.0022, + "reward": 1.5706565976142883, + "reward_std": 0.10779277980327606, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5784690678119659, + "step": 1278 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.9609375, + "epoch": 0.62451171875, + "grad_norm": 9.30380292114862, + "kl": 0.053955078125, + "learning_rate": 8.438720703125e-07, + "loss": 0.0022, + "reward": 1.780591070652008, + "reward_std": 0.047663201577961445, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7805911004543304, + "step": 1279 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.7109375, + "epoch": 0.625, + "grad_norm": 1.1932154787420253, + "kl": 0.0540771484375, + "learning_rate": 8.4375e-07, + "loss": 0.0022, + "reward": 1.7884827852249146, + "reward_std": 0.07211063336580992, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7962952554225922, + "step": 1280 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.7265625, + "epoch": 0.62548828125, + "grad_norm": 1.9144731486079674, + "kl": 0.03955078125, + "learning_rate": 8.436279296875e-07, + "loss": 0.0016, + "reward": 1.8027944564819336, + "reward_std": 0.04255840554833412, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.802794486284256, + "step": 1281 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.203125, + "epoch": 0.6259765625, + "grad_norm": 2.772928558355846, + "kl": 0.0596923828125, + "learning_rate": 8.435058593749999e-07, + "loss": 0.0024, + "reward": 1.731988787651062, + "reward_std": 0.09315790981054306, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7319887578487396, + "step": 1282 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.1953125, + "epoch": 0.62646484375, + "grad_norm": 9.787372043948942, + "kl": 0.0592041015625, + "learning_rate": 8.433837890624999e-07, + "loss": 0.0024, + "reward": 1.7648069858551025, + "reward_std": 0.03516199626028538, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7648070156574249, + "step": 1283 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.4765625, + "epoch": 0.626953125, + "grad_norm": 3.356579451338363, + "kl": 0.05126953125, + "learning_rate": 8.4326171875e-07, + "loss": 0.0021, + "reward": 1.696887195110321, + "reward_std": 0.05591726675629616, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6968871355056763, + "step": 1284 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.6015625, + "epoch": 0.62744140625, + "grad_norm": 2.549785582844497, + "kl": 0.0439453125, + "learning_rate": 8.431396484375e-07, + "loss": 0.0018, + "reward": 1.7408929467201233, + "reward_std": 0.11647412180900574, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7487053871154785, + "step": 1285 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.4609375, + "epoch": 0.6279296875, + "grad_norm": 0.9021938993838211, + "kl": 0.0540771484375, + "learning_rate": 8.43017578125e-07, + "loss": 0.0022, + "reward": 1.758631408214569, + "reward_std": 0.126407902687788, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7820688784122467, + "step": 1286 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.09375, + "epoch": 0.62841796875, + "grad_norm": 1.3267018883224448, + "kl": 0.0491943359375, + "learning_rate": 8.428955078125e-07, + "loss": 0.002, + "reward": 1.632334589958191, + "reward_std": 0.0630562799051404, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6401470899581909, + "step": 1287 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.5625, + "epoch": 0.62890625, + "grad_norm": 2.2552351150707017, + "kl": 0.048828125, + "learning_rate": 8.427734374999999e-07, + "loss": 0.002, + "reward": 1.8420506715774536, + "reward_std": 0.1419878453016281, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8498630821704865, + "step": 1288 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.1171875, + "epoch": 0.62939453125, + "grad_norm": 3.9365564848072605, + "kl": 0.0616455078125, + "learning_rate": 8.426513671874999e-07, + "loss": 0.0025, + "reward": 1.6480534076690674, + "reward_std": 0.04998471587896347, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6480533927679062, + "step": 1289 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.265625, + "epoch": 0.6298828125, + "grad_norm": 11.182204805928025, + "kl": 0.071044921875, + "learning_rate": 8.425292968749999e-07, + "loss": 0.0028, + "reward": 1.7481674551963806, + "reward_std": 0.10754155367612839, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7481675148010254, + "step": 1290 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.03125, + "epoch": 0.63037109375, + "grad_norm": 9.473971973141426, + "kl": 0.0615234375, + "learning_rate": 8.424072265625e-07, + "loss": 0.0025, + "reward": 1.7282034158706665, + "reward_std": 0.08554265275597572, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7282033860683441, + "step": 1291 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.9609375, + "epoch": 0.630859375, + "grad_norm": 1.5297115240623664, + "kl": 0.054931640625, + "learning_rate": 8.4228515625e-07, + "loss": 0.0022, + "reward": 1.7187672853469849, + "reward_std": 0.06834917794913054, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7265797853469849, + "step": 1292 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.2890625, + "epoch": 0.63134765625, + "grad_norm": 1.0070423771790409, + "kl": 0.048095703125, + "learning_rate": 8.421630859375e-07, + "loss": 0.0019, + "reward": 1.7618906497955322, + "reward_std": 0.0947088971734047, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7697031795978546, + "step": 1293 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.09375, + "epoch": 0.6318359375, + "grad_norm": 2.568966029071052, + "kl": 0.056396484375, + "learning_rate": 8.42041015625e-07, + "loss": 0.0023, + "reward": 1.6242307424545288, + "reward_std": 0.21024633944034576, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.663293182849884, + "step": 1294 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.21875, + "epoch": 0.63232421875, + "grad_norm": 1.9848309178933314, + "kl": 0.049560546875, + "learning_rate": 8.419189453124999e-07, + "loss": 0.002, + "reward": 1.7481633424758911, + "reward_std": 0.12243251502513885, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7716008424758911, + "step": 1295 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.59375, + "epoch": 0.6328125, + "grad_norm": 0.8761196952724171, + "kl": 0.0399169921875, + "learning_rate": 8.417968749999999e-07, + "loss": 0.0016, + "reward": 1.7893099188804626, + "reward_std": 0.05286476016044617, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7893097996711731, + "step": 1296 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.4296875, + "epoch": 0.63330078125, + "grad_norm": 1.1095061436724811, + "kl": 0.0584716796875, + "learning_rate": 8.416748046875e-07, + "loss": 0.0023, + "reward": 1.7433611750602722, + "reward_std": 0.061368606984615326, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7433610558509827, + "step": 1297 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.3125, + "epoch": 0.6337890625, + "grad_norm": 2.146602454130791, + "kl": 0.055419921875, + "learning_rate": 8.41552734375e-07, + "loss": 0.0022, + "reward": 1.7003250122070312, + "reward_std": 0.08811355289071798, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7081375122070312, + "step": 1298 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.0859375, + "epoch": 0.63427734375, + "grad_norm": 1.165844265780615, + "kl": 0.05322265625, + "learning_rate": 8.414306640625e-07, + "loss": 0.0021, + "reward": 1.6436746716499329, + "reward_std": 0.0825019795447588, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6436747312545776, + "step": 1299 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.765625, + "epoch": 0.634765625, + "grad_norm": 2.5464365422498356, + "kl": 0.0504150390625, + "learning_rate": 8.4130859375e-07, + "loss": 0.002, + "reward": 1.7102563381195068, + "reward_std": 0.0774708678945899, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7180688977241516, + "step": 1300 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.3984375, + "epoch": 0.63525390625, + "grad_norm": 2.7704272264732666, + "kl": 0.0579833984375, + "learning_rate": 8.411865234374999e-07, + "loss": 0.0023, + "reward": 1.7323628664016724, + "reward_std": 0.08244866505265236, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7323628962039948, + "step": 1301 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.9296875, + "epoch": 0.6357421875, + "grad_norm": 1.5304947901397634, + "kl": 0.0517578125, + "learning_rate": 8.410644531249999e-07, + "loss": 0.0021, + "reward": 1.77633798122406, + "reward_std": 0.09737828373908997, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7841504514217377, + "step": 1302 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.390625, + "epoch": 0.63623046875, + "grad_norm": 2.9720279627250954, + "kl": 0.0599365234375, + "learning_rate": 8.409423828124999e-07, + "loss": 0.0024, + "reward": 1.7398386597633362, + "reward_std": 0.06009085476398468, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7398386597633362, + "step": 1303 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.8359375, + "epoch": 0.63671875, + "grad_norm": 1.4673255697862866, + "kl": 0.06982421875, + "learning_rate": 8.408203125e-07, + "loss": 0.0028, + "reward": 1.6804990768432617, + "reward_std": 0.05310311168432236, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6804989874362946, + "step": 1304 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.875, + "epoch": 0.63720703125, + "grad_norm": 1.4253383817755978, + "kl": 0.057373046875, + "learning_rate": 8.406982421875e-07, + "loss": 0.0023, + "reward": 1.830526053905487, + "reward_std": 0.05299729108810425, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8305260539054871, + "step": 1305 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.9140625, + "epoch": 0.6376953125, + "grad_norm": 2.353756599931053, + "kl": 0.047607421875, + "learning_rate": 8.40576171875e-07, + "loss": 0.0019, + "reward": 1.719884991645813, + "reward_std": 0.11661730334162712, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.727697491645813, + "step": 1306 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.5703125, + "epoch": 0.63818359375, + "grad_norm": 1.3478820253260286, + "kl": 0.0499267578125, + "learning_rate": 8.404541015625e-07, + "loss": 0.002, + "reward": 1.6699483394622803, + "reward_std": 0.1071729026734829, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6777608394622803, + "step": 1307 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.5546875, + "epoch": 0.638671875, + "grad_norm": 1.8256350632514393, + "kl": 0.06689453125, + "learning_rate": 8.403320312499999e-07, + "loss": 0.0027, + "reward": 1.8012477159500122, + "reward_std": 0.0809515118598938, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8012477159500122, + "step": 1308 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.3359375, + "epoch": 0.63916015625, + "grad_norm": 4.051138050422444, + "kl": 0.0504150390625, + "learning_rate": 8.402099609374999e-07, + "loss": 0.002, + "reward": 1.8088411688804626, + "reward_std": 0.07655365020036697, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8088411688804626, + "step": 1309 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.5703125, + "epoch": 0.6396484375, + "grad_norm": 5.526840892766896, + "kl": 0.05615234375, + "learning_rate": 8.40087890625e-07, + "loss": 0.0022, + "reward": 1.7295674085617065, + "reward_std": 0.06708750128746033, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7295673787593842, + "step": 1310 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.0078125, + "epoch": 0.64013671875, + "grad_norm": 1.608420576473483, + "kl": 0.050537109375, + "learning_rate": 8.399658203125e-07, + "loss": 0.002, + "reward": 1.7628620862960815, + "reward_std": 0.0696718655526638, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7706745862960815, + "step": 1311 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.0, + "epoch": 0.640625, + "grad_norm": 1.5968411371770552, + "kl": 0.057373046875, + "learning_rate": 8.3984375e-07, + "loss": 0.0023, + "reward": 1.652459740638733, + "reward_std": 0.06461456045508385, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6524598002433777, + "step": 1312 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.1875, + "epoch": 0.64111328125, + "grad_norm": 1.3200737326998475, + "kl": 0.054443359375, + "learning_rate": 8.397216796875e-07, + "loss": 0.0022, + "reward": 1.6885486841201782, + "reward_std": 0.08856038376688957, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6963611841201782, + "step": 1313 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.09375, + "epoch": 0.6416015625, + "grad_norm": 3.6728233406933257, + "kl": 0.068603515625, + "learning_rate": 8.395996093749999e-07, + "loss": 0.0027, + "reward": 1.6466514468193054, + "reward_std": 0.0514130312949419, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6544639468193054, + "step": 1314 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.0390625, + "epoch": 0.64208984375, + "grad_norm": 0.8191385325359082, + "kl": 0.0511474609375, + "learning_rate": 8.394775390624999e-07, + "loss": 0.002, + "reward": 1.7703859210014343, + "reward_std": 0.0926944687962532, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7860109210014343, + "step": 1315 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.75, + "epoch": 0.642578125, + "grad_norm": 3.0861231360855617, + "kl": 0.0599365234375, + "learning_rate": 8.393554687499999e-07, + "loss": 0.0024, + "reward": 1.6689003109931946, + "reward_std": 0.07885997742414474, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6689003109931946, + "step": 1316 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.296875, + "epoch": 0.64306640625, + "grad_norm": 1.8166865063096989, + "kl": 0.0635986328125, + "learning_rate": 8.392333984375e-07, + "loss": 0.0025, + "reward": 1.6987890005111694, + "reward_std": 0.06034014839679003, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6987889111042023, + "step": 1317 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.046875, + "epoch": 0.6435546875, + "grad_norm": 1.4887931508827008, + "kl": 0.0628662109375, + "learning_rate": 8.39111328125e-07, + "loss": 0.0025, + "reward": 1.7936866283416748, + "reward_std": 0.04510762542486191, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7936865985393524, + "step": 1318 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.3671875, + "epoch": 0.64404296875, + "grad_norm": 1.2967345838531843, + "kl": 0.050537109375, + "learning_rate": 8.389892578125e-07, + "loss": 0.002, + "reward": 1.83639657497406, + "reward_std": 0.06598273664712906, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8363966047763824, + "step": 1319 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.5625, + "epoch": 0.64453125, + "grad_norm": 1.8104292750631013, + "kl": 0.067626953125, + "learning_rate": 8.388671875e-07, + "loss": 0.0027, + "reward": 1.7048075199127197, + "reward_std": 0.05657285824418068, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.704807460308075, + "step": 1320 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.359375, + "epoch": 0.64501953125, + "grad_norm": 2.7804914867197734, + "kl": 0.0479736328125, + "learning_rate": 8.387451171874999e-07, + "loss": 0.0019, + "reward": 1.791284441947937, + "reward_std": 0.04943067207932472, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.791284441947937, + "step": 1321 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.2578125, + "epoch": 0.6455078125, + "grad_norm": 1.5521266813840282, + "kl": 0.0557861328125, + "learning_rate": 8.386230468749999e-07, + "loss": 0.0022, + "reward": 1.6612219214439392, + "reward_std": 0.06563910469412804, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6612218618392944, + "step": 1322 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.65625, + "epoch": 0.64599609375, + "grad_norm": 1.6124206023884073, + "kl": 0.056396484375, + "learning_rate": 8.385009765625e-07, + "loss": 0.0023, + "reward": 1.753632366657257, + "reward_std": 0.051005132496356964, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7536323666572571, + "step": 1323 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.9765625, + "epoch": 0.646484375, + "grad_norm": 2.247706598191221, + "kl": 0.054443359375, + "learning_rate": 8.3837890625e-07, + "loss": 0.0022, + "reward": 1.7354570031166077, + "reward_std": 0.16325188055634499, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7510820627212524, + "step": 1324 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.796875, + "epoch": 0.64697265625, + "grad_norm": 11.389486659767021, + "kl": 0.0579833984375, + "learning_rate": 8.382568359375e-07, + "loss": 0.0023, + "reward": 1.7502111196517944, + "reward_std": 0.0659194141626358, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7502111196517944, + "step": 1325 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.5390625, + "epoch": 0.6474609375, + "grad_norm": 1.2834360713798052, + "kl": 0.0615234375, + "learning_rate": 8.38134765625e-07, + "loss": 0.0025, + "reward": 1.69877290725708, + "reward_std": 0.052922509610652924, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6987729072570801, + "step": 1326 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.453125, + "epoch": 0.64794921875, + "grad_norm": 1.2944757450752098, + "kl": 0.058837890625, + "learning_rate": 8.380126953125e-07, + "loss": 0.0024, + "reward": 1.7563305497169495, + "reward_std": 0.08665376901626587, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7563305497169495, + "step": 1327 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.6796875, + "epoch": 0.6484375, + "grad_norm": 6.608818467370714, + "kl": 0.0654296875, + "learning_rate": 8.378906249999999e-07, + "loss": 0.0026, + "reward": 1.6104682683944702, + "reward_std": 0.08941986411809921, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6104682385921478, + "step": 1328 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.015625, + "epoch": 0.64892578125, + "grad_norm": 1.7084744934823917, + "kl": 0.06494140625, + "learning_rate": 8.377685546874999e-07, + "loss": 0.0026, + "reward": 1.7628703117370605, + "reward_std": 0.044559099711477757, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7628703117370605, + "step": 1329 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.265625, + "epoch": 0.6494140625, + "grad_norm": 1.0774403862289603, + "kl": 0.054443359375, + "learning_rate": 8.37646484375e-07, + "loss": 0.0022, + "reward": 1.8073540329933167, + "reward_std": 0.07767094019800425, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8151665329933167, + "step": 1330 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.7421875, + "epoch": 0.64990234375, + "grad_norm": 1.8398660445850628, + "kl": 0.0673828125, + "learning_rate": 8.375244140625e-07, + "loss": 0.0027, + "reward": 1.6594313383102417, + "reward_std": 0.10600551217794418, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6672438383102417, + "step": 1331 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.0859375, + "epoch": 0.650390625, + "grad_norm": 3.934324761283697, + "kl": 0.08056640625, + "learning_rate": 8.3740234375e-07, + "loss": 0.0032, + "reward": 1.6837170720100403, + "reward_std": 0.029794931411743164, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6837171018123627, + "step": 1332 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.984375, + "epoch": 0.65087890625, + "grad_norm": 13.797314264854801, + "kl": 0.062744140625, + "learning_rate": 8.372802734375e-07, + "loss": 0.0025, + "reward": 1.8004740476608276, + "reward_std": 0.053687095642089844, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8004740178585052, + "step": 1333 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.578125, + "epoch": 0.6513671875, + "grad_norm": 1.1330439181277785, + "kl": 0.04736328125, + "learning_rate": 8.371582031249999e-07, + "loss": 0.0019, + "reward": 1.6666799783706665, + "reward_std": 0.10937470942735672, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6744924187660217, + "step": 1334 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.859375, + "epoch": 0.65185546875, + "grad_norm": 1.4553589678453058, + "kl": 0.0582275390625, + "learning_rate": 8.370361328124999e-07, + "loss": 0.0023, + "reward": 1.6906811594963074, + "reward_std": 0.04443385824561119, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6906810998916626, + "step": 1335 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.890625, + "epoch": 0.65234375, + "grad_norm": 1.302159303083921, + "kl": 0.048095703125, + "learning_rate": 8.369140625e-07, + "loss": 0.0019, + "reward": 1.7355791926383972, + "reward_std": 0.1092800498008728, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7512041926383972, + "step": 1336 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.7109375, + "epoch": 0.65283203125, + "grad_norm": 17.77091921210839, + "kl": 0.0572509765625, + "learning_rate": 8.367919921875e-07, + "loss": 0.0023, + "reward": 1.644793450832367, + "reward_std": 0.09384549781680107, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6526058912277222, + "step": 1337 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.3046875, + "epoch": 0.6533203125, + "grad_norm": 0.9214814360203658, + "kl": 0.0531005859375, + "learning_rate": 8.36669921875e-07, + "loss": 0.0021, + "reward": 1.7342381477355957, + "reward_std": 0.03282667603343725, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7342380881309509, + "step": 1338 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.3359375, + "epoch": 0.65380859375, + "grad_norm": 1.2792896824125273, + "kl": 0.0487060546875, + "learning_rate": 8.365478515625e-07, + "loss": 0.0019, + "reward": 1.8167948126792908, + "reward_std": 0.05022166669368744, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8167948722839355, + "step": 1339 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.703125, + "epoch": 0.654296875, + "grad_norm": 2.0308611626495954, + "kl": 0.05078125, + "learning_rate": 8.3642578125e-07, + "loss": 0.002, + "reward": 1.793544888496399, + "reward_std": 0.05571754090487957, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7935448884963989, + "step": 1340 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.4609375, + "epoch": 0.65478515625, + "grad_norm": 2.8603926926077285, + "kl": 0.0635986328125, + "learning_rate": 8.363037109374999e-07, + "loss": 0.0025, + "reward": 1.7558925151824951, + "reward_std": 0.09720181487500668, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7558925449848175, + "step": 1341 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.359375, + "epoch": 0.6552734375, + "grad_norm": 1.7594024320160508, + "kl": 0.049072265625, + "learning_rate": 8.361816406249999e-07, + "loss": 0.002, + "reward": 1.7895704507827759, + "reward_std": 0.060751235112547874, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7895704209804535, + "step": 1342 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.984375, + "epoch": 0.65576171875, + "grad_norm": 2.025592104259594, + "kl": 0.0482177734375, + "learning_rate": 8.360595703125e-07, + "loss": 0.0019, + "reward": 1.7954835891723633, + "reward_std": 0.11320845782756805, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8032960891723633, + "step": 1343 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.828125, + "epoch": 0.65625, + "grad_norm": 2.5560286885728267, + "kl": 0.05908203125, + "learning_rate": 8.359375e-07, + "loss": 0.0024, + "reward": 1.6292362213134766, + "reward_std": 0.13106617331504822, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.637048751115799, + "step": 1344 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.296875, + "epoch": 0.65673828125, + "grad_norm": 2.542594283261598, + "kl": 0.0533447265625, + "learning_rate": 8.358154296875e-07, + "loss": 0.0021, + "reward": 1.7747212648391724, + "reward_std": 0.08434459567070007, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7747212648391724, + "step": 1345 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.21875, + "epoch": 0.6572265625, + "grad_norm": 1.7311059727406346, + "kl": 0.056396484375, + "learning_rate": 8.35693359375e-07, + "loss": 0.0023, + "reward": 1.7278847694396973, + "reward_std": 0.06932184100151062, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7278847694396973, + "step": 1346 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.7265625, + "epoch": 0.65771484375, + "grad_norm": 0.8011187117027543, + "kl": 0.0498046875, + "learning_rate": 8.355712890624999e-07, + "loss": 0.002, + "reward": 1.7831463813781738, + "reward_std": 0.018837594892829657, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7831464111804962, + "step": 1347 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.265625, + "epoch": 0.658203125, + "grad_norm": 1.535578949505016, + "kl": 0.0528564453125, + "learning_rate": 8.354492187499999e-07, + "loss": 0.0021, + "reward": 1.7664051055908203, + "reward_std": 0.0737844929099083, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7664050757884979, + "step": 1348 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.421875, + "epoch": 0.65869140625, + "grad_norm": 2.622932104029845, + "kl": 0.0594482421875, + "learning_rate": 8.353271484374999e-07, + "loss": 0.0024, + "reward": 1.6524591445922852, + "reward_std": 0.0812476146966219, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6524590253829956, + "step": 1349 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.984375, + "epoch": 0.6591796875, + "grad_norm": 1.1970666562029746, + "kl": 0.060302734375, + "learning_rate": 8.35205078125e-07, + "loss": 0.0024, + "reward": 1.6671356558799744, + "reward_std": 0.09243928454816341, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6827606558799744, + "step": 1350 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.109375, + "epoch": 0.65966796875, + "grad_norm": 2.009394767469386, + "kl": 0.0521240234375, + "learning_rate": 8.350830078125e-07, + "loss": 0.0021, + "reward": 1.629963755607605, + "reward_std": 0.10920102149248123, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.637776255607605, + "step": 1351 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.5859375, + "epoch": 0.66015625, + "grad_norm": 2.3653616861770193, + "kl": 0.0565185546875, + "learning_rate": 8.349609375e-07, + "loss": 0.0023, + "reward": 1.7094528079032898, + "reward_std": 0.07993777468800545, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7172653079032898, + "step": 1352 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.7265625, + "epoch": 0.66064453125, + "grad_norm": 1.03341094245473, + "kl": 0.04931640625, + "learning_rate": 8.348388671875e-07, + "loss": 0.002, + "reward": 1.8251619338989258, + "reward_std": 0.03396361041814089, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8251619935035706, + "step": 1353 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.6953125, + "epoch": 0.6611328125, + "grad_norm": 3.4116771293429715, + "kl": 0.06494140625, + "learning_rate": 8.347167968749999e-07, + "loss": 0.0026, + "reward": 1.6079095005989075, + "reward_std": 0.1288512572646141, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6157219111919403, + "step": 1354 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.8203125, + "epoch": 0.66162109375, + "grad_norm": 1.5812188323632634, + "kl": 0.04443359375, + "learning_rate": 8.345947265624999e-07, + "loss": 0.0018, + "reward": 1.760659396648407, + "reward_std": 0.10924211144447327, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7684718370437622, + "step": 1355 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.6015625, + "epoch": 0.662109375, + "grad_norm": 1.3464559463319818, + "kl": 0.0501708984375, + "learning_rate": 8.3447265625e-07, + "loss": 0.002, + "reward": 1.6251134872436523, + "reward_std": 0.19322798028588295, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.65636345744133, + "step": 1356 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.7734375, + "epoch": 0.66259765625, + "grad_norm": 2.458918621457383, + "kl": 0.0506591796875, + "learning_rate": 8.343505859375e-07, + "loss": 0.002, + "reward": 1.6254653930664062, + "reward_std": 0.20456621050834656, + "rewards/format_reward": 0.90625, + "rewards/ocr_reward": 0.719215452671051, + "step": 1357 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.5625, + "epoch": 0.6630859375, + "grad_norm": 1.2995869911698037, + "kl": 0.0513916015625, + "learning_rate": 8.34228515625e-07, + "loss": 0.0021, + "reward": 1.7724117040634155, + "reward_std": 0.05441422015428543, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7724116742610931, + "step": 1358 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.9765625, + "epoch": 0.66357421875, + "grad_norm": 3.3800856344759063, + "kl": 0.055908203125, + "learning_rate": 8.341064453125e-07, + "loss": 0.0022, + "reward": 1.7442671656608582, + "reward_std": 0.14173447713255882, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7677046656608582, + "step": 1359 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.1484375, + "epoch": 0.6640625, + "grad_norm": 2.559332784444127, + "kl": 0.0445556640625, + "learning_rate": 8.339843749999999e-07, + "loss": 0.0018, + "reward": 1.716669738292694, + "reward_std": 0.11901552230119705, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7322947978973389, + "step": 1360 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.25, + "epoch": 0.66455078125, + "grad_norm": 1.4593259935506697, + "kl": 0.05419921875, + "learning_rate": 8.338623046874999e-07, + "loss": 0.0022, + "reward": 1.7508392333984375, + "reward_std": 0.02747677080333233, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7508392930030823, + "step": 1361 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.84375, + "epoch": 0.6650390625, + "grad_norm": 1.2002979458589371, + "kl": 0.0438232421875, + "learning_rate": 8.337402343749999e-07, + "loss": 0.0018, + "reward": 1.81014883518219, + "reward_std": 0.036064352840185165, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8101487755775452, + "step": 1362 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.1875, + "epoch": 0.66552734375, + "grad_norm": 2.6295827835981287, + "kl": 0.07568359375, + "learning_rate": 8.336181640625e-07, + "loss": 0.003, + "reward": 1.706138789653778, + "reward_std": 0.1014098059386015, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7217637896537781, + "step": 1363 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.4296875, + "epoch": 0.666015625, + "grad_norm": 3.4631557074514765, + "kl": 0.0574951171875, + "learning_rate": 8.3349609375e-07, + "loss": 0.0023, + "reward": 1.6869778037071228, + "reward_std": 0.1357739120721817, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6869778335094452, + "step": 1364 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.296875, + "epoch": 0.66650390625, + "grad_norm": 1.5320217928751798, + "kl": 0.044677734375, + "learning_rate": 8.333740234375e-07, + "loss": 0.0018, + "reward": 1.6233786344528198, + "reward_std": 0.08880849182605743, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6780661046504974, + "step": 1365 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.3984375, + "epoch": 0.6669921875, + "grad_norm": 3.0485251313362323, + "kl": 0.049072265625, + "learning_rate": 8.33251953125e-07, + "loss": 0.002, + "reward": 1.7159647345542908, + "reward_std": 0.04883173480629921, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7159647643566132, + "step": 1366 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.9375, + "epoch": 0.66748046875, + "grad_norm": 3.6031654217252314, + "kl": 0.0576171875, + "learning_rate": 8.331298828124999e-07, + "loss": 0.0023, + "reward": 1.7141339778900146, + "reward_std": 0.1232108511030674, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7297589182853699, + "step": 1367 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.6484375, + "epoch": 0.66796875, + "grad_norm": 2.3747920815803014, + "kl": 0.0528564453125, + "learning_rate": 8.330078124999999e-07, + "loss": 0.0021, + "reward": 1.63826584815979, + "reward_std": 0.07934099994599819, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6460783183574677, + "step": 1368 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.859375, + "epoch": 0.66845703125, + "grad_norm": 1.0962500762337954, + "kl": 0.05908203125, + "learning_rate": 8.328857421875e-07, + "loss": 0.0024, + "reward": 1.6661372780799866, + "reward_std": 0.14820329658687115, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.697387307882309, + "step": 1369 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.5078125, + "epoch": 0.6689453125, + "grad_norm": 12.019208496709846, + "kl": 0.053466796875, + "learning_rate": 8.32763671875e-07, + "loss": 0.0021, + "reward": 1.6565269231796265, + "reward_std": 0.11510607227683067, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6721519827842712, + "step": 1370 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.03125, + "epoch": 0.66943359375, + "grad_norm": 4.559375150848394, + "kl": 0.0618896484375, + "learning_rate": 8.326416015625e-07, + "loss": 0.0025, + "reward": 1.6015617847442627, + "reward_std": 0.11551137268543243, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6249993145465851, + "step": 1371 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.6328125, + "epoch": 0.669921875, + "grad_norm": 1.3152796356639234, + "kl": 0.0606689453125, + "learning_rate": 8.3251953125e-07, + "loss": 0.0024, + "reward": 1.7060803174972534, + "reward_std": 0.08381591830402613, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7217053174972534, + "step": 1372 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.1796875, + "epoch": 0.67041015625, + "grad_norm": 2.0534328187220456, + "kl": 0.0609130859375, + "learning_rate": 8.323974609374999e-07, + "loss": 0.0024, + "reward": 1.726146936416626, + "reward_std": 0.033384598791599274, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.726146936416626, + "step": 1373 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.5078125, + "epoch": 0.6708984375, + "grad_norm": 0.6956670519280784, + "kl": 0.0565185546875, + "learning_rate": 8.322753906249999e-07, + "loss": 0.0023, + "reward": 1.674700915813446, + "reward_std": 0.08457869663834572, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.698138415813446, + "step": 1374 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.4375, + "epoch": 0.67138671875, + "grad_norm": 1.8185143553728975, + "kl": 0.0467529296875, + "learning_rate": 8.321533203124999e-07, + "loss": 0.0019, + "reward": 1.8650157451629639, + "reward_std": 0.054776063188910484, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8650156855583191, + "step": 1375 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.65625, + "epoch": 0.671875, + "grad_norm": 0.7337982459358227, + "kl": 0.03857421875, + "learning_rate": 8.3203125e-07, + "loss": 0.0015, + "reward": 1.7965154647827148, + "reward_std": 0.038863107562065125, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7965154945850372, + "step": 1376 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.0, + "epoch": 0.67236328125, + "grad_norm": 0.940234983069783, + "kl": 0.0484619140625, + "learning_rate": 8.319091796875e-07, + "loss": 0.0019, + "reward": 1.7178753018379211, + "reward_std": 0.03893340937793255, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7178753018379211, + "step": 1377 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.5625, + "epoch": 0.6728515625, + "grad_norm": 3.0895577377191903, + "kl": 0.0479736328125, + "learning_rate": 8.31787109375e-07, + "loss": 0.0019, + "reward": 1.6909393668174744, + "reward_std": 0.03531087189912796, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6909393668174744, + "step": 1378 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.78125, + "epoch": 0.67333984375, + "grad_norm": 5.314939113631747, + "kl": 0.0555419921875, + "learning_rate": 8.316650390625e-07, + "loss": 0.0022, + "reward": 1.8328039646148682, + "reward_std": 0.18444261699914932, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8406165242195129, + "step": 1379 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.2578125, + "epoch": 0.673828125, + "grad_norm": 2.773926019270338, + "kl": 0.0574951171875, + "learning_rate": 8.315429687499999e-07, + "loss": 0.0023, + "reward": 1.754637897014618, + "reward_std": 0.07902231067419052, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7546378672122955, + "step": 1380 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.3125, + "epoch": 0.67431640625, + "grad_norm": 2.5991565514669133, + "kl": 0.05126953125, + "learning_rate": 8.314208984374999e-07, + "loss": 0.002, + "reward": 1.697754681110382, + "reward_std": 0.10455015674233437, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7055672109127045, + "step": 1381 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.9609375, + "epoch": 0.6748046875, + "grad_norm": 1.523278355501712, + "kl": 0.0596923828125, + "learning_rate": 8.31298828125e-07, + "loss": 0.0024, + "reward": 1.7877840995788574, + "reward_std": 0.10126758366823196, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7877840399742126, + "step": 1382 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.90625, + "epoch": 0.67529296875, + "grad_norm": 1.7215147395484285, + "kl": 0.0465087890625, + "learning_rate": 8.311767578125e-07, + "loss": 0.0019, + "reward": 1.7264689207077026, + "reward_std": 0.0403452143073082, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.726468950510025, + "step": 1383 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.3125, + "epoch": 0.67578125, + "grad_norm": 8.257297851754577, + "kl": 0.054931640625, + "learning_rate": 8.310546875e-07, + "loss": 0.0022, + "reward": 1.7433820962905884, + "reward_std": 0.18031665682792664, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.774632066488266, + "step": 1384 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.0859375, + "epoch": 0.67626953125, + "grad_norm": 1.0654545701165232, + "kl": 0.0400390625, + "learning_rate": 8.309326171875e-07, + "loss": 0.0016, + "reward": 1.7454291582107544, + "reward_std": 0.0993618592619896, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7610541582107544, + "step": 1385 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.7109375, + "epoch": 0.6767578125, + "grad_norm": 1.040028404639345, + "kl": 0.0380859375, + "learning_rate": 8.308105468749999e-07, + "loss": 0.0015, + "reward": 1.760416865348816, + "reward_std": 0.0644846223294735, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7604168355464935, + "step": 1386 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.84375, + "epoch": 0.67724609375, + "grad_norm": 3.9520714635227354, + "kl": 0.05078125, + "learning_rate": 8.306884765624999e-07, + "loss": 0.002, + "reward": 1.7678569555282593, + "reward_std": 0.04989023134112358, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.767857015132904, + "step": 1387 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.5703125, + "epoch": 0.677734375, + "grad_norm": 2.9876370676551103, + "kl": 0.0631103515625, + "learning_rate": 8.305664062499999e-07, + "loss": 0.0025, + "reward": 1.741922914981842, + "reward_std": 0.04209707863628864, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.741922914981842, + "step": 1388 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.703125, + "epoch": 0.67822265625, + "grad_norm": 2.5716646270109456, + "kl": 0.0513916015625, + "learning_rate": 8.304443359375e-07, + "loss": 0.0021, + "reward": 1.6752015352249146, + "reward_std": 0.0383878406137228, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6752015054225922, + "step": 1389 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.4609375, + "epoch": 0.6787109375, + "grad_norm": 3.5362926281368234, + "kl": 0.046142578125, + "learning_rate": 8.30322265625e-07, + "loss": 0.0018, + "reward": 1.8196292519569397, + "reward_std": 0.11842495948076248, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8274418413639069, + "step": 1390 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.296875, + "epoch": 0.67919921875, + "grad_norm": 1.7884111867788441, + "kl": 0.050537109375, + "learning_rate": 8.302001953125e-07, + "loss": 0.002, + "reward": 1.6618223786354065, + "reward_std": 0.07712319865822792, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6618224382400513, + "step": 1391 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.703125, + "epoch": 0.6796875, + "grad_norm": 2.6027744639630392, + "kl": 0.0611572265625, + "learning_rate": 8.30078125e-07, + "loss": 0.0024, + "reward": 1.5982427597045898, + "reward_std": 0.08532186597585678, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5982428193092346, + "step": 1392 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.203125, + "epoch": 0.68017578125, + "grad_norm": 5.619667205772122, + "kl": 0.0548095703125, + "learning_rate": 8.299560546874999e-07, + "loss": 0.0022, + "reward": 1.825063407421112, + "reward_std": 0.07088461332023144, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8328758478164673, + "step": 1393 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.09375, + "epoch": 0.6806640625, + "grad_norm": 4.094562358242973, + "kl": 0.0579833984375, + "learning_rate": 8.298339843749999e-07, + "loss": 0.0023, + "reward": 1.619062602519989, + "reward_std": 0.0742390900850296, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.619062602519989, + "step": 1394 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.1875, + "epoch": 0.68115234375, + "grad_norm": 1.3270748091394964, + "kl": 0.05126953125, + "learning_rate": 8.297119140625e-07, + "loss": 0.0021, + "reward": 1.7332074046134949, + "reward_std": 0.08660921268165112, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7488323748111725, + "step": 1395 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.9765625, + "epoch": 0.681640625, + "grad_norm": 2.4733721239899285, + "kl": 0.064453125, + "learning_rate": 8.2958984375e-07, + "loss": 0.0026, + "reward": 1.7022396922111511, + "reward_std": 0.07674708962440491, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7022397220134735, + "step": 1396 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.8203125, + "epoch": 0.68212890625, + "grad_norm": 11.678019935504977, + "kl": 0.0526123046875, + "learning_rate": 8.294677734375e-07, + "loss": 0.0021, + "reward": 1.7020042538642883, + "reward_std": 0.0902528464794159, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7020042836666107, + "step": 1397 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.46875, + "epoch": 0.6826171875, + "grad_norm": 1.9958555775047342, + "kl": 0.0479736328125, + "learning_rate": 8.29345703125e-07, + "loss": 0.0019, + "reward": 1.612401008605957, + "reward_std": 0.11361010372638702, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6280259191989899, + "step": 1398 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.4609375, + "epoch": 0.68310546875, + "grad_norm": 5.611286013828831, + "kl": 0.067626953125, + "learning_rate": 8.292236328124999e-07, + "loss": 0.0027, + "reward": 1.6931315064430237, + "reward_std": 0.0486298855394125, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6931315362453461, + "step": 1399 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.75, + "epoch": 0.68359375, + "grad_norm": 64.51783733112462, + "kl": 0.068115234375, + "learning_rate": 8.291015624999999e-07, + "loss": 0.0027, + "reward": 1.7985500693321228, + "reward_std": 0.1456453576683998, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8219876885414124, + "step": 1400 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.5625, + "epoch": 0.68408203125, + "grad_norm": 1.4134996928177288, + "kl": 0.0511474609375, + "learning_rate": 8.289794921874999e-07, + "loss": 0.002, + "reward": 1.7496825456619263, + "reward_std": 0.063983004540205, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7574950754642487, + "step": 1401 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.15625, + "epoch": 0.6845703125, + "grad_norm": 2.2409064158617453, + "kl": 0.0560302734375, + "learning_rate": 8.28857421875e-07, + "loss": 0.0022, + "reward": 1.5890800952911377, + "reward_std": 0.07782717980444431, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5968926250934601, + "step": 1402 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.0078125, + "epoch": 0.68505859375, + "grad_norm": 1.0794027539847428, + "kl": 0.0517578125, + "learning_rate": 8.287353515625e-07, + "loss": 0.0021, + "reward": 1.7144798636436462, + "reward_std": 0.08601437509059906, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7144798934459686, + "step": 1403 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.0546875, + "epoch": 0.685546875, + "grad_norm": 3.5803094144613534, + "kl": 0.06982421875, + "learning_rate": 8.2861328125e-07, + "loss": 0.0028, + "reward": 1.7153080701828003, + "reward_std": 0.12694889307022095, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7231204807758331, + "step": 1404 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.25, + "epoch": 0.68603515625, + "grad_norm": 2.24836561629631, + "kl": 0.07275390625, + "learning_rate": 8.284912109375e-07, + "loss": 0.0029, + "reward": 1.707879662513733, + "reward_std": 0.07356316037476063, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7078795731067657, + "step": 1405 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.0625, + "epoch": 0.6865234375, + "grad_norm": 5.0710615285595075, + "kl": 0.0543212890625, + "learning_rate": 8.283691406249999e-07, + "loss": 0.0022, + "reward": 1.8069834113121033, + "reward_std": 0.08053146488964558, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8069833815097809, + "step": 1406 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.1484375, + "epoch": 0.68701171875, + "grad_norm": 1.4195625786352468, + "kl": 0.0478515625, + "learning_rate": 8.282470703124999e-07, + "loss": 0.0019, + "reward": 1.8579445481300354, + "reward_std": 0.06742975115776062, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8657570779323578, + "step": 1407 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.71875, + "epoch": 0.6875, + "grad_norm": 1.2744757065801622, + "kl": 0.049560546875, + "learning_rate": 8.28125e-07, + "loss": 0.002, + "reward": 1.8057802319526672, + "reward_std": 0.11631088703870773, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8214052319526672, + "step": 1408 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.703125, + "epoch": 0.68798828125, + "grad_norm": 1.8422456478028055, + "kl": 0.064697265625, + "learning_rate": 8.280029296875e-07, + "loss": 0.0026, + "reward": 1.7374829053878784, + "reward_std": 0.10645648092031479, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7452954351902008, + "step": 1409 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.625, + "epoch": 0.6884765625, + "grad_norm": 1.4604901987015795, + "kl": 0.0531005859375, + "learning_rate": 8.27880859375e-07, + "loss": 0.0021, + "reward": 1.8335306644439697, + "reward_std": 0.03559792507439852, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8335305750370026, + "step": 1410 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.5625, + "epoch": 0.68896484375, + "grad_norm": 2.0010636772477475, + "kl": 0.0567626953125, + "learning_rate": 8.277587890625e-07, + "loss": 0.0023, + "reward": 1.7226684093475342, + "reward_std": 0.0325869033113122, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7226683795452118, + "step": 1411 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.5390625, + "epoch": 0.689453125, + "grad_norm": 4.139675114890247, + "kl": 0.0548095703125, + "learning_rate": 8.2763671875e-07, + "loss": 0.0022, + "reward": 1.688076138496399, + "reward_std": 0.09778516367077827, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6880761086940765, + "step": 1412 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.7890625, + "epoch": 0.68994140625, + "grad_norm": 1.0825288165225206, + "kl": 0.0504150390625, + "learning_rate": 8.275146484374999e-07, + "loss": 0.002, + "reward": 1.7078955173492432, + "reward_std": 0.10900576412677765, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7235205173492432, + "step": 1413 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.7421875, + "epoch": 0.6904296875, + "grad_norm": 1.048683826956388, + "kl": 0.0548095703125, + "learning_rate": 8.273925781249999e-07, + "loss": 0.0022, + "reward": 1.711770236492157, + "reward_std": 0.14532910659909248, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.743020236492157, + "step": 1414 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.6796875, + "epoch": 0.69091796875, + "grad_norm": 3.6499422191186257, + "kl": 0.079833984375, + "learning_rate": 8.272705078125e-07, + "loss": 0.0032, + "reward": 1.7406352758407593, + "reward_std": 0.07238475233316422, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7406352758407593, + "step": 1415 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.4765625, + "epoch": 0.69140625, + "grad_norm": 6.765975646442193, + "kl": 0.057373046875, + "learning_rate": 8.271484375e-07, + "loss": 0.0023, + "reward": 1.7682831287384033, + "reward_std": 0.06250830553472042, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7682830989360809, + "step": 1416 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.4921875, + "epoch": 0.69189453125, + "grad_norm": 2.9978780027516225, + "kl": 0.06787109375, + "learning_rate": 8.270263671875e-07, + "loss": 0.0027, + "reward": 1.7275251150131226, + "reward_std": 0.03290243726223707, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7275251746177673, + "step": 1417 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.8671875, + "epoch": 0.6923828125, + "grad_norm": 0.9989255723691591, + "kl": 0.0592041015625, + "learning_rate": 8.26904296875e-07, + "loss": 0.0024, + "reward": 1.6132749319076538, + "reward_std": 0.14219776540994644, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6523374617099762, + "step": 1418 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.25, + "epoch": 0.69287109375, + "grad_norm": 1.733952595360354, + "kl": 0.0599365234375, + "learning_rate": 8.267822265624999e-07, + "loss": 0.0024, + "reward": 1.529246211051941, + "reward_std": 0.21004829555749893, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.5683087408542633, + "step": 1419 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.4140625, + "epoch": 0.693359375, + "grad_norm": 1.4642430835196691, + "kl": 0.06689453125, + "learning_rate": 8.266601562499999e-07, + "loss": 0.0027, + "reward": 1.7179248332977295, + "reward_std": 0.07703246548771858, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7179248332977295, + "step": 1420 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.890625, + "epoch": 0.69384765625, + "grad_norm": 0.9887674725498403, + "kl": 0.0482177734375, + "learning_rate": 8.265380859375e-07, + "loss": 0.0019, + "reward": 1.8166847229003906, + "reward_std": 0.1480160653591156, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8323096930980682, + "step": 1421 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.015625, + "epoch": 0.6943359375, + "grad_norm": 2.2480619414400524, + "kl": 0.06787109375, + "learning_rate": 8.26416015625e-07, + "loss": 0.0027, + "reward": 1.640427827835083, + "reward_std": 0.15429722517728806, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6560528576374054, + "step": 1422 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.796875, + "epoch": 0.69482421875, + "grad_norm": 1.6615444011731868, + "kl": 0.0548095703125, + "learning_rate": 8.262939453125e-07, + "loss": 0.0022, + "reward": 1.8311384916305542, + "reward_std": 0.08753996156156063, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8389509916305542, + "step": 1423 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.15625, + "epoch": 0.6953125, + "grad_norm": 1.3951149530350935, + "kl": 0.0712890625, + "learning_rate": 8.26171875e-07, + "loss": 0.0028, + "reward": 1.7672319412231445, + "reward_std": 0.1136610172688961, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7672319114208221, + "step": 1424 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.6796875, + "epoch": 0.69580078125, + "grad_norm": 1.1270021475821117, + "kl": 0.043212890625, + "learning_rate": 8.260498046875e-07, + "loss": 0.0017, + "reward": 1.7945731282234192, + "reward_std": 0.10133310779929161, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8180106282234192, + "step": 1425 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.3515625, + "epoch": 0.6962890625, + "grad_norm": 1.0404208747858652, + "kl": 0.0528564453125, + "learning_rate": 8.259277343749999e-07, + "loss": 0.0021, + "reward": 1.8590461611747742, + "reward_std": 0.06993057578802109, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8590461909770966, + "step": 1426 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.4140625, + "epoch": 0.69677734375, + "grad_norm": 1.5437058523157996, + "kl": 0.0621337890625, + "learning_rate": 8.258056640624999e-07, + "loss": 0.0025, + "reward": 1.622244954109192, + "reward_std": 0.10384193528443575, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6378699541091919, + "step": 1427 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.0859375, + "epoch": 0.697265625, + "grad_norm": 1.7190179987884586, + "kl": 0.0633544921875, + "learning_rate": 8.2568359375e-07, + "loss": 0.0025, + "reward": 1.7580605745315552, + "reward_std": 0.05973019078373909, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7580606043338776, + "step": 1428 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.4296875, + "epoch": 0.69775390625, + "grad_norm": 1.6679202767670824, + "kl": 0.053955078125, + "learning_rate": 8.255615234375e-07, + "loss": 0.0022, + "reward": 1.8018113374710083, + "reward_std": 0.049905733205378056, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8018112778663635, + "step": 1429 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.078125, + "epoch": 0.6982421875, + "grad_norm": 1.2612943936774128, + "kl": 0.0543212890625, + "learning_rate": 8.25439453125e-07, + "loss": 0.0022, + "reward": 1.7309202551841736, + "reward_std": 0.09569451212882996, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7387327551841736, + "step": 1430 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.1875, + "epoch": 0.69873046875, + "grad_norm": 1.6270467995733127, + "kl": 0.06689453125, + "learning_rate": 8.253173828125e-07, + "loss": 0.0027, + "reward": 1.8153039813041687, + "reward_std": 0.0769112091511488, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8309289515018463, + "step": 1431 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.2265625, + "epoch": 0.69921875, + "grad_norm": 3.113671634396718, + "kl": 0.05908203125, + "learning_rate": 8.251953124999999e-07, + "loss": 0.0024, + "reward": 1.7506902813911438, + "reward_std": 0.05832826718688011, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7506902813911438, + "step": 1432 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.28125, + "epoch": 0.69970703125, + "grad_norm": 1.5482948608367026, + "kl": 0.052490234375, + "learning_rate": 8.250732421874999e-07, + "loss": 0.0021, + "reward": 1.7495105266571045, + "reward_std": 0.047743687871843576, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7495104968547821, + "step": 1433 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.703125, + "epoch": 0.7001953125, + "grad_norm": 3.575458871440732, + "kl": 0.0628662109375, + "learning_rate": 8.24951171875e-07, + "loss": 0.0025, + "reward": 1.7153109312057495, + "reward_std": 0.10487351939082146, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7231234312057495, + "step": 1434 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.1015625, + "epoch": 0.70068359375, + "grad_norm": 0.9120133104435602, + "kl": 0.05419921875, + "learning_rate": 8.248291015625e-07, + "loss": 0.0022, + "reward": 1.70259028673172, + "reward_std": 0.0484690060839057, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.70259028673172, + "step": 1435 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.8828125, + "epoch": 0.701171875, + "grad_norm": 2.4781810496741126, + "kl": 0.0616455078125, + "learning_rate": 8.2470703125e-07, + "loss": 0.0025, + "reward": 1.8051986694335938, + "reward_std": 0.05168750695884228, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8051986396312714, + "step": 1436 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.0234375, + "epoch": 0.70166015625, + "grad_norm": 1.959151036800492, + "kl": 0.0552978515625, + "learning_rate": 8.245849609375e-07, + "loss": 0.0022, + "reward": 1.7046823501586914, + "reward_std": 0.07530912198126316, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.704682320356369, + "step": 1437 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.8671875, + "epoch": 0.7021484375, + "grad_norm": 1.1853692300427936, + "kl": 0.0565185546875, + "learning_rate": 8.24462890625e-07, + "loss": 0.0023, + "reward": 1.7881206274032593, + "reward_std": 0.07596137002110481, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7959331572055817, + "step": 1438 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.453125, + "epoch": 0.70263671875, + "grad_norm": 8.816391483092268, + "kl": 0.4215087890625, + "learning_rate": 8.243408203124999e-07, + "loss": 0.0169, + "reward": 1.803566336631775, + "reward_std": 0.08476324006915092, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8113788068294525, + "step": 1439 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.2578125, + "epoch": 0.703125, + "grad_norm": 3.150118996033216, + "kl": 0.079345703125, + "learning_rate": 8.242187499999999e-07, + "loss": 0.0032, + "reward": 1.65059894323349, + "reward_std": 0.1842002421617508, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.65059894323349, + "step": 1440 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.8125, + "epoch": 0.70361328125, + "grad_norm": 1.8365019943222014, + "kl": 0.047119140625, + "learning_rate": 8.240966796875e-07, + "loss": 0.0019, + "reward": 1.7905904650688171, + "reward_std": 0.026320545002818108, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7905905246734619, + "step": 1441 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.65625, + "epoch": 0.7041015625, + "grad_norm": 10.324751382988245, + "kl": 0.0635986328125, + "learning_rate": 8.23974609375e-07, + "loss": 0.0025, + "reward": 1.636955440044403, + "reward_std": 0.16419149935245514, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6447679400444031, + "step": 1442 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.6953125, + "epoch": 0.70458984375, + "grad_norm": 1.5794799873389354, + "kl": 0.0592041015625, + "learning_rate": 8.238525390625e-07, + "loss": 0.0024, + "reward": 1.7604875564575195, + "reward_std": 0.09106075949966908, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7604875266551971, + "step": 1443 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.96875, + "epoch": 0.705078125, + "grad_norm": 1.7122619531111243, + "kl": 0.049560546875, + "learning_rate": 8.2373046875e-07, + "loss": 0.002, + "reward": 1.7385912537574768, + "reward_std": 0.15688905864953995, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7932787239551544, + "step": 1444 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.359375, + "epoch": 0.70556640625, + "grad_norm": 4.933104035300985, + "kl": 0.0655517578125, + "learning_rate": 8.236083984374999e-07, + "loss": 0.0026, + "reward": 1.6635677814483643, + "reward_std": 0.3314622938632965, + "rewards/format_reward": 0.859375, + "rewards/ocr_reward": 0.8041927814483643, + "step": 1445 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.765625, + "epoch": 0.7060546875, + "grad_norm": 7.123204221199448, + "kl": 0.0577392578125, + "learning_rate": 8.234863281249999e-07, + "loss": 0.0023, + "reward": 1.5581657886505127, + "reward_std": 0.23571809381246567, + "rewards/format_reward": 0.8828125, + "rewards/ocr_reward": 0.6753532588481903, + "step": 1446 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.203125, + "epoch": 0.70654296875, + "grad_norm": 1.701577139579483, + "kl": 0.052490234375, + "learning_rate": 8.233642578125e-07, + "loss": 0.0021, + "reward": 1.5181033611297607, + "reward_std": 0.24801450222730637, + "rewards/format_reward": 0.90625, + "rewards/ocr_reward": 0.6118534803390503, + "step": 1447 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.609375, + "epoch": 0.70703125, + "grad_norm": 1.7700024228747016, + "kl": 0.0506591796875, + "learning_rate": 8.232421875e-07, + "loss": 0.002, + "reward": 1.7385361194610596, + "reward_std": 0.1337948441505432, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7775986790657043, + "step": 1448 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.0859375, + "epoch": 0.70751953125, + "grad_norm": 0.9369244908723074, + "kl": 0.0423583984375, + "learning_rate": 8.231201171875e-07, + "loss": 0.0017, + "reward": 1.7505207657814026, + "reward_std": 0.17919845134019852, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7739582657814026, + "step": 1449 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.28125, + "epoch": 0.7080078125, + "grad_norm": 1.7043675709930337, + "kl": 0.0504150390625, + "learning_rate": 8.22998046875e-07, + "loss": 0.002, + "reward": 1.778750240802765, + "reward_std": 0.11969216167926788, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7865626811981201, + "step": 1450 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.0234375, + "epoch": 0.70849609375, + "grad_norm": 3.7096740843912497, + "kl": 0.075439453125, + "learning_rate": 8.228759765625e-07, + "loss": 0.003, + "reward": 1.6811388731002808, + "reward_std": 0.16221491992473602, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7123888731002808, + "step": 1451 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.7890625, + "epoch": 0.708984375, + "grad_norm": 1.1016182213271797, + "kl": 0.0509033203125, + "learning_rate": 8.227539062499999e-07, + "loss": 0.002, + "reward": 1.730841338634491, + "reward_std": 0.043069666251540184, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7308413684368134, + "step": 1452 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.8046875, + "epoch": 0.70947265625, + "grad_norm": 4.52514586838674, + "kl": 0.0596923828125, + "learning_rate": 8.226318359374999e-07, + "loss": 0.0024, + "reward": 1.6379446983337402, + "reward_std": 0.09933317825198174, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6379446387290955, + "step": 1453 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.078125, + "epoch": 0.7099609375, + "grad_norm": 1.5113110300717163, + "kl": 0.0423583984375, + "learning_rate": 8.22509765625e-07, + "loss": 0.0017, + "reward": 1.743843913078308, + "reward_std": 0.09296439960598946, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7750938832759857, + "step": 1454 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.2890625, + "epoch": 0.71044921875, + "grad_norm": 2.5506627271636586, + "kl": 0.0540771484375, + "learning_rate": 8.223876953125e-07, + "loss": 0.0022, + "reward": 1.8323869109153748, + "reward_std": 0.10052505135536194, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8401993811130524, + "step": 1455 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.2421875, + "epoch": 0.7109375, + "grad_norm": 3.1270600079448694, + "kl": 0.0478515625, + "learning_rate": 8.22265625e-07, + "loss": 0.0019, + "reward": 1.7445816397666931, + "reward_std": 0.09944414719939232, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7680192291736603, + "step": 1456 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.7578125, + "epoch": 0.71142578125, + "grad_norm": 2.430403544666908, + "kl": 0.0513916015625, + "learning_rate": 8.221435546875e-07, + "loss": 0.0021, + "reward": 1.7487914562225342, + "reward_std": 0.07037571631371975, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7487914264202118, + "step": 1457 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.875, + "epoch": 0.7119140625, + "grad_norm": 2.5085519820373166, + "kl": 0.0556640625, + "learning_rate": 8.220214843749999e-07, + "loss": 0.0022, + "reward": 1.7057527303695679, + "reward_std": 0.13188474997878075, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7291902005672455, + "step": 1458 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.3515625, + "epoch": 0.71240234375, + "grad_norm": 2.925211143069262, + "kl": 0.0548095703125, + "learning_rate": 8.218994140624999e-07, + "loss": 0.0022, + "reward": 1.6985459327697754, + "reward_std": 0.09991316497325897, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7063583731651306, + "step": 1459 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.0703125, + "epoch": 0.712890625, + "grad_norm": 0.8890589616956593, + "kl": 0.0518798828125, + "learning_rate": 8.217773437499999e-07, + "loss": 0.0021, + "reward": 1.8091995120048523, + "reward_std": 0.03047786932438612, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8091995716094971, + "step": 1460 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.828125, + "epoch": 0.71337890625, + "grad_norm": 19.189378131745773, + "kl": 0.055419921875, + "learning_rate": 8.216552734375e-07, + "loss": 0.0022, + "reward": 1.7730653285980225, + "reward_std": 0.07386335171759129, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7808778285980225, + "step": 1461 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.3046875, + "epoch": 0.7138671875, + "grad_norm": 2.5112538884341795, + "kl": 0.051025390625, + "learning_rate": 8.21533203125e-07, + "loss": 0.002, + "reward": 1.7594855427742004, + "reward_std": 0.0999723095446825, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7751105725765228, + "step": 1462 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.09375, + "epoch": 0.71435546875, + "grad_norm": 1.8674018464220214, + "kl": 0.0516357421875, + "learning_rate": 8.214111328125e-07, + "loss": 0.0021, + "reward": 1.768634557723999, + "reward_std": 0.15559392422437668, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7842595875263214, + "step": 1463 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.4921875, + "epoch": 0.71484375, + "grad_norm": 3.647003211757384, + "kl": 0.05517578125, + "learning_rate": 8.212890625e-07, + "loss": 0.0022, + "reward": 1.6912202835083008, + "reward_std": 0.10662208870053291, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6912202537059784, + "step": 1464 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.1953125, + "epoch": 0.71533203125, + "grad_norm": 1.9989805075274047, + "kl": 0.0462646484375, + "learning_rate": 8.211669921874999e-07, + "loss": 0.0019, + "reward": 1.8012118935585022, + "reward_std": 0.04404502548277378, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8012118637561798, + "step": 1465 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.46875, + "epoch": 0.7158203125, + "grad_norm": 1.920187943953283, + "kl": 0.066650390625, + "learning_rate": 8.210449218749999e-07, + "loss": 0.0027, + "reward": 1.689796507358551, + "reward_std": 0.08326592482626438, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6897964179515839, + "step": 1466 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.328125, + "epoch": 0.71630859375, + "grad_norm": 1.177187426188002, + "kl": 0.0546875, + "learning_rate": 8.209228515625e-07, + "loss": 0.0022, + "reward": 1.8154129385948181, + "reward_std": 0.05495606176555157, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8154129683971405, + "step": 1467 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.4296875, + "epoch": 0.716796875, + "grad_norm": 2.430884718722628, + "kl": 0.0477294921875, + "learning_rate": 8.2080078125e-07, + "loss": 0.0019, + "reward": 1.6982349157333374, + "reward_std": 0.1398230344057083, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7138599455356598, + "step": 1468 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.640625, + "epoch": 0.71728515625, + "grad_norm": 17.02941879508463, + "kl": 0.0479736328125, + "learning_rate": 8.206787109375e-07, + "loss": 0.0019, + "reward": 1.6150004267692566, + "reward_std": 0.21229281276464462, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6618753671646118, + "step": 1469 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.484375, + "epoch": 0.7177734375, + "grad_norm": 2.690446110295151, + "kl": 0.052490234375, + "learning_rate": 8.20556640625e-07, + "loss": 0.0021, + "reward": 1.6503348350524902, + "reward_std": 0.1022627055644989, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6581473350524902, + "step": 1470 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.4375, + "epoch": 0.71826171875, + "grad_norm": 1.031554242337561, + "kl": 0.052001953125, + "learning_rate": 8.204345703124999e-07, + "loss": 0.0021, + "reward": 1.7112751603126526, + "reward_std": 0.13028262928128242, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7347126603126526, + "step": 1471 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.0234375, + "epoch": 0.71875, + "grad_norm": 2.5851949705933013, + "kl": 0.062744140625, + "learning_rate": 8.203124999999999e-07, + "loss": 0.0025, + "reward": 1.6422898769378662, + "reward_std": 0.12438905239105225, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6501024663448334, + "step": 1472 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.234375, + "epoch": 0.71923828125, + "grad_norm": 6.193724547415232, + "kl": 0.078125, + "learning_rate": 8.201904296874999e-07, + "loss": 0.0031, + "reward": 1.8023394346237183, + "reward_std": 0.09207788482308388, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8023395538330078, + "step": 1473 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.5625, + "epoch": 0.7197265625, + "grad_norm": 1.2219439151470035, + "kl": 0.0506591796875, + "learning_rate": 8.20068359375e-07, + "loss": 0.002, + "reward": 1.777301549911499, + "reward_std": 0.04901622235774994, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.785114049911499, + "step": 1474 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.9140625, + "epoch": 0.72021484375, + "grad_norm": 2.927095284762868, + "kl": 0.0487060546875, + "learning_rate": 8.199462890625e-07, + "loss": 0.0019, + "reward": 1.715105950832367, + "reward_std": 0.12777616456151009, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7229184210300446, + "step": 1475 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.546875, + "epoch": 0.720703125, + "grad_norm": 3.516281392962193, + "kl": 0.0496826171875, + "learning_rate": 8.1982421875e-07, + "loss": 0.002, + "reward": 1.6445563435554504, + "reward_std": 0.1418607532978058, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6601813733577728, + "step": 1476 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.234375, + "epoch": 0.72119140625, + "grad_norm": 1.3865604707622783, + "kl": 0.0531005859375, + "learning_rate": 8.197021484375e-07, + "loss": 0.0021, + "reward": 1.711554229259491, + "reward_std": 0.08468777127563953, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7193666994571686, + "step": 1477 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.078125, + "epoch": 0.7216796875, + "grad_norm": 1.345200066011625, + "kl": 0.065673828125, + "learning_rate": 8.195800781249999e-07, + "loss": 0.0026, + "reward": 1.7353711128234863, + "reward_std": 0.13099960051476955, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7588086724281311, + "step": 1478 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.4765625, + "epoch": 0.72216796875, + "grad_norm": 1.584912796001927, + "kl": 0.056884765625, + "learning_rate": 8.194580078124999e-07, + "loss": 0.0023, + "reward": 1.76387220621109, + "reward_std": 0.11961934715509415, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7638722062110901, + "step": 1479 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.9140625, + "epoch": 0.72265625, + "grad_norm": 1.7559205041484878, + "kl": 0.0538330078125, + "learning_rate": 8.193359375e-07, + "loss": 0.0022, + "reward": 1.7752057313919067, + "reward_std": 0.05388793349266052, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7752057611942291, + "step": 1480 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.1328125, + "epoch": 0.72314453125, + "grad_norm": 3.609119053953651, + "kl": 0.0548095703125, + "learning_rate": 8.192138671875e-07, + "loss": 0.0022, + "reward": 1.7668498158454895, + "reward_std": 0.05071160942316055, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7668498754501343, + "step": 1481 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.0390625, + "epoch": 0.7236328125, + "grad_norm": 2.9702748681991893, + "kl": 0.0516357421875, + "learning_rate": 8.19091796875e-07, + "loss": 0.0021, + "reward": 1.7055724263191223, + "reward_std": 0.14055679365992546, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7211975157260895, + "step": 1482 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.5234375, + "epoch": 0.72412109375, + "grad_norm": 2.6429417488503884, + "kl": 0.045166015625, + "learning_rate": 8.189697265625e-07, + "loss": 0.0018, + "reward": 1.8638358116149902, + "reward_std": 0.09294469654560089, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8638357818126678, + "step": 1483 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.1015625, + "epoch": 0.724609375, + "grad_norm": 1.1890261444927173, + "kl": 0.0614013671875, + "learning_rate": 8.188476562499999e-07, + "loss": 0.0025, + "reward": 1.6774348616600037, + "reward_std": 0.11033051460981369, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6852473616600037, + "step": 1484 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.4375, + "epoch": 0.72509765625, + "grad_norm": 1.8392107416740417, + "kl": 0.061279296875, + "learning_rate": 8.187255859374999e-07, + "loss": 0.0025, + "reward": 1.6773231625556946, + "reward_std": 0.12323976308107376, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6851356625556946, + "step": 1485 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.390625, + "epoch": 0.7255859375, + "grad_norm": 2.4085650228970623, + "kl": 0.06103515625, + "learning_rate": 8.186035156249999e-07, + "loss": 0.0024, + "reward": 1.7235342264175415, + "reward_std": 0.16048508323729038, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7469716966152191, + "step": 1486 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.8671875, + "epoch": 0.72607421875, + "grad_norm": 2.408009729634376, + "kl": 0.0439453125, + "learning_rate": 8.184814453125e-07, + "loss": 0.0018, + "reward": 1.7089346051216125, + "reward_std": 0.0976857841014862, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7089346349239349, + "step": 1487 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.7421875, + "epoch": 0.7265625, + "grad_norm": 1.6142248756995168, + "kl": 0.0546875, + "learning_rate": 8.18359375e-07, + "loss": 0.0022, + "reward": 1.774111568927765, + "reward_std": 0.09651333093643188, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7819240987300873, + "step": 1488 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.3515625, + "epoch": 0.72705078125, + "grad_norm": 1.8517869550068955, + "kl": 0.0472412109375, + "learning_rate": 8.182373046875e-07, + "loss": 0.0019, + "reward": 1.7990041971206665, + "reward_std": 0.14311717450618744, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8146291375160217, + "step": 1489 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.8984375, + "epoch": 0.7275390625, + "grad_norm": 2.7862564678417985, + "kl": 0.04931640625, + "learning_rate": 8.18115234375e-07, + "loss": 0.002, + "reward": 1.8134875893592834, + "reward_std": 0.046054454520344734, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8134876191616058, + "step": 1490 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.5546875, + "epoch": 0.72802734375, + "grad_norm": 1.0550017725621663, + "kl": 0.060791015625, + "learning_rate": 8.179931640624999e-07, + "loss": 0.0024, + "reward": 1.8162503838539124, + "reward_std": 0.043524582870304585, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8162504434585571, + "step": 1491 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.5703125, + "epoch": 0.728515625, + "grad_norm": 2.833553892053886, + "kl": 0.071044921875, + "learning_rate": 8.178710937499999e-07, + "loss": 0.0028, + "reward": 1.6208914518356323, + "reward_std": 0.1067028883844614, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6287039518356323, + "step": 1492 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.8671875, + "epoch": 0.72900390625, + "grad_norm": 2.8589313865508745, + "kl": 0.0618896484375, + "learning_rate": 8.177490234375e-07, + "loss": 0.0025, + "reward": 1.6766908764839172, + "reward_std": 0.06825340539216995, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6766908168792725, + "step": 1493 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.9765625, + "epoch": 0.7294921875, + "grad_norm": 2.124207189204002, + "kl": 0.0760498046875, + "learning_rate": 8.17626953125e-07, + "loss": 0.003, + "reward": 1.5833302736282349, + "reward_std": 0.1273394152522087, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6067677438259125, + "step": 1494 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.203125, + "epoch": 0.72998046875, + "grad_norm": 2.3973851983698773, + "kl": 0.067626953125, + "learning_rate": 8.175048828125e-07, + "loss": 0.0027, + "reward": 1.7951343655586243, + "reward_std": 0.024024500511586666, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7951343059539795, + "step": 1495 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.328125, + "epoch": 0.73046875, + "grad_norm": 1.3073917262465702, + "kl": 0.055419921875, + "learning_rate": 8.173828125e-07, + "loss": 0.0022, + "reward": 1.7663521766662598, + "reward_std": 0.03600446879863739, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7663521468639374, + "step": 1496 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.453125, + "epoch": 0.73095703125, + "grad_norm": 1.6664383912533873, + "kl": 0.056396484375, + "learning_rate": 8.172607421874999e-07, + "loss": 0.0023, + "reward": 1.6536216139793396, + "reward_std": 0.16317107900977135, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6770591139793396, + "step": 1497 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.703125, + "epoch": 0.7314453125, + "grad_norm": 2.6043460259230633, + "kl": 0.052001953125, + "learning_rate": 8.171386718749999e-07, + "loss": 0.0021, + "reward": 1.8190729022026062, + "reward_std": 0.07409404963254929, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8190728724002838, + "step": 1498 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.03125, + "epoch": 0.73193359375, + "grad_norm": 1.6907569298922398, + "kl": 0.0511474609375, + "learning_rate": 8.170166015624999e-07, + "loss": 0.002, + "reward": 1.6964725852012634, + "reward_std": 0.07454644329845905, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7042850852012634, + "step": 1499 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.859375, + "epoch": 0.732421875, + "grad_norm": 2.688496165901201, + "kl": 0.060546875, + "learning_rate": 8.1689453125e-07, + "loss": 0.0024, + "reward": 1.7929801940917969, + "reward_std": 0.058575745671987534, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7929801940917969, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.828125, + "epoch": 0.73291015625, + "grad_norm": 2.8535089955662105, + "kl": 0.069091796875, + "learning_rate": 8.167724609375e-07, + "loss": 0.0028, + "reward": 1.6650619506835938, + "reward_std": 0.07004339620471, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6650619506835938, + "step": 1501 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.9765625, + "epoch": 0.7333984375, + "grad_norm": 2.0939627649247443, + "kl": 0.06982421875, + "learning_rate": 8.16650390625e-07, + "loss": 0.0028, + "reward": 1.6487025022506714, + "reward_std": 0.0736217126250267, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6487023830413818, + "step": 1502 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.0234375, + "epoch": 0.73388671875, + "grad_norm": 1.835761690123511, + "kl": 0.058349609375, + "learning_rate": 8.165283203125e-07, + "loss": 0.0023, + "reward": 1.6299150586128235, + "reward_std": 0.16713447123765945, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6689775288105011, + "step": 1503 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.5703125, + "epoch": 0.734375, + "grad_norm": 1.7502378181934324, + "kl": 0.0660400390625, + "learning_rate": 8.164062499999999e-07, + "loss": 0.0026, + "reward": 1.7362866401672363, + "reward_std": 0.06938813626766205, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7362865805625916, + "step": 1504 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.046875, + "epoch": 0.73486328125, + "grad_norm": 1.925910858116271, + "kl": 0.045654296875, + "learning_rate": 8.162841796874999e-07, + "loss": 0.0018, + "reward": 1.76516455411911, + "reward_std": 0.08768405765295029, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7729770541191101, + "step": 1505 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.421875, + "epoch": 0.7353515625, + "grad_norm": 1.8350835868922448, + "kl": 0.08154296875, + "learning_rate": 8.16162109375e-07, + "loss": 0.0033, + "reward": 1.7183210253715515, + "reward_std": 0.09937049448490143, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7261334359645844, + "step": 1506 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.296875, + "epoch": 0.73583984375, + "grad_norm": 1.2480606209927823, + "kl": 0.056884765625, + "learning_rate": 8.160400390625e-07, + "loss": 0.0023, + "reward": 1.800473690032959, + "reward_std": 0.036239128559827805, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8004737496376038, + "step": 1507 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.53125, + "epoch": 0.736328125, + "grad_norm": 3.8267954710393566, + "kl": 0.058349609375, + "learning_rate": 8.1591796875e-07, + "loss": 0.0023, + "reward": 1.8211953043937683, + "reward_std": 0.055461274459958076, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8211952745914459, + "step": 1508 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.0234375, + "epoch": 0.73681640625, + "grad_norm": 2.245415335980976, + "kl": 0.056640625, + "learning_rate": 8.157958984375e-07, + "loss": 0.0023, + "reward": 1.6801503896713257, + "reward_std": 0.15717144310474396, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7114003896713257, + "step": 1509 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.7421875, + "epoch": 0.7373046875, + "grad_norm": 0.8186989855394431, + "kl": 0.0587158203125, + "learning_rate": 8.15673828125e-07, + "loss": 0.0023, + "reward": 1.7053476572036743, + "reward_std": 0.09524018689990044, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7131602764129639, + "step": 1510 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.296875, + "epoch": 0.73779296875, + "grad_norm": 3.6173846814752046, + "kl": 0.05419921875, + "learning_rate": 8.155517578124999e-07, + "loss": 0.0022, + "reward": 1.7306747436523438, + "reward_std": 0.10750394687056541, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7384872734546661, + "step": 1511 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.4921875, + "epoch": 0.73828125, + "grad_norm": 1.951320596223026, + "kl": 0.066162109375, + "learning_rate": 8.154296874999999e-07, + "loss": 0.0026, + "reward": 1.6698977947235107, + "reward_std": 0.17629149928689003, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7245852947235107, + "step": 1512 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.9609375, + "epoch": 0.73876953125, + "grad_norm": 1.6633533126989715, + "kl": 0.059326171875, + "learning_rate": 8.153076171875e-07, + "loss": 0.0024, + "reward": 1.724345088005066, + "reward_std": 0.07205065805464983, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7321575880050659, + "step": 1513 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.828125, + "epoch": 0.7392578125, + "grad_norm": 2.1092767969234525, + "kl": 0.045166015625, + "learning_rate": 8.15185546875e-07, + "loss": 0.0018, + "reward": 1.6450571417808533, + "reward_std": 0.17002198845148087, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6997447311878204, + "step": 1514 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.03125, + "epoch": 0.73974609375, + "grad_norm": 1.0066903202791224, + "kl": 0.06298828125, + "learning_rate": 8.150634765625e-07, + "loss": 0.0025, + "reward": 1.6437667608261108, + "reward_std": 0.10886374488472939, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6672042906284332, + "step": 1515 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.671875, + "epoch": 0.740234375, + "grad_norm": 1.0622351578079063, + "kl": 0.060546875, + "learning_rate": 8.1494140625e-07, + "loss": 0.0024, + "reward": 1.641761600971222, + "reward_std": 0.15896305441856384, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.7120741009712219, + "step": 1516 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.7421875, + "epoch": 0.74072265625, + "grad_norm": 1.6360500618274396, + "kl": 0.0634765625, + "learning_rate": 8.148193359374999e-07, + "loss": 0.0025, + "reward": 1.7401413321495056, + "reward_std": 0.042983127757906914, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.740141361951828, + "step": 1517 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.8828125, + "epoch": 0.7412109375, + "grad_norm": 1.010630811381439, + "kl": 0.0531005859375, + "learning_rate": 8.146972656249999e-07, + "loss": 0.0021, + "reward": 1.7111788988113403, + "reward_std": 0.07130059599876404, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7111788690090179, + "step": 1518 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.734375, + "epoch": 0.74169921875, + "grad_norm": 0.8278353373184272, + "kl": 0.0540771484375, + "learning_rate": 8.145751953125e-07, + "loss": 0.0022, + "reward": 1.8538936972618103, + "reward_std": 0.11458700150251389, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8695186972618103, + "step": 1519 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.609375, + "epoch": 0.7421875, + "grad_norm": 1.433959106877648, + "kl": 0.0457763671875, + "learning_rate": 8.14453125e-07, + "loss": 0.0018, + "reward": 1.8906309008598328, + "reward_std": 0.06328525394201279, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8906309306621552, + "step": 1520 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.28125, + "epoch": 0.74267578125, + "grad_norm": 8.396740339740772, + "kl": 0.075927734375, + "learning_rate": 8.143310546875e-07, + "loss": 0.003, + "reward": 1.7359008193016052, + "reward_std": 0.10937487334012985, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7359007894992828, + "step": 1521 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.2109375, + "epoch": 0.7431640625, + "grad_norm": 2.401371893041721, + "kl": 0.0625, + "learning_rate": 8.14208984375e-07, + "loss": 0.0025, + "reward": 1.8299207091331482, + "reward_std": 0.05006260797381401, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8299207091331482, + "step": 1522 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.5390625, + "epoch": 0.74365234375, + "grad_norm": 4.919411402041056, + "kl": 0.0687255859375, + "learning_rate": 8.140869140625e-07, + "loss": 0.0027, + "reward": 1.7033655643463135, + "reward_std": 0.13213280774652958, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7189904749393463, + "step": 1523 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.6640625, + "epoch": 0.744140625, + "grad_norm": 3.5288846647599983, + "kl": 0.0665283203125, + "learning_rate": 8.139648437499999e-07, + "loss": 0.0027, + "reward": 1.6737890839576721, + "reward_std": 0.11412935890257359, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6816015839576721, + "step": 1524 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.6171875, + "epoch": 0.74462890625, + "grad_norm": 0.9645495205101616, + "kl": 0.0430908203125, + "learning_rate": 8.138427734374999e-07, + "loss": 0.0017, + "reward": 1.7587640285491943, + "reward_std": 0.05514438450336456, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7587640285491943, + "step": 1525 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.875, + "epoch": 0.7451171875, + "grad_norm": 2.1056851901873856, + "kl": 0.0640869140625, + "learning_rate": 8.13720703125e-07, + "loss": 0.0026, + "reward": 1.68757826089859, + "reward_std": 0.0944238007068634, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6875782012939453, + "step": 1526 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.8828125, + "epoch": 0.74560546875, + "grad_norm": 12.263008903251595, + "kl": 0.0643310546875, + "learning_rate": 8.135986328125e-07, + "loss": 0.0026, + "reward": 1.6637163162231445, + "reward_std": 0.087074875831604, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6715288162231445, + "step": 1527 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.90625, + "epoch": 0.74609375, + "grad_norm": 0.7627901351331267, + "kl": 0.050537109375, + "learning_rate": 8.134765625e-07, + "loss": 0.002, + "reward": 1.863888144493103, + "reward_std": 0.08409961871802807, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8717006146907806, + "step": 1528 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.4296875, + "epoch": 0.74658203125, + "grad_norm": 1.4130142844576217, + "kl": 0.0479736328125, + "learning_rate": 8.133544921875e-07, + "loss": 0.0019, + "reward": 1.8731828331947327, + "reward_std": 0.04130223486572504, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8731828331947327, + "step": 1529 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.859375, + "epoch": 0.7470703125, + "grad_norm": 1.9620363842722353, + "kl": 0.052978515625, + "learning_rate": 8.132324218749999e-07, + "loss": 0.0021, + "reward": 1.7268319129943848, + "reward_std": 0.07933101058006287, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7346444129943848, + "step": 1530 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.53125, + "epoch": 0.74755859375, + "grad_norm": 8.38649985748051, + "kl": 0.0621337890625, + "learning_rate": 8.131103515624999e-07, + "loss": 0.0025, + "reward": 1.716759443283081, + "reward_std": 0.08667516149580479, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.724571943283081, + "step": 1531 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.7265625, + "epoch": 0.748046875, + "grad_norm": 1.7653142086688058, + "kl": 0.076416015625, + "learning_rate": 8.1298828125e-07, + "loss": 0.003, + "reward": 1.8313519358634949, + "reward_std": 0.060592420399188995, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8313519060611725, + "step": 1532 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.2109375, + "epoch": 0.74853515625, + "grad_norm": 2.6054561123555398, + "kl": 0.0703125, + "learning_rate": 8.128662109375e-07, + "loss": 0.0028, + "reward": 1.737118661403656, + "reward_std": 0.10131100192666054, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7371186316013336, + "step": 1533 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.8984375, + "epoch": 0.7490234375, + "grad_norm": 2.674357738045574, + "kl": 0.0667724609375, + "learning_rate": 8.12744140625e-07, + "loss": 0.0027, + "reward": 1.7102715373039246, + "reward_std": 0.05548026505857706, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.710271567106247, + "step": 1534 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.125, + "epoch": 0.74951171875, + "grad_norm": 4.73739128631788, + "kl": 0.0626220703125, + "learning_rate": 8.126220703125e-07, + "loss": 0.0025, + "reward": 1.7168715000152588, + "reward_std": 0.04072634130716324, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7168715000152588, + "step": 1535 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.046875, + "epoch": 0.75, + "grad_norm": 1.1110552148698716, + "kl": 0.0604248046875, + "learning_rate": 8.125e-07, + "loss": 0.0024, + "reward": 1.8226452469825745, + "reward_std": 0.07101005595177412, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8226452171802521, + "step": 1536 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.375, + "epoch": 0.75048828125, + "grad_norm": 1.8999368365222582, + "kl": 0.06689453125, + "learning_rate": 8.123779296874999e-07, + "loss": 0.0027, + "reward": 1.7307413220405579, + "reward_std": 0.06915171444416046, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7307413220405579, + "step": 1537 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.84375, + "epoch": 0.7509765625, + "grad_norm": 1.3088249128930414, + "kl": 0.0545654296875, + "learning_rate": 8.122558593749999e-07, + "loss": 0.0022, + "reward": 1.7781055569648743, + "reward_std": 0.11129429191350937, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7781055569648743, + "step": 1538 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.0390625, + "epoch": 0.75146484375, + "grad_norm": 1.8135902702393156, + "kl": 0.064453125, + "learning_rate": 8.121337890625e-07, + "loss": 0.0026, + "reward": 1.7776609063148499, + "reward_std": 0.05267609283328056, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7776609361171722, + "step": 1539 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.5703125, + "epoch": 0.751953125, + "grad_norm": 4.248969243149114, + "kl": 0.061767578125, + "learning_rate": 8.1201171875e-07, + "loss": 0.0025, + "reward": 1.6320134997367859, + "reward_std": 0.16212911903858185, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.671076089143753, + "step": 1540 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.0703125, + "epoch": 0.75244140625, + "grad_norm": 3.2351824322471487, + "kl": 0.0506591796875, + "learning_rate": 8.118896484375e-07, + "loss": 0.002, + "reward": 1.8071049451828003, + "reward_std": 0.03557584714144468, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8071048855781555, + "step": 1541 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.484375, + "epoch": 0.7529296875, + "grad_norm": 2.1607593809932912, + "kl": 0.0684814453125, + "learning_rate": 8.11767578125e-07, + "loss": 0.0027, + "reward": 1.7182350158691406, + "reward_std": 0.0703788474202156, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7182350158691406, + "step": 1542 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.171875, + "epoch": 0.75341796875, + "grad_norm": 2.28986197589543, + "kl": 0.0521240234375, + "learning_rate": 8.116455078124999e-07, + "loss": 0.0021, + "reward": 1.6649247407913208, + "reward_std": 0.10756101086735725, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.664924681186676, + "step": 1543 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.265625, + "epoch": 0.75390625, + "grad_norm": 1.3873712534548976, + "kl": 0.0587158203125, + "learning_rate": 8.115234374999999e-07, + "loss": 0.0023, + "reward": 1.7909113764762878, + "reward_std": 0.05301499832421541, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7909113466739655, + "step": 1544 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.96875, + "epoch": 0.75439453125, + "grad_norm": 1.3747071192917304, + "kl": 0.0538330078125, + "learning_rate": 8.114013671875e-07, + "loss": 0.0021, + "reward": 1.5880799293518066, + "reward_std": 0.05071425810456276, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5880799889564514, + "step": 1545 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.078125, + "epoch": 0.7548828125, + "grad_norm": 2.082728546980293, + "kl": 0.0687255859375, + "learning_rate": 8.11279296875e-07, + "loss": 0.0027, + "reward": 1.7272522449493408, + "reward_std": 0.11839665472507477, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7272522151470184, + "step": 1546 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.8125, + "epoch": 0.75537109375, + "grad_norm": 4.2783261615543555, + "kl": 0.0557861328125, + "learning_rate": 8.111572265625e-07, + "loss": 0.0022, + "reward": 1.6325949430465698, + "reward_std": 0.08264567703008652, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6716574430465698, + "step": 1547 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.4609375, + "epoch": 0.755859375, + "grad_norm": 1.1440391734134507, + "kl": 0.046630859375, + "learning_rate": 8.1103515625e-07, + "loss": 0.0019, + "reward": 1.71382474899292, + "reward_std": 0.045681871473789215, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7138247489929199, + "step": 1548 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.328125, + "epoch": 0.75634765625, + "grad_norm": 10.451777449180478, + "kl": 0.0516357421875, + "learning_rate": 8.109130859375e-07, + "loss": 0.0021, + "reward": 1.6753268241882324, + "reward_std": 0.021205293014645576, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.67532679438591, + "step": 1549 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.796875, + "epoch": 0.7568359375, + "grad_norm": 2.3203833368592903, + "kl": 0.0751953125, + "learning_rate": 8.107910156249999e-07, + "loss": 0.003, + "reward": 1.7479038834571838, + "reward_std": 0.1040516346693039, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7557163834571838, + "step": 1550 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.453125, + "epoch": 0.75732421875, + "grad_norm": 1.314358162889798, + "kl": 0.05078125, + "learning_rate": 8.106689453124999e-07, + "loss": 0.002, + "reward": 1.6848346590995789, + "reward_std": 0.15354808419942856, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7160846590995789, + "step": 1551 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.9921875, + "epoch": 0.7578125, + "grad_norm": 2.609690852803103, + "kl": 0.055908203125, + "learning_rate": 8.10546875e-07, + "loss": 0.0022, + "reward": 1.7020440697669983, + "reward_std": 0.11408869549632072, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7176690995693207, + "step": 1552 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.4375, + "epoch": 0.75830078125, + "grad_norm": 1.6918813158728625, + "kl": 0.0628662109375, + "learning_rate": 8.104248046875e-07, + "loss": 0.0025, + "reward": 1.7733458280563354, + "reward_std": 0.1404724046587944, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7889708578586578, + "step": 1553 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.5546875, + "epoch": 0.7587890625, + "grad_norm": 1.7954408988834158, + "kl": 0.059326171875, + "learning_rate": 8.10302734375e-07, + "loss": 0.0024, + "reward": 1.7370250225067139, + "reward_std": 0.059584882110357285, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7370250523090363, + "step": 1554 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.9375, + "epoch": 0.75927734375, + "grad_norm": 1.500988736398781, + "kl": 0.047119140625, + "learning_rate": 8.101806640625e-07, + "loss": 0.0019, + "reward": 1.8506624698638916, + "reward_std": 0.09908335283398628, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8584749698638916, + "step": 1555 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.3359375, + "epoch": 0.759765625, + "grad_norm": 2.5780846344055774, + "kl": 0.04541015625, + "learning_rate": 8.100585937499999e-07, + "loss": 0.0018, + "reward": 1.6247803568840027, + "reward_std": 0.14903922379016876, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6638428568840027, + "step": 1556 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.1484375, + "epoch": 0.76025390625, + "grad_norm": 1.6802574412935145, + "kl": 0.0535888671875, + "learning_rate": 8.099365234374999e-07, + "loss": 0.0021, + "reward": 1.7854554653167725, + "reward_std": 0.0747103076428175, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7854554057121277, + "step": 1557 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.140625, + "epoch": 0.7607421875, + "grad_norm": 3.53135423693984, + "kl": 0.0521240234375, + "learning_rate": 8.098144531249999e-07, + "loss": 0.0021, + "reward": 1.5581438541412354, + "reward_std": 0.08005472645163536, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6050188541412354, + "step": 1558 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.15625, + "epoch": 0.76123046875, + "grad_norm": 1.2263929278952201, + "kl": 0.0550537109375, + "learning_rate": 8.096923828125e-07, + "loss": 0.0022, + "reward": 1.7725134491920471, + "reward_std": 0.09821948781609535, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.8115759491920471, + "step": 1559 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.1484375, + "epoch": 0.76171875, + "grad_norm": 4.559002764522234, + "kl": 0.068115234375, + "learning_rate": 8.095703125e-07, + "loss": 0.0027, + "reward": 1.747908115386963, + "reward_std": 0.017046626191586256, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7479080855846405, + "step": 1560 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.4765625, + "epoch": 0.76220703125, + "grad_norm": 2.6821850910454974, + "kl": 0.0567626953125, + "learning_rate": 8.094482421875e-07, + "loss": 0.0023, + "reward": 1.6794939041137695, + "reward_std": 0.0593208484351635, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6794938743114471, + "step": 1561 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.1796875, + "epoch": 0.7626953125, + "grad_norm": 1.4073846788389592, + "kl": 0.05224609375, + "learning_rate": 8.09326171875e-07, + "loss": 0.0021, + "reward": 1.820260226726532, + "reward_std": 0.04938836395740509, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8202601671218872, + "step": 1562 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.3515625, + "epoch": 0.76318359375, + "grad_norm": 1.0253536738713116, + "kl": 0.068359375, + "learning_rate": 8.092041015624999e-07, + "loss": 0.0027, + "reward": 1.6596548557281494, + "reward_std": 0.02243457455188036, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6596548557281494, + "step": 1563 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.390625, + "epoch": 0.763671875, + "grad_norm": 0.802675306572281, + "kl": 0.06494140625, + "learning_rate": 8.090820312499999e-07, + "loss": 0.0026, + "reward": 1.7809888124465942, + "reward_std": 0.038061970844864845, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7809888422489166, + "step": 1564 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.953125, + "epoch": 0.76416015625, + "grad_norm": 1.8997263090024157, + "kl": 0.0560302734375, + "learning_rate": 8.089599609375e-07, + "loss": 0.0022, + "reward": 1.653722107410431, + "reward_std": 0.12127144634723663, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6615345478057861, + "step": 1565 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.703125, + "epoch": 0.7646484375, + "grad_norm": 1.958461815256613, + "kl": 0.0506591796875, + "learning_rate": 8.08837890625e-07, + "loss": 0.002, + "reward": 1.713492214679718, + "reward_std": 0.1120409145951271, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7291173040866852, + "step": 1566 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.421875, + "epoch": 0.76513671875, + "grad_norm": 1.0413990963846795, + "kl": 0.0498046875, + "learning_rate": 8.087158203125e-07, + "loss": 0.002, + "reward": 1.7677712440490723, + "reward_std": 0.09032433852553368, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7833963632583618, + "step": 1567 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.4765625, + "epoch": 0.765625, + "grad_norm": 2.975801159257183, + "kl": 0.0538330078125, + "learning_rate": 8.0859375e-07, + "loss": 0.0022, + "reward": 1.6949644684791565, + "reward_std": 0.1156335175037384, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7105894386768341, + "step": 1568 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.09375, + "epoch": 0.76611328125, + "grad_norm": 4.913374457578766, + "kl": 0.065185546875, + "learning_rate": 8.084716796874999e-07, + "loss": 0.0026, + "reward": 1.8527822494506836, + "reward_std": 0.03366856276988983, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.852782130241394, + "step": 1569 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.296875, + "epoch": 0.7666015625, + "grad_norm": 1.240155973042573, + "kl": 0.0506591796875, + "learning_rate": 8.083496093749999e-07, + "loss": 0.002, + "reward": 1.7566204071044922, + "reward_std": 0.0804799273610115, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7644328773021698, + "step": 1570 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.1640625, + "epoch": 0.76708984375, + "grad_norm": 1.4265984505569123, + "kl": 0.052001953125, + "learning_rate": 8.082275390624999e-07, + "loss": 0.0021, + "reward": 1.7467219233512878, + "reward_std": 0.017143062315881252, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7467218637466431, + "step": 1571 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.53125, + "epoch": 0.767578125, + "grad_norm": 1.5345523611187817, + "kl": 0.060546875, + "learning_rate": 8.0810546875e-07, + "loss": 0.0024, + "reward": 1.8325753211975098, + "reward_std": 0.04120937455445528, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8325753211975098, + "step": 1572 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.21875, + "epoch": 0.76806640625, + "grad_norm": 1.4045255992327337, + "kl": 0.0546875, + "learning_rate": 8.079833984375e-07, + "loss": 0.0022, + "reward": 1.6535959243774414, + "reward_std": 0.10518948920071125, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6770334243774414, + "step": 1573 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.7421875, + "epoch": 0.7685546875, + "grad_norm": 1.4480910955522743, + "kl": 0.056396484375, + "learning_rate": 8.07861328125e-07, + "loss": 0.0023, + "reward": 1.575450837612152, + "reward_std": 0.0833788514137268, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6223257780075073, + "step": 1574 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.40625, + "epoch": 0.76904296875, + "grad_norm": 1.6388667341544991, + "kl": 0.0498046875, + "learning_rate": 8.077392578125e-07, + "loss": 0.002, + "reward": 1.776998221874237, + "reward_std": 0.09045989066362381, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8004356920719147, + "step": 1575 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.3671875, + "epoch": 0.76953125, + "grad_norm": 4.580808288577953, + "kl": 0.06787109375, + "learning_rate": 8.076171874999999e-07, + "loss": 0.0027, + "reward": 1.7042137384414673, + "reward_std": 0.15083208680152893, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7354637086391449, + "step": 1576 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.4765625, + "epoch": 0.77001953125, + "grad_norm": 2.019507688897045, + "kl": 0.0489501953125, + "learning_rate": 8.074951171874999e-07, + "loss": 0.002, + "reward": 1.6543389558792114, + "reward_std": 0.1535024270415306, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7090264856815338, + "step": 1577 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.75, + "epoch": 0.7705078125, + "grad_norm": 1.5898119888098918, + "kl": 0.0535888671875, + "learning_rate": 8.07373046875e-07, + "loss": 0.0021, + "reward": 1.6351118683815002, + "reward_std": 0.1109085101634264, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6585493683815002, + "step": 1578 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.078125, + "epoch": 0.77099609375, + "grad_norm": 0.8244536832784812, + "kl": 0.0699462890625, + "learning_rate": 8.072509765625e-07, + "loss": 0.0028, + "reward": 1.7688223123550415, + "reward_std": 0.048665997572243214, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7688223123550415, + "step": 1579 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.9140625, + "epoch": 0.771484375, + "grad_norm": 7.833675662090654, + "kl": 0.0458984375, + "learning_rate": 8.0712890625e-07, + "loss": 0.0018, + "reward": 1.766732096672058, + "reward_std": 0.037401504814624786, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7667320668697357, + "step": 1580 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.5703125, + "epoch": 0.77197265625, + "grad_norm": 1.272735275661589, + "kl": 0.0555419921875, + "learning_rate": 8.070068359375e-07, + "loss": 0.0022, + "reward": 1.8723936080932617, + "reward_std": 0.06921904534101486, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8880185484886169, + "step": 1581 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.109375, + "epoch": 0.7724609375, + "grad_norm": 1.2808243375099266, + "kl": 0.0645751953125, + "learning_rate": 8.068847656249999e-07, + "loss": 0.0026, + "reward": 1.6261619925498962, + "reward_std": 0.07076285779476166, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6261619329452515, + "step": 1582 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.578125, + "epoch": 0.77294921875, + "grad_norm": 2.3321912897118713, + "kl": 0.078369140625, + "learning_rate": 8.067626953124999e-07, + "loss": 0.0031, + "reward": 1.7845726013183594, + "reward_std": 0.15707527101039886, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8001976609230042, + "step": 1583 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.8984375, + "epoch": 0.7734375, + "grad_norm": 1.0990319632980867, + "kl": 0.0711669921875, + "learning_rate": 8.066406249999999e-07, + "loss": 0.0028, + "reward": 1.7625375986099243, + "reward_std": 0.05330556631088257, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7703501284122467, + "step": 1584 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.3359375, + "epoch": 0.77392578125, + "grad_norm": 2.118844572781893, + "kl": 0.06396484375, + "learning_rate": 8.065185546875e-07, + "loss": 0.0026, + "reward": 1.7572776079177856, + "reward_std": 0.09189710766077042, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7572776079177856, + "step": 1585 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.7734375, + "epoch": 0.7744140625, + "grad_norm": 3.9816311052864553, + "kl": 0.0728759765625, + "learning_rate": 8.06396484375e-07, + "loss": 0.0029, + "reward": 1.7010331749916077, + "reward_std": 0.1283012256026268, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7166581749916077, + "step": 1586 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.9140625, + "epoch": 0.77490234375, + "grad_norm": 1.3209118767681105, + "kl": 0.050537109375, + "learning_rate": 8.062744140625e-07, + "loss": 0.002, + "reward": 1.8046178817749023, + "reward_std": 0.10538148693740368, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8124303817749023, + "step": 1587 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.1484375, + "epoch": 0.775390625, + "grad_norm": 2.5132872532943624, + "kl": 0.046875, + "learning_rate": 8.0615234375e-07, + "loss": 0.0019, + "reward": 1.7433744668960571, + "reward_std": 0.09972074255347252, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.75118687748909, + "step": 1588 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.2578125, + "epoch": 0.77587890625, + "grad_norm": 3.40521583010745, + "kl": 0.085693359375, + "learning_rate": 8.060302734374999e-07, + "loss": 0.0034, + "reward": 1.628357172012329, + "reward_std": 0.09522592648863792, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6361695826053619, + "step": 1589 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.2265625, + "epoch": 0.7763671875, + "grad_norm": 3.0455379805887093, + "kl": 0.0606689453125, + "learning_rate": 8.059082031249999e-07, + "loss": 0.0024, + "reward": 1.641897439956665, + "reward_std": 0.08232817053794861, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6418974995613098, + "step": 1590 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.2734375, + "epoch": 0.77685546875, + "grad_norm": 2.2430367344910427, + "kl": 0.0614013671875, + "learning_rate": 8.057861328125e-07, + "loss": 0.0025, + "reward": 1.768738031387329, + "reward_std": 0.07021267339587212, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7687380015850067, + "step": 1591 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.3828125, + "epoch": 0.77734375, + "grad_norm": 2.20938289635979, + "kl": 0.069091796875, + "learning_rate": 8.056640625e-07, + "loss": 0.0028, + "reward": 1.8064799904823303, + "reward_std": 0.04143555276095867, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8064799904823303, + "step": 1592 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.671875, + "epoch": 0.77783203125, + "grad_norm": 2.7105879555153303, + "kl": 0.060791015625, + "learning_rate": 8.055419921875e-07, + "loss": 0.0024, + "reward": 1.6281877756118774, + "reward_std": 0.1271475814282894, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6281877756118774, + "step": 1593 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.84375, + "epoch": 0.7783203125, + "grad_norm": 5.064693794868452, + "kl": 0.0628662109375, + "learning_rate": 8.05419921875e-07, + "loss": 0.0025, + "reward": 1.704953670501709, + "reward_std": 0.09572456032037735, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.704953670501709, + "step": 1594 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.296875, + "epoch": 0.77880859375, + "grad_norm": 1.2322362890361453, + "kl": 0.0523681640625, + "learning_rate": 8.052978515624999e-07, + "loss": 0.0021, + "reward": 1.7966317534446716, + "reward_std": 0.09780865162611008, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8044441640377045, + "step": 1595 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.6875, + "epoch": 0.779296875, + "grad_norm": 2.587066136267263, + "kl": 0.0574951171875, + "learning_rate": 8.051757812499999e-07, + "loss": 0.0023, + "reward": 1.748351275920868, + "reward_std": 0.11127368733286858, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7483512759208679, + "step": 1596 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.96875, + "epoch": 0.77978515625, + "grad_norm": 8.678514629911282, + "kl": 0.0655517578125, + "learning_rate": 8.050537109374999e-07, + "loss": 0.0026, + "reward": 1.7842652797698975, + "reward_std": 0.056360941380262375, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7842652797698975, + "step": 1597 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.6796875, + "epoch": 0.7802734375, + "grad_norm": 3.8186079651609486, + "kl": 0.0589599609375, + "learning_rate": 8.04931640625e-07, + "loss": 0.0024, + "reward": 1.8417965769767761, + "reward_std": 0.07890859059989452, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8496091067790985, + "step": 1598 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.578125, + "epoch": 0.78076171875, + "grad_norm": 2.6414016724776146, + "kl": 0.0634765625, + "learning_rate": 8.048095703125e-07, + "loss": 0.0025, + "reward": 1.8113459348678589, + "reward_std": 0.042547447606921196, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8113458752632141, + "step": 1599 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.65625, + "epoch": 0.78125, + "grad_norm": 2.7042761180533192, + "kl": 0.0718994140625, + "learning_rate": 8.046875e-07, + "loss": 0.0029, + "reward": 1.7982996702194214, + "reward_std": 0.10386446584016085, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8061122000217438, + "step": 1600 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.8203125, + "epoch": 0.78173828125, + "grad_norm": 1.1262604316715779, + "kl": 0.078125, + "learning_rate": 8.045654296875e-07, + "loss": 0.0031, + "reward": 1.8404181599617004, + "reward_std": 0.03405761159956455, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8404182195663452, + "step": 1601 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.75, + "epoch": 0.7822265625, + "grad_norm": 2.3428168026992586, + "kl": 0.064697265625, + "learning_rate": 8.044433593749999e-07, + "loss": 0.0026, + "reward": 1.770385503768921, + "reward_std": 0.06261442601680756, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7703855335712433, + "step": 1602 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.7109375, + "epoch": 0.78271484375, + "grad_norm": 1.4573875580192406, + "kl": 0.0693359375, + "learning_rate": 8.043212890624999e-07, + "loss": 0.0028, + "reward": 1.8531925678253174, + "reward_std": 0.04907483607530594, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8531925678253174, + "step": 1603 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.0859375, + "epoch": 0.783203125, + "grad_norm": 2.097256901008272, + "kl": 0.04833984375, + "learning_rate": 8.0419921875e-07, + "loss": 0.0019, + "reward": 1.6914434432983398, + "reward_std": 0.12963934242725372, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7070684731006622, + "step": 1604 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.40625, + "epoch": 0.78369140625, + "grad_norm": 1.9320837029873765, + "kl": 0.0601806640625, + "learning_rate": 8.040771484375e-07, + "loss": 0.0024, + "reward": 1.680255651473999, + "reward_std": 0.1190883181989193, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6880680620670319, + "step": 1605 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.9375, + "epoch": 0.7841796875, + "grad_norm": 1.0873386862814498, + "kl": 0.0552978515625, + "learning_rate": 8.03955078125e-07, + "loss": 0.0022, + "reward": 1.7301841378211975, + "reward_std": 0.03086886089295149, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7301841080188751, + "step": 1606 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.640625, + "epoch": 0.78466796875, + "grad_norm": 1.305516177619871, + "kl": 0.07568359375, + "learning_rate": 8.038330078125e-07, + "loss": 0.003, + "reward": 1.651597023010254, + "reward_std": 0.0745653323829174, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6515969634056091, + "step": 1607 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.890625, + "epoch": 0.78515625, + "grad_norm": 1.853930940155658, + "kl": 0.0611572265625, + "learning_rate": 8.037109375e-07, + "loss": 0.0024, + "reward": 1.7472956776618958, + "reward_std": 0.036410000175237656, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.747295618057251, + "step": 1608 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.0390625, + "epoch": 0.78564453125, + "grad_norm": 1.5524349811107778, + "kl": 0.0557861328125, + "learning_rate": 8.035888671874999e-07, + "loss": 0.0022, + "reward": 1.757796585559845, + "reward_std": 0.096245177090168, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.773421585559845, + "step": 1609 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.859375, + "epoch": 0.7861328125, + "grad_norm": 12.21460617267679, + "kl": 0.0509033203125, + "learning_rate": 8.034667968749999e-07, + "loss": 0.002, + "reward": 1.6556835770606995, + "reward_std": 0.08177720569074154, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6556835770606995, + "step": 1610 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.9609375, + "epoch": 0.78662109375, + "grad_norm": 1.2988283875580688, + "kl": 0.0513916015625, + "learning_rate": 8.033447265625e-07, + "loss": 0.0021, + "reward": 1.7968943119049072, + "reward_std": 0.03818834759294987, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7968942821025848, + "step": 1611 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.8359375, + "epoch": 0.787109375, + "grad_norm": 1.853996721692202, + "kl": 0.0478515625, + "learning_rate": 8.0322265625e-07, + "loss": 0.0019, + "reward": 1.6934837102890015, + "reward_std": 0.06388338282704353, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6934837400913239, + "step": 1612 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.0859375, + "epoch": 0.78759765625, + "grad_norm": 0.9164065489554429, + "kl": 0.0616455078125, + "learning_rate": 8.031005859375e-07, + "loss": 0.0025, + "reward": 1.6735413074493408, + "reward_std": 0.0621509775519371, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6735413372516632, + "step": 1613 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.4375, + "epoch": 0.7880859375, + "grad_norm": 2.82590067781753, + "kl": 0.0567626953125, + "learning_rate": 8.02978515625e-07, + "loss": 0.0023, + "reward": 1.6877517700195312, + "reward_std": 0.0880160890519619, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6877517700195312, + "step": 1614 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.6640625, + "epoch": 0.78857421875, + "grad_norm": 20.08903138023436, + "kl": 0.062744140625, + "learning_rate": 8.028564453124999e-07, + "loss": 0.0025, + "reward": 1.762800931930542, + "reward_std": 0.0888824425637722, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.762800931930542, + "step": 1615 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.515625, + "epoch": 0.7890625, + "grad_norm": 1.7330784289366532, + "kl": 0.0504150390625, + "learning_rate": 8.027343749999999e-07, + "loss": 0.002, + "reward": 1.8259278535842896, + "reward_std": 0.08520985394716263, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8415527939796448, + "step": 1616 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.75, + "epoch": 0.78955078125, + "grad_norm": 2.062097230316505, + "kl": 0.06298828125, + "learning_rate": 8.026123046875e-07, + "loss": 0.0025, + "reward": 1.7127341032028198, + "reward_std": 0.049558693543076515, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.712734043598175, + "step": 1617 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.4140625, + "epoch": 0.7900390625, + "grad_norm": 1.6783295426613631, + "kl": 0.065185546875, + "learning_rate": 8.02490234375e-07, + "loss": 0.0026, + "reward": 1.8034849166870117, + "reward_std": 0.10387159883975983, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8034849166870117, + "step": 1618 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.15625, + "epoch": 0.79052734375, + "grad_norm": 1.6933921135873553, + "kl": 0.0609130859375, + "learning_rate": 8.023681640625e-07, + "loss": 0.0024, + "reward": 1.8547474145889282, + "reward_std": 0.07110052555799484, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8547475039958954, + "step": 1619 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.0234375, + "epoch": 0.791015625, + "grad_norm": 2.7831972038485997, + "kl": 0.070068359375, + "learning_rate": 8.0224609375e-07, + "loss": 0.0028, + "reward": 1.6170747876167297, + "reward_std": 0.07357279863208532, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6170747876167297, + "step": 1620 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.2109375, + "epoch": 0.79150390625, + "grad_norm": 1.7306710169661095, + "kl": 0.060302734375, + "learning_rate": 8.021240234375e-07, + "loss": 0.0024, + "reward": 1.7135973572731018, + "reward_std": 0.04537785239517689, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7135973572731018, + "step": 1621 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.7265625, + "epoch": 0.7919921875, + "grad_norm": 2.1555906518927137, + "kl": 0.0592041015625, + "learning_rate": 8.020019531249999e-07, + "loss": 0.0024, + "reward": 1.6998938918113708, + "reward_std": 0.07550182193517685, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6998938918113708, + "step": 1622 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.84375, + "epoch": 0.79248046875, + "grad_norm": 1.4507712079046071, + "kl": 0.0517578125, + "learning_rate": 8.018798828124999e-07, + "loss": 0.0021, + "reward": 1.7838214635849, + "reward_std": 0.08774328604340553, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7838214635848999, + "step": 1623 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.6171875, + "epoch": 0.79296875, + "grad_norm": 1.0757521287144136, + "kl": 0.041259765625, + "learning_rate": 8.017578125e-07, + "loss": 0.0017, + "reward": 1.8050659894943237, + "reward_std": 0.036978503689169884, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.805065929889679, + "step": 1624 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.75, + "epoch": 0.79345703125, + "grad_norm": 7.947108074112008, + "kl": 0.051513671875, + "learning_rate": 8.016357421875e-07, + "loss": 0.0021, + "reward": 1.6861704587936401, + "reward_std": 0.038627080619335175, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6861703991889954, + "step": 1625 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.2734375, + "epoch": 0.7939453125, + "grad_norm": 1.9454989447472923, + "kl": 0.06591796875, + "learning_rate": 8.01513671875e-07, + "loss": 0.0026, + "reward": 1.6758694648742676, + "reward_std": 0.05895833298563957, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6758694648742676, + "step": 1626 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.75, + "epoch": 0.79443359375, + "grad_norm": 0.7165968159288184, + "kl": 0.0621337890625, + "learning_rate": 8.013916015625e-07, + "loss": 0.0025, + "reward": 1.6823166608810425, + "reward_std": 0.02876619715243578, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6823166906833649, + "step": 1627 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.921875, + "epoch": 0.794921875, + "grad_norm": 1.5106278082685252, + "kl": 0.0543212890625, + "learning_rate": 8.012695312499999e-07, + "loss": 0.0022, + "reward": 1.6847730875015259, + "reward_std": 0.13465760834515095, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7082105278968811, + "step": 1628 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.421875, + "epoch": 0.79541015625, + "grad_norm": 3.32714688701594, + "kl": 0.063720703125, + "learning_rate": 8.011474609374999e-07, + "loss": 0.0025, + "reward": 1.7635406851768494, + "reward_std": 0.10127770528197289, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7635407149791718, + "step": 1629 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.3984375, + "epoch": 0.7958984375, + "grad_norm": 1.051302145633701, + "kl": 0.070556640625, + "learning_rate": 8.01025390625e-07, + "loss": 0.0028, + "reward": 1.6394500732421875, + "reward_std": 0.11420125816948712, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6707001626491547, + "step": 1630 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.1640625, + "epoch": 0.79638671875, + "grad_norm": 4.617980559742919, + "kl": 0.056396484375, + "learning_rate": 8.009033203125e-07, + "loss": 0.0023, + "reward": 1.6397234201431274, + "reward_std": 0.1288561257533729, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6553484499454498, + "step": 1631 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.515625, + "epoch": 0.796875, + "grad_norm": 3.2567614792509887, + "kl": 0.065185546875, + "learning_rate": 8.0078125e-07, + "loss": 0.0026, + "reward": 1.552538812160492, + "reward_std": 0.054762667044997215, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5525388121604919, + "step": 1632 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.671875, + "epoch": 0.79736328125, + "grad_norm": 1.051014835566516, + "kl": 0.0572509765625, + "learning_rate": 8.006591796875e-07, + "loss": 0.0023, + "reward": 1.8010156750679016, + "reward_std": 0.047447606921195984, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8088282346725464, + "step": 1633 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.84375, + "epoch": 0.7978515625, + "grad_norm": 2.2192496100152415, + "kl": 0.0694580078125, + "learning_rate": 8.00537109375e-07, + "loss": 0.0028, + "reward": 1.7100372314453125, + "reward_std": 0.07988406717777252, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7100372314453125, + "step": 1634 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.8828125, + "epoch": 0.79833984375, + "grad_norm": 6.815933212112274, + "kl": 0.0489501953125, + "learning_rate": 8.004150390624999e-07, + "loss": 0.002, + "reward": 1.5929869413375854, + "reward_std": 0.12428093701601028, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6476744413375854, + "step": 1635 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.5625, + "epoch": 0.798828125, + "grad_norm": 4.025394315862172, + "kl": 0.057373046875, + "learning_rate": 8.002929687499999e-07, + "loss": 0.0023, + "reward": 1.6940549612045288, + "reward_std": 0.12819510325789452, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7174924910068512, + "step": 1636 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.109375, + "epoch": 0.79931640625, + "grad_norm": 58.06981888749972, + "kl": 0.065673828125, + "learning_rate": 8.001708984375e-07, + "loss": 0.0026, + "reward": 1.725978434085846, + "reward_std": 0.02466776454821229, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.725978434085846, + "step": 1637 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.09375, + "epoch": 0.7998046875, + "grad_norm": 7.519107809173853, + "kl": 0.0826416015625, + "learning_rate": 8.00048828125e-07, + "loss": 0.0033, + "reward": 1.7485601305961609, + "reward_std": 0.04266110900789499, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7485601305961609, + "step": 1638 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.234375, + "epoch": 0.80029296875, + "grad_norm": 3.1218867448796446, + "kl": 0.0673828125, + "learning_rate": 7.999267578125e-07, + "loss": 0.0027, + "reward": 1.772888958454132, + "reward_std": 0.04786605387926102, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7728888988494873, + "step": 1639 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.421875, + "epoch": 0.80078125, + "grad_norm": 3.916540661111048, + "kl": 0.064697265625, + "learning_rate": 7.998046875e-07, + "loss": 0.0026, + "reward": 1.6346943378448486, + "reward_std": 0.15634194761514664, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6581318378448486, + "step": 1640 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.046875, + "epoch": 0.80126953125, + "grad_norm": 2.7275559731450985, + "kl": 0.068359375, + "learning_rate": 7.996826171874999e-07, + "loss": 0.0027, + "reward": 1.7609490156173706, + "reward_std": 0.09491265751421452, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7687614560127258, + "step": 1641 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.953125, + "epoch": 0.8017578125, + "grad_norm": 1.3559434809235391, + "kl": 0.05126953125, + "learning_rate": 7.995605468749999e-07, + "loss": 0.002, + "reward": 1.7670413851737976, + "reward_std": 0.046386873349547386, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7670413553714752, + "step": 1642 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.6484375, + "epoch": 0.80224609375, + "grad_norm": 0.9556319282815477, + "kl": 0.060302734375, + "learning_rate": 7.994384765625e-07, + "loss": 0.0024, + "reward": 1.7157460451126099, + "reward_std": 0.11387444660067558, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7548085451126099, + "step": 1643 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.09375, + "epoch": 0.802734375, + "grad_norm": 1.7945424457176717, + "kl": 0.0653076171875, + "learning_rate": 7.9931640625e-07, + "loss": 0.0026, + "reward": 1.6921892762184143, + "reward_std": 0.18082339316606522, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7156267762184143, + "step": 1644 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.6015625, + "epoch": 0.80322265625, + "grad_norm": 1.75248051233542, + "kl": 0.0631103515625, + "learning_rate": 7.991943359375e-07, + "loss": 0.0025, + "reward": 1.728013813495636, + "reward_std": 0.12820342928171158, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.751451313495636, + "step": 1645 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.4140625, + "epoch": 0.8037109375, + "grad_norm": 1.5536038507658307, + "kl": 0.054443359375, + "learning_rate": 7.99072265625e-07, + "loss": 0.0022, + "reward": 1.7276506423950195, + "reward_std": 0.08504182286560535, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7354631721973419, + "step": 1646 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.8671875, + "epoch": 0.80419921875, + "grad_norm": 1.3916694913938594, + "kl": 0.056884765625, + "learning_rate": 7.989501953125e-07, + "loss": 0.0023, + "reward": 1.7095491290092468, + "reward_std": 0.12051964923739433, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7329866290092468, + "step": 1647 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.9375, + "epoch": 0.8046875, + "grad_norm": 3.2993635236882883, + "kl": 0.0543212890625, + "learning_rate": 7.988281249999999e-07, + "loss": 0.0022, + "reward": 1.8719586730003357, + "reward_std": 0.06403150595724583, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8797712028026581, + "step": 1648 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.46875, + "epoch": 0.80517578125, + "grad_norm": 1.5159143122250482, + "kl": 0.0556640625, + "learning_rate": 7.987060546874999e-07, + "loss": 0.0022, + "reward": 1.6899768710136414, + "reward_std": 0.026911514345556498, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6899769306182861, + "step": 1649 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.15625, + "epoch": 0.8056640625, + "grad_norm": 1.7764803050121032, + "kl": 0.0616455078125, + "learning_rate": 7.98583984375e-07, + "loss": 0.0025, + "reward": 1.7939913868904114, + "reward_std": 0.08574027381837368, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8018038868904114, + "step": 1650 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.875, + "epoch": 0.80615234375, + "grad_norm": 1.7844460075537483, + "kl": 0.0601806640625, + "learning_rate": 7.984619140625e-07, + "loss": 0.0024, + "reward": 1.7695591449737549, + "reward_std": 0.07760765310376883, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7773716747760773, + "step": 1651 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.5078125, + "epoch": 0.806640625, + "grad_norm": 0.8825730937574552, + "kl": 0.04833984375, + "learning_rate": 7.9833984375e-07, + "loss": 0.0019, + "reward": 1.680052638053894, + "reward_std": 0.056173376739025116, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.680052638053894, + "step": 1652 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.84375, + "epoch": 0.80712890625, + "grad_norm": 1.582841548895619, + "kl": 0.0606689453125, + "learning_rate": 7.982177734375e-07, + "loss": 0.0024, + "reward": 1.8114255666732788, + "reward_std": 0.0704609714448452, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8114255368709564, + "step": 1653 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.46875, + "epoch": 0.8076171875, + "grad_norm": 3.246295454745955, + "kl": 0.0618896484375, + "learning_rate": 7.980957031249999e-07, + "loss": 0.0025, + "reward": 1.7257680296897888, + "reward_std": 0.097720542922616, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7413930892944336, + "step": 1654 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.828125, + "epoch": 0.80810546875, + "grad_norm": 8.02326668523963, + "kl": 0.06982421875, + "learning_rate": 7.979736328124999e-07, + "loss": 0.0028, + "reward": 1.832155466079712, + "reward_std": 0.06834479048848152, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8321554660797119, + "step": 1655 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.4609375, + "epoch": 0.80859375, + "grad_norm": 1.8274593522243785, + "kl": 0.0556640625, + "learning_rate": 7.978515624999999e-07, + "loss": 0.0022, + "reward": 1.7873517274856567, + "reward_std": 0.030621130019426346, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7873516976833344, + "step": 1656 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.140625, + "epoch": 0.80908203125, + "grad_norm": 1.4152329913916, + "kl": 0.0572509765625, + "learning_rate": 7.977294921875e-07, + "loss": 0.0023, + "reward": 1.7179552912712097, + "reward_std": 0.09283644892275333, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7179553210735321, + "step": 1657 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.5078125, + "epoch": 0.8095703125, + "grad_norm": 1.913575589493602, + "kl": 0.07080078125, + "learning_rate": 7.97607421875e-07, + "loss": 0.0028, + "reward": 1.7112517356872559, + "reward_std": 0.08975563384592533, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7190642654895782, + "step": 1658 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.5703125, + "epoch": 0.81005859375, + "grad_norm": 2.458378061669866, + "kl": 0.0548095703125, + "learning_rate": 7.974853515625e-07, + "loss": 0.0022, + "reward": 1.7066927552223206, + "reward_std": 0.0858432799577713, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7066927552223206, + "step": 1659 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.359375, + "epoch": 0.810546875, + "grad_norm": 17.6320865532267, + "kl": 0.0396728515625, + "learning_rate": 7.9736328125e-07, + "loss": 0.0016, + "reward": 1.7677738666534424, + "reward_std": 0.04016917198896408, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7677737772464752, + "step": 1660 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.046875, + "epoch": 0.81103515625, + "grad_norm": 2.7897545470605953, + "kl": 0.0859375, + "learning_rate": 7.972412109374999e-07, + "loss": 0.0034, + "reward": 1.7481929063796997, + "reward_std": 0.1298337448388338, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7560054063796997, + "step": 1661 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.28125, + "epoch": 0.8115234375, + "grad_norm": 1.3955624722306827, + "kl": 0.0496826171875, + "learning_rate": 7.971191406249999e-07, + "loss": 0.002, + "reward": 1.782720685005188, + "reward_std": 0.04922756180167198, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.782720685005188, + "step": 1662 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.265625, + "epoch": 0.81201171875, + "grad_norm": 2.3186462742034055, + "kl": 0.076171875, + "learning_rate": 7.969970703125e-07, + "loss": 0.003, + "reward": 1.715933918952942, + "reward_std": 0.09554797038435936, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7159339785575867, + "step": 1663 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.203125, + "epoch": 0.8125, + "grad_norm": 1.3672621736408754, + "kl": 0.0516357421875, + "learning_rate": 7.96875e-07, + "loss": 0.0021, + "reward": 1.6866209506988525, + "reward_std": 0.0732644684612751, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7022460103034973, + "step": 1664 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.0390625, + "epoch": 0.81298828125, + "grad_norm": 2.4881291764185476, + "kl": 0.053466796875, + "learning_rate": 7.967529296875e-07, + "loss": 0.0021, + "reward": 1.7344902157783508, + "reward_std": 0.056121040135622025, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7344902157783508, + "step": 1665 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.109375, + "epoch": 0.8134765625, + "grad_norm": 1.9066576061889415, + "kl": 0.056396484375, + "learning_rate": 7.96630859375e-07, + "loss": 0.0023, + "reward": 1.777056872844696, + "reward_std": 0.10177679359912872, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7770569026470184, + "step": 1666 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.15625, + "epoch": 0.81396484375, + "grad_norm": 5.827878805206846, + "kl": 0.05078125, + "learning_rate": 7.965087890624999e-07, + "loss": 0.002, + "reward": 1.8067973852157593, + "reward_std": 0.0667799562215805, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8067973554134369, + "step": 1667 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.53125, + "epoch": 0.814453125, + "grad_norm": 1.8284939496819252, + "kl": 0.0498046875, + "learning_rate": 7.963867187499999e-07, + "loss": 0.002, + "reward": 1.7949933409690857, + "reward_std": 0.12090729176998138, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8028059005737305, + "step": 1668 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.7578125, + "epoch": 0.81494140625, + "grad_norm": 1.7262715269151432, + "kl": 0.060791015625, + "learning_rate": 7.962646484374999e-07, + "loss": 0.0024, + "reward": 1.7913283109664917, + "reward_std": 0.09738441929221153, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7991408109664917, + "step": 1669 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.7421875, + "epoch": 0.8154296875, + "grad_norm": 3.374343895722311, + "kl": 0.04833984375, + "learning_rate": 7.96142578125e-07, + "loss": 0.0019, + "reward": 1.7700502276420593, + "reward_std": 0.11497660167515278, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7934877574443817, + "step": 1670 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.046875, + "epoch": 0.81591796875, + "grad_norm": 8.785283536535555, + "kl": 0.053466796875, + "learning_rate": 7.960205078125e-07, + "loss": 0.0021, + "reward": 1.803626537322998, + "reward_std": 0.04715009219944477, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8036265671253204, + "step": 1671 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.1796875, + "epoch": 0.81640625, + "grad_norm": 3.7883503464800756, + "kl": 0.05322265625, + "learning_rate": 7.958984375e-07, + "loss": 0.0021, + "reward": 1.789641559123993, + "reward_std": 0.07576981373131275, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7974540293216705, + "step": 1672 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.4296875, + "epoch": 0.81689453125, + "grad_norm": 2.623805320468466, + "kl": 0.05810546875, + "learning_rate": 7.957763671875e-07, + "loss": 0.0023, + "reward": 1.6763262748718262, + "reward_std": 0.07890587951987982, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6763262748718262, + "step": 1673 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.6484375, + "epoch": 0.8173828125, + "grad_norm": 1.265922144311732, + "kl": 0.0582275390625, + "learning_rate": 7.956542968749999e-07, + "loss": 0.0023, + "reward": 1.8311368823051453, + "reward_std": 0.05044192261993885, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.83113694190979, + "step": 1674 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.3828125, + "epoch": 0.81787109375, + "grad_norm": 1.1222356683541728, + "kl": 0.0572509765625, + "learning_rate": 7.955322265624999e-07, + "loss": 0.0023, + "reward": 1.777391493320465, + "reward_std": 0.028398778289556503, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7773914933204651, + "step": 1675 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.671875, + "epoch": 0.818359375, + "grad_norm": 1.3969206681569204, + "kl": 0.0718994140625, + "learning_rate": 7.9541015625e-07, + "loss": 0.0029, + "reward": 1.7802749872207642, + "reward_std": 0.03433122206479311, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7802750468254089, + "step": 1676 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.2421875, + "epoch": 0.81884765625, + "grad_norm": 1.7764370939806322, + "kl": 0.0650634765625, + "learning_rate": 7.952880859375e-07, + "loss": 0.0026, + "reward": 1.754819393157959, + "reward_std": 0.05870789662003517, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7548194527626038, + "step": 1677 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.0078125, + "epoch": 0.8193359375, + "grad_norm": 3.8365527733566775, + "kl": 0.0601806640625, + "learning_rate": 7.95166015625e-07, + "loss": 0.0024, + "reward": 1.7233573198318481, + "reward_std": 0.07569370232522488, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7233573496341705, + "step": 1678 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.8203125, + "epoch": 0.81982421875, + "grad_norm": 0.9365014960349338, + "kl": 0.0455322265625, + "learning_rate": 7.950439453125e-07, + "loss": 0.0018, + "reward": 1.6805170774459839, + "reward_std": 0.10303526744246483, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6961420774459839, + "step": 1679 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.375, + "epoch": 0.8203125, + "grad_norm": 2.7631417410653833, + "kl": 0.0693359375, + "learning_rate": 7.949218749999999e-07, + "loss": 0.0028, + "reward": 1.8013280034065247, + "reward_std": 0.04317835159599781, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8013280034065247, + "step": 1680 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.4921875, + "epoch": 0.82080078125, + "grad_norm": 1.7477349200279404, + "kl": 0.05322265625, + "learning_rate": 7.947998046874999e-07, + "loss": 0.0021, + "reward": 1.7636698484420776, + "reward_std": 0.08129507303237915, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7636699080467224, + "step": 1681 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.2890625, + "epoch": 0.8212890625, + "grad_norm": 42.813192741926215, + "kl": 0.0697021484375, + "learning_rate": 7.946777343749999e-07, + "loss": 0.0028, + "reward": 1.707559585571289, + "reward_std": 0.04496626928448677, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7075595855712891, + "step": 1682 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.2734375, + "epoch": 0.82177734375, + "grad_norm": 0.9238209123448747, + "kl": 0.061279296875, + "learning_rate": 7.945556640625e-07, + "loss": 0.0024, + "reward": 1.7082937955856323, + "reward_std": 0.08139174059033394, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7161062955856323, + "step": 1683 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.890625, + "epoch": 0.822265625, + "grad_norm": 1.8243450346823835, + "kl": 0.0543212890625, + "learning_rate": 7.9443359375e-07, + "loss": 0.0022, + "reward": 1.7618046402931213, + "reward_std": 0.08494714740663767, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7696171402931213, + "step": 1684 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.28125, + "epoch": 0.82275390625, + "grad_norm": 0.9658134449963954, + "kl": 0.0517578125, + "learning_rate": 7.943115234375e-07, + "loss": 0.0021, + "reward": 1.8145250082015991, + "reward_std": 0.0333581417798996, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8145250082015991, + "step": 1685 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.1171875, + "epoch": 0.8232421875, + "grad_norm": 3.3053961272613166, + "kl": 0.0567626953125, + "learning_rate": 7.94189453125e-07, + "loss": 0.0023, + "reward": 1.7290484309196472, + "reward_std": 0.07806419394910336, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.736860990524292, + "step": 1686 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.3828125, + "epoch": 0.82373046875, + "grad_norm": 2.2314315017058552, + "kl": 0.0634765625, + "learning_rate": 7.940673828124999e-07, + "loss": 0.0025, + "reward": 1.694116473197937, + "reward_std": 0.09638424962759018, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6941164433956146, + "step": 1687 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.6484375, + "epoch": 0.82421875, + "grad_norm": 28.71281793769175, + "kl": 0.072509765625, + "learning_rate": 7.939453124999999e-07, + "loss": 0.0029, + "reward": 1.6306800842285156, + "reward_std": 0.06786506250500679, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6306800842285156, + "step": 1688 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.7734375, + "epoch": 0.82470703125, + "grad_norm": 1.81316275312339, + "kl": 0.062255859375, + "learning_rate": 7.938232421875e-07, + "loss": 0.0025, + "reward": 1.754611313343048, + "reward_std": 0.04153428506106138, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7546113431453705, + "step": 1689 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.328125, + "epoch": 0.8251953125, + "grad_norm": 1.230137752824096, + "kl": 0.0703125, + "learning_rate": 7.93701171875e-07, + "loss": 0.0028, + "reward": 1.7025874853134155, + "reward_std": 0.08435030654072762, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7025875449180603, + "step": 1690 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.2109375, + "epoch": 0.82568359375, + "grad_norm": 3.3713817064505265, + "kl": 0.072509765625, + "learning_rate": 7.935791015625e-07, + "loss": 0.0029, + "reward": 1.6434346437454224, + "reward_std": 0.04744470492005348, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6434346735477448, + "step": 1691 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.234375, + "epoch": 0.826171875, + "grad_norm": 1.7471049654231616, + "kl": 0.052978515625, + "learning_rate": 7.9345703125e-07, + "loss": 0.0021, + "reward": 1.7282820343971252, + "reward_std": 0.08323949202895164, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.73609459400177, + "step": 1692 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.8046875, + "epoch": 0.82666015625, + "grad_norm": 1.8268182609321926, + "kl": 0.0521240234375, + "learning_rate": 7.933349609375e-07, + "loss": 0.0021, + "reward": 1.845442295074463, + "reward_std": 0.08247396722435951, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8454422950744629, + "step": 1693 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.28125, + "epoch": 0.8271484375, + "grad_norm": 2.2899500423574617, + "kl": 0.0655517578125, + "learning_rate": 7.932128906249999e-07, + "loss": 0.0026, + "reward": 1.7103111743927002, + "reward_std": 0.018862903118133545, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7103111147880554, + "step": 1694 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.203125, + "epoch": 0.82763671875, + "grad_norm": 0.824676866426935, + "kl": 0.0635986328125, + "learning_rate": 7.930908203124999e-07, + "loss": 0.0025, + "reward": 1.7673900723457336, + "reward_std": 0.05198059044778347, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7673900127410889, + "step": 1695 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.6796875, + "epoch": 0.828125, + "grad_norm": 1.5355026194431203, + "kl": 0.066650390625, + "learning_rate": 7.9296875e-07, + "loss": 0.0027, + "reward": 1.683157503604889, + "reward_std": 0.14624864608049393, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6987824440002441, + "step": 1696 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.890625, + "epoch": 0.82861328125, + "grad_norm": 2.0932753654505145, + "kl": 0.0615234375, + "learning_rate": 7.928466796875e-07, + "loss": 0.0025, + "reward": 1.7546579837799072, + "reward_std": 0.048308661207556725, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7546580135822296, + "step": 1697 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.875, + "epoch": 0.8291015625, + "grad_norm": 1.4463454973748548, + "kl": 0.0565185546875, + "learning_rate": 7.92724609375e-07, + "loss": 0.0023, + "reward": 1.8585106134414673, + "reward_std": 0.07304185070097446, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8585106730461121, + "step": 1698 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.109375, + "epoch": 0.82958984375, + "grad_norm": 2.8574585912703454, + "kl": 0.052734375, + "learning_rate": 7.926025390625e-07, + "loss": 0.0021, + "reward": 1.8252478241920471, + "reward_std": 0.06224694475531578, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8252477645874023, + "step": 1699 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.15625, + "epoch": 0.830078125, + "grad_norm": 3.057914572093619, + "kl": 0.047119140625, + "learning_rate": 7.924804687499999e-07, + "loss": 0.0019, + "reward": 1.7930091619491577, + "reward_std": 0.0563307236880064, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7930091023445129, + "step": 1700 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.3984375, + "epoch": 0.83056640625, + "grad_norm": 4.684114511035403, + "kl": 0.078125, + "learning_rate": 7.923583984374999e-07, + "loss": 0.0031, + "reward": 1.716322124004364, + "reward_std": 0.10844194889068604, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7163220942020416, + "step": 1701 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.4140625, + "epoch": 0.8310546875, + "grad_norm": 0.8021038944890889, + "kl": 0.053466796875, + "learning_rate": 7.92236328125e-07, + "loss": 0.0021, + "reward": 1.882490634918213, + "reward_std": 0.033562688156962395, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8824906647205353, + "step": 1702 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.125, + "epoch": 0.83154296875, + "grad_norm": 1.6259472943086517, + "kl": 0.0560302734375, + "learning_rate": 7.921142578125e-07, + "loss": 0.0022, + "reward": 1.7306804060935974, + "reward_std": 0.06768567860126495, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.730680376291275, + "step": 1703 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.0625, + "epoch": 0.83203125, + "grad_norm": 1.5177838768420855, + "kl": 0.0523681640625, + "learning_rate": 7.919921875e-07, + "loss": 0.0021, + "reward": 1.7253316640853882, + "reward_std": 0.08401273377239704, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7331441044807434, + "step": 1704 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.3203125, + "epoch": 0.83251953125, + "grad_norm": 3.735972907900447, + "kl": 0.0751953125, + "learning_rate": 7.918701171875e-07, + "loss": 0.003, + "reward": 1.7001928091049194, + "reward_std": 0.12967666238546371, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7001928091049194, + "step": 1705 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.890625, + "epoch": 0.8330078125, + "grad_norm": 1.4428376073481957, + "kl": 0.072021484375, + "learning_rate": 7.91748046875e-07, + "loss": 0.0029, + "reward": 1.723404347896576, + "reward_std": 0.055715300142765045, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7234043180942535, + "step": 1706 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.4765625, + "epoch": 0.83349609375, + "grad_norm": 1.345764950057352, + "kl": 0.0791015625, + "learning_rate": 7.916259765624999e-07, + "loss": 0.0032, + "reward": 1.6827195286750793, + "reward_std": 0.04551626928150654, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6827195584774017, + "step": 1707 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.6796875, + "epoch": 0.833984375, + "grad_norm": 1.3356449765278338, + "kl": 0.048828125, + "learning_rate": 7.915039062499999e-07, + "loss": 0.002, + "reward": 1.700093388557434, + "reward_std": 0.07758311927318573, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7000934183597565, + "step": 1708 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.9765625, + "epoch": 0.83447265625, + "grad_norm": 4.208157265473066, + "kl": 0.0489501953125, + "learning_rate": 7.913818359375e-07, + "loss": 0.002, + "reward": 1.6747546792030334, + "reward_std": 0.05638587847352028, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6747547090053558, + "step": 1709 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.0859375, + "epoch": 0.8349609375, + "grad_norm": 1.1933173641003354, + "kl": 0.057861328125, + "learning_rate": 7.91259765625e-07, + "loss": 0.0023, + "reward": 1.814025104045868, + "reward_std": 0.031041912734508514, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8140251636505127, + "step": 1710 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.109375, + "epoch": 0.83544921875, + "grad_norm": 1.533800629531482, + "kl": 0.0560302734375, + "learning_rate": 7.911376953125e-07, + "loss": 0.0022, + "reward": 1.785912573337555, + "reward_std": 0.04343899525702, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7859126031398773, + "step": 1711 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.8203125, + "epoch": 0.8359375, + "grad_norm": 1.5278482313965254, + "kl": 0.05859375, + "learning_rate": 7.91015625e-07, + "loss": 0.0023, + "reward": 1.6969901323318481, + "reward_std": 0.05987878702580929, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6969901621341705, + "step": 1712 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.40625, + "epoch": 0.83642578125, + "grad_norm": 2.174348828215157, + "kl": 0.0538330078125, + "learning_rate": 7.908935546874999e-07, + "loss": 0.0022, + "reward": 1.6450940370559692, + "reward_std": 0.13033273071050644, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6529065370559692, + "step": 1713 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.21875, + "epoch": 0.8369140625, + "grad_norm": 2.4062321676761877, + "kl": 0.0526123046875, + "learning_rate": 7.907714843749999e-07, + "loss": 0.0021, + "reward": 1.8309618830680847, + "reward_std": 0.04240616038441658, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8309618830680847, + "step": 1714 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.1171875, + "epoch": 0.83740234375, + "grad_norm": 2.977096488750238, + "kl": 0.05517578125, + "learning_rate": 7.906494140625e-07, + "loss": 0.0022, + "reward": 1.7772547602653503, + "reward_std": 0.07366564497351646, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.777254730463028, + "step": 1715 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.8203125, + "epoch": 0.837890625, + "grad_norm": 1.5557139163461535, + "kl": 0.0499267578125, + "learning_rate": 7.9052734375e-07, + "loss": 0.002, + "reward": 1.7700649499893188, + "reward_std": 0.04422549903392792, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7700649499893188, + "step": 1716 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.1796875, + "epoch": 0.83837890625, + "grad_norm": 2.338925760005317, + "kl": 0.0511474609375, + "learning_rate": 7.904052734375e-07, + "loss": 0.002, + "reward": 1.6442299485206604, + "reward_std": 0.09395516850054264, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6520424783229828, + "step": 1717 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.1328125, + "epoch": 0.8388671875, + "grad_norm": 1.824044711639337, + "kl": 0.0684814453125, + "learning_rate": 7.90283203125e-07, + "loss": 0.0027, + "reward": 1.5608445405960083, + "reward_std": 0.09194111078977585, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5608445107936859, + "step": 1718 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.234375, + "epoch": 0.83935546875, + "grad_norm": 4.462873508127259, + "kl": 0.05078125, + "learning_rate": 7.901611328125e-07, + "loss": 0.002, + "reward": 1.8095470070838928, + "reward_std": 0.05928418226540089, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8095470666885376, + "step": 1719 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.0859375, + "epoch": 0.83984375, + "grad_norm": 1.7361959977725214, + "kl": 0.05419921875, + "learning_rate": 7.900390624999999e-07, + "loss": 0.0022, + "reward": 1.811360478401184, + "reward_std": 0.03823063708841801, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8113605082035065, + "step": 1720 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.4140625, + "epoch": 0.84033203125, + "grad_norm": 2.1880703343070445, + "kl": 0.0506591796875, + "learning_rate": 7.899169921874999e-07, + "loss": 0.002, + "reward": 1.8319576978683472, + "reward_std": 0.08859403431415558, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8397701978683472, + "step": 1721 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.0, + "epoch": 0.8408203125, + "grad_norm": 1.090543206713142, + "kl": 0.0567626953125, + "learning_rate": 7.89794921875e-07, + "loss": 0.0023, + "reward": 1.854802429676056, + "reward_std": 0.045012121088802814, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8548024296760559, + "step": 1722 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.2265625, + "epoch": 0.84130859375, + "grad_norm": 2.292205761747624, + "kl": 0.0625, + "learning_rate": 7.896728515625e-07, + "loss": 0.0025, + "reward": 1.6879829168319702, + "reward_std": 0.08614437095820904, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.687982976436615, + "step": 1723 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.21875, + "epoch": 0.841796875, + "grad_norm": 2.59542146501557, + "kl": 0.058349609375, + "learning_rate": 7.8955078125e-07, + "loss": 0.0023, + "reward": 1.6968461871147156, + "reward_std": 0.041891030967235565, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6968461871147156, + "step": 1724 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.75, + "epoch": 0.84228515625, + "grad_norm": 1.920538563839018, + "kl": 0.0645751953125, + "learning_rate": 7.894287109375e-07, + "loss": 0.0026, + "reward": 1.6629520654678345, + "reward_std": 0.02853654231876135, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6629520356655121, + "step": 1725 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.484375, + "epoch": 0.8427734375, + "grad_norm": 1.6048607833077935, + "kl": 0.047119140625, + "learning_rate": 7.893066406249999e-07, + "loss": 0.0019, + "reward": 1.7042565941810608, + "reward_std": 0.07790570706129074, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7198816537857056, + "step": 1726 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.890625, + "epoch": 0.84326171875, + "grad_norm": 1.3785295989211974, + "kl": 0.06787109375, + "learning_rate": 7.891845703124999e-07, + "loss": 0.0027, + "reward": 1.6507259607315063, + "reward_std": 0.03808063454926014, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6507259607315063, + "step": 1727 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.890625, + "epoch": 0.84375, + "grad_norm": 12.126077178793409, + "kl": 0.0609130859375, + "learning_rate": 7.890625e-07, + "loss": 0.0024, + "reward": 1.749779462814331, + "reward_std": 0.04450598731637001, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.749779462814331, + "step": 1728 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.5859375, + "epoch": 0.84423828125, + "grad_norm": 1.3126743985604528, + "kl": 0.0645751953125, + "learning_rate": 7.889404296875e-07, + "loss": 0.0026, + "reward": 1.726485550403595, + "reward_std": 0.04436471126973629, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.726485550403595, + "step": 1729 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.6953125, + "epoch": 0.8447265625, + "grad_norm": 7.765244544687918, + "kl": 0.0594482421875, + "learning_rate": 7.88818359375e-07, + "loss": 0.0024, + "reward": 1.559519112110138, + "reward_std": 0.044531380757689476, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5595191121101379, + "step": 1730 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.71875, + "epoch": 0.84521484375, + "grad_norm": 1.507983096496928, + "kl": 0.0557861328125, + "learning_rate": 7.886962890625e-07, + "loss": 0.0022, + "reward": 1.634689450263977, + "reward_std": 0.1870577111840248, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.689376950263977, + "step": 1731 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.0234375, + "epoch": 0.845703125, + "grad_norm": 0.6714636709114369, + "kl": 0.05126953125, + "learning_rate": 7.8857421875e-07, + "loss": 0.002, + "reward": 1.9045502543449402, + "reward_std": 0.06615402922034264, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.912362813949585, + "step": 1732 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.84375, + "epoch": 0.84619140625, + "grad_norm": 2.1809232819190214, + "kl": 0.064697265625, + "learning_rate": 7.884521484374999e-07, + "loss": 0.0026, + "reward": 1.6859930753707886, + "reward_std": 0.07783360034227371, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6859930753707886, + "step": 1733 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.8515625, + "epoch": 0.8466796875, + "grad_norm": 1.498211478873343, + "kl": 0.068603515625, + "learning_rate": 7.883300781249999e-07, + "loss": 0.0027, + "reward": 1.773053526878357, + "reward_std": 0.050117356702685356, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7730535268783569, + "step": 1734 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.3359375, + "epoch": 0.84716796875, + "grad_norm": 3.1690030279370487, + "kl": 0.0716552734375, + "learning_rate": 7.882080078125e-07, + "loss": 0.0029, + "reward": 1.632994532585144, + "reward_std": 0.06776593998074532, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6329945027828217, + "step": 1735 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.078125, + "epoch": 0.84765625, + "grad_norm": 3.785740665297623, + "kl": 0.0543212890625, + "learning_rate": 7.880859375e-07, + "loss": 0.0022, + "reward": 1.757651686668396, + "reward_std": 0.07453594170510769, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7654642462730408, + "step": 1736 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.1796875, + "epoch": 0.84814453125, + "grad_norm": 0.7651577403292609, + "kl": 0.0570068359375, + "learning_rate": 7.879638671875e-07, + "loss": 0.0023, + "reward": 1.7877587676048279, + "reward_std": 0.055866248439997435, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7955712676048279, + "step": 1737 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.3203125, + "epoch": 0.8486328125, + "grad_norm": 0.9326597932338185, + "kl": 0.04638671875, + "learning_rate": 7.87841796875e-07, + "loss": 0.0019, + "reward": 1.7322826385498047, + "reward_std": 0.06718971207737923, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7322825789451599, + "step": 1738 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.9296875, + "epoch": 0.84912109375, + "grad_norm": 1.093915850549881, + "kl": 0.051025390625, + "learning_rate": 7.877197265624999e-07, + "loss": 0.002, + "reward": 1.7318394184112549, + "reward_std": 0.057227155193686485, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7318393290042877, + "step": 1739 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.0625, + "epoch": 0.849609375, + "grad_norm": 4.525173371174026, + "kl": 0.077880859375, + "learning_rate": 7.875976562499999e-07, + "loss": 0.0031, + "reward": 1.5092061758041382, + "reward_std": 0.12106321007013321, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5248312056064606, + "step": 1740 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.8046875, + "epoch": 0.85009765625, + "grad_norm": 2.8086114016486823, + "kl": 0.071533203125, + "learning_rate": 7.874755859375e-07, + "loss": 0.0029, + "reward": 1.820200264453888, + "reward_std": 0.0870150737464428, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8202002048492432, + "step": 1741 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.40625, + "epoch": 0.8505859375, + "grad_norm": 3.4822870448370913, + "kl": 0.047607421875, + "learning_rate": 7.87353515625e-07, + "loss": 0.0019, + "reward": 1.8253133296966553, + "reward_std": 0.07836700230836868, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8253132998943329, + "step": 1742 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.7578125, + "epoch": 0.85107421875, + "grad_norm": 2.114293687003822, + "kl": 0.04248046875, + "learning_rate": 7.872314453125e-07, + "loss": 0.0017, + "reward": 1.8304061889648438, + "reward_std": 0.12630556523799896, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8460312485694885, + "step": 1743 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.8671875, + "epoch": 0.8515625, + "grad_norm": 2.6990671836659295, + "kl": 0.05908203125, + "learning_rate": 7.87109375e-07, + "loss": 0.0024, + "reward": 1.7995912432670593, + "reward_std": 0.04738871939480305, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7995912134647369, + "step": 1744 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.3359375, + "epoch": 0.85205078125, + "grad_norm": 1.14016156478151, + "kl": 0.06689453125, + "learning_rate": 7.869873046875e-07, + "loss": 0.0027, + "reward": 1.6955284476280212, + "reward_std": 0.13801120221614838, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7345908880233765, + "step": 1745 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.328125, + "epoch": 0.8525390625, + "grad_norm": 1.7919196097725265, + "kl": 0.0452880859375, + "learning_rate": 7.868652343749999e-07, + "loss": 0.0018, + "reward": 1.7723374962806702, + "reward_std": 0.09062624350190163, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7801499664783478, + "step": 1746 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.28125, + "epoch": 0.85302734375, + "grad_norm": 2.2144660721415805, + "kl": 0.0555419921875, + "learning_rate": 7.867431640624999e-07, + "loss": 0.0022, + "reward": 1.814075231552124, + "reward_std": 0.08035072684288025, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.814075231552124, + "step": 1747 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.34375, + "epoch": 0.853515625, + "grad_norm": 2.5544601480394498, + "kl": 0.0596923828125, + "learning_rate": 7.8662109375e-07, + "loss": 0.0024, + "reward": 1.67475825548172, + "reward_std": 0.09878268092870712, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6747583150863647, + "step": 1748 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.3671875, + "epoch": 0.85400390625, + "grad_norm": 1.2963780193553962, + "kl": 0.0528564453125, + "learning_rate": 7.864990234375e-07, + "loss": 0.0021, + "reward": 1.7922492623329163, + "reward_std": 0.04872659081593156, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7922492027282715, + "step": 1749 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.25, + "epoch": 0.8544921875, + "grad_norm": 2.338549025732979, + "kl": 0.0543212890625, + "learning_rate": 7.86376953125e-07, + "loss": 0.0022, + "reward": 1.772305965423584, + "reward_std": 0.07593853399157524, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.780118465423584, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.546875, + "epoch": 0.85498046875, + "grad_norm": 1.4071606737568558, + "kl": 0.04296875, + "learning_rate": 7.862548828125e-07, + "loss": 0.0017, + "reward": 1.8112922310829163, + "reward_std": 0.13108721747994423, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8425421714782715, + "step": 1751 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.6328125, + "epoch": 0.85546875, + "grad_norm": 3.826688951216383, + "kl": 0.0850830078125, + "learning_rate": 7.861328124999999e-07, + "loss": 0.0034, + "reward": 1.6129182577133179, + "reward_std": 0.13975085318088531, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6285432279109955, + "step": 1752 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.6328125, + "epoch": 0.85595703125, + "grad_norm": 0.9707651176685911, + "kl": 0.0546875, + "learning_rate": 7.860107421874999e-07, + "loss": 0.0022, + "reward": 1.7708771228790283, + "reward_std": 0.022707084193825722, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7708771228790283, + "step": 1753 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.515625, + "epoch": 0.8564453125, + "grad_norm": 1.4575860899322022, + "kl": 0.0499267578125, + "learning_rate": 7.858886718749999e-07, + "loss": 0.002, + "reward": 1.7548741698265076, + "reward_std": 0.12130584567785263, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7704991102218628, + "step": 1754 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.8203125, + "epoch": 0.85693359375, + "grad_norm": 1.7544323097135777, + "kl": 0.0548095703125, + "learning_rate": 7.857666015625e-07, + "loss": 0.0022, + "reward": 1.6898673176765442, + "reward_std": 0.1721840798854828, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7054923176765442, + "step": 1755 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.5234375, + "epoch": 0.857421875, + "grad_norm": 0.8604056131796117, + "kl": 0.050537109375, + "learning_rate": 7.8564453125e-07, + "loss": 0.002, + "reward": 1.7485257983207703, + "reward_std": 0.15745490044355392, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7875882983207703, + "step": 1756 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.8359375, + "epoch": 0.85791015625, + "grad_norm": 1.4416503344043057, + "kl": 0.057861328125, + "learning_rate": 7.855224609375e-07, + "loss": 0.0023, + "reward": 1.7979487776756287, + "reward_std": 0.09817294403910637, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7979487776756287, + "step": 1757 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.09375, + "epoch": 0.8583984375, + "grad_norm": 5.069045278301706, + "kl": 0.0528564453125, + "learning_rate": 7.85400390625e-07, + "loss": 0.0021, + "reward": 1.7911220788955688, + "reward_std": 0.10761953145265579, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8067470788955688, + "step": 1758 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.5, + "epoch": 0.85888671875, + "grad_norm": 1.5478469273798983, + "kl": 0.0439453125, + "learning_rate": 7.852783203124999e-07, + "loss": 0.0018, + "reward": 1.7176623344421387, + "reward_std": 0.11607952415943146, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7645373642444611, + "step": 1759 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.921875, + "epoch": 0.859375, + "grad_norm": 0.8812677172192789, + "kl": 0.056640625, + "learning_rate": 7.851562499999999e-07, + "loss": 0.0023, + "reward": 1.5803175568580627, + "reward_std": 0.12807496264576912, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6115675568580627, + "step": 1760 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.8515625, + "epoch": 0.85986328125, + "grad_norm": 3.5600117966088054, + "kl": 0.0543212890625, + "learning_rate": 7.850341796875e-07, + "loss": 0.0022, + "reward": 1.7184030413627625, + "reward_std": 0.07983948290348053, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7262155115604401, + "step": 1761 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.03125, + "epoch": 0.8603515625, + "grad_norm": 9.189097579338886, + "kl": 0.055419921875, + "learning_rate": 7.84912109375e-07, + "loss": 0.0022, + "reward": 1.691203534603119, + "reward_std": 0.07379813119769096, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6912035048007965, + "step": 1762 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.8828125, + "epoch": 0.86083984375, + "grad_norm": 1.6292809358924043, + "kl": 0.062255859375, + "learning_rate": 7.847900390625e-07, + "loss": 0.0025, + "reward": 1.616748571395874, + "reward_std": 0.07066140696406364, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6245611011981964, + "step": 1763 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.6328125, + "epoch": 0.861328125, + "grad_norm": 8.127274020590354, + "kl": 0.08251953125, + "learning_rate": 7.8466796875e-07, + "loss": 0.0033, + "reward": 1.6578654646873474, + "reward_std": 0.11017253622412682, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6578654944896698, + "step": 1764 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.4453125, + "epoch": 0.86181640625, + "grad_norm": 1.8627529528124558, + "kl": 0.05224609375, + "learning_rate": 7.845458984374999e-07, + "loss": 0.0021, + "reward": 1.793116271495819, + "reward_std": 0.10687560588121414, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8009287714958191, + "step": 1765 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.15625, + "epoch": 0.8623046875, + "grad_norm": 5.016437160343661, + "kl": 0.05029296875, + "learning_rate": 7.844238281249999e-07, + "loss": 0.002, + "reward": 1.758280873298645, + "reward_std": 0.04120416380465031, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.758280873298645, + "step": 1766 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.0859375, + "epoch": 0.86279296875, + "grad_norm": 2.458305708230358, + "kl": 0.0460205078125, + "learning_rate": 7.843017578124999e-07, + "loss": 0.0018, + "reward": 1.6915509700775146, + "reward_std": 0.14069624990224838, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7071759104728699, + "step": 1767 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.53125, + "epoch": 0.86328125, + "grad_norm": 1.4127014544761989, + "kl": 0.067138671875, + "learning_rate": 7.841796875e-07, + "loss": 0.0027, + "reward": 1.727443516254425, + "reward_std": 0.09978067316114902, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.735256016254425, + "step": 1768 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.046875, + "epoch": 0.86376953125, + "grad_norm": 1.3815442374612499, + "kl": 0.0565185546875, + "learning_rate": 7.840576171875e-07, + "loss": 0.0023, + "reward": 1.7928959131240845, + "reward_std": 0.08073288947343826, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7928958535194397, + "step": 1769 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.1953125, + "epoch": 0.8642578125, + "grad_norm": 4.466680027866366, + "kl": 0.054931640625, + "learning_rate": 7.83935546875e-07, + "loss": 0.0022, + "reward": 1.7177514433860779, + "reward_std": 0.08392149582505226, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7177514135837555, + "step": 1770 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.921875, + "epoch": 0.86474609375, + "grad_norm": 1.1403297589321426, + "kl": 0.0496826171875, + "learning_rate": 7.838134765625e-07, + "loss": 0.002, + "reward": 1.70395165681839, + "reward_std": 0.04686661344021559, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7039515972137451, + "step": 1771 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.1953125, + "epoch": 0.865234375, + "grad_norm": 10.706867765852694, + "kl": 0.05419921875, + "learning_rate": 7.836914062499999e-07, + "loss": 0.0022, + "reward": 1.7995309829711914, + "reward_std": 0.03605970740318298, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.799530953168869, + "step": 1772 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.9609375, + "epoch": 0.86572265625, + "grad_norm": 3.3271446204369073, + "kl": 0.051025390625, + "learning_rate": 7.835693359374999e-07, + "loss": 0.002, + "reward": 1.6900931596755981, + "reward_std": 0.18629964627325535, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7369681894779205, + "step": 1773 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.296875, + "epoch": 0.8662109375, + "grad_norm": 1.834654441326854, + "kl": 0.053466796875, + "learning_rate": 7.83447265625e-07, + "loss": 0.0021, + "reward": 1.714508295059204, + "reward_std": 0.11370455846190453, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7613833248615265, + "step": 1774 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.765625, + "epoch": 0.86669921875, + "grad_norm": 1.879783754964238, + "kl": 0.05615234375, + "learning_rate": 7.833251953125e-07, + "loss": 0.0022, + "reward": 1.630328118801117, + "reward_std": 0.08714995346963406, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6381406188011169, + "step": 1775 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.265625, + "epoch": 0.8671875, + "grad_norm": 2.054382936637815, + "kl": 0.0517578125, + "learning_rate": 7.83203125e-07, + "loss": 0.0021, + "reward": 1.6842593550682068, + "reward_std": 0.028140094596892595, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6842593252658844, + "step": 1776 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.4609375, + "epoch": 0.86767578125, + "grad_norm": 1.6929850993037547, + "kl": 0.049560546875, + "learning_rate": 7.830810546875e-07, + "loss": 0.002, + "reward": 1.776337742805481, + "reward_std": 0.04738312214612961, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.776337742805481, + "step": 1777 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.15625, + "epoch": 0.8681640625, + "grad_norm": 1.944714939624994, + "kl": 0.046630859375, + "learning_rate": 7.829589843749999e-07, + "loss": 0.0019, + "reward": 1.8062950372695923, + "reward_std": 0.0485474169254303, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8062950074672699, + "step": 1778 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.921875, + "epoch": 0.86865234375, + "grad_norm": 2.0500856307717155, + "kl": 0.0758056640625, + "learning_rate": 7.828369140624999e-07, + "loss": 0.003, + "reward": 1.7174754738807678, + "reward_std": 0.05034205690026283, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7174754738807678, + "step": 1779 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.578125, + "epoch": 0.869140625, + "grad_norm": 1.8640037636723337, + "kl": 0.049560546875, + "learning_rate": 7.827148437499999e-07, + "loss": 0.002, + "reward": 1.6384202241897583, + "reward_std": 0.10762511938810349, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6462327837944031, + "step": 1780 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.34375, + "epoch": 0.86962890625, + "grad_norm": 3.8282446965440684, + "kl": 0.052978515625, + "learning_rate": 7.825927734375e-07, + "loss": 0.0021, + "reward": 1.77534019947052, + "reward_std": 0.057393044233322144, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7753402590751648, + "step": 1781 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.5390625, + "epoch": 0.8701171875, + "grad_norm": 1.8964741899587358, + "kl": 0.0552978515625, + "learning_rate": 7.82470703125e-07, + "loss": 0.0022, + "reward": 1.774406909942627, + "reward_std": 0.1195422075688839, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7744069397449493, + "step": 1782 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.875, + "epoch": 0.87060546875, + "grad_norm": 1.4724984337179923, + "kl": 0.06591796875, + "learning_rate": 7.823486328125e-07, + "loss": 0.0026, + "reward": 1.695317268371582, + "reward_std": 0.09004146233201027, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7031297087669373, + "step": 1783 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.9609375, + "epoch": 0.87109375, + "grad_norm": 0.6448629109262315, + "kl": 0.04736328125, + "learning_rate": 7.822265625e-07, + "loss": 0.0019, + "reward": 1.9076035022735596, + "reward_std": 0.01943269930779934, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.907603532075882, + "step": 1784 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.0234375, + "epoch": 0.87158203125, + "grad_norm": 3.027195875222421, + "kl": 0.0672607421875, + "learning_rate": 7.821044921874999e-07, + "loss": 0.0027, + "reward": 1.689346194267273, + "reward_std": 0.08278231136500835, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6893462538719177, + "step": 1785 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.359375, + "epoch": 0.8720703125, + "grad_norm": 1.1478726616336268, + "kl": 0.0518798828125, + "learning_rate": 7.819824218749999e-07, + "loss": 0.0021, + "reward": 1.7398544549942017, + "reward_std": 0.08145036175847054, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7789169549942017, + "step": 1786 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.3671875, + "epoch": 0.87255859375, + "grad_norm": 11.542962874754854, + "kl": 0.087890625, + "learning_rate": 7.818603515625e-07, + "loss": 0.0035, + "reward": 1.6417620182037354, + "reward_std": 0.12737858295440674, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6573870182037354, + "step": 1787 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.75, + "epoch": 0.873046875, + "grad_norm": 2.404778691827622, + "kl": 0.05078125, + "learning_rate": 7.8173828125e-07, + "loss": 0.002, + "reward": 1.8236736059188843, + "reward_std": 0.03153271973133087, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8236735463142395, + "step": 1788 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.8984375, + "epoch": 0.87353515625, + "grad_norm": 2.4749194133053596, + "kl": 0.0501708984375, + "learning_rate": 7.816162109375e-07, + "loss": 0.002, + "reward": 1.7504101991653442, + "reward_std": 0.08443843200802803, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7582226991653442, + "step": 1789 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.7421875, + "epoch": 0.8740234375, + "grad_norm": 2.6927007447476257, + "kl": 0.0517578125, + "learning_rate": 7.81494140625e-07, + "loss": 0.0021, + "reward": 1.6656638979911804, + "reward_std": 0.12951365113258362, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6734763383865356, + "step": 1790 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.71875, + "epoch": 0.87451171875, + "grad_norm": 3.271122934848648, + "kl": 0.047607421875, + "learning_rate": 7.813720703125e-07, + "loss": 0.0019, + "reward": 1.746010661125183, + "reward_std": 0.09499474987387657, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7460106015205383, + "step": 1791 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.2421875, + "epoch": 0.875, + "grad_norm": 1.9497103555365891, + "kl": 0.076171875, + "learning_rate": 7.812499999999999e-07, + "loss": 0.003, + "reward": 1.683960497379303, + "reward_std": 0.07071896642446518, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6839604675769806, + "step": 1792 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.1484375, + "epoch": 0.87548828125, + "grad_norm": 4.006011455889697, + "kl": 0.06005859375, + "learning_rate": 7.811279296874999e-07, + "loss": 0.0024, + "reward": 1.8330675959587097, + "reward_std": 0.023156346287578344, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8330676555633545, + "step": 1793 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.7265625, + "epoch": 0.8759765625, + "grad_norm": 25.852237909125545, + "kl": 0.062744140625, + "learning_rate": 7.81005859375e-07, + "loss": 0.0025, + "reward": 1.7323620319366455, + "reward_std": 0.049556052312254906, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7323620617389679, + "step": 1794 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.765625, + "epoch": 0.87646484375, + "grad_norm": 1.5076425145126342, + "kl": 0.0516357421875, + "learning_rate": 7.808837890625e-07, + "loss": 0.0021, + "reward": 1.819112241268158, + "reward_std": 0.0596193540841341, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8191123008728027, + "step": 1795 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.5703125, + "epoch": 0.876953125, + "grad_norm": 3.047457685966905, + "kl": 0.054443359375, + "learning_rate": 7.8076171875e-07, + "loss": 0.0022, + "reward": 1.673986792564392, + "reward_std": 0.09043450467288494, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6739867627620697, + "step": 1796 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.609375, + "epoch": 0.87744140625, + "grad_norm": 1.1587782691548503, + "kl": 0.0494384765625, + "learning_rate": 7.806396484375e-07, + "loss": 0.002, + "reward": 1.7295081615447998, + "reward_std": 0.06940071284770966, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7295081615447998, + "step": 1797 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.0703125, + "epoch": 0.8779296875, + "grad_norm": 4.452290043361228, + "kl": 0.0526123046875, + "learning_rate": 7.805175781249999e-07, + "loss": 0.0021, + "reward": 1.8323208689689636, + "reward_std": 0.05967606604099274, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8323208391666412, + "step": 1798 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.703125, + "epoch": 0.87841796875, + "grad_norm": 25.202792928468167, + "kl": 0.05419921875, + "learning_rate": 7.803955078124999e-07, + "loss": 0.0022, + "reward": 1.7258835434913635, + "reward_std": 0.09938307851552963, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7258834838867188, + "step": 1799 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.296875, + "epoch": 0.87890625, + "grad_norm": 0.6818241842690064, + "kl": 0.043701171875, + "learning_rate": 7.802734375e-07, + "loss": 0.0017, + "reward": 1.7057358026504517, + "reward_std": 0.05737200379371643, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7057357132434845, + "step": 1800 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.0, + "epoch": 0.87939453125, + "grad_norm": 3.693729462628242, + "kl": 0.062744140625, + "learning_rate": 7.801513671875e-07, + "loss": 0.0025, + "reward": 1.7509536743164062, + "reward_std": 0.04812243953347206, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7509536445140839, + "step": 1801 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.8984375, + "epoch": 0.8798828125, + "grad_norm": 1.9779481675448194, + "kl": 0.042724609375, + "learning_rate": 7.80029296875e-07, + "loss": 0.0017, + "reward": 1.8703011870384216, + "reward_std": 0.03746516443789005, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8703011870384216, + "step": 1802 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.8984375, + "epoch": 0.88037109375, + "grad_norm": 2.2905824394928884, + "kl": 0.059326171875, + "learning_rate": 7.799072265625e-07, + "loss": 0.0024, + "reward": 1.8006829619407654, + "reward_std": 0.0814764704555273, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8006830215454102, + "step": 1803 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.2265625, + "epoch": 0.880859375, + "grad_norm": 1.6573366743968407, + "kl": 0.0570068359375, + "learning_rate": 7.7978515625e-07, + "loss": 0.0023, + "reward": 1.6909406185150146, + "reward_std": 0.0707071777433157, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.690940648317337, + "step": 1804 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.0625, + "epoch": 0.88134765625, + "grad_norm": 0.7192785248577067, + "kl": 0.051513671875, + "learning_rate": 7.796630859374999e-07, + "loss": 0.0021, + "reward": 1.6431750655174255, + "reward_std": 0.06791674718260765, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6431750655174255, + "step": 1805 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.5703125, + "epoch": 0.8818359375, + "grad_norm": 0.9208200253080467, + "kl": 0.058349609375, + "learning_rate": 7.795410156249999e-07, + "loss": 0.0023, + "reward": 1.709853172302246, + "reward_std": 0.11433164775371552, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7098531723022461, + "step": 1806 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.265625, + "epoch": 0.88232421875, + "grad_norm": 2.164142841901239, + "kl": 0.0660400390625, + "learning_rate": 7.794189453125e-07, + "loss": 0.0026, + "reward": 1.6406881213188171, + "reward_std": 0.11811601743102074, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6406880915164948, + "step": 1807 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.09375, + "epoch": 0.8828125, + "grad_norm": 15.416769114837617, + "kl": 0.072509765625, + "learning_rate": 7.79296875e-07, + "loss": 0.0029, + "reward": 1.8091920614242554, + "reward_std": 0.032884467393159866, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.809192031621933, + "step": 1808 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.3046875, + "epoch": 0.88330078125, + "grad_norm": 2.391571899624504, + "kl": 0.0584716796875, + "learning_rate": 7.791748046875e-07, + "loss": 0.0023, + "reward": 1.8211405277252197, + "reward_std": 0.05889258533716202, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.821140468120575, + "step": 1809 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.2421875, + "epoch": 0.8837890625, + "grad_norm": 1.7474116859623878, + "kl": 0.0506591796875, + "learning_rate": 7.79052734375e-07, + "loss": 0.002, + "reward": 1.7022829055786133, + "reward_std": 0.04944469407200813, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7022829353809357, + "step": 1810 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.1875, + "epoch": 0.88427734375, + "grad_norm": 5.108476758820766, + "kl": 0.0616455078125, + "learning_rate": 7.789306640624999e-07, + "loss": 0.0025, + "reward": 1.65779048204422, + "reward_std": 0.11469753831624985, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.65779048204422, + "step": 1811 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.09375, + "epoch": 0.884765625, + "grad_norm": 1.4333680344113544, + "kl": 0.0460205078125, + "learning_rate": 7.788085937499999e-07, + "loss": 0.0018, + "reward": 1.8354427814483643, + "reward_std": 0.10553473606705666, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8354427516460419, + "step": 1812 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.796875, + "epoch": 0.88525390625, + "grad_norm": 1.5118729896701895, + "kl": 0.0584716796875, + "learning_rate": 7.786865234375e-07, + "loss": 0.0023, + "reward": 1.817187786102295, + "reward_std": 0.08914723992347717, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8171877861022949, + "step": 1813 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.328125, + "epoch": 0.8857421875, + "grad_norm": 6.425239274241706, + "kl": 0.060791015625, + "learning_rate": 7.78564453125e-07, + "loss": 0.0024, + "reward": 1.7643995881080627, + "reward_std": 0.0862666517496109, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7643995881080627, + "step": 1814 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.5390625, + "epoch": 0.88623046875, + "grad_norm": 6.990205543539001, + "kl": 0.07421875, + "learning_rate": 7.784423828125e-07, + "loss": 0.003, + "reward": 1.6783007383346558, + "reward_std": 0.08350778743624687, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6783007085323334, + "step": 1815 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.078125, + "epoch": 0.88671875, + "grad_norm": 3.51961258176851, + "kl": 0.0556640625, + "learning_rate": 7.783203125e-07, + "loss": 0.0022, + "reward": 1.7718433737754822, + "reward_std": 0.054395925253629684, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7718433439731598, + "step": 1816 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.296875, + "epoch": 0.88720703125, + "grad_norm": 2.1702787043708143, + "kl": 0.0628662109375, + "learning_rate": 7.781982421875e-07, + "loss": 0.0025, + "reward": 1.8164880275726318, + "reward_std": 0.0386070990934968, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.816488116979599, + "step": 1817 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.2734375, + "epoch": 0.8876953125, + "grad_norm": 1.594469695809148, + "kl": 0.056640625, + "learning_rate": 7.780761718749999e-07, + "loss": 0.0023, + "reward": 1.6637941598892212, + "reward_std": 0.04839322529733181, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6637941598892212, + "step": 1818 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.59375, + "epoch": 0.88818359375, + "grad_norm": 1.0269338480997001, + "kl": 0.060302734375, + "learning_rate": 7.779541015624999e-07, + "loss": 0.0024, + "reward": 1.8385123014450073, + "reward_std": 0.044711560010910034, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8385123014450073, + "step": 1819 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.6875, + "epoch": 0.888671875, + "grad_norm": 4.6582457557107615, + "kl": 0.0445556640625, + "learning_rate": 7.7783203125e-07, + "loss": 0.0018, + "reward": 1.7252464294433594, + "reward_std": 0.08425504341721535, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.733058899641037, + "step": 1820 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.6640625, + "epoch": 0.88916015625, + "grad_norm": 4.495257505799833, + "kl": 0.0631103515625, + "learning_rate": 7.777099609375e-07, + "loss": 0.0025, + "reward": 1.815511703491211, + "reward_std": 0.05697597935795784, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8155117332935333, + "step": 1821 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.21875, + "epoch": 0.8896484375, + "grad_norm": 1.5187517576726908, + "kl": 0.07177734375, + "learning_rate": 7.77587890625e-07, + "loss": 0.0029, + "reward": 1.7230549454689026, + "reward_std": 0.03447245853021741, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7230549454689026, + "step": 1822 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.4140625, + "epoch": 0.89013671875, + "grad_norm": 1.5792144097013674, + "kl": 0.0489501953125, + "learning_rate": 7.774658203125e-07, + "loss": 0.002, + "reward": 1.6508527398109436, + "reward_std": 0.12545301765203476, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6586652100086212, + "step": 1823 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.2265625, + "epoch": 0.890625, + "grad_norm": 1.7338004675325442, + "kl": 0.0616455078125, + "learning_rate": 7.773437499999999e-07, + "loss": 0.0025, + "reward": 1.6528041362762451, + "reward_std": 0.03595791570842266, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6528041362762451, + "step": 1824 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.875, + "epoch": 0.89111328125, + "grad_norm": 1.3828966259193087, + "kl": 0.0540771484375, + "learning_rate": 7.772216796874999e-07, + "loss": 0.0022, + "reward": 1.6901865601539612, + "reward_std": 0.06458355858922005, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6979990303516388, + "step": 1825 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.0390625, + "epoch": 0.8916015625, + "grad_norm": 4.53384888949427, + "kl": 0.0604248046875, + "learning_rate": 7.77099609375e-07, + "loss": 0.0024, + "reward": 1.7263333797454834, + "reward_std": 0.06643800996243954, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7341458201408386, + "step": 1826 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.4140625, + "epoch": 0.89208984375, + "grad_norm": 1.7484243028024995, + "kl": 0.0562744140625, + "learning_rate": 7.769775390625e-07, + "loss": 0.0022, + "reward": 1.7096668481826782, + "reward_std": 0.1043664738535881, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7096668183803558, + "step": 1827 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.5859375, + "epoch": 0.892578125, + "grad_norm": 4.9974540266792, + "kl": 0.0611572265625, + "learning_rate": 7.7685546875e-07, + "loss": 0.0025, + "reward": 1.6894102096557617, + "reward_std": 0.0981958694756031, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6894101500511169, + "step": 1828 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.0703125, + "epoch": 0.89306640625, + "grad_norm": 3.5144777679347463, + "kl": 0.060546875, + "learning_rate": 7.767333984375e-07, + "loss": 0.0024, + "reward": 1.7617112398147583, + "reward_std": 0.09093910502269864, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7695237696170807, + "step": 1829 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.875, + "epoch": 0.8935546875, + "grad_norm": 1.5653767539180587, + "kl": 0.040771484375, + "learning_rate": 7.76611328125e-07, + "loss": 0.0016, + "reward": 1.8352625370025635, + "reward_std": 0.09809044748544693, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8430750966072083, + "step": 1830 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.125, + "epoch": 0.89404296875, + "grad_norm": 1.9658571112390546, + "kl": 0.0611572265625, + "learning_rate": 7.764892578124999e-07, + "loss": 0.0024, + "reward": 1.862768530845642, + "reward_std": 0.025783130899071693, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8627684712409973, + "step": 1831 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.453125, + "epoch": 0.89453125, + "grad_norm": 3.184036254441203, + "kl": 0.0498046875, + "learning_rate": 7.763671874999999e-07, + "loss": 0.002, + "reward": 1.756974220275879, + "reward_std": 0.04832346737384796, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7569742202758789, + "step": 1832 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.0390625, + "epoch": 0.89501953125, + "grad_norm": 1.7521194443833326, + "kl": 0.0618896484375, + "learning_rate": 7.762451171875e-07, + "loss": 0.0025, + "reward": 1.7110464572906494, + "reward_std": 0.07836447097361088, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7110464870929718, + "step": 1833 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.0859375, + "epoch": 0.8955078125, + "grad_norm": 11.051729675768927, + "kl": 0.0628662109375, + "learning_rate": 7.76123046875e-07, + "loss": 0.0025, + "reward": 1.6928837299346924, + "reward_std": 0.09242498874664307, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6928837299346924, + "step": 1834 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.4765625, + "epoch": 0.89599609375, + "grad_norm": 1.9611838635137748, + "kl": 0.0665283203125, + "learning_rate": 7.760009765625e-07, + "loss": 0.0027, + "reward": 1.7181519269943237, + "reward_std": 0.08656962960958481, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7181519567966461, + "step": 1835 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.765625, + "epoch": 0.896484375, + "grad_norm": 1.9097853521680372, + "kl": 0.063232421875, + "learning_rate": 7.7587890625e-07, + "loss": 0.0025, + "reward": 1.6719039678573608, + "reward_std": 0.0817815288901329, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6719039082527161, + "step": 1836 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.1484375, + "epoch": 0.89697265625, + "grad_norm": 2.040065547317354, + "kl": 0.054443359375, + "learning_rate": 7.757568359374999e-07, + "loss": 0.0022, + "reward": 1.8175336122512817, + "reward_std": 0.092707434669137, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.825346052646637, + "step": 1837 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.5703125, + "epoch": 0.8974609375, + "grad_norm": 2.114483844672761, + "kl": 0.06591796875, + "learning_rate": 7.756347656249999e-07, + "loss": 0.0026, + "reward": 1.7319183945655823, + "reward_std": 0.047073543071746826, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7319183647632599, + "step": 1838 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.71875, + "epoch": 0.89794921875, + "grad_norm": 0.7428215101894872, + "kl": 0.0401611328125, + "learning_rate": 7.755126953125e-07, + "loss": 0.0016, + "reward": 1.7587011456489563, + "reward_std": 0.026800723746418953, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7587011754512787, + "step": 1839 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.921875, + "epoch": 0.8984375, + "grad_norm": 0.7629811197679781, + "kl": 0.0484619140625, + "learning_rate": 7.75390625e-07, + "loss": 0.0019, + "reward": 1.6852461099624634, + "reward_std": 0.08999980986118317, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6930586099624634, + "step": 1840 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.3984375, + "epoch": 0.89892578125, + "grad_norm": 1.5555072595949755, + "kl": 0.0543212890625, + "learning_rate": 7.752685546875e-07, + "loss": 0.0022, + "reward": 1.7511460781097412, + "reward_std": 0.06476838141679764, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7511460781097412, + "step": 1841 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.75, + "epoch": 0.8994140625, + "grad_norm": 2.754364916244924, + "kl": 0.0587158203125, + "learning_rate": 7.75146484375e-07, + "loss": 0.0024, + "reward": 1.8262133598327637, + "reward_std": 0.02120867930352688, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8262133896350861, + "step": 1842 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.7421875, + "epoch": 0.89990234375, + "grad_norm": 24.641127364889815, + "kl": 0.0570068359375, + "learning_rate": 7.750244140625e-07, + "loss": 0.0023, + "reward": 1.6971803903579712, + "reward_std": 0.05197112262248993, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6971803903579712, + "step": 1843 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.484375, + "epoch": 0.900390625, + "grad_norm": 2.1238794464311006, + "kl": 0.06982421875, + "learning_rate": 7.749023437499999e-07, + "loss": 0.0028, + "reward": 1.6669594049453735, + "reward_std": 0.04614550992846489, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6669594645500183, + "step": 1844 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.78125, + "epoch": 0.90087890625, + "grad_norm": 2.071912757412851, + "kl": 0.0523681640625, + "learning_rate": 7.747802734374999e-07, + "loss": 0.0021, + "reward": 1.6606204509735107, + "reward_std": 0.08798486739397049, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6684330701828003, + "step": 1845 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.546875, + "epoch": 0.9013671875, + "grad_norm": 3.3768722950453633, + "kl": 0.050048828125, + "learning_rate": 7.74658203125e-07, + "loss": 0.002, + "reward": 1.7388845682144165, + "reward_std": 0.05811982438899577, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7545095980167389, + "step": 1846 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.4609375, + "epoch": 0.90185546875, + "grad_norm": 3.1836866801117893, + "kl": 0.0430908203125, + "learning_rate": 7.745361328125e-07, + "loss": 0.0017, + "reward": 1.8226521015167236, + "reward_std": 0.04751377273350954, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8226520419120789, + "step": 1847 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.1484375, + "epoch": 0.90234375, + "grad_norm": 1.576033496726682, + "kl": 0.0771484375, + "learning_rate": 7.744140625e-07, + "loss": 0.0031, + "reward": 1.7408050298690796, + "reward_std": 0.17640165239572525, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7720550298690796, + "step": 1848 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.8671875, + "epoch": 0.90283203125, + "grad_norm": 1.7373002567871683, + "kl": 0.0501708984375, + "learning_rate": 7.742919921875e-07, + "loss": 0.002, + "reward": 1.8163398504257202, + "reward_std": 0.04968139063566923, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8163398206233978, + "step": 1849 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.84375, + "epoch": 0.9033203125, + "grad_norm": 3.105106822155688, + "kl": 0.0472412109375, + "learning_rate": 7.741699218749999e-07, + "loss": 0.0019, + "reward": 1.8374771475791931, + "reward_std": 0.09784207679331303, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8452896475791931, + "step": 1850 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.21875, + "epoch": 0.90380859375, + "grad_norm": 1.089428365307123, + "kl": 0.0489501953125, + "learning_rate": 7.740478515624999e-07, + "loss": 0.002, + "reward": 1.68122398853302, + "reward_std": 0.09672827832400799, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6968489587306976, + "step": 1851 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.0546875, + "epoch": 0.904296875, + "grad_norm": 0.9032820450625452, + "kl": 0.0506591796875, + "learning_rate": 7.739257812499999e-07, + "loss": 0.002, + "reward": 1.7232590913772583, + "reward_std": 0.0855883564800024, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7310715913772583, + "step": 1852 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.8203125, + "epoch": 0.90478515625, + "grad_norm": 2.910697140965429, + "kl": 0.0596923828125, + "learning_rate": 7.738037109375e-07, + "loss": 0.0024, + "reward": 1.718904733657837, + "reward_std": 0.05999594181776047, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7189047038555145, + "step": 1853 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.71875, + "epoch": 0.9052734375, + "grad_norm": 2.3512710493981044, + "kl": 0.062744140625, + "learning_rate": 7.73681640625e-07, + "loss": 0.0025, + "reward": 1.8199704885482788, + "reward_std": 0.17502456158399582, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8199705183506012, + "step": 1854 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.9453125, + "epoch": 0.90576171875, + "grad_norm": 1.7378278510340661, + "kl": 0.055419921875, + "learning_rate": 7.735595703125e-07, + "loss": 0.0022, + "reward": 1.7747780680656433, + "reward_std": 0.08231132477521896, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7747780382633209, + "step": 1855 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.96875, + "epoch": 0.90625, + "grad_norm": 1.3484996843881978, + "kl": 0.06640625, + "learning_rate": 7.734375e-07, + "loss": 0.0027, + "reward": 1.7257348895072937, + "reward_std": 0.08233419992029667, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7335473895072937, + "step": 1856 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.859375, + "epoch": 0.90673828125, + "grad_norm": 1.3532753106816202, + "kl": 0.0523681640625, + "learning_rate": 7.733154296874999e-07, + "loss": 0.0021, + "reward": 1.6416913270950317, + "reward_std": 0.11033252347260714, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6495038270950317, + "step": 1857 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.0625, + "epoch": 0.9072265625, + "grad_norm": 2.4747324412246208, + "kl": 0.06201171875, + "learning_rate": 7.731933593749999e-07, + "loss": 0.0025, + "reward": 1.686651587486267, + "reward_std": 0.11174037307500839, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6944640278816223, + "step": 1858 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.75, + "epoch": 0.90771484375, + "grad_norm": 2.5596722124199562, + "kl": 0.0435791015625, + "learning_rate": 7.730712890625e-07, + "loss": 0.0017, + "reward": 1.7805684804916382, + "reward_std": 0.0784122459590435, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7883809506893158, + "step": 1859 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.2109375, + "epoch": 0.908203125, + "grad_norm": 1.9096008823123074, + "kl": 0.0513916015625, + "learning_rate": 7.7294921875e-07, + "loss": 0.0021, + "reward": 1.645488977432251, + "reward_std": 0.07388130389153957, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6533015072345734, + "step": 1860 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.8046875, + "epoch": 0.90869140625, + "grad_norm": 1.9563618836244545, + "kl": 0.057861328125, + "learning_rate": 7.728271484375e-07, + "loss": 0.0023, + "reward": 1.600885808467865, + "reward_std": 0.12279289960861206, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6086983382701874, + "step": 1861 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.578125, + "epoch": 0.9091796875, + "grad_norm": 2.251229228911326, + "kl": 0.06201171875, + "learning_rate": 7.72705078125e-07, + "loss": 0.0025, + "reward": 1.6833316087722778, + "reward_std": 0.09088350087404251, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6833316385746002, + "step": 1862 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.953125, + "epoch": 0.90966796875, + "grad_norm": 4.494937362372943, + "kl": 0.0693359375, + "learning_rate": 7.725830078124999e-07, + "loss": 0.0028, + "reward": 1.7052226066589355, + "reward_std": 0.0832928977906704, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7052225768566132, + "step": 1863 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.203125, + "epoch": 0.91015625, + "grad_norm": 0.8112151237513328, + "kl": 0.061279296875, + "learning_rate": 7.724609374999999e-07, + "loss": 0.0025, + "reward": 1.7318594455718994, + "reward_std": 0.03134281374514103, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.731859415769577, + "step": 1864 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.4453125, + "epoch": 0.91064453125, + "grad_norm": 2.793216129592739, + "kl": 0.068603515625, + "learning_rate": 7.723388671874999e-07, + "loss": 0.0027, + "reward": 1.750407099723816, + "reward_std": 0.12610271200537682, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7582195699214935, + "step": 1865 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.59375, + "epoch": 0.9111328125, + "grad_norm": 1.4003090946656476, + "kl": 0.0469970703125, + "learning_rate": 7.72216796875e-07, + "loss": 0.0019, + "reward": 1.7669113874435425, + "reward_std": 0.05195538140833378, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7669114768505096, + "step": 1866 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.6953125, + "epoch": 0.91162109375, + "grad_norm": 12.65215383630023, + "kl": 0.055908203125, + "learning_rate": 7.720947265625e-07, + "loss": 0.0022, + "reward": 1.841326653957367, + "reward_std": 0.046704126521945, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8413266539573669, + "step": 1867 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.6484375, + "epoch": 0.912109375, + "grad_norm": 1.6673497617014856, + "kl": 0.0653076171875, + "learning_rate": 7.7197265625e-07, + "loss": 0.0026, + "reward": 1.7550670504570007, + "reward_std": 0.08225375413894653, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7628795802593231, + "step": 1868 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.6328125, + "epoch": 0.91259765625, + "grad_norm": 1.5372288043835853, + "kl": 0.0531005859375, + "learning_rate": 7.718505859375e-07, + "loss": 0.0021, + "reward": 1.7609045505523682, + "reward_std": 0.03866549767553806, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.760904461145401, + "step": 1869 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.96875, + "epoch": 0.9130859375, + "grad_norm": 3.1166984800175563, + "kl": 0.06201171875, + "learning_rate": 7.717285156249999e-07, + "loss": 0.0025, + "reward": 1.7748718857765198, + "reward_std": 0.04408053681254387, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7748719453811646, + "step": 1870 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.65625, + "epoch": 0.91357421875, + "grad_norm": 1.146220052210111, + "kl": 0.059814453125, + "learning_rate": 7.716064453124999e-07, + "loss": 0.0024, + "reward": 1.8123140931129456, + "reward_std": 0.06602787971496582, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8123140633106232, + "step": 1871 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.8359375, + "epoch": 0.9140625, + "grad_norm": 2.7210699250281505, + "kl": 0.0550537109375, + "learning_rate": 7.71484375e-07, + "loss": 0.0022, + "reward": 1.7665232419967651, + "reward_std": 0.012389869894832373, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7665232121944427, + "step": 1872 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.6953125, + "epoch": 0.91455078125, + "grad_norm": 1.827791651361238, + "kl": 0.05517578125, + "learning_rate": 7.713623046875e-07, + "loss": 0.0022, + "reward": 1.8029692769050598, + "reward_std": 0.07247792184352875, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8029692471027374, + "step": 1873 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.203125, + "epoch": 0.9150390625, + "grad_norm": 1.5789037304261104, + "kl": 0.0679931640625, + "learning_rate": 7.71240234375e-07, + "loss": 0.0027, + "reward": 1.7826859951019287, + "reward_std": 0.07684960961341858, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7826859951019287, + "step": 1874 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.1875, + "epoch": 0.91552734375, + "grad_norm": 3.343706764989121, + "kl": 0.066650390625, + "learning_rate": 7.711181640625e-07, + "loss": 0.0027, + "reward": 1.7790513634681702, + "reward_std": 0.032321374863386154, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7790513634681702, + "step": 1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.9609375, + "epoch": 0.916015625, + "grad_norm": 2.533953944850406, + "kl": 0.048095703125, + "learning_rate": 7.709960937499999e-07, + "loss": 0.0019, + "reward": 1.7507587671279907, + "reward_std": 0.06509637832641602, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7507588565349579, + "step": 1876 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.703125, + "epoch": 0.91650390625, + "grad_norm": 0.7586952119724258, + "kl": 0.046875, + "learning_rate": 7.708740234374999e-07, + "loss": 0.0019, + "reward": 1.7596052885055542, + "reward_std": 0.12552650086581707, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7830427885055542, + "step": 1877 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.7890625, + "epoch": 0.9169921875, + "grad_norm": 1.6249630598487124, + "kl": 0.0576171875, + "learning_rate": 7.707519531249999e-07, + "loss": 0.0023, + "reward": 1.6994884610176086, + "reward_std": 0.03150587156414986, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6994884312152863, + "step": 1878 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.3671875, + "epoch": 0.91748046875, + "grad_norm": 2.026286135731339, + "kl": 0.0614013671875, + "learning_rate": 7.706298828125e-07, + "loss": 0.0025, + "reward": 1.8732419610023499, + "reward_std": 0.06732478551566601, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8732418417930603, + "step": 1879 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.859375, + "epoch": 0.91796875, + "grad_norm": 2.345806175165156, + "kl": 0.040771484375, + "learning_rate": 7.705078125e-07, + "loss": 0.0016, + "reward": 1.6373432874679565, + "reward_std": 0.19714245945215225, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6764057576656342, + "step": 1880 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.4765625, + "epoch": 0.91845703125, + "grad_norm": 2.349540924433874, + "kl": 0.0516357421875, + "learning_rate": 7.703857421875e-07, + "loss": 0.0021, + "reward": 1.746773898601532, + "reward_std": 0.10035060532391071, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.762398898601532, + "step": 1881 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.234375, + "epoch": 0.9189453125, + "grad_norm": 2.33896483912966, + "kl": 0.060546875, + "learning_rate": 7.70263671875e-07, + "loss": 0.0024, + "reward": 1.564791977405548, + "reward_std": 0.12818468734622002, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5882294774055481, + "step": 1882 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.984375, + "epoch": 0.91943359375, + "grad_norm": 2.2240687275141218, + "kl": 0.05029296875, + "learning_rate": 7.701416015624999e-07, + "loss": 0.002, + "reward": 1.7980252504348755, + "reward_std": 0.08232726529240608, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8058376908302307, + "step": 1883 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.09375, + "epoch": 0.919921875, + "grad_norm": 2.8745265642260365, + "kl": 0.0523681640625, + "learning_rate": 7.700195312499999e-07, + "loss": 0.0021, + "reward": 1.7382362484931946, + "reward_std": 0.12476624548435211, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7538612484931946, + "step": 1884 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.75, + "epoch": 0.92041015625, + "grad_norm": 3.686867510401221, + "kl": 0.056640625, + "learning_rate": 7.698974609375e-07, + "loss": 0.0023, + "reward": 1.7156809568405151, + "reward_std": 0.08216442540287971, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7156809270381927, + "step": 1885 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.0, + "epoch": 0.9208984375, + "grad_norm": 6.288769712744168, + "kl": 0.0609130859375, + "learning_rate": 7.69775390625e-07, + "loss": 0.0024, + "reward": 1.7297690510749817, + "reward_std": 0.04128149338066578, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7297690212726593, + "step": 1886 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.8125, + "epoch": 0.92138671875, + "grad_norm": 2.0369024494011256, + "kl": 0.0589599609375, + "learning_rate": 7.696533203125e-07, + "loss": 0.0024, + "reward": 1.752552568912506, + "reward_std": 0.02822397742420435, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7525525689125061, + "step": 1887 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.0, + "epoch": 0.921875, + "grad_norm": 1.2842086090273468, + "kl": 0.0498046875, + "learning_rate": 7.6953125e-07, + "loss": 0.002, + "reward": 1.8643844723701477, + "reward_std": 0.03368113562464714, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8643843829631805, + "step": 1888 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.71875, + "epoch": 0.92236328125, + "grad_norm": 1.318950870858453, + "kl": 0.0462646484375, + "learning_rate": 7.694091796875e-07, + "loss": 0.0019, + "reward": 1.648529589176178, + "reward_std": 0.057421027682721615, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6563420593738556, + "step": 1889 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.390625, + "epoch": 0.9228515625, + "grad_norm": 1.577865853545429, + "kl": 0.082275390625, + "learning_rate": 7.692871093749999e-07, + "loss": 0.0033, + "reward": 1.63528710603714, + "reward_std": 0.06157683953642845, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6352871656417847, + "step": 1890 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.3359375, + "epoch": 0.92333984375, + "grad_norm": 7.518556987855353, + "kl": 0.06005859375, + "learning_rate": 7.691650390624999e-07, + "loss": 0.0024, + "reward": 1.722363293170929, + "reward_std": 0.1048150509595871, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.730175793170929, + "step": 1891 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.1640625, + "epoch": 0.923828125, + "grad_norm": 1.507211128713716, + "kl": 0.073486328125, + "learning_rate": 7.6904296875e-07, + "loss": 0.0029, + "reward": 1.7711586356163025, + "reward_std": 0.08003518357872963, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7711586952209473, + "step": 1892 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.4765625, + "epoch": 0.92431640625, + "grad_norm": 2.085935424953024, + "kl": 0.0565185546875, + "learning_rate": 7.689208984375e-07, + "loss": 0.0023, + "reward": 1.7579456567764282, + "reward_std": 0.06935618259012699, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7579456567764282, + "step": 1893 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.4140625, + "epoch": 0.9248046875, + "grad_norm": 1.3818555531186942, + "kl": 0.0494384765625, + "learning_rate": 7.68798828125e-07, + "loss": 0.002, + "reward": 1.8424060940742493, + "reward_std": 0.09461657330393791, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8502185940742493, + "step": 1894 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.734375, + "epoch": 0.92529296875, + "grad_norm": 1.1368563853147728, + "kl": 0.0433349609375, + "learning_rate": 7.686767578125e-07, + "loss": 0.0017, + "reward": 1.7353255152702332, + "reward_std": 0.056853363290429115, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7587630748748779, + "step": 1895 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.1171875, + "epoch": 0.92578125, + "grad_norm": 2.2064650956492744, + "kl": 0.06591796875, + "learning_rate": 7.685546874999999e-07, + "loss": 0.0026, + "reward": 1.7270656824111938, + "reward_std": 0.1103198304772377, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7348781824111938, + "step": 1896 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.03125, + "epoch": 0.92626953125, + "grad_norm": 1.4559467756111681, + "kl": 0.06201171875, + "learning_rate": 7.684326171874999e-07, + "loss": 0.0025, + "reward": 1.750020146369934, + "reward_std": 0.06828867271542549, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7500201165676117, + "step": 1897 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.8046875, + "epoch": 0.9267578125, + "grad_norm": 1.792759355086428, + "kl": 0.0550537109375, + "learning_rate": 7.68310546875e-07, + "loss": 0.0022, + "reward": 1.638475477695465, + "reward_std": 0.1530410349369049, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6541005373001099, + "step": 1898 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.265625, + "epoch": 0.92724609375, + "grad_norm": 4.437485735045951, + "kl": 0.0570068359375, + "learning_rate": 7.681884765625e-07, + "loss": 0.0023, + "reward": 1.79349684715271, + "reward_std": 0.03604122344404459, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7934968173503876, + "step": 1899 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.796875, + "epoch": 0.927734375, + "grad_norm": 2.779442073043061, + "kl": 0.0621337890625, + "learning_rate": 7.6806640625e-07, + "loss": 0.0025, + "reward": 1.6727771162986755, + "reward_std": 0.0650419145822525, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6727770864963531, + "step": 1900 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.4609375, + "epoch": 0.92822265625, + "grad_norm": 1.2941439692735104, + "kl": 0.0511474609375, + "learning_rate": 7.679443359375e-07, + "loss": 0.002, + "reward": 1.7836529612541199, + "reward_std": 0.10273768194019794, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7992779314517975, + "step": 1901 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.625, + "epoch": 0.9287109375, + "grad_norm": 1.451926128352837, + "kl": 0.058349609375, + "learning_rate": 7.67822265625e-07, + "loss": 0.0023, + "reward": 1.775130271911621, + "reward_std": 0.09728646278381348, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7829427421092987, + "step": 1902 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.890625, + "epoch": 0.92919921875, + "grad_norm": 2.2595384689614835, + "kl": 0.0582275390625, + "learning_rate": 7.677001953124999e-07, + "loss": 0.0023, + "reward": 1.7409818768501282, + "reward_std": 0.0586724728345871, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7409819066524506, + "step": 1903 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.546875, + "epoch": 0.9296875, + "grad_norm": 0.6888205497832584, + "kl": 0.060791015625, + "learning_rate": 7.675781249999999e-07, + "loss": 0.0024, + "reward": 1.8382083773612976, + "reward_std": 0.033903589239344, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.84602090716362, + "step": 1904 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.578125, + "epoch": 0.93017578125, + "grad_norm": 3.8284256312930305, + "kl": 0.0533447265625, + "learning_rate": 7.674560546875e-07, + "loss": 0.0021, + "reward": 1.8051932454109192, + "reward_std": 0.038204182870686054, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8051932752132416, + "step": 1905 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.984375, + "epoch": 0.9306640625, + "grad_norm": 1.1662088703849192, + "kl": 0.0560302734375, + "learning_rate": 7.67333984375e-07, + "loss": 0.0022, + "reward": 1.6394376754760742, + "reward_std": 0.12621871381998062, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6706876754760742, + "step": 1906 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.7109375, + "epoch": 0.93115234375, + "grad_norm": 1.0735713355110765, + "kl": 0.0506591796875, + "learning_rate": 7.672119140625e-07, + "loss": 0.002, + "reward": 1.775869071483612, + "reward_std": 0.038261422887444496, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7758690416812897, + "step": 1907 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.578125, + "epoch": 0.931640625, + "grad_norm": 1.1925985760085656, + "kl": 0.039306640625, + "learning_rate": 7.6708984375e-07, + "loss": 0.0016, + "reward": 1.8779195547103882, + "reward_std": 0.10916906967759132, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.893544614315033, + "step": 1908 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.703125, + "epoch": 0.93212890625, + "grad_norm": 0.8754908923158865, + "kl": 0.0565185546875, + "learning_rate": 7.669677734374999e-07, + "loss": 0.0023, + "reward": 1.790212869644165, + "reward_std": 0.04031490348279476, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7902128398418427, + "step": 1909 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.78125, + "epoch": 0.9326171875, + "grad_norm": 1.130446425832009, + "kl": 0.0740966796875, + "learning_rate": 7.668457031249999e-07, + "loss": 0.003, + "reward": 1.737060308456421, + "reward_std": 0.09539984166622162, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7526853680610657, + "step": 1910 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.2109375, + "epoch": 0.93310546875, + "grad_norm": 1.6278036854891171, + "kl": 0.0521240234375, + "learning_rate": 7.667236328125e-07, + "loss": 0.0021, + "reward": 1.7540555000305176, + "reward_std": 0.04432438686490059, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7540555000305176, + "step": 1911 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.65625, + "epoch": 0.93359375, + "grad_norm": 2.1925742495284313, + "kl": 0.0595703125, + "learning_rate": 7.666015625e-07, + "loss": 0.0024, + "reward": 1.7391607761383057, + "reward_std": 0.06034187972545624, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7391607463359833, + "step": 1912 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.921875, + "epoch": 0.93408203125, + "grad_norm": 7.166107534027712, + "kl": 0.066650390625, + "learning_rate": 7.664794921875e-07, + "loss": 0.0027, + "reward": 1.7412755489349365, + "reward_std": 0.06487971171736717, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7490880191326141, + "step": 1913 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.40625, + "epoch": 0.9345703125, + "grad_norm": 1.2690228720660945, + "kl": 0.0467529296875, + "learning_rate": 7.66357421875e-07, + "loss": 0.0019, + "reward": 1.739248275756836, + "reward_std": 0.03249887889251113, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7392483055591583, + "step": 1914 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.0078125, + "epoch": 0.93505859375, + "grad_norm": 1.7901065626462564, + "kl": 0.0469970703125, + "learning_rate": 7.662353515625e-07, + "loss": 0.0019, + "reward": 1.7768760919570923, + "reward_std": 0.0804726853966713, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7846885025501251, + "step": 1915 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.75, + "epoch": 0.935546875, + "grad_norm": 1.5712939062202214, + "kl": 0.0623779296875, + "learning_rate": 7.661132812499999e-07, + "loss": 0.0025, + "reward": 1.786317765712738, + "reward_std": 0.08603505790233612, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7863178253173828, + "step": 1916 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.0546875, + "epoch": 0.93603515625, + "grad_norm": 1.2722576840995556, + "kl": 0.055908203125, + "learning_rate": 7.659912109374999e-07, + "loss": 0.0022, + "reward": 1.8206439018249512, + "reward_std": 0.05164727196097374, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8206439316272736, + "step": 1917 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.4140625, + "epoch": 0.9365234375, + "grad_norm": 0.9577017742295563, + "kl": 0.070556640625, + "learning_rate": 7.65869140625e-07, + "loss": 0.0028, + "reward": 1.764600396156311, + "reward_std": 0.06939095444977283, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7724128663539886, + "step": 1918 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.96875, + "epoch": 0.93701171875, + "grad_norm": 1.1704098534116705, + "kl": 0.0428466796875, + "learning_rate": 7.657470703125e-07, + "loss": 0.0017, + "reward": 1.7799164652824402, + "reward_std": 0.06408461276441813, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7877289652824402, + "step": 1919 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.3203125, + "epoch": 0.9375, + "grad_norm": 1.3081495549462145, + "kl": 0.0550537109375, + "learning_rate": 7.65625e-07, + "loss": 0.0022, + "reward": 1.8275092840194702, + "reward_std": 0.12271393835544586, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8431342542171478, + "step": 1920 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.5703125, + "epoch": 0.93798828125, + "grad_norm": 2.0893342759380125, + "kl": 0.0570068359375, + "learning_rate": 7.655029296875e-07, + "loss": 0.0023, + "reward": 1.62141752243042, + "reward_std": 0.07201961986720562, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6214175224304199, + "step": 1921 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.0078125, + "epoch": 0.9384765625, + "grad_norm": 1.199646307200552, + "kl": 0.06201171875, + "learning_rate": 7.653808593749999e-07, + "loss": 0.0025, + "reward": 1.7694358825683594, + "reward_std": 0.0673837810754776, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.769435852766037, + "step": 1922 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.625, + "epoch": 0.93896484375, + "grad_norm": 0.9488691507074507, + "kl": 0.0556640625, + "learning_rate": 7.652587890624999e-07, + "loss": 0.0022, + "reward": 1.7294191718101501, + "reward_std": 0.06298989057540894, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7294191718101501, + "step": 1923 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.5390625, + "epoch": 0.939453125, + "grad_norm": 0.9744407426363972, + "kl": 0.0570068359375, + "learning_rate": 7.6513671875e-07, + "loss": 0.0023, + "reward": 1.7692174911499023, + "reward_std": 0.12869003787636757, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7926550805568695, + "step": 1924 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.7265625, + "epoch": 0.93994140625, + "grad_norm": 8.440290119049392, + "kl": 0.048095703125, + "learning_rate": 7.650146484375e-07, + "loss": 0.0019, + "reward": 1.7950489521026611, + "reward_std": 0.06832708790898323, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7950489521026611, + "step": 1925 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.078125, + "epoch": 0.9404296875, + "grad_norm": 1.8177029228869115, + "kl": 0.0528564453125, + "learning_rate": 7.64892578125e-07, + "loss": 0.0021, + "reward": 1.6869670152664185, + "reward_std": 0.14585554599761963, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6947795152664185, + "step": 1926 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.0546875, + "epoch": 0.94091796875, + "grad_norm": 1.3445342169878876, + "kl": 0.0511474609375, + "learning_rate": 7.647705078125e-07, + "loss": 0.002, + "reward": 1.7540799379348755, + "reward_std": 0.06522182933986187, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7540798783302307, + "step": 1927 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.9453125, + "epoch": 0.94140625, + "grad_norm": 3.3655946441991906, + "kl": 0.0565185546875, + "learning_rate": 7.646484375e-07, + "loss": 0.0023, + "reward": 1.79484623670578, + "reward_std": 0.07020819000899792, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.79484623670578, + "step": 1928 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.4921875, + "epoch": 0.94189453125, + "grad_norm": 2.7406095890255506, + "kl": 0.04931640625, + "learning_rate": 7.645263671874999e-07, + "loss": 0.002, + "reward": 1.650872528553009, + "reward_std": 0.07257736101746559, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.658685028553009, + "step": 1929 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.9609375, + "epoch": 0.9423828125, + "grad_norm": 9.439333223570415, + "kl": 0.061279296875, + "learning_rate": 7.644042968749999e-07, + "loss": 0.0025, + "reward": 1.7402021884918213, + "reward_std": 0.11112450435757637, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7558271884918213, + "step": 1930 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.953125, + "epoch": 0.94287109375, + "grad_norm": 3.0515199906985773, + "kl": 0.046875, + "learning_rate": 7.642822265625e-07, + "loss": 0.0019, + "reward": 1.806718111038208, + "reward_std": 0.04643261060118675, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8067179918289185, + "step": 1931 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.828125, + "epoch": 0.943359375, + "grad_norm": 2.3895842970175463, + "kl": 0.044189453125, + "learning_rate": 7.6416015625e-07, + "loss": 0.0018, + "reward": 1.8205534219741821, + "reward_std": 0.05191616341471672, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8205534815788269, + "step": 1932 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.4609375, + "epoch": 0.94384765625, + "grad_norm": 1.7762575067749533, + "kl": 0.0479736328125, + "learning_rate": 7.640380859375e-07, + "loss": 0.0019, + "reward": 1.8342650532722473, + "reward_std": 0.06186963617801666, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8342650830745697, + "step": 1933 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.515625, + "epoch": 0.9443359375, + "grad_norm": 1.1300241222412084, + "kl": 0.0462646484375, + "learning_rate": 7.63916015625e-07, + "loss": 0.0019, + "reward": 1.7758485078811646, + "reward_std": 0.0529699232429266, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7758485078811646, + "step": 1934 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.7890625, + "epoch": 0.94482421875, + "grad_norm": 1.4612311004000913, + "kl": 0.048828125, + "learning_rate": 7.637939453124999e-07, + "loss": 0.002, + "reward": 1.7642263770103455, + "reward_std": 0.0411844439804554, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7642263472080231, + "step": 1935 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.5703125, + "epoch": 0.9453125, + "grad_norm": 1.0895138935674837, + "kl": 0.039306640625, + "learning_rate": 7.636718749999999e-07, + "loss": 0.0016, + "reward": 1.7819878458976746, + "reward_std": 0.11622267588973045, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.789800375699997, + "step": 1936 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.3046875, + "epoch": 0.94580078125, + "grad_norm": 1.116509437268169, + "kl": 0.049072265625, + "learning_rate": 7.635498046875e-07, + "loss": 0.002, + "reward": 1.7431734204292297, + "reward_std": 0.06907767802476883, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.750985860824585, + "step": 1937 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.5625, + "epoch": 0.9462890625, + "grad_norm": 1.0548626164801436, + "kl": 0.05615234375, + "learning_rate": 7.63427734375e-07, + "loss": 0.0022, + "reward": 1.7038698196411133, + "reward_std": 0.08116939291357994, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7038698196411133, + "step": 1938 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.7890625, + "epoch": 0.94677734375, + "grad_norm": 2.6074049824933714, + "kl": 0.063232421875, + "learning_rate": 7.633056640625e-07, + "loss": 0.0025, + "reward": 1.6791431903839111, + "reward_std": 0.11179608106613159, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6791431903839111, + "step": 1939 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.265625, + "epoch": 0.947265625, + "grad_norm": 4.189314836063207, + "kl": 0.0457763671875, + "learning_rate": 7.6318359375e-07, + "loss": 0.0018, + "reward": 1.7924708127975464, + "reward_std": 0.04897610656917095, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7924707531929016, + "step": 1940 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.3828125, + "epoch": 0.94775390625, + "grad_norm": 1.1917903075664644, + "kl": 0.0635986328125, + "learning_rate": 7.630615234375e-07, + "loss": 0.0025, + "reward": 1.7675382494926453, + "reward_std": 0.09949354082345963, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7753507494926453, + "step": 1941 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.703125, + "epoch": 0.9482421875, + "grad_norm": 0.6607591548251206, + "kl": 0.04150390625, + "learning_rate": 7.629394531249999e-07, + "loss": 0.0017, + "reward": 1.8325649499893188, + "reward_std": 0.01903275726363063, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8325649201869965, + "step": 1942 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.6015625, + "epoch": 0.94873046875, + "grad_norm": 6.132868339589314, + "kl": 0.073974609375, + "learning_rate": 7.628173828124999e-07, + "loss": 0.003, + "reward": 1.6471970677375793, + "reward_std": 0.060605697333812714, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6550095677375793, + "step": 1943 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.546875, + "epoch": 0.94921875, + "grad_norm": 1.4264393045627874, + "kl": 0.0548095703125, + "learning_rate": 7.626953125e-07, + "loss": 0.0022, + "reward": 1.7925902605056763, + "reward_std": 0.07345704361796379, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8004027009010315, + "step": 1944 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.046875, + "epoch": 0.94970703125, + "grad_norm": 1.319335546915897, + "kl": 0.049560546875, + "learning_rate": 7.625732421875e-07, + "loss": 0.002, + "reward": 1.7566935420036316, + "reward_std": 0.11580286920070648, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7723186016082764, + "step": 1945 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.7109375, + "epoch": 0.9501953125, + "grad_norm": 2.0534734192819637, + "kl": 0.0673828125, + "learning_rate": 7.62451171875e-07, + "loss": 0.0027, + "reward": 1.7132260203361511, + "reward_std": 0.08049709908664227, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7210385203361511, + "step": 1946 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.140625, + "epoch": 0.95068359375, + "grad_norm": 2.215564027477621, + "kl": 0.0567626953125, + "learning_rate": 7.623291015625e-07, + "loss": 0.0023, + "reward": 1.6834967136383057, + "reward_std": 0.04565897583961487, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6834966838359833, + "step": 1947 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.2734375, + "epoch": 0.951171875, + "grad_norm": 2.075455827754385, + "kl": 0.0616455078125, + "learning_rate": 7.622070312499999e-07, + "loss": 0.0025, + "reward": 1.688484787940979, + "reward_std": 0.09864621236920357, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.696297287940979, + "step": 1948 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.140625, + "epoch": 0.95166015625, + "grad_norm": 2.250374045260722, + "kl": 0.05810546875, + "learning_rate": 7.620849609374999e-07, + "loss": 0.0023, + "reward": 1.8399544954299927, + "reward_std": 0.07792560383677483, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8399545550346375, + "step": 1949 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.359375, + "epoch": 0.9521484375, + "grad_norm": 1.8302443569238318, + "kl": 0.0623779296875, + "learning_rate": 7.619628906249999e-07, + "loss": 0.0025, + "reward": 1.690042495727539, + "reward_std": 0.07748877070844173, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6900425255298615, + "step": 1950 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.0, + "epoch": 0.95263671875, + "grad_norm": 1.2995294061711256, + "kl": 0.056396484375, + "learning_rate": 7.618408203125e-07, + "loss": 0.0023, + "reward": 1.7628344893455505, + "reward_std": 0.038826122879981995, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7628344595432281, + "step": 1951 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.875, + "epoch": 0.953125, + "grad_norm": 0.8592355590617015, + "kl": 0.0640869140625, + "learning_rate": 7.6171875e-07, + "loss": 0.0026, + "reward": 1.6795039176940918, + "reward_std": 0.03331707790493965, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6795038878917694, + "step": 1952 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.90625, + "epoch": 0.95361328125, + "grad_norm": 7.492200822054816, + "kl": 0.08251953125, + "learning_rate": 7.615966796875e-07, + "loss": 0.0033, + "reward": 1.6888149976730347, + "reward_std": 0.1778181865811348, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7122524678707123, + "step": 1953 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.5859375, + "epoch": 0.9541015625, + "grad_norm": 1.7680737074288075, + "kl": 0.060791015625, + "learning_rate": 7.61474609375e-07, + "loss": 0.0024, + "reward": 1.7545133829116821, + "reward_std": 0.07330542802810669, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7623258829116821, + "step": 1954 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.7421875, + "epoch": 0.95458984375, + "grad_norm": 1.5553080968803128, + "kl": 0.0546875, + "learning_rate": 7.613525390624999e-07, + "loss": 0.0022, + "reward": 1.8474570512771606, + "reward_std": 0.06518928147852421, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8474570512771606, + "step": 1955 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.25, + "epoch": 0.955078125, + "grad_norm": 3.5111551151395557, + "kl": 0.0557861328125, + "learning_rate": 7.612304687499999e-07, + "loss": 0.0022, + "reward": 1.6904324293136597, + "reward_std": 0.04466338828206062, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6904323995113373, + "step": 1956 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.453125, + "epoch": 0.95556640625, + "grad_norm": 1.8392727464500473, + "kl": 0.066162109375, + "learning_rate": 7.611083984375e-07, + "loss": 0.0027, + "reward": 1.7047904133796692, + "reward_std": 0.07609418779611588, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7126030325889587, + "step": 1957 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.6875, + "epoch": 0.9560546875, + "grad_norm": 4.599343237367128, + "kl": 0.073486328125, + "learning_rate": 7.60986328125e-07, + "loss": 0.0029, + "reward": 1.6910215616226196, + "reward_std": 0.06167110428214073, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.691021591424942, + "step": 1958 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.0703125, + "epoch": 0.95654296875, + "grad_norm": 1.7998521981288202, + "kl": 0.0550537109375, + "learning_rate": 7.608642578125e-07, + "loss": 0.0022, + "reward": 1.8349308371543884, + "reward_std": 0.06764688296243548, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8505558371543884, + "step": 1959 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.53125, + "epoch": 0.95703125, + "grad_norm": 1.7450693187577557, + "kl": 0.057373046875, + "learning_rate": 7.607421875e-07, + "loss": 0.0023, + "reward": 1.5837258696556091, + "reward_std": 0.10083448141813278, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5915383994579315, + "step": 1960 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.9921875, + "epoch": 0.95751953125, + "grad_norm": 11.114277968224194, + "kl": 0.0599365234375, + "learning_rate": 7.606201171874999e-07, + "loss": 0.0024, + "reward": 1.6976925134658813, + "reward_std": 0.056451691314578056, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6976925134658813, + "step": 1961 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.359375, + "epoch": 0.9580078125, + "grad_norm": 2.105224164489077, + "kl": 0.074951171875, + "learning_rate": 7.604980468749999e-07, + "loss": 0.003, + "reward": 1.6166620254516602, + "reward_std": 0.08987650275230408, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6244744658470154, + "step": 1962 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.96875, + "epoch": 0.95849609375, + "grad_norm": 0.9690709225249844, + "kl": 0.0579833984375, + "learning_rate": 7.603759765624999e-07, + "loss": 0.0023, + "reward": 1.6878407001495361, + "reward_std": 0.07451405934989452, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6878407299518585, + "step": 1963 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.375, + "epoch": 0.958984375, + "grad_norm": 3.78849539137508, + "kl": 0.0511474609375, + "learning_rate": 7.6025390625e-07, + "loss": 0.002, + "reward": 1.6614224910736084, + "reward_std": 0.15857132896780968, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6770474314689636, + "step": 1964 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.59375, + "epoch": 0.95947265625, + "grad_norm": 0.9652488508161328, + "kl": 0.0633544921875, + "learning_rate": 7.601318359375e-07, + "loss": 0.0025, + "reward": 1.6580791473388672, + "reward_std": 0.11845768243074417, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7049541175365448, + "step": 1965 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.3125, + "epoch": 0.9599609375, + "grad_norm": 1.8006023215240339, + "kl": 0.071044921875, + "learning_rate": 7.60009765625e-07, + "loss": 0.0028, + "reward": 1.6227675080299377, + "reward_std": 0.12311100959777832, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6227675080299377, + "step": 1966 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.796875, + "epoch": 0.96044921875, + "grad_norm": 1.2712786186046117, + "kl": 0.10009765625, + "learning_rate": 7.598876953125e-07, + "loss": 0.004, + "reward": 1.69350266456604, + "reward_std": 0.051613882184028625, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6935026347637177, + "step": 1967 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.4765625, + "epoch": 0.9609375, + "grad_norm": 5.457319655705029, + "kl": 0.0501708984375, + "learning_rate": 7.597656249999999e-07, + "loss": 0.002, + "reward": 1.769425630569458, + "reward_std": 0.040609823539853096, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7694256603717804, + "step": 1968 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.875, + "epoch": 0.96142578125, + "grad_norm": 1.1418180815570933, + "kl": 0.0518798828125, + "learning_rate": 7.596435546874999e-07, + "loss": 0.0021, + "reward": 1.7559481859207153, + "reward_std": 0.03939475491642952, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7559481859207153, + "step": 1969 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.5546875, + "epoch": 0.9619140625, + "grad_norm": 2.761562135305564, + "kl": 0.0643310546875, + "learning_rate": 7.59521484375e-07, + "loss": 0.0026, + "reward": 1.6901981830596924, + "reward_std": 0.044297732412815094, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.69019815325737, + "step": 1970 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.3125, + "epoch": 0.96240234375, + "grad_norm": 4.959957474481586, + "kl": 0.061279296875, + "learning_rate": 7.593994140625e-07, + "loss": 0.0024, + "reward": 1.7054769396781921, + "reward_std": 0.12026718631386757, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7054769396781921, + "step": 1971 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.2421875, + "epoch": 0.962890625, + "grad_norm": 1.0195643681683435, + "kl": 0.0621337890625, + "learning_rate": 7.5927734375e-07, + "loss": 0.0025, + "reward": 1.7764147520065308, + "reward_std": 0.022609219886362553, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7764147520065308, + "step": 1972 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.8359375, + "epoch": 0.96337890625, + "grad_norm": 2.7446568495906796, + "kl": 0.0595703125, + "learning_rate": 7.591552734375e-07, + "loss": 0.0024, + "reward": 1.7252334952354431, + "reward_std": 0.21081995964050293, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.7877334952354431, + "step": 1973 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.2734375, + "epoch": 0.9638671875, + "grad_norm": 0.8602906688907507, + "kl": 0.068603515625, + "learning_rate": 7.59033203125e-07, + "loss": 0.0027, + "reward": 1.701697051525116, + "reward_std": 0.06423486396670341, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7095095813274384, + "step": 1974 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.953125, + "epoch": 0.96435546875, + "grad_norm": 2.2124515036462893, + "kl": 0.06787109375, + "learning_rate": 7.589111328124999e-07, + "loss": 0.0027, + "reward": 1.7300501465797424, + "reward_std": 0.11370932310819626, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7378626465797424, + "step": 1975 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.4765625, + "epoch": 0.96484375, + "grad_norm": 1.2753647461496138, + "kl": 0.068115234375, + "learning_rate": 7.587890624999999e-07, + "loss": 0.0027, + "reward": 1.7049716711044312, + "reward_std": 0.04964887537062168, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7049716711044312, + "step": 1976 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.2421875, + "epoch": 0.96533203125, + "grad_norm": 1.3004843395679404, + "kl": 0.071044921875, + "learning_rate": 7.586669921875e-07, + "loss": 0.0028, + "reward": 1.7647384405136108, + "reward_std": 0.09294159710407257, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7647384405136108, + "step": 1977 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.484375, + "epoch": 0.9658203125, + "grad_norm": 2.020273111626442, + "kl": 0.073486328125, + "learning_rate": 7.58544921875e-07, + "loss": 0.0029, + "reward": 1.660966157913208, + "reward_std": 0.10214090719819069, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6687787771224976, + "step": 1978 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.21875, + "epoch": 0.96630859375, + "grad_norm": 3.3338455450291704, + "kl": 0.0609130859375, + "learning_rate": 7.584228515625e-07, + "loss": 0.0024, + "reward": 1.774294674396515, + "reward_std": 0.07802858576178551, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7742947041988373, + "step": 1979 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.3046875, + "epoch": 0.966796875, + "grad_norm": 1.4604029448276346, + "kl": 0.0670166015625, + "learning_rate": 7.5830078125e-07, + "loss": 0.0027, + "reward": 1.7650516033172607, + "reward_std": 0.10301512852311134, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7806766629219055, + "step": 1980 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.0546875, + "epoch": 0.96728515625, + "grad_norm": 1.6943394591205752, + "kl": 0.0499267578125, + "learning_rate": 7.581787109374999e-07, + "loss": 0.002, + "reward": 1.5032365322113037, + "reward_std": 0.17277055978775024, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.5422990322113037, + "step": 1981 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.65625, + "epoch": 0.9677734375, + "grad_norm": 2.8707007057084284, + "kl": 0.067626953125, + "learning_rate": 7.580566406249999e-07, + "loss": 0.0027, + "reward": 1.7244834303855896, + "reward_std": 0.09691913425922394, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7244834303855896, + "step": 1982 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.1015625, + "epoch": 0.96826171875, + "grad_norm": 2.317509515130158, + "kl": 0.0565185546875, + "learning_rate": 7.579345703125e-07, + "loss": 0.0023, + "reward": 1.8406411409378052, + "reward_std": 0.04954299796372652, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8406412601470947, + "step": 1983 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.0625, + "epoch": 0.96875, + "grad_norm": 1.1665337790056094, + "kl": 0.0460205078125, + "learning_rate": 7.578125e-07, + "loss": 0.0018, + "reward": 1.871698260307312, + "reward_std": 0.08394120261073112, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.879510760307312, + "step": 1984 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.65625, + "epoch": 0.96923828125, + "grad_norm": 1.5997848800347867, + "kl": 0.07080078125, + "learning_rate": 7.576904296875e-07, + "loss": 0.0028, + "reward": 1.6359334588050842, + "reward_std": 0.05299815069884062, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6359334290027618, + "step": 1985 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.8359375, + "epoch": 0.9697265625, + "grad_norm": 1.5599863734636639, + "kl": 0.0576171875, + "learning_rate": 7.57568359375e-07, + "loss": 0.0023, + "reward": 1.8334488272666931, + "reward_std": 0.046138789504766464, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8334488272666931, + "step": 1986 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.8984375, + "epoch": 0.97021484375, + "grad_norm": 1.444134658928476, + "kl": 0.078125, + "learning_rate": 7.574462890625e-07, + "loss": 0.0031, + "reward": 1.6236762404441833, + "reward_std": 0.04829781036823988, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6236762404441833, + "step": 1987 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.1953125, + "epoch": 0.970703125, + "grad_norm": 1.758426859641077, + "kl": 0.0589599609375, + "learning_rate": 7.573242187499999e-07, + "loss": 0.0024, + "reward": 1.7658716440200806, + "reward_std": 0.09004973247647285, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7658716142177582, + "step": 1988 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.7890625, + "epoch": 0.97119140625, + "grad_norm": 1.6967311663572766, + "kl": 0.053955078125, + "learning_rate": 7.572021484374999e-07, + "loss": 0.0022, + "reward": 1.6358023881912231, + "reward_std": 0.11093928292393684, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6436149477958679, + "step": 1989 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.8828125, + "epoch": 0.9716796875, + "grad_norm": 2.04207112904878, + "kl": 0.0439453125, + "learning_rate": 7.57080078125e-07, + "loss": 0.0018, + "reward": 1.81356942653656, + "reward_std": 0.050269074738025665, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8213819265365601, + "step": 1990 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.890625, + "epoch": 0.97216796875, + "grad_norm": 6.093675872108313, + "kl": 0.0615234375, + "learning_rate": 7.569580078125e-07, + "loss": 0.0025, + "reward": 1.7886452674865723, + "reward_std": 0.1061076745390892, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7964576780796051, + "step": 1991 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.5078125, + "epoch": 0.97265625, + "grad_norm": 3.270820256088369, + "kl": 0.06689453125, + "learning_rate": 7.568359375e-07, + "loss": 0.0027, + "reward": 1.8523313999176025, + "reward_std": 0.07106838375329971, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8523313105106354, + "step": 1992 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.21875, + "epoch": 0.97314453125, + "grad_norm": 1.8113236486460977, + "kl": 0.05078125, + "learning_rate": 7.567138671875e-07, + "loss": 0.002, + "reward": 1.7407814264297485, + "reward_std": 0.10060215182602406, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7485939860343933, + "step": 1993 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.203125, + "epoch": 0.9736328125, + "grad_norm": 3.923636866884382, + "kl": 0.0592041015625, + "learning_rate": 7.565917968749999e-07, + "loss": 0.0024, + "reward": 1.7928686141967773, + "reward_std": 0.07088093087077141, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.792868584394455, + "step": 1994 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.65625, + "epoch": 0.97412109375, + "grad_norm": 1.0205069593502663, + "kl": 0.0491943359375, + "learning_rate": 7.564697265624999e-07, + "loss": 0.002, + "reward": 1.820485532283783, + "reward_std": 0.034951613284647465, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.820485532283783, + "step": 1995 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.3125, + "epoch": 0.974609375, + "grad_norm": 1.3146869441919158, + "kl": 0.0570068359375, + "learning_rate": 7.5634765625e-07, + "loss": 0.0023, + "reward": 1.6975049376487732, + "reward_std": 0.050795383751392365, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6975049078464508, + "step": 1996 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.9765625, + "epoch": 0.97509765625, + "grad_norm": 2.103512397807353, + "kl": 0.0506591796875, + "learning_rate": 7.562255859375e-07, + "loss": 0.002, + "reward": 1.7807487845420837, + "reward_std": 0.06361746462062001, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7885612845420837, + "step": 1997 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.015625, + "epoch": 0.9755859375, + "grad_norm": 9.688121646518365, + "kl": 0.076416015625, + "learning_rate": 7.56103515625e-07, + "loss": 0.0031, + "reward": 1.704875409603119, + "reward_std": 0.09089740738272667, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7126878798007965, + "step": 1998 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.953125, + "epoch": 0.97607421875, + "grad_norm": 1.7750779721775813, + "kl": 0.0582275390625, + "learning_rate": 7.559814453125e-07, + "loss": 0.0023, + "reward": 1.7654090523719788, + "reward_std": 0.07578187435865402, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7654090225696564, + "step": 1999 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.6640625, + "epoch": 0.9765625, + "grad_norm": 6.238100855818762, + "kl": 0.0582275390625, + "learning_rate": 7.55859375e-07, + "loss": 0.0023, + "reward": 1.7102959752082825, + "reward_std": 0.13805482536554337, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7259210050106049, + "step": 2000 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.859375, + "epoch": 0.97705078125, + "grad_norm": 4.341560277329405, + "kl": 0.054443359375, + "learning_rate": 7.557373046874999e-07, + "loss": 0.0022, + "reward": 1.7868390083312988, + "reward_std": 0.04986852779984474, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7868389785289764, + "step": 2001 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.96875, + "epoch": 0.9775390625, + "grad_norm": 1.11717831151428, + "kl": 0.0552978515625, + "learning_rate": 7.556152343749999e-07, + "loss": 0.0022, + "reward": 1.6372390389442444, + "reward_std": 0.04036341607570648, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6372390389442444, + "step": 2002 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.09375, + "epoch": 0.97802734375, + "grad_norm": 1.1083383023566933, + "kl": 0.0694580078125, + "learning_rate": 7.554931640625e-07, + "loss": 0.0028, + "reward": 1.6198468804359436, + "reward_std": 0.08751692995429039, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.62765933573246, + "step": 2003 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.453125, + "epoch": 0.978515625, + "grad_norm": 3.2731263361849963, + "kl": 0.0699462890625, + "learning_rate": 7.5537109375e-07, + "loss": 0.0028, + "reward": 1.7005380988121033, + "reward_std": 0.07136748731136322, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7005380988121033, + "step": 2004 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.125, + "epoch": 0.97900390625, + "grad_norm": 4.932970901055641, + "kl": 0.056396484375, + "learning_rate": 7.552490234375e-07, + "loss": 0.0023, + "reward": 1.799683392047882, + "reward_std": 0.060001108795404434, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7996833622455597, + "step": 2005 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.4140625, + "epoch": 0.9794921875, + "grad_norm": 1.5941330883157996, + "kl": 0.0606689453125, + "learning_rate": 7.55126953125e-07, + "loss": 0.0024, + "reward": 1.773667812347412, + "reward_std": 0.058895327150821686, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7892928421497345, + "step": 2006 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.0703125, + "epoch": 0.97998046875, + "grad_norm": 1.3767151749195397, + "kl": 0.0626220703125, + "learning_rate": 7.550048828124999e-07, + "loss": 0.0025, + "reward": 1.7313017845153809, + "reward_std": 0.08502375334501266, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7391143441200256, + "step": 2007 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.9453125, + "epoch": 0.98046875, + "grad_norm": 2.620215945991878, + "kl": 0.0706787109375, + "learning_rate": 7.548828124999999e-07, + "loss": 0.0028, + "reward": 1.762086808681488, + "reward_std": 0.09928128868341446, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.769899308681488, + "step": 2008 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.2734375, + "epoch": 0.98095703125, + "grad_norm": 3.8108526407917065, + "kl": 0.0550537109375, + "learning_rate": 7.547607421875e-07, + "loss": 0.0022, + "reward": 1.695708990097046, + "reward_std": 0.052436916157603264, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6957089602947235, + "step": 2009 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.9140625, + "epoch": 0.9814453125, + "grad_norm": 1.733038006998932, + "kl": 0.0518798828125, + "learning_rate": 7.54638671875e-07, + "loss": 0.0021, + "reward": 1.791795015335083, + "reward_std": 0.05815849453210831, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.791795015335083, + "step": 2010 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.28125, + "epoch": 0.98193359375, + "grad_norm": 3.978067562200423, + "kl": 0.069091796875, + "learning_rate": 7.545166015625e-07, + "loss": 0.0028, + "reward": 1.7538942098617554, + "reward_std": 0.10411181300878525, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7617067396640778, + "step": 2011 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.6875, + "epoch": 0.982421875, + "grad_norm": 5.320638926802918, + "kl": 0.062255859375, + "learning_rate": 7.5439453125e-07, + "loss": 0.0025, + "reward": 1.755543053150177, + "reward_std": 0.07938620075583458, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.755543053150177, + "step": 2012 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.5390625, + "epoch": 0.98291015625, + "grad_norm": 2.7469000275015745, + "kl": 0.058837890625, + "learning_rate": 7.542724609375e-07, + "loss": 0.0024, + "reward": 1.6718215942382812, + "reward_std": 0.09080488607287407, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6718215942382812, + "step": 2013 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.5703125, + "epoch": 0.9833984375, + "grad_norm": 1.7108353133321768, + "kl": 0.0496826171875, + "learning_rate": 7.541503906249999e-07, + "loss": 0.002, + "reward": 1.7525351643562317, + "reward_std": 0.06778106465935707, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7525351941585541, + "step": 2014 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.4765625, + "epoch": 0.98388671875, + "grad_norm": 1.4726300800994188, + "kl": 0.051513671875, + "learning_rate": 7.540283203124999e-07, + "loss": 0.0021, + "reward": 1.7786903977394104, + "reward_std": 0.03883876092731953, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7786904275417328, + "step": 2015 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.390625, + "epoch": 0.984375, + "grad_norm": 2.204556954621402, + "kl": 0.0611572265625, + "learning_rate": 7.5390625e-07, + "loss": 0.0024, + "reward": 1.788576900959015, + "reward_std": 0.05577550455927849, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7885768711566925, + "step": 2016 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.125, + "epoch": 0.98486328125, + "grad_norm": 1.666818054703339, + "kl": 0.048095703125, + "learning_rate": 7.537841796875e-07, + "loss": 0.0019, + "reward": 1.7710611820220947, + "reward_std": 0.1724838688969612, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.8101237118244171, + "step": 2017 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.90625, + "epoch": 0.9853515625, + "grad_norm": 1.4819925595821943, + "kl": 0.055419921875, + "learning_rate": 7.53662109375e-07, + "loss": 0.0022, + "reward": 1.844101369380951, + "reward_std": 0.11457358300685883, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8519138097763062, + "step": 2018 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.1640625, + "epoch": 0.98583984375, + "grad_norm": 0.863402575226768, + "kl": 0.057373046875, + "learning_rate": 7.535400390625e-07, + "loss": 0.0023, + "reward": 1.8190799951553345, + "reward_std": 0.028183109126985073, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8190799951553345, + "step": 2019 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.9609375, + "epoch": 0.986328125, + "grad_norm": 1.693685033843237, + "kl": 0.0555419921875, + "learning_rate": 7.534179687499999e-07, + "loss": 0.0022, + "reward": 1.8829106092453003, + "reward_std": 0.03998455451801419, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8829106092453003, + "step": 2020 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.8046875, + "epoch": 0.98681640625, + "grad_norm": 1.5870071937237666, + "kl": 0.0589599609375, + "learning_rate": 7.532958984374999e-07, + "loss": 0.0024, + "reward": 1.7445420026779175, + "reward_std": 0.05026637949049473, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7445419728755951, + "step": 2021 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.890625, + "epoch": 0.9873046875, + "grad_norm": 1.6854847331554719, + "kl": 0.0511474609375, + "learning_rate": 7.53173828125e-07, + "loss": 0.002, + "reward": 1.7165476083755493, + "reward_std": 0.1217353455722332, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7399851083755493, + "step": 2022 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.1875, + "epoch": 0.98779296875, + "grad_norm": 3.876567272736264, + "kl": 0.0650634765625, + "learning_rate": 7.530517578125e-07, + "loss": 0.0026, + "reward": 1.648318886756897, + "reward_std": 0.11310148239135742, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.656131386756897, + "step": 2023 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.765625, + "epoch": 0.98828125, + "grad_norm": 17.515994932795444, + "kl": 0.0703125, + "learning_rate": 7.529296875e-07, + "loss": 0.0028, + "reward": 1.788986623287201, + "reward_std": 0.04432579409331083, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7889866828918457, + "step": 2024 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.109375, + "epoch": 0.98876953125, + "grad_norm": 3.17134976005998, + "kl": 0.056640625, + "learning_rate": 7.528076171875e-07, + "loss": 0.0023, + "reward": 1.741170048713684, + "reward_std": 0.1307641789317131, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7489825487136841, + "step": 2025 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.671875, + "epoch": 0.9892578125, + "grad_norm": 0.6360837641452156, + "kl": 0.063720703125, + "learning_rate": 7.52685546875e-07, + "loss": 0.0025, + "reward": 1.833198606967926, + "reward_std": 0.033319685608148575, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8331986367702484, + "step": 2026 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.640625, + "epoch": 0.98974609375, + "grad_norm": 2.1336492840699175, + "kl": 0.057373046875, + "learning_rate": 7.525634765624999e-07, + "loss": 0.0023, + "reward": 1.6746537685394287, + "reward_std": 0.06350501254200935, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6746538877487183, + "step": 2027 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.0703125, + "epoch": 0.990234375, + "grad_norm": 4.759167762843545, + "kl": 0.064453125, + "learning_rate": 7.524414062499999e-07, + "loss": 0.0026, + "reward": 1.7838603258132935, + "reward_std": 0.09581628814339638, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7838603258132935, + "step": 2028 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.75, + "epoch": 0.99072265625, + "grad_norm": 1.6603971592521138, + "kl": 0.063232421875, + "learning_rate": 7.523193359375e-07, + "loss": 0.0025, + "reward": 1.6969883441925049, + "reward_std": 0.08057832717895508, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6969882547855377, + "step": 2029 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.3125, + "epoch": 0.9912109375, + "grad_norm": 1.180993131330784, + "kl": 0.0533447265625, + "learning_rate": 7.52197265625e-07, + "loss": 0.0021, + "reward": 1.6063008308410645, + "reward_std": 0.17890335619449615, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6453633606433868, + "step": 2030 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.0234375, + "epoch": 0.99169921875, + "grad_norm": 1.095038299634487, + "kl": 0.0523681640625, + "learning_rate": 7.520751953125e-07, + "loss": 0.0021, + "reward": 1.779970109462738, + "reward_std": 0.08126384392380714, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7955950498580933, + "step": 2031 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.8984375, + "epoch": 0.9921875, + "grad_norm": 2.6979143608108265, + "kl": 0.0673828125, + "learning_rate": 7.51953125e-07, + "loss": 0.0027, + "reward": 1.740858554840088, + "reward_std": 0.09589342772960663, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7408585846424103, + "step": 2032 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.609375, + "epoch": 0.99267578125, + "grad_norm": 2.848196704773645, + "kl": 0.05908203125, + "learning_rate": 7.518310546874999e-07, + "loss": 0.0024, + "reward": 1.7175134420394897, + "reward_std": 0.028206244111061096, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.717513382434845, + "step": 2033 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.7890625, + "epoch": 0.9931640625, + "grad_norm": 3.5557294743989054, + "kl": 0.0601806640625, + "learning_rate": 7.517089843749999e-07, + "loss": 0.0024, + "reward": 1.8203625082969666, + "reward_std": 0.060490844771265984, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8203625082969666, + "step": 2034 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.21875, + "epoch": 0.99365234375, + "grad_norm": 1.016603411671492, + "kl": 0.0650634765625, + "learning_rate": 7.515869140625e-07, + "loss": 0.0026, + "reward": 1.7698943614959717, + "reward_std": 0.05688617751002312, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7698944211006165, + "step": 2035 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.234375, + "epoch": 0.994140625, + "grad_norm": 1.1139849774388095, + "kl": 0.0531005859375, + "learning_rate": 7.5146484375e-07, + "loss": 0.0021, + "reward": 1.829136312007904, + "reward_std": 0.030549502931535244, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8291363418102264, + "step": 2036 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.609375, + "epoch": 0.99462890625, + "grad_norm": 1.9862434556565576, + "kl": 0.08837890625, + "learning_rate": 7.513427734375e-07, + "loss": 0.0035, + "reward": 1.7540498971939087, + "reward_std": 0.019931727088987827, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7540498673915863, + "step": 2037 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.328125, + "epoch": 0.9951171875, + "grad_norm": 1.245846895225628, + "kl": 0.066650390625, + "learning_rate": 7.51220703125e-07, + "loss": 0.0027, + "reward": 1.775355041027069, + "reward_std": 0.03905859775841236, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7753550708293915, + "step": 2038 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.9609375, + "epoch": 0.99560546875, + "grad_norm": 5.356625380854759, + "kl": 0.0936279296875, + "learning_rate": 7.510986328125e-07, + "loss": 0.0037, + "reward": 1.7761430740356445, + "reward_std": 0.08487707003951073, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7761430144309998, + "step": 2039 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.609375, + "epoch": 0.99609375, + "grad_norm": 0.8395847923657573, + "kl": 0.0499267578125, + "learning_rate": 7.509765624999999e-07, + "loss": 0.002, + "reward": 1.8729313015937805, + "reward_std": 0.02620452456176281, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8729313611984253, + "step": 2040 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.625, + "epoch": 0.99658203125, + "grad_norm": 2.022128826305776, + "kl": 0.052734375, + "learning_rate": 7.508544921874999e-07, + "loss": 0.0021, + "reward": 1.725024938583374, + "reward_std": 0.12236949801445007, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.732837438583374, + "step": 2041 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.859375, + "epoch": 0.9970703125, + "grad_norm": 15.315894492212703, + "kl": 0.0631103515625, + "learning_rate": 7.50732421875e-07, + "loss": 0.0025, + "reward": 1.7646106481552124, + "reward_std": 0.09433956071734428, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7724231779575348, + "step": 2042 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.3671875, + "epoch": 0.99755859375, + "grad_norm": 8.3408933772686, + "kl": 0.07275390625, + "learning_rate": 7.506103515625e-07, + "loss": 0.0029, + "reward": 1.7954939603805542, + "reward_std": 0.060113584622740746, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.795494019985199, + "step": 2043 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.3515625, + "epoch": 0.998046875, + "grad_norm": 2.462496443372101, + "kl": 0.0589599609375, + "learning_rate": 7.5048828125e-07, + "loss": 0.0024, + "reward": 1.7298526167869568, + "reward_std": 0.10707394033670425, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7298526465892792, + "step": 2044 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.171875, + "epoch": 0.99853515625, + "grad_norm": 2.026729878970871, + "kl": 0.0565185546875, + "learning_rate": 7.503662109375e-07, + "loss": 0.0023, + "reward": 1.7790692448616028, + "reward_std": 0.07894434407353401, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7868817448616028, + "step": 2045 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.328125, + "epoch": 0.9990234375, + "grad_norm": 2.3300701728667375, + "kl": 0.0714111328125, + "learning_rate": 7.502441406249999e-07, + "loss": 0.0029, + "reward": 1.7110105156898499, + "reward_std": 0.15207843482494354, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7422605454921722, + "step": 2046 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.68031311035156, + "epoch": 0.99951171875, + "grad_norm": 1.141841899027945, + "kl": 0.05810546875, + "learning_rate": 7.501220703124999e-07, + "loss": 0.0024, + "reward": 1.8359350562095642, + "reward_std": 0.14897098392248154, + "rewards/format_reward": 0.9754097759723663, + "rewards/ocr_reward": 0.8605252206325531, + "step": 2047 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.5625, + "epoch": 1.00048828125, + "grad_norm": 3.803502318684469, + "kl": 0.0550537109375, + "learning_rate": 7.5e-07, + "loss": 0.0022, + "reward": 1.8090072274208069, + "reward_std": 0.06662950664758682, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8090072870254517, + "step": 2048 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.046875, + "epoch": 1.0009765625, + "grad_norm": 0.7862946563465897, + "kl": 0.06298828125, + "learning_rate": 7.498779296875e-07, + "loss": 0.0025, + "reward": 1.6712990999221802, + "reward_std": 0.02280174382030964, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6712990701198578, + "step": 2049 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.5078125, + "epoch": 1.00146484375, + "grad_norm": 1.342303670259137, + "kl": 0.0587158203125, + "learning_rate": 7.49755859375e-07, + "loss": 0.0023, + "reward": 1.840239703655243, + "reward_std": 0.023719463497400284, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8402397036552429, + "step": 2050 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.3828125, + "epoch": 1.001953125, + "grad_norm": 0.9572879054411417, + "kl": 0.0562744140625, + "learning_rate": 7.496337890625e-07, + "loss": 0.0023, + "reward": 1.8105382919311523, + "reward_std": 0.09160010330379009, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8183507919311523, + "step": 2051 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.640625, + "epoch": 1.00244140625, + "grad_norm": 1.6102653045301574, + "kl": 0.05078125, + "learning_rate": 7.4951171875e-07, + "loss": 0.002, + "reward": 1.6806508302688599, + "reward_std": 0.1597279291599989, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7275258004665375, + "step": 2052 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.0, + "epoch": 1.0029296875, + "grad_norm": 1.4804709133667804, + "kl": 0.050537109375, + "learning_rate": 7.493896484374999e-07, + "loss": 0.002, + "reward": 1.804791808128357, + "reward_std": 0.11673609726130962, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8204168379306793, + "step": 2053 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.546875, + "epoch": 1.00341796875, + "grad_norm": 1.9657773937285294, + "kl": 0.056396484375, + "learning_rate": 7.492675781249999e-07, + "loss": 0.0023, + "reward": 1.8031712174415588, + "reward_std": 0.11098561063408852, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8109836876392365, + "step": 2054 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.2265625, + "epoch": 1.00390625, + "grad_norm": 1.6895047631875901, + "kl": 0.053955078125, + "learning_rate": 7.491455078125e-07, + "loss": 0.0022, + "reward": 1.648207187652588, + "reward_std": 0.2681009843945503, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7028946876525879, + "step": 2055 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.71875, + "epoch": 1.00439453125, + "grad_norm": 3.0164245238907608, + "kl": 0.0562744140625, + "learning_rate": 7.490234375e-07, + "loss": 0.0023, + "reward": 1.7384542226791382, + "reward_std": 0.09973935224115849, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7462667226791382, + "step": 2056 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.0078125, + "epoch": 1.0048828125, + "grad_norm": 1.4926521960283892, + "kl": 0.0579833984375, + "learning_rate": 7.489013671875e-07, + "loss": 0.0023, + "reward": 1.8106223940849304, + "reward_std": 0.02411152981221676, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8106224536895752, + "step": 2057 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.171875, + "epoch": 1.00537109375, + "grad_norm": 1.418929150113678, + "kl": 0.0513916015625, + "learning_rate": 7.48779296875e-07, + "loss": 0.0021, + "reward": 1.733910322189331, + "reward_std": 0.11833417788147926, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7417227923870087, + "step": 2058 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.359375, + "epoch": 1.005859375, + "grad_norm": 2.1455363975809427, + "kl": 0.0540771484375, + "learning_rate": 7.486572265624999e-07, + "loss": 0.0022, + "reward": 1.7620959281921387, + "reward_std": 0.13864869251847267, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7933458983898163, + "step": 2059 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.7578125, + "epoch": 1.00634765625, + "grad_norm": 1.4711170006450227, + "kl": 0.0628662109375, + "learning_rate": 7.485351562499999e-07, + "loss": 0.0025, + "reward": 1.8471481800079346, + "reward_std": 0.04248751141130924, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8471481502056122, + "step": 2060 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.0859375, + "epoch": 1.0068359375, + "grad_norm": 1.6633546292914565, + "kl": 0.0538330078125, + "learning_rate": 7.484130859374999e-07, + "loss": 0.0022, + "reward": 1.6721433401107788, + "reward_std": 0.08107060939073563, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.695580929517746, + "step": 2061 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.359375, + "epoch": 1.00732421875, + "grad_norm": 2.7740994248622304, + "kl": 0.0533447265625, + "learning_rate": 7.48291015625e-07, + "loss": 0.0021, + "reward": 1.72020024061203, + "reward_std": 0.09270552173256874, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.72801274061203, + "step": 2062 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.21875, + "epoch": 1.0078125, + "grad_norm": 1.8930966049480564, + "kl": 0.0594482421875, + "learning_rate": 7.481689453125e-07, + "loss": 0.0024, + "reward": 1.8536216616630554, + "reward_std": 0.06580028869211674, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8692466914653778, + "step": 2063 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.9453125, + "epoch": 1.00830078125, + "grad_norm": 5.427251340055625, + "kl": 0.0599365234375, + "learning_rate": 7.48046875e-07, + "loss": 0.0024, + "reward": 1.766296148300171, + "reward_std": 0.07562026381492615, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7819211483001709, + "step": 2064 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.7265625, + "epoch": 1.0087890625, + "grad_norm": 1.6714670851072095, + "kl": 0.0499267578125, + "learning_rate": 7.479248046875e-07, + "loss": 0.002, + "reward": 1.8139212131500244, + "reward_std": 0.08759323135018349, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8217337727546692, + "step": 2065 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.5234375, + "epoch": 1.00927734375, + "grad_norm": 1.4835031644240106, + "kl": 0.0606689453125, + "learning_rate": 7.478027343749999e-07, + "loss": 0.0024, + "reward": 1.5658519268035889, + "reward_std": 0.17432072386145592, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.5971018970012665, + "step": 2066 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.8125, + "epoch": 1.009765625, + "grad_norm": 1.002420848386625, + "kl": 0.06494140625, + "learning_rate": 7.476806640624999e-07, + "loss": 0.0026, + "reward": 1.7776638269424438, + "reward_std": 0.08371632359921932, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7854763865470886, + "step": 2067 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.859375, + "epoch": 1.01025390625, + "grad_norm": 1.8316432776762526, + "kl": 0.055908203125, + "learning_rate": 7.4755859375e-07, + "loss": 0.0022, + "reward": 1.7354426980018616, + "reward_std": 0.08602847345173359, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.735442727804184, + "step": 2068 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.9609375, + "epoch": 1.0107421875, + "grad_norm": 0.8791339224441008, + "kl": 0.0540771484375, + "learning_rate": 7.474365234375e-07, + "loss": 0.0022, + "reward": 1.8300225734710693, + "reward_std": 0.035275645554065704, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.830022543668747, + "step": 2069 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.171875, + "epoch": 1.01123046875, + "grad_norm": 1.4982277249016536, + "kl": 0.04736328125, + "learning_rate": 7.47314453125e-07, + "loss": 0.0019, + "reward": 1.8361040353775024, + "reward_std": 0.03841123543679714, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8361040949821472, + "step": 2070 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.234375, + "epoch": 1.01171875, + "grad_norm": 0.6837506894975471, + "kl": 0.0616455078125, + "learning_rate": 7.471923828125e-07, + "loss": 0.0025, + "reward": 1.788848340511322, + "reward_std": 0.058198969811201096, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7888484299182892, + "step": 2071 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.6015625, + "epoch": 1.01220703125, + "grad_norm": 1.443895916653485, + "kl": 0.077392578125, + "learning_rate": 7.470703125e-07, + "loss": 0.0031, + "reward": 1.6802573204040527, + "reward_std": 0.11816703528165817, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6880699098110199, + "step": 2072 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.5859375, + "epoch": 1.0126953125, + "grad_norm": 6.059167846056267, + "kl": 0.065673828125, + "learning_rate": 7.469482421874999e-07, + "loss": 0.0026, + "reward": 1.7388933897018433, + "reward_std": 0.07620543800294399, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7388934195041656, + "step": 2073 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.5078125, + "epoch": 1.01318359375, + "grad_norm": 2.3817776057089093, + "kl": 0.06494140625, + "learning_rate": 7.468261718749999e-07, + "loss": 0.0026, + "reward": 1.7072933316230774, + "reward_std": 0.0740668810904026, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.707293301820755, + "step": 2074 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.8671875, + "epoch": 1.013671875, + "grad_norm": 1.3422401028685387, + "kl": 0.072998046875, + "learning_rate": 7.467041015625e-07, + "loss": 0.0029, + "reward": 1.733224332332611, + "reward_std": 0.06582791358232498, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7332243919372559, + "step": 2075 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.046875, + "epoch": 1.01416015625, + "grad_norm": 0.9975798038955767, + "kl": 0.061767578125, + "learning_rate": 7.4658203125e-07, + "loss": 0.0025, + "reward": 1.645218014717102, + "reward_std": 0.02710463386029005, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.645218014717102, + "step": 2076 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.125, + "epoch": 1.0146484375, + "grad_norm": 1.565205712470546, + "kl": 0.07861328125, + "learning_rate": 7.464599609375e-07, + "loss": 0.0031, + "reward": 1.6886191368103027, + "reward_std": 0.03423266182653606, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.688619077205658, + "step": 2077 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.8984375, + "epoch": 1.01513671875, + "grad_norm": 2.642889571657758, + "kl": 0.0645751953125, + "learning_rate": 7.46337890625e-07, + "loss": 0.0026, + "reward": 1.787465751171112, + "reward_std": 0.04049981106072664, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7874657511711121, + "step": 2078 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.75, + "epoch": 1.015625, + "grad_norm": 2.426800209345326, + "kl": 0.0562744140625, + "learning_rate": 7.462158203124999e-07, + "loss": 0.0023, + "reward": 1.71806001663208, + "reward_std": 0.030627473257482052, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7180599868297577, + "step": 2079 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.2890625, + "epoch": 1.01611328125, + "grad_norm": 5.675745821516389, + "kl": 0.062255859375, + "learning_rate": 7.460937499999999e-07, + "loss": 0.0025, + "reward": 1.7228577136993408, + "reward_std": 0.10195358097553253, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7306701838970184, + "step": 2080 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.0390625, + "epoch": 1.0166015625, + "grad_norm": 3.7013692307890995, + "kl": 0.060302734375, + "learning_rate": 7.459716796875e-07, + "loss": 0.0024, + "reward": 1.9432930946350098, + "reward_std": 0.15386238880455494, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9432931840419769, + "step": 2081 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.078125, + "epoch": 1.01708984375, + "grad_norm": 2.032809989173598, + "kl": 0.076416015625, + "learning_rate": 7.45849609375e-07, + "loss": 0.0031, + "reward": 1.7166752815246582, + "reward_std": 0.0454743467271328, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7166752815246582, + "step": 2082 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.65625, + "epoch": 1.017578125, + "grad_norm": 3.864692981065383, + "kl": 0.0621337890625, + "learning_rate": 7.457275390625e-07, + "loss": 0.0025, + "reward": 1.8227131962776184, + "reward_std": 0.0507346335798502, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.822713166475296, + "step": 2083 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.96875, + "epoch": 1.01806640625, + "grad_norm": 1.0624085888750547, + "kl": 0.05126953125, + "learning_rate": 7.4560546875e-07, + "loss": 0.0021, + "reward": 1.7786809802055359, + "reward_std": 0.10613211244344711, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7864934504032135, + "step": 2084 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.7265625, + "epoch": 1.0185546875, + "grad_norm": 2.255916713761177, + "kl": 0.075927734375, + "learning_rate": 7.454833984375e-07, + "loss": 0.003, + "reward": 1.7361916899681091, + "reward_std": 0.1337970271706581, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7440041303634644, + "step": 2085 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.0859375, + "epoch": 1.01904296875, + "grad_norm": 3.5672731701933262, + "kl": 0.0615234375, + "learning_rate": 7.453613281249999e-07, + "loss": 0.0025, + "reward": 1.834282636642456, + "reward_std": 0.04526693467050791, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8342825770378113, + "step": 2086 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.8515625, + "epoch": 1.01953125, + "grad_norm": 1.1390787112646121, + "kl": 0.0599365234375, + "learning_rate": 7.452392578124999e-07, + "loss": 0.0024, + "reward": 1.8966719508171082, + "reward_std": 0.14545264467597008, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.9044845402240753, + "step": 2087 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.3671875, + "epoch": 1.02001953125, + "grad_norm": 1.1424293003741923, + "kl": 0.05712890625, + "learning_rate": 7.451171875e-07, + "loss": 0.0023, + "reward": 1.776362955570221, + "reward_std": 0.08313069678843021, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7841754257678986, + "step": 2088 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.109375, + "epoch": 1.0205078125, + "grad_norm": 1.2961993858286134, + "kl": 0.0609130859375, + "learning_rate": 7.449951171875e-07, + "loss": 0.0024, + "reward": 1.7457592487335205, + "reward_std": 0.06624248251318932, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7457592785358429, + "step": 2089 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.46875, + "epoch": 1.02099609375, + "grad_norm": 2.9122243864865833, + "kl": 0.0662841796875, + "learning_rate": 7.44873046875e-07, + "loss": 0.0027, + "reward": 1.7659137845039368, + "reward_std": 0.09149673208594322, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7815387845039368, + "step": 2090 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.5703125, + "epoch": 1.021484375, + "grad_norm": 1.7837165721571753, + "kl": 0.0560302734375, + "learning_rate": 7.447509765625e-07, + "loss": 0.0022, + "reward": 1.7793264389038086, + "reward_std": 0.07757101766765118, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.779326468706131, + "step": 2091 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.6875, + "epoch": 1.02197265625, + "grad_norm": 2.6677175324191764, + "kl": 0.0672607421875, + "learning_rate": 7.446289062499999e-07, + "loss": 0.0027, + "reward": 1.7153024673461914, + "reward_std": 0.06006733886897564, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7153024673461914, + "step": 2092 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.6484375, + "epoch": 1.0224609375, + "grad_norm": 8.212754923875098, + "kl": 0.06787109375, + "learning_rate": 7.445068359374999e-07, + "loss": 0.0027, + "reward": 1.731951892375946, + "reward_std": 0.08415350876748562, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7319517731666565, + "step": 2093 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.8359375, + "epoch": 1.02294921875, + "grad_norm": 1.0081680404176827, + "kl": 0.059326171875, + "learning_rate": 7.44384765625e-07, + "loss": 0.0024, + "reward": 1.7743852734565735, + "reward_std": 0.06954375258646905, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7743852734565735, + "step": 2094 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.75, + "epoch": 1.0234375, + "grad_norm": 2.3423641729543694, + "kl": 0.07568359375, + "learning_rate": 7.442626953125e-07, + "loss": 0.003, + "reward": 1.8153335452079773, + "reward_std": 0.06889502890408039, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8153335452079773, + "step": 2095 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.1171875, + "epoch": 1.02392578125, + "grad_norm": 0.9005825292004632, + "kl": 0.06982421875, + "learning_rate": 7.44140625e-07, + "loss": 0.0028, + "reward": 1.7364252805709839, + "reward_std": 0.14264655858278275, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7676753103733063, + "step": 2096 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.9609375, + "epoch": 1.0244140625, + "grad_norm": 1.2457369377691487, + "kl": 0.0660400390625, + "learning_rate": 7.440185546875e-07, + "loss": 0.0026, + "reward": 1.8361563086509705, + "reward_std": 0.03824903070926666, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8361562192440033, + "step": 2097 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.1171875, + "epoch": 1.02490234375, + "grad_norm": 4.994694191481811, + "kl": 0.070556640625, + "learning_rate": 7.43896484375e-07, + "loss": 0.0028, + "reward": 1.7435556650161743, + "reward_std": 0.08704771101474762, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7513681650161743, + "step": 2098 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.4140625, + "epoch": 1.025390625, + "grad_norm": 1.291822341118298, + "kl": 0.069091796875, + "learning_rate": 7.437744140624999e-07, + "loss": 0.0028, + "reward": 1.729736089706421, + "reward_std": 0.05247452110052109, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7297360599040985, + "step": 2099 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.8671875, + "epoch": 1.02587890625, + "grad_norm": 1.6815056470609293, + "kl": 0.063720703125, + "learning_rate": 7.436523437499999e-07, + "loss": 0.0026, + "reward": 1.813611924648285, + "reward_std": 0.048070028424263, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8136118948459625, + "step": 2100 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.421875, + "epoch": 1.0263671875, + "grad_norm": 2.2241505771119563, + "kl": 0.07421875, + "learning_rate": 7.435302734375e-07, + "loss": 0.003, + "reward": 1.8664068579673767, + "reward_std": 0.04198060557246208, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8664068281650543, + "step": 2101 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.03125, + "epoch": 1.02685546875, + "grad_norm": 1.3461961566886906, + "kl": 0.0640869140625, + "learning_rate": 7.43408203125e-07, + "loss": 0.0026, + "reward": 1.7394928336143494, + "reward_std": 0.11257979273796082, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7473053336143494, + "step": 2102 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.8046875, + "epoch": 1.02734375, + "grad_norm": 0.7096896402429057, + "kl": 0.0537109375, + "learning_rate": 7.432861328125e-07, + "loss": 0.0022, + "reward": 1.7980987429618835, + "reward_std": 0.1013258621096611, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8137237429618835, + "step": 2103 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.5625, + "epoch": 1.02783203125, + "grad_norm": 0.9569277555942687, + "kl": 0.0828857421875, + "learning_rate": 7.431640625e-07, + "loss": 0.0033, + "reward": 1.8101829886436462, + "reward_std": 0.045853691175580025, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8101829886436462, + "step": 2104 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.3125, + "epoch": 1.0283203125, + "grad_norm": 4.832884831931728, + "kl": 0.0743408203125, + "learning_rate": 7.430419921874999e-07, + "loss": 0.003, + "reward": 1.7681997418403625, + "reward_std": 0.12344841938465834, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7760122716426849, + "step": 2105 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.640625, + "epoch": 1.02880859375, + "grad_norm": 3.3258791760247073, + "kl": 0.06689453125, + "learning_rate": 7.429199218749999e-07, + "loss": 0.0027, + "reward": 1.7421391010284424, + "reward_std": 0.05865258723497391, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7421391606330872, + "step": 2106 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.921875, + "epoch": 1.029296875, + "grad_norm": 2.6371942953766623, + "kl": 0.066650390625, + "learning_rate": 7.427978515625e-07, + "loss": 0.0027, + "reward": 1.6952205896377563, + "reward_std": 0.1647278480231762, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7420955300331116, + "step": 2107 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.5234375, + "epoch": 1.02978515625, + "grad_norm": 2.0605570090114793, + "kl": 0.058349609375, + "learning_rate": 7.4267578125e-07, + "loss": 0.0023, + "reward": 1.773497223854065, + "reward_std": 0.22887670993804932, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8047472238540649, + "step": 2108 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.0, + "epoch": 1.0302734375, + "grad_norm": 2.492828590978083, + "kl": 0.06640625, + "learning_rate": 7.425537109375e-07, + "loss": 0.0027, + "reward": 1.7053377032279968, + "reward_std": 0.1504954844713211, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7287752032279968, + "step": 2109 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.8984375, + "epoch": 1.03076171875, + "grad_norm": 2.696773965633565, + "kl": 0.07080078125, + "learning_rate": 7.42431640625e-07, + "loss": 0.0028, + "reward": 1.8146610260009766, + "reward_std": 0.12038443237543106, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8302860260009766, + "step": 2110 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.90625, + "epoch": 1.03125, + "grad_norm": 1.8989371313788723, + "kl": 0.0650634765625, + "learning_rate": 7.423095703125e-07, + "loss": 0.0026, + "reward": 1.7706368565559387, + "reward_std": 0.2233428657054901, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.8096993565559387, + "step": 2111 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.6640625, + "epoch": 1.03173828125, + "grad_norm": 3.6497635388956224, + "kl": 0.140625, + "learning_rate": 7.421874999999999e-07, + "loss": 0.0056, + "reward": 1.586828589439392, + "reward_std": 0.30886563658714294, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6493285894393921, + "step": 2112 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.25, + "epoch": 1.0322265625, + "grad_norm": 0.9780344434658866, + "kl": 0.06201171875, + "learning_rate": 7.420654296874999e-07, + "loss": 0.0025, + "reward": 1.62257981300354, + "reward_std": 0.3575499951839447, + "rewards/format_reward": 0.9140625, + "rewards/ocr_reward": 0.70851731300354, + "step": 2113 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.1640625, + "epoch": 1.03271484375, + "grad_norm": 5.949761627807109, + "kl": 0.065185546875, + "learning_rate": 7.41943359375e-07, + "loss": 0.0026, + "reward": 1.6299309730529785, + "reward_std": 0.39395518600940704, + "rewards/format_reward": 0.9140625, + "rewards/ocr_reward": 0.7158684730529785, + "step": 2114 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.59375, + "epoch": 1.033203125, + "grad_norm": 1.059347621749564, + "kl": 0.062744140625, + "learning_rate": 7.418212890625e-07, + "loss": 0.0025, + "reward": 1.7748578786849976, + "reward_std": 0.18243324011564255, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8061079382896423, + "step": 2115 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.53125, + "epoch": 1.03369140625, + "grad_norm": 0.6484997754740194, + "kl": 0.0679931640625, + "learning_rate": 7.4169921875e-07, + "loss": 0.0027, + "reward": 1.6713528037071228, + "reward_std": 0.351689875125885, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.7494778335094452, + "step": 2116 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.7109375, + "epoch": 1.0341796875, + "grad_norm": 2.682455320676119, + "kl": 0.0614013671875, + "learning_rate": 7.415771484375e-07, + "loss": 0.0025, + "reward": 1.7464085221290588, + "reward_std": 0.1380649134516716, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7698459923267365, + "step": 2117 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.53125, + "epoch": 1.03466796875, + "grad_norm": 0.9380873546654088, + "kl": 0.0721435546875, + "learning_rate": 7.414550781249999e-07, + "loss": 0.0029, + "reward": 1.8222784996032715, + "reward_std": 0.10763486847281456, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8379034996032715, + "step": 2118 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.125, + "epoch": 1.03515625, + "grad_norm": 1.7089694241065019, + "kl": 0.066650390625, + "learning_rate": 7.413330078124999e-07, + "loss": 0.0027, + "reward": 1.680859923362732, + "reward_std": 0.1279044784605503, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7121100127696991, + "step": 2119 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.1015625, + "epoch": 1.03564453125, + "grad_norm": 1.184804985444957, + "kl": 0.0596923828125, + "learning_rate": 7.412109375e-07, + "loss": 0.0024, + "reward": 1.7026260495185852, + "reward_std": 0.0845637135207653, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7104385793209076, + "step": 2120 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.8984375, + "epoch": 1.0361328125, + "grad_norm": 2.4689879281350584, + "kl": 0.0672607421875, + "learning_rate": 7.410888671875e-07, + "loss": 0.0027, + "reward": 1.6870477795600891, + "reward_std": 0.08825328946113586, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6948603093624115, + "step": 2121 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.4609375, + "epoch": 1.03662109375, + "grad_norm": 2.637185167724473, + "kl": 0.073974609375, + "learning_rate": 7.40966796875e-07, + "loss": 0.003, + "reward": 1.7651128768920898, + "reward_std": 0.04329609777778387, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7651128768920898, + "step": 2122 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.7109375, + "epoch": 1.037109375, + "grad_norm": 15.312775728411562, + "kl": 0.059814453125, + "learning_rate": 7.408447265625e-07, + "loss": 0.0024, + "reward": 1.7873907089233398, + "reward_std": 0.053318215534090996, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7873907387256622, + "step": 2123 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.375, + "epoch": 1.03759765625, + "grad_norm": 2.1162197557292513, + "kl": 0.086181640625, + "learning_rate": 7.4072265625e-07, + "loss": 0.0034, + "reward": 1.637177586555481, + "reward_std": 0.0317330677062273, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6371775567531586, + "step": 2124 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.6171875, + "epoch": 1.0380859375, + "grad_norm": 5.324096223818963, + "kl": 0.058837890625, + "learning_rate": 7.406005859374999e-07, + "loss": 0.0024, + "reward": 1.7509996891021729, + "reward_std": 0.06184336729347706, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7509996891021729, + "step": 2125 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.1875, + "epoch": 1.03857421875, + "grad_norm": 0.9586175770588368, + "kl": 0.0633544921875, + "learning_rate": 7.404785156249999e-07, + "loss": 0.0025, + "reward": 1.820866346359253, + "reward_std": 0.11166086047887802, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.8599288463592529, + "step": 2126 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.234375, + "epoch": 1.0390625, + "grad_norm": 1.9880497607902226, + "kl": 0.063232421875, + "learning_rate": 7.403564453125e-07, + "loss": 0.0025, + "reward": 1.682478904724121, + "reward_std": 0.03829295188188553, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6824789345264435, + "step": 2127 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.234375, + "epoch": 1.03955078125, + "grad_norm": 0.9645795198072094, + "kl": 0.0556640625, + "learning_rate": 7.40234375e-07, + "loss": 0.0022, + "reward": 1.6462610960006714, + "reward_std": 0.03332418855279684, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.646261066198349, + "step": 2128 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.3671875, + "epoch": 1.0400390625, + "grad_norm": 1.5537365872327011, + "kl": 0.072509765625, + "learning_rate": 7.401123046875e-07, + "loss": 0.0029, + "reward": 1.7381998300552368, + "reward_std": 0.12407108163461089, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.792887270450592, + "step": 2129 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.203125, + "epoch": 1.04052734375, + "grad_norm": 2.7636605699535606, + "kl": 0.0556640625, + "learning_rate": 7.39990234375e-07, + "loss": 0.0022, + "reward": 1.6674214005470276, + "reward_std": 0.03596335183829069, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.66742143034935, + "step": 2130 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.0859375, + "epoch": 1.041015625, + "grad_norm": 1.2451863266792458, + "kl": 0.07861328125, + "learning_rate": 7.398681640624999e-07, + "loss": 0.0031, + "reward": 1.7419597506523132, + "reward_std": 0.0864316001534462, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7497721910476685, + "step": 2131 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.0390625, + "epoch": 1.04150390625, + "grad_norm": 1.5936629511193108, + "kl": 0.0469970703125, + "learning_rate": 7.397460937499999e-07, + "loss": 0.0019, + "reward": 1.6851105093955994, + "reward_std": 0.0796204935759306, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.692922979593277, + "step": 2132 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.2421875, + "epoch": 1.0419921875, + "grad_norm": 3.690196248709246, + "kl": 0.0635986328125, + "learning_rate": 7.396240234375e-07, + "loss": 0.0025, + "reward": 1.7646411657333374, + "reward_std": 0.08438229188323021, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7724536657333374, + "step": 2133 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.53125, + "epoch": 1.04248046875, + "grad_norm": 2.1257395476547387, + "kl": 0.04931640625, + "learning_rate": 7.39501953125e-07, + "loss": 0.002, + "reward": 1.6398783326148987, + "reward_std": 0.19317952543497086, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.7180033326148987, + "step": 2134 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.9765625, + "epoch": 1.04296875, + "grad_norm": 0.8869364175450244, + "kl": 0.0697021484375, + "learning_rate": 7.393798828125e-07, + "loss": 0.0028, + "reward": 1.6968178749084473, + "reward_std": 0.11358075961470604, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7046304047107697, + "step": 2135 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.8828125, + "epoch": 1.04345703125, + "grad_norm": 2.2017880020005243, + "kl": 0.064453125, + "learning_rate": 7.392578125e-07, + "loss": 0.0026, + "reward": 1.795127511024475, + "reward_std": 0.044227102771401405, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7951274216175079, + "step": 2136 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.1328125, + "epoch": 1.0439453125, + "grad_norm": 2.79303089558888, + "kl": 0.0516357421875, + "learning_rate": 7.391357421875e-07, + "loss": 0.0021, + "reward": 1.7270812392234802, + "reward_std": 0.11675109714269638, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7348937392234802, + "step": 2137 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.0546875, + "epoch": 1.04443359375, + "grad_norm": 1.05482411179448, + "kl": 0.0499267578125, + "learning_rate": 7.390136718749999e-07, + "loss": 0.002, + "reward": 1.8569371104240417, + "reward_std": 0.027390625327825546, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8569370210170746, + "step": 2138 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.578125, + "epoch": 1.044921875, + "grad_norm": 3.373738723448789, + "kl": 0.0653076171875, + "learning_rate": 7.388916015624999e-07, + "loss": 0.0026, + "reward": 1.7933273315429688, + "reward_std": 0.04443395556882024, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7933273315429688, + "step": 2139 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.296875, + "epoch": 1.04541015625, + "grad_norm": 0.9983903579493524, + "kl": 0.046630859375, + "learning_rate": 7.3876953125e-07, + "loss": 0.0019, + "reward": 1.681038737297058, + "reward_std": 0.07797625940293074, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7201012670993805, + "step": 2140 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.15625, + "epoch": 1.0458984375, + "grad_norm": 1.8879030546706417, + "kl": 0.0511474609375, + "learning_rate": 7.386474609375e-07, + "loss": 0.002, + "reward": 1.6292105913162231, + "reward_std": 0.21559580974280834, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6838980913162231, + "step": 2141 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.0078125, + "epoch": 1.04638671875, + "grad_norm": 1.3052945256803228, + "kl": 0.0604248046875, + "learning_rate": 7.38525390625e-07, + "loss": 0.0024, + "reward": 1.658437967300415, + "reward_std": 0.11939615942537785, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6896880269050598, + "step": 2142 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.453125, + "epoch": 1.046875, + "grad_norm": 1.1710672837881737, + "kl": 0.0650634765625, + "learning_rate": 7.384033203125e-07, + "loss": 0.0026, + "reward": 1.8534250855445862, + "reward_std": 0.0744034256786108, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8612376153469086, + "step": 2143 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.9765625, + "epoch": 1.04736328125, + "grad_norm": 5.639567766118087, + "kl": 0.057861328125, + "learning_rate": 7.382812499999999e-07, + "loss": 0.0023, + "reward": 1.7248152494430542, + "reward_std": 0.13105768337845802, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7482527196407318, + "step": 2144 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.171875, + "epoch": 1.0478515625, + "grad_norm": 2.0263228158297935, + "kl": 0.0672607421875, + "learning_rate": 7.381591796874999e-07, + "loss": 0.0027, + "reward": 1.6987344622612, + "reward_std": 0.07333962060511112, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6987345218658447, + "step": 2145 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.328125, + "epoch": 1.04833984375, + "grad_norm": 0.9474657856304045, + "kl": 0.045654296875, + "learning_rate": 7.38037109375e-07, + "loss": 0.0018, + "reward": 1.8015679717063904, + "reward_std": 0.06693462654948235, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8093804717063904, + "step": 2146 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.84375, + "epoch": 1.048828125, + "grad_norm": 1.899611714790272, + "kl": 0.069580078125, + "learning_rate": 7.379150390625e-07, + "loss": 0.0028, + "reward": 1.776853621006012, + "reward_std": 0.07761351764202118, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7846660614013672, + "step": 2147 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.53125, + "epoch": 1.04931640625, + "grad_norm": 1.2684760918178732, + "kl": 0.072265625, + "learning_rate": 7.3779296875e-07, + "loss": 0.0029, + "reward": 1.7402900457382202, + "reward_std": 0.050347575917840004, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.740289956331253, + "step": 2148 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.546875, + "epoch": 1.0498046875, + "grad_norm": 0.8851627060755088, + "kl": 0.068603515625, + "learning_rate": 7.376708984375e-07, + "loss": 0.0028, + "reward": 1.66942298412323, + "reward_std": 0.06483565643429756, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6694230437278748, + "step": 2149 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.9765625, + "epoch": 1.05029296875, + "grad_norm": 1.0989385292696576, + "kl": 0.0732421875, + "learning_rate": 7.37548828125e-07, + "loss": 0.0029, + "reward": 1.734316349029541, + "reward_std": 0.10244572162628174, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7421287596225739, + "step": 2150 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.375, + "epoch": 1.05078125, + "grad_norm": 1.3053195164204594, + "kl": 0.0589599609375, + "learning_rate": 7.374267578124999e-07, + "loss": 0.0024, + "reward": 1.7859277725219727, + "reward_std": 0.09848207421600819, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8015527129173279, + "step": 2151 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.9609375, + "epoch": 1.05126953125, + "grad_norm": 4.695553247711008, + "kl": 0.0660400390625, + "learning_rate": 7.373046874999999e-07, + "loss": 0.0026, + "reward": 1.8197780847549438, + "reward_std": 0.04821081645786762, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8197780549526215, + "step": 2152 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.5703125, + "epoch": 1.0517578125, + "grad_norm": 1.4462058737833505, + "kl": 0.0604248046875, + "learning_rate": 7.371826171875e-07, + "loss": 0.0024, + "reward": 1.705611228942871, + "reward_std": 0.04024476930499077, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7056111991405487, + "step": 2153 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.7109375, + "epoch": 1.05224609375, + "grad_norm": 3.3498088491590505, + "kl": 0.05126953125, + "learning_rate": 7.37060546875e-07, + "loss": 0.002, + "reward": 1.7300852537155151, + "reward_std": 0.08649061527103186, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7378977537155151, + "step": 2154 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.6328125, + "epoch": 1.052734375, + "grad_norm": 1.7860979292641492, + "kl": 0.066650390625, + "learning_rate": 7.369384765625e-07, + "loss": 0.0027, + "reward": 1.749430775642395, + "reward_std": 0.05913347005844116, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.749430775642395, + "step": 2155 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.8828125, + "epoch": 1.05322265625, + "grad_norm": 14.752756244291026, + "kl": 0.0562744140625, + "learning_rate": 7.3681640625e-07, + "loss": 0.0023, + "reward": 1.8864418268203735, + "reward_std": 0.051150595769286156, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8942543268203735, + "step": 2156 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.8359375, + "epoch": 1.0537109375, + "grad_norm": 1.4880684525219599, + "kl": 0.065673828125, + "learning_rate": 7.366943359374999e-07, + "loss": 0.0026, + "reward": 1.7557436227798462, + "reward_std": 0.07325353659689426, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.763556182384491, + "step": 2157 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.8203125, + "epoch": 1.05419921875, + "grad_norm": 1.227305473803353, + "kl": 0.05078125, + "learning_rate": 7.365722656249999e-07, + "loss": 0.002, + "reward": 1.6383469700813293, + "reward_std": 0.07099130935966969, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6461593806743622, + "step": 2158 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.7109375, + "epoch": 1.0546875, + "grad_norm": 0.7035510885180098, + "kl": 0.0516357421875, + "learning_rate": 7.364501953124999e-07, + "loss": 0.0021, + "reward": 1.8514134287834167, + "reward_std": 0.047458380460739136, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8514134883880615, + "step": 2159 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.8984375, + "epoch": 1.05517578125, + "grad_norm": 1.1121154130874542, + "kl": 0.0587158203125, + "learning_rate": 7.36328125e-07, + "loss": 0.0023, + "reward": 1.8369617462158203, + "reward_std": 0.05468747764825821, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8369618058204651, + "step": 2160 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.53125, + "epoch": 1.0556640625, + "grad_norm": 3.7658263078887484, + "kl": 0.056640625, + "learning_rate": 7.362060546875e-07, + "loss": 0.0023, + "reward": 1.7871454954147339, + "reward_std": 0.10257207229733467, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7949579656124115, + "step": 2161 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.6171875, + "epoch": 1.05615234375, + "grad_norm": 1.1368885648860967, + "kl": 0.065673828125, + "learning_rate": 7.36083984375e-07, + "loss": 0.0026, + "reward": 1.685727596282959, + "reward_std": 0.13240405172109604, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.693540096282959, + "step": 2162 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.2421875, + "epoch": 1.056640625, + "grad_norm": 1.4737797498246885, + "kl": 0.070068359375, + "learning_rate": 7.359619140625e-07, + "loss": 0.0028, + "reward": 1.6595805883407593, + "reward_std": 0.034869059920310974, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6595805883407593, + "step": 2163 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.234375, + "epoch": 1.05712890625, + "grad_norm": 4.315861274519875, + "kl": 0.0625, + "learning_rate": 7.358398437499999e-07, + "loss": 0.0025, + "reward": 1.779079794883728, + "reward_std": 0.05369388684630394, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7790797650814056, + "step": 2164 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.6875, + "epoch": 1.0576171875, + "grad_norm": 2.32247382157244, + "kl": 0.0498046875, + "learning_rate": 7.357177734374999e-07, + "loss": 0.002, + "reward": 1.7220231294631958, + "reward_std": 0.049990251660346985, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7220230996608734, + "step": 2165 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.7109375, + "epoch": 1.05810546875, + "grad_norm": 1.3332007134810149, + "kl": 0.0616455078125, + "learning_rate": 7.35595703125e-07, + "loss": 0.0025, + "reward": 1.776341736316681, + "reward_std": 0.09242127742618322, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8075916767120361, + "step": 2166 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.34375, + "epoch": 1.05859375, + "grad_norm": 2.707627459785679, + "kl": 0.08056640625, + "learning_rate": 7.354736328125e-07, + "loss": 0.0032, + "reward": 1.7095162868499756, + "reward_std": 0.0930749960243702, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7173287868499756, + "step": 2167 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.8671875, + "epoch": 1.05908203125, + "grad_norm": 1.5790115563997662, + "kl": 0.0679931640625, + "learning_rate": 7.353515625e-07, + "loss": 0.0027, + "reward": 1.833968698978424, + "reward_std": 0.1615981161594391, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8574061989784241, + "step": 2168 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.4609375, + "epoch": 1.0595703125, + "grad_norm": 1.216637779595074, + "kl": 0.067138671875, + "learning_rate": 7.352294921875e-07, + "loss": 0.0027, + "reward": 1.8065576553344727, + "reward_std": 0.048122160136699677, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8065576553344727, + "step": 2169 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.9375, + "epoch": 1.06005859375, + "grad_norm": 1.7503160947547827, + "kl": 0.0791015625, + "learning_rate": 7.35107421875e-07, + "loss": 0.0032, + "reward": 1.676461935043335, + "reward_std": 0.07576654106378555, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.684274435043335, + "step": 2170 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.359375, + "epoch": 1.060546875, + "grad_norm": 2.082251913418121, + "kl": 0.07080078125, + "learning_rate": 7.349853515624999e-07, + "loss": 0.0028, + "reward": 1.5780660510063171, + "reward_std": 0.08183467015624046, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5780660212039948, + "step": 2171 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.2265625, + "epoch": 1.06103515625, + "grad_norm": 1.3618484884392383, + "kl": 0.066162109375, + "learning_rate": 7.348632812499999e-07, + "loss": 0.0026, + "reward": 1.6594600677490234, + "reward_std": 0.1468387171626091, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6985225975513458, + "step": 2172 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.78125, + "epoch": 1.0615234375, + "grad_norm": 1.1921095797514936, + "kl": 0.091552734375, + "learning_rate": 7.347412109375e-07, + "loss": 0.0037, + "reward": 1.7010453343391418, + "reward_std": 0.07318814843893051, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7088578939437866, + "step": 2173 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.5546875, + "epoch": 1.06201171875, + "grad_norm": 0.8033656102067087, + "kl": 0.0726318359375, + "learning_rate": 7.34619140625e-07, + "loss": 0.0029, + "reward": 1.5880872011184692, + "reward_std": 0.21278053149580956, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6271496415138245, + "step": 2174 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.2109375, + "epoch": 1.0625, + "grad_norm": 1.5869611446935907, + "kl": 0.07666015625, + "learning_rate": 7.344970703125e-07, + "loss": 0.0031, + "reward": 1.7660531997680664, + "reward_std": 0.09016413614153862, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7738656997680664, + "step": 2175 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.4921875, + "epoch": 1.06298828125, + "grad_norm": 1.677072145006438, + "kl": 0.072021484375, + "learning_rate": 7.34375e-07, + "loss": 0.0029, + "reward": 1.797443151473999, + "reward_std": 0.06796230189502239, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7974432110786438, + "step": 2176 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.8515625, + "epoch": 1.0634765625, + "grad_norm": 1.3737869324375402, + "kl": 0.0567626953125, + "learning_rate": 7.342529296874999e-07, + "loss": 0.0023, + "reward": 1.7921919226646423, + "reward_std": 0.03785792738199234, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7921919226646423, + "step": 2177 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.09375, + "epoch": 1.06396484375, + "grad_norm": 0.8350475539832674, + "kl": 0.052734375, + "learning_rate": 7.341308593749999e-07, + "loss": 0.0021, + "reward": 1.8188701272010803, + "reward_std": 0.13417918607592583, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8423075675964355, + "step": 2178 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.6015625, + "epoch": 1.064453125, + "grad_norm": 2.0295192660753987, + "kl": 0.076171875, + "learning_rate": 7.340087890625e-07, + "loss": 0.003, + "reward": 1.6831302642822266, + "reward_std": 0.11725856736302376, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6987552344799042, + "step": 2179 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.7734375, + "epoch": 1.06494140625, + "grad_norm": 1.823650973787344, + "kl": 0.083251953125, + "learning_rate": 7.3388671875e-07, + "loss": 0.0033, + "reward": 1.677744746208191, + "reward_std": 0.09554462134838104, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6855571866035461, + "step": 2180 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.46875, + "epoch": 1.0654296875, + "grad_norm": 0.5930383971596412, + "kl": 0.059814453125, + "learning_rate": 7.337646484375e-07, + "loss": 0.0024, + "reward": 1.9693381786346436, + "reward_std": 0.1475313939154148, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 1.000588208436966, + "step": 2181 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.2890625, + "epoch": 1.06591796875, + "grad_norm": 0.8776224591700195, + "kl": 0.0567626953125, + "learning_rate": 7.33642578125e-07, + "loss": 0.0023, + "reward": 1.7541506886482239, + "reward_std": 0.04516553692519665, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7541506886482239, + "step": 2182 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.109375, + "epoch": 1.06640625, + "grad_norm": 1.6024822866707846, + "kl": 0.0609130859375, + "learning_rate": 7.335205078125e-07, + "loss": 0.0024, + "reward": 1.7111125588417053, + "reward_std": 0.055892692878842354, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7111125588417053, + "step": 2183 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.7421875, + "epoch": 1.06689453125, + "grad_norm": 1.2580164449344338, + "kl": 0.070068359375, + "learning_rate": 7.333984374999999e-07, + "loss": 0.0028, + "reward": 1.823473334312439, + "reward_std": 0.0914062550291419, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8312858641147614, + "step": 2184 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.5625, + "epoch": 1.0673828125, + "grad_norm": 2.0897595557191124, + "kl": 0.0743408203125, + "learning_rate": 7.332763671874999e-07, + "loss": 0.003, + "reward": 1.7943394780158997, + "reward_std": 0.071324672549963, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7943394482135773, + "step": 2185 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.859375, + "epoch": 1.06787109375, + "grad_norm": 1.711752883290858, + "kl": 0.072509765625, + "learning_rate": 7.33154296875e-07, + "loss": 0.0029, + "reward": 1.6719991564750671, + "reward_std": 0.05058279260993004, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6719991564750671, + "step": 2186 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.8359375, + "epoch": 1.068359375, + "grad_norm": 1.7191611604780395, + "kl": 0.064453125, + "learning_rate": 7.330322265625e-07, + "loss": 0.0026, + "reward": 1.7175182700157166, + "reward_std": 0.08080036751925945, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7253307402133942, + "step": 2187 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.0859375, + "epoch": 1.06884765625, + "grad_norm": 1.5113242303718457, + "kl": 0.0703125, + "learning_rate": 7.3291015625e-07, + "loss": 0.0028, + "reward": 1.8223052620887756, + "reward_std": 0.03729821881279349, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8223052620887756, + "step": 2188 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.984375, + "epoch": 1.0693359375, + "grad_norm": 1.8504864852407057, + "kl": 0.072998046875, + "learning_rate": 7.327880859375e-07, + "loss": 0.0029, + "reward": 1.8702041506767273, + "reward_std": 0.08449077978730202, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8702041506767273, + "step": 2189 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.265625, + "epoch": 1.06982421875, + "grad_norm": 1.8618054773039303, + "kl": 0.08642578125, + "learning_rate": 7.326660156249999e-07, + "loss": 0.0035, + "reward": 1.7093619108200073, + "reward_std": 0.04153325408697128, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7093620002269745, + "step": 2190 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.109375, + "epoch": 1.0703125, + "grad_norm": 2.721104333800809, + "kl": 0.078125, + "learning_rate": 7.325439453124999e-07, + "loss": 0.0031, + "reward": 1.7475308179855347, + "reward_std": 0.06819172203540802, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7475307881832123, + "step": 2191 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.4609375, + "epoch": 1.07080078125, + "grad_norm": 1.1564680853165057, + "kl": 0.0570068359375, + "learning_rate": 7.32421875e-07, + "loss": 0.0023, + "reward": 1.8769598603248596, + "reward_std": 0.027791874017566442, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8769599199295044, + "step": 2192 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.421875, + "epoch": 1.0712890625, + "grad_norm": 7.862477096986793, + "kl": 0.0655517578125, + "learning_rate": 7.322998046875e-07, + "loss": 0.0026, + "reward": 1.7873188257217407, + "reward_std": 0.1392914056777954, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8185688853263855, + "step": 2193 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.3359375, + "epoch": 1.07177734375, + "grad_norm": 2.5077562517083343, + "kl": 0.071044921875, + "learning_rate": 7.32177734375e-07, + "loss": 0.0028, + "reward": 1.8010922074317932, + "reward_std": 0.0915432795882225, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8010921478271484, + "step": 2194 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.109375, + "epoch": 1.072265625, + "grad_norm": 1.8689579905691132, + "kl": 0.075927734375, + "learning_rate": 7.320556640625e-07, + "loss": 0.003, + "reward": 1.7513406872749329, + "reward_std": 0.09631795436143875, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7513406872749329, + "step": 2195 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.1640625, + "epoch": 1.07275390625, + "grad_norm": 1.7191972586224031, + "kl": 0.0577392578125, + "learning_rate": 7.3193359375e-07, + "loss": 0.0023, + "reward": 1.7447272539138794, + "reward_std": 0.16263741254806519, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7759771645069122, + "step": 2196 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.796875, + "epoch": 1.0732421875, + "grad_norm": 8.308076466127147, + "kl": 0.071533203125, + "learning_rate": 7.318115234374999e-07, + "loss": 0.0029, + "reward": 1.70778489112854, + "reward_std": 0.11081130802631378, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.71559739112854, + "step": 2197 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.90625, + "epoch": 1.07373046875, + "grad_norm": 5.039129274024845, + "kl": 0.075439453125, + "learning_rate": 7.316894531249999e-07, + "loss": 0.003, + "reward": 1.765058159828186, + "reward_std": 0.10599537566304207, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7728706300258636, + "step": 2198 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.2109375, + "epoch": 1.07421875, + "grad_norm": 2.6078909888453574, + "kl": 0.071533203125, + "learning_rate": 7.315673828125e-07, + "loss": 0.0029, + "reward": 1.7972966432571411, + "reward_std": 0.05169523321092129, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7972966134548187, + "step": 2199 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.546875, + "epoch": 1.07470703125, + "grad_norm": 2.26953245897716, + "kl": 0.069580078125, + "learning_rate": 7.314453125e-07, + "loss": 0.0028, + "reward": 1.7383949160575867, + "reward_std": 0.08352330699563026, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7540199458599091, + "step": 2200 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.9375, + "epoch": 1.0751953125, + "grad_norm": 1.6816262192340925, + "kl": 0.0643310546875, + "learning_rate": 7.313232421875e-07, + "loss": 0.0026, + "reward": 1.6924183368682861, + "reward_std": 0.06870663538575172, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6924182772636414, + "step": 2201 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.75, + "epoch": 1.07568359375, + "grad_norm": 4.603195904454922, + "kl": 0.066162109375, + "learning_rate": 7.31201171875e-07, + "loss": 0.0026, + "reward": 1.8771857619285583, + "reward_std": 0.15731638204306364, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.900623232126236, + "step": 2202 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.4921875, + "epoch": 1.076171875, + "grad_norm": 9.605729154350279, + "kl": 0.0625, + "learning_rate": 7.310791015624999e-07, + "loss": 0.0025, + "reward": 1.7749759554862976, + "reward_std": 0.17696334049105644, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.80622598528862, + "step": 2203 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.359375, + "epoch": 1.07666015625, + "grad_norm": 2.172820429250745, + "kl": 0.0635986328125, + "learning_rate": 7.309570312499999e-07, + "loss": 0.0025, + "reward": 1.7500771880149841, + "reward_std": 0.04752637818455696, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7500771284103394, + "step": 2204 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.8984375, + "epoch": 1.0771484375, + "grad_norm": 1.4018219648742998, + "kl": 0.07421875, + "learning_rate": 7.308349609375e-07, + "loss": 0.003, + "reward": 1.7220321893692017, + "reward_std": 0.11039461940526962, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7298446595668793, + "step": 2205 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.5546875, + "epoch": 1.07763671875, + "grad_norm": 0.8089076092208608, + "kl": 0.064208984375, + "learning_rate": 7.30712890625e-07, + "loss": 0.0026, + "reward": 1.7933659553527832, + "reward_std": 0.04852168867364526, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.793365865945816, + "step": 2206 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.34375, + "epoch": 1.078125, + "grad_norm": 1.005591084680332, + "kl": 0.0633544921875, + "learning_rate": 7.305908203125e-07, + "loss": 0.0025, + "reward": 1.637376844882965, + "reward_std": 0.13998160883784294, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6686268448829651, + "step": 2207 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.0546875, + "epoch": 1.07861328125, + "grad_norm": 0.8635824378840103, + "kl": 0.06103515625, + "learning_rate": 7.3046875e-07, + "loss": 0.0024, + "reward": 1.8138604164123535, + "reward_std": 0.016972179524600506, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8138603866100311, + "step": 2208 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.828125, + "epoch": 1.0791015625, + "grad_norm": 4.198636543673609, + "kl": 0.0733642578125, + "learning_rate": 7.303466796875e-07, + "loss": 0.0029, + "reward": 1.786705732345581, + "reward_std": 0.08687588106840849, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.794518232345581, + "step": 2209 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.015625, + "epoch": 1.07958984375, + "grad_norm": 3.47696025291616, + "kl": 0.0675048828125, + "learning_rate": 7.302246093749999e-07, + "loss": 0.0027, + "reward": 1.8078295588493347, + "reward_std": 0.08413361757993698, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8156421184539795, + "step": 2210 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.359375, + "epoch": 1.080078125, + "grad_norm": 3.8183450251736746, + "kl": 0.083740234375, + "learning_rate": 7.301025390624999e-07, + "loss": 0.0034, + "reward": 1.746371567249298, + "reward_std": 0.06931864097714424, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7463716566562653, + "step": 2211 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.0625, + "epoch": 1.08056640625, + "grad_norm": 4.84020947064792, + "kl": 0.06640625, + "learning_rate": 7.2998046875e-07, + "loss": 0.0027, + "reward": 1.7851145267486572, + "reward_std": 0.10719123855233192, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.816364586353302, + "step": 2212 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.9765625, + "epoch": 1.0810546875, + "grad_norm": 1.9009974118221236, + "kl": 0.0609130859375, + "learning_rate": 7.298583984375e-07, + "loss": 0.0024, + "reward": 1.6994649171829224, + "reward_std": 0.0930807814002037, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7072774171829224, + "step": 2213 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.7890625, + "epoch": 1.08154296875, + "grad_norm": 8.11275335382126, + "kl": 0.068115234375, + "learning_rate": 7.29736328125e-07, + "loss": 0.0027, + "reward": 1.7040226459503174, + "reward_std": 0.13488183170557022, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7274601459503174, + "step": 2214 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.3984375, + "epoch": 1.08203125, + "grad_norm": 2.137119825015467, + "kl": 0.07373046875, + "learning_rate": 7.296142578125e-07, + "loss": 0.0029, + "reward": 1.944073498249054, + "reward_std": 0.17653799057006836, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.9518861174583435, + "step": 2215 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.9375, + "epoch": 1.08251953125, + "grad_norm": 1.1241963585520527, + "kl": 0.072021484375, + "learning_rate": 7.294921874999999e-07, + "loss": 0.0029, + "reward": 1.6532188653945923, + "reward_std": 0.06231350637972355, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6610313355922699, + "step": 2216 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.546875, + "epoch": 1.0830078125, + "grad_norm": 2.7140903444876523, + "kl": 0.0634765625, + "learning_rate": 7.293701171874999e-07, + "loss": 0.0025, + "reward": 1.8621540069580078, + "reward_std": 0.038842491805553436, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8621540367603302, + "step": 2217 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.359375, + "epoch": 1.08349609375, + "grad_norm": 2.191163607204704, + "kl": 0.063720703125, + "learning_rate": 7.29248046875e-07, + "loss": 0.0026, + "reward": 1.8771589994430542, + "reward_std": 0.025433420203626156, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8771590292453766, + "step": 2218 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.3125, + "epoch": 1.083984375, + "grad_norm": 3.2019990973459733, + "kl": 0.064453125, + "learning_rate": 7.291259765625e-07, + "loss": 0.0026, + "reward": 1.6369213461875916, + "reward_std": 0.11391383782029152, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6447338461875916, + "step": 2219 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.75, + "epoch": 1.08447265625, + "grad_norm": 1.2457890025032394, + "kl": 0.07177734375, + "learning_rate": 7.2900390625e-07, + "loss": 0.0029, + "reward": 1.7395640015602112, + "reward_std": 0.06605142541229725, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7395639717578888, + "step": 2220 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.578125, + "epoch": 1.0849609375, + "grad_norm": 2.786622632317203, + "kl": 0.073974609375, + "learning_rate": 7.288818359375e-07, + "loss": 0.003, + "reward": 1.773886501789093, + "reward_std": 0.06187342666089535, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7738864719867706, + "step": 2221 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.6015625, + "epoch": 1.08544921875, + "grad_norm": 1.4337096043043174, + "kl": 0.067626953125, + "learning_rate": 7.28759765625e-07, + "loss": 0.0027, + "reward": 1.717581033706665, + "reward_std": 0.03319558780640364, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7175810039043427, + "step": 2222 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.96875, + "epoch": 1.0859375, + "grad_norm": 1.5681450356473852, + "kl": 0.056640625, + "learning_rate": 7.286376953124999e-07, + "loss": 0.0023, + "reward": 1.812729299068451, + "reward_std": 0.07597517222166061, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8127292990684509, + "step": 2223 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.9921875, + "epoch": 1.08642578125, + "grad_norm": 1.4098337592431196, + "kl": 0.07763671875, + "learning_rate": 7.285156249999999e-07, + "loss": 0.0031, + "reward": 1.8502396941184998, + "reward_std": 0.03288627602159977, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8502396941184998, + "step": 2224 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.7421875, + "epoch": 1.0869140625, + "grad_norm": 1.078144462327901, + "kl": 0.071044921875, + "learning_rate": 7.283935546875e-07, + "loss": 0.0028, + "reward": 1.8840885162353516, + "reward_std": 0.07063583564013243, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8919010162353516, + "step": 2225 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.9140625, + "epoch": 1.08740234375, + "grad_norm": 3.635268117383236, + "kl": 0.0703125, + "learning_rate": 7.28271484375e-07, + "loss": 0.0028, + "reward": 1.791632056236267, + "reward_std": 0.11924531310796738, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8150696158409119, + "step": 2226 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.9453125, + "epoch": 1.087890625, + "grad_norm": 2.4520320812851817, + "kl": 0.070068359375, + "learning_rate": 7.281494140625e-07, + "loss": 0.0028, + "reward": 1.7973357439041138, + "reward_std": 0.0609661303460598, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7973355948925018, + "step": 2227 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.953125, + "epoch": 1.08837890625, + "grad_norm": 3.1691940235472074, + "kl": 0.08837890625, + "learning_rate": 7.2802734375e-07, + "loss": 0.0035, + "reward": 1.6005699038505554, + "reward_std": 0.058797843754291534, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6083824634552002, + "step": 2228 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.8359375, + "epoch": 1.0888671875, + "grad_norm": 1.4854856618683832, + "kl": 0.069091796875, + "learning_rate": 7.279052734374999e-07, + "loss": 0.0028, + "reward": 1.8553495407104492, + "reward_std": 0.07698746025562286, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8553495407104492, + "step": 2229 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.7734375, + "epoch": 1.08935546875, + "grad_norm": 1.6672821205576769, + "kl": 0.079345703125, + "learning_rate": 7.277832031249999e-07, + "loss": 0.0032, + "reward": 1.7386040091514587, + "reward_std": 0.09536767937242985, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7542290091514587, + "step": 2230 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.9296875, + "epoch": 1.08984375, + "grad_norm": 3.060835361852339, + "kl": 0.0650634765625, + "learning_rate": 7.276611328125e-07, + "loss": 0.0026, + "reward": 1.7035446763038635, + "reward_std": 0.10484276339411736, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7426071465015411, + "step": 2231 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.453125, + "epoch": 1.09033203125, + "grad_norm": 2.383603260793994, + "kl": 0.0888671875, + "learning_rate": 7.275390625e-07, + "loss": 0.0036, + "reward": 1.7988132238388062, + "reward_std": 0.07059590518474579, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7988132238388062, + "step": 2232 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.7421875, + "epoch": 1.0908203125, + "grad_norm": 1.4816291696715755, + "kl": 0.0946044921875, + "learning_rate": 7.274169921875e-07, + "loss": 0.0038, + "reward": 1.8334471583366394, + "reward_std": 0.08519222773611546, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.856884628534317, + "step": 2233 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.78125, + "epoch": 1.09130859375, + "grad_norm": 1.8894162923830333, + "kl": 0.07861328125, + "learning_rate": 7.27294921875e-07, + "loss": 0.0031, + "reward": 1.6886449456214905, + "reward_std": 0.09802973223850131, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7042699754238129, + "step": 2234 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.0, + "epoch": 1.091796875, + "grad_norm": 1.062327646687958, + "kl": 0.0654296875, + "learning_rate": 7.271728515625e-07, + "loss": 0.0026, + "reward": 1.7099770307540894, + "reward_std": 0.0585494851693511, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7099769711494446, + "step": 2235 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.640625, + "epoch": 1.09228515625, + "grad_norm": 2.6614706971362003, + "kl": 0.0672607421875, + "learning_rate": 7.270507812499999e-07, + "loss": 0.0027, + "reward": 1.5697939991950989, + "reward_std": 0.12034578062593937, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5854189693927765, + "step": 2236 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.5390625, + "epoch": 1.0927734375, + "grad_norm": 3.4880008369377125, + "kl": 0.084228515625, + "learning_rate": 7.269287109374999e-07, + "loss": 0.0034, + "reward": 1.8251853585243225, + "reward_std": 0.03789713280275464, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8251853585243225, + "step": 2237 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.890625, + "epoch": 1.09326171875, + "grad_norm": 1.8948511963642394, + "kl": 0.091064453125, + "learning_rate": 7.26806640625e-07, + "loss": 0.0036, + "reward": 1.7872638702392578, + "reward_std": 0.08164054993540049, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8028888702392578, + "step": 2238 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.71875, + "epoch": 1.09375, + "grad_norm": 1.391838149766285, + "kl": 0.091796875, + "learning_rate": 7.266845703125e-07, + "loss": 0.0037, + "reward": 1.7387813925743103, + "reward_std": 0.18217945843935013, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7622189223766327, + "step": 2239 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.0546875, + "epoch": 1.09423828125, + "grad_norm": 0.7556167192817572, + "kl": 0.083740234375, + "learning_rate": 7.265625e-07, + "loss": 0.0033, + "reward": 1.7732893228530884, + "reward_std": 0.03759356215596199, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7732893526554108, + "step": 2240 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.28125, + "epoch": 1.0947265625, + "grad_norm": 2.16796468008093, + "kl": 0.079833984375, + "learning_rate": 7.264404296875e-07, + "loss": 0.0032, + "reward": 1.8013297319412231, + "reward_std": 0.15706830099225044, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8247672319412231, + "step": 2241 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.2578125, + "epoch": 1.09521484375, + "grad_norm": 0.929112376338198, + "kl": 0.065185546875, + "learning_rate": 7.263183593749999e-07, + "loss": 0.0026, + "reward": 1.570958137512207, + "reward_std": 0.17315081879496574, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.602208137512207, + "step": 2242 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.65625, + "epoch": 1.095703125, + "grad_norm": 1.2746975120920747, + "kl": 0.0648193359375, + "learning_rate": 7.261962890624999e-07, + "loss": 0.0026, + "reward": 1.7777118682861328, + "reward_std": 0.06291940249502659, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7777118384838104, + "step": 2243 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.734375, + "epoch": 1.09619140625, + "grad_norm": 1.7464595790772883, + "kl": 0.070556640625, + "learning_rate": 7.2607421875e-07, + "loss": 0.0028, + "reward": 1.8250086903572083, + "reward_std": 0.1407541036605835, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8406336605548859, + "step": 2244 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.875, + "epoch": 1.0966796875, + "grad_norm": 1.8014959178143817, + "kl": 0.0888671875, + "learning_rate": 7.259521484375e-07, + "loss": 0.0035, + "reward": 1.7720280885696411, + "reward_std": 0.1236952543258667, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7798406481742859, + "step": 2245 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.4765625, + "epoch": 1.09716796875, + "grad_norm": 1.692290936562916, + "kl": 0.07861328125, + "learning_rate": 7.25830078125e-07, + "loss": 0.0032, + "reward": 1.8413395881652832, + "reward_std": 0.05821367911994457, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8413394689559937, + "step": 2246 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.640625, + "epoch": 1.09765625, + "grad_norm": 3.439561683238805, + "kl": 0.0521240234375, + "learning_rate": 7.257080078125e-07, + "loss": 0.0021, + "reward": 1.773597240447998, + "reward_std": 0.060536185279488564, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7735972106456757, + "step": 2247 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.5703125, + "epoch": 1.09814453125, + "grad_norm": 3.0080579318285294, + "kl": 0.072021484375, + "learning_rate": 7.255859375e-07, + "loss": 0.0029, + "reward": 1.6555711030960083, + "reward_std": 0.08909568935632706, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6555710732936859, + "step": 2248 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.2578125, + "epoch": 1.0986328125, + "grad_norm": 4.282105322707064, + "kl": 0.0579833984375, + "learning_rate": 7.254638671874999e-07, + "loss": 0.0023, + "reward": 1.7793956995010376, + "reward_std": 0.0656171664595604, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7793957591056824, + "step": 2249 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.7578125, + "epoch": 1.09912109375, + "grad_norm": 1.1360255555472931, + "kl": 0.079345703125, + "learning_rate": 7.253417968749999e-07, + "loss": 0.0032, + "reward": 1.7589207887649536, + "reward_std": 0.16139283776283264, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7823582887649536, + "step": 2250 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.578125, + "epoch": 1.099609375, + "grad_norm": 1.5781494589643414, + "kl": 0.06640625, + "learning_rate": 7.252197265625e-07, + "loss": 0.0027, + "reward": 1.781304121017456, + "reward_std": 0.15949422121047974, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8047415614128113, + "step": 2251 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.1484375, + "epoch": 1.10009765625, + "grad_norm": 2.2259355754649603, + "kl": 0.0631103515625, + "learning_rate": 7.2509765625e-07, + "loss": 0.0025, + "reward": 1.8216727375984192, + "reward_std": 0.07048023492097855, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8216726779937744, + "step": 2252 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.328125, + "epoch": 1.1005859375, + "grad_norm": 2.982676387228891, + "kl": 0.071533203125, + "learning_rate": 7.249755859375e-07, + "loss": 0.0029, + "reward": 1.5965590476989746, + "reward_std": 0.10238468833267689, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6043716222047806, + "step": 2253 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.8671875, + "epoch": 1.10107421875, + "grad_norm": 1.865700837610057, + "kl": 0.0584716796875, + "learning_rate": 7.24853515625e-07, + "loss": 0.0023, + "reward": 1.6730265617370605, + "reward_std": 0.1227874830365181, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6964640319347382, + "step": 2254 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.9453125, + "epoch": 1.1015625, + "grad_norm": 3.122567250012638, + "kl": 0.07275390625, + "learning_rate": 7.247314453125e-07, + "loss": 0.0029, + "reward": 1.7611711025238037, + "reward_std": 0.10396287217736244, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7767961025238037, + "step": 2255 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.015625, + "epoch": 1.10205078125, + "grad_norm": 0.9976986455436946, + "kl": 0.06396484375, + "learning_rate": 7.246093749999999e-07, + "loss": 0.0026, + "reward": 1.740997850894928, + "reward_std": 0.07786927185952663, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7409978210926056, + "step": 2256 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.4453125, + "epoch": 1.1025390625, + "grad_norm": 1.1672582198818704, + "kl": 0.065185546875, + "learning_rate": 7.244873046874999e-07, + "loss": 0.0026, + "reward": 1.817870855331421, + "reward_std": 0.0449886042624712, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8178708851337433, + "step": 2257 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.8203125, + "epoch": 1.10302734375, + "grad_norm": 3.1200822986848706, + "kl": 0.0601806640625, + "learning_rate": 7.24365234375e-07, + "loss": 0.0024, + "reward": 1.7606118321418762, + "reward_std": 0.05959334224462509, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7684243321418762, + "step": 2258 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.4921875, + "epoch": 1.103515625, + "grad_norm": 3.7522311592920388, + "kl": 0.0587158203125, + "learning_rate": 7.242431640625e-07, + "loss": 0.0024, + "reward": 1.800333023071289, + "reward_std": 0.08324110694229603, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8081456422805786, + "step": 2259 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.1484375, + "epoch": 1.10400390625, + "grad_norm": 2.944245123139588, + "kl": 0.05859375, + "learning_rate": 7.2412109375e-07, + "loss": 0.0023, + "reward": 1.8839264512062073, + "reward_std": 0.0873430147767067, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8839264810085297, + "step": 2260 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.4765625, + "epoch": 1.1044921875, + "grad_norm": 1.7540471861864702, + "kl": 0.0521240234375, + "learning_rate": 7.239990234375e-07, + "loss": 0.0021, + "reward": 1.7210676670074463, + "reward_std": 0.14550930261611938, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7445051968097687, + "step": 2261 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.7265625, + "epoch": 1.10498046875, + "grad_norm": 3.2632375700049128, + "kl": 0.0548095703125, + "learning_rate": 7.238769531249999e-07, + "loss": 0.0022, + "reward": 1.754812240600586, + "reward_std": 0.04835915379226208, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7548122107982635, + "step": 2262 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.734375, + "epoch": 1.10546875, + "grad_norm": 2.9460406884659176, + "kl": 0.0577392578125, + "learning_rate": 7.237548828124999e-07, + "loss": 0.0023, + "reward": 1.8352848291397095, + "reward_std": 0.16885582357645035, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8665347993373871, + "step": 2263 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.5, + "epoch": 1.10595703125, + "grad_norm": 1.0028134561756605, + "kl": 0.052001953125, + "learning_rate": 7.236328125e-07, + "loss": 0.0021, + "reward": 1.858300268650055, + "reward_std": 0.04617350362241268, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8583002388477325, + "step": 2264 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.625, + "epoch": 1.1064453125, + "grad_norm": 4.3938839661401135, + "kl": 0.0679931640625, + "learning_rate": 7.235107421875e-07, + "loss": 0.0027, + "reward": 1.7961427569389343, + "reward_std": 0.04284539166837931, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7961426973342896, + "step": 2265 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.609375, + "epoch": 1.10693359375, + "grad_norm": 1.1672021555808423, + "kl": 0.068115234375, + "learning_rate": 7.23388671875e-07, + "loss": 0.0027, + "reward": 1.7131685614585876, + "reward_std": 0.05846385471522808, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7131686210632324, + "step": 2266 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.4453125, + "epoch": 1.107421875, + "grad_norm": 0.856824937232512, + "kl": 0.06591796875, + "learning_rate": 7.232666015625e-07, + "loss": 0.0026, + "reward": 1.7381606698036194, + "reward_std": 0.17856748402118683, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7537856698036194, + "step": 2267 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.5703125, + "epoch": 1.10791015625, + "grad_norm": 3.9627794439298425, + "kl": 0.076416015625, + "learning_rate": 7.2314453125e-07, + "loss": 0.0031, + "reward": 1.6982629299163818, + "reward_std": 0.07101480662822723, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6982628703117371, + "step": 2268 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.5625, + "epoch": 1.1083984375, + "grad_norm": 1.914687836920094, + "kl": 0.072509765625, + "learning_rate": 7.230224609374999e-07, + "loss": 0.0029, + "reward": 1.6740421056747437, + "reward_std": 0.10210954397916794, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6740420460700989, + "step": 2269 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.359375, + "epoch": 1.10888671875, + "grad_norm": 1.1215367514845478, + "kl": 0.073486328125, + "learning_rate": 7.229003906249999e-07, + "loss": 0.0029, + "reward": 1.6971306204795837, + "reward_std": 0.11026806011795998, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7049430906772614, + "step": 2270 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.265625, + "epoch": 1.109375, + "grad_norm": 1.1173595552235849, + "kl": 0.0670166015625, + "learning_rate": 7.227783203125e-07, + "loss": 0.0027, + "reward": 1.8012632131576538, + "reward_std": 0.062459973618388176, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8090757131576538, + "step": 2271 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.8984375, + "epoch": 1.10986328125, + "grad_norm": 0.7894205502775127, + "kl": 0.0615234375, + "learning_rate": 7.2265625e-07, + "loss": 0.0025, + "reward": 1.776045799255371, + "reward_std": 0.04911462590098381, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7760457992553711, + "step": 2272 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.125, + "epoch": 1.1103515625, + "grad_norm": 1.0027845954069041, + "kl": 0.078857421875, + "learning_rate": 7.225341796875e-07, + "loss": 0.0032, + "reward": 1.722011387348175, + "reward_std": 0.07575457729399204, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7220114171504974, + "step": 2273 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.453125, + "epoch": 1.11083984375, + "grad_norm": 10.409760523516997, + "kl": 0.05712890625, + "learning_rate": 7.22412109375e-07, + "loss": 0.0023, + "reward": 1.7924315929412842, + "reward_std": 0.08167605847120285, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7924315631389618, + "step": 2274 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.5625, + "epoch": 1.111328125, + "grad_norm": 2.834746616866118, + "kl": 0.0628662109375, + "learning_rate": 7.222900390624999e-07, + "loss": 0.0025, + "reward": 1.8050841689109802, + "reward_std": 0.07527113519608974, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8128966391086578, + "step": 2275 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.0625, + "epoch": 1.11181640625, + "grad_norm": 2.7173474996855993, + "kl": 0.0703125, + "learning_rate": 7.221679687499999e-07, + "loss": 0.0028, + "reward": 1.7315622568130493, + "reward_std": 0.06561515107750893, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7393746674060822, + "step": 2276 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.0, + "epoch": 1.1123046875, + "grad_norm": 1.4927968621455245, + "kl": 0.068603515625, + "learning_rate": 7.220458984375e-07, + "loss": 0.0027, + "reward": 1.8361743092536926, + "reward_std": 0.03312414512038231, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8361742496490479, + "step": 2277 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.2734375, + "epoch": 1.11279296875, + "grad_norm": 3.339265407657078, + "kl": 0.080078125, + "learning_rate": 7.21923828125e-07, + "loss": 0.0032, + "reward": 1.728402554988861, + "reward_std": 0.13297371938824654, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7284024953842163, + "step": 2278 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.0625, + "epoch": 1.11328125, + "grad_norm": 6.372972770256252, + "kl": 0.083984375, + "learning_rate": 7.218017578125e-07, + "loss": 0.0034, + "reward": 1.7280917167663574, + "reward_std": 0.052303411066532135, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.728091686964035, + "step": 2279 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.84375, + "epoch": 1.11376953125, + "grad_norm": 4.351237154663654, + "kl": 0.088623046875, + "learning_rate": 7.216796875e-07, + "loss": 0.0035, + "reward": 1.7941367626190186, + "reward_std": 0.08051660470664501, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7941367924213409, + "step": 2280 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.8984375, + "epoch": 1.1142578125, + "grad_norm": 1.0055680467034633, + "kl": 0.0589599609375, + "learning_rate": 7.215576171875e-07, + "loss": 0.0024, + "reward": 1.7715181112289429, + "reward_std": 0.07638098672032356, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7715181410312653, + "step": 2281 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.2421875, + "epoch": 1.11474609375, + "grad_norm": 2.028756106382298, + "kl": 0.067626953125, + "learning_rate": 7.214355468749999e-07, + "loss": 0.0027, + "reward": 1.8816375732421875, + "reward_std": 0.13509927690029144, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8816376030445099, + "step": 2282 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.8515625, + "epoch": 1.115234375, + "grad_norm": 1.7934862249340708, + "kl": 0.070068359375, + "learning_rate": 7.213134765624999e-07, + "loss": 0.0028, + "reward": 1.7188656330108643, + "reward_std": 0.05575054790824652, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7188656330108643, + "step": 2283 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.9140625, + "epoch": 1.11572265625, + "grad_norm": 0.7312070262378836, + "kl": 0.07373046875, + "learning_rate": 7.2119140625e-07, + "loss": 0.003, + "reward": 1.8175573348999023, + "reward_std": 0.01690027490258217, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.81755730509758, + "step": 2284 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.078125, + "epoch": 1.1162109375, + "grad_norm": 2.030078969268682, + "kl": 0.05712890625, + "learning_rate": 7.210693359375e-07, + "loss": 0.0023, + "reward": 1.7928734421730042, + "reward_std": 0.06959575228393078, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7928734421730042, + "step": 2285 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.765625, + "epoch": 1.11669921875, + "grad_norm": 1.6694669880542703, + "kl": 0.085205078125, + "learning_rate": 7.20947265625e-07, + "loss": 0.0034, + "reward": 1.8147171139717102, + "reward_std": 0.11030293442308903, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8303420841693878, + "step": 2286 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.234375, + "epoch": 1.1171875, + "grad_norm": 0.9274138439502736, + "kl": 0.06103515625, + "learning_rate": 7.208251953125e-07, + "loss": 0.0024, + "reward": 1.8761171102523804, + "reward_std": 0.04183840565383434, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8761171698570251, + "step": 2287 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.671875, + "epoch": 1.11767578125, + "grad_norm": 13.021849727212583, + "kl": 0.07421875, + "learning_rate": 7.207031249999999e-07, + "loss": 0.003, + "reward": 1.7165254354476929, + "reward_std": 0.092881940305233, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7243379950523376, + "step": 2288 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.03125, + "epoch": 1.1181640625, + "grad_norm": 1.334539370731693, + "kl": 0.08642578125, + "learning_rate": 7.205810546874999e-07, + "loss": 0.0035, + "reward": 1.8177083730697632, + "reward_std": 0.09210111945867538, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8177083432674408, + "step": 2289 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.1796875, + "epoch": 1.11865234375, + "grad_norm": 1.9031635634435449, + "kl": 0.0635986328125, + "learning_rate": 7.20458984375e-07, + "loss": 0.0025, + "reward": 1.652937114238739, + "reward_std": 0.07407059520483017, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6529370546340942, + "step": 2290 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.296875, + "epoch": 1.119140625, + "grad_norm": 0.8218601392312246, + "kl": 0.060791015625, + "learning_rate": 7.203369140625e-07, + "loss": 0.0024, + "reward": 1.813718318939209, + "reward_std": 0.07145040668547153, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.813718318939209, + "step": 2291 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.5703125, + "epoch": 1.11962890625, + "grad_norm": 1.467119269254602, + "kl": 0.0623779296875, + "learning_rate": 7.2021484375e-07, + "loss": 0.0025, + "reward": 1.740653932094574, + "reward_std": 0.08345598913729191, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.740653932094574, + "step": 2292 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.6328125, + "epoch": 1.1201171875, + "grad_norm": 1.0241103270793768, + "kl": 0.0494384765625, + "learning_rate": 7.200927734375e-07, + "loss": 0.002, + "reward": 1.7661115527153015, + "reward_std": 0.14179787784814835, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7973615527153015, + "step": 2293 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.765625, + "epoch": 1.12060546875, + "grad_norm": 2.4337023505740434, + "kl": 0.057373046875, + "learning_rate": 7.19970703125e-07, + "loss": 0.0023, + "reward": 1.7818644642829895, + "reward_std": 0.13211066648364067, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7896769940853119, + "step": 2294 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.0859375, + "epoch": 1.12109375, + "grad_norm": 2.159922083493317, + "kl": 0.066650390625, + "learning_rate": 7.198486328124999e-07, + "loss": 0.0027, + "reward": 1.7772237658500671, + "reward_std": 0.08314445242285728, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7928487956523895, + "step": 2295 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.1328125, + "epoch": 1.12158203125, + "grad_norm": 1.7534330818508779, + "kl": 0.0537109375, + "learning_rate": 7.197265624999999e-07, + "loss": 0.0021, + "reward": 1.7676746845245361, + "reward_std": 0.09902366809546947, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.8067372441291809, + "step": 2296 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.7734375, + "epoch": 1.1220703125, + "grad_norm": 1.211513703798563, + "kl": 0.04931640625, + "learning_rate": 7.196044921875e-07, + "loss": 0.002, + "reward": 1.797850251197815, + "reward_std": 0.07715418934822083, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7978502511978149, + "step": 2297 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.0703125, + "epoch": 1.12255859375, + "grad_norm": 1.4152986860468773, + "kl": 0.0484619140625, + "learning_rate": 7.19482421875e-07, + "loss": 0.0019, + "reward": 1.9088245630264282, + "reward_std": 0.04718828946352005, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.908824622631073, + "step": 2298 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.6484375, + "epoch": 1.123046875, + "grad_norm": 1.3403931709546952, + "kl": 0.0687255859375, + "learning_rate": 7.193603515625e-07, + "loss": 0.0027, + "reward": 1.7832393050193787, + "reward_std": 0.06331999599933624, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7832392752170563, + "step": 2299 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.2421875, + "epoch": 1.12353515625, + "grad_norm": 2.1834678287173177, + "kl": 0.072998046875, + "learning_rate": 7.1923828125e-07, + "loss": 0.0029, + "reward": 1.7606186866760254, + "reward_std": 0.12387818098068237, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7684311270713806, + "step": 2300 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.921875, + "epoch": 1.1240234375, + "grad_norm": 1.5319637176279595, + "kl": 0.0673828125, + "learning_rate": 7.191162109374999e-07, + "loss": 0.0027, + "reward": 1.6937076449394226, + "reward_std": 0.09303374774754047, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6937075853347778, + "step": 2301 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.09375, + "epoch": 1.12451171875, + "grad_norm": 2.6885932706099895, + "kl": 0.0579833984375, + "learning_rate": 7.189941406249999e-07, + "loss": 0.0023, + "reward": 1.7834733128547668, + "reward_std": 0.042033152654767036, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7834733128547668, + "step": 2302 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.7109375, + "epoch": 1.125, + "grad_norm": 1.5126183880414965, + "kl": 0.0614013671875, + "learning_rate": 7.188720703125e-07, + "loss": 0.0025, + "reward": 1.7206319570541382, + "reward_std": 0.03807243797928095, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7206319570541382, + "step": 2303 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.953125, + "epoch": 1.12548828125, + "grad_norm": 2.3253192988082416, + "kl": 0.08740234375, + "learning_rate": 7.1875e-07, + "loss": 0.0035, + "reward": 1.697283923625946, + "reward_std": 0.038683134131133556, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.697283923625946, + "step": 2304 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.3125, + "epoch": 1.1259765625, + "grad_norm": 1.8971968478232022, + "kl": 0.0548095703125, + "learning_rate": 7.186279296875e-07, + "loss": 0.0022, + "reward": 1.5334055423736572, + "reward_std": 0.17225759476423264, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.595905601978302, + "step": 2305 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.265625, + "epoch": 1.12646484375, + "grad_norm": 2.1324487343914655, + "kl": 0.086669921875, + "learning_rate": 7.18505859375e-07, + "loss": 0.0035, + "reward": 1.7237046957015991, + "reward_std": 0.059116460382938385, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7237046360969543, + "step": 2306 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.015625, + "epoch": 1.126953125, + "grad_norm": 1.8565481261961394, + "kl": 0.079833984375, + "learning_rate": 7.183837890625e-07, + "loss": 0.0032, + "reward": 1.5828680992126465, + "reward_std": 0.043210539035499096, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5828680694103241, + "step": 2307 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.15625, + "epoch": 1.12744140625, + "grad_norm": 0.4105661860693138, + "kl": 0.0496826171875, + "learning_rate": 7.182617187499999e-07, + "loss": 0.002, + "reward": 1.7693156599998474, + "reward_std": 0.06646117940545082, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7771281003952026, + "step": 2308 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.3046875, + "epoch": 1.1279296875, + "grad_norm": 5.888256460420409, + "kl": 0.0693359375, + "learning_rate": 7.181396484374999e-07, + "loss": 0.0028, + "reward": 1.7534179091453552, + "reward_std": 0.0729428380727768, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7534180283546448, + "step": 2309 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.0, + "epoch": 1.12841796875, + "grad_norm": 1.3969601396928006, + "kl": 0.0654296875, + "learning_rate": 7.18017578125e-07, + "loss": 0.0026, + "reward": 1.7623464465141296, + "reward_std": 0.10179652273654938, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7779714465141296, + "step": 2310 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.9453125, + "epoch": 1.12890625, + "grad_norm": 1.5953364275642339, + "kl": 0.0555419921875, + "learning_rate": 7.178955078125e-07, + "loss": 0.0022, + "reward": 1.7922866940498352, + "reward_std": 0.032217446714639664, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7922867238521576, + "step": 2311 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.1953125, + "epoch": 1.12939453125, + "grad_norm": 1.7562620435246095, + "kl": 0.06103515625, + "learning_rate": 7.177734375e-07, + "loss": 0.0024, + "reward": 1.7869414687156677, + "reward_std": 0.06921962834894657, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7869414389133453, + "step": 2312 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.8671875, + "epoch": 1.1298828125, + "grad_norm": 0.6578555181903704, + "kl": 0.0479736328125, + "learning_rate": 7.176513671875e-07, + "loss": 0.0019, + "reward": 1.7676367163658142, + "reward_std": 0.0514880558475852, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7754492163658142, + "step": 2313 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.0, + "epoch": 1.13037109375, + "grad_norm": 6.75545813289656, + "kl": 0.0570068359375, + "learning_rate": 7.175292968749999e-07, + "loss": 0.0023, + "reward": 1.7726652026176453, + "reward_std": 0.03823528438806534, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7726651132106781, + "step": 2314 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.8828125, + "epoch": 1.130859375, + "grad_norm": 1.2309637241993088, + "kl": 0.05224609375, + "learning_rate": 7.174072265624999e-07, + "loss": 0.0021, + "reward": 1.7631428241729736, + "reward_std": 0.03950107842683792, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7631428837776184, + "step": 2315 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.3828125, + "epoch": 1.13134765625, + "grad_norm": 0.886493942657421, + "kl": 0.06494140625, + "learning_rate": 7.1728515625e-07, + "loss": 0.0026, + "reward": 1.791369915008545, + "reward_std": 0.06947879865765572, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7991823554039001, + "step": 2316 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.15625, + "epoch": 1.1318359375, + "grad_norm": 1.6449264544715965, + "kl": 0.076171875, + "learning_rate": 7.171630859375e-07, + "loss": 0.003, + "reward": 1.820662021636963, + "reward_std": 0.07318861410021782, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8206620216369629, + "step": 2317 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.15625, + "epoch": 1.13232421875, + "grad_norm": 2.512895927696398, + "kl": 0.095703125, + "learning_rate": 7.17041015625e-07, + "loss": 0.0038, + "reward": 1.6627293825149536, + "reward_std": 0.08104284480214119, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.662729412317276, + "step": 2318 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.03125, + "epoch": 1.1328125, + "grad_norm": 3.257367159754489, + "kl": 0.080810546875, + "learning_rate": 7.169189453125e-07, + "loss": 0.0032, + "reward": 1.7540556192398071, + "reward_std": 0.026656273752450943, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7540555894374847, + "step": 2319 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.09375, + "epoch": 1.13330078125, + "grad_norm": 2.3206529623904815, + "kl": 0.076416015625, + "learning_rate": 7.16796875e-07, + "loss": 0.0031, + "reward": 1.7328737378120422, + "reward_std": 0.06623134948313236, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7328737676143646, + "step": 2320 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.71875, + "epoch": 1.1337890625, + "grad_norm": 2.7928034489135585, + "kl": 0.0673828125, + "learning_rate": 7.166748046874999e-07, + "loss": 0.0027, + "reward": 1.7304103374481201, + "reward_std": 0.08190120384097099, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7304103970527649, + "step": 2321 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.1015625, + "epoch": 1.13427734375, + "grad_norm": 1.999816042395783, + "kl": 0.0556640625, + "learning_rate": 7.165527343749999e-07, + "loss": 0.0022, + "reward": 1.838409960269928, + "reward_std": 0.058572327718138695, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.838409960269928, + "step": 2322 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.8984375, + "epoch": 1.134765625, + "grad_norm": 1.9833817250272083, + "kl": 0.081298828125, + "learning_rate": 7.164306640625e-07, + "loss": 0.0032, + "reward": 1.7137295007705688, + "reward_std": 0.1032501645386219, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7137295603752136, + "step": 2323 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.0078125, + "epoch": 1.13525390625, + "grad_norm": 1.8188360125282768, + "kl": 0.068115234375, + "learning_rate": 7.1630859375e-07, + "loss": 0.0027, + "reward": 1.7000929713249207, + "reward_std": 0.02517910674214363, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.700093001127243, + "step": 2324 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.5703125, + "epoch": 1.1357421875, + "grad_norm": 8.765129216379982, + "kl": 0.0595703125, + "learning_rate": 7.161865234375e-07, + "loss": 0.0024, + "reward": 1.8266154527664185, + "reward_std": 0.05450385436415672, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8266153633594513, + "step": 2325 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.3828125, + "epoch": 1.13623046875, + "grad_norm": 2.8091258579231586, + "kl": 0.0650634765625, + "learning_rate": 7.16064453125e-07, + "loss": 0.0026, + "reward": 1.7895740866661072, + "reward_std": 0.13050633668899536, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7973865866661072, + "step": 2326 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.375, + "epoch": 1.13671875, + "grad_norm": 0.6840155091097224, + "kl": 0.06005859375, + "learning_rate": 7.159423828124999e-07, + "loss": 0.0024, + "reward": 1.7351736426353455, + "reward_std": 0.014591011684387922, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7351736426353455, + "step": 2327 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.015625, + "epoch": 1.13720703125, + "grad_norm": 1.4187623188678093, + "kl": 0.0736083984375, + "learning_rate": 7.158203124999999e-07, + "loss": 0.0029, + "reward": 1.7072078585624695, + "reward_std": 0.057367969304323196, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7072078585624695, + "step": 2328 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.71875, + "epoch": 1.1376953125, + "grad_norm": 0.9798231552523725, + "kl": 0.0670166015625, + "learning_rate": 7.156982421875e-07, + "loss": 0.0027, + "reward": 1.818799912929535, + "reward_std": 0.05518978089094162, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8187999427318573, + "step": 2329 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.6875, + "epoch": 1.13818359375, + "grad_norm": 2.8180791927814703, + "kl": 0.057373046875, + "learning_rate": 7.15576171875e-07, + "loss": 0.0023, + "reward": 1.7160167694091797, + "reward_std": 0.040740249678492546, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7160168290138245, + "step": 2330 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.4453125, + "epoch": 1.138671875, + "grad_norm": 2.169648391438765, + "kl": 0.0693359375, + "learning_rate": 7.154541015625e-07, + "loss": 0.0028, + "reward": 1.6581083536148071, + "reward_std": 0.11767758429050446, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6581082940101624, + "step": 2331 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.1875, + "epoch": 1.13916015625, + "grad_norm": 1.0398806310768511, + "kl": 0.056884765625, + "learning_rate": 7.1533203125e-07, + "loss": 0.0023, + "reward": 1.6300272941589355, + "reward_std": 0.08064482361078262, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6378397643566132, + "step": 2332 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.3359375, + "epoch": 1.1396484375, + "grad_norm": 2.2107422861873403, + "kl": 0.0625, + "learning_rate": 7.152099609375e-07, + "loss": 0.0025, + "reward": 1.5821011662483215, + "reward_std": 0.18704190105199814, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5899136066436768, + "step": 2333 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.625, + "epoch": 1.14013671875, + "grad_norm": 0.8138393830121889, + "kl": 0.050048828125, + "learning_rate": 7.150878906249999e-07, + "loss": 0.002, + "reward": 1.7018383741378784, + "reward_std": 0.10387471597641706, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7487134337425232, + "step": 2334 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.2109375, + "epoch": 1.140625, + "grad_norm": 1.7447653610028946, + "kl": 0.0712890625, + "learning_rate": 7.149658203124999e-07, + "loss": 0.0028, + "reward": 1.771507978439331, + "reward_std": 0.02958191279321909, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7715080082416534, + "step": 2335 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.3203125, + "epoch": 1.14111328125, + "grad_norm": 0.8398042884116915, + "kl": 0.04736328125, + "learning_rate": 7.1484375e-07, + "loss": 0.0019, + "reward": 1.6066496968269348, + "reward_std": 0.1151208933442831, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6378996670246124, + "step": 2336 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.6953125, + "epoch": 1.1416015625, + "grad_norm": 4.216576593319963, + "kl": 0.0782470703125, + "learning_rate": 7.147216796875e-07, + "loss": 0.0031, + "reward": 1.6847835779190063, + "reward_std": 0.06190246529877186, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6847835183143616, + "step": 2337 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.1015625, + "epoch": 1.14208984375, + "grad_norm": 1.4146700885087296, + "kl": 0.0638427734375, + "learning_rate": 7.14599609375e-07, + "loss": 0.0026, + "reward": 1.741209328174591, + "reward_std": 0.06798835471272469, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7412092983722687, + "step": 2338 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.3125, + "epoch": 1.142578125, + "grad_norm": 0.8804028276553788, + "kl": 0.0650634765625, + "learning_rate": 7.144775390625e-07, + "loss": 0.0026, + "reward": 1.9364939332008362, + "reward_std": 0.07677973434329033, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.9443064332008362, + "step": 2339 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.8984375, + "epoch": 1.14306640625, + "grad_norm": 3.528715281639079, + "kl": 0.0560302734375, + "learning_rate": 7.143554687499999e-07, + "loss": 0.0022, + "reward": 1.7230896353721619, + "reward_std": 0.11856443714350462, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7387146353721619, + "step": 2340 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.4375, + "epoch": 1.1435546875, + "grad_norm": 2.133887430565937, + "kl": 0.103515625, + "learning_rate": 7.142333984374999e-07, + "loss": 0.0041, + "reward": 1.8173925876617432, + "reward_std": 0.05221419036388397, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8252050876617432, + "step": 2341 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.265625, + "epoch": 1.14404296875, + "grad_norm": 2.0667954941578897, + "kl": 0.061767578125, + "learning_rate": 7.14111328125e-07, + "loss": 0.0025, + "reward": 1.8616973161697388, + "reward_std": 0.0397907979786396, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8616973757743835, + "step": 2342 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.5703125, + "epoch": 1.14453125, + "grad_norm": 1.1764096283379186, + "kl": 0.06494140625, + "learning_rate": 7.139892578125e-07, + "loss": 0.0026, + "reward": 1.6104283928871155, + "reward_std": 0.1447310373187065, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6338659226894379, + "step": 2343 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.2265625, + "epoch": 1.14501953125, + "grad_norm": 1.5286119984509583, + "kl": 0.09033203125, + "learning_rate": 7.138671875e-07, + "loss": 0.0036, + "reward": 1.5864279866218567, + "reward_std": 0.05582820437848568, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5864280462265015, + "step": 2344 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.3125, + "epoch": 1.1455078125, + "grad_norm": 1.7441444479284598, + "kl": 0.050537109375, + "learning_rate": 7.137451171875e-07, + "loss": 0.002, + "reward": 1.8168761134147644, + "reward_std": 0.10111106187105179, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8246885538101196, + "step": 2345 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.4609375, + "epoch": 1.14599609375, + "grad_norm": 0.7495543880714017, + "kl": 0.05126953125, + "learning_rate": 7.13623046875e-07, + "loss": 0.0021, + "reward": 1.703368902206421, + "reward_std": 0.0950179323554039, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7111814320087433, + "step": 2346 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.359375, + "epoch": 1.146484375, + "grad_norm": 2.3548131843986604, + "kl": 0.06591796875, + "learning_rate": 7.135009765624999e-07, + "loss": 0.0026, + "reward": 1.7099797129631042, + "reward_std": 0.1034369133412838, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7177922427654266, + "step": 2347 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.3984375, + "epoch": 1.14697265625, + "grad_norm": 1.4519222766859223, + "kl": 0.0609130859375, + "learning_rate": 7.133789062499999e-07, + "loss": 0.0024, + "reward": 1.7761664390563965, + "reward_std": 0.05691366642713547, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7761664390563965, + "step": 2348 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.1015625, + "epoch": 1.1474609375, + "grad_norm": 2.4476278864349985, + "kl": 0.0703125, + "learning_rate": 7.132568359375e-07, + "loss": 0.0028, + "reward": 1.6659197211265564, + "reward_std": 0.09462928026914597, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6659197509288788, + "step": 2349 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.3515625, + "epoch": 1.14794921875, + "grad_norm": 17.700669961292856, + "kl": 0.05859375, + "learning_rate": 7.13134765625e-07, + "loss": 0.0023, + "reward": 1.6661089062690735, + "reward_std": 0.1225300058722496, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6895464062690735, + "step": 2350 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.7265625, + "epoch": 1.1484375, + "grad_norm": 1.5194236550305051, + "kl": 0.0618896484375, + "learning_rate": 7.130126953125e-07, + "loss": 0.0025, + "reward": 1.8306081295013428, + "reward_std": 0.0984015129506588, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8384206891059875, + "step": 2351 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.96875, + "epoch": 1.14892578125, + "grad_norm": 4.927122085791297, + "kl": 0.055908203125, + "learning_rate": 7.12890625e-07, + "loss": 0.0022, + "reward": 1.7711811065673828, + "reward_std": 0.0485275574028492, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7711811363697052, + "step": 2352 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.6328125, + "epoch": 1.1494140625, + "grad_norm": 1.2056839611949453, + "kl": 0.0712890625, + "learning_rate": 7.127685546875e-07, + "loss": 0.0028, + "reward": 1.8243648409843445, + "reward_std": 0.027884284034371376, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8243648409843445, + "step": 2353 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.109375, + "epoch": 1.14990234375, + "grad_norm": 2.287808978760034, + "kl": 0.075927734375, + "learning_rate": 7.126464843749999e-07, + "loss": 0.003, + "reward": 1.8395601511001587, + "reward_std": 0.017794081941246986, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8395601511001587, + "step": 2354 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.375, + "epoch": 1.150390625, + "grad_norm": 3.6128285986688082, + "kl": 0.068603515625, + "learning_rate": 7.125244140624999e-07, + "loss": 0.0027, + "reward": 1.7480557560920715, + "reward_std": 0.11520683020353317, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7636808156967163, + "step": 2355 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.0234375, + "epoch": 1.15087890625, + "grad_norm": 1.1324473137402855, + "kl": 0.04541015625, + "learning_rate": 7.1240234375e-07, + "loss": 0.0018, + "reward": 1.7983075976371765, + "reward_std": 0.07985487952828407, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7983075678348541, + "step": 2356 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.546875, + "epoch": 1.1513671875, + "grad_norm": 1.8359334191454069, + "kl": 0.08544921875, + "learning_rate": 7.122802734375e-07, + "loss": 0.0034, + "reward": 1.6366276741027832, + "reward_std": 0.029252098873257637, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6366276144981384, + "step": 2357 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.4296875, + "epoch": 1.15185546875, + "grad_norm": 4.262206911923877, + "kl": 0.0587158203125, + "learning_rate": 7.12158203125e-07, + "loss": 0.0023, + "reward": 1.7909355163574219, + "reward_std": 0.05595472827553749, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7909355759620667, + "step": 2358 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.3515625, + "epoch": 1.15234375, + "grad_norm": 7.9709277642475715, + "kl": 0.0582275390625, + "learning_rate": 7.120361328125e-07, + "loss": 0.0023, + "reward": 1.8145057559013367, + "reward_std": 0.11433425173163414, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8223182857036591, + "step": 2359 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.15625, + "epoch": 1.15283203125, + "grad_norm": 1.3114368895833535, + "kl": 0.053955078125, + "learning_rate": 7.119140624999999e-07, + "loss": 0.0022, + "reward": 1.6761323809623718, + "reward_std": 0.1001717671751976, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6995699405670166, + "step": 2360 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.5390625, + "epoch": 1.1533203125, + "grad_norm": 1.868219050538953, + "kl": 0.0592041015625, + "learning_rate": 7.117919921874999e-07, + "loss": 0.0024, + "reward": 1.7569094896316528, + "reward_std": 0.11767644435167313, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7647219896316528, + "step": 2361 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.0234375, + "epoch": 1.15380859375, + "grad_norm": 1.2672827889776317, + "kl": 0.0457763671875, + "learning_rate": 7.11669921875e-07, + "loss": 0.0018, + "reward": 1.7769032716751099, + "reward_std": 0.06862462218850851, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7769032120704651, + "step": 2362 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.6015625, + "epoch": 1.154296875, + "grad_norm": 1.2955402321867606, + "kl": 0.0517578125, + "learning_rate": 7.115478515625e-07, + "loss": 0.0021, + "reward": 1.7472400069236755, + "reward_std": 0.1712198220193386, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7784900069236755, + "step": 2363 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.0546875, + "epoch": 1.15478515625, + "grad_norm": 2.2402729174427516, + "kl": 0.06640625, + "learning_rate": 7.1142578125e-07, + "loss": 0.0026, + "reward": 1.7722741961479187, + "reward_std": 0.1572416089475155, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7878992259502411, + "step": 2364 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.7890625, + "epoch": 1.1552734375, + "grad_norm": 1.0208911322785654, + "kl": 0.066650390625, + "learning_rate": 7.113037109375e-07, + "loss": 0.0027, + "reward": 1.755761444568634, + "reward_std": 0.07006818428635597, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7557614147663116, + "step": 2365 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.25, + "epoch": 1.15576171875, + "grad_norm": 2.065142149687345, + "kl": 0.063720703125, + "learning_rate": 7.11181640625e-07, + "loss": 0.0025, + "reward": 1.7702951431274414, + "reward_std": 0.03799489140510559, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7781076431274414, + "step": 2366 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.5234375, + "epoch": 1.15625, + "grad_norm": 1.9266816192587584, + "kl": 0.067138671875, + "learning_rate": 7.110595703124999e-07, + "loss": 0.0027, + "reward": 1.733154058456421, + "reward_std": 0.04852524399757385, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7331540882587433, + "step": 2367 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.8828125, + "epoch": 1.15673828125, + "grad_norm": 1.268819129630388, + "kl": 0.076416015625, + "learning_rate": 7.109374999999999e-07, + "loss": 0.0031, + "reward": 1.727283000946045, + "reward_std": 0.04405433498322964, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7272829711437225, + "step": 2368 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.796875, + "epoch": 1.1572265625, + "grad_norm": 1.3015855264210114, + "kl": 0.056396484375, + "learning_rate": 7.108154296875e-07, + "loss": 0.0023, + "reward": 1.7930294871330261, + "reward_std": 0.056975074112415314, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7930294573307037, + "step": 2369 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.6796875, + "epoch": 1.15771484375, + "grad_norm": 1.4361230941581011, + "kl": 0.0504150390625, + "learning_rate": 7.10693359375e-07, + "loss": 0.002, + "reward": 1.783621370792389, + "reward_std": 0.04106577858328819, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7836213707923889, + "step": 2370 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.4140625, + "epoch": 1.158203125, + "grad_norm": 1.375569448092502, + "kl": 0.0589599609375, + "learning_rate": 7.105712890625e-07, + "loss": 0.0024, + "reward": 1.7498807311058044, + "reward_std": 0.06763119343668222, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7576932013034821, + "step": 2371 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.375, + "epoch": 1.15869140625, + "grad_norm": 0.5023786129439751, + "kl": 0.05517578125, + "learning_rate": 7.1044921875e-07, + "loss": 0.0022, + "reward": 1.8890994787216187, + "reward_std": 0.020692605525255203, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8890994787216187, + "step": 2372 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.3984375, + "epoch": 1.1591796875, + "grad_norm": 1.7393139909940287, + "kl": 0.081298828125, + "learning_rate": 7.103271484374999e-07, + "loss": 0.0032, + "reward": 1.7087842226028442, + "reward_std": 0.033358908258378506, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7087842524051666, + "step": 2373 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.7734375, + "epoch": 1.15966796875, + "grad_norm": 1.628089694830596, + "kl": 0.0567626953125, + "learning_rate": 7.102050781249999e-07, + "loss": 0.0023, + "reward": 1.7870800495147705, + "reward_std": 0.04601499065756798, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7870800197124481, + "step": 2374 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.046875, + "epoch": 1.16015625, + "grad_norm": 1.0821890704171715, + "kl": 0.0506591796875, + "learning_rate": 7.100830078125e-07, + "loss": 0.002, + "reward": 1.8657442927360535, + "reward_std": 0.10163949802517891, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8735567629337311, + "step": 2375 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.375, + "epoch": 1.16064453125, + "grad_norm": 1.5005134329516172, + "kl": 0.0582275390625, + "learning_rate": 7.099609375e-07, + "loss": 0.0023, + "reward": 1.7599137425422668, + "reward_std": 0.05832270160317421, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7599137425422668, + "step": 2376 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.4453125, + "epoch": 1.1611328125, + "grad_norm": 1.1183281293568572, + "kl": 0.0633544921875, + "learning_rate": 7.098388671875e-07, + "loss": 0.0025, + "reward": 1.8047245144844055, + "reward_std": 0.08250847831368446, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8047245144844055, + "step": 2377 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.28125, + "epoch": 1.16162109375, + "grad_norm": 1.5202076197640586, + "kl": 0.056396484375, + "learning_rate": 7.09716796875e-07, + "loss": 0.0023, + "reward": 1.9078629612922668, + "reward_std": 0.029499279335141182, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9078629016876221, + "step": 2378 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.4296875, + "epoch": 1.162109375, + "grad_norm": 1.806142323321523, + "kl": 0.0577392578125, + "learning_rate": 7.095947265625e-07, + "loss": 0.0023, + "reward": 1.755949079990387, + "reward_std": 0.09004699625074863, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.755949079990387, + "step": 2379 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.3359375, + "epoch": 1.16259765625, + "grad_norm": 2.5409784656610634, + "kl": 0.06298828125, + "learning_rate": 7.094726562499999e-07, + "loss": 0.0025, + "reward": 1.785530149936676, + "reward_std": 0.046783702448010445, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7855301201343536, + "step": 2380 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.84375, + "epoch": 1.1630859375, + "grad_norm": 1.5410866702591242, + "kl": 0.0582275390625, + "learning_rate": 7.093505859374999e-07, + "loss": 0.0023, + "reward": 1.6306228041648865, + "reward_std": 0.10624398104846478, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6462478041648865, + "step": 2381 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.1328125, + "epoch": 1.16357421875, + "grad_norm": 1.5076095428158183, + "kl": 0.0550537109375, + "learning_rate": 7.09228515625e-07, + "loss": 0.0022, + "reward": 1.8531184792518616, + "reward_std": 0.09160671941936016, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.860931009054184, + "step": 2382 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.921875, + "epoch": 1.1640625, + "grad_norm": 2.1539995428471905, + "kl": 0.070068359375, + "learning_rate": 7.091064453125e-07, + "loss": 0.0028, + "reward": 1.7696999311447144, + "reward_std": 0.10432733595371246, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7853248119354248, + "step": 2383 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.0, + "epoch": 1.16455078125, + "grad_norm": 2.777834781481418, + "kl": 0.05322265625, + "learning_rate": 7.08984375e-07, + "loss": 0.0021, + "reward": 1.7855026125907898, + "reward_std": 0.08844604343175888, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7933151125907898, + "step": 2384 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.21875, + "epoch": 1.1650390625, + "grad_norm": 2.3212115933760553, + "kl": 0.072021484375, + "learning_rate": 7.088623046875e-07, + "loss": 0.0029, + "reward": 1.658606767654419, + "reward_std": 0.13669633120298386, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6742317080497742, + "step": 2385 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.03125, + "epoch": 1.16552734375, + "grad_norm": 1.5471087574001265, + "kl": 0.061279296875, + "learning_rate": 7.087402343749999e-07, + "loss": 0.0024, + "reward": 1.7448172569274902, + "reward_std": 0.20727698504924774, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.783879816532135, + "step": 2386 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.953125, + "epoch": 1.166015625, + "grad_norm": 3.4097330170492905, + "kl": 0.0640869140625, + "learning_rate": 7.086181640624999e-07, + "loss": 0.0026, + "reward": 1.765123426914215, + "reward_std": 0.05697265453636646, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7651234269142151, + "step": 2387 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.265625, + "epoch": 1.16650390625, + "grad_norm": 2.5094396166271236, + "kl": 0.058837890625, + "learning_rate": 7.0849609375e-07, + "loss": 0.0024, + "reward": 1.8069196343421936, + "reward_std": 0.10948172211647034, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8147320747375488, + "step": 2388 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.0703125, + "epoch": 1.1669921875, + "grad_norm": 1.6288146794380969, + "kl": 0.073974609375, + "learning_rate": 7.083740234375e-07, + "loss": 0.003, + "reward": 1.7542518377304077, + "reward_std": 0.045296634547412395, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7542518377304077, + "step": 2389 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.3828125, + "epoch": 1.16748046875, + "grad_norm": 1.5399608721774982, + "kl": 0.0509033203125, + "learning_rate": 7.08251953125e-07, + "loss": 0.002, + "reward": 1.70972341299057, + "reward_std": 0.16795263439416885, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7331609427928925, + "step": 2390 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.4375, + "epoch": 1.16796875, + "grad_norm": 2.2999725959208788, + "kl": 0.0594482421875, + "learning_rate": 7.081298828125e-07, + "loss": 0.0024, + "reward": 1.8128122091293335, + "reward_std": 0.09195205383002758, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8128121495246887, + "step": 2391 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.0859375, + "epoch": 1.16845703125, + "grad_norm": 1.940816342699902, + "kl": 0.068359375, + "learning_rate": 7.080078125e-07, + "loss": 0.0027, + "reward": 1.747936189174652, + "reward_std": 0.08866530656814575, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7479361891746521, + "step": 2392 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.6640625, + "epoch": 1.1689453125, + "grad_norm": 1.765314724522433, + "kl": 0.052001953125, + "learning_rate": 7.078857421874999e-07, + "loss": 0.0021, + "reward": 1.709149956703186, + "reward_std": 0.06337589770555496, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.709149956703186, + "step": 2393 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.6015625, + "epoch": 1.16943359375, + "grad_norm": 7.062752985549045, + "kl": 0.07958984375, + "learning_rate": 7.077636718749999e-07, + "loss": 0.0032, + "reward": 1.6336697340011597, + "reward_std": 0.1141487006098032, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6414822340011597, + "step": 2394 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.6640625, + "epoch": 1.169921875, + "grad_norm": 1.0620017021682755, + "kl": 0.049560546875, + "learning_rate": 7.076416015625e-07, + "loss": 0.002, + "reward": 1.9040113687515259, + "reward_std": 0.05147293955087662, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.904011458158493, + "step": 2395 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.3125, + "epoch": 1.17041015625, + "grad_norm": 1.3276612147735944, + "kl": 0.075927734375, + "learning_rate": 7.0751953125e-07, + "loss": 0.003, + "reward": 1.7077276706695557, + "reward_std": 0.0732121616601944, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7077276408672333, + "step": 2396 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.671875, + "epoch": 1.1708984375, + "grad_norm": 1.4011487524978836, + "kl": 0.079833984375, + "learning_rate": 7.073974609375e-07, + "loss": 0.0032, + "reward": 1.7469829320907593, + "reward_std": 0.0623103235848248, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7469829320907593, + "step": 2397 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.90625, + "epoch": 1.17138671875, + "grad_norm": 2.1429163994559337, + "kl": 0.0740966796875, + "learning_rate": 7.07275390625e-07, + "loss": 0.003, + "reward": 1.8036177158355713, + "reward_std": 0.07194317691028118, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8036176562309265, + "step": 2398 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.75, + "epoch": 1.171875, + "grad_norm": 1.7405554380960897, + "kl": 0.066162109375, + "learning_rate": 7.071533203124999e-07, + "loss": 0.0026, + "reward": 1.6592023372650146, + "reward_std": 0.08969663083553314, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6592022776603699, + "step": 2399 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.328125, + "epoch": 1.17236328125, + "grad_norm": 1.2593548286372285, + "kl": 0.068115234375, + "learning_rate": 7.070312499999999e-07, + "loss": 0.0027, + "reward": 1.7829896211624146, + "reward_std": 0.0856513325124979, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7829896509647369, + "step": 2400 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.2109375, + "epoch": 1.1728515625, + "grad_norm": 3.1176602902157495, + "kl": 0.0859375, + "learning_rate": 7.069091796875e-07, + "loss": 0.0034, + "reward": 1.8581604957580566, + "reward_std": 0.10554312914609909, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.858160525560379, + "step": 2401 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.625, + "epoch": 1.17333984375, + "grad_norm": 1.3117450441494156, + "kl": 0.082763671875, + "learning_rate": 7.06787109375e-07, + "loss": 0.0033, + "reward": 1.706332802772522, + "reward_std": 0.04083455912768841, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.706332802772522, + "step": 2402 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.4140625, + "epoch": 1.173828125, + "grad_norm": 3.430690325991213, + "kl": 0.13623046875, + "learning_rate": 7.066650390625e-07, + "loss": 0.0055, + "reward": 1.6918965578079224, + "reward_std": 0.06476756557822227, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6918965578079224, + "step": 2403 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.0, + "epoch": 1.17431640625, + "grad_norm": 2.062517429740279, + "kl": 0.072509765625, + "learning_rate": 7.0654296875e-07, + "loss": 0.0029, + "reward": 1.6687769293785095, + "reward_std": 0.0822465568780899, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6687769889831543, + "step": 2404 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.5390625, + "epoch": 1.1748046875, + "grad_norm": 1.479713578737759, + "kl": 0.065673828125, + "learning_rate": 7.064208984375e-07, + "loss": 0.0026, + "reward": 1.843060851097107, + "reward_std": 0.05525344889611006, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8430608510971069, + "step": 2405 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.21875, + "epoch": 1.17529296875, + "grad_norm": 5.418876184032981, + "kl": 0.0703125, + "learning_rate": 7.062988281249999e-07, + "loss": 0.0028, + "reward": 1.7128131985664368, + "reward_std": 0.0804828368127346, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7128131687641144, + "step": 2406 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.625, + "epoch": 1.17578125, + "grad_norm": 2.7306643847833683, + "kl": 0.08154296875, + "learning_rate": 7.061767578124999e-07, + "loss": 0.0033, + "reward": 1.84114408493042, + "reward_std": 0.047078766860067844, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8411440849304199, + "step": 2407 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.4296875, + "epoch": 1.17626953125, + "grad_norm": 0.959063992930826, + "kl": 0.0633544921875, + "learning_rate": 7.060546875e-07, + "loss": 0.0025, + "reward": 1.8835274577140808, + "reward_std": 0.06786072719842196, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8913399577140808, + "step": 2408 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.578125, + "epoch": 1.1767578125, + "grad_norm": 0.8022740950274427, + "kl": 0.082275390625, + "learning_rate": 7.059326171875e-07, + "loss": 0.0033, + "reward": 1.8769137263298035, + "reward_std": 0.09892814233899117, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8847261667251587, + "step": 2409 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.8125, + "epoch": 1.17724609375, + "grad_norm": 1.4812068400181597, + "kl": 0.0830078125, + "learning_rate": 7.05810546875e-07, + "loss": 0.0033, + "reward": 1.8106178045272827, + "reward_std": 0.13285555690526962, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8262427449226379, + "step": 2410 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.421875, + "epoch": 1.177734375, + "grad_norm": 4.6584927782887195, + "kl": 0.06884765625, + "learning_rate": 7.056884765625e-07, + "loss": 0.0028, + "reward": 1.6145755648612976, + "reward_std": 0.12683077156543732, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.62238809466362, + "step": 2411 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.5859375, + "epoch": 1.17822265625, + "grad_norm": 1.3273325229855206, + "kl": 0.0693359375, + "learning_rate": 7.055664062499999e-07, + "loss": 0.0028, + "reward": 1.8337931036949158, + "reward_std": 0.06046081706881523, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8337931036949158, + "step": 2412 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.2265625, + "epoch": 1.1787109375, + "grad_norm": 1.1873786934092632, + "kl": 0.062744140625, + "learning_rate": 7.054443359374999e-07, + "loss": 0.0025, + "reward": 1.8494738936424255, + "reward_std": 0.07496330887079239, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8494738638401031, + "step": 2413 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.4375, + "epoch": 1.17919921875, + "grad_norm": 1.2850630040194209, + "kl": 0.0634765625, + "learning_rate": 7.05322265625e-07, + "loss": 0.0025, + "reward": 1.740599811077118, + "reward_std": 0.04363143816590309, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7405998110771179, + "step": 2414 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.8359375, + "epoch": 1.1796875, + "grad_norm": 2.6529425377978475, + "kl": 0.07861328125, + "learning_rate": 7.052001953125e-07, + "loss": 0.0032, + "reward": 1.8151870369911194, + "reward_std": 0.13191955909132957, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.822999507188797, + "step": 2415 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.4921875, + "epoch": 1.18017578125, + "grad_norm": 2.380781065882567, + "kl": 0.073974609375, + "learning_rate": 7.05078125e-07, + "loss": 0.003, + "reward": 1.8084670305252075, + "reward_std": 0.05959512945264578, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8084670305252075, + "step": 2416 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.46875, + "epoch": 1.1806640625, + "grad_norm": 1.633451395929759, + "kl": 0.0732421875, + "learning_rate": 7.049560546875e-07, + "loss": 0.0029, + "reward": 1.8302597403526306, + "reward_std": 0.10631529986858368, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8458847999572754, + "step": 2417 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.8984375, + "epoch": 1.18115234375, + "grad_norm": 1.095996447538376, + "kl": 0.065673828125, + "learning_rate": 7.04833984375e-07, + "loss": 0.0026, + "reward": 1.6789276599884033, + "reward_std": 0.07097472064197063, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6867401003837585, + "step": 2418 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.4375, + "epoch": 1.181640625, + "grad_norm": 0.6358463267113961, + "kl": 0.0692138671875, + "learning_rate": 7.047119140624999e-07, + "loss": 0.0028, + "reward": 1.7676212787628174, + "reward_std": 0.02877889759838581, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.767621248960495, + "step": 2419 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.0546875, + "epoch": 1.18212890625, + "grad_norm": 1.7201953005283817, + "kl": 0.06591796875, + "learning_rate": 7.045898437499999e-07, + "loss": 0.0026, + "reward": 1.6933047771453857, + "reward_std": 0.11904028803110123, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7089297771453857, + "step": 2420 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.84375, + "epoch": 1.1826171875, + "grad_norm": 1.642795231335406, + "kl": 0.076171875, + "learning_rate": 7.044677734375e-07, + "loss": 0.003, + "reward": 1.7209062576293945, + "reward_std": 0.08078465051949024, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7287188470363617, + "step": 2421 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.390625, + "epoch": 1.18310546875, + "grad_norm": 1.697016218007385, + "kl": 0.07861328125, + "learning_rate": 7.04345703125e-07, + "loss": 0.0031, + "reward": 1.7383880019187927, + "reward_std": 0.033076136372983456, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7383880317211151, + "step": 2422 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.96875, + "epoch": 1.18359375, + "grad_norm": 2.3807287354436486, + "kl": 0.0628662109375, + "learning_rate": 7.042236328125e-07, + "loss": 0.0025, + "reward": 1.785739779472351, + "reward_std": 0.0659907665103674, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7857397794723511, + "step": 2423 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.921875, + "epoch": 1.18408203125, + "grad_norm": 1.850170065076833, + "kl": 0.077880859375, + "learning_rate": 7.041015625e-07, + "loss": 0.0031, + "reward": 1.8572614789009094, + "reward_std": 0.03198802284896374, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8572614789009094, + "step": 2424 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.2578125, + "epoch": 1.1845703125, + "grad_norm": 1.250303267080643, + "kl": 0.0650634765625, + "learning_rate": 7.039794921874999e-07, + "loss": 0.0026, + "reward": 1.6902012825012207, + "reward_std": 0.04371343832463026, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6902012228965759, + "step": 2425 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.25, + "epoch": 1.18505859375, + "grad_norm": 5.468487040253603, + "kl": 0.0849609375, + "learning_rate": 7.038574218749999e-07, + "loss": 0.0034, + "reward": 1.557692527770996, + "reward_std": 0.11165288090705872, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.6280049979686737, + "step": 2426 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.5546875, + "epoch": 1.185546875, + "grad_norm": 1.456806917925311, + "kl": 0.066650390625, + "learning_rate": 7.037353515625e-07, + "loss": 0.0027, + "reward": 1.66695636510849, + "reward_std": 0.03848722204566002, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6669564247131348, + "step": 2427 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.078125, + "epoch": 1.18603515625, + "grad_norm": 2.9368301614032495, + "kl": 0.0667724609375, + "learning_rate": 7.0361328125e-07, + "loss": 0.0027, + "reward": 1.7350419759750366, + "reward_std": 0.09676255099475384, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7584794461727142, + "step": 2428 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.4375, + "epoch": 1.1865234375, + "grad_norm": 2.219443545795303, + "kl": 0.0638427734375, + "learning_rate": 7.034912109375e-07, + "loss": 0.0026, + "reward": 1.8254042863845825, + "reward_std": 0.0871292520314455, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8254042863845825, + "step": 2429 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.7265625, + "epoch": 1.18701171875, + "grad_norm": 1.0660266946811585, + "kl": 0.079345703125, + "learning_rate": 7.03369140625e-07, + "loss": 0.0032, + "reward": 1.7727646231651306, + "reward_std": 0.06067582964897156, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.772764652967453, + "step": 2430 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.8515625, + "epoch": 1.1875, + "grad_norm": 2.0455673981048337, + "kl": 0.069091796875, + "learning_rate": 7.032470703125e-07, + "loss": 0.0028, + "reward": 1.6600714921951294, + "reward_std": 0.054243333637714386, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6600715816020966, + "step": 2431 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.046875, + "epoch": 1.18798828125, + "grad_norm": 2.3469567235126343, + "kl": 0.074951171875, + "learning_rate": 7.031249999999999e-07, + "loss": 0.003, + "reward": 1.717573642730713, + "reward_std": 0.14434907957911491, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7410111129283905, + "step": 2432 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.8828125, + "epoch": 1.1884765625, + "grad_norm": 1.7159405891568082, + "kl": 0.08154296875, + "learning_rate": 7.030029296874999e-07, + "loss": 0.0033, + "reward": 1.7161504030227661, + "reward_std": 0.03489119280129671, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7161504626274109, + "step": 2433 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.6171875, + "epoch": 1.18896484375, + "grad_norm": 2.209863344385722, + "kl": 0.065185546875, + "learning_rate": 7.02880859375e-07, + "loss": 0.0026, + "reward": 1.7833570837974548, + "reward_std": 0.06764233857393265, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7833570539951324, + "step": 2434 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.4765625, + "epoch": 1.189453125, + "grad_norm": 5.044233806012345, + "kl": 0.05859375, + "learning_rate": 7.027587890625e-07, + "loss": 0.0023, + "reward": 1.771048367023468, + "reward_std": 0.033720131730660796, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.771048367023468, + "step": 2435 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.2265625, + "epoch": 1.18994140625, + "grad_norm": 2.5922434578602305, + "kl": 0.07177734375, + "learning_rate": 7.0263671875e-07, + "loss": 0.0029, + "reward": 1.6098366379737854, + "reward_std": 0.05740887112915516, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.609836757183075, + "step": 2436 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.0078125, + "epoch": 1.1904296875, + "grad_norm": 1.8713184275572499, + "kl": 0.080078125, + "learning_rate": 7.025146484375e-07, + "loss": 0.0032, + "reward": 1.7736141681671143, + "reward_std": 0.04480298818089068, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7736141979694366, + "step": 2437 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.2265625, + "epoch": 1.19091796875, + "grad_norm": 2.1515702226767512, + "kl": 0.0732421875, + "learning_rate": 7.02392578125e-07, + "loss": 0.0029, + "reward": 1.695095181465149, + "reward_std": 0.12620120495557785, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7263452112674713, + "step": 2438 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.546875, + "epoch": 1.19140625, + "grad_norm": 2.5367504438666413, + "kl": 0.0552978515625, + "learning_rate": 7.022705078124999e-07, + "loss": 0.0022, + "reward": 1.7602566480636597, + "reward_std": 0.11810046620666981, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.8149441182613373, + "step": 2439 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.3984375, + "epoch": 1.19189453125, + "grad_norm": 1.6103282302386244, + "kl": 0.0760498046875, + "learning_rate": 7.021484375e-07, + "loss": 0.003, + "reward": 1.5700552463531494, + "reward_std": 0.09616255201399326, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5856801867485046, + "step": 2440 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.2890625, + "epoch": 1.1923828125, + "grad_norm": 2.776703466020179, + "kl": 0.055908203125, + "learning_rate": 7.020263671875e-07, + "loss": 0.0022, + "reward": 1.7921187281608582, + "reward_std": 0.04691682942211628, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.792118638753891, + "step": 2441 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.8828125, + "epoch": 1.19287109375, + "grad_norm": 2.8510129285591836, + "kl": 0.0478515625, + "learning_rate": 7.01904296875e-07, + "loss": 0.0019, + "reward": 1.6040194630622864, + "reward_std": 0.1493111103773117, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.650894433259964, + "step": 2442 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.8359375, + "epoch": 1.193359375, + "grad_norm": 0.9498794503148597, + "kl": 0.068603515625, + "learning_rate": 7.017822265625e-07, + "loss": 0.0027, + "reward": 1.7161058187484741, + "reward_std": 0.09053925797343254, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7551683783531189, + "step": 2443 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.7734375, + "epoch": 1.19384765625, + "grad_norm": 1.1199023458800679, + "kl": 0.061279296875, + "learning_rate": 7.0166015625e-07, + "loss": 0.0025, + "reward": 1.8098965287208557, + "reward_std": 0.08372041955590248, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8177090287208557, + "step": 2444 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.6015625, + "epoch": 1.1943359375, + "grad_norm": 0.47829963771188727, + "kl": 0.0491943359375, + "learning_rate": 7.015380859374999e-07, + "loss": 0.002, + "reward": 1.6234931945800781, + "reward_std": 0.18474455177783966, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.6938056945800781, + "step": 2445 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.484375, + "epoch": 1.19482421875, + "grad_norm": 1.6276283737432735, + "kl": 0.0614013671875, + "learning_rate": 7.014160156249999e-07, + "loss": 0.0025, + "reward": 1.7953330874443054, + "reward_std": 0.08542214334011078, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8109579682350159, + "step": 2446 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.765625, + "epoch": 1.1953125, + "grad_norm": 2.960283467365649, + "kl": 0.070068359375, + "learning_rate": 7.012939453125e-07, + "loss": 0.0028, + "reward": 1.616200864315033, + "reward_std": 0.16303523629903793, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.678700864315033, + "step": 2447 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.4296875, + "epoch": 1.19580078125, + "grad_norm": 1.8931874672712756, + "kl": 0.070556640625, + "learning_rate": 7.01171875e-07, + "loss": 0.0028, + "reward": 1.7042686939239502, + "reward_std": 0.10324783250689507, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.712081253528595, + "step": 2448 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.7890625, + "epoch": 1.1962890625, + "grad_norm": 0.8846851645218154, + "kl": 0.045654296875, + "learning_rate": 7.010498046875e-07, + "loss": 0.0018, + "reward": 1.826434314250946, + "reward_std": 0.16191211715340614, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8576842248439789, + "step": 2449 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.3046875, + "epoch": 1.19677734375, + "grad_norm": 0.7349892567727097, + "kl": 0.0633544921875, + "learning_rate": 7.00927734375e-07, + "loss": 0.0025, + "reward": 1.8562174439430237, + "reward_std": 0.01508009573444724, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8562174439430237, + "step": 2450 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.4375, + "epoch": 1.197265625, + "grad_norm": 2.0795643682176217, + "kl": 0.07666015625, + "learning_rate": 7.008056640625e-07, + "loss": 0.0031, + "reward": 1.5798554420471191, + "reward_std": 0.09327048435807228, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6189179420471191, + "step": 2451 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.4296875, + "epoch": 1.19775390625, + "grad_norm": 2.43960100948424, + "kl": 0.0667724609375, + "learning_rate": 7.006835937499999e-07, + "loss": 0.0027, + "reward": 1.6752365827560425, + "reward_std": 0.16312190517783165, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6986740827560425, + "step": 2452 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.6953125, + "epoch": 1.1982421875, + "grad_norm": 1.783527107114228, + "kl": 0.0546875, + "learning_rate": 7.005615234374999e-07, + "loss": 0.0022, + "reward": 1.697092890739441, + "reward_std": 0.12336409464478493, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7127178907394409, + "step": 2453 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.796875, + "epoch": 1.19873046875, + "grad_norm": 1.4386375964027276, + "kl": 0.059814453125, + "learning_rate": 7.00439453125e-07, + "loss": 0.0024, + "reward": 1.8195868134498596, + "reward_std": 0.034168762154877186, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.819586843252182, + "step": 2454 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.0078125, + "epoch": 1.19921875, + "grad_norm": 1.0233839801880174, + "kl": 0.0572509765625, + "learning_rate": 7.003173828125e-07, + "loss": 0.0023, + "reward": 1.8272386193275452, + "reward_std": 0.05041295662522316, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8272385895252228, + "step": 2455 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.9453125, + "epoch": 1.19970703125, + "grad_norm": 1.721124868164957, + "kl": 0.0606689453125, + "learning_rate": 7.001953125e-07, + "loss": 0.0024, + "reward": 1.6735165119171143, + "reward_std": 0.0498051792383194, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6735165119171143, + "step": 2456 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.6640625, + "epoch": 1.2001953125, + "grad_norm": 4.63579264894847, + "kl": 0.0703125, + "learning_rate": 7.000732421875e-07, + "loss": 0.0028, + "reward": 1.8002795577049255, + "reward_std": 0.04197111213579774, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8002796173095703, + "step": 2457 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.3359375, + "epoch": 1.20068359375, + "grad_norm": 1.546489313566168, + "kl": 0.0576171875, + "learning_rate": 6.999511718749999e-07, + "loss": 0.0023, + "reward": 1.721842348575592, + "reward_std": 0.0830717384815216, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.721842348575592, + "step": 2458 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.6640625, + "epoch": 1.201171875, + "grad_norm": 5.094662458606525, + "kl": 0.0626220703125, + "learning_rate": 6.998291015624999e-07, + "loss": 0.0025, + "reward": 1.7491782903671265, + "reward_std": 0.08964913338422775, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7491783201694489, + "step": 2459 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.1640625, + "epoch": 1.20166015625, + "grad_norm": 1.1949232897338642, + "kl": 0.05615234375, + "learning_rate": 6.9970703125e-07, + "loss": 0.0022, + "reward": 1.8746750950813293, + "reward_std": 0.11067311465740204, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8824875354766846, + "step": 2460 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.4921875, + "epoch": 1.2021484375, + "grad_norm": 1.5899026672991512, + "kl": 0.08447265625, + "learning_rate": 6.995849609375e-07, + "loss": 0.0034, + "reward": 1.8029922246932983, + "reward_std": 0.06205196492373943, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8029922544956207, + "step": 2461 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.1953125, + "epoch": 1.20263671875, + "grad_norm": 2.103963718527489, + "kl": 0.0712890625, + "learning_rate": 6.99462890625e-07, + "loss": 0.0028, + "reward": 1.7411906719207764, + "reward_std": 0.07426265999674797, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7411905825138092, + "step": 2462 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.1484375, + "epoch": 1.203125, + "grad_norm": 1.2381185234846581, + "kl": 0.0516357421875, + "learning_rate": 6.993408203125e-07, + "loss": 0.0021, + "reward": 1.7142133712768555, + "reward_std": 0.07610474899411201, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7220259010791779, + "step": 2463 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.40625, + "epoch": 1.20361328125, + "grad_norm": 6.431279471704583, + "kl": 0.095703125, + "learning_rate": 6.9921875e-07, + "loss": 0.0038, + "reward": 1.6048645973205566, + "reward_std": 0.07959796488285065, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6048645377159119, + "step": 2464 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.3359375, + "epoch": 1.2041015625, + "grad_norm": 2.841010011660038, + "kl": 0.068603515625, + "learning_rate": 6.990966796874999e-07, + "loss": 0.0027, + "reward": 1.8495973944664001, + "reward_std": 0.05448159575462341, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8495973944664001, + "step": 2465 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.8671875, + "epoch": 1.20458984375, + "grad_norm": 0.8842872987899542, + "kl": 0.0574951171875, + "learning_rate": 6.989746093749999e-07, + "loss": 0.0023, + "reward": 1.8488314151763916, + "reward_std": 0.039236126467585564, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.848831444978714, + "step": 2466 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.640625, + "epoch": 1.205078125, + "grad_norm": 3.6653341295723876, + "kl": 0.072265625, + "learning_rate": 6.988525390625e-07, + "loss": 0.0029, + "reward": 1.7910266518592834, + "reward_std": 0.05401626043021679, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7910265922546387, + "step": 2467 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.6796875, + "epoch": 1.20556640625, + "grad_norm": 1.4644528281755704, + "kl": 0.078369140625, + "learning_rate": 6.9873046875e-07, + "loss": 0.0031, + "reward": 1.8185259103775024, + "reward_std": 0.01729111559689045, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8185259103775024, + "step": 2468 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.4453125, + "epoch": 1.2060546875, + "grad_norm": 0.8915286774826748, + "kl": 0.072021484375, + "learning_rate": 6.986083984375e-07, + "loss": 0.0029, + "reward": 1.7518900632858276, + "reward_std": 0.0792790362611413, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7909526228904724, + "step": 2469 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.515625, + "epoch": 1.20654296875, + "grad_norm": 1.9404027572704816, + "kl": 0.087158203125, + "learning_rate": 6.98486328125e-07, + "loss": 0.0035, + "reward": 1.7426846027374268, + "reward_std": 0.08018626365810633, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7426846027374268, + "step": 2470 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.0625, + "epoch": 1.20703125, + "grad_norm": 1.1767884723377213, + "kl": 0.0731201171875, + "learning_rate": 6.983642578124999e-07, + "loss": 0.0029, + "reward": 1.7661176919937134, + "reward_std": 0.05781315267086029, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7661177515983582, + "step": 2471 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.7421875, + "epoch": 1.20751953125, + "grad_norm": 12.083794382746516, + "kl": 0.07373046875, + "learning_rate": 6.982421874999999e-07, + "loss": 0.0029, + "reward": 1.7419702410697937, + "reward_std": 0.019140704069286585, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7419701814651489, + "step": 2472 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.8203125, + "epoch": 1.2080078125, + "grad_norm": 2.0703187627263686, + "kl": 0.084228515625, + "learning_rate": 6.981201171875e-07, + "loss": 0.0034, + "reward": 1.7072731852531433, + "reward_std": 0.10907960124313831, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7150856554508209, + "step": 2473 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.4453125, + "epoch": 1.20849609375, + "grad_norm": 0.9422041926618386, + "kl": 0.0816650390625, + "learning_rate": 6.97998046875e-07, + "loss": 0.0033, + "reward": 1.7536945343017578, + "reward_std": 0.02776573784649372, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.753694474697113, + "step": 2474 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.34375, + "epoch": 1.208984375, + "grad_norm": 0.8433697427169689, + "kl": 0.0611572265625, + "learning_rate": 6.978759765625e-07, + "loss": 0.0024, + "reward": 1.726251244544983, + "reward_std": 0.06488487310707569, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7262513041496277, + "step": 2475 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.53125, + "epoch": 1.20947265625, + "grad_norm": 0.7248153544649567, + "kl": 0.0712890625, + "learning_rate": 6.9775390625e-07, + "loss": 0.0029, + "reward": 1.7210323810577393, + "reward_std": 0.010402468382380903, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7210325002670288, + "step": 2476 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.3984375, + "epoch": 1.2099609375, + "grad_norm": 1.8834510853065414, + "kl": 0.06201171875, + "learning_rate": 6.976318359375e-07, + "loss": 0.0025, + "reward": 1.7611924409866333, + "reward_std": 0.10708035714924335, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7690049111843109, + "step": 2477 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.8125, + "epoch": 1.21044921875, + "grad_norm": 4.270766248537916, + "kl": 0.07763671875, + "learning_rate": 6.975097656249999e-07, + "loss": 0.0031, + "reward": 1.814048945903778, + "reward_std": 0.05252527166157961, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8140489459037781, + "step": 2478 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.5390625, + "epoch": 1.2109375, + "grad_norm": 2.2657492845126486, + "kl": 0.0574951171875, + "learning_rate": 6.973876953124999e-07, + "loss": 0.0023, + "reward": 1.809904932975769, + "reward_std": 0.05868878960609436, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8099049031734467, + "step": 2479 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.09375, + "epoch": 1.21142578125, + "grad_norm": 4.6466935324712235, + "kl": 0.0562744140625, + "learning_rate": 6.97265625e-07, + "loss": 0.0023, + "reward": 1.7215197086334229, + "reward_std": 0.07658331096172333, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7762071788311005, + "step": 2480 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.0703125, + "epoch": 1.2119140625, + "grad_norm": 14.953714286957666, + "kl": 0.133056640625, + "learning_rate": 6.971435546875e-07, + "loss": 0.0053, + "reward": 1.7927291989326477, + "reward_std": 0.027087991125881672, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7927291989326477, + "step": 2481 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.1015625, + "epoch": 1.21240234375, + "grad_norm": 5.068041510063571, + "kl": 0.0521240234375, + "learning_rate": 6.97021484375e-07, + "loss": 0.0021, + "reward": 1.7825981974601746, + "reward_std": 0.08510758727788925, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.790410727262497, + "step": 2482 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.5546875, + "epoch": 1.212890625, + "grad_norm": 1.2049058380875817, + "kl": 0.0714111328125, + "learning_rate": 6.968994140625e-07, + "loss": 0.0029, + "reward": 1.8188948035240173, + "reward_std": 0.025680112652480602, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8188948333263397, + "step": 2483 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.7578125, + "epoch": 1.21337890625, + "grad_norm": 7.356014600027885, + "kl": 0.069580078125, + "learning_rate": 6.967773437499999e-07, + "loss": 0.0028, + "reward": 1.7450536489486694, + "reward_std": 0.1266886219382286, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.752866119146347, + "step": 2484 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.0, + "epoch": 1.2138671875, + "grad_norm": 3.1051199689401043, + "kl": 0.0675048828125, + "learning_rate": 6.966552734374999e-07, + "loss": 0.0027, + "reward": 1.6235730051994324, + "reward_std": 0.11907243356108665, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6470105350017548, + "step": 2485 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.953125, + "epoch": 1.21435546875, + "grad_norm": 1.6944501406666026, + "kl": 0.069580078125, + "learning_rate": 6.96533203125e-07, + "loss": 0.0028, + "reward": 1.8035091161727905, + "reward_std": 0.07174506038427353, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8035090863704681, + "step": 2486 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.4140625, + "epoch": 1.21484375, + "grad_norm": 1.2360053249882443, + "kl": 0.0732421875, + "learning_rate": 6.964111328125e-07, + "loss": 0.0029, + "reward": 1.6692347526550293, + "reward_std": 0.04010845720767975, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6692347228527069, + "step": 2487 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.6953125, + "epoch": 1.21533203125, + "grad_norm": 1.0885353537961944, + "kl": 0.0716552734375, + "learning_rate": 6.962890625e-07, + "loss": 0.0029, + "reward": 1.7111193537712097, + "reward_std": 0.09523628279566765, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7189318835735321, + "step": 2488 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.7265625, + "epoch": 1.2158203125, + "grad_norm": 1.2668470125667208, + "kl": 0.05712890625, + "learning_rate": 6.961669921875e-07, + "loss": 0.0023, + "reward": 1.6149799227714539, + "reward_std": 0.12601268105208874, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6384174823760986, + "step": 2489 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.703125, + "epoch": 1.21630859375, + "grad_norm": 1.7178893277816036, + "kl": 0.0589599609375, + "learning_rate": 6.96044921875e-07, + "loss": 0.0024, + "reward": 1.8464585542678833, + "reward_std": 0.056060753762722015, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8464585840702057, + "step": 2490 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.734375, + "epoch": 1.216796875, + "grad_norm": 4.712548901479162, + "kl": 0.0654296875, + "learning_rate": 6.959228515624999e-07, + "loss": 0.0026, + "reward": 1.6734269857406616, + "reward_std": 0.09399673715233803, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6734269857406616, + "step": 2491 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.765625, + "epoch": 1.21728515625, + "grad_norm": 1.0758561945731477, + "kl": 0.0693359375, + "learning_rate": 6.958007812499999e-07, + "loss": 0.0028, + "reward": 1.6628954410552979, + "reward_std": 0.042761145159602165, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6628954112529755, + "step": 2492 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.6484375, + "epoch": 1.2177734375, + "grad_norm": 1.1584941663597192, + "kl": 0.076171875, + "learning_rate": 6.956787109375e-07, + "loss": 0.003, + "reward": 1.7079687118530273, + "reward_std": 0.0789231238886714, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7079687416553497, + "step": 2493 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3359375, + "epoch": 1.21826171875, + "grad_norm": 1.964653097717732, + "kl": 0.062255859375, + "learning_rate": 6.95556640625e-07, + "loss": 0.0025, + "reward": 1.6944482326507568, + "reward_std": 0.03738341759890318, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6944482922554016, + "step": 2494 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.96875, + "epoch": 1.21875, + "grad_norm": 3.3640026796745857, + "kl": 0.0626220703125, + "learning_rate": 6.954345703125e-07, + "loss": 0.0025, + "reward": 1.784760594367981, + "reward_std": 0.06138443388044834, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.784760594367981, + "step": 2495 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.4453125, + "epoch": 1.21923828125, + "grad_norm": 3.2144653678548583, + "kl": 0.0732421875, + "learning_rate": 6.953125e-07, + "loss": 0.0029, + "reward": 1.5434470176696777, + "reward_std": 0.14664818346500397, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5590719878673553, + "step": 2496 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.5625, + "epoch": 1.2197265625, + "grad_norm": 1.6026752821456773, + "kl": 0.0711669921875, + "learning_rate": 6.951904296874999e-07, + "loss": 0.0029, + "reward": 1.719383180141449, + "reward_std": 0.06268875673413277, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7193832099437714, + "step": 2497 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.8984375, + "epoch": 1.22021484375, + "grad_norm": 2.7339715672324685, + "kl": 0.09521484375, + "learning_rate": 6.950683593749999e-07, + "loss": 0.0038, + "reward": 1.6224533915519714, + "reward_std": 0.03794710151851177, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6224533319473267, + "step": 2498 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.6171875, + "epoch": 1.220703125, + "grad_norm": 3.3456055155414854, + "kl": 0.094970703125, + "learning_rate": 6.949462890625e-07, + "loss": 0.0038, + "reward": 1.687516987323761, + "reward_std": 0.04205773863941431, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.687516987323761, + "step": 2499 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.96875, + "epoch": 1.22119140625, + "grad_norm": 1.410321483173382, + "kl": 0.0670166015625, + "learning_rate": 6.9482421875e-07, + "loss": 0.0027, + "reward": 1.784384846687317, + "reward_std": 0.03612975589931011, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7843847870826721, + "step": 2500 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.984375, + "epoch": 1.2216796875, + "grad_norm": 4.366344762597936, + "kl": 0.078857421875, + "learning_rate": 6.947021484375e-07, + "loss": 0.0032, + "reward": 1.7177372574806213, + "reward_std": 0.1302860602736473, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7255498170852661, + "step": 2501 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.3671875, + "epoch": 1.22216796875, + "grad_norm": 1.39273870980327, + "kl": 0.078125, + "learning_rate": 6.94580078125e-07, + "loss": 0.0031, + "reward": 1.708004117012024, + "reward_std": 0.06258507259190083, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7392540872097015, + "step": 2502 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.015625, + "epoch": 1.22265625, + "grad_norm": 2.5625187589958376, + "kl": 0.08349609375, + "learning_rate": 6.944580078125e-07, + "loss": 0.0033, + "reward": 1.8098444938659668, + "reward_std": 0.09694074839353561, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8176570236682892, + "step": 2503 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.3984375, + "epoch": 1.22314453125, + "grad_norm": 2.0931664616747683, + "kl": 0.0699462890625, + "learning_rate": 6.943359374999999e-07, + "loss": 0.0028, + "reward": 1.6923267245292664, + "reward_std": 0.16218779981136322, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7157641649246216, + "step": 2504 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.609375, + "epoch": 1.2236328125, + "grad_norm": 1.3978025191646846, + "kl": 0.0634765625, + "learning_rate": 6.942138671874999e-07, + "loss": 0.0025, + "reward": 1.6496607065200806, + "reward_std": 0.08597181178629398, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6652857065200806, + "step": 2505 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.234375, + "epoch": 1.22412109375, + "grad_norm": 2.1379914333546512, + "kl": 0.06591796875, + "learning_rate": 6.94091796875e-07, + "loss": 0.0026, + "reward": 1.7624231576919556, + "reward_std": 0.12387410178780556, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7780481576919556, + "step": 2506 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.0234375, + "epoch": 1.224609375, + "grad_norm": 1.1228065728537064, + "kl": 0.0565185546875, + "learning_rate": 6.939697265625e-07, + "loss": 0.0023, + "reward": 1.6814470887184143, + "reward_std": 0.021641411818563938, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6814470887184143, + "step": 2507 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.609375, + "epoch": 1.22509765625, + "grad_norm": 0.8026004860091579, + "kl": 0.0560302734375, + "learning_rate": 6.9384765625e-07, + "loss": 0.0022, + "reward": 1.842549443244934, + "reward_std": 0.04153232369571924, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8425493836402893, + "step": 2508 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.3828125, + "epoch": 1.2255859375, + "grad_norm": 2.430656263181087, + "kl": 0.077880859375, + "learning_rate": 6.937255859375e-07, + "loss": 0.0031, + "reward": 1.8032140135765076, + "reward_std": 0.0743367203976959, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.81102654337883, + "step": 2509 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.3828125, + "epoch": 1.22607421875, + "grad_norm": 1.168922825721485, + "kl": 0.08544921875, + "learning_rate": 6.936035156249999e-07, + "loss": 0.0034, + "reward": 1.6974853873252869, + "reward_std": 0.08852525055408478, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7521729171276093, + "step": 2510 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.8984375, + "epoch": 1.2265625, + "grad_norm": 8.023737021131817, + "kl": 0.08056640625, + "learning_rate": 6.934814453124999e-07, + "loss": 0.0032, + "reward": 1.7808299660682678, + "reward_std": 0.04870981816202402, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.780829906463623, + "step": 2511 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.8203125, + "epoch": 1.22705078125, + "grad_norm": 3.438619709615711, + "kl": 0.069580078125, + "learning_rate": 6.93359375e-07, + "loss": 0.0028, + "reward": 1.6688467860221863, + "reward_std": 0.12142006307840347, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6766592860221863, + "step": 2512 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.1796875, + "epoch": 1.2275390625, + "grad_norm": 2.202825505955737, + "kl": 0.0712890625, + "learning_rate": 6.932373046875e-07, + "loss": 0.0028, + "reward": 1.6930432319641113, + "reward_std": 0.06635242141783237, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7008557617664337, + "step": 2513 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.890625, + "epoch": 1.22802734375, + "grad_norm": 1.4339804333793391, + "kl": 0.078857421875, + "learning_rate": 6.93115234375e-07, + "loss": 0.0032, + "reward": 1.764868676662445, + "reward_std": 0.05308605916798115, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7648686468601227, + "step": 2514 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.2734375, + "epoch": 1.228515625, + "grad_norm": 3.6539966716350767, + "kl": 0.076416015625, + "learning_rate": 6.929931640625e-07, + "loss": 0.0031, + "reward": 1.7111204266548157, + "reward_std": 0.04776516975834966, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7111203968524933, + "step": 2515 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.9296875, + "epoch": 1.22900390625, + "grad_norm": 1.3654858706509436, + "kl": 0.0810546875, + "learning_rate": 6.9287109375e-07, + "loss": 0.0032, + "reward": 1.8311191201210022, + "reward_std": 0.03589681722223759, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8311191201210022, + "step": 2516 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.109375, + "epoch": 1.2294921875, + "grad_norm": 1.5725270743331548, + "kl": 0.07763671875, + "learning_rate": 6.927490234374999e-07, + "loss": 0.0031, + "reward": 1.5551150441169739, + "reward_std": 0.0790153406560421, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6019900143146515, + "step": 2517 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3359375, + "epoch": 1.22998046875, + "grad_norm": 3.1962807812262266, + "kl": 0.0567626953125, + "learning_rate": 6.926269531249999e-07, + "loss": 0.0023, + "reward": 1.6948148608207703, + "reward_std": 0.059128282591700554, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.694814920425415, + "step": 2518 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.8828125, + "epoch": 1.23046875, + "grad_norm": 1.986553003861319, + "kl": 0.0697021484375, + "learning_rate": 6.925048828125e-07, + "loss": 0.0028, + "reward": 1.7517194151878357, + "reward_std": 0.04597326088696718, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7517193853855133, + "step": 2519 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.4921875, + "epoch": 1.23095703125, + "grad_norm": 1.7190974829655477, + "kl": 0.072509765625, + "learning_rate": 6.923828125e-07, + "loss": 0.0029, + "reward": 1.6288211941719055, + "reward_std": 0.025298184249550104, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6288211941719055, + "step": 2520 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.390625, + "epoch": 1.2314453125, + "grad_norm": 3.1290421341569092, + "kl": 0.079345703125, + "learning_rate": 6.922607421875e-07, + "loss": 0.0032, + "reward": 1.8155579566955566, + "reward_std": 0.10759843979030848, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.823370486497879, + "step": 2521 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.71875, + "epoch": 1.23193359375, + "grad_norm": 1.2667127723104277, + "kl": 0.071533203125, + "learning_rate": 6.92138671875e-07, + "loss": 0.0029, + "reward": 1.737762212753296, + "reward_std": 0.03363693691790104, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7377622127532959, + "step": 2522 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.4765625, + "epoch": 1.232421875, + "grad_norm": 3.050787985177319, + "kl": 0.07470703125, + "learning_rate": 6.920166015624999e-07, + "loss": 0.003, + "reward": 1.8481884598731995, + "reward_std": 0.06605091877281666, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8560009598731995, + "step": 2523 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.7421875, + "epoch": 1.23291015625, + "grad_norm": 2.338599161344737, + "kl": 0.065185546875, + "learning_rate": 6.918945312499999e-07, + "loss": 0.0026, + "reward": 1.784920573234558, + "reward_std": 0.06366929598152637, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7927330732345581, + "step": 2524 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.25, + "epoch": 1.2333984375, + "grad_norm": 5.581950906370106, + "kl": 0.0888671875, + "learning_rate": 6.917724609375e-07, + "loss": 0.0036, + "reward": 1.7415488362312317, + "reward_std": 0.04909018334001303, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7415488362312317, + "step": 2525 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.796875, + "epoch": 1.23388671875, + "grad_norm": 2.231397610748833, + "kl": 0.09130859375, + "learning_rate": 6.91650390625e-07, + "loss": 0.0037, + "reward": 1.8085330724716187, + "reward_std": 0.06506985053420067, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8163455426692963, + "step": 2526 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.4375, + "epoch": 1.234375, + "grad_norm": 1.557598316678241, + "kl": 0.0849609375, + "learning_rate": 6.915283203125e-07, + "loss": 0.0034, + "reward": 1.771598756313324, + "reward_std": 0.036864256486296654, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.771598756313324, + "step": 2527 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.5859375, + "epoch": 1.23486328125, + "grad_norm": 8.050764518205462, + "kl": 0.108642578125, + "learning_rate": 6.9140625e-07, + "loss": 0.0044, + "reward": 1.6949394345283508, + "reward_std": 0.11935023218393326, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6949394047260284, + "step": 2528 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.75, + "epoch": 1.2353515625, + "grad_norm": 3.577749415144858, + "kl": 0.072509765625, + "learning_rate": 6.912841796875e-07, + "loss": 0.0029, + "reward": 1.7945581078529358, + "reward_std": 0.026416001841425896, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7945581078529358, + "step": 2529 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.1171875, + "epoch": 1.23583984375, + "grad_norm": 1.882919193494393, + "kl": 0.097900390625, + "learning_rate": 6.911621093749999e-07, + "loss": 0.0039, + "reward": 1.7366108298301697, + "reward_std": 0.088971808552742, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7444233596324921, + "step": 2530 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.9921875, + "epoch": 1.236328125, + "grad_norm": 1.6856414658348984, + "kl": 0.072998046875, + "learning_rate": 6.910400390624999e-07, + "loss": 0.0029, + "reward": 1.7254244089126587, + "reward_std": 0.08224152028560638, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7332369983196259, + "step": 2531 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.0625, + "epoch": 1.23681640625, + "grad_norm": 2.340077383886054, + "kl": 0.075927734375, + "learning_rate": 6.9091796875e-07, + "loss": 0.003, + "reward": 1.8460680842399597, + "reward_std": 0.06705048866569996, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8460681140422821, + "step": 2532 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.0390625, + "epoch": 1.2373046875, + "grad_norm": 1.9943965668932906, + "kl": 0.110595703125, + "learning_rate": 6.907958984375e-07, + "loss": 0.0044, + "reward": 1.775262475013733, + "reward_std": 0.06797738745808601, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7752624750137329, + "step": 2533 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.0625, + "epoch": 1.23779296875, + "grad_norm": 2.2221758125368467, + "kl": 0.067626953125, + "learning_rate": 6.90673828125e-07, + "loss": 0.0027, + "reward": 1.8863377571105957, + "reward_std": 0.058480268344283104, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8863376975059509, + "step": 2534 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.5546875, + "epoch": 1.23828125, + "grad_norm": 2.2548699663415714, + "kl": 0.0633544921875, + "learning_rate": 6.905517578125e-07, + "loss": 0.0025, + "reward": 1.9195441007614136, + "reward_std": 0.04364974796772003, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9195441007614136, + "step": 2535 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.171875, + "epoch": 1.23876953125, + "grad_norm": 4.280833973797064, + "kl": 0.0657958984375, + "learning_rate": 6.904296875e-07, + "loss": 0.0026, + "reward": 1.8286893963813782, + "reward_std": 0.1052445936948061, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8365019261837006, + "step": 2536 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.0390625, + "epoch": 1.2392578125, + "grad_norm": 1.0725990560276126, + "kl": 0.093505859375, + "learning_rate": 6.903076171874999e-07, + "loss": 0.0037, + "reward": 1.7462586760520935, + "reward_std": 0.01885821617906913, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7462586760520935, + "step": 2537 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.4296875, + "epoch": 1.23974609375, + "grad_norm": 0.5566830426478924, + "kl": 0.069580078125, + "learning_rate": 6.90185546875e-07, + "loss": 0.0028, + "reward": 1.7841619849205017, + "reward_std": 0.020260846242308617, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7841619849205017, + "step": 2538 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.453125, + "epoch": 1.240234375, + "grad_norm": 1.4564201896065856, + "kl": 0.055419921875, + "learning_rate": 6.900634765625e-07, + "loss": 0.0022, + "reward": 1.7457255721092224, + "reward_std": 0.10040692985057831, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7535381019115448, + "step": 2539 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.90625, + "epoch": 1.24072265625, + "grad_norm": 0.9193644485156054, + "kl": 0.07958984375, + "learning_rate": 6.8994140625e-07, + "loss": 0.0032, + "reward": 1.5869048237800598, + "reward_std": 0.060834175907075405, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6025297790765762, + "step": 2540 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.53125, + "epoch": 1.2412109375, + "grad_norm": 2.0003891138751797, + "kl": 0.0830078125, + "learning_rate": 6.898193359375e-07, + "loss": 0.0033, + "reward": 1.7775180339813232, + "reward_std": 0.09747044742107391, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7775180339813232, + "step": 2541 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.953125, + "epoch": 1.24169921875, + "grad_norm": 1.241308895200411, + "kl": 0.0694580078125, + "learning_rate": 6.89697265625e-07, + "loss": 0.0028, + "reward": 1.6753292679786682, + "reward_std": 0.07925521302968264, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6909542381763458, + "step": 2542 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.4375, + "epoch": 1.2421875, + "grad_norm": 7.373580663731735, + "kl": 0.083251953125, + "learning_rate": 6.895751953124999e-07, + "loss": 0.0033, + "reward": 1.7385675311088562, + "reward_std": 0.03642314299941063, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7385675311088562, + "step": 2543 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.8046875, + "epoch": 1.24267578125, + "grad_norm": 2.6589438244356502, + "kl": 0.078125, + "learning_rate": 6.894531249999999e-07, + "loss": 0.0031, + "reward": 1.6425248980522156, + "reward_std": 0.05202796123921871, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6425248980522156, + "step": 2544 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.640625, + "epoch": 1.2431640625, + "grad_norm": 1.7535877983308272, + "kl": 0.085205078125, + "learning_rate": 6.893310546875e-07, + "loss": 0.0034, + "reward": 1.7897852659225464, + "reward_std": 0.06561807543039322, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7897853255271912, + "step": 2545 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.546875, + "epoch": 1.24365234375, + "grad_norm": 1.1188283700852508, + "kl": 0.061767578125, + "learning_rate": 6.89208984375e-07, + "loss": 0.0025, + "reward": 1.8053843975067139, + "reward_std": 0.03231469355523586, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8053844273090363, + "step": 2546 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.90625, + "epoch": 1.244140625, + "grad_norm": 2.9696598961961973, + "kl": 0.0638427734375, + "learning_rate": 6.890869140625e-07, + "loss": 0.0026, + "reward": 1.629117488861084, + "reward_std": 0.0660979188978672, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6291175484657288, + "step": 2547 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.4375, + "epoch": 1.24462890625, + "grad_norm": 2.209328842454915, + "kl": 0.07373046875, + "learning_rate": 6.8896484375e-07, + "loss": 0.0029, + "reward": 1.7065168619155884, + "reward_std": 0.02056479558814317, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7065168619155884, + "step": 2548 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.4765625, + "epoch": 1.2451171875, + "grad_norm": 2.1506170169340852, + "kl": 0.0753173828125, + "learning_rate": 6.888427734375e-07, + "loss": 0.003, + "reward": 1.7226258516311646, + "reward_std": 0.06090010888874531, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7226258814334869, + "step": 2549 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.828125, + "epoch": 1.24560546875, + "grad_norm": 1.6867670137775563, + "kl": 0.0570068359375, + "learning_rate": 6.887207031249999e-07, + "loss": 0.0023, + "reward": 1.907860517501831, + "reward_std": 0.13618198037147522, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.9156731367111206, + "step": 2550 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.4296875, + "epoch": 1.24609375, + "grad_norm": 2.014025847589191, + "kl": 0.0748291015625, + "learning_rate": 6.885986328124999e-07, + "loss": 0.003, + "reward": 1.8035182356834412, + "reward_std": 0.10311203170567751, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8035181760787964, + "step": 2551 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.8671875, + "epoch": 1.24658203125, + "grad_norm": 2.460777748950473, + "kl": 0.065185546875, + "learning_rate": 6.884765625e-07, + "loss": 0.0026, + "reward": 1.7247642874717712, + "reward_std": 0.0778743838891387, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7325767874717712, + "step": 2552 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.859375, + "epoch": 1.2470703125, + "grad_norm": 2.066296664154434, + "kl": 0.0635986328125, + "learning_rate": 6.883544921875e-07, + "loss": 0.0025, + "reward": 1.8594765067100525, + "reward_std": 0.03086453676223755, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8594764471054077, + "step": 2553 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.734375, + "epoch": 1.24755859375, + "grad_norm": 1.129593989044106, + "kl": 0.072265625, + "learning_rate": 6.88232421875e-07, + "loss": 0.0029, + "reward": 1.8053827285766602, + "reward_std": 0.08634701371192932, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8053827881813049, + "step": 2554 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.9609375, + "epoch": 1.248046875, + "grad_norm": 2.1481848196997033, + "kl": 0.074462890625, + "learning_rate": 6.881103515625e-07, + "loss": 0.003, + "reward": 1.8480090498924255, + "reward_std": 0.052580492570996284, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8480090796947479, + "step": 2555 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.8359375, + "epoch": 1.24853515625, + "grad_norm": 1.3215526698458806, + "kl": 0.072265625, + "learning_rate": 6.879882812499999e-07, + "loss": 0.0029, + "reward": 1.7874248027801514, + "reward_std": 0.04134686943143606, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7874249219894409, + "step": 2556 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.796875, + "epoch": 1.2490234375, + "grad_norm": 7.161134007570032, + "kl": 0.076904296875, + "learning_rate": 6.878662109374999e-07, + "loss": 0.0031, + "reward": 1.7982208728790283, + "reward_std": 0.05789235234260559, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7982209324836731, + "step": 2557 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.59375, + "epoch": 1.24951171875, + "grad_norm": 1.5826957329843547, + "kl": 0.086181640625, + "learning_rate": 6.87744140625e-07, + "loss": 0.0034, + "reward": 1.724461555480957, + "reward_std": 0.05809208191931248, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7244615852832794, + "step": 2558 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.8984375, + "epoch": 1.25, + "grad_norm": 0.6970013528941209, + "kl": 0.0509033203125, + "learning_rate": 6.876220703125e-07, + "loss": 0.002, + "reward": 1.8204131126403809, + "reward_std": 0.08120441623032093, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8282255232334137, + "step": 2559 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.75, + "epoch": 1.25048828125, + "grad_norm": 0.9949031525763067, + "kl": 0.080810546875, + "learning_rate": 6.875e-07, + "loss": 0.0032, + "reward": 1.85645192861557, + "reward_std": 0.05746803432703018, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8564519584178925, + "step": 2560 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.1640625, + "epoch": 1.2509765625, + "grad_norm": 1.5878015011305773, + "kl": 0.0599365234375, + "learning_rate": 6.873779296875e-07, + "loss": 0.0024, + "reward": 1.7821994423866272, + "reward_std": 0.09675415605306625, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.790012001991272, + "step": 2561 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.453125, + "epoch": 1.25146484375, + "grad_norm": 2.5951982218621885, + "kl": 0.092041015625, + "learning_rate": 6.87255859375e-07, + "loss": 0.0037, + "reward": 1.6413246393203735, + "reward_std": 0.07367514073848724, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6413246095180511, + "step": 2562 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.953125, + "epoch": 1.251953125, + "grad_norm": 2.1313560894780688, + "kl": 0.06494140625, + "learning_rate": 6.871337890624999e-07, + "loss": 0.0026, + "reward": 1.9887361526489258, + "reward_std": 0.059286823496222496, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9887359738349915, + "step": 2563 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.2890625, + "epoch": 1.25244140625, + "grad_norm": 0.9087564610351305, + "kl": 0.071044921875, + "learning_rate": 6.870117187499999e-07, + "loss": 0.0028, + "reward": 1.880197525024414, + "reward_std": 0.05455988273024559, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8801974654197693, + "step": 2564 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.7890625, + "epoch": 1.2529296875, + "grad_norm": 2.3160382343134533, + "kl": 0.097412109375, + "learning_rate": 6.868896484375e-07, + "loss": 0.0039, + "reward": 1.6566903591156006, + "reward_std": 0.09503332898020744, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6566903293132782, + "step": 2565 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.0, + "epoch": 1.25341796875, + "grad_norm": 2.2293864959901772, + "kl": 0.06982421875, + "learning_rate": 6.86767578125e-07, + "loss": 0.0028, + "reward": 1.8191727995872498, + "reward_std": 0.07018731534481049, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8191727995872498, + "step": 2566 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.0078125, + "epoch": 1.25390625, + "grad_norm": 3.352890296035172, + "kl": 0.076904296875, + "learning_rate": 6.866455078125e-07, + "loss": 0.0031, + "reward": 1.7632625102996826, + "reward_std": 0.052378684282302856, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7632625102996826, + "step": 2567 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.3125, + "epoch": 1.25439453125, + "grad_norm": 3.1814122296646916, + "kl": 0.0689697265625, + "learning_rate": 6.865234375e-07, + "loss": 0.0028, + "reward": 1.743474543094635, + "reward_std": 0.059906596317887306, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7434745132923126, + "step": 2568 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.859375, + "epoch": 1.2548828125, + "grad_norm": 1.2044355795614614, + "kl": 0.0648193359375, + "learning_rate": 6.864013671874999e-07, + "loss": 0.0026, + "reward": 1.7757219076156616, + "reward_std": 0.05339077487587929, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7757218182086945, + "step": 2569 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.734375, + "epoch": 1.25537109375, + "grad_norm": 1.50396583424587, + "kl": 0.0584716796875, + "learning_rate": 6.862792968749999e-07, + "loss": 0.0023, + "reward": 1.8511592745780945, + "reward_std": 0.06993940658867359, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8511592745780945, + "step": 2570 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.3203125, + "epoch": 1.255859375, + "grad_norm": 7.325974862571428, + "kl": 0.080078125, + "learning_rate": 6.861572265625e-07, + "loss": 0.0032, + "reward": 1.8245378136634827, + "reward_std": 0.05453048273921013, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8245378732681274, + "step": 2571 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.875, + "epoch": 1.25634765625, + "grad_norm": 2.5748832751879034, + "kl": 0.073486328125, + "learning_rate": 6.8603515625e-07, + "loss": 0.0029, + "reward": 1.7536067962646484, + "reward_std": 0.07038977555930614, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7536068260669708, + "step": 2572 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.296875, + "epoch": 1.2568359375, + "grad_norm": 2.2116366377933505, + "kl": 0.05859375, + "learning_rate": 6.859130859375e-07, + "loss": 0.0023, + "reward": 1.7995004653930664, + "reward_std": 0.09891559928655624, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8073129653930664, + "step": 2573 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.78125, + "epoch": 1.25732421875, + "grad_norm": 1.123092523187175, + "kl": 0.06787109375, + "learning_rate": 6.85791015625e-07, + "loss": 0.0027, + "reward": 1.8330579996109009, + "reward_std": 0.045371233485639095, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8330580592155457, + "step": 2574 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.2734375, + "epoch": 1.2578125, + "grad_norm": 2.1695911445553513, + "kl": 0.06787109375, + "learning_rate": 6.856689453125e-07, + "loss": 0.0027, + "reward": 1.8154324889183044, + "reward_std": 0.04039592668414116, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8154324889183044, + "step": 2575 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.015625, + "epoch": 1.25830078125, + "grad_norm": 1.1977765178723714, + "kl": 0.0513916015625, + "learning_rate": 6.855468749999999e-07, + "loss": 0.0021, + "reward": 1.7662554383277893, + "reward_std": 0.008739282377064228, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7662554383277893, + "step": 2576 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.625, + "epoch": 1.2587890625, + "grad_norm": 2.582999038354569, + "kl": 0.05078125, + "learning_rate": 6.854248046874999e-07, + "loss": 0.002, + "reward": 1.8732368350028992, + "reward_std": 0.13687162101268768, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8810493648052216, + "step": 2577 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.1875, + "epoch": 1.25927734375, + "grad_norm": 3.2603077110393186, + "kl": 0.07080078125, + "learning_rate": 6.85302734375e-07, + "loss": 0.0028, + "reward": 1.744448721408844, + "reward_std": 0.07884471863508224, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7444487512111664, + "step": 2578 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.2734375, + "epoch": 1.259765625, + "grad_norm": 1.5072963778520792, + "kl": 0.072265625, + "learning_rate": 6.851806640625e-07, + "loss": 0.0029, + "reward": 1.8110605478286743, + "reward_std": 0.0798899196088314, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8110604882240295, + "step": 2579 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.109375, + "epoch": 1.26025390625, + "grad_norm": 6.052899624466231, + "kl": 0.1295166015625, + "learning_rate": 6.8505859375e-07, + "loss": 0.0052, + "reward": 1.855854094028473, + "reward_std": 0.03883726242929697, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8558541238307953, + "step": 2580 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.171875, + "epoch": 1.2607421875, + "grad_norm": 2.2670697938409052, + "kl": 0.0751953125, + "learning_rate": 6.849365234375e-07, + "loss": 0.003, + "reward": 1.5954683423042297, + "reward_std": 0.01611372921615839, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5954683721065521, + "step": 2581 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.5234375, + "epoch": 1.26123046875, + "grad_norm": 9.369240905527494, + "kl": 0.06787109375, + "learning_rate": 6.848144531249999e-07, + "loss": 0.0027, + "reward": 1.7278264164924622, + "reward_std": 0.05693458020687103, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7278264462947845, + "step": 2582 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.953125, + "epoch": 1.26171875, + "grad_norm": 1.2283989647554525, + "kl": 0.0574951171875, + "learning_rate": 6.846923828124999e-07, + "loss": 0.0023, + "reward": 1.7502474188804626, + "reward_std": 0.026749521493911743, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.750247448682785, + "step": 2583 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.6875, + "epoch": 1.26220703125, + "grad_norm": 1.2610728333774404, + "kl": 0.06201171875, + "learning_rate": 6.845703125e-07, + "loss": 0.0025, + "reward": 1.7522244453430176, + "reward_std": 0.09181947819888592, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7522244453430176, + "step": 2584 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.3828125, + "epoch": 1.2626953125, + "grad_norm": 2.414972278703281, + "kl": 0.06396484375, + "learning_rate": 6.844482421875e-07, + "loss": 0.0026, + "reward": 1.737687885761261, + "reward_std": 0.07903081178665161, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.737687885761261, + "step": 2585 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.75, + "epoch": 1.26318359375, + "grad_norm": 1.7270948683315666, + "kl": 0.04931640625, + "learning_rate": 6.84326171875e-07, + "loss": 0.002, + "reward": 1.781536877155304, + "reward_std": 0.0434822803363204, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7815368473529816, + "step": 2586 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.40625, + "epoch": 1.263671875, + "grad_norm": 1.3949112846676868, + "kl": 0.0535888671875, + "learning_rate": 6.842041015625e-07, + "loss": 0.0021, + "reward": 1.8454834818840027, + "reward_std": 0.05372583121061325, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8454834818840027, + "step": 2587 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.84375, + "epoch": 1.26416015625, + "grad_norm": 1.4368123149876182, + "kl": 0.0692138671875, + "learning_rate": 6.8408203125e-07, + "loss": 0.0028, + "reward": 1.6451177597045898, + "reward_std": 0.07861702609807253, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6763677000999451, + "step": 2588 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.125, + "epoch": 1.2646484375, + "grad_norm": 2.8027515559043823, + "kl": 0.068603515625, + "learning_rate": 6.839599609374999e-07, + "loss": 0.0027, + "reward": 1.863362193107605, + "reward_std": 0.08033762127161026, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.871174693107605, + "step": 2589 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.65625, + "epoch": 1.26513671875, + "grad_norm": 1.3661088668145678, + "kl": 0.071533203125, + "learning_rate": 6.838378906249999e-07, + "loss": 0.0029, + "reward": 1.7393322587013245, + "reward_std": 0.052713219076395035, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7393322587013245, + "step": 2590 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.53125, + "epoch": 1.265625, + "grad_norm": 1.1496627965247463, + "kl": 0.05908203125, + "learning_rate": 6.837158203125e-07, + "loss": 0.0024, + "reward": 1.831416666507721, + "reward_std": 0.06289010029286146, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8314166367053986, + "step": 2591 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.3359375, + "epoch": 1.26611328125, + "grad_norm": 1.1102501988463187, + "kl": 0.0625, + "learning_rate": 6.8359375e-07, + "loss": 0.0025, + "reward": 1.9127737879753113, + "reward_std": 0.09909685142338276, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.9362112879753113, + "step": 2592 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.5703125, + "epoch": 1.2666015625, + "grad_norm": 2.558242186997854, + "kl": 0.07080078125, + "learning_rate": 6.834716796875e-07, + "loss": 0.0028, + "reward": 1.7129462957382202, + "reward_std": 0.2064364030957222, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.752008855342865, + "step": 2593 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.2421875, + "epoch": 1.26708984375, + "grad_norm": 1.4121290555246413, + "kl": 0.06591796875, + "learning_rate": 6.83349609375e-07, + "loss": 0.0026, + "reward": 1.721196711063385, + "reward_std": 0.045722841285169125, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7211967408657074, + "step": 2594 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.6640625, + "epoch": 1.267578125, + "grad_norm": 8.542422233364945, + "kl": 0.0450439453125, + "learning_rate": 6.832275390624999e-07, + "loss": 0.0018, + "reward": 1.845105767250061, + "reward_std": 0.04223616607487202, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8451057970523834, + "step": 2595 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.65625, + "epoch": 1.26806640625, + "grad_norm": 2.172156191986647, + "kl": 0.05712890625, + "learning_rate": 6.831054687499999e-07, + "loss": 0.0023, + "reward": 1.7219505906105042, + "reward_std": 0.05229894071817398, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7219505310058594, + "step": 2596 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.2109375, + "epoch": 1.2685546875, + "grad_norm": 4.512629289400748, + "kl": 0.07275390625, + "learning_rate": 6.829833984375e-07, + "loss": 0.0029, + "reward": 1.775171935558319, + "reward_std": 0.11626030504703522, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7751719057559967, + "step": 2597 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.8828125, + "epoch": 1.26904296875, + "grad_norm": 19.436272087132117, + "kl": 0.0650634765625, + "learning_rate": 6.82861328125e-07, + "loss": 0.0026, + "reward": 1.8984931111335754, + "reward_std": 0.047796593979001045, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8984931111335754, + "step": 2598 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.1796875, + "epoch": 1.26953125, + "grad_norm": 0.8005281955093715, + "kl": 0.0565185546875, + "learning_rate": 6.827392578125e-07, + "loss": 0.0023, + "reward": 1.9395660758018494, + "reward_std": 0.015174323692917824, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9395660758018494, + "step": 2599 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.0546875, + "epoch": 1.27001953125, + "grad_norm": 1.7873236969539745, + "kl": 0.0601806640625, + "learning_rate": 6.826171875e-07, + "loss": 0.0024, + "reward": 1.729775309562683, + "reward_std": 0.11483496427536011, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7610252797603607, + "step": 2600 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.34375, + "epoch": 1.2705078125, + "grad_norm": 1.5860494003883214, + "kl": 0.0635986328125, + "learning_rate": 6.824951171875e-07, + "loss": 0.0025, + "reward": 1.619213342666626, + "reward_std": 0.13867055252194405, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6504633724689484, + "step": 2601 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.5625, + "epoch": 1.27099609375, + "grad_norm": 1.8263568249389657, + "kl": 0.0648193359375, + "learning_rate": 6.823730468749999e-07, + "loss": 0.0026, + "reward": 1.7087448835372925, + "reward_std": 0.059230593498796225, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7165573537349701, + "step": 2602 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.953125, + "epoch": 1.271484375, + "grad_norm": 2.021543206871508, + "kl": 0.081298828125, + "learning_rate": 6.822509765624999e-07, + "loss": 0.0033, + "reward": 1.6043951511383057, + "reward_std": 0.11108111217617989, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6278325915336609, + "step": 2603 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.984375, + "epoch": 1.27197265625, + "grad_norm": 3.424869485308043, + "kl": 0.072998046875, + "learning_rate": 6.8212890625e-07, + "loss": 0.0029, + "reward": 1.9290322065353394, + "reward_std": 0.13008323311805725, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.936844676733017, + "step": 2604 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.2734375, + "epoch": 1.2724609375, + "grad_norm": 0.8833183790651663, + "kl": 0.0531005859375, + "learning_rate": 6.820068359375e-07, + "loss": 0.0021, + "reward": 1.8816287517547607, + "reward_std": 0.05253131175413728, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8894413113594055, + "step": 2605 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.9609375, + "epoch": 1.27294921875, + "grad_norm": 1.57460282033083, + "kl": 0.0523681640625, + "learning_rate": 6.81884765625e-07, + "loss": 0.0021, + "reward": 1.7124788165092468, + "reward_std": 0.07889316231012344, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7202913463115692, + "step": 2606 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.6875, + "epoch": 1.2734375, + "grad_norm": 2.311076319577126, + "kl": 0.060546875, + "learning_rate": 6.817626953125e-07, + "loss": 0.0024, + "reward": 1.8965556025505066, + "reward_std": 0.060334792360663414, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8965555429458618, + "step": 2607 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.203125, + "epoch": 1.27392578125, + "grad_norm": 2.312918341316606, + "kl": 0.0556640625, + "learning_rate": 6.816406249999999e-07, + "loss": 0.0022, + "reward": 1.6706057786941528, + "reward_std": 0.15810733288526535, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.7331057786941528, + "step": 2608 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.6328125, + "epoch": 1.2744140625, + "grad_norm": 8.087988058753458, + "kl": 0.05908203125, + "learning_rate": 6.815185546874999e-07, + "loss": 0.0024, + "reward": 1.8144915699958801, + "reward_std": 0.04744442366063595, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8144915997982025, + "step": 2609 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.875, + "epoch": 1.27490234375, + "grad_norm": 0.5706401219102212, + "kl": 0.0506591796875, + "learning_rate": 6.81396484375e-07, + "loss": 0.002, + "reward": 1.7547515630722046, + "reward_std": 0.018506707157939672, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7547515630722046, + "step": 2610 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.4609375, + "epoch": 1.275390625, + "grad_norm": 12.49022768844792, + "kl": 0.059326171875, + "learning_rate": 6.812744140625e-07, + "loss": 0.0024, + "reward": 1.794619619846344, + "reward_std": 0.06346526741981506, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.794619619846344, + "step": 2611 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.828125, + "epoch": 1.27587890625, + "grad_norm": 1.0850062950816326, + "kl": 0.07958984375, + "learning_rate": 6.8115234375e-07, + "loss": 0.0032, + "reward": 1.7299774289131165, + "reward_std": 0.03509983792901039, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7299774289131165, + "step": 2612 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.71875, + "epoch": 1.2763671875, + "grad_norm": 1.080187026255585, + "kl": 0.0640869140625, + "learning_rate": 6.810302734375e-07, + "loss": 0.0026, + "reward": 1.8587305545806885, + "reward_std": 0.07435241714119911, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8587304651737213, + "step": 2613 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.125, + "epoch": 1.27685546875, + "grad_norm": 1.1601920921582176, + "kl": 0.060546875, + "learning_rate": 6.80908203125e-07, + "loss": 0.0024, + "reward": 1.8313266038894653, + "reward_std": 0.018857479095458984, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8313265740871429, + "step": 2614 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.34375, + "epoch": 1.27734375, + "grad_norm": 2.85155532714431, + "kl": 0.0675048828125, + "learning_rate": 6.807861328124999e-07, + "loss": 0.0027, + "reward": 1.8055160641670227, + "reward_std": 0.046973712742328644, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8055160045623779, + "step": 2615 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.84375, + "epoch": 1.27783203125, + "grad_norm": 4.513029644722789, + "kl": 0.0791015625, + "learning_rate": 6.806640624999999e-07, + "loss": 0.0032, + "reward": 1.7537464499473572, + "reward_std": 0.11804336681962013, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7615589499473572, + "step": 2616 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.4609375, + "epoch": 1.2783203125, + "grad_norm": 3.8194841411583798, + "kl": 0.053466796875, + "learning_rate": 6.805419921875e-07, + "loss": 0.0021, + "reward": 1.677466869354248, + "reward_std": 0.056195804849267006, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6774668991565704, + "step": 2617 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.2265625, + "epoch": 1.27880859375, + "grad_norm": 6.474962292439623, + "kl": 0.0562744140625, + "learning_rate": 6.80419921875e-07, + "loss": 0.0023, + "reward": 1.8547720909118652, + "reward_std": 0.025560058653354645, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8547720909118652, + "step": 2618 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.46875, + "epoch": 1.279296875, + "grad_norm": 1.4597995925565477, + "kl": 0.0693359375, + "learning_rate": 6.802978515625e-07, + "loss": 0.0028, + "reward": 1.6823553442955017, + "reward_std": 0.1308056991547346, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7136052846908569, + "step": 2619 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.5078125, + "epoch": 1.27978515625, + "grad_norm": 0.6324777562183502, + "kl": 0.052978515625, + "learning_rate": 6.8017578125e-07, + "loss": 0.0021, + "reward": 1.6706210374832153, + "reward_std": 0.05375996232032776, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6706210076808929, + "step": 2620 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.0078125, + "epoch": 1.2802734375, + "grad_norm": 1.2610636513041573, + "kl": 0.0584716796875, + "learning_rate": 6.800537109374999e-07, + "loss": 0.0023, + "reward": 1.7311273217201233, + "reward_std": 0.192430280148983, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7545647621154785, + "step": 2621 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.671875, + "epoch": 1.28076171875, + "grad_norm": 1.678965103507942, + "kl": 0.068603515625, + "learning_rate": 6.799316406249999e-07, + "loss": 0.0027, + "reward": 1.7555344700813293, + "reward_std": 0.048562128096818924, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7555344700813293, + "step": 2622 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.1875, + "epoch": 1.28125, + "grad_norm": 1.4249689682498725, + "kl": 0.0643310546875, + "learning_rate": 6.798095703125e-07, + "loss": 0.0026, + "reward": 1.7415629029273987, + "reward_std": 0.07066285982728004, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7415629029273987, + "step": 2623 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.1640625, + "epoch": 1.28173828125, + "grad_norm": 1.3604919034116547, + "kl": 0.0662841796875, + "learning_rate": 6.796875e-07, + "loss": 0.0027, + "reward": 1.841383457183838, + "reward_std": 0.037516459822654724, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8413834273815155, + "step": 2624 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.109375, + "epoch": 1.2822265625, + "grad_norm": 1.0918554991669291, + "kl": 0.0423583984375, + "learning_rate": 6.795654296875e-07, + "loss": 0.0017, + "reward": 1.810799479484558, + "reward_std": 0.05421273224055767, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8107994794845581, + "step": 2625 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.109375, + "epoch": 1.28271484375, + "grad_norm": 0.9904225454583597, + "kl": 0.06640625, + "learning_rate": 6.79443359375e-07, + "loss": 0.0027, + "reward": 1.7526730298995972, + "reward_std": 0.03807441703975201, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7526730000972748, + "step": 2626 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.765625, + "epoch": 1.283203125, + "grad_norm": 1.7982981484887894, + "kl": 0.104248046875, + "learning_rate": 6.793212890625e-07, + "loss": 0.0042, + "reward": 1.5381957292556763, + "reward_std": 0.23323528468608856, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.5850707292556763, + "step": 2627 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.453125, + "epoch": 1.28369140625, + "grad_norm": 1.550548686839029, + "kl": 0.085693359375, + "learning_rate": 6.791992187499999e-07, + "loss": 0.0034, + "reward": 1.7918881177902222, + "reward_std": 0.10955053754150867, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7997006177902222, + "step": 2628 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.5625, + "epoch": 1.2841796875, + "grad_norm": 1.6403304793757816, + "kl": 0.0595703125, + "learning_rate": 6.790771484374999e-07, + "loss": 0.0024, + "reward": 1.7768383026123047, + "reward_std": 0.0630792174488306, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7768383026123047, + "step": 2629 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.09375, + "epoch": 1.28466796875, + "grad_norm": 2.733466306796843, + "kl": 0.063232421875, + "learning_rate": 6.78955078125e-07, + "loss": 0.0025, + "reward": 1.7490355968475342, + "reward_std": 0.09908445179462433, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7568481266498566, + "step": 2630 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.4609375, + "epoch": 1.28515625, + "grad_norm": 0.5505330642135668, + "kl": 0.05712890625, + "learning_rate": 6.788330078125e-07, + "loss": 0.0023, + "reward": 1.785165786743164, + "reward_std": 0.05979756236774847, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8007907271385193, + "step": 2631 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.1640625, + "epoch": 1.28564453125, + "grad_norm": 2.009225132396737, + "kl": 0.075927734375, + "learning_rate": 6.787109375e-07, + "loss": 0.003, + "reward": 1.7671828269958496, + "reward_std": 0.06479834392666817, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7671828269958496, + "step": 2632 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.125, + "epoch": 1.2861328125, + "grad_norm": 1.7136252258972942, + "kl": 0.0494384765625, + "learning_rate": 6.785888671875e-07, + "loss": 0.002, + "reward": 1.8264079093933105, + "reward_std": 0.017682873643934727, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8264078795909882, + "step": 2633 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.1640625, + "epoch": 1.28662109375, + "grad_norm": 3.778508106643716, + "kl": 0.0667724609375, + "learning_rate": 6.78466796875e-07, + "loss": 0.0027, + "reward": 1.8192695379257202, + "reward_std": 0.055723583325743675, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.819269597530365, + "step": 2634 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.5625, + "epoch": 1.287109375, + "grad_norm": 2.0871013161582908, + "kl": 0.068603515625, + "learning_rate": 6.783447265624999e-07, + "loss": 0.0027, + "reward": 1.8692251443862915, + "reward_std": 0.03455257322639227, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8770376443862915, + "step": 2635 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.796875, + "epoch": 1.28759765625, + "grad_norm": 1.9442360354470494, + "kl": 0.0665283203125, + "learning_rate": 6.7822265625e-07, + "loss": 0.0027, + "reward": 1.8144180178642273, + "reward_std": 0.060984525829553604, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8144180476665497, + "step": 2636 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.0390625, + "epoch": 1.2880859375, + "grad_norm": 1.0365066351162422, + "kl": 0.0621337890625, + "learning_rate": 6.781005859375e-07, + "loss": 0.0025, + "reward": 1.7693456411361694, + "reward_std": 0.0630057118833065, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7693456411361694, + "step": 2637 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.7578125, + "epoch": 1.28857421875, + "grad_norm": 2.8683486825278046, + "kl": 0.056396484375, + "learning_rate": 6.77978515625e-07, + "loss": 0.0023, + "reward": 1.7881279587745667, + "reward_std": 0.058837566524744034, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7881280183792114, + "step": 2638 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.2734375, + "epoch": 1.2890625, + "grad_norm": 6.264612314146436, + "kl": 0.060302734375, + "learning_rate": 6.778564453125e-07, + "loss": 0.0024, + "reward": 1.6483284831047058, + "reward_std": 0.13759692385792732, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.663953423500061, + "step": 2639 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.234375, + "epoch": 1.28955078125, + "grad_norm": 1.5759987187071773, + "kl": 0.05615234375, + "learning_rate": 6.77734375e-07, + "loss": 0.0022, + "reward": 1.8786953687667847, + "reward_std": 0.024645724333822727, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8786953389644623, + "step": 2640 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.6328125, + "epoch": 1.2900390625, + "grad_norm": 2.6015249930837347, + "kl": 0.088134765625, + "learning_rate": 6.776123046874999e-07, + "loss": 0.0035, + "reward": 1.656754493713379, + "reward_std": 0.07588385604321957, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6567544937133789, + "step": 2641 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.875, + "epoch": 1.29052734375, + "grad_norm": 1.3132376713397804, + "kl": 0.0791015625, + "learning_rate": 6.774902343749999e-07, + "loss": 0.0032, + "reward": 1.7957526445388794, + "reward_std": 0.04471752420067787, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.795752614736557, + "step": 2642 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.828125, + "epoch": 1.291015625, + "grad_norm": 3.2946009869851753, + "kl": 0.0601806640625, + "learning_rate": 6.773681640625e-07, + "loss": 0.0024, + "reward": 1.650659203529358, + "reward_std": 0.1262562870979309, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6897217035293579, + "step": 2643 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.078125, + "epoch": 1.29150390625, + "grad_norm": 3.1092957108787003, + "kl": 0.0665283203125, + "learning_rate": 6.7724609375e-07, + "loss": 0.0027, + "reward": 1.7754952907562256, + "reward_std": 0.11444034799933434, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7989327907562256, + "step": 2644 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.40625, + "epoch": 1.2919921875, + "grad_norm": 12.661664724175285, + "kl": 0.077392578125, + "learning_rate": 6.771240234375e-07, + "loss": 0.0031, + "reward": 1.6563068628311157, + "reward_std": 0.10409623384475708, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6563068330287933, + "step": 2645 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.8671875, + "epoch": 1.29248046875, + "grad_norm": 0.6055988273645182, + "kl": 0.05224609375, + "learning_rate": 6.77001953125e-07, + "loss": 0.0021, + "reward": 1.7593251466751099, + "reward_std": 0.026515904814004898, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7593251466751099, + "step": 2646 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.5625, + "epoch": 1.29296875, + "grad_norm": 2.019699875883687, + "kl": 0.06591796875, + "learning_rate": 6.768798828125e-07, + "loss": 0.0026, + "reward": 1.6537410616874695, + "reward_std": 0.19559639692306519, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.7162410914897919, + "step": 2647 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.78125, + "epoch": 1.29345703125, + "grad_norm": 3.7763404898754587, + "kl": 0.068603515625, + "learning_rate": 6.767578124999999e-07, + "loss": 0.0027, + "reward": 1.827435851097107, + "reward_std": 0.044061899185180664, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8274357616901398, + "step": 2648 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.453125, + "epoch": 1.2939453125, + "grad_norm": 2.8015440153661784, + "kl": 0.065185546875, + "learning_rate": 6.766357421874999e-07, + "loss": 0.0026, + "reward": 1.6706737279891968, + "reward_std": 0.18939045071601868, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7175487279891968, + "step": 2649 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.1484375, + "epoch": 1.29443359375, + "grad_norm": 1.9989463705585409, + "kl": 0.078125, + "learning_rate": 6.76513671875e-07, + "loss": 0.0031, + "reward": 1.7752625942230225, + "reward_std": 0.06523648090660572, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7752625942230225, + "step": 2650 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.7578125, + "epoch": 1.294921875, + "grad_norm": 5.2627762529444055, + "kl": 0.0523681640625, + "learning_rate": 6.763916015625e-07, + "loss": 0.0021, + "reward": 1.7596194744110107, + "reward_std": 0.05819558724761009, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7596194744110107, + "step": 2651 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.1875, + "epoch": 1.29541015625, + "grad_norm": 1.193829150298933, + "kl": 0.054443359375, + "learning_rate": 6.7626953125e-07, + "loss": 0.0022, + "reward": 1.7631536722183228, + "reward_std": 0.11396730691194534, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.8022161424160004, + "step": 2652 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.75, + "epoch": 1.2958984375, + "grad_norm": 4.511945830839449, + "kl": 0.0693359375, + "learning_rate": 6.761474609375e-07, + "loss": 0.0028, + "reward": 1.6372665762901306, + "reward_std": 0.12859837338328362, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6763290762901306, + "step": 2653 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.015625, + "epoch": 1.29638671875, + "grad_norm": 7.337066580487183, + "kl": 0.088134765625, + "learning_rate": 6.760253906249999e-07, + "loss": 0.0035, + "reward": 1.770021915435791, + "reward_std": 0.0901176705956459, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.770021915435791, + "step": 2654 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.65625, + "epoch": 1.296875, + "grad_norm": 1.7138139340851553, + "kl": 0.054443359375, + "learning_rate": 6.759033203124999e-07, + "loss": 0.0022, + "reward": 1.6479786038398743, + "reward_std": 0.055697097443044186, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7026660740375519, + "step": 2655 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.328125, + "epoch": 1.29736328125, + "grad_norm": 1.157243381436146, + "kl": 0.067138671875, + "learning_rate": 6.7578125e-07, + "loss": 0.0027, + "reward": 1.7617986798286438, + "reward_std": 0.12362072244286537, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.8008612096309662, + "step": 2656 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.4140625, + "epoch": 1.2978515625, + "grad_norm": 12.730576035610571, + "kl": 0.070556640625, + "learning_rate": 6.756591796875e-07, + "loss": 0.0028, + "reward": 1.6338982582092285, + "reward_std": 0.0442785257473588, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6338982731103897, + "step": 2657 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.1015625, + "epoch": 1.29833984375, + "grad_norm": 1.0874774416423236, + "kl": 0.06689453125, + "learning_rate": 6.75537109375e-07, + "loss": 0.0027, + "reward": 1.7247290015220642, + "reward_std": 0.06158460769802332, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7247289717197418, + "step": 2658 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.2109375, + "epoch": 1.298828125, + "grad_norm": 1.0818257630262431, + "kl": 0.0645751953125, + "learning_rate": 6.754150390625e-07, + "loss": 0.0026, + "reward": 1.8425450325012207, + "reward_std": 0.10945501737296581, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8503575325012207, + "step": 2659 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.0703125, + "epoch": 1.29931640625, + "grad_norm": 14.407404536821112, + "kl": 0.0506591796875, + "learning_rate": 6.7529296875e-07, + "loss": 0.002, + "reward": 1.8501919507980347, + "reward_std": 0.09266996011137962, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8580044806003571, + "step": 2660 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.46875, + "epoch": 1.2998046875, + "grad_norm": 0.8514788037499841, + "kl": 0.052001953125, + "learning_rate": 6.751708984374999e-07, + "loss": 0.0021, + "reward": 1.7188128232955933, + "reward_std": 0.07580004632472992, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7266253232955933, + "step": 2661 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.25, + "epoch": 1.30029296875, + "grad_norm": 0.7970795376599247, + "kl": 0.066650390625, + "learning_rate": 6.750488281249999e-07, + "loss": 0.0027, + "reward": 1.7323698997497559, + "reward_std": 0.11393286287784576, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7870573997497559, + "step": 2662 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.4921875, + "epoch": 1.30078125, + "grad_norm": 2.1897551324280338, + "kl": 0.07568359375, + "learning_rate": 6.749267578125e-07, + "loss": 0.003, + "reward": 1.8059039115905762, + "reward_std": 0.060338267125189304, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8059038519859314, + "step": 2663 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.0390625, + "epoch": 1.30126953125, + "grad_norm": 2.9581165989544824, + "kl": 0.064453125, + "learning_rate": 6.748046875e-07, + "loss": 0.0026, + "reward": 1.7930954098701477, + "reward_std": 0.061221227049827576, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7930954694747925, + "step": 2664 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.15625, + "epoch": 1.3017578125, + "grad_norm": 1.6499047047869468, + "kl": 0.0540771484375, + "learning_rate": 6.746826171875e-07, + "loss": 0.0022, + "reward": 1.8659427165985107, + "reward_std": 0.06026996113359928, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8659427464008331, + "step": 2665 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.0625, + "epoch": 1.30224609375, + "grad_norm": 1.0955739275158136, + "kl": 0.07861328125, + "learning_rate": 6.74560546875e-07, + "loss": 0.0031, + "reward": 1.6215183734893799, + "reward_std": 0.05073964595794678, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6215183138847351, + "step": 2666 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.03125, + "epoch": 1.302734375, + "grad_norm": 1.8339905299535653, + "kl": 0.0643310546875, + "learning_rate": 6.744384765624999e-07, + "loss": 0.0026, + "reward": 1.716743528842926, + "reward_std": 0.10541088692843914, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.740181028842926, + "step": 2667 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.1640625, + "epoch": 1.30322265625, + "grad_norm": 1.3647709905403491, + "kl": 0.086181640625, + "learning_rate": 6.743164062499999e-07, + "loss": 0.0034, + "reward": 1.7058016657829285, + "reward_std": 0.029076790437102318, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7058016955852509, + "step": 2668 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.59375, + "epoch": 1.3037109375, + "grad_norm": 1.3669269567191786, + "kl": 0.044189453125, + "learning_rate": 6.741943359375e-07, + "loss": 0.0018, + "reward": 1.672289490699768, + "reward_std": 0.13343248516321182, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7113519906997681, + "step": 2669 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.0390625, + "epoch": 1.30419921875, + "grad_norm": 0.6783532224856341, + "kl": 0.053955078125, + "learning_rate": 6.74072265625e-07, + "loss": 0.0022, + "reward": 1.7751038670539856, + "reward_std": 0.0778466984629631, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7985413074493408, + "step": 2670 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.03125, + "epoch": 1.3046875, + "grad_norm": 4.320924526476853, + "kl": 0.072021484375, + "learning_rate": 6.739501953125e-07, + "loss": 0.0029, + "reward": 1.6213982105255127, + "reward_std": 0.0660770833492279, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6213981807231903, + "step": 2671 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.609375, + "epoch": 1.30517578125, + "grad_norm": 1.4258907778375465, + "kl": 0.061279296875, + "learning_rate": 6.73828125e-07, + "loss": 0.0024, + "reward": 1.8601597547531128, + "reward_std": 0.11782187595963478, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8679722547531128, + "step": 2672 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.890625, + "epoch": 1.3056640625, + "grad_norm": 1.724437541506563, + "kl": 0.06787109375, + "learning_rate": 6.737060546875e-07, + "loss": 0.0027, + "reward": 1.786941945552826, + "reward_std": 0.0738075040280819, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7869419753551483, + "step": 2673 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.8671875, + "epoch": 1.30615234375, + "grad_norm": 1.0501594228457007, + "kl": 0.062255859375, + "learning_rate": 6.735839843749999e-07, + "loss": 0.0025, + "reward": 1.762403666973114, + "reward_std": 0.05871861148625612, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7702161371707916, + "step": 2674 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.015625, + "epoch": 1.306640625, + "grad_norm": 2.8131149989049526, + "kl": 0.069580078125, + "learning_rate": 6.734619140624999e-07, + "loss": 0.0028, + "reward": 1.651845395565033, + "reward_std": 0.13027137517929077, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.667470395565033, + "step": 2675 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.078125, + "epoch": 1.30712890625, + "grad_norm": 3.330367088072933, + "kl": 0.086181640625, + "learning_rate": 6.7333984375e-07, + "loss": 0.0034, + "reward": 1.806040346622467, + "reward_std": 0.043943583965301514, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8060402572154999, + "step": 2676 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.9609375, + "epoch": 1.3076171875, + "grad_norm": 10.765893801480635, + "kl": 0.063720703125, + "learning_rate": 6.732177734375e-07, + "loss": 0.0026, + "reward": 1.7391871809959412, + "reward_std": 0.05080571398139, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7391871213912964, + "step": 2677 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.3203125, + "epoch": 1.30810546875, + "grad_norm": 1.4238084568541345, + "kl": 0.0606689453125, + "learning_rate": 6.73095703125e-07, + "loss": 0.0024, + "reward": 1.8248883485794067, + "reward_std": 0.037258436903357506, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8248883485794067, + "step": 2678 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.3046875, + "epoch": 1.30859375, + "grad_norm": 1.4907117084185106, + "kl": 0.0635986328125, + "learning_rate": 6.729736328125e-07, + "loss": 0.0025, + "reward": 1.7909139394760132, + "reward_std": 0.06092044711112976, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7909139692783356, + "step": 2679 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.5703125, + "epoch": 1.30908203125, + "grad_norm": 2.0791465208702387, + "kl": 0.0679931640625, + "learning_rate": 6.728515624999999e-07, + "loss": 0.0027, + "reward": 1.7821356058120728, + "reward_std": 0.06090878788381815, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7821356058120728, + "step": 2680 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.0859375, + "epoch": 1.3095703125, + "grad_norm": 1.4915135016440937, + "kl": 0.0634765625, + "learning_rate": 6.727294921874999e-07, + "loss": 0.0025, + "reward": 1.8791195154190063, + "reward_std": 0.02637836430221796, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8791195452213287, + "step": 2681 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.921875, + "epoch": 1.31005859375, + "grad_norm": 2.3070951051177553, + "kl": 0.076171875, + "learning_rate": 6.72607421875e-07, + "loss": 0.003, + "reward": 1.757739543914795, + "reward_std": 0.06817848235368729, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7577394843101501, + "step": 2682 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.5234375, + "epoch": 1.310546875, + "grad_norm": 1.4622842311116624, + "kl": 0.0673828125, + "learning_rate": 6.724853515625e-07, + "loss": 0.0027, + "reward": 1.7414976358413696, + "reward_std": 0.022878904826939106, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7414976358413696, + "step": 2683 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.828125, + "epoch": 1.31103515625, + "grad_norm": 1.450865157370279, + "kl": 0.058349609375, + "learning_rate": 6.7236328125e-07, + "loss": 0.0023, + "reward": 1.7220868468284607, + "reward_std": 0.045497006736695766, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7220868170261383, + "step": 2684 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.9375, + "epoch": 1.3115234375, + "grad_norm": 4.128346398936986, + "kl": 0.142822265625, + "learning_rate": 6.722412109375e-07, + "loss": 0.0057, + "reward": 1.8581845164299011, + "reward_std": 0.0643857903778553, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8581845462322235, + "step": 2685 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.1171875, + "epoch": 1.31201171875, + "grad_norm": 2.486707810004422, + "kl": 0.0672607421875, + "learning_rate": 6.72119140625e-07, + "loss": 0.0027, + "reward": 1.81499183177948, + "reward_std": 0.04949922952800989, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.81499183177948, + "step": 2686 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.03125, + "epoch": 1.3125, + "grad_norm": 1.7415850172344802, + "kl": 0.079345703125, + "learning_rate": 6.719970703124999e-07, + "loss": 0.0032, + "reward": 1.8236759305000305, + "reward_std": 0.038199277594685555, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8236759305000305, + "step": 2687 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.2890625, + "epoch": 1.31298828125, + "grad_norm": 4.382732157563532, + "kl": 0.07080078125, + "learning_rate": 6.718749999999999e-07, + "loss": 0.0028, + "reward": 1.7863699793815613, + "reward_std": 0.09564121440052986, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.786370038986206, + "step": 2688 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.421875, + "epoch": 1.3134765625, + "grad_norm": 1.9957093212811292, + "kl": 0.06494140625, + "learning_rate": 6.717529296875e-07, + "loss": 0.0026, + "reward": 1.729765772819519, + "reward_std": 0.058095003478229046, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.729765772819519, + "step": 2689 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.09375, + "epoch": 1.31396484375, + "grad_norm": 1.5829270533122184, + "kl": 0.05908203125, + "learning_rate": 6.71630859375e-07, + "loss": 0.0024, + "reward": 1.7915682792663574, + "reward_std": 0.06929146684706211, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7915682792663574, + "step": 2690 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.1015625, + "epoch": 1.314453125, + "grad_norm": 0.9886014203631965, + "kl": 0.06201171875, + "learning_rate": 6.715087890625e-07, + "loss": 0.0025, + "reward": 1.7871447801589966, + "reward_std": 0.08929637633264065, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.810582309961319, + "step": 2691 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.1640625, + "epoch": 1.31494140625, + "grad_norm": 1.0123772399132263, + "kl": 0.072509765625, + "learning_rate": 6.7138671875e-07, + "loss": 0.0029, + "reward": 1.6426368355751038, + "reward_std": 0.0795272197574377, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.642636775970459, + "step": 2692 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.0, + "epoch": 1.3154296875, + "grad_norm": 0.8408158657981815, + "kl": 0.0609130859375, + "learning_rate": 6.712646484374999e-07, + "loss": 0.0024, + "reward": 1.782248616218567, + "reward_std": 0.026081452146172523, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7822486758232117, + "step": 2693 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.109375, + "epoch": 1.31591796875, + "grad_norm": 1.1570138238531078, + "kl": 0.068359375, + "learning_rate": 6.711425781249999e-07, + "loss": 0.0027, + "reward": 1.9411388635635376, + "reward_std": 0.07927910797297955, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.941138744354248, + "step": 2694 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.75, + "epoch": 1.31640625, + "grad_norm": 6.129173902050601, + "kl": 0.0728759765625, + "learning_rate": 6.710205078125e-07, + "loss": 0.0029, + "reward": 1.7619558572769165, + "reward_std": 0.060450656339526176, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7619557976722717, + "step": 2695 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.09375, + "epoch": 1.31689453125, + "grad_norm": 2.0134898217947166, + "kl": 0.07568359375, + "learning_rate": 6.708984375e-07, + "loss": 0.003, + "reward": 1.7178034782409668, + "reward_std": 0.08819794841110706, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7256160378456116, + "step": 2696 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.65625, + "epoch": 1.3173828125, + "grad_norm": 2.2035043252857203, + "kl": 0.075927734375, + "learning_rate": 6.707763671875e-07, + "loss": 0.003, + "reward": 1.7355281710624695, + "reward_std": 0.13643942587077618, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7667781114578247, + "step": 2697 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.8359375, + "epoch": 1.31787109375, + "grad_norm": 1.556741176565315, + "kl": 0.0655517578125, + "learning_rate": 6.70654296875e-07, + "loss": 0.0026, + "reward": 1.8553110361099243, + "reward_std": 0.14732931554317474, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8553110957145691, + "step": 2698 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.515625, + "epoch": 1.318359375, + "grad_norm": 0.8254256278213256, + "kl": 0.0787353515625, + "learning_rate": 6.705322265625e-07, + "loss": 0.0032, + "reward": 1.6898934841156006, + "reward_std": 0.02993142046034336, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6898934543132782, + "step": 2699 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.0234375, + "epoch": 1.31884765625, + "grad_norm": 1.8226982226518016, + "kl": 0.0614013671875, + "learning_rate": 6.704101562499999e-07, + "loss": 0.0025, + "reward": 1.717129111289978, + "reward_std": 0.03285204339772463, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.717129111289978, + "step": 2700 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.8984375, + "epoch": 1.3193359375, + "grad_norm": 12.341387634183427, + "kl": 0.076416015625, + "learning_rate": 6.702880859374999e-07, + "loss": 0.0031, + "reward": 1.725698471069336, + "reward_std": 0.06217564269900322, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7256983816623688, + "step": 2701 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.1875, + "epoch": 1.31982421875, + "grad_norm": 0.9525369914253854, + "kl": 0.060546875, + "learning_rate": 6.70166015625e-07, + "loss": 0.0024, + "reward": 1.8281516432762146, + "reward_std": 0.03971204720437527, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8281517326831818, + "step": 2702 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.1015625, + "epoch": 1.3203125, + "grad_norm": 1.2358295612242862, + "kl": 0.072265625, + "learning_rate": 6.700439453125e-07, + "loss": 0.0029, + "reward": 1.8739069700241089, + "reward_std": 0.04518134891986847, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8739069700241089, + "step": 2703 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.8359375, + "epoch": 1.32080078125, + "grad_norm": 1.6234402625579745, + "kl": 0.077880859375, + "learning_rate": 6.69921875e-07, + "loss": 0.0031, + "reward": 1.6680772304534912, + "reward_std": 0.03605229314416647, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6680772304534912, + "step": 2704 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.234375, + "epoch": 1.3212890625, + "grad_norm": 1.019102997793149, + "kl": 0.08935546875, + "learning_rate": 6.697998046875e-07, + "loss": 0.0036, + "reward": 1.7389346361160278, + "reward_std": 0.04351983033120632, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7389346957206726, + "step": 2705 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.5, + "epoch": 1.32177734375, + "grad_norm": 2.8813575124650197, + "kl": 0.07177734375, + "learning_rate": 6.696777343749999e-07, + "loss": 0.0029, + "reward": 1.7355643510818481, + "reward_std": 0.03399805910885334, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7355643510818481, + "step": 2706 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.9765625, + "epoch": 1.322265625, + "grad_norm": 2.810469992608748, + "kl": 0.069091796875, + "learning_rate": 6.695556640624999e-07, + "loss": 0.0028, + "reward": 1.841668725013733, + "reward_std": 0.06313896924257278, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8416686952114105, + "step": 2707 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.8515625, + "epoch": 1.32275390625, + "grad_norm": 1.4321682955957, + "kl": 0.054443359375, + "learning_rate": 6.6943359375e-07, + "loss": 0.0022, + "reward": 1.8613044619560242, + "reward_std": 0.023028030525892973, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8613044023513794, + "step": 2708 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.03125, + "epoch": 1.3232421875, + "grad_norm": 1.1730734025210117, + "kl": 0.064453125, + "learning_rate": 6.693115234375e-07, + "loss": 0.0026, + "reward": 1.7823152542114258, + "reward_std": 0.09651216119527817, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7979402244091034, + "step": 2709 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.28125, + "epoch": 1.32373046875, + "grad_norm": 8.059925351400762, + "kl": 0.09912109375, + "learning_rate": 6.69189453125e-07, + "loss": 0.004, + "reward": 1.7996364831924438, + "reward_std": 0.07981680566444993, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8152614533901215, + "step": 2710 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.3046875, + "epoch": 1.32421875, + "grad_norm": 1.3591185961038974, + "kl": 0.082275390625, + "learning_rate": 6.690673828125e-07, + "loss": 0.0033, + "reward": 1.7113600969314575, + "reward_std": 0.042755890637636185, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7113600373268127, + "step": 2711 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.7265625, + "epoch": 1.32470703125, + "grad_norm": 2.8367504069887737, + "kl": 0.0732421875, + "learning_rate": 6.689453125e-07, + "loss": 0.0029, + "reward": 1.7709915041923523, + "reward_std": 0.04804490879178047, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7709915339946747, + "step": 2712 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.34375, + "epoch": 1.3251953125, + "grad_norm": 2.423950195941969, + "kl": 0.048828125, + "learning_rate": 6.688232421874999e-07, + "loss": 0.002, + "reward": 1.8260767459869385, + "reward_std": 0.054210664704442024, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8260767161846161, + "step": 2713 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.8671875, + "epoch": 1.32568359375, + "grad_norm": 1.2894446032385674, + "kl": 0.0791015625, + "learning_rate": 6.687011718749999e-07, + "loss": 0.0032, + "reward": 1.7933790683746338, + "reward_std": 0.048833588138222694, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7933790981769562, + "step": 2714 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.09375, + "epoch": 1.326171875, + "grad_norm": 5.206795373840309, + "kl": 0.067626953125, + "learning_rate": 6.685791015625e-07, + "loss": 0.0027, + "reward": 1.712072491645813, + "reward_std": 0.08722497709095478, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7120724618434906, + "step": 2715 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.890625, + "epoch": 1.32666015625, + "grad_norm": 1.567961181256668, + "kl": 0.060546875, + "learning_rate": 6.6845703125e-07, + "loss": 0.0024, + "reward": 1.7127328515052795, + "reward_std": 0.05422433838248253, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7127328813076019, + "step": 2716 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.25, + "epoch": 1.3271484375, + "grad_norm": 1.3532097514243422, + "kl": 0.0728759765625, + "learning_rate": 6.683349609375e-07, + "loss": 0.0029, + "reward": 1.730715036392212, + "reward_std": 0.04247327148914337, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7307150363922119, + "step": 2717 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.703125, + "epoch": 1.32763671875, + "grad_norm": 2.413627159184983, + "kl": 0.08154296875, + "learning_rate": 6.68212890625e-07, + "loss": 0.0033, + "reward": 1.7469477653503418, + "reward_std": 0.15270064398646355, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7703852355480194, + "step": 2718 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.9375, + "epoch": 1.328125, + "grad_norm": 6.093449713291131, + "kl": 0.0732421875, + "learning_rate": 6.680908203125e-07, + "loss": 0.0029, + "reward": 1.7299081683158875, + "reward_std": 0.0472866240888834, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7299081385135651, + "step": 2719 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.8984375, + "epoch": 1.32861328125, + "grad_norm": 4.912267833376837, + "kl": 0.078125, + "learning_rate": 6.679687499999999e-07, + "loss": 0.0031, + "reward": 1.8125880360603333, + "reward_std": 0.06944674998521805, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8125880360603333, + "step": 2720 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.7890625, + "epoch": 1.3291015625, + "grad_norm": 1.4881380474177748, + "kl": 0.062744140625, + "learning_rate": 6.678466796875e-07, + "loss": 0.0025, + "reward": 1.7330250144004822, + "reward_std": 0.08363675326108932, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7408375144004822, + "step": 2721 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.390625, + "epoch": 1.32958984375, + "grad_norm": 1.4668229727045794, + "kl": 0.07763671875, + "learning_rate": 6.67724609375e-07, + "loss": 0.0031, + "reward": 1.805801808834076, + "reward_std": 0.06017332337796688, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8058017492294312, + "step": 2722 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.0078125, + "epoch": 1.330078125, + "grad_norm": 2.118197040137688, + "kl": 0.058349609375, + "learning_rate": 6.676025390625e-07, + "loss": 0.0023, + "reward": 1.778078854084015, + "reward_std": 0.09711121767759323, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8015162944793701, + "step": 2723 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.4921875, + "epoch": 1.33056640625, + "grad_norm": 0.7593383155856351, + "kl": 0.083984375, + "learning_rate": 6.6748046875e-07, + "loss": 0.0034, + "reward": 1.5734055638313293, + "reward_std": 0.08365354500710964, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.5890305191278458, + "step": 2724 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.7421875, + "epoch": 1.3310546875, + "grad_norm": 14.803652390678243, + "kl": 0.075439453125, + "learning_rate": 6.673583984375e-07, + "loss": 0.003, + "reward": 1.8133496046066284, + "reward_std": 0.01667138608172536, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8133496046066284, + "step": 2725 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.0703125, + "epoch": 1.33154296875, + "grad_norm": 2.120863946435663, + "kl": 0.070068359375, + "learning_rate": 6.672363281249999e-07, + "loss": 0.0028, + "reward": 1.6798765659332275, + "reward_std": 0.12814366817474365, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6876890957355499, + "step": 2726 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.1484375, + "epoch": 1.33203125, + "grad_norm": 0.7789537405347509, + "kl": 0.068603515625, + "learning_rate": 6.671142578124999e-07, + "loss": 0.0027, + "reward": 1.7501285672187805, + "reward_std": 0.02368486486375332, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7501285076141357, + "step": 2727 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.7265625, + "epoch": 1.33251953125, + "grad_norm": 1.6295888889315164, + "kl": 0.075927734375, + "learning_rate": 6.669921875e-07, + "loss": 0.003, + "reward": 1.6839573979377747, + "reward_std": 0.04334849305450916, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6839573085308075, + "step": 2728 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.875, + "epoch": 1.3330078125, + "grad_norm": 1.6509903224943192, + "kl": 0.0718994140625, + "learning_rate": 6.668701171875e-07, + "loss": 0.0029, + "reward": 1.8505354523658752, + "reward_std": 0.033067792654037476, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8505354523658752, + "step": 2729 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.421875, + "epoch": 1.33349609375, + "grad_norm": 0.7796283131638133, + "kl": 0.08837890625, + "learning_rate": 6.66748046875e-07, + "loss": 0.0035, + "reward": 1.7302683591842651, + "reward_std": 0.06862842850387096, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7380808293819427, + "step": 2730 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.4453125, + "epoch": 1.333984375, + "grad_norm": 1.4712873612302482, + "kl": 0.05859375, + "learning_rate": 6.666259765625e-07, + "loss": 0.0023, + "reward": 1.8550618886947632, + "reward_std": 0.09110767394304276, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8550618886947632, + "step": 2731 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.4140625, + "epoch": 1.33447265625, + "grad_norm": 3.881608704348827, + "kl": 0.085693359375, + "learning_rate": 6.6650390625e-07, + "loss": 0.0034, + "reward": 1.7282916903495789, + "reward_std": 0.10343683697283268, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7517292201519012, + "step": 2732 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.9921875, + "epoch": 1.3349609375, + "grad_norm": 1.0148415053766122, + "kl": 0.069091796875, + "learning_rate": 6.663818359374999e-07, + "loss": 0.0028, + "reward": 1.8618406057357788, + "reward_std": 0.024386493489146233, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8618406653404236, + "step": 2733 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.609375, + "epoch": 1.33544921875, + "grad_norm": 1.0975253206002726, + "kl": 0.07275390625, + "learning_rate": 6.66259765625e-07, + "loss": 0.0029, + "reward": 1.7836071848869324, + "reward_std": 0.0855344720184803, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7914197146892548, + "step": 2734 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.921875, + "epoch": 1.3359375, + "grad_norm": 2.3233855799239578, + "kl": 0.069580078125, + "learning_rate": 6.661376953125e-07, + "loss": 0.0028, + "reward": 1.7295535802841187, + "reward_std": 0.08530437387526035, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7529910504817963, + "step": 2735 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.0625, + "epoch": 1.33642578125, + "grad_norm": 0.9699266686728515, + "kl": 0.077392578125, + "learning_rate": 6.66015625e-07, + "loss": 0.0031, + "reward": 1.841222882270813, + "reward_std": 0.0592461503110826, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.841222882270813, + "step": 2736 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.046875, + "epoch": 1.3369140625, + "grad_norm": 2.7201637371137823, + "kl": 0.092529296875, + "learning_rate": 6.658935546875e-07, + "loss": 0.0037, + "reward": 1.7343988418579102, + "reward_std": 0.1262606419622898, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7656488418579102, + "step": 2737 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.6875, + "epoch": 1.33740234375, + "grad_norm": 2.8247996935464554, + "kl": 0.0908203125, + "learning_rate": 6.65771484375e-07, + "loss": 0.0036, + "reward": 1.7593631744384766, + "reward_std": 0.03225879417732358, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7593631744384766, + "step": 2738 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.5, + "epoch": 1.337890625, + "grad_norm": 1.160437436072484, + "kl": 0.0732421875, + "learning_rate": 6.656494140624999e-07, + "loss": 0.0029, + "reward": 1.7741823196411133, + "reward_std": 0.047216689214110374, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7741822898387909, + "step": 2739 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.953125, + "epoch": 1.33837890625, + "grad_norm": 2.843629986590572, + "kl": 0.07861328125, + "learning_rate": 6.655273437499999e-07, + "loss": 0.0031, + "reward": 1.694653332233429, + "reward_std": 0.07069635391235352, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6946533024311066, + "step": 2740 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.390625, + "epoch": 1.3388671875, + "grad_norm": 3.22653638503277, + "kl": 0.0821533203125, + "learning_rate": 6.654052734375e-07, + "loss": 0.0033, + "reward": 1.834531307220459, + "reward_std": 0.0343925547786057, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8345312476158142, + "step": 2741 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.625, + "epoch": 1.33935546875, + "grad_norm": 5.463175138878789, + "kl": 0.062744140625, + "learning_rate": 6.65283203125e-07, + "loss": 0.0025, + "reward": 1.814225673675537, + "reward_std": 0.024931567488238215, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8142256438732147, + "step": 2742 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.9765625, + "epoch": 1.33984375, + "grad_norm": 5.997176370030234, + "kl": 0.07666015625, + "learning_rate": 6.651611328125e-07, + "loss": 0.0031, + "reward": 1.778179109096527, + "reward_std": 0.08071616850793362, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7781790792942047, + "step": 2743 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.953125, + "epoch": 1.34033203125, + "grad_norm": 2.1108738481028824, + "kl": 0.0687255859375, + "learning_rate": 6.650390625e-07, + "loss": 0.0027, + "reward": 1.6546780467033386, + "reward_std": 0.04719951003789902, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6546780467033386, + "step": 2744 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.859375, + "epoch": 1.3408203125, + "grad_norm": 1.189954861287409, + "kl": 0.077880859375, + "learning_rate": 6.649169921875e-07, + "loss": 0.0031, + "reward": 1.722962737083435, + "reward_std": 0.10328607633709908, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7385877668857574, + "step": 2745 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.359375, + "epoch": 1.34130859375, + "grad_norm": 2.3195408485561986, + "kl": 0.078369140625, + "learning_rate": 6.647949218749999e-07, + "loss": 0.0031, + "reward": 1.7592060565948486, + "reward_std": 0.03395948093384504, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.759206086397171, + "step": 2746 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.640625, + "epoch": 1.341796875, + "grad_norm": 0.762429658572133, + "kl": 0.06884765625, + "learning_rate": 6.646728515625e-07, + "loss": 0.0028, + "reward": 1.813932180404663, + "reward_std": 0.09360839053988457, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8373695611953735, + "step": 2747 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.75, + "epoch": 1.34228515625, + "grad_norm": 3.539568556920886, + "kl": 0.0634765625, + "learning_rate": 6.6455078125e-07, + "loss": 0.0025, + "reward": 1.750356376171112, + "reward_std": 0.049200138077139854, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7503563463687897, + "step": 2748 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.140625, + "epoch": 1.3427734375, + "grad_norm": 3.9639963865650247, + "kl": 0.0772705078125, + "learning_rate": 6.644287109375e-07, + "loss": 0.0031, + "reward": 1.6544407606124878, + "reward_std": 0.07485324889421463, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6544407606124878, + "step": 2749 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.546875, + "epoch": 1.34326171875, + "grad_norm": 7.157519189741742, + "kl": 0.0570068359375, + "learning_rate": 6.64306640625e-07, + "loss": 0.0023, + "reward": 1.7769380807876587, + "reward_std": 0.12172123789787292, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7925631105899811, + "step": 2750 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.296875, + "epoch": 1.34375, + "grad_norm": 1.380768515593198, + "kl": 0.053466796875, + "learning_rate": 6.641845703125e-07, + "loss": 0.0021, + "reward": 1.6924372911453247, + "reward_std": 0.11786646395921707, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7314998209476471, + "step": 2751 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.1875, + "epoch": 1.34423828125, + "grad_norm": 2.166002705894626, + "kl": 0.0631103515625, + "learning_rate": 6.640624999999999e-07, + "loss": 0.0025, + "reward": 1.8119662404060364, + "reward_std": 0.0506830308586359, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8119662702083588, + "step": 2752 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.6875, + "epoch": 1.3447265625, + "grad_norm": 24.943229867842106, + "kl": 0.0791015625, + "learning_rate": 6.639404296874999e-07, + "loss": 0.0032, + "reward": 1.8568952083587646, + "reward_std": 0.04123697895556688, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8568951487541199, + "step": 2753 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.03125, + "epoch": 1.34521484375, + "grad_norm": 1.8708805947768723, + "kl": 0.090576171875, + "learning_rate": 6.63818359375e-07, + "loss": 0.0036, + "reward": 1.7188897132873535, + "reward_std": 0.056853797286748886, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7188896536827087, + "step": 2754 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.5078125, + "epoch": 1.345703125, + "grad_norm": 2.439239557617475, + "kl": 0.0806884765625, + "learning_rate": 6.636962890625e-07, + "loss": 0.0032, + "reward": 1.7047572135925293, + "reward_std": 0.18556798994541168, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7438197731971741, + "step": 2755 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.171875, + "epoch": 1.34619140625, + "grad_norm": 2.160190526404616, + "kl": 0.0657958984375, + "learning_rate": 6.6357421875e-07, + "loss": 0.0026, + "reward": 1.847311556339264, + "reward_std": 0.07765659689903259, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8551241159439087, + "step": 2756 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.9921875, + "epoch": 1.3466796875, + "grad_norm": 2.2755146480765056, + "kl": 0.078857421875, + "learning_rate": 6.634521484375e-07, + "loss": 0.0031, + "reward": 1.8561453819274902, + "reward_std": 0.04817195236682892, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8561453819274902, + "step": 2757 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.390625, + "epoch": 1.34716796875, + "grad_norm": 4.259565415030043, + "kl": 0.064697265625, + "learning_rate": 6.63330078125e-07, + "loss": 0.0026, + "reward": 1.7508844137191772, + "reward_std": 0.14695337787270546, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7665094435214996, + "step": 2758 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.4296875, + "epoch": 1.34765625, + "grad_norm": 1.761996511284053, + "kl": 0.07861328125, + "learning_rate": 6.632080078124999e-07, + "loss": 0.0031, + "reward": 1.8259143233299255, + "reward_std": 0.11976262181997299, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8337267935276031, + "step": 2759 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.109375, + "epoch": 1.34814453125, + "grad_norm": 1.1530002116392095, + "kl": 0.075439453125, + "learning_rate": 6.630859374999999e-07, + "loss": 0.003, + "reward": 1.8094568252563477, + "reward_std": 0.09631854109466076, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8407068252563477, + "step": 2760 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.8671875, + "epoch": 1.3486328125, + "grad_norm": 1.9409396087042068, + "kl": 0.0567626953125, + "learning_rate": 6.629638671875e-07, + "loss": 0.0023, + "reward": 1.8710192441940308, + "reward_std": 0.061006827279925346, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8710193037986755, + "step": 2761 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.1171875, + "epoch": 1.34912109375, + "grad_norm": 0.7799863900997498, + "kl": 0.085693359375, + "learning_rate": 6.62841796875e-07, + "loss": 0.0034, + "reward": 1.77259761095047, + "reward_std": 0.05002701282501221, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7725976407527924, + "step": 2762 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.484375, + "epoch": 1.349609375, + "grad_norm": 2.9890180449063792, + "kl": 0.072509765625, + "learning_rate": 6.627197265625e-07, + "loss": 0.0029, + "reward": 1.758193016052246, + "reward_std": 0.11531753093004227, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7581930160522461, + "step": 2763 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.0234375, + "epoch": 1.35009765625, + "grad_norm": 3.384788495208568, + "kl": 0.090087890625, + "learning_rate": 6.6259765625e-07, + "loss": 0.0036, + "reward": 1.7781252264976501, + "reward_std": 0.04981714114546776, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7781251966953278, + "step": 2764 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.109375, + "epoch": 1.3505859375, + "grad_norm": 2.0776218039129155, + "kl": 0.08349609375, + "learning_rate": 6.624755859374999e-07, + "loss": 0.0033, + "reward": 1.7179874777793884, + "reward_std": 0.06420490704476833, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7179875075817108, + "step": 2765 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.3203125, + "epoch": 1.35107421875, + "grad_norm": 1.3042813145233396, + "kl": 0.071533203125, + "learning_rate": 6.623535156249999e-07, + "loss": 0.0029, + "reward": 1.706727385520935, + "reward_std": 0.04921235144138336, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7067274153232574, + "step": 2766 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.0625, + "epoch": 1.3515625, + "grad_norm": 0.8583618689491556, + "kl": 0.05859375, + "learning_rate": 6.622314453125e-07, + "loss": 0.0023, + "reward": 1.8275976777076721, + "reward_std": 0.051028769463300705, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8275976479053497, + "step": 2767 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.046875, + "epoch": 1.35205078125, + "grad_norm": 1.3873179964338491, + "kl": 0.0771484375, + "learning_rate": 6.62109375e-07, + "loss": 0.0031, + "reward": 1.7254577279090881, + "reward_std": 0.12088143825531006, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7254576981067657, + "step": 2768 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.703125, + "epoch": 1.3525390625, + "grad_norm": 1.5492890007706852, + "kl": 0.09521484375, + "learning_rate": 6.619873046875e-07, + "loss": 0.0038, + "reward": 1.7280957102775574, + "reward_std": 0.10262476652860641, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7359082102775574, + "step": 2769 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.53125, + "epoch": 1.35302734375, + "grad_norm": 2.3373926002613374, + "kl": 0.065185546875, + "learning_rate": 6.61865234375e-07, + "loss": 0.0026, + "reward": 1.7113505005836487, + "reward_std": 0.056995073333382607, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7113505005836487, + "step": 2770 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.609375, + "epoch": 1.353515625, + "grad_norm": 4.3744425979305355, + "kl": 0.0986328125, + "learning_rate": 6.617431640625e-07, + "loss": 0.0039, + "reward": 1.7181638479232788, + "reward_std": 0.08497333526611328, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7181638479232788, + "step": 2771 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.96875, + "epoch": 1.35400390625, + "grad_norm": 14.857357422003494, + "kl": 0.0657958984375, + "learning_rate": 6.616210937499999e-07, + "loss": 0.0026, + "reward": 1.888843595981598, + "reward_std": 0.046841708943247795, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8888436257839203, + "step": 2772 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.734375, + "epoch": 1.3544921875, + "grad_norm": 1.8069280935451315, + "kl": 0.07373046875, + "learning_rate": 6.614990234374999e-07, + "loss": 0.0029, + "reward": 1.868907868862152, + "reward_std": 0.05142470262944698, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8689078092575073, + "step": 2773 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.4609375, + "epoch": 1.35498046875, + "grad_norm": 1.9204075579327249, + "kl": 0.069091796875, + "learning_rate": 6.61376953125e-07, + "loss": 0.0028, + "reward": 1.8886531591415405, + "reward_std": 0.08299789018929005, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8886531889438629, + "step": 2774 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.4140625, + "epoch": 1.35546875, + "grad_norm": 1.6906533792821081, + "kl": 0.0618896484375, + "learning_rate": 6.612548828125e-07, + "loss": 0.0025, + "reward": 1.8281062841415405, + "reward_std": 0.094516322016716, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8437312543392181, + "step": 2775 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.765625, + "epoch": 1.35595703125, + "grad_norm": 1.6386886335437982, + "kl": 0.09033203125, + "learning_rate": 6.611328125e-07, + "loss": 0.0036, + "reward": 1.6473196148872375, + "reward_std": 0.07153589557856321, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6473196148872375, + "step": 2776 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.6328125, + "epoch": 1.3564453125, + "grad_norm": 3.776389069468664, + "kl": 0.0716552734375, + "learning_rate": 6.610107421875e-07, + "loss": 0.0029, + "reward": 1.7405164241790771, + "reward_std": 0.11073359847068787, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7483289241790771, + "step": 2777 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.328125, + "epoch": 1.35693359375, + "grad_norm": 1.1085765846395907, + "kl": 0.0662841796875, + "learning_rate": 6.608886718749999e-07, + "loss": 0.0027, + "reward": 1.8612353205680847, + "reward_std": 0.05960194766521454, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8612352311611176, + "step": 2778 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.8359375, + "epoch": 1.357421875, + "grad_norm": 1.1688355930446612, + "kl": 0.075927734375, + "learning_rate": 6.607666015624999e-07, + "loss": 0.003, + "reward": 1.668241798877716, + "reward_std": 0.0824052020907402, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6682417988777161, + "step": 2779 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.921875, + "epoch": 1.35791015625, + "grad_norm": 1.80133842186887, + "kl": 0.0592041015625, + "learning_rate": 6.6064453125e-07, + "loss": 0.0024, + "reward": 1.8754128217697144, + "reward_std": 0.02819860354065895, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.875412791967392, + "step": 2780 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.015625, + "epoch": 1.3583984375, + "grad_norm": 2.9845705873686876, + "kl": 0.082763671875, + "learning_rate": 6.605224609375e-07, + "loss": 0.0033, + "reward": 1.837379813194275, + "reward_std": 0.0600747037678957, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8373798131942749, + "step": 2781 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.03125, + "epoch": 1.35888671875, + "grad_norm": 1.9146016266718324, + "kl": 0.072021484375, + "learning_rate": 6.60400390625e-07, + "loss": 0.0029, + "reward": 1.8498224020004272, + "reward_std": 0.07501747971400619, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8498223423957825, + "step": 2782 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.9375, + "epoch": 1.359375, + "grad_norm": 0.9660214761219784, + "kl": 0.066162109375, + "learning_rate": 6.602783203125e-07, + "loss": 0.0026, + "reward": 1.708676815032959, + "reward_std": 0.05009671114385128, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.708676815032959, + "step": 2783 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.0390625, + "epoch": 1.35986328125, + "grad_norm": 1.9126001884812758, + "kl": 0.06884765625, + "learning_rate": 6.6015625e-07, + "loss": 0.0027, + "reward": 1.7439513802528381, + "reward_std": 0.05504639446735382, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7439513504505157, + "step": 2784 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.140625, + "epoch": 1.3603515625, + "grad_norm": 2.6388109061222647, + "kl": 0.0625, + "learning_rate": 6.600341796874999e-07, + "loss": 0.0025, + "reward": 1.8127487897872925, + "reward_std": 0.05138452537357807, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8127487897872925, + "step": 2785 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.34375, + "epoch": 1.36083984375, + "grad_norm": 1.9324545434757587, + "kl": 0.0615234375, + "learning_rate": 6.599121093749999e-07, + "loss": 0.0025, + "reward": 1.8103123307228088, + "reward_std": 0.08414742723107338, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8103123307228088, + "step": 2786 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.15625, + "epoch": 1.361328125, + "grad_norm": 1.710010117700723, + "kl": 0.07861328125, + "learning_rate": 6.597900390625e-07, + "loss": 0.0031, + "reward": 1.8647686839103699, + "reward_std": 0.07051170617341995, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8647686839103699, + "step": 2787 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.65625, + "epoch": 1.36181640625, + "grad_norm": 3.5098494042536115, + "kl": 0.09814453125, + "learning_rate": 6.5966796875e-07, + "loss": 0.0039, + "reward": 1.811535358428955, + "reward_std": 0.17406302690505981, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8115352988243103, + "step": 2788 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.03125, + "epoch": 1.3623046875, + "grad_norm": 4.488729872719589, + "kl": 0.054931640625, + "learning_rate": 6.595458984375e-07, + "loss": 0.0022, + "reward": 1.823096752166748, + "reward_std": 0.06310966797173023, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8230966925621033, + "step": 2789 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.9453125, + "epoch": 1.36279296875, + "grad_norm": 2.859947402634798, + "kl": 0.0673828125, + "learning_rate": 6.59423828125e-07, + "loss": 0.0027, + "reward": 1.7696452736854553, + "reward_std": 0.09921448305249214, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7696452140808105, + "step": 2790 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.6640625, + "epoch": 1.36328125, + "grad_norm": 1.8490800827770573, + "kl": 0.08642578125, + "learning_rate": 6.593017578124999e-07, + "loss": 0.0035, + "reward": 1.77943754196167, + "reward_std": 0.06473535671830177, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7794375717639923, + "step": 2791 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.546875, + "epoch": 1.36376953125, + "grad_norm": 1.1874902147313509, + "kl": 0.0633544921875, + "learning_rate": 6.591796874999999e-07, + "loss": 0.0025, + "reward": 1.8052760362625122, + "reward_std": 0.05710322968661785, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8052760064601898, + "step": 2792 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.5078125, + "epoch": 1.3642578125, + "grad_norm": 1.3877294374512281, + "kl": 0.0810546875, + "learning_rate": 6.590576171875e-07, + "loss": 0.0032, + "reward": 1.7053526639938354, + "reward_std": 0.044744652695953846, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7053526639938354, + "step": 2793 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.2890625, + "epoch": 1.36474609375, + "grad_norm": 1.7614738212994774, + "kl": 0.091064453125, + "learning_rate": 6.58935546875e-07, + "loss": 0.0037, + "reward": 1.8614672422409058, + "reward_std": 0.06470566987991333, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8692797422409058, + "step": 2794 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.9921875, + "epoch": 1.365234375, + "grad_norm": 0.8216449129468091, + "kl": 0.0640869140625, + "learning_rate": 6.588134765625e-07, + "loss": 0.0026, + "reward": 1.7944404482841492, + "reward_std": 0.07953635044395924, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8022529482841492, + "step": 2795 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.2890625, + "epoch": 1.36572265625, + "grad_norm": 3.7784315403086732, + "kl": 0.0648193359375, + "learning_rate": 6.5869140625e-07, + "loss": 0.0026, + "reward": 1.7856959700584412, + "reward_std": 0.06272900477051735, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7856959700584412, + "step": 2796 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.5625, + "epoch": 1.3662109375, + "grad_norm": 1.9366371199429389, + "kl": 0.0728759765625, + "learning_rate": 6.585693359375e-07, + "loss": 0.0029, + "reward": 1.7936596274375916, + "reward_std": 0.07175188139081001, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7936596572399139, + "step": 2797 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.078125, + "epoch": 1.36669921875, + "grad_norm": 1.3702102849644753, + "kl": 0.063720703125, + "learning_rate": 6.584472656249999e-07, + "loss": 0.0025, + "reward": 1.7357134819030762, + "reward_std": 0.09683592431247234, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7435259819030762, + "step": 2798 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.40625, + "epoch": 1.3671875, + "grad_norm": 0.7898266031129043, + "kl": 0.082275390625, + "learning_rate": 6.583251953124999e-07, + "loss": 0.0033, + "reward": 1.7490665912628174, + "reward_std": 0.09386800974607468, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7568791508674622, + "step": 2799 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.4296875, + "epoch": 1.36767578125, + "grad_norm": 1.1152710272393713, + "kl": 0.06396484375, + "learning_rate": 6.58203125e-07, + "loss": 0.0026, + "reward": 1.8169459700584412, + "reward_std": 0.05791633389890194, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8169459104537964, + "step": 2800 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.734375, + "epoch": 1.3681640625, + "grad_norm": 4.796604127693978, + "kl": 0.0958251953125, + "learning_rate": 6.580810546875e-07, + "loss": 0.0038, + "reward": 1.720919132232666, + "reward_std": 0.08066519349813461, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7756066620349884, + "step": 2801 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.234375, + "epoch": 1.36865234375, + "grad_norm": 0.7665830601343561, + "kl": 0.065673828125, + "learning_rate": 6.57958984375e-07, + "loss": 0.0026, + "reward": 1.7669459581375122, + "reward_std": 0.052986389957368374, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7747583985328674, + "step": 2802 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.2109375, + "epoch": 1.369140625, + "grad_norm": 0.8869108480679029, + "kl": 0.052490234375, + "learning_rate": 6.578369140625e-07, + "loss": 0.0021, + "reward": 1.7913293838500977, + "reward_std": 0.057568637654185295, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7913293838500977, + "step": 2803 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.9140625, + "epoch": 1.36962890625, + "grad_norm": 1.1017743459095004, + "kl": 0.079833984375, + "learning_rate": 6.577148437499999e-07, + "loss": 0.0032, + "reward": 1.762194275856018, + "reward_std": 0.027863549068570137, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7621943652629852, + "step": 2804 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.3671875, + "epoch": 1.3701171875, + "grad_norm": 5.067708962892902, + "kl": 0.064697265625, + "learning_rate": 6.575927734374999e-07, + "loss": 0.0026, + "reward": 1.6529717445373535, + "reward_std": 0.07605472579598427, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6529717445373535, + "step": 2805 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.3203125, + "epoch": 1.37060546875, + "grad_norm": 3.5790933037952426, + "kl": 0.07470703125, + "learning_rate": 6.57470703125e-07, + "loss": 0.003, + "reward": 1.6717053651809692, + "reward_std": 0.06556748226284981, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6717053353786469, + "step": 2806 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.28125, + "epoch": 1.37109375, + "grad_norm": 1.1882639853118668, + "kl": 0.0606689453125, + "learning_rate": 6.573486328125e-07, + "loss": 0.0024, + "reward": 1.7812290787696838, + "reward_std": 0.11782369762659073, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8046665787696838, + "step": 2807 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.7734375, + "epoch": 1.37158203125, + "grad_norm": 1.0058167484415166, + "kl": 0.0628662109375, + "learning_rate": 6.572265625e-07, + "loss": 0.0025, + "reward": 1.658070147037506, + "reward_std": 0.16661040857434273, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7049451470375061, + "step": 2808 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.3359375, + "epoch": 1.3720703125, + "grad_norm": 1.5691798635044372, + "kl": 0.077392578125, + "learning_rate": 6.571044921875e-07, + "loss": 0.0031, + "reward": 1.8431367874145508, + "reward_std": 0.015472855884581804, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.843136727809906, + "step": 2809 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.0546875, + "epoch": 1.37255859375, + "grad_norm": 30.022616938561168, + "kl": 0.063232421875, + "learning_rate": 6.56982421875e-07, + "loss": 0.0025, + "reward": 1.5678275227546692, + "reward_std": 0.1511671245098114, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5912650525569916, + "step": 2810 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.734375, + "epoch": 1.373046875, + "grad_norm": 2.0753106666854597, + "kl": 0.070068359375, + "learning_rate": 6.568603515624999e-07, + "loss": 0.0028, + "reward": 1.705108404159546, + "reward_std": 0.14512356370687485, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7207334041595459, + "step": 2811 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.1796875, + "epoch": 1.37353515625, + "grad_norm": 1.3679898475411463, + "kl": 0.052001953125, + "learning_rate": 6.567382812499999e-07, + "loss": 0.0021, + "reward": 1.8090946078300476, + "reward_std": 0.037844820879399776, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8090946674346924, + "step": 2812 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.859375, + "epoch": 1.3740234375, + "grad_norm": 1.9064678334594833, + "kl": 0.0635986328125, + "learning_rate": 6.566162109375e-07, + "loss": 0.0025, + "reward": 1.765863299369812, + "reward_std": 0.07344381138682365, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.765863299369812, + "step": 2813 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.28125, + "epoch": 1.37451171875, + "grad_norm": 0.7913906152817536, + "kl": 0.0477294921875, + "learning_rate": 6.56494140625e-07, + "loss": 0.0019, + "reward": 1.7781551480293274, + "reward_std": 0.05877980962395668, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7859676480293274, + "step": 2814 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.9375, + "epoch": 1.375, + "grad_norm": 3.040552646693815, + "kl": 0.0626220703125, + "learning_rate": 6.563720703125e-07, + "loss": 0.0025, + "reward": 1.8006147146224976, + "reward_std": 0.11222148686647415, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.80842724442482, + "step": 2815 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.09375, + "epoch": 1.37548828125, + "grad_norm": 1.1902464862626296, + "kl": 0.07080078125, + "learning_rate": 6.5625e-07, + "loss": 0.0028, + "reward": 1.7808747291564941, + "reward_std": 0.06534177996218204, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7808747291564941, + "step": 2816 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.5390625, + "epoch": 1.3759765625, + "grad_norm": 1.1270478089250884, + "kl": 0.0849609375, + "learning_rate": 6.561279296875e-07, + "loss": 0.0034, + "reward": 1.6131686568260193, + "reward_std": 0.13650833070278168, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.6678561568260193, + "step": 2817 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.578125, + "epoch": 1.37646484375, + "grad_norm": 2.463194086778472, + "kl": 0.083251953125, + "learning_rate": 6.560058593749999e-07, + "loss": 0.0033, + "reward": 1.743131935596466, + "reward_std": 0.13736629113554955, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7743819355964661, + "step": 2818 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.8203125, + "epoch": 1.376953125, + "grad_norm": 1.655877833888544, + "kl": 0.061767578125, + "learning_rate": 6.558837890625e-07, + "loss": 0.0025, + "reward": 1.8583369255065918, + "reward_std": 0.06905535236001015, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8583369851112366, + "step": 2819 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.0703125, + "epoch": 1.37744140625, + "grad_norm": 1.1615671365734472, + "kl": 0.064208984375, + "learning_rate": 6.5576171875e-07, + "loss": 0.0026, + "reward": 1.8210389018058777, + "reward_std": 0.10824690014123917, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8366638422012329, + "step": 2820 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.8515625, + "epoch": 1.3779296875, + "grad_norm": 1.9462189546737534, + "kl": 0.072265625, + "learning_rate": 6.556396484375e-07, + "loss": 0.0029, + "reward": 1.7309507727622986, + "reward_std": 0.12317908834666014, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7543882727622986, + "step": 2821 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.6796875, + "epoch": 1.37841796875, + "grad_norm": 0.7250821619105872, + "kl": 0.0653076171875, + "learning_rate": 6.55517578125e-07, + "loss": 0.0026, + "reward": 1.7630040049552917, + "reward_std": 0.028386560268700123, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7630040049552917, + "step": 2822 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.078125, + "epoch": 1.37890625, + "grad_norm": 1.5496349062279748, + "kl": 0.06787109375, + "learning_rate": 6.553955078125e-07, + "loss": 0.0027, + "reward": 1.7218654155731201, + "reward_std": 0.14879543986171484, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7374904155731201, + "step": 2823 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.3515625, + "epoch": 1.37939453125, + "grad_norm": 0.8887945048990987, + "kl": 0.067626953125, + "learning_rate": 6.552734374999999e-07, + "loss": 0.0027, + "reward": 1.754398226737976, + "reward_std": 0.029574115527793765, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7543981671333313, + "step": 2824 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.0, + "epoch": 1.3798828125, + "grad_norm": 1.0309054013771692, + "kl": 0.0648193359375, + "learning_rate": 6.551513671874999e-07, + "loss": 0.0026, + "reward": 1.9179275035858154, + "reward_std": 0.013918052427470684, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9179275631904602, + "step": 2825 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.8125, + "epoch": 1.38037109375, + "grad_norm": 1.6179425907866827, + "kl": 0.062744140625, + "learning_rate": 6.55029296875e-07, + "loss": 0.0025, + "reward": 1.829226016998291, + "reward_std": 0.09412107616662979, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8292260468006134, + "step": 2826 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.25, + "epoch": 1.380859375, + "grad_norm": 1.5403818911864977, + "kl": 0.0736083984375, + "learning_rate": 6.549072265625e-07, + "loss": 0.0029, + "reward": 1.7663615942001343, + "reward_std": 0.02866467647254467, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7663616240024567, + "step": 2827 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.265625, + "epoch": 1.38134765625, + "grad_norm": 0.8033462865951467, + "kl": 0.07373046875, + "learning_rate": 6.5478515625e-07, + "loss": 0.003, + "reward": 1.855335772037506, + "reward_std": 0.044215379282832146, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8553357422351837, + "step": 2828 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.796875, + "epoch": 1.3818359375, + "grad_norm": 2.522746992374405, + "kl": 0.060791015625, + "learning_rate": 6.546630859375e-07, + "loss": 0.0024, + "reward": 1.647928237915039, + "reward_std": 0.13220302015542984, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6713657677173615, + "step": 2829 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.5703125, + "epoch": 1.38232421875, + "grad_norm": 1.7436794453218056, + "kl": 0.0604248046875, + "learning_rate": 6.54541015625e-07, + "loss": 0.0024, + "reward": 1.6440320014953613, + "reward_std": 0.09271154180169106, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6596570014953613, + "step": 2830 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.453125, + "epoch": 1.3828125, + "grad_norm": 0.4931724331120224, + "kl": 0.063232421875, + "learning_rate": 6.544189453124999e-07, + "loss": 0.0025, + "reward": 1.8260149955749512, + "reward_std": 0.024739277781918645, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8260150253772736, + "step": 2831 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.4765625, + "epoch": 1.38330078125, + "grad_norm": 1.093941327614133, + "kl": 0.06396484375, + "learning_rate": 6.54296875e-07, + "loss": 0.0026, + "reward": 1.7941319942474365, + "reward_std": 0.017439838498830795, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7941320240497589, + "step": 2832 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.9296875, + "epoch": 1.3837890625, + "grad_norm": 1.0585896307141636, + "kl": 0.060791015625, + "learning_rate": 6.541748046875e-07, + "loss": 0.0024, + "reward": 1.8979597091674805, + "reward_std": 0.05379013530910015, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8979597091674805, + "step": 2833 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.5390625, + "epoch": 1.38427734375, + "grad_norm": 2.579223044774424, + "kl": 0.079345703125, + "learning_rate": 6.54052734375e-07, + "loss": 0.0032, + "reward": 1.6481378078460693, + "reward_std": 0.10650475323200226, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6559503078460693, + "step": 2834 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.546875, + "epoch": 1.384765625, + "grad_norm": 2.5957275431327025, + "kl": 0.074951171875, + "learning_rate": 6.539306640625e-07, + "loss": 0.003, + "reward": 1.7491188645362854, + "reward_std": 0.04664234071969986, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7491189241409302, + "step": 2835 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.0078125, + "epoch": 1.38525390625, + "grad_norm": 1.4332709494413776, + "kl": 0.08251953125, + "learning_rate": 6.5380859375e-07, + "loss": 0.0033, + "reward": 1.7521470189094543, + "reward_std": 0.07698429748415947, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7521470487117767, + "step": 2836 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3359375, + "epoch": 1.3857421875, + "grad_norm": 0.8961647450408224, + "kl": 0.101806640625, + "learning_rate": 6.536865234374999e-07, + "loss": 0.0041, + "reward": 1.7263582348823547, + "reward_std": 0.05472889542579651, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7263582050800323, + "step": 2837 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.578125, + "epoch": 1.38623046875, + "grad_norm": 1.417539744141072, + "kl": 0.077392578125, + "learning_rate": 6.535644531249999e-07, + "loss": 0.0031, + "reward": 1.645662248134613, + "reward_std": 0.10042403638362885, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6534747779369354, + "step": 2838 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.140625, + "epoch": 1.38671875, + "grad_norm": 6.057330374770775, + "kl": 0.14404296875, + "learning_rate": 6.534423828125e-07, + "loss": 0.0058, + "reward": 1.7092814445495605, + "reward_std": 0.02909145038574934, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7092815637588501, + "step": 2839 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.078125, + "epoch": 1.38720703125, + "grad_norm": 0.9907276256447438, + "kl": 0.06494140625, + "learning_rate": 6.533203125e-07, + "loss": 0.0026, + "reward": 1.674263060092926, + "reward_std": 0.17288543283939362, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7133256196975708, + "step": 2840 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.8125, + "epoch": 1.3876953125, + "grad_norm": 72.38821291532156, + "kl": 0.57373046875, + "learning_rate": 6.531982421875e-07, + "loss": 0.0229, + "reward": 1.7409939765930176, + "reward_std": 0.14576169103384018, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7566189765930176, + "step": 2841 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.6328125, + "epoch": 1.38818359375, + "grad_norm": 0.7516084081038863, + "kl": 0.07373046875, + "learning_rate": 6.53076171875e-07, + "loss": 0.003, + "reward": 1.7337377667427063, + "reward_std": 0.05978046730160713, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7337378263473511, + "step": 2842 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.734375, + "epoch": 1.388671875, + "grad_norm": 1.1741854698445926, + "kl": 0.0694580078125, + "learning_rate": 6.529541015625e-07, + "loss": 0.0028, + "reward": 1.6884747743606567, + "reward_std": 0.12016388587653637, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7040997445583344, + "step": 2843 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.5078125, + "epoch": 1.38916015625, + "grad_norm": 1.7849395697497092, + "kl": 0.0703125, + "learning_rate": 6.528320312499999e-07, + "loss": 0.0028, + "reward": 1.7630398273468018, + "reward_std": 0.0514018889516592, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7630398273468018, + "step": 2844 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.734375, + "epoch": 1.3896484375, + "grad_norm": 1.2904172737765798, + "kl": 0.065673828125, + "learning_rate": 6.527099609375e-07, + "loss": 0.0026, + "reward": 1.7109894752502441, + "reward_std": 0.037571437656879425, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7109894752502441, + "step": 2845 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.0859375, + "epoch": 1.39013671875, + "grad_norm": 0.9983955556960216, + "kl": 0.073974609375, + "learning_rate": 6.52587890625e-07, + "loss": 0.003, + "reward": 1.7116398215293884, + "reward_std": 0.061076716519892216, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.711639791727066, + "step": 2846 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.71875, + "epoch": 1.390625, + "grad_norm": 0.7920028351802623, + "kl": 0.068603515625, + "learning_rate": 6.524658203125e-07, + "loss": 0.0027, + "reward": 1.78658789396286, + "reward_std": 0.06295670091640204, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7944003939628601, + "step": 2847 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.828125, + "epoch": 1.39111328125, + "grad_norm": 2.1506316903194094, + "kl": 0.06982421875, + "learning_rate": 6.5234375e-07, + "loss": 0.0028, + "reward": 1.8838631510734558, + "reward_std": 0.06648493744432926, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8916757106781006, + "step": 2848 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.578125, + "epoch": 1.3916015625, + "grad_norm": 1.1112381345863096, + "kl": 0.07421875, + "learning_rate": 6.522216796875e-07, + "loss": 0.003, + "reward": 1.712832510471344, + "reward_std": 0.07860787212848663, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7128325700759888, + "step": 2849 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3828125, + "epoch": 1.39208984375, + "grad_norm": 1.609890133849202, + "kl": 0.0626220703125, + "learning_rate": 6.520996093749999e-07, + "loss": 0.0025, + "reward": 1.802548348903656, + "reward_std": 0.039428723976016045, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8025482892990112, + "step": 2850 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.8828125, + "epoch": 1.392578125, + "grad_norm": 2.1181979161526345, + "kl": 0.09521484375, + "learning_rate": 6.519775390624999e-07, + "loss": 0.0038, + "reward": 1.7357767820358276, + "reward_std": 0.08965800702571869, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7357767522335052, + "step": 2851 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.421875, + "epoch": 1.39306640625, + "grad_norm": 6.007115954333245, + "kl": 0.0819091796875, + "learning_rate": 6.5185546875e-07, + "loss": 0.0033, + "reward": 1.7287788391113281, + "reward_std": 0.08958043158054352, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7287788093090057, + "step": 2852 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3125, + "epoch": 1.3935546875, + "grad_norm": 1.2649144166568242, + "kl": 0.075927734375, + "learning_rate": 6.517333984375e-07, + "loss": 0.003, + "reward": 1.6118924021720886, + "reward_std": 0.1063384860754013, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6197049021720886, + "step": 2853 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.6875, + "epoch": 1.39404296875, + "grad_norm": 4.371721447909433, + "kl": 0.08251953125, + "learning_rate": 6.51611328125e-07, + "loss": 0.0033, + "reward": 1.5534948110580444, + "reward_std": 0.04652980901300907, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5534948408603668, + "step": 2854 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.296875, + "epoch": 1.39453125, + "grad_norm": 1.1589218702703599, + "kl": 0.076904296875, + "learning_rate": 6.514892578125e-07, + "loss": 0.0031, + "reward": 1.6414119601249695, + "reward_std": 0.059677837416529655, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6492244899272919, + "step": 2855 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.328125, + "epoch": 1.39501953125, + "grad_norm": 2.7183716635470994, + "kl": 0.077392578125, + "learning_rate": 6.513671875e-07, + "loss": 0.0031, + "reward": 1.732553243637085, + "reward_std": 0.04065544903278351, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7325531840324402, + "step": 2856 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.0078125, + "epoch": 1.3955078125, + "grad_norm": 2.0094522160396378, + "kl": 0.0869140625, + "learning_rate": 6.512451171874999e-07, + "loss": 0.0035, + "reward": 1.7317038774490356, + "reward_std": 0.04309108108282089, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7317038774490356, + "step": 2857 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.1015625, + "epoch": 1.39599609375, + "grad_norm": 1.7850835774009906, + "kl": 0.0718994140625, + "learning_rate": 6.511230468749999e-07, + "loss": 0.0029, + "reward": 1.751394808292389, + "reward_std": 0.09872918948531151, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7513948082923889, + "step": 2858 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.53125, + "epoch": 1.396484375, + "grad_norm": 2.3667754366875067, + "kl": 0.080810546875, + "learning_rate": 6.510009765625e-07, + "loss": 0.0032, + "reward": 1.749050259590149, + "reward_std": 0.057237736880779266, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7490502893924713, + "step": 2859 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.71875, + "epoch": 1.39697265625, + "grad_norm": 4.5044551506426656, + "kl": 0.0618896484375, + "learning_rate": 6.5087890625e-07, + "loss": 0.0025, + "reward": 1.816435694694519, + "reward_std": 0.05485322326421738, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.816435694694519, + "step": 2860 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.5625, + "epoch": 1.3974609375, + "grad_norm": 0.9423463055806306, + "kl": 0.073486328125, + "learning_rate": 6.507568359375e-07, + "loss": 0.0029, + "reward": 1.7192687392234802, + "reward_std": 0.057240571826696396, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7192686796188354, + "step": 2861 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.0546875, + "epoch": 1.39794921875, + "grad_norm": 4.006446009955896, + "kl": 0.070068359375, + "learning_rate": 6.50634765625e-07, + "loss": 0.0028, + "reward": 1.7493478059768677, + "reward_std": 0.08323598839342594, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7493478059768677, + "step": 2862 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.28125, + "epoch": 1.3984375, + "grad_norm": 1.1324756629593455, + "kl": 0.075439453125, + "learning_rate": 6.505126953124999e-07, + "loss": 0.003, + "reward": 1.8583284616470337, + "reward_std": 0.01956217922270298, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8583284616470337, + "step": 2863 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.046875, + "epoch": 1.39892578125, + "grad_norm": 1.583271479867273, + "kl": 0.0682373046875, + "learning_rate": 6.503906249999999e-07, + "loss": 0.0027, + "reward": 1.8412460088729858, + "reward_std": 0.11462399363517761, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8490585088729858, + "step": 2864 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.421875, + "epoch": 1.3994140625, + "grad_norm": 1.1615901347183661, + "kl": 0.0711669921875, + "learning_rate": 6.502685546875e-07, + "loss": 0.0028, + "reward": 1.7544441223144531, + "reward_std": 0.05260493792593479, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7544440627098083, + "step": 2865 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.6875, + "epoch": 1.39990234375, + "grad_norm": 7.6165646200529284, + "kl": 0.0791015625, + "learning_rate": 6.50146484375e-07, + "loss": 0.0032, + "reward": 1.7561290264129639, + "reward_std": 0.09663645923137665, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7717540264129639, + "step": 2866 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.28125, + "epoch": 1.400390625, + "grad_norm": 1.1725742243241242, + "kl": 0.069091796875, + "learning_rate": 6.500244140625e-07, + "loss": 0.0028, + "reward": 1.6970765590667725, + "reward_std": 0.13552076928317547, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7048889398574829, + "step": 2867 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.578125, + "epoch": 1.40087890625, + "grad_norm": 3.094101668652219, + "kl": 0.0693359375, + "learning_rate": 6.4990234375e-07, + "loss": 0.0028, + "reward": 1.7442750334739685, + "reward_std": 0.05703293904662132, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7442750930786133, + "step": 2868 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.859375, + "epoch": 1.4013671875, + "grad_norm": 3.527416892215106, + "kl": 0.0927734375, + "learning_rate": 6.497802734375e-07, + "loss": 0.0037, + "reward": 1.658632516860962, + "reward_std": 0.05164991691708565, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6586325764656067, + "step": 2869 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.890625, + "epoch": 1.40185546875, + "grad_norm": 8.518640897558415, + "kl": 0.09228515625, + "learning_rate": 6.496582031249999e-07, + "loss": 0.0037, + "reward": 1.7752405405044556, + "reward_std": 0.04082014970481396, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7752405405044556, + "step": 2870 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.3046875, + "epoch": 1.40234375, + "grad_norm": 1.2735396040272664, + "kl": 0.0635986328125, + "learning_rate": 6.495361328124999e-07, + "loss": 0.0025, + "reward": 1.8235573172569275, + "reward_std": 0.04111157916486263, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8235573768615723, + "step": 2871 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.390625, + "epoch": 1.40283203125, + "grad_norm": 1.91252386313284, + "kl": 0.07470703125, + "learning_rate": 6.494140625e-07, + "loss": 0.003, + "reward": 1.7103378772735596, + "reward_std": 0.06228804960846901, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7103378474712372, + "step": 2872 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.828125, + "epoch": 1.4033203125, + "grad_norm": 1.1497181496044657, + "kl": 0.088134765625, + "learning_rate": 6.492919921875e-07, + "loss": 0.0035, + "reward": 1.75039541721344, + "reward_std": 0.04632897302508354, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7503954172134399, + "step": 2873 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.125, + "epoch": 1.40380859375, + "grad_norm": 2.0387706691630783, + "kl": 0.068115234375, + "learning_rate": 6.49169921875e-07, + "loss": 0.0027, + "reward": 1.799646258354187, + "reward_std": 0.09669975563883781, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.807458758354187, + "step": 2874 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.7890625, + "epoch": 1.404296875, + "grad_norm": 1.3450734188530216, + "kl": 0.0771484375, + "learning_rate": 6.490478515625e-07, + "loss": 0.0031, + "reward": 1.7853235602378845, + "reward_std": 0.11038247868418694, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7931360602378845, + "step": 2875 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.9609375, + "epoch": 1.40478515625, + "grad_norm": 2.8171006789126753, + "kl": 0.0574951171875, + "learning_rate": 6.489257812499999e-07, + "loss": 0.0023, + "reward": 1.7852590084075928, + "reward_std": 0.10786662250757217, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7930714786052704, + "step": 2876 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.9375, + "epoch": 1.4052734375, + "grad_norm": 7.972532719423657, + "kl": 0.0927734375, + "learning_rate": 6.488037109374999e-07, + "loss": 0.0037, + "reward": 1.6059449911117554, + "reward_std": 0.0608881339430809, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.605944961309433, + "step": 2877 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.3203125, + "epoch": 1.40576171875, + "grad_norm": 1.0381545239399501, + "kl": 0.062255859375, + "learning_rate": 6.48681640625e-07, + "loss": 0.0025, + "reward": 1.7436492443084717, + "reward_std": 0.10550978034734726, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7514617443084717, + "step": 2878 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.75, + "epoch": 1.40625, + "grad_norm": 0.9257238624924248, + "kl": 0.0810546875, + "learning_rate": 6.485595703125e-07, + "loss": 0.0032, + "reward": 1.7538402080535889, + "reward_std": 0.08971455320715904, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7616527676582336, + "step": 2879 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.1796875, + "epoch": 1.40673828125, + "grad_norm": 2.4068440590607576, + "kl": 0.083740234375, + "learning_rate": 6.484375e-07, + "loss": 0.0034, + "reward": 1.7696812748908997, + "reward_std": 0.05204281397163868, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7696812748908997, + "step": 2880 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.0703125, + "epoch": 1.4072265625, + "grad_norm": 3.6657345328091027, + "kl": 0.0927734375, + "learning_rate": 6.483154296875e-07, + "loss": 0.0037, + "reward": 1.7494273781776428, + "reward_std": 0.12914881110191345, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7650523781776428, + "step": 2881 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.15625, + "epoch": 1.40771484375, + "grad_norm": 1.3100826271644412, + "kl": 0.09375, + "learning_rate": 6.48193359375e-07, + "loss": 0.0038, + "reward": 1.7687904238700867, + "reward_std": 0.03958193212747574, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7687904834747314, + "step": 2882 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.21875, + "epoch": 1.408203125, + "grad_norm": 2.850476912346545, + "kl": 0.0791015625, + "learning_rate": 6.480712890624999e-07, + "loss": 0.0032, + "reward": 1.718224585056305, + "reward_std": 0.07007079944014549, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7182245850563049, + "step": 2883 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.0703125, + "epoch": 1.40869140625, + "grad_norm": 2.224248079259704, + "kl": 0.070556640625, + "learning_rate": 6.479492187499999e-07, + "loss": 0.0028, + "reward": 1.81133633852005, + "reward_std": 0.07499665580689907, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.81133633852005, + "step": 2884 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.9296875, + "epoch": 1.4091796875, + "grad_norm": 1.6913926305711484, + "kl": 0.0560302734375, + "learning_rate": 6.478271484375e-07, + "loss": 0.0022, + "reward": 1.7844181060791016, + "reward_std": 0.1209041029214859, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7844181060791016, + "step": 2885 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.0625, + "epoch": 1.40966796875, + "grad_norm": 1.0647796466049049, + "kl": 0.084228515625, + "learning_rate": 6.47705078125e-07, + "loss": 0.0034, + "reward": 1.8228704333305359, + "reward_std": 0.03462422825396061, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8228704631328583, + "step": 2886 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.7578125, + "epoch": 1.41015625, + "grad_norm": 1.4054822535501044, + "kl": 0.059326171875, + "learning_rate": 6.475830078125e-07, + "loss": 0.0024, + "reward": 1.7760130167007446, + "reward_std": 0.0543476827442646, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.776013046503067, + "step": 2887 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.34375, + "epoch": 1.41064453125, + "grad_norm": 6.2835453355319535, + "kl": 0.057373046875, + "learning_rate": 6.474609375e-07, + "loss": 0.0023, + "reward": 1.8040868043899536, + "reward_std": 0.046066829934716225, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8040868043899536, + "step": 2888 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.96875, + "epoch": 1.4111328125, + "grad_norm": 2.407813465668005, + "kl": 0.069580078125, + "learning_rate": 6.473388671874999e-07, + "loss": 0.0028, + "reward": 1.6971461772918701, + "reward_std": 0.08162091299891472, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6971462368965149, + "step": 2889 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.9921875, + "epoch": 1.41162109375, + "grad_norm": 6.777877992014395, + "kl": 0.068359375, + "learning_rate": 6.472167968749999e-07, + "loss": 0.0027, + "reward": 1.7807137966156006, + "reward_std": 0.07970836386084557, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7807137668132782, + "step": 2890 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.4453125, + "epoch": 1.412109375, + "grad_norm": 5.8315057276286675, + "kl": 0.046142578125, + "learning_rate": 6.470947265625e-07, + "loss": 0.0018, + "reward": 1.8434737920761108, + "reward_std": 0.06651721894741058, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8512862920761108, + "step": 2891 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.265625, + "epoch": 1.41259765625, + "grad_norm": 1.3705583029994997, + "kl": 0.0595703125, + "learning_rate": 6.4697265625e-07, + "loss": 0.0024, + "reward": 1.6831304430961609, + "reward_std": 0.22392578423023224, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7378179430961609, + "step": 2892 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.6796875, + "epoch": 1.4130859375, + "grad_norm": 0.8488028443872007, + "kl": 0.066162109375, + "learning_rate": 6.468505859375e-07, + "loss": 0.0026, + "reward": 1.738794982433319, + "reward_std": 0.13992030546069145, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7700450122356415, + "step": 2893 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.5625, + "epoch": 1.41357421875, + "grad_norm": 1.069830600645475, + "kl": 0.080322265625, + "learning_rate": 6.46728515625e-07, + "loss": 0.0032, + "reward": 1.4707675576210022, + "reward_std": 0.07445183768868446, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.5254550278186798, + "step": 2894 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.984375, + "epoch": 1.4140625, + "grad_norm": 1.2750408423135347, + "kl": 0.060791015625, + "learning_rate": 6.466064453125e-07, + "loss": 0.0024, + "reward": 1.770975410938263, + "reward_std": 0.11516737192869186, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7866004109382629, + "step": 2895 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.359375, + "epoch": 1.41455078125, + "grad_norm": 2.2794674346865134, + "kl": 0.064697265625, + "learning_rate": 6.464843749999999e-07, + "loss": 0.0026, + "reward": 1.7922708988189697, + "reward_std": 0.1690206415951252, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8235209882259369, + "step": 2896 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.75, + "epoch": 1.4150390625, + "grad_norm": 1.0310963228503536, + "kl": 0.068603515625, + "learning_rate": 6.463623046874999e-07, + "loss": 0.0027, + "reward": 1.8120849132537842, + "reward_std": 0.06387075781822205, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8198973536491394, + "step": 2897 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.8046875, + "epoch": 1.41552734375, + "grad_norm": 0.941272249249113, + "kl": 0.0628662109375, + "learning_rate": 6.46240234375e-07, + "loss": 0.0025, + "reward": 1.6103965044021606, + "reward_std": 0.13843106850981712, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6416464447975159, + "step": 2898 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.3359375, + "epoch": 1.416015625, + "grad_norm": 2.4628580027257514, + "kl": 0.0589599609375, + "learning_rate": 6.461181640625e-07, + "loss": 0.0024, + "reward": 1.822964370250702, + "reward_std": 0.13363437354564667, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8464018404483795, + "step": 2899 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.5703125, + "epoch": 1.41650390625, + "grad_norm": 3.35939246403186, + "kl": 0.07763671875, + "learning_rate": 6.4599609375e-07, + "loss": 0.0031, + "reward": 1.8861233592033386, + "reward_std": 0.17267528176307678, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8939358592033386, + "step": 2900 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.5, + "epoch": 1.4169921875, + "grad_norm": 1.678376818469578, + "kl": 0.07861328125, + "learning_rate": 6.458740234375e-07, + "loss": 0.0031, + "reward": 1.904150128364563, + "reward_std": 0.03639446757733822, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9041501879692078, + "step": 2901 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.0625, + "epoch": 1.41748046875, + "grad_norm": 1.8009959754813598, + "kl": 0.07958984375, + "learning_rate": 6.457519531249999e-07, + "loss": 0.0032, + "reward": 1.738788664340973, + "reward_std": 0.04321512393653393, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7387886047363281, + "step": 2902 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.2578125, + "epoch": 1.41796875, + "grad_norm": 1.751332386306868, + "kl": 0.072998046875, + "learning_rate": 6.456298828124999e-07, + "loss": 0.0029, + "reward": 1.8467352390289307, + "reward_std": 0.05003441125154495, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8467352986335754, + "step": 2903 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.796875, + "epoch": 1.41845703125, + "grad_norm": 0.8841298672283215, + "kl": 0.081298828125, + "learning_rate": 6.455078125e-07, + "loss": 0.0033, + "reward": 1.6881967186927795, + "reward_std": 0.09820759668946266, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7038217782974243, + "step": 2904 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.7265625, + "epoch": 1.4189453125, + "grad_norm": 2.2187376677871553, + "kl": 0.083984375, + "learning_rate": 6.453857421875e-07, + "loss": 0.0034, + "reward": 1.7173711061477661, + "reward_std": 0.11199202761054039, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7251836061477661, + "step": 2905 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.765625, + "epoch": 1.41943359375, + "grad_norm": 1.677314384639223, + "kl": 0.0687255859375, + "learning_rate": 6.45263671875e-07, + "loss": 0.0028, + "reward": 1.7222880125045776, + "reward_std": 0.08582095801830292, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7222879230976105, + "step": 2906 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.2578125, + "epoch": 1.419921875, + "grad_norm": 1.5234578995591637, + "kl": 0.0589599609375, + "learning_rate": 6.451416015625e-07, + "loss": 0.0024, + "reward": 1.786740779876709, + "reward_std": 0.1870395466685295, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.8179908096790314, + "step": 2907 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.8046875, + "epoch": 1.42041015625, + "grad_norm": 3.9967357862014166, + "kl": 0.069091796875, + "learning_rate": 6.4501953125e-07, + "loss": 0.0028, + "reward": 1.7316583395004272, + "reward_std": 0.0972440093755722, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7394708096981049, + "step": 2908 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.2890625, + "epoch": 1.4208984375, + "grad_norm": 0.9583192864484285, + "kl": 0.056884765625, + "learning_rate": 6.448974609374999e-07, + "loss": 0.0023, + "reward": 1.7740533947944641, + "reward_std": 0.06026652827858925, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7740534543991089, + "step": 2909 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.1484375, + "epoch": 1.42138671875, + "grad_norm": 2.6735176863175707, + "kl": 0.0640869140625, + "learning_rate": 6.447753906249999e-07, + "loss": 0.0026, + "reward": 1.8210537433624268, + "reward_std": 0.10449858009815216, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8288662135601044, + "step": 2910 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.5859375, + "epoch": 1.421875, + "grad_norm": 5.569660343086, + "kl": 0.05224609375, + "learning_rate": 6.446533203125e-07, + "loss": 0.0021, + "reward": 1.8247524499893188, + "reward_std": 0.0780985876917839, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8325649201869965, + "step": 2911 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.1953125, + "epoch": 1.42236328125, + "grad_norm": 9.353451383129727, + "kl": 0.068115234375, + "learning_rate": 6.4453125e-07, + "loss": 0.0027, + "reward": 1.7248165011405945, + "reward_std": 0.09427638724446297, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7248164713382721, + "step": 2912 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.3828125, + "epoch": 1.4228515625, + "grad_norm": 0.8980177691930714, + "kl": 0.06640625, + "learning_rate": 6.444091796875e-07, + "loss": 0.0027, + "reward": 1.817960262298584, + "reward_std": 0.02281077764928341, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8179602324962616, + "step": 2913 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.5859375, + "epoch": 1.42333984375, + "grad_norm": 1.6111400955213642, + "kl": 0.08740234375, + "learning_rate": 6.44287109375e-07, + "loss": 0.0035, + "reward": 1.7714014053344727, + "reward_std": 0.04745063558220863, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7714014053344727, + "step": 2914 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.0, + "epoch": 1.423828125, + "grad_norm": 0.9803662051320426, + "kl": 0.081298828125, + "learning_rate": 6.441650390625e-07, + "loss": 0.0032, + "reward": 1.8443069458007812, + "reward_std": 0.044496684800833464, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8443069458007812, + "step": 2915 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.640625, + "epoch": 1.42431640625, + "grad_norm": 1.0830923513904844, + "kl": 0.063232421875, + "learning_rate": 6.440429687499999e-07, + "loss": 0.0025, + "reward": 1.739950180053711, + "reward_std": 0.09381197765469551, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7477626502513885, + "step": 2916 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.0703125, + "epoch": 1.4248046875, + "grad_norm": 1.7415926590595752, + "kl": 0.07080078125, + "learning_rate": 6.439208984375e-07, + "loss": 0.0028, + "reward": 1.8552255630493164, + "reward_std": 0.04862390458583832, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8552254736423492, + "step": 2917 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.3359375, + "epoch": 1.42529296875, + "grad_norm": 2.190010550447508, + "kl": 0.0662841796875, + "learning_rate": 6.43798828125e-07, + "loss": 0.0026, + "reward": 1.6964465975761414, + "reward_std": 0.062107209116220474, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6964466571807861, + "step": 2918 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.234375, + "epoch": 1.42578125, + "grad_norm": 2.9783937153275017, + "kl": 0.069091796875, + "learning_rate": 6.436767578125e-07, + "loss": 0.0028, + "reward": 1.6416288614273071, + "reward_std": 0.06529787369072437, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6494413912296295, + "step": 2919 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.375, + "epoch": 1.42626953125, + "grad_norm": 1.6599884910259533, + "kl": 0.059326171875, + "learning_rate": 6.435546875e-07, + "loss": 0.0024, + "reward": 1.7820017337799072, + "reward_std": 0.06994332000613213, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7820016741752625, + "step": 2920 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.5390625, + "epoch": 1.4267578125, + "grad_norm": 0.839929372194412, + "kl": 0.0582275390625, + "learning_rate": 6.434326171875e-07, + "loss": 0.0023, + "reward": 1.812927007675171, + "reward_std": 0.028376199770718813, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8129269778728485, + "step": 2921 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.4921875, + "epoch": 1.42724609375, + "grad_norm": 1.3813349416908838, + "kl": 0.07763671875, + "learning_rate": 6.433105468749999e-07, + "loss": 0.0031, + "reward": 1.560662865638733, + "reward_std": 0.062494926154613495, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5606628656387329, + "step": 2922 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.6015625, + "epoch": 1.427734375, + "grad_norm": 1.9195445310350872, + "kl": 0.064453125, + "learning_rate": 6.431884765624999e-07, + "loss": 0.0026, + "reward": 1.7902184128761292, + "reward_std": 0.07021540775895119, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7902184724807739, + "step": 2923 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.5546875, + "epoch": 1.42822265625, + "grad_norm": 1.9159652352107033, + "kl": 0.1015625, + "learning_rate": 6.4306640625e-07, + "loss": 0.0041, + "reward": 1.8406153321266174, + "reward_std": 0.09906695038080215, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8406153321266174, + "step": 2924 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.8046875, + "epoch": 1.4287109375, + "grad_norm": 2.002785748157656, + "kl": 0.06640625, + "learning_rate": 6.429443359375e-07, + "loss": 0.0027, + "reward": 1.7776061296463013, + "reward_std": 0.05872194468975067, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7776060402393341, + "step": 2925 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.3828125, + "epoch": 1.42919921875, + "grad_norm": 1.101408367276945, + "kl": 0.0751953125, + "learning_rate": 6.42822265625e-07, + "loss": 0.003, + "reward": 1.9210602045059204, + "reward_std": 0.04052995890378952, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9210601449012756, + "step": 2926 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.109375, + "epoch": 1.4296875, + "grad_norm": 25.134180738645007, + "kl": 0.06689453125, + "learning_rate": 6.427001953125e-07, + "loss": 0.0027, + "reward": 1.780125081539154, + "reward_std": 0.042955007404088974, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7801250517368317, + "step": 2927 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.625, + "epoch": 1.43017578125, + "grad_norm": 1.3853361093593575, + "kl": 0.0748291015625, + "learning_rate": 6.42578125e-07, + "loss": 0.003, + "reward": 1.8148647546768188, + "reward_std": 0.05056310258805752, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8148646950721741, + "step": 2928 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.484375, + "epoch": 1.4306640625, + "grad_norm": 1.4127563383132502, + "kl": 0.0538330078125, + "learning_rate": 6.424560546874999e-07, + "loss": 0.0022, + "reward": 1.80906081199646, + "reward_std": 0.05354410037398338, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8090607523918152, + "step": 2929 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.8671875, + "epoch": 1.43115234375, + "grad_norm": 0.9102013320719257, + "kl": 0.0670166015625, + "learning_rate": 6.42333984375e-07, + "loss": 0.0027, + "reward": 1.8222399950027466, + "reward_std": 0.07436484284698963, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.822240024805069, + "step": 2930 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.015625, + "epoch": 1.431640625, + "grad_norm": 4.725685144329046, + "kl": 0.083740234375, + "learning_rate": 6.422119140625e-07, + "loss": 0.0033, + "reward": 1.7311421036720276, + "reward_std": 0.11611544340848923, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7389545738697052, + "step": 2931 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.328125, + "epoch": 1.43212890625, + "grad_norm": 1.5563255627352082, + "kl": 0.0810546875, + "learning_rate": 6.4208984375e-07, + "loss": 0.0032, + "reward": 1.770340383052826, + "reward_std": 0.07487385906279087, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7781528830528259, + "step": 2932 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.03125, + "epoch": 1.4326171875, + "grad_norm": 1.9586762224922618, + "kl": 0.082763671875, + "learning_rate": 6.419677734375e-07, + "loss": 0.0033, + "reward": 1.614130437374115, + "reward_std": 0.025444690138101578, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.614130437374115, + "step": 2933 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.5390625, + "epoch": 1.43310546875, + "grad_norm": 1.4419175695980417, + "kl": 0.080810546875, + "learning_rate": 6.41845703125e-07, + "loss": 0.0032, + "reward": 1.8455346822738647, + "reward_std": 0.20514215901494026, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8611597120761871, + "step": 2934 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.515625, + "epoch": 1.43359375, + "grad_norm": 1.993045779944748, + "kl": 0.0830078125, + "learning_rate": 6.417236328124999e-07, + "loss": 0.0033, + "reward": 1.7336124181747437, + "reward_std": 0.10690167173743248, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7492374181747437, + "step": 2935 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.9609375, + "epoch": 1.43408203125, + "grad_norm": 1.283473748968174, + "kl": 0.095458984375, + "learning_rate": 6.416015624999999e-07, + "loss": 0.0038, + "reward": 1.7129297852516174, + "reward_std": 0.0318203317001462, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7129298150539398, + "step": 2936 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.3046875, + "epoch": 1.4345703125, + "grad_norm": 1.1453748810817357, + "kl": 0.0654296875, + "learning_rate": 6.414794921875e-07, + "loss": 0.0026, + "reward": 1.6736072897911072, + "reward_std": 0.08154010493308306, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6736072897911072, + "step": 2937 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.25, + "epoch": 1.43505859375, + "grad_norm": 2.3905058136733777, + "kl": 0.0712890625, + "learning_rate": 6.41357421875e-07, + "loss": 0.0028, + "reward": 1.7600257396697998, + "reward_std": 0.10162025317549706, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7756507098674774, + "step": 2938 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.0234375, + "epoch": 1.435546875, + "grad_norm": 0.9852440812866968, + "kl": 0.0693359375, + "learning_rate": 6.412353515625e-07, + "loss": 0.0028, + "reward": 1.7784386277198792, + "reward_std": 0.09795338660478592, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8018760681152344, + "step": 2939 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.390625, + "epoch": 1.43603515625, + "grad_norm": 5.526089374651416, + "kl": 0.0697021484375, + "learning_rate": 6.4111328125e-07, + "loss": 0.0028, + "reward": 1.8081418871879578, + "reward_std": 0.05436134152114391, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.808141827583313, + "step": 2940 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.7734375, + "epoch": 1.4365234375, + "grad_norm": 1.4939649863180227, + "kl": 0.0582275390625, + "learning_rate": 6.409912109375e-07, + "loss": 0.0023, + "reward": 1.9053468704223633, + "reward_std": 0.05440020468086004, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9053468704223633, + "step": 2941 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.109375, + "epoch": 1.43701171875, + "grad_norm": 1.8121729150206805, + "kl": 0.074951171875, + "learning_rate": 6.408691406249999e-07, + "loss": 0.003, + "reward": 1.762609839439392, + "reward_std": 0.025970693212002516, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7626098394393921, + "step": 2942 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.5859375, + "epoch": 1.4375, + "grad_norm": 0.9192342581628401, + "kl": 0.0701904296875, + "learning_rate": 6.407470703125e-07, + "loss": 0.0028, + "reward": 1.6859044432640076, + "reward_std": 0.040243260096758604, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6859044134616852, + "step": 2943 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.3671875, + "epoch": 1.43798828125, + "grad_norm": 2.475395126244144, + "kl": 0.08544921875, + "learning_rate": 6.40625e-07, + "loss": 0.0034, + "reward": 1.848134458065033, + "reward_std": 0.0467034000903368, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.848134458065033, + "step": 2944 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.3828125, + "epoch": 1.4384765625, + "grad_norm": 3.6098224066349096, + "kl": 0.075439453125, + "learning_rate": 6.405029296875e-07, + "loss": 0.003, + "reward": 1.812850534915924, + "reward_std": 0.031669266521930695, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8128505349159241, + "step": 2945 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.2109375, + "epoch": 1.43896484375, + "grad_norm": 2.222659762962882, + "kl": 0.07421875, + "learning_rate": 6.40380859375e-07, + "loss": 0.003, + "reward": 1.7910358309745789, + "reward_std": 0.05065750889480114, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7910358011722565, + "step": 2946 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.6484375, + "epoch": 1.439453125, + "grad_norm": 2.208671012693913, + "kl": 0.10302734375, + "learning_rate": 6.402587890625e-07, + "loss": 0.0041, + "reward": 1.6763520240783691, + "reward_std": 0.058261996135115623, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6763519942760468, + "step": 2947 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.3515625, + "epoch": 1.43994140625, + "grad_norm": 1.5352653822354414, + "kl": 0.063232421875, + "learning_rate": 6.401367187499999e-07, + "loss": 0.0025, + "reward": 1.755962073802948, + "reward_std": 0.03862538933753967, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7559620141983032, + "step": 2948 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.7265625, + "epoch": 1.4404296875, + "grad_norm": 0.698355831200674, + "kl": 0.06982421875, + "learning_rate": 6.400146484374999e-07, + "loss": 0.0028, + "reward": 1.7664831280708313, + "reward_std": 0.020398199558258057, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7664831578731537, + "step": 2949 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.8359375, + "epoch": 1.44091796875, + "grad_norm": 2.578363130813962, + "kl": 0.080810546875, + "learning_rate": 6.39892578125e-07, + "loss": 0.0032, + "reward": 1.7696714997291565, + "reward_std": 0.051741763949394226, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7696714997291565, + "step": 2950 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.1875, + "epoch": 1.44140625, + "grad_norm": 6.744614815262741, + "kl": 0.070068359375, + "learning_rate": 6.397705078125e-07, + "loss": 0.0028, + "reward": 1.787192463874817, + "reward_std": 0.030791327357292175, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7871924936771393, + "step": 2951 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.90625, + "epoch": 1.44189453125, + "grad_norm": 0.853256922168278, + "kl": 0.0648193359375, + "learning_rate": 6.396484375e-07, + "loss": 0.0026, + "reward": 1.7069947719573975, + "reward_std": 0.10049226693809032, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7226196825504303, + "step": 2952 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.875, + "epoch": 1.4423828125, + "grad_norm": 1.7414803959752738, + "kl": 0.0633544921875, + "learning_rate": 6.395263671875e-07, + "loss": 0.0025, + "reward": 1.7784687280654907, + "reward_std": 0.03222686983644962, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7784686982631683, + "step": 2953 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.796875, + "epoch": 1.44287109375, + "grad_norm": 0.7824543197588109, + "kl": 0.065185546875, + "learning_rate": 6.39404296875e-07, + "loss": 0.0026, + "reward": 1.835627555847168, + "reward_std": 0.07133413106203079, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.835627555847168, + "step": 2954 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.2421875, + "epoch": 1.443359375, + "grad_norm": 1.005077756695292, + "kl": 0.0574951171875, + "learning_rate": 6.392822265624999e-07, + "loss": 0.0023, + "reward": 1.907008171081543, + "reward_std": 0.12066750600934029, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.9148207008838654, + "step": 2955 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.796875, + "epoch": 1.44384765625, + "grad_norm": 1.6630553458065824, + "kl": 0.05712890625, + "learning_rate": 6.391601562499999e-07, + "loss": 0.0023, + "reward": 1.6454344391822815, + "reward_std": 0.130395095795393, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6923094987869263, + "step": 2956 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.1875, + "epoch": 1.4443359375, + "grad_norm": 1.6919739972247043, + "kl": 0.080078125, + "learning_rate": 6.390380859375e-07, + "loss": 0.0032, + "reward": 1.7030593156814575, + "reward_std": 0.11804335564374924, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7343093156814575, + "step": 2957 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.5, + "epoch": 1.44482421875, + "grad_norm": 3.0488067627667186, + "kl": 0.0574951171875, + "learning_rate": 6.38916015625e-07, + "loss": 0.0023, + "reward": 1.7935433387756348, + "reward_std": 0.11342027597129345, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8091683387756348, + "step": 2958 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.875, + "epoch": 1.4453125, + "grad_norm": 3.417146010971531, + "kl": 0.08154296875, + "learning_rate": 6.387939453125e-07, + "loss": 0.0033, + "reward": 1.7204725742340088, + "reward_std": 0.10613266006112099, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7282850742340088, + "step": 2959 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.0078125, + "epoch": 1.44580078125, + "grad_norm": 7.369282649351467, + "kl": 0.06591796875, + "learning_rate": 6.38671875e-07, + "loss": 0.0026, + "reward": 1.7681906819343567, + "reward_std": 0.02635895786806941, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7681907117366791, + "step": 2960 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.953125, + "epoch": 1.4462890625, + "grad_norm": 2.0658434347932264, + "kl": 0.0645751953125, + "learning_rate": 6.385498046874999e-07, + "loss": 0.0026, + "reward": 1.817894995212555, + "reward_std": 0.07758408039808273, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8178950250148773, + "step": 2961 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.9765625, + "epoch": 1.44677734375, + "grad_norm": 0.6569603767694976, + "kl": 0.0533447265625, + "learning_rate": 6.384277343749999e-07, + "loss": 0.0021, + "reward": 1.8447397351264954, + "reward_std": 0.06235711555927992, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8525522649288177, + "step": 2962 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.3125, + "epoch": 1.447265625, + "grad_norm": 2.645560581989775, + "kl": 0.072265625, + "learning_rate": 6.383056640625e-07, + "loss": 0.0029, + "reward": 1.687853217124939, + "reward_std": 0.0805647261440754, + "rewards/format_reward": 0.9296875, + "rewards/ocr_reward": 0.758165717124939, + "step": 2963 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.5859375, + "epoch": 1.44775390625, + "grad_norm": 1.3650167038051417, + "kl": 0.0523681640625, + "learning_rate": 6.3818359375e-07, + "loss": 0.0021, + "reward": 1.7391371130943298, + "reward_std": 0.15477406233549118, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7860120832920074, + "step": 2964 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.6328125, + "epoch": 1.4482421875, + "grad_norm": 1.1245251287679778, + "kl": 0.072998046875, + "learning_rate": 6.380615234375e-07, + "loss": 0.0029, + "reward": 1.646054744720459, + "reward_std": 0.06316448841243982, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6538672149181366, + "step": 2965 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.5546875, + "epoch": 1.44873046875, + "grad_norm": 13.043523585623863, + "kl": 0.0791015625, + "learning_rate": 6.37939453125e-07, + "loss": 0.0032, + "reward": 1.7939326763153076, + "reward_std": 0.06745261326432228, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8017451465129852, + "step": 2966 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.359375, + "epoch": 1.44921875, + "grad_norm": 1.57791062011593, + "kl": 0.060302734375, + "learning_rate": 6.378173828125e-07, + "loss": 0.0024, + "reward": 1.8393926620483398, + "reward_std": 0.018147557973861694, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8393926918506622, + "step": 2967 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.3359375, + "epoch": 1.44970703125, + "grad_norm": 1.8175153858800717, + "kl": 0.06005859375, + "learning_rate": 6.376953124999999e-07, + "loss": 0.0024, + "reward": 1.7290653586387634, + "reward_std": 0.12614280730485916, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7446902990341187, + "step": 2968 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.9296875, + "epoch": 1.4501953125, + "grad_norm": 9.465224240232326, + "kl": 0.0694580078125, + "learning_rate": 6.375732421874999e-07, + "loss": 0.0028, + "reward": 1.7350887060165405, + "reward_std": 0.11684410274028778, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7429012060165405, + "step": 2969 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.9765625, + "epoch": 1.45068359375, + "grad_norm": 2.083144051979017, + "kl": 0.061279296875, + "learning_rate": 6.37451171875e-07, + "loss": 0.0024, + "reward": 1.7892062067985535, + "reward_std": 0.12466869875788689, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8048312067985535, + "step": 2970 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.1796875, + "epoch": 1.451171875, + "grad_norm": 3.731969056408829, + "kl": 0.06591796875, + "learning_rate": 6.373291015625e-07, + "loss": 0.0026, + "reward": 1.8324419260025024, + "reward_std": 0.07397226989269257, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8324419260025024, + "step": 2971 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.8515625, + "epoch": 1.45166015625, + "grad_norm": 2.1941261882702006, + "kl": 0.0714111328125, + "learning_rate": 6.3720703125e-07, + "loss": 0.0029, + "reward": 1.7780184149742126, + "reward_std": 0.20118620991706848, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8014559149742126, + "step": 2972 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.21875, + "epoch": 1.4521484375, + "grad_norm": 1.3453507107528788, + "kl": 0.070556640625, + "learning_rate": 6.370849609375e-07, + "loss": 0.0028, + "reward": 1.6615075469017029, + "reward_std": 0.01416647876612842, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6615075469017029, + "step": 2973 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.203125, + "epoch": 1.45263671875, + "grad_norm": 0.941837624396483, + "kl": 0.071533203125, + "learning_rate": 6.369628906249999e-07, + "loss": 0.0029, + "reward": 1.7576044797897339, + "reward_std": 0.030522312968969345, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7576044797897339, + "step": 2974 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.1953125, + "epoch": 1.453125, + "grad_norm": 2.6111080609725876, + "kl": 0.063720703125, + "learning_rate": 6.368408203124999e-07, + "loss": 0.0025, + "reward": 1.748826265335083, + "reward_std": 0.040242042392492294, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.748826265335083, + "step": 2975 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.75, + "epoch": 1.45361328125, + "grad_norm": 9.758378083342457, + "kl": 0.08203125, + "learning_rate": 6.3671875e-07, + "loss": 0.0033, + "reward": 1.7325801849365234, + "reward_std": 0.1269008917734027, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7560176849365234, + "step": 2976 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.34375, + "epoch": 1.4541015625, + "grad_norm": 1.4790068606433127, + "kl": 0.05712890625, + "learning_rate": 6.365966796875e-07, + "loss": 0.0023, + "reward": 1.7784000039100647, + "reward_std": 0.11495145037770271, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7862125039100647, + "step": 2977 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.5234375, + "epoch": 1.45458984375, + "grad_norm": 7.445604414954171, + "kl": 0.090087890625, + "learning_rate": 6.36474609375e-07, + "loss": 0.0036, + "reward": 1.7594309449195862, + "reward_std": 0.05727781727910042, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7594309449195862, + "step": 2978 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.8984375, + "epoch": 1.455078125, + "grad_norm": 1.9969872662674215, + "kl": 0.0635986328125, + "learning_rate": 6.363525390625e-07, + "loss": 0.0025, + "reward": 1.8546399474143982, + "reward_std": 0.03894917480647564, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8546398878097534, + "step": 2979 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.3984375, + "epoch": 1.45556640625, + "grad_norm": 2.774830461991803, + "kl": 0.086669921875, + "learning_rate": 6.3623046875e-07, + "loss": 0.0035, + "reward": 1.7985565066337585, + "reward_std": 0.09453297778964043, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8063690066337585, + "step": 2980 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.734375, + "epoch": 1.4560546875, + "grad_norm": 1.005235350260796, + "kl": 0.067626953125, + "learning_rate": 6.361083984374999e-07, + "loss": 0.0027, + "reward": 1.8525811433792114, + "reward_std": 0.032127720303833485, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.852581113576889, + "step": 2981 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.15625, + "epoch": 1.45654296875, + "grad_norm": 1.480660541121175, + "kl": 0.06591796875, + "learning_rate": 6.359863281249999e-07, + "loss": 0.0026, + "reward": 1.8015734553337097, + "reward_std": 0.10697927977889776, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8093858957290649, + "step": 2982 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.6484375, + "epoch": 1.45703125, + "grad_norm": 1.28090728544506, + "kl": 0.090087890625, + "learning_rate": 6.358642578125e-07, + "loss": 0.0036, + "reward": 1.7251802682876587, + "reward_std": 0.02976925577968359, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7251803278923035, + "step": 2983 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.25, + "epoch": 1.45751953125, + "grad_norm": 1.5160356660704382, + "kl": 0.0509033203125, + "learning_rate": 6.357421875e-07, + "loss": 0.002, + "reward": 1.7424423694610596, + "reward_std": 0.10370543040335178, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7502549290657043, + "step": 2984 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.1875, + "epoch": 1.4580078125, + "grad_norm": 1.2745109824108936, + "kl": 0.06298828125, + "learning_rate": 6.356201171875e-07, + "loss": 0.0025, + "reward": 1.5927727818489075, + "reward_std": 0.12405483797192574, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6318353116512299, + "step": 2985 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.640625, + "epoch": 1.45849609375, + "grad_norm": 2.169138878887581, + "kl": 0.0712890625, + "learning_rate": 6.35498046875e-07, + "loss": 0.0029, + "reward": 1.860952913761139, + "reward_std": 0.06300730584189296, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8609528839588165, + "step": 2986 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.421875, + "epoch": 1.458984375, + "grad_norm": 1.6032586983368817, + "kl": 0.074462890625, + "learning_rate": 6.353759765624999e-07, + "loss": 0.003, + "reward": 1.6849730610847473, + "reward_std": 0.04362546745687723, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6849730312824249, + "step": 2987 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.3125, + "epoch": 1.45947265625, + "grad_norm": 2.296346852512009, + "kl": 0.0687255859375, + "learning_rate": 6.352539062499999e-07, + "loss": 0.0028, + "reward": 1.8462890982627869, + "reward_std": 0.11591282114386559, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8541015088558197, + "step": 2988 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.3125, + "epoch": 1.4599609375, + "grad_norm": 4.792249158952244, + "kl": 0.067626953125, + "learning_rate": 6.351318359375e-07, + "loss": 0.0027, + "reward": 1.6084554195404053, + "reward_std": 0.08818965405225754, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6162679195404053, + "step": 2989 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.078125, + "epoch": 1.46044921875, + "grad_norm": 0.4828418747479682, + "kl": 0.05810546875, + "learning_rate": 6.35009765625e-07, + "loss": 0.0023, + "reward": 1.6002402305603027, + "reward_std": 0.12684646097477525, + "rewards/format_reward": 0.9375, + "rewards/ocr_reward": 0.6627402305603027, + "step": 2990 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.5078125, + "epoch": 1.4609375, + "grad_norm": 1.515503324615265, + "kl": 0.0631103515625, + "learning_rate": 6.348876953125e-07, + "loss": 0.0025, + "reward": 1.8527971506118774, + "reward_std": 0.07764232903718948, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.852797120809555, + "step": 2991 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.03125, + "epoch": 1.46142578125, + "grad_norm": 1.2235439862774495, + "kl": 0.0574951171875, + "learning_rate": 6.34765625e-07, + "loss": 0.0023, + "reward": 1.7806990146636963, + "reward_std": 0.05695920065045357, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7885115146636963, + "step": 2992 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.25, + "epoch": 1.4619140625, + "grad_norm": 3.314536987656733, + "kl": 0.0587158203125, + "learning_rate": 6.346435546875e-07, + "loss": 0.0023, + "reward": 1.8154310584068298, + "reward_std": 0.12618440762162209, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8310559988021851, + "step": 2993 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.0390625, + "epoch": 1.46240234375, + "grad_norm": 1.2908463477638215, + "kl": 0.0565185546875, + "learning_rate": 6.345214843749999e-07, + "loss": 0.0023, + "reward": 1.7146747708320618, + "reward_std": 0.07501043565571308, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7224873006343842, + "step": 2994 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.8984375, + "epoch": 1.462890625, + "grad_norm": 2.9125890422786016, + "kl": 0.072021484375, + "learning_rate": 6.343994140624999e-07, + "loss": 0.0029, + "reward": 1.6301099061965942, + "reward_std": 0.13861995935440063, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6535473763942719, + "step": 2995 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.2734375, + "epoch": 1.46337890625, + "grad_norm": 0.8886604709538318, + "kl": 0.05078125, + "learning_rate": 6.3427734375e-07, + "loss": 0.002, + "reward": 1.7734524011611938, + "reward_std": 0.020457894541323185, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7734524309635162, + "step": 2996 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.0546875, + "epoch": 1.4638671875, + "grad_norm": 1.0464030461396734, + "kl": 0.0601806640625, + "learning_rate": 6.341552734375e-07, + "loss": 0.0024, + "reward": 1.654776692390442, + "reward_std": 0.08548066765069962, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6704016923904419, + "step": 2997 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.515625, + "epoch": 1.46435546875, + "grad_norm": 9.606833597314674, + "kl": 0.059326171875, + "learning_rate": 6.34033203125e-07, + "loss": 0.0024, + "reward": 1.8002928495407104, + "reward_std": 0.06611186265945435, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8002929091453552, + "step": 2998 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.0546875, + "epoch": 1.46484375, + "grad_norm": 1.0930533697553297, + "kl": 0.056640625, + "learning_rate": 6.339111328125e-07, + "loss": 0.0023, + "reward": 1.7426326274871826, + "reward_std": 0.0601738141849637, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.742632657289505, + "step": 2999 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.921875, + "epoch": 1.46533203125, + "grad_norm": 0.9165980219862101, + "kl": 0.0672607421875, + "learning_rate": 6.337890625e-07, + "loss": 0.0027, + "reward": 1.829143762588501, + "reward_std": 0.051721951458603144, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.836956262588501, + "step": 3000 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.5625, + "epoch": 1.4658203125, + "grad_norm": 3.771010388411293, + "kl": 0.09716796875, + "learning_rate": 6.336669921874999e-07, + "loss": 0.0039, + "reward": 1.746791124343872, + "reward_std": 0.0833306573331356, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7546035051345825, + "step": 3001 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.3515625, + "epoch": 1.46630859375, + "grad_norm": 1.7568136419799434, + "kl": 0.079345703125, + "learning_rate": 6.33544921875e-07, + "loss": 0.0032, + "reward": 1.8903692960739136, + "reward_std": 0.08128884993493557, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8981817662715912, + "step": 3002 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.3359375, + "epoch": 1.466796875, + "grad_norm": 0.6670166215082387, + "kl": 0.0595703125, + "learning_rate": 6.334228515625e-07, + "loss": 0.0024, + "reward": 1.7217431664466858, + "reward_std": 0.052741317078471184, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7217431664466858, + "step": 3003 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.8359375, + "epoch": 1.46728515625, + "grad_norm": 0.5130515259710517, + "kl": 0.0673828125, + "learning_rate": 6.3330078125e-07, + "loss": 0.0027, + "reward": 1.747616171836853, + "reward_std": 0.09704152680933475, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7632412910461426, + "step": 3004 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.2578125, + "epoch": 1.4677734375, + "grad_norm": 1.4447424507562618, + "kl": 0.0552978515625, + "learning_rate": 6.331787109375e-07, + "loss": 0.0022, + "reward": 1.767389953136444, + "reward_std": 0.052734846249222755, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7673899531364441, + "step": 3005 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.390625, + "epoch": 1.46826171875, + "grad_norm": 2.541777603017261, + "kl": 0.07666015625, + "learning_rate": 6.33056640625e-07, + "loss": 0.0031, + "reward": 1.813210904598236, + "reward_std": 0.11053607612848282, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8210234045982361, + "step": 3006 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.0, + "epoch": 1.46875, + "grad_norm": 0.8804910586643099, + "kl": 0.0538330078125, + "learning_rate": 6.329345703124999e-07, + "loss": 0.0022, + "reward": 1.8383709192276, + "reward_std": 0.02365578804165125, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8383709490299225, + "step": 3007 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.1484375, + "epoch": 1.46923828125, + "grad_norm": 2.8323385895385247, + "kl": 0.066162109375, + "learning_rate": 6.328124999999999e-07, + "loss": 0.0026, + "reward": 1.7967005968093872, + "reward_std": 0.07634428888559341, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8045131862163544, + "step": 3008 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.4921875, + "epoch": 1.4697265625, + "grad_norm": 1.558692224561045, + "kl": 0.09521484375, + "learning_rate": 6.326904296875e-07, + "loss": 0.0038, + "reward": 1.6224290132522583, + "reward_std": 0.04988163709640503, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6224290430545807, + "step": 3009 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.375, + "epoch": 1.47021484375, + "grad_norm": 0.6934292087860076, + "kl": 0.0556640625, + "learning_rate": 6.32568359375e-07, + "loss": 0.0022, + "reward": 1.9525578022003174, + "reward_std": 0.031788173131644726, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9525578618049622, + "step": 3010 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.7109375, + "epoch": 1.470703125, + "grad_norm": 1.1566353452168543, + "kl": 0.0509033203125, + "learning_rate": 6.324462890625e-07, + "loss": 0.002, + "reward": 1.80779629945755, + "reward_std": 0.13630902767181396, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.83123379945755, + "step": 3011 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.28125, + "epoch": 1.47119140625, + "grad_norm": 1.4657246132330377, + "kl": 0.0772705078125, + "learning_rate": 6.3232421875e-07, + "loss": 0.0031, + "reward": 1.868379831314087, + "reward_std": 0.05622401461005211, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8683798313140869, + "step": 3012 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.09375, + "epoch": 1.4716796875, + "grad_norm": 0.8995301250346504, + "kl": 0.0518798828125, + "learning_rate": 6.322021484375e-07, + "loss": 0.0021, + "reward": 1.7831536531448364, + "reward_std": 0.03840099833905697, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7831536531448364, + "step": 3013 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.7578125, + "epoch": 1.47216796875, + "grad_norm": 1.064413517900183, + "kl": 0.0673828125, + "learning_rate": 6.320800781249999e-07, + "loss": 0.0027, + "reward": 1.8467872142791748, + "reward_std": 0.06419426389038563, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8545996248722076, + "step": 3014 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.8203125, + "epoch": 1.47265625, + "grad_norm": 1.2422987199959086, + "kl": 0.0726318359375, + "learning_rate": 6.319580078125e-07, + "loss": 0.0029, + "reward": 1.8063626289367676, + "reward_std": 0.14462891966104507, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8141750693321228, + "step": 3015 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.953125, + "epoch": 1.47314453125, + "grad_norm": 2.517009818019777, + "kl": 0.09033203125, + "learning_rate": 6.318359375e-07, + "loss": 0.0036, + "reward": 1.653084933757782, + "reward_std": 0.098308514803648, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6530849635601044, + "step": 3016 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.8203125, + "epoch": 1.4736328125, + "grad_norm": 2.340141995970886, + "kl": 0.0723876953125, + "learning_rate": 6.317138671875e-07, + "loss": 0.0029, + "reward": 1.7496492862701416, + "reward_std": 0.1289630625396967, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7496492564678192, + "step": 3017 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.96875, + "epoch": 1.47412109375, + "grad_norm": 1.256597280828511, + "kl": 0.054443359375, + "learning_rate": 6.31591796875e-07, + "loss": 0.0022, + "reward": 1.8092172145843506, + "reward_std": 0.0746869370341301, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8170297741889954, + "step": 3018 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.609375, + "epoch": 1.474609375, + "grad_norm": 2.4683338443131673, + "kl": 0.074951171875, + "learning_rate": 6.314697265625e-07, + "loss": 0.003, + "reward": 1.6483544707298279, + "reward_std": 0.10350741818547249, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6561670005321503, + "step": 3019 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.5390625, + "epoch": 1.47509765625, + "grad_norm": 3.6247431205126874, + "kl": 0.0791015625, + "learning_rate": 6.313476562499999e-07, + "loss": 0.0032, + "reward": 1.7551026344299316, + "reward_std": 0.11083749681711197, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.755102664232254, + "step": 3020 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.40625, + "epoch": 1.4755859375, + "grad_norm": 1.0766199180438065, + "kl": 0.07763671875, + "learning_rate": 6.312255859374999e-07, + "loss": 0.0031, + "reward": 1.7589967250823975, + "reward_std": 0.05489533022046089, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7589967548847198, + "step": 3021 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.65625, + "epoch": 1.47607421875, + "grad_norm": 1.4208095059281178, + "kl": 0.0693359375, + "learning_rate": 6.31103515625e-07, + "loss": 0.0028, + "reward": 1.8012661933898926, + "reward_std": 0.045568812638521194, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8012661635875702, + "step": 3022 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.28125, + "epoch": 1.4765625, + "grad_norm": 3.075879562177541, + "kl": 0.0670166015625, + "learning_rate": 6.309814453125e-07, + "loss": 0.0027, + "reward": 1.870323121547699, + "reward_std": 0.03863493725657463, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.870323121547699, + "step": 3023 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.3671875, + "epoch": 1.47705078125, + "grad_norm": 6.548224072360138, + "kl": 0.064208984375, + "learning_rate": 6.30859375e-07, + "loss": 0.0026, + "reward": 1.7646169662475586, + "reward_std": 0.07329913601279259, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7646169662475586, + "step": 3024 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.3203125, + "epoch": 1.4775390625, + "grad_norm": 3.4394236221792247, + "kl": 0.081787109375, + "learning_rate": 6.307373046875e-07, + "loss": 0.0033, + "reward": 1.6831781268119812, + "reward_std": 0.07777292281389236, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6909905970096588, + "step": 3025 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.390625, + "epoch": 1.47802734375, + "grad_norm": 1.4825730777624018, + "kl": 0.068359375, + "learning_rate": 6.30615234375e-07, + "loss": 0.0027, + "reward": 1.7502532005310059, + "reward_std": 0.055065859109163284, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7502532303333282, + "step": 3026 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.078125, + "epoch": 1.478515625, + "grad_norm": 1.5490192480578866, + "kl": 0.07470703125, + "learning_rate": 6.304931640624999e-07, + "loss": 0.003, + "reward": 1.7950791120529175, + "reward_std": 0.038502528332173824, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7950790822505951, + "step": 3027 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.9375, + "epoch": 1.47900390625, + "grad_norm": 7.667208022460056, + "kl": 0.07861328125, + "learning_rate": 6.3037109375e-07, + "loss": 0.0031, + "reward": 1.8664205074310303, + "reward_std": 0.07143169827759266, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8664205372333527, + "step": 3028 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.2421875, + "epoch": 1.4794921875, + "grad_norm": 1.562849631053882, + "kl": 0.096923828125, + "learning_rate": 6.302490234375e-07, + "loss": 0.0039, + "reward": 1.6895395517349243, + "reward_std": 0.03657793905586004, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6895396113395691, + "step": 3029 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.4609375, + "epoch": 1.47998046875, + "grad_norm": 2.072472095377072, + "kl": 0.073974609375, + "learning_rate": 6.30126953125e-07, + "loss": 0.003, + "reward": 1.6448410749435425, + "reward_std": 0.12514834105968475, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6526535749435425, + "step": 3030 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.90625, + "epoch": 1.48046875, + "grad_norm": 4.248343394614419, + "kl": 0.082763671875, + "learning_rate": 6.300048828125e-07, + "loss": 0.0033, + "reward": 1.676950991153717, + "reward_std": 0.05259130522608757, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.676950991153717, + "step": 3031 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.3515625, + "epoch": 1.48095703125, + "grad_norm": 10.016521585655443, + "kl": 0.0672607421875, + "learning_rate": 6.298828125e-07, + "loss": 0.0027, + "reward": 1.855322241783142, + "reward_std": 0.06691266316920519, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8553222417831421, + "step": 3032 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.3125, + "epoch": 1.4814453125, + "grad_norm": 1.7764278531475133, + "kl": 0.056884765625, + "learning_rate": 6.297607421874999e-07, + "loss": 0.0023, + "reward": 1.932866632938385, + "reward_std": 0.05053331330418587, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.932866632938385, + "step": 3033 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.2890625, + "epoch": 1.48193359375, + "grad_norm": 2.0240030324030367, + "kl": 0.0572509765625, + "learning_rate": 6.296386718749999e-07, + "loss": 0.0023, + "reward": 1.6182212233543396, + "reward_std": 0.2208278402686119, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.67290860414505, + "step": 3034 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.5859375, + "epoch": 1.482421875, + "grad_norm": 1.713898440746238, + "kl": 0.0733642578125, + "learning_rate": 6.295166015625e-07, + "loss": 0.0029, + "reward": 1.7418071627616882, + "reward_std": 0.10038780607283115, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7496196925640106, + "step": 3035 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.8125, + "epoch": 1.48291015625, + "grad_norm": 0.9264185253043364, + "kl": 0.075439453125, + "learning_rate": 6.2939453125e-07, + "loss": 0.003, + "reward": 1.741519808769226, + "reward_std": 0.07616345398128033, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7493323385715485, + "step": 3036 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.7890625, + "epoch": 1.4833984375, + "grad_norm": 1.4044617030039843, + "kl": 0.095947265625, + "learning_rate": 6.292724609375e-07, + "loss": 0.0038, + "reward": 1.6602322459220886, + "reward_std": 0.1420225277543068, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6758573055267334, + "step": 3037 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.3203125, + "epoch": 1.48388671875, + "grad_norm": 1.9738335423607676, + "kl": 0.071533203125, + "learning_rate": 6.29150390625e-07, + "loss": 0.0029, + "reward": 1.7275782823562622, + "reward_std": 0.08198518864810467, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7432032227516174, + "step": 3038 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.6015625, + "epoch": 1.484375, + "grad_norm": 2.0017739219506825, + "kl": 0.080322265625, + "learning_rate": 6.290283203125e-07, + "loss": 0.0032, + "reward": 1.644084870815277, + "reward_std": 0.0899181142449379, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6518973708152771, + "step": 3039 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.2421875, + "epoch": 1.48486328125, + "grad_norm": 0.883112354454319, + "kl": 0.084228515625, + "learning_rate": 6.289062499999999e-07, + "loss": 0.0034, + "reward": 1.513433575630188, + "reward_std": 0.05119518283754587, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.5134336054325104, + "step": 3040 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.3359375, + "epoch": 1.4853515625, + "grad_norm": 1.3435499384819551, + "kl": 0.0771484375, + "learning_rate": 6.287841796875e-07, + "loss": 0.0031, + "reward": 1.7283309698104858, + "reward_std": 0.10164744779467583, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7283309698104858, + "step": 3041 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.0625, + "epoch": 1.48583984375, + "grad_norm": 1.5022112167151482, + "kl": 0.0653076171875, + "learning_rate": 6.28662109375e-07, + "loss": 0.0026, + "reward": 1.7406939268112183, + "reward_std": 0.07021256536245346, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7406939268112183, + "step": 3042 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.3359375, + "epoch": 1.486328125, + "grad_norm": 2.0786694119288205, + "kl": 0.083984375, + "learning_rate": 6.285400390625e-07, + "loss": 0.0034, + "reward": 1.7357019186019897, + "reward_std": 0.0432198503986001, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7357019484043121, + "step": 3043 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.1796875, + "epoch": 1.48681640625, + "grad_norm": 1.4782807841049792, + "kl": 0.09716796875, + "learning_rate": 6.2841796875e-07, + "loss": 0.0039, + "reward": 1.7358573079109192, + "reward_std": 0.10283184796571732, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7514822483062744, + "step": 3044 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.5390625, + "epoch": 1.4873046875, + "grad_norm": 9.234695754183694, + "kl": 0.10302734375, + "learning_rate": 6.282958984375e-07, + "loss": 0.0041, + "reward": 1.6147398948669434, + "reward_std": 0.14877690002322197, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.6381773948669434, + "step": 3045 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.9609375, + "epoch": 1.48779296875, + "grad_norm": 1.043601797533081, + "kl": 0.078369140625, + "learning_rate": 6.281738281249999e-07, + "loss": 0.0031, + "reward": 1.7752271890640259, + "reward_std": 0.06411982700228691, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7752272188663483, + "step": 3046 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.59375, + "epoch": 1.48828125, + "grad_norm": 2.505802779158192, + "kl": 0.06298828125, + "learning_rate": 6.280517578124999e-07, + "loss": 0.0025, + "reward": 1.8750739693641663, + "reward_std": 0.018464698921889067, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8750739097595215, + "step": 3047 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.875, + "epoch": 1.48876953125, + "grad_norm": 1.5420008942864827, + "kl": 0.088134765625, + "learning_rate": 6.279296875e-07, + "loss": 0.0035, + "reward": 1.8027490973472595, + "reward_std": 0.07335010170936584, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8105616271495819, + "step": 3048 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.0078125, + "epoch": 1.4892578125, + "grad_norm": 1.883570258751632, + "kl": 0.08447265625, + "learning_rate": 6.278076171875e-07, + "loss": 0.0034, + "reward": 1.8453101515769958, + "reward_std": 0.052738748490810394, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8453101217746735, + "step": 3049 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.65625, + "epoch": 1.48974609375, + "grad_norm": 1.9780475385606873, + "kl": 0.091064453125, + "learning_rate": 6.27685546875e-07, + "loss": 0.0036, + "reward": 1.8935607075691223, + "reward_std": 0.05056627467274666, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8935607373714447, + "step": 3050 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.734375, + "epoch": 1.490234375, + "grad_norm": 2.9996617070545546, + "kl": 0.0687255859375, + "learning_rate": 6.275634765625e-07, + "loss": 0.0027, + "reward": 1.8360978960990906, + "reward_std": 0.054630378261208534, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.836097925901413, + "step": 3051 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.3515625, + "epoch": 1.49072265625, + "grad_norm": 2.6323826664386236, + "kl": 0.079833984375, + "learning_rate": 6.2744140625e-07, + "loss": 0.0032, + "reward": 1.749899685382843, + "reward_std": 0.034031180664896965, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7498997449874878, + "step": 3052 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.5625, + "epoch": 1.4912109375, + "grad_norm": 1.2654812608116905, + "kl": 0.0703125, + "learning_rate": 6.273193359374999e-07, + "loss": 0.0028, + "reward": 1.781448781490326, + "reward_std": 0.03715716116130352, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7814488708972931, + "step": 3053 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.8203125, + "epoch": 1.49169921875, + "grad_norm": 4.420456353697728, + "kl": 0.07373046875, + "learning_rate": 6.271972656249999e-07, + "loss": 0.0029, + "reward": 1.8104448914527893, + "reward_std": 0.09506340697407722, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8104448914527893, + "step": 3054 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.421875, + "epoch": 1.4921875, + "grad_norm": 1.6948290597573188, + "kl": 0.069091796875, + "learning_rate": 6.270751953125e-07, + "loss": 0.0028, + "reward": 1.7458880543708801, + "reward_std": 0.08343839459121227, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7537006139755249, + "step": 3055 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.2734375, + "epoch": 1.49267578125, + "grad_norm": 6.382155809604849, + "kl": 0.069091796875, + "learning_rate": 6.26953125e-07, + "loss": 0.0028, + "reward": 1.8328390717506409, + "reward_std": 0.06546132825314999, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8406516313552856, + "step": 3056 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.0, + "epoch": 1.4931640625, + "grad_norm": 1.482140538189339, + "kl": 0.080078125, + "learning_rate": 6.268310546875e-07, + "loss": 0.0032, + "reward": 1.8376395106315613, + "reward_std": 0.046169581823050976, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8376395106315613, + "step": 3057 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.1171875, + "epoch": 1.49365234375, + "grad_norm": 1.7613816221555076, + "kl": 0.0782470703125, + "learning_rate": 6.26708984375e-07, + "loss": 0.0031, + "reward": 1.6370373368263245, + "reward_std": 0.046255904249846935, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6370373964309692, + "step": 3058 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.8046875, + "epoch": 1.494140625, + "grad_norm": 2.290078497473891, + "kl": 0.065185546875, + "learning_rate": 6.265869140624999e-07, + "loss": 0.0026, + "reward": 1.7369165420532227, + "reward_std": 0.14749253168702126, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7369165420532227, + "step": 3059 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.765625, + "epoch": 1.49462890625, + "grad_norm": 0.5100306804078655, + "kl": 0.0810546875, + "learning_rate": 6.264648437499999e-07, + "loss": 0.0032, + "reward": 1.8437798023223877, + "reward_std": 0.03653890639543533, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8437797725200653, + "step": 3060 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.2109375, + "epoch": 1.4951171875, + "grad_norm": 1.6593962077313371, + "kl": 0.08984375, + "learning_rate": 6.263427734375e-07, + "loss": 0.0036, + "reward": 1.6682219505310059, + "reward_std": 0.019469616003334522, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6682219803333282, + "step": 3061 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.84375, + "epoch": 1.49560546875, + "grad_norm": 1.8396284831116319, + "kl": 0.0830078125, + "learning_rate": 6.26220703125e-07, + "loss": 0.0033, + "reward": 1.727663278579712, + "reward_std": 0.06270462274551392, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7276632189750671, + "step": 3062 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.6171875, + "epoch": 1.49609375, + "grad_norm": 1.0117046188705436, + "kl": 0.0732421875, + "learning_rate": 6.260986328125e-07, + "loss": 0.0029, + "reward": 1.7094378471374512, + "reward_std": 0.02379227802157402, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7094378173351288, + "step": 3063 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.8828125, + "epoch": 1.49658203125, + "grad_norm": 1.3497291028510259, + "kl": 0.090576171875, + "learning_rate": 6.259765625e-07, + "loss": 0.0036, + "reward": 1.7751423716545105, + "reward_std": 0.13514219038188457, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7907673716545105, + "step": 3064 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.46875, + "epoch": 1.4970703125, + "grad_norm": 3.4190881996802682, + "kl": 0.0592041015625, + "learning_rate": 6.258544921875e-07, + "loss": 0.0024, + "reward": 1.764043927192688, + "reward_std": 0.07390506565570831, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.771856427192688, + "step": 3065 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.140625, + "epoch": 1.49755859375, + "grad_norm": 2.0795235433392705, + "kl": 0.0665283203125, + "learning_rate": 6.257324218749999e-07, + "loss": 0.0027, + "reward": 1.8003657460212708, + "reward_std": 0.05936916545033455, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8003657460212708, + "step": 3066 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.78125, + "epoch": 1.498046875, + "grad_norm": 1.2096859898867558, + "kl": 0.0479736328125, + "learning_rate": 6.256103515624999e-07, + "loss": 0.0019, + "reward": 1.7172734141349792, + "reward_std": 0.028629466891288757, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7172734439373016, + "step": 3067 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.25, + "epoch": 1.49853515625, + "grad_norm": 0.837342886906685, + "kl": 0.0615234375, + "learning_rate": 6.2548828125e-07, + "loss": 0.0025, + "reward": 1.7248526215553284, + "reward_std": 0.04399119131267071, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7248526215553284, + "step": 3068 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.0234375, + "epoch": 1.4990234375, + "grad_norm": 0.8384243546554345, + "kl": 0.0643310546875, + "learning_rate": 6.253662109375e-07, + "loss": 0.0026, + "reward": 1.7257351875305176, + "reward_std": 0.09183939173817635, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7413601875305176, + "step": 3069 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.1875, + "epoch": 1.49951171875, + "grad_norm": 1.1447058461259672, + "kl": 0.074951171875, + "learning_rate": 6.25244140625e-07, + "loss": 0.003, + "reward": 1.81594717502594, + "reward_std": 0.021475983783602715, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8159471154212952, + "step": 3070 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.921875, + "epoch": 1.5, + "grad_norm": 1.7077930126684038, + "kl": 0.07080078125, + "learning_rate": 6.251220703125e-07, + "loss": 0.0028, + "reward": 1.6829584240913391, + "reward_std": 0.1759318709373474, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.7376458644866943, + "step": 3071 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.5859375, + "epoch": 1.50048828125, + "grad_norm": 4.251619392395568, + "kl": 0.079345703125, + "learning_rate": 6.249999999999999e-07, + "loss": 0.0032, + "reward": 1.7641262412071228, + "reward_std": 0.10015225410461426, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7797512412071228, + "step": 3072 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.0546875, + "epoch": 1.5009765625, + "grad_norm": 2.037037846840877, + "kl": 0.0662841796875, + "learning_rate": 6.248779296874999e-07, + "loss": 0.0027, + "reward": 1.8416993618011475, + "reward_std": 0.0264980373904109, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8416993618011475, + "step": 3073 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.3125, + "epoch": 1.50146484375, + "grad_norm": 2.4016129108930446, + "kl": 0.07373046875, + "learning_rate": 6.24755859375e-07, + "loss": 0.003, + "reward": 1.884174108505249, + "reward_std": 0.01571572571992874, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8841741383075714, + "step": 3074 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.6953125, + "epoch": 1.501953125, + "grad_norm": 14.731726849066805, + "kl": 0.0576171875, + "learning_rate": 6.246337890625e-07, + "loss": 0.0023, + "reward": 1.8040361404418945, + "reward_std": 0.1080729328095913, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8274736106395721, + "step": 3075 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.3984375, + "epoch": 1.50244140625, + "grad_norm": 2.9540282421835924, + "kl": 0.0684814453125, + "learning_rate": 6.2451171875e-07, + "loss": 0.0027, + "reward": 1.8184278011322021, + "reward_std": 0.05250486359000206, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8184277713298798, + "step": 3076 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.6875, + "epoch": 1.5029296875, + "grad_norm": 1.8338223896953099, + "kl": 0.0611572265625, + "learning_rate": 6.243896484375e-07, + "loss": 0.0024, + "reward": 1.7305577397346497, + "reward_std": 0.09738858230412006, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7618077993392944, + "step": 3077 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.2421875, + "epoch": 1.50341796875, + "grad_norm": 1.4824856231273853, + "kl": 0.07666015625, + "learning_rate": 6.24267578125e-07, + "loss": 0.0031, + "reward": 1.8530486822128296, + "reward_std": 0.06430929712951183, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8530486822128296, + "step": 3078 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.5859375, + "epoch": 1.50390625, + "grad_norm": 1.2200578050818203, + "kl": 0.06494140625, + "learning_rate": 6.241455078124999e-07, + "loss": 0.0026, + "reward": 1.869605302810669, + "reward_std": 0.053065571933984756, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8696053326129913, + "step": 3079 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.59375, + "epoch": 1.50439453125, + "grad_norm": 2.107871829584508, + "kl": 0.086669921875, + "learning_rate": 6.240234374999999e-07, + "loss": 0.0035, + "reward": 1.7725425362586975, + "reward_std": 0.08174478355795145, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7725425660610199, + "step": 3080 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.765625, + "epoch": 1.5048828125, + "grad_norm": 2.090164106693488, + "kl": 0.0560302734375, + "learning_rate": 6.239013671875e-07, + "loss": 0.0022, + "reward": 1.677848756313324, + "reward_std": 0.06150129809975624, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6778488159179688, + "step": 3081 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.78125, + "epoch": 1.50537109375, + "grad_norm": 1.9087461795639435, + "kl": 0.08447265625, + "learning_rate": 6.23779296875e-07, + "loss": 0.0034, + "reward": 1.8541353940963745, + "reward_std": 0.09079751744866371, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8541353642940521, + "step": 3082 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.2890625, + "epoch": 1.505859375, + "grad_norm": 1.3215498934075522, + "kl": 0.066162109375, + "learning_rate": 6.236572265625e-07, + "loss": 0.0026, + "reward": 1.838699460029602, + "reward_std": 0.1073538176715374, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8543243706226349, + "step": 3083 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.96875, + "epoch": 1.50634765625, + "grad_norm": 1.6207069275912624, + "kl": 0.059814453125, + "learning_rate": 6.2353515625e-07, + "loss": 0.0024, + "reward": 1.8489339351654053, + "reward_std": 0.08780923672020435, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.8958089351654053, + "step": 3084 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.8359375, + "epoch": 1.5068359375, + "grad_norm": 1.8268450463781, + "kl": 0.0859375, + "learning_rate": 6.234130859374999e-07, + "loss": 0.0034, + "reward": 1.855950951576233, + "reward_std": 0.04823304433375597, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8559509217739105, + "step": 3085 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.71875, + "epoch": 1.50732421875, + "grad_norm": 0.8367633020259986, + "kl": 0.09033203125, + "learning_rate": 6.232910156249999e-07, + "loss": 0.0036, + "reward": 1.7904430627822876, + "reward_std": 0.04001910053193569, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7904431223869324, + "step": 3086 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.59375, + "epoch": 1.5078125, + "grad_norm": 2.52179471226621, + "kl": 0.095703125, + "learning_rate": 6.231689453125e-07, + "loss": 0.0038, + "reward": 2.024749219417572, + "reward_std": 0.08385680988430977, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 1.0247493088245392, + "step": 3087 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.1171875, + "epoch": 1.50830078125, + "grad_norm": 1.0623002349275934, + "kl": 0.074951171875, + "learning_rate": 6.23046875e-07, + "loss": 0.003, + "reward": 1.8043740391731262, + "reward_std": 0.04912651889026165, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8043740093708038, + "step": 3088 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.015625, + "epoch": 1.5087890625, + "grad_norm": 2.3861784634827097, + "kl": 0.069580078125, + "learning_rate": 6.229248046875e-07, + "loss": 0.0028, + "reward": 1.7216225266456604, + "reward_std": 0.05122903361916542, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.721622496843338, + "step": 3089 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.96875, + "epoch": 1.50927734375, + "grad_norm": 9.04191328799389, + "kl": 0.0845947265625, + "learning_rate": 6.22802734375e-07, + "loss": 0.0034, + "reward": 1.7171977162361145, + "reward_std": 0.03181068133562803, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7171976864337921, + "step": 3090 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.4609375, + "epoch": 1.509765625, + "grad_norm": 1.5795526400740247, + "kl": 0.075439453125, + "learning_rate": 6.226806640625e-07, + "loss": 0.003, + "reward": 1.6946417689323425, + "reward_std": 0.08766061812639236, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7180792689323425, + "step": 3091 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.0234375, + "epoch": 1.51025390625, + "grad_norm": 1.7844017464871738, + "kl": 0.0859375, + "learning_rate": 6.225585937499999e-07, + "loss": 0.0034, + "reward": 1.8546817898750305, + "reward_std": 0.05815475434064865, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8546817898750305, + "step": 3092 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.109375, + "epoch": 1.5107421875, + "grad_norm": 0.6275503497060736, + "kl": 0.07763671875, + "learning_rate": 6.224365234374999e-07, + "loss": 0.0031, + "reward": 1.9210276007652283, + "reward_std": 0.014174860902130604, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9210276007652283, + "step": 3093 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.0234375, + "epoch": 1.51123046875, + "grad_norm": 2.8960257527388484, + "kl": 0.07421875, + "learning_rate": 6.22314453125e-07, + "loss": 0.003, + "reward": 1.6736098527908325, + "reward_std": 0.07500293478369713, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6736098527908325, + "step": 3094 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.0625, + "epoch": 1.51171875, + "grad_norm": 3.3937763447140306, + "kl": 0.106201171875, + "learning_rate": 6.221923828125e-07, + "loss": 0.0043, + "reward": 1.6761849522590637, + "reward_std": 0.15080446749925613, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6839974224567413, + "step": 3095 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.1015625, + "epoch": 1.51220703125, + "grad_norm": 0.7920607825397445, + "kl": 0.0640869140625, + "learning_rate": 6.220703125e-07, + "loss": 0.0026, + "reward": 1.7301459312438965, + "reward_std": 0.030234874226152897, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7301459610462189, + "step": 3096 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.2890625, + "epoch": 1.5126953125, + "grad_norm": 1.725648229722019, + "kl": 0.0693359375, + "learning_rate": 6.219482421875e-07, + "loss": 0.0028, + "reward": 1.7913671731948853, + "reward_std": 0.04710565786808729, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7913671433925629, + "step": 3097 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.0625, + "epoch": 1.51318359375, + "grad_norm": 5.525808172297369, + "kl": 0.062744140625, + "learning_rate": 6.21826171875e-07, + "loss": 0.0025, + "reward": 1.8240219950675964, + "reward_std": 0.05396724492311478, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.824021965265274, + "step": 3098 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.078125, + "epoch": 1.513671875, + "grad_norm": 1.0932630221981614, + "kl": 0.0712890625, + "learning_rate": 6.217041015624999e-07, + "loss": 0.0029, + "reward": 1.882387936115265, + "reward_std": 0.03053974825888872, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8823879063129425, + "step": 3099 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.8828125, + "epoch": 1.51416015625, + "grad_norm": 0.8220993732465702, + "kl": 0.081298828125, + "learning_rate": 6.2158203125e-07, + "loss": 0.0032, + "reward": 1.723827600479126, + "reward_std": 0.02734041726216674, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.723827600479126, + "step": 3100 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.734375, + "epoch": 1.5146484375, + "grad_norm": 0.7732507508307016, + "kl": 0.06884765625, + "learning_rate": 6.214599609375e-07, + "loss": 0.0028, + "reward": 1.68502938747406, + "reward_std": 0.06600722670555115, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6850293874740601, + "step": 3101 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.421875, + "epoch": 1.51513671875, + "grad_norm": 1.8060542775559811, + "kl": 0.0791015625, + "learning_rate": 6.21337890625e-07, + "loss": 0.0032, + "reward": 1.725981593132019, + "reward_std": 0.05310596153140068, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.725981593132019, + "step": 3102 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.4375, + "epoch": 1.515625, + "grad_norm": 2.287645696309139, + "kl": 0.07177734375, + "learning_rate": 6.212158203125e-07, + "loss": 0.0029, + "reward": 1.8063457012176514, + "reward_std": 0.060521697625517845, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8063457310199738, + "step": 3103 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.0703125, + "epoch": 1.51611328125, + "grad_norm": 1.7967027573346939, + "kl": 0.0618896484375, + "learning_rate": 6.2109375e-07, + "loss": 0.0025, + "reward": 1.766247570514679, + "reward_std": 0.027749599888920784, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.766247570514679, + "step": 3104 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.84375, + "epoch": 1.5166015625, + "grad_norm": 0.4646725109012026, + "kl": 0.078125, + "learning_rate": 6.209716796874999e-07, + "loss": 0.0031, + "reward": 1.7971450686454773, + "reward_std": 0.024683097377419472, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7971450686454773, + "step": 3105 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.4453125, + "epoch": 1.51708984375, + "grad_norm": 1.8283999063737095, + "kl": 0.0662841796875, + "learning_rate": 6.208496093749999e-07, + "loss": 0.0027, + "reward": 1.9062353372573853, + "reward_std": 0.07536712661385536, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9062353372573853, + "step": 3106 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.1171875, + "epoch": 1.517578125, + "grad_norm": 17.037657891691474, + "kl": 0.09130859375, + "learning_rate": 6.207275390625e-07, + "loss": 0.0037, + "reward": 1.7185717821121216, + "reward_std": 0.06076034903526306, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7185717523097992, + "step": 3107 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.8046875, + "epoch": 1.51806640625, + "grad_norm": 2.418724869062214, + "kl": 0.079833984375, + "learning_rate": 6.2060546875e-07, + "loss": 0.0032, + "reward": 1.7952438592910767, + "reward_std": 0.09894811734557152, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8030563592910767, + "step": 3108 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.8984375, + "epoch": 1.5185546875, + "grad_norm": 1.6978086850538485, + "kl": 0.072265625, + "learning_rate": 6.204833984375e-07, + "loss": 0.0029, + "reward": 1.7135571241378784, + "reward_std": 0.05942201055586338, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7135571241378784, + "step": 3109 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.5703125, + "epoch": 1.51904296875, + "grad_norm": 11.000253342045735, + "kl": 0.07666015625, + "learning_rate": 6.20361328125e-07, + "loss": 0.0031, + "reward": 1.7439785599708557, + "reward_std": 0.02391317579895258, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7439785599708557, + "step": 3110 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.25, + "epoch": 1.51953125, + "grad_norm": 2.1906764680903854, + "kl": 0.066162109375, + "learning_rate": 6.202392578125e-07, + "loss": 0.0026, + "reward": 1.8155664801597595, + "reward_std": 0.06511466577649117, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8233789801597595, + "step": 3111 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.1875, + "epoch": 1.52001953125, + "grad_norm": 3.584987719765566, + "kl": 0.0677490234375, + "learning_rate": 6.201171874999999e-07, + "loss": 0.0027, + "reward": 1.8200541734695435, + "reward_std": 0.049682820681482553, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8200541734695435, + "step": 3112 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.0859375, + "epoch": 1.5205078125, + "grad_norm": 2.018089591759681, + "kl": 0.0748291015625, + "learning_rate": 6.199951171875e-07, + "loss": 0.003, + "reward": 1.564685881137848, + "reward_std": 0.1015004925429821, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.5724983513355255, + "step": 3113 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.7890625, + "epoch": 1.52099609375, + "grad_norm": 2.3565598111007593, + "kl": 0.068603515625, + "learning_rate": 6.19873046875e-07, + "loss": 0.0027, + "reward": 1.8028762936592102, + "reward_std": 0.058065131306648254, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8028763234615326, + "step": 3114 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.546875, + "epoch": 1.521484375, + "grad_norm": 1.3845163583688693, + "kl": 0.0859375, + "learning_rate": 6.197509765625e-07, + "loss": 0.0034, + "reward": 1.8404591083526611, + "reward_std": 0.04378460347652435, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8404591083526611, + "step": 3115 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.625, + "epoch": 1.52197265625, + "grad_norm": 1.321361217819104, + "kl": 0.0712890625, + "learning_rate": 6.1962890625e-07, + "loss": 0.0029, + "reward": 1.8013597130775452, + "reward_std": 0.11758016794919968, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8247972130775452, + "step": 3116 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.46875, + "epoch": 1.5224609375, + "grad_norm": 0.9992851869601573, + "kl": 0.0771484375, + "learning_rate": 6.195068359375e-07, + "loss": 0.0031, + "reward": 1.8446565866470337, + "reward_std": 0.04555722698569298, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8446565270423889, + "step": 3117 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.0546875, + "epoch": 1.52294921875, + "grad_norm": 1.3223466253645542, + "kl": 0.0623779296875, + "learning_rate": 6.193847656249999e-07, + "loss": 0.0025, + "reward": 1.7789299488067627, + "reward_std": 0.06991294771432877, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7867424488067627, + "step": 3118 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.578125, + "epoch": 1.5234375, + "grad_norm": 2.6833880146707907, + "kl": 0.048095703125, + "learning_rate": 6.192626953124999e-07, + "loss": 0.0019, + "reward": 1.8840059041976929, + "reward_std": 0.05926818028092384, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8840057849884033, + "step": 3119 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.46875, + "epoch": 1.52392578125, + "grad_norm": 1.4510987621579094, + "kl": 0.068603515625, + "learning_rate": 6.19140625e-07, + "loss": 0.0027, + "reward": 1.7248252034187317, + "reward_std": 0.07782328687608242, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7248252332210541, + "step": 3120 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.4375, + "epoch": 1.5244140625, + "grad_norm": 1.136138045433055, + "kl": 0.06982421875, + "learning_rate": 6.190185546875e-07, + "loss": 0.0028, + "reward": 1.5833680629730225, + "reward_std": 0.046957019716501236, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.583368107676506, + "step": 3121 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.859375, + "epoch": 1.52490234375, + "grad_norm": 5.258286501764103, + "kl": 0.095703125, + "learning_rate": 6.18896484375e-07, + "loss": 0.0038, + "reward": 1.8398154973983765, + "reward_std": 0.07556849718093872, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8398154675960541, + "step": 3122 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.9921875, + "epoch": 1.525390625, + "grad_norm": 3.19613212286148, + "kl": 0.08203125, + "learning_rate": 6.187744140625e-07, + "loss": 0.0033, + "reward": 1.7610740661621094, + "reward_std": 0.08400712162256241, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7610740661621094, + "step": 3123 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.1328125, + "epoch": 1.52587890625, + "grad_norm": 1.2440046063167502, + "kl": 0.0677490234375, + "learning_rate": 6.1865234375e-07, + "loss": 0.0027, + "reward": 1.7349917888641357, + "reward_std": 0.0564101692289114, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7349918186664581, + "step": 3124 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.609375, + "epoch": 1.5263671875, + "grad_norm": 1.9358611243229036, + "kl": 0.0606689453125, + "learning_rate": 6.185302734374999e-07, + "loss": 0.0024, + "reward": 1.871264934539795, + "reward_std": 0.01118523720651865, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8712649047374725, + "step": 3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.4609375, + "epoch": 1.52685546875, + "grad_norm": 0.7636233474618398, + "kl": 0.07373046875, + "learning_rate": 6.18408203125e-07, + "loss": 0.003, + "reward": 1.7732288837432861, + "reward_std": 0.04541287012398243, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7732289135456085, + "step": 3126 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.5703125, + "epoch": 1.52734375, + "grad_norm": 2.2485695960334566, + "kl": 0.078125, + "learning_rate": 6.182861328125e-07, + "loss": 0.0031, + "reward": 1.7269670367240906, + "reward_std": 0.056932706385850906, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7269670069217682, + "step": 3127 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.875, + "epoch": 1.52783203125, + "grad_norm": 2.0938435538417135, + "kl": 0.07080078125, + "learning_rate": 6.181640625e-07, + "loss": 0.0028, + "reward": 1.8258466124534607, + "reward_std": 0.034098366275429726, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8258466720581055, + "step": 3128 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.125, + "epoch": 1.5283203125, + "grad_norm": 6.351273063845393, + "kl": 0.08837890625, + "learning_rate": 6.180419921875e-07, + "loss": 0.0035, + "reward": 1.6825706362724304, + "reward_std": 0.06605061516165733, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6825706958770752, + "step": 3129 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.8828125, + "epoch": 1.52880859375, + "grad_norm": 1.73857162773802, + "kl": 0.0908203125, + "learning_rate": 6.17919921875e-07, + "loss": 0.0036, + "reward": 1.8035194873809814, + "reward_std": 0.06293283682316542, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8035195171833038, + "step": 3130 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.90625, + "epoch": 1.529296875, + "grad_norm": 1.8707235948004883, + "kl": 0.0869140625, + "learning_rate": 6.177978515624999e-07, + "loss": 0.0035, + "reward": 1.85841304063797, + "reward_std": 0.05445600301027298, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.85841304063797, + "step": 3131 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.03125, + "epoch": 1.52978515625, + "grad_norm": 1.4148200869799006, + "kl": 0.0732421875, + "learning_rate": 6.176757812499999e-07, + "loss": 0.0029, + "reward": 1.7566508054733276, + "reward_std": 0.0620297584682703, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7566508650779724, + "step": 3132 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.8671875, + "epoch": 1.5302734375, + "grad_norm": 3.895550568287067, + "kl": 0.0673828125, + "learning_rate": 6.175537109375e-07, + "loss": 0.0027, + "reward": 1.735254943370819, + "reward_std": 0.08439107239246368, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7352549433708191, + "step": 3133 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.5546875, + "epoch": 1.53076171875, + "grad_norm": 1.1219004153321612, + "kl": 0.0908203125, + "learning_rate": 6.17431640625e-07, + "loss": 0.0036, + "reward": 1.7691398859024048, + "reward_std": 0.03892973717302084, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7691399157047272, + "step": 3134 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.359375, + "epoch": 1.53125, + "grad_norm": 5.821411252101163, + "kl": 0.08349609375, + "learning_rate": 6.173095703125e-07, + "loss": 0.0033, + "reward": 1.6651095747947693, + "reward_std": 0.07258575409650803, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6651095747947693, + "step": 3135 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.3046875, + "epoch": 1.53173828125, + "grad_norm": 1.82841994581108, + "kl": 0.0751953125, + "learning_rate": 6.171875e-07, + "loss": 0.003, + "reward": 1.8692357540130615, + "reward_std": 0.0736299641430378, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8692357242107391, + "step": 3136 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.78125, + "epoch": 1.5322265625, + "grad_norm": 2.0403074353548045, + "kl": 0.08203125, + "learning_rate": 6.170654296875e-07, + "loss": 0.0033, + "reward": 1.61654931306839, + "reward_std": 0.022080027498304844, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6165493130683899, + "step": 3137 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.546875, + "epoch": 1.53271484375, + "grad_norm": 18.76627322206088, + "kl": 0.08447265625, + "learning_rate": 6.169433593749999e-07, + "loss": 0.0034, + "reward": 1.7105411887168884, + "reward_std": 0.09866257756948471, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7105412185192108, + "step": 3138 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.53125, + "epoch": 1.533203125, + "grad_norm": 2.305963685532532, + "kl": 0.058837890625, + "learning_rate": 6.168212890625e-07, + "loss": 0.0024, + "reward": 1.7301769852638245, + "reward_std": 0.13028892129659653, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7458019852638245, + "step": 3139 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.3203125, + "epoch": 1.53369140625, + "grad_norm": 0.24505066269967637, + "kl": 0.0689697265625, + "learning_rate": 6.1669921875e-07, + "loss": 0.0028, + "reward": 1.7604427337646484, + "reward_std": 0.028698831796646118, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.760442703962326, + "step": 3140 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.9921875, + "epoch": 1.5341796875, + "grad_norm": 1.735357218192801, + "kl": 0.092529296875, + "learning_rate": 6.165771484375e-07, + "loss": 0.0037, + "reward": 1.8620553016662598, + "reward_std": 0.03215474262833595, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8620553314685822, + "step": 3141 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.4375, + "epoch": 1.53466796875, + "grad_norm": 2.8803153468108436, + "kl": 0.076416015625, + "learning_rate": 6.16455078125e-07, + "loss": 0.0031, + "reward": 1.8484528064727783, + "reward_std": 0.07194521278142929, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8484528362751007, + "step": 3142 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.7734375, + "epoch": 1.53515625, + "grad_norm": 2.679606476186728, + "kl": 0.0750732421875, + "learning_rate": 6.163330078125e-07, + "loss": 0.003, + "reward": 1.7876355051994324, + "reward_std": 0.12041214294731617, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7954480051994324, + "step": 3143 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.953125, + "epoch": 1.53564453125, + "grad_norm": 1.616840595359916, + "kl": 0.06201171875, + "learning_rate": 6.162109374999999e-07, + "loss": 0.0025, + "reward": 1.8307392001152039, + "reward_std": 0.03423440642654896, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8307392001152039, + "step": 3144 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.140625, + "epoch": 1.5361328125, + "grad_norm": 2.036514638498422, + "kl": 0.10400390625, + "learning_rate": 6.160888671874999e-07, + "loss": 0.0042, + "reward": 1.8995371460914612, + "reward_std": 0.08496665954589844, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8995371758937836, + "step": 3145 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.5859375, + "epoch": 1.53662109375, + "grad_norm": 1.0505257847917335, + "kl": 0.0780029296875, + "learning_rate": 6.15966796875e-07, + "loss": 0.0031, + "reward": 1.7895857691764832, + "reward_std": 0.032869850285351276, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7895857095718384, + "step": 3146 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.4140625, + "epoch": 1.537109375, + "grad_norm": 2.49269461000481, + "kl": 0.087646484375, + "learning_rate": 6.158447265625e-07, + "loss": 0.0035, + "reward": 1.729568362236023, + "reward_std": 0.14366939291357994, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7530059218406677, + "step": 3147 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.53125, + "epoch": 1.53759765625, + "grad_norm": 0.9767868294994723, + "kl": 0.06982421875, + "learning_rate": 6.1572265625e-07, + "loss": 0.0028, + "reward": 1.9281029105186462, + "reward_std": 0.06500357203185558, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9281029105186462, + "step": 3148 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.53125, + "epoch": 1.5380859375, + "grad_norm": 1.567228738812135, + "kl": 0.080078125, + "learning_rate": 6.156005859375e-07, + "loss": 0.0032, + "reward": 1.8201581239700317, + "reward_std": 0.07531145215034485, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8279706239700317, + "step": 3149 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.1015625, + "epoch": 1.53857421875, + "grad_norm": 1.37480935198485, + "kl": 0.07861328125, + "learning_rate": 6.15478515625e-07, + "loss": 0.0031, + "reward": 1.8080313205718994, + "reward_std": 0.02758025284856558, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8080313503742218, + "step": 3150 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.0390625, + "epoch": 1.5390625, + "grad_norm": 1.8428019019258013, + "kl": 0.096435546875, + "learning_rate": 6.153564453124999e-07, + "loss": 0.0039, + "reward": 1.6825060844421387, + "reward_std": 0.03835061937570572, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6825061142444611, + "step": 3151 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.46875, + "epoch": 1.53955078125, + "grad_norm": 1.7497641001072382, + "kl": 0.104736328125, + "learning_rate": 6.152343749999999e-07, + "loss": 0.0042, + "reward": 1.747897982597351, + "reward_std": 0.04787810705602169, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7478979229927063, + "step": 3152 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.34375, + "epoch": 1.5400390625, + "grad_norm": 5.833818291905867, + "kl": 0.089599609375, + "learning_rate": 6.151123046875e-07, + "loss": 0.0036, + "reward": 1.6996482610702515, + "reward_std": 0.033032437320798635, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6996482610702515, + "step": 3153 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.1953125, + "epoch": 1.54052734375, + "grad_norm": 3.41976551776808, + "kl": 0.0748291015625, + "learning_rate": 6.14990234375e-07, + "loss": 0.003, + "reward": 1.7531882524490356, + "reward_std": 0.01387872640043497, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7531882524490356, + "step": 3154 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.2578125, + "epoch": 1.541015625, + "grad_norm": 1.8858479830744364, + "kl": 0.067626953125, + "learning_rate": 6.148681640625e-07, + "loss": 0.0027, + "reward": 1.6333616375923157, + "reward_std": 0.07776164263486862, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6411741375923157, + "step": 3155 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.0078125, + "epoch": 1.54150390625, + "grad_norm": 0.8035865329277778, + "kl": 0.05712890625, + "learning_rate": 6.1474609375e-07, + "loss": 0.0023, + "reward": 1.7784574627876282, + "reward_std": 0.05039230780676007, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7784574329853058, + "step": 3156 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.9296875, + "epoch": 1.5419921875, + "grad_norm": 6.951850739468607, + "kl": 0.0699462890625, + "learning_rate": 6.146240234374999e-07, + "loss": 0.0028, + "reward": 1.7199862003326416, + "reward_std": 0.1958215907216072, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7199861109256744, + "step": 3157 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.9453125, + "epoch": 1.54248046875, + "grad_norm": 3.9343633726846288, + "kl": 0.076904296875, + "learning_rate": 6.145019531249999e-07, + "loss": 0.0031, + "reward": 1.8535465002059937, + "reward_std": 0.04394886875525117, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8535465002059937, + "step": 3158 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.109375, + "epoch": 1.54296875, + "grad_norm": 1.6757964157727798, + "kl": 0.0643310546875, + "learning_rate": 6.143798828125e-07, + "loss": 0.0026, + "reward": 1.8247731924057007, + "reward_std": 0.11207094416022301, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8325856924057007, + "step": 3159 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.9140625, + "epoch": 1.54345703125, + "grad_norm": 1.5482408835535821, + "kl": 0.057373046875, + "learning_rate": 6.142578125e-07, + "loss": 0.0023, + "reward": 1.7898805737495422, + "reward_std": 0.08811133727431297, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7898805737495422, + "step": 3160 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.7890625, + "epoch": 1.5439453125, + "grad_norm": 1.4270590005546342, + "kl": 0.06005859375, + "learning_rate": 6.141357421875e-07, + "loss": 0.0024, + "reward": 1.7193145751953125, + "reward_std": 0.12322738021612167, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7271271049976349, + "step": 3161 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.328125, + "epoch": 1.54443359375, + "grad_norm": 8.265490886129752, + "kl": 0.0648193359375, + "learning_rate": 6.14013671875e-07, + "loss": 0.0026, + "reward": 1.8019053936004639, + "reward_std": 0.03768607368692756, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8019054532051086, + "step": 3162 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.9921875, + "epoch": 1.544921875, + "grad_norm": 2.5739517984161697, + "kl": 0.07080078125, + "learning_rate": 6.138916015625e-07, + "loss": 0.0028, + "reward": 1.8906748294830322, + "reward_std": 0.04288986138999462, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.890674889087677, + "step": 3163 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.859375, + "epoch": 1.54541015625, + "grad_norm": 4.09072500677864, + "kl": 0.068115234375, + "learning_rate": 6.137695312499999e-07, + "loss": 0.0027, + "reward": 1.8343228101730347, + "reward_std": 0.07122788205742836, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8421353101730347, + "step": 3164 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.8671875, + "epoch": 1.5458984375, + "grad_norm": 2.310781538336421, + "kl": 0.0704345703125, + "learning_rate": 6.136474609374999e-07, + "loss": 0.0028, + "reward": 1.69595205783844, + "reward_std": 0.13838719576597214, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7115771174430847, + "step": 3165 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.8828125, + "epoch": 1.54638671875, + "grad_norm": 7.1679887718503545, + "kl": 0.076904296875, + "learning_rate": 6.13525390625e-07, + "loss": 0.0031, + "reward": 1.631429135799408, + "reward_std": 0.11012212559580803, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.639241635799408, + "step": 3166 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.484375, + "epoch": 1.546875, + "grad_norm": 1.8191201659721, + "kl": 0.0643310546875, + "learning_rate": 6.134033203125e-07, + "loss": 0.0026, + "reward": 1.7357022166252136, + "reward_std": 0.16989228129386902, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7747646570205688, + "step": 3167 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.953125, + "epoch": 1.54736328125, + "grad_norm": 1.402226497605308, + "kl": 0.090576171875, + "learning_rate": 6.1328125e-07, + "loss": 0.0036, + "reward": 1.6996177434921265, + "reward_std": 0.12328409217298031, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7152426540851593, + "step": 3168 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.25, + "epoch": 1.5478515625, + "grad_norm": 1.2654508230027297, + "kl": 0.0609130859375, + "learning_rate": 6.131591796875e-07, + "loss": 0.0024, + "reward": 1.8366875052452087, + "reward_std": 0.04399787541478872, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8366875350475311, + "step": 3169 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.421875, + "epoch": 1.54833984375, + "grad_norm": 7.351539110964937, + "kl": 0.0694580078125, + "learning_rate": 6.130371093749999e-07, + "loss": 0.0028, + "reward": 1.8194851875305176, + "reward_std": 0.04994682688266039, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8194851577281952, + "step": 3170 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.1875, + "epoch": 1.548828125, + "grad_norm": 0.9587044136725793, + "kl": 0.0552978515625, + "learning_rate": 6.129150390624999e-07, + "loss": 0.0022, + "reward": 1.7184346914291382, + "reward_std": 0.11068252101540565, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7340596914291382, + "step": 3171 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.40625, + "epoch": 1.54931640625, + "grad_norm": 5.169780841895419, + "kl": 0.06201171875, + "learning_rate": 6.1279296875e-07, + "loss": 0.0025, + "reward": 1.6911205053329468, + "reward_std": 0.1017858237028122, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7067455053329468, + "step": 3172 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.140625, + "epoch": 1.5498046875, + "grad_norm": 4.360943076089211, + "kl": 0.058349609375, + "learning_rate": 6.126708984375e-07, + "loss": 0.0023, + "reward": 1.6890897750854492, + "reward_std": 0.1352338343858719, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.7203397750854492, + "step": 3173 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.5703125, + "epoch": 1.55029296875, + "grad_norm": 4.05868490070331, + "kl": 0.058837890625, + "learning_rate": 6.12548828125e-07, + "loss": 0.0024, + "reward": 1.8603836297988892, + "reward_std": 0.11744150519371033, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8681961894035339, + "step": 3174 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.578125, + "epoch": 1.55078125, + "grad_norm": 3.0900057199687456, + "kl": 0.063720703125, + "learning_rate": 6.124267578125e-07, + "loss": 0.0026, + "reward": 1.7822343111038208, + "reward_std": 0.04202779196202755, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7822343111038208, + "step": 3175 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.3671875, + "epoch": 1.55126953125, + "grad_norm": 1.3501893344640266, + "kl": 0.0716552734375, + "learning_rate": 6.123046875e-07, + "loss": 0.0029, + "reward": 1.9244567155838013, + "reward_std": 0.13922565057873726, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.9478942155838013, + "step": 3176 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.8984375, + "epoch": 1.5517578125, + "grad_norm": 1.423254075976608, + "kl": 0.059326171875, + "learning_rate": 6.121826171874999e-07, + "loss": 0.0024, + "reward": 1.890386700630188, + "reward_std": 0.09997991472482681, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8981991708278656, + "step": 3177 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.5546875, + "epoch": 1.55224609375, + "grad_norm": 0.7915773476493393, + "kl": 0.0577392578125, + "learning_rate": 6.120605468749999e-07, + "loss": 0.0023, + "reward": 1.784228265285492, + "reward_std": 0.016746554523706436, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7842282950878143, + "step": 3178 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.2734375, + "epoch": 1.552734375, + "grad_norm": 1.05922557400982, + "kl": 0.0587158203125, + "learning_rate": 6.119384765625e-07, + "loss": 0.0023, + "reward": 1.7570964097976685, + "reward_std": 0.12910258024930954, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7805339395999908, + "step": 3179 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.40625, + "epoch": 1.55322265625, + "grad_norm": 1.6684105733598493, + "kl": 0.0655517578125, + "learning_rate": 6.1181640625e-07, + "loss": 0.0026, + "reward": 1.5738105773925781, + "reward_std": 0.14181802049279213, + "rewards/format_reward": 0.921875, + "rewards/ocr_reward": 0.6519355773925781, + "step": 3180 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.9921875, + "epoch": 1.5537109375, + "grad_norm": 1.1439095055320303, + "kl": 0.0625, + "learning_rate": 6.116943359375e-07, + "loss": 0.0025, + "reward": 1.791804313659668, + "reward_std": 0.07595885917544365, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.791804313659668, + "step": 3181 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.046875, + "epoch": 1.55419921875, + "grad_norm": 1.538031849450602, + "kl": 0.06298828125, + "learning_rate": 6.11572265625e-07, + "loss": 0.0025, + "reward": 1.7163517475128174, + "reward_std": 0.06050669401884079, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7163517475128174, + "step": 3182 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.1953125, + "epoch": 1.5546875, + "grad_norm": 0.7099072244794439, + "kl": 0.0516357421875, + "learning_rate": 6.114501953124999e-07, + "loss": 0.0021, + "reward": 1.7604435086250305, + "reward_std": 0.019243311136960983, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7604435384273529, + "step": 3183 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.6171875, + "epoch": 1.55517578125, + "grad_norm": 1.464970168168766, + "kl": 0.068115234375, + "learning_rate": 6.113281249999999e-07, + "loss": 0.0027, + "reward": 1.7117717266082764, + "reward_std": 0.03939279168844223, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7117717266082764, + "step": 3184 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.2890625, + "epoch": 1.5556640625, + "grad_norm": 2.4843836102727055, + "kl": 0.0599365234375, + "learning_rate": 6.112060546875e-07, + "loss": 0.0024, + "reward": 1.7494711875915527, + "reward_std": 0.16199829429388046, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7729085981845856, + "step": 3185 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.5390625, + "epoch": 1.55615234375, + "grad_norm": 2.4069985981517785, + "kl": 0.062255859375, + "learning_rate": 6.11083984375e-07, + "loss": 0.0025, + "reward": 1.844676434993744, + "reward_std": 0.07243941724300385, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8446764647960663, + "step": 3186 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.4765625, + "epoch": 1.556640625, + "grad_norm": 1.9087556018788279, + "kl": 0.0859375, + "learning_rate": 6.109619140625e-07, + "loss": 0.0034, + "reward": 1.7857062220573425, + "reward_std": 0.04655470885336399, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7857063114643097, + "step": 3187 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.625, + "epoch": 1.55712890625, + "grad_norm": 2.7113439900767484, + "kl": 0.0533447265625, + "learning_rate": 6.1083984375e-07, + "loss": 0.0021, + "reward": 1.6991124153137207, + "reward_std": 0.18883011117577553, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7459874153137207, + "step": 3188 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.1875, + "epoch": 1.5576171875, + "grad_norm": 5.941979108249293, + "kl": 0.0625, + "learning_rate": 6.107177734375e-07, + "loss": 0.0025, + "reward": 1.680684208869934, + "reward_std": 0.12884881347417831, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6963091790676117, + "step": 3189 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.421875, + "epoch": 1.55810546875, + "grad_norm": 1.2343491223909973, + "kl": 0.069580078125, + "learning_rate": 6.105957031249999e-07, + "loss": 0.0028, + "reward": 1.816174864768982, + "reward_std": 0.03796030767261982, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8161748945713043, + "step": 3190 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.4453125, + "epoch": 1.55859375, + "grad_norm": 5.34134665833269, + "kl": 0.053955078125, + "learning_rate": 6.104736328124999e-07, + "loss": 0.0022, + "reward": 1.7767646312713623, + "reward_std": 0.05293313413858414, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7767646312713623, + "step": 3191 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.59375, + "epoch": 1.55908203125, + "grad_norm": 1.857327467302615, + "kl": 0.070556640625, + "learning_rate": 6.103515625e-07, + "loss": 0.0028, + "reward": 1.6192150712013245, + "reward_std": 0.04519081301987171, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6192150712013245, + "step": 3192 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.8125, + "epoch": 1.5595703125, + "grad_norm": 0.7278409601873849, + "kl": 0.061279296875, + "learning_rate": 6.102294921875e-07, + "loss": 0.0025, + "reward": 1.6941944360733032, + "reward_std": 0.11231286264955997, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7098194360733032, + "step": 3193 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.4765625, + "epoch": 1.56005859375, + "grad_norm": 1.4439461511082938, + "kl": 0.0611572265625, + "learning_rate": 6.10107421875e-07, + "loss": 0.0024, + "reward": 1.7672069072723389, + "reward_std": 0.029297824949026108, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7672069072723389, + "step": 3194 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.90625, + "epoch": 1.560546875, + "grad_norm": 1.5977665784586512, + "kl": 0.06640625, + "learning_rate": 6.099853515625e-07, + "loss": 0.0027, + "reward": 1.8012299537658691, + "reward_std": 0.07727400679141283, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8012299239635468, + "step": 3195 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.2265625, + "epoch": 1.56103515625, + "grad_norm": 5.699754950304771, + "kl": 0.082763671875, + "learning_rate": 6.0986328125e-07, + "loss": 0.0033, + "reward": 1.669058918952942, + "reward_std": 0.07229340635240078, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6690589189529419, + "step": 3196 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.6875, + "epoch": 1.5615234375, + "grad_norm": 1.7331866102593816, + "kl": 0.060791015625, + "learning_rate": 6.097412109374999e-07, + "loss": 0.0024, + "reward": 1.8495106101036072, + "reward_std": 0.07928337901830673, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8573231399059296, + "step": 3197 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.046875, + "epoch": 1.56201171875, + "grad_norm": 1.7502170233469734, + "kl": 0.08056640625, + "learning_rate": 6.09619140625e-07, + "loss": 0.0032, + "reward": 1.7798677682876587, + "reward_std": 0.07166917249560356, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7798677682876587, + "step": 3198 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.2421875, + "epoch": 1.5625, + "grad_norm": 1.6271455308496765, + "kl": 0.0672607421875, + "learning_rate": 6.094970703125e-07, + "loss": 0.0027, + "reward": 1.837442696094513, + "reward_std": 0.08605869952589273, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.8843176364898682, + "step": 3199 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.421875, + "epoch": 1.56298828125, + "grad_norm": 1.858548397213829, + "kl": 0.0660400390625, + "learning_rate": 6.09375e-07, + "loss": 0.0026, + "reward": 1.852772295475006, + "reward_std": 0.04148021200671792, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8527723550796509, + "step": 3200 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.1328125, + "epoch": 1.5634765625, + "grad_norm": 2.830884430833045, + "kl": 0.084228515625, + "learning_rate": 6.092529296875e-07, + "loss": 0.0034, + "reward": 1.7869747877120972, + "reward_std": 0.031565818935632706, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7869747877120972, + "step": 3201 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.9140625, + "epoch": 1.56396484375, + "grad_norm": 1.7092660582715655, + "kl": 0.0751953125, + "learning_rate": 6.09130859375e-07, + "loss": 0.003, + "reward": 1.7863489985466003, + "reward_std": 0.11272731982171535, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8019739985466003, + "step": 3202 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.203125, + "epoch": 1.564453125, + "grad_norm": 4.3119377697610295, + "kl": 0.05322265625, + "learning_rate": 6.090087890624999e-07, + "loss": 0.0021, + "reward": 1.8258104920387268, + "reward_std": 0.05819419212639332, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8258104920387268, + "step": 3203 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.03125, + "epoch": 1.56494140625, + "grad_norm": 1.7927135426124725, + "kl": 0.0760498046875, + "learning_rate": 6.088867187499999e-07, + "loss": 0.003, + "reward": 1.7928436994552612, + "reward_std": 0.03461040183901787, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7928436994552612, + "step": 3204 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.4375, + "epoch": 1.5654296875, + "grad_norm": 2.7188622700332865, + "kl": 0.064697265625, + "learning_rate": 6.087646484375e-07, + "loss": 0.0026, + "reward": 1.7308800220489502, + "reward_std": 0.15542292036116123, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7465050220489502, + "step": 3205 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.9921875, + "epoch": 1.56591796875, + "grad_norm": 1.020499144334956, + "kl": 0.060546875, + "learning_rate": 6.08642578125e-07, + "loss": 0.0024, + "reward": 1.7392455339431763, + "reward_std": 0.11669945158064365, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7548705637454987, + "step": 3206 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.625, + "epoch": 1.56640625, + "grad_norm": 2.073316599502687, + "kl": 0.08447265625, + "learning_rate": 6.085205078125e-07, + "loss": 0.0034, + "reward": 1.78858482837677, + "reward_std": 0.05400579236447811, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.78858482837677, + "step": 3207 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.953125, + "epoch": 1.56689453125, + "grad_norm": 1.0320420379679651, + "kl": 0.0703125, + "learning_rate": 6.083984375e-07, + "loss": 0.0028, + "reward": 1.7168057560920715, + "reward_std": 0.08419827371835709, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7246182560920715, + "step": 3208 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.953125, + "epoch": 1.5673828125, + "grad_norm": 1.1431633568387154, + "kl": 0.080810546875, + "learning_rate": 6.082763671875e-07, + "loss": 0.0032, + "reward": 1.8084399104118347, + "reward_std": 0.05502317473292351, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8084399402141571, + "step": 3209 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.203125, + "epoch": 1.56787109375, + "grad_norm": 3.5444712422473756, + "kl": 0.07470703125, + "learning_rate": 6.081542968749999e-07, + "loss": 0.003, + "reward": 1.7140299677848816, + "reward_std": 0.04421941842883825, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.714029997587204, + "step": 3210 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.7421875, + "epoch": 1.568359375, + "grad_norm": 1.3334847785571438, + "kl": 0.087646484375, + "learning_rate": 6.080322265625e-07, + "loss": 0.0035, + "reward": 1.7436646223068237, + "reward_std": 0.05065160011872649, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7436646223068237, + "step": 3211 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.65625, + "epoch": 1.56884765625, + "grad_norm": 2.331640767662747, + "kl": 0.068115234375, + "learning_rate": 6.0791015625e-07, + "loss": 0.0027, + "reward": 1.755677580833435, + "reward_std": 0.038787453435361385, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7556776106357574, + "step": 3212 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.65625, + "epoch": 1.5693359375, + "grad_norm": 1.475076168231399, + "kl": 0.07958984375, + "learning_rate": 6.077880859375e-07, + "loss": 0.0032, + "reward": 1.8441100716590881, + "reward_std": 0.06590352766215801, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8441100716590881, + "step": 3213 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.59375, + "epoch": 1.56982421875, + "grad_norm": 1.3134125044104092, + "kl": 0.09912109375, + "learning_rate": 6.07666015625e-07, + "loss": 0.004, + "reward": 1.8156479597091675, + "reward_std": 0.07976316474378109, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8234605491161346, + "step": 3214 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.3046875, + "epoch": 1.5703125, + "grad_norm": 2.259184238309457, + "kl": 0.065673828125, + "learning_rate": 6.075439453125e-07, + "loss": 0.0026, + "reward": 2.002101182937622, + "reward_std": 0.05331834591925144, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 1.002101182937622, + "step": 3215 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.3671875, + "epoch": 1.57080078125, + "grad_norm": 5.468710622222766, + "kl": 0.0555419921875, + "learning_rate": 6.074218749999999e-07, + "loss": 0.0022, + "reward": 1.8509008884429932, + "reward_std": 0.030736079439520836, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8509008884429932, + "step": 3216 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.5078125, + "epoch": 1.5712890625, + "grad_norm": 1.528191106828427, + "kl": 0.070068359375, + "learning_rate": 6.072998046874999e-07, + "loss": 0.0028, + "reward": 1.7934442162513733, + "reward_std": 0.017778453417122364, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7934441566467285, + "step": 3217 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.8671875, + "epoch": 1.57177734375, + "grad_norm": 2.864106160888729, + "kl": 0.0616455078125, + "learning_rate": 6.07177734375e-07, + "loss": 0.0025, + "reward": 1.809788703918457, + "reward_std": 0.1024474948644638, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8176011443138123, + "step": 3218 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.59375, + "epoch": 1.572265625, + "grad_norm": 3.8916992472152545, + "kl": 0.071533203125, + "learning_rate": 6.070556640625e-07, + "loss": 0.0029, + "reward": 1.6070039868354797, + "reward_std": 0.08922014944255352, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.661691427230835, + "step": 3219 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.90625, + "epoch": 1.57275390625, + "grad_norm": 2.5009858382320638, + "kl": 0.079833984375, + "learning_rate": 6.0693359375e-07, + "loss": 0.0032, + "reward": 1.78548663854599, + "reward_std": 0.062262922525405884, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.78548663854599, + "step": 3220 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.3203125, + "epoch": 1.5732421875, + "grad_norm": 0.6996858384501395, + "kl": 0.079833984375, + "learning_rate": 6.068115234375e-07, + "loss": 0.0032, + "reward": 1.783010184764862, + "reward_std": 0.04869150370359421, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7908226549625397, + "step": 3221 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.109375, + "epoch": 1.57373046875, + "grad_norm": 1.1006856780390493, + "kl": 0.0555419921875, + "learning_rate": 6.06689453125e-07, + "loss": 0.0022, + "reward": 1.8491575717926025, + "reward_std": 0.05759404879063368, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8491575717926025, + "step": 3222 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.3515625, + "epoch": 1.57421875, + "grad_norm": 1.1841499529105677, + "kl": 0.07666015625, + "learning_rate": 6.065673828124999e-07, + "loss": 0.0031, + "reward": 1.8020890951156616, + "reward_std": 0.036463672295212746, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.802089124917984, + "step": 3223 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.2421875, + "epoch": 1.57470703125, + "grad_norm": 0.7286182525692657, + "kl": 0.0599365234375, + "learning_rate": 6.064453125e-07, + "loss": 0.0024, + "reward": 1.7325817942619324, + "reward_std": 0.027954386197961867, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7325817942619324, + "step": 3224 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.71875, + "epoch": 1.5751953125, + "grad_norm": 1.237495827483586, + "kl": 0.06689453125, + "learning_rate": 6.063232421875e-07, + "loss": 0.0027, + "reward": 1.7775406241416931, + "reward_std": 0.08435166534036398, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8009780943393707, + "step": 3225 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.90625, + "epoch": 1.57568359375, + "grad_norm": 0.528238479149417, + "kl": 0.0548095703125, + "learning_rate": 6.06201171875e-07, + "loss": 0.0022, + "reward": 1.8354755640029907, + "reward_std": 0.14160921424627304, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8511005938053131, + "step": 3226 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.484375, + "epoch": 1.576171875, + "grad_norm": 1.8220474361249062, + "kl": 0.0606689453125, + "learning_rate": 6.060791015625e-07, + "loss": 0.0024, + "reward": 1.8181970715522766, + "reward_std": 0.13987145572900772, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8338220417499542, + "step": 3227 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.859375, + "epoch": 1.57666015625, + "grad_norm": 0.6962390678727239, + "kl": 0.0594482421875, + "learning_rate": 6.0595703125e-07, + "loss": 0.0024, + "reward": 1.7669880390167236, + "reward_std": 0.06059642741456628, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7826130092144012, + "step": 3228 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.1640625, + "epoch": 1.5771484375, + "grad_norm": 1.5005401701296768, + "kl": 0.076904296875, + "learning_rate": 6.058349609374999e-07, + "loss": 0.0031, + "reward": 1.8269048929214478, + "reward_std": 0.05390936695039272, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8269048929214478, + "step": 3229 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.5234375, + "epoch": 1.57763671875, + "grad_norm": 16.482646426702743, + "kl": 0.069580078125, + "learning_rate": 6.057128906249999e-07, + "loss": 0.0028, + "reward": 1.763689935207367, + "reward_std": 0.19288001954555511, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7871273458003998, + "step": 3230 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.7265625, + "epoch": 1.578125, + "grad_norm": 3.7445612059262894, + "kl": 0.0947265625, + "learning_rate": 6.055908203125e-07, + "loss": 0.0038, + "reward": 1.7791760563850403, + "reward_std": 0.03553357906639576, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7791760265827179, + "step": 3231 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.734375, + "epoch": 1.57861328125, + "grad_norm": 15.055576583772224, + "kl": 0.0604248046875, + "learning_rate": 6.0546875e-07, + "loss": 0.0024, + "reward": 1.8308890461921692, + "reward_std": 0.10220515914261341, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8387015163898468, + "step": 3232 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.4609375, + "epoch": 1.5791015625, + "grad_norm": 0.9772827198196143, + "kl": 0.07421875, + "learning_rate": 6.053466796875e-07, + "loss": 0.003, + "reward": 1.8157562017440796, + "reward_std": 0.1360500417649746, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8313812017440796, + "step": 3233 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.796875, + "epoch": 1.57958984375, + "grad_norm": 0.6814418196014778, + "kl": 0.09228515625, + "learning_rate": 6.05224609375e-07, + "loss": 0.0037, + "reward": 1.7003534436225891, + "reward_std": 0.06812034081667662, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7081659436225891, + "step": 3234 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.5078125, + "epoch": 1.580078125, + "grad_norm": 4.059148446029115, + "kl": 0.073974609375, + "learning_rate": 6.051025390625e-07, + "loss": 0.003, + "reward": 1.7905691862106323, + "reward_std": 0.053706713020801544, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7905691266059875, + "step": 3235 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.6328125, + "epoch": 1.58056640625, + "grad_norm": 0.89045191140724, + "kl": 0.057861328125, + "learning_rate": 6.049804687499999e-07, + "loss": 0.0023, + "reward": 1.9589157104492188, + "reward_std": 0.05596003495156765, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9589157104492188, + "step": 3236 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.671875, + "epoch": 1.5810546875, + "grad_norm": 27.98655992203742, + "kl": 0.091796875, + "learning_rate": 6.048583984375e-07, + "loss": 0.0037, + "reward": 1.742477536201477, + "reward_std": 0.09132163226604462, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.742477536201477, + "step": 3237 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.78125, + "epoch": 1.58154296875, + "grad_norm": 1.8475477481704758, + "kl": 0.067626953125, + "learning_rate": 6.04736328125e-07, + "loss": 0.0027, + "reward": 1.7812891602516174, + "reward_std": 0.13560626655817032, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7969141900539398, + "step": 3238 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.109375, + "epoch": 1.58203125, + "grad_norm": 1.6327596150343278, + "kl": 0.087158203125, + "learning_rate": 6.046142578125e-07, + "loss": 0.0035, + "reward": 1.799069106578827, + "reward_std": 0.057514723390340805, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7990691363811493, + "step": 3239 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.203125, + "epoch": 1.58251953125, + "grad_norm": 2.100124971039901, + "kl": 0.0732421875, + "learning_rate": 6.044921875e-07, + "loss": 0.0029, + "reward": 1.7056349515914917, + "reward_std": 0.021463132463395596, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7056349515914917, + "step": 3240 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.109375, + "epoch": 1.5830078125, + "grad_norm": 0.9123360385826916, + "kl": 0.0704345703125, + "learning_rate": 6.043701171875e-07, + "loss": 0.0028, + "reward": 1.8157188296318054, + "reward_std": 0.06355854496359825, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8157188296318054, + "step": 3241 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.7421875, + "epoch": 1.58349609375, + "grad_norm": 3.3904581418786015, + "kl": 0.08642578125, + "learning_rate": 6.042480468749999e-07, + "loss": 0.0035, + "reward": 1.8644654154777527, + "reward_std": 0.06617464870214462, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8644654750823975, + "step": 3242 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.0390625, + "epoch": 1.583984375, + "grad_norm": 2.5280802263881643, + "kl": 0.06982421875, + "learning_rate": 6.041259765624999e-07, + "loss": 0.0028, + "reward": 1.8594006896018982, + "reward_std": 0.03832878777757287, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8672131299972534, + "step": 3243 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.0625, + "epoch": 1.58447265625, + "grad_norm": 1.7853672075638696, + "kl": 0.09375, + "learning_rate": 6.0400390625e-07, + "loss": 0.0037, + "reward": 1.7820322513580322, + "reward_std": 0.03775404021143913, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7820322811603546, + "step": 3244 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.4609375, + "epoch": 1.5849609375, + "grad_norm": 3.1479729709522544, + "kl": 0.09033203125, + "learning_rate": 6.038818359375e-07, + "loss": 0.0036, + "reward": 1.828788161277771, + "reward_std": 0.029659430496394634, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8287881314754486, + "step": 3245 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.7734375, + "epoch": 1.58544921875, + "grad_norm": 1.8693724346947198, + "kl": 0.091552734375, + "learning_rate": 6.03759765625e-07, + "loss": 0.0037, + "reward": 1.6129669547080994, + "reward_std": 0.051816992461681366, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6129669547080994, + "step": 3246 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.0625, + "epoch": 1.5859375, + "grad_norm": 2.4589967009937026, + "kl": 0.0771484375, + "learning_rate": 6.036376953125e-07, + "loss": 0.0031, + "reward": 1.776804268360138, + "reward_std": 0.04013761132955551, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7768042683601379, + "step": 3247 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.078125, + "epoch": 1.58642578125, + "grad_norm": 0.9274776553938363, + "kl": 0.080322265625, + "learning_rate": 6.03515625e-07, + "loss": 0.0032, + "reward": 1.7535163760185242, + "reward_std": 0.046172965317964554, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7535163760185242, + "step": 3248 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.0859375, + "epoch": 1.5869140625, + "grad_norm": 1.1471100947644117, + "kl": 0.07421875, + "learning_rate": 6.033935546874999e-07, + "loss": 0.003, + "reward": 1.738187551498413, + "reward_std": 0.0872982544824481, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7460000813007355, + "step": 3249 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.390625, + "epoch": 1.58740234375, + "grad_norm": 0.9499426055546841, + "kl": 0.0634765625, + "learning_rate": 6.032714843749999e-07, + "loss": 0.0025, + "reward": 1.8778213262557983, + "reward_std": 0.08278293255716562, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.9012588858604431, + "step": 3250 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.3984375, + "epoch": 1.587890625, + "grad_norm": 2.2261842170424577, + "kl": 0.080322265625, + "learning_rate": 6.031494140625e-07, + "loss": 0.0032, + "reward": 1.7778486013412476, + "reward_std": 0.0640218211337924, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7856611609458923, + "step": 3251 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.21875, + "epoch": 1.58837890625, + "grad_norm": 1.2139552367739217, + "kl": 0.0704345703125, + "learning_rate": 6.0302734375e-07, + "loss": 0.0028, + "reward": 1.8429046869277954, + "reward_std": 0.028244564309716225, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8429047465324402, + "step": 3252 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.8828125, + "epoch": 1.5888671875, + "grad_norm": 1.4532070226696283, + "kl": 0.0849609375, + "learning_rate": 6.029052734375e-07, + "loss": 0.0034, + "reward": 1.6911569833755493, + "reward_std": 0.1081528514623642, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6989694237709045, + "step": 3253 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.234375, + "epoch": 1.58935546875, + "grad_norm": 1.2377545429292913, + "kl": 0.066650390625, + "learning_rate": 6.02783203125e-07, + "loss": 0.0027, + "reward": 1.853829026222229, + "reward_std": 0.02524241991341114, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8538290560245514, + "step": 3254 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.6328125, + "epoch": 1.58984375, + "grad_norm": 1.0689435406627494, + "kl": 0.07568359375, + "learning_rate": 6.026611328124999e-07, + "loss": 0.003, + "reward": 1.7502402663230896, + "reward_std": 0.08583210222423077, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7580527365207672, + "step": 3255 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.609375, + "epoch": 1.59033203125, + "grad_norm": 2.129247438581581, + "kl": 0.07275390625, + "learning_rate": 6.025390624999999e-07, + "loss": 0.0029, + "reward": 1.6336244344711304, + "reward_std": 0.08128929510712624, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6336244642734528, + "step": 3256 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.0625, + "epoch": 1.5908203125, + "grad_norm": 2.9401186862267634, + "kl": 0.076904296875, + "learning_rate": 6.024169921875e-07, + "loss": 0.0031, + "reward": 1.7974181175231934, + "reward_std": 0.03139576967805624, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7974181473255157, + "step": 3257 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.015625, + "epoch": 1.59130859375, + "grad_norm": 1.7758066245944604, + "kl": 0.0751953125, + "learning_rate": 6.02294921875e-07, + "loss": 0.003, + "reward": 1.7531208395957947, + "reward_std": 0.02848457545042038, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7531208395957947, + "step": 3258 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.859375, + "epoch": 1.591796875, + "grad_norm": 6.7846858101834435, + "kl": 0.0712890625, + "learning_rate": 6.021728515625e-07, + "loss": 0.0029, + "reward": 1.7426277995109558, + "reward_std": 0.06002306379377842, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.742627739906311, + "step": 3259 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.5625, + "epoch": 1.59228515625, + "grad_norm": 3.355150773217038, + "kl": 0.0810546875, + "learning_rate": 6.0205078125e-07, + "loss": 0.0032, + "reward": 1.913890540599823, + "reward_std": 0.041608670726418495, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9138904809951782, + "step": 3260 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.2421875, + "epoch": 1.5927734375, + "grad_norm": 0.9105120534656213, + "kl": 0.0877685546875, + "learning_rate": 6.019287109375e-07, + "loss": 0.0035, + "reward": 1.8028390407562256, + "reward_std": 0.07113232091069221, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8028390407562256, + "step": 3261 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.703125, + "epoch": 1.59326171875, + "grad_norm": 0.9245025145271322, + "kl": 0.07470703125, + "learning_rate": 6.018066406249999e-07, + "loss": 0.003, + "reward": 1.671428918838501, + "reward_std": 0.05243074335157871, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6714289486408234, + "step": 3262 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.9765625, + "epoch": 1.59375, + "grad_norm": 2.130691763806817, + "kl": 0.06982421875, + "learning_rate": 6.016845703124999e-07, + "loss": 0.0028, + "reward": 1.7941365838050842, + "reward_std": 0.09248049557209015, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.809761643409729, + "step": 3263 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.8828125, + "epoch": 1.59423828125, + "grad_norm": 1.4565066323176799, + "kl": 0.0625, + "learning_rate": 6.015625e-07, + "loss": 0.0025, + "reward": 1.7813687324523926, + "reward_std": 0.07458901032805443, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7891812920570374, + "step": 3264 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.3359375, + "epoch": 1.5947265625, + "grad_norm": 1.0694645510879401, + "kl": 0.068115234375, + "learning_rate": 6.014404296875e-07, + "loss": 0.0027, + "reward": 1.7209742665290833, + "reward_std": 0.12353447079658508, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7678492069244385, + "step": 3265 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.6171875, + "epoch": 1.59521484375, + "grad_norm": 2.180425663820791, + "kl": 0.074951171875, + "learning_rate": 6.01318359375e-07, + "loss": 0.003, + "reward": 1.7168704271316528, + "reward_std": 0.10325317457318306, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7324954271316528, + "step": 3266 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.9140625, + "epoch": 1.595703125, + "grad_norm": 2.791039159238429, + "kl": 0.0684814453125, + "learning_rate": 6.011962890625e-07, + "loss": 0.0027, + "reward": 1.7321181297302246, + "reward_std": 0.10391049832105637, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.786805659532547, + "step": 3267 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.0078125, + "epoch": 1.59619140625, + "grad_norm": 0.8947974535479477, + "kl": 0.0577392578125, + "learning_rate": 6.010742187499999e-07, + "loss": 0.0023, + "reward": 1.8403544425964355, + "reward_std": 0.055374979972839355, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8481669425964355, + "step": 3268 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.0078125, + "epoch": 1.5966796875, + "grad_norm": 1.1528503859568437, + "kl": 0.07568359375, + "learning_rate": 6.009521484374999e-07, + "loss": 0.003, + "reward": 1.6485916376113892, + "reward_std": 0.018121136352419853, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6485916376113892, + "step": 3269 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.5078125, + "epoch": 1.59716796875, + "grad_norm": 3.0563137616187652, + "kl": 0.0594482421875, + "learning_rate": 6.00830078125e-07, + "loss": 0.0024, + "reward": 1.7807026505470276, + "reward_std": 0.1419503539800644, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7807026207447052, + "step": 3270 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.140625, + "epoch": 1.59765625, + "grad_norm": 1.335457862432662, + "kl": 0.084228515625, + "learning_rate": 6.007080078125e-07, + "loss": 0.0034, + "reward": 1.797263503074646, + "reward_std": 0.06179828941822052, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.797263503074646, + "step": 3271 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.609375, + "epoch": 1.59814453125, + "grad_norm": 1.121524259737413, + "kl": 0.06689453125, + "learning_rate": 6.005859375e-07, + "loss": 0.0027, + "reward": 1.7773959040641785, + "reward_std": 0.05081337783485651, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7773959338665009, + "step": 3272 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.15625, + "epoch": 1.5986328125, + "grad_norm": 2.866249679866173, + "kl": 0.0645751953125, + "learning_rate": 6.004638671875e-07, + "loss": 0.0026, + "reward": 1.8175668716430664, + "reward_std": 0.11312521249055862, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8253794610500336, + "step": 3273 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.453125, + "epoch": 1.59912109375, + "grad_norm": 3.606065177774678, + "kl": 0.0640869140625, + "learning_rate": 6.00341796875e-07, + "loss": 0.0026, + "reward": 1.781424641609192, + "reward_std": 0.08653675019741058, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7814246118068695, + "step": 3274 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.3828125, + "epoch": 1.599609375, + "grad_norm": 3.5178633189395523, + "kl": 0.08203125, + "learning_rate": 6.002197265624999e-07, + "loss": 0.0033, + "reward": 1.7668121457099915, + "reward_std": 0.126564159989357, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7824371457099915, + "step": 3275 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.171875, + "epoch": 1.60009765625, + "grad_norm": 0.6996774519584336, + "kl": 0.0472412109375, + "learning_rate": 6.000976562499999e-07, + "loss": 0.0019, + "reward": 1.814025104045868, + "reward_std": 0.12107747420668602, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.8609001040458679, + "step": 3276 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.75, + "epoch": 1.6005859375, + "grad_norm": 0.8261108370026679, + "kl": 0.0513916015625, + "learning_rate": 5.999755859375e-07, + "loss": 0.0021, + "reward": 1.7932913899421692, + "reward_std": 0.032305057160556316, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7932913601398468, + "step": 3277 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.3203125, + "epoch": 1.60107421875, + "grad_norm": 2.058098522878682, + "kl": 0.0830078125, + "learning_rate": 5.99853515625e-07, + "loss": 0.0033, + "reward": 1.8758089542388916, + "reward_std": 0.0519051980227232, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8758089244365692, + "step": 3278 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.4765625, + "epoch": 1.6015625, + "grad_norm": 3.9373950604712626, + "kl": 0.0494384765625, + "learning_rate": 5.997314453125e-07, + "loss": 0.002, + "reward": 1.8489559888839722, + "reward_std": 0.04422624595463276, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8489560484886169, + "step": 3279 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.7265625, + "epoch": 1.60205078125, + "grad_norm": 7.884811127959207, + "kl": 0.0703125, + "learning_rate": 5.99609375e-07, + "loss": 0.0028, + "reward": 1.6899959444999695, + "reward_std": 0.10859640687704086, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6978083550930023, + "step": 3280 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.2265625, + "epoch": 1.6025390625, + "grad_norm": 2.7975461752500577, + "kl": 0.054443359375, + "learning_rate": 5.994873046875e-07, + "loss": 0.0022, + "reward": 1.6196198463439941, + "reward_std": 0.1712161898612976, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.6664949059486389, + "step": 3281 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.0234375, + "epoch": 1.60302734375, + "grad_norm": 0.9151595270896026, + "kl": 0.086181640625, + "learning_rate": 5.993652343749999e-07, + "loss": 0.0034, + "reward": 1.7856322526931763, + "reward_std": 0.02972456067800522, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7856322526931763, + "step": 3282 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.7109375, + "epoch": 1.603515625, + "grad_norm": 1.7965453423126865, + "kl": 0.097900390625, + "learning_rate": 5.992431640625e-07, + "loss": 0.0039, + "reward": 1.7602424621582031, + "reward_std": 0.13400599360466003, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7758674323558807, + "step": 3283 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.90625, + "epoch": 1.60400390625, + "grad_norm": 1.0639662963187586, + "kl": 0.0548095703125, + "learning_rate": 5.9912109375e-07, + "loss": 0.0022, + "reward": 1.8079357147216797, + "reward_std": 0.18928005546331406, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8313732445240021, + "step": 3284 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.953125, + "epoch": 1.6044921875, + "grad_norm": 1.2197720201061695, + "kl": 0.057373046875, + "learning_rate": 5.989990234375e-07, + "loss": 0.0023, + "reward": 1.821268081665039, + "reward_std": 0.02508594747632742, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8212681412696838, + "step": 3285 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.609375, + "epoch": 1.60498046875, + "grad_norm": 1.4231911127812176, + "kl": 0.0611572265625, + "learning_rate": 5.98876953125e-07, + "loss": 0.0024, + "reward": 1.851391077041626, + "reward_std": 0.12025601789355278, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8670159578323364, + "step": 3286 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.4296875, + "epoch": 1.60546875, + "grad_norm": 1.911311177993667, + "kl": 0.0947265625, + "learning_rate": 5.987548828125e-07, + "loss": 0.0038, + "reward": 1.7131580114364624, + "reward_std": 0.07230347953736782, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7131580114364624, + "step": 3287 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.015625, + "epoch": 1.60595703125, + "grad_norm": 0.8159968339623103, + "kl": 0.0516357421875, + "learning_rate": 5.986328124999999e-07, + "loss": 0.0021, + "reward": 1.7010605335235596, + "reward_std": 0.12904052436351776, + "rewards/format_reward": 0.953125, + "rewards/ocr_reward": 0.7479356527328491, + "step": 3288 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.125, + "epoch": 1.6064453125, + "grad_norm": 0.8081713539518741, + "kl": 0.072509765625, + "learning_rate": 5.985107421874999e-07, + "loss": 0.0029, + "reward": 1.8329209685325623, + "reward_std": 0.02778689656406641, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8329209983348846, + "step": 3289 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.796875, + "epoch": 1.60693359375, + "grad_norm": 1.5031271056588227, + "kl": 0.072509765625, + "learning_rate": 5.98388671875e-07, + "loss": 0.0029, + "reward": 1.794043481349945, + "reward_std": 0.05896776542067528, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7940434813499451, + "step": 3290 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.7890625, + "epoch": 1.607421875, + "grad_norm": 1.3872359065707587, + "kl": 0.07470703125, + "learning_rate": 5.982666015625e-07, + "loss": 0.003, + "reward": 1.6995200514793396, + "reward_std": 0.08452805131673813, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7073325216770172, + "step": 3291 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.8046875, + "epoch": 1.60791015625, + "grad_norm": 0.8648030111794723, + "kl": 0.078857421875, + "learning_rate": 5.9814453125e-07, + "loss": 0.0032, + "reward": 1.7610323429107666, + "reward_std": 0.05742851458489895, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7688448131084442, + "step": 3292 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.671875, + "epoch": 1.6083984375, + "grad_norm": 6.153849152464751, + "kl": 0.082763671875, + "learning_rate": 5.980224609375e-07, + "loss": 0.0033, + "reward": 1.7795958518981934, + "reward_std": 0.07071587443351746, + "rewards/format_reward": 0.9453125, + "rewards/ocr_reward": 0.8342833817005157, + "step": 3293 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.34375, + "epoch": 1.60888671875, + "grad_norm": 1.0323918030975063, + "kl": 0.064453125, + "learning_rate": 5.97900390625e-07, + "loss": 0.0026, + "reward": 1.8604564666748047, + "reward_std": 0.04785814322531223, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8604564666748047, + "step": 3294 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.6015625, + "epoch": 1.609375, + "grad_norm": 1.4849996417871474, + "kl": 0.074462890625, + "learning_rate": 5.977783203124999e-07, + "loss": 0.003, + "reward": 1.703747808933258, + "reward_std": 0.08227039128541946, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7037478089332581, + "step": 3295 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.125, + "epoch": 1.60986328125, + "grad_norm": 2.1214209431752122, + "kl": 0.06298828125, + "learning_rate": 5.9765625e-07, + "loss": 0.0025, + "reward": 1.794127881526947, + "reward_std": 0.07666090503334999, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7941278219223022, + "step": 3296 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.859375, + "epoch": 1.6103515625, + "grad_norm": 1.6164264023490769, + "kl": 0.086181640625, + "learning_rate": 5.975341796875e-07, + "loss": 0.0035, + "reward": 1.6789074540138245, + "reward_std": 0.033364531584084034, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6789074838161469, + "step": 3297 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.9453125, + "epoch": 1.61083984375, + "grad_norm": 1.8073919332154889, + "kl": 0.076171875, + "learning_rate": 5.97412109375e-07, + "loss": 0.003, + "reward": 1.6800431609153748, + "reward_std": 0.06180498003959656, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6800430715084076, + "step": 3298 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.15625, + "epoch": 1.611328125, + "grad_norm": 1.9650430682434774, + "kl": 0.0849609375, + "learning_rate": 5.972900390625e-07, + "loss": 0.0034, + "reward": 1.720855951309204, + "reward_std": 0.08976828306913376, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7442933619022369, + "step": 3299 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.953125, + "epoch": 1.61181640625, + "grad_norm": 1.5026385468711767, + "kl": 0.07373046875, + "learning_rate": 5.9716796875e-07, + "loss": 0.0029, + "reward": 1.7150686383247375, + "reward_std": 0.10774907097220421, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7228811085224152, + "step": 3300 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.9609375, + "epoch": 1.6123046875, + "grad_norm": 3.9327500342779778, + "kl": 0.08203125, + "learning_rate": 5.970458984374999e-07, + "loss": 0.0033, + "reward": 1.7554203271865845, + "reward_std": 0.09172924142330885, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7554203867912292, + "step": 3301 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.0546875, + "epoch": 1.61279296875, + "grad_norm": 2.715132927951165, + "kl": 0.083251953125, + "learning_rate": 5.969238281249999e-07, + "loss": 0.0033, + "reward": 1.752245843410492, + "reward_std": 0.0424564378336072, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7522459030151367, + "step": 3302 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.9375, + "epoch": 1.61328125, + "grad_norm": 0.9299085475576804, + "kl": 0.067626953125, + "learning_rate": 5.968017578125e-07, + "loss": 0.0027, + "reward": 1.815238118171692, + "reward_std": 0.04331210441887379, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8152380585670471, + "step": 3303 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.59375, + "epoch": 1.61376953125, + "grad_norm": 1.4842707399360437, + "kl": 0.07080078125, + "learning_rate": 5.966796875e-07, + "loss": 0.0028, + "reward": 1.6607686877250671, + "reward_std": 0.0442405054345727, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6607686877250671, + "step": 3304 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.65625, + "epoch": 1.6142578125, + "grad_norm": 2.0590875753569335, + "kl": 0.064697265625, + "learning_rate": 5.965576171875e-07, + "loss": 0.0026, + "reward": 1.8585594296455383, + "reward_std": 0.05867746938019991, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8585593700408936, + "step": 3305 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.8125, + "epoch": 1.61474609375, + "grad_norm": 1.6223776518735307, + "kl": 0.089599609375, + "learning_rate": 5.96435546875e-07, + "loss": 0.0036, + "reward": 1.7585085034370422, + "reward_std": 0.055630091577768326, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.758508563041687, + "step": 3306 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.3515625, + "epoch": 1.615234375, + "grad_norm": 6.29346114659625, + "kl": 0.0816650390625, + "learning_rate": 5.963134765625e-07, + "loss": 0.0033, + "reward": 1.6852021217346191, + "reward_std": 0.030728538520634174, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6852021813392639, + "step": 3307 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.640625, + "epoch": 1.61572265625, + "grad_norm": 57.61803616396629, + "kl": 0.114501953125, + "learning_rate": 5.961914062499999e-07, + "loss": 0.0046, + "reward": 1.651352047920227, + "reward_std": 0.05351191433146596, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6513520181179047, + "step": 3308 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.0, + "epoch": 1.6162109375, + "grad_norm": 1.3324160676364192, + "kl": 0.0623779296875, + "learning_rate": 5.960693359375e-07, + "loss": 0.0025, + "reward": 1.818449318408966, + "reward_std": 0.07615053281188011, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8262618482112885, + "step": 3309 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.6015625, + "epoch": 1.61669921875, + "grad_norm": 1.5524145516661523, + "kl": 0.0604248046875, + "learning_rate": 5.95947265625e-07, + "loss": 0.0024, + "reward": 1.7558764815330505, + "reward_std": 0.08649563789367676, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7558764517307281, + "step": 3310 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.9296875, + "epoch": 1.6171875, + "grad_norm": 2.2022666625672174, + "kl": 0.077880859375, + "learning_rate": 5.958251953125e-07, + "loss": 0.0031, + "reward": 1.9051913619041443, + "reward_std": 0.057089509442448616, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9051913619041443, + "step": 3311 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.3125, + "epoch": 1.61767578125, + "grad_norm": 2.122944616604469, + "kl": 0.087646484375, + "learning_rate": 5.95703125e-07, + "loss": 0.0035, + "reward": 1.7621399760246277, + "reward_std": 0.07783204689621925, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7621399760246277, + "step": 3312 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.375, + "epoch": 1.6181640625, + "grad_norm": 0.6451018067962863, + "kl": 0.092041015625, + "learning_rate": 5.955810546875e-07, + "loss": 0.0037, + "reward": 1.8151302337646484, + "reward_std": 0.17584221065044403, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8385677337646484, + "step": 3313 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.3125, + "epoch": 1.61865234375, + "grad_norm": 2.9643724571250902, + "kl": 0.072021484375, + "learning_rate": 5.954589843749999e-07, + "loss": 0.0029, + "reward": 1.8480368256568909, + "reward_std": 0.04931685887277126, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8480368256568909, + "step": 3314 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.6171875, + "epoch": 1.619140625, + "grad_norm": 14.73302874665288, + "kl": 0.175048828125, + "learning_rate": 5.953369140624999e-07, + "loss": 0.007, + "reward": 1.781773030757904, + "reward_std": 0.14042264595627785, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.805210530757904, + "step": 3315 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.96875, + "epoch": 1.61962890625, + "grad_norm": 0.8900575904775362, + "kl": 0.075927734375, + "learning_rate": 5.9521484375e-07, + "loss": 0.003, + "reward": 1.8554713726043701, + "reward_std": 0.06886312644928694, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8632838726043701, + "step": 3316 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.515625, + "epoch": 1.6201171875, + "grad_norm": 2.466288409101878, + "kl": 0.077880859375, + "learning_rate": 5.950927734375e-07, + "loss": 0.0031, + "reward": 1.7723018527030945, + "reward_std": 0.02210051123984158, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7723018527030945, + "step": 3317 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.59375, + "epoch": 1.62060546875, + "grad_norm": 2.5984727262943124, + "kl": 0.074462890625, + "learning_rate": 5.94970703125e-07, + "loss": 0.003, + "reward": 1.7868224382400513, + "reward_std": 0.05945824505761266, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7868223786354065, + "step": 3318 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.5390625, + "epoch": 1.62109375, + "grad_norm": 1.4154199137348544, + "kl": 0.0694580078125, + "learning_rate": 5.948486328125e-07, + "loss": 0.0028, + "reward": 1.7688942551612854, + "reward_std": 0.13842950016260147, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.8079566955566406, + "step": 3319 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.0078125, + "epoch": 1.62158203125, + "grad_norm": 0.6836739446073203, + "kl": 0.085205078125, + "learning_rate": 5.947265625e-07, + "loss": 0.0034, + "reward": 1.7379599213600159, + "reward_std": 0.05289880000054836, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7379598617553711, + "step": 3320 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.953125, + "epoch": 1.6220703125, + "grad_norm": 1.1625041958754734, + "kl": 0.08544921875, + "learning_rate": 5.946044921874999e-07, + "loss": 0.0034, + "reward": 1.901548981666565, + "reward_std": 0.04509174823760986, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9015489816665649, + "step": 3321 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.4453125, + "epoch": 1.62255859375, + "grad_norm": 2.346547215431855, + "kl": 0.12109375, + "learning_rate": 5.94482421875e-07, + "loss": 0.0049, + "reward": 1.7190340757369995, + "reward_std": 0.10668664053082466, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7268466651439667, + "step": 3322 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.4296875, + "epoch": 1.623046875, + "grad_norm": 1.7067499710514877, + "kl": 0.0728759765625, + "learning_rate": 5.943603515625e-07, + "loss": 0.0029, + "reward": 1.7981135249137878, + "reward_std": 0.08968368917703629, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7981135547161102, + "step": 3323 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.3046875, + "epoch": 1.62353515625, + "grad_norm": 0.9846710994291399, + "kl": 0.071533203125, + "learning_rate": 5.9423828125e-07, + "loss": 0.0029, + "reward": 1.8831993341445923, + "reward_std": 0.029867228120565414, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8831993639469147, + "step": 3324 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.5, + "epoch": 1.6240234375, + "grad_norm": 2.922443835097143, + "kl": 0.08642578125, + "learning_rate": 5.941162109375e-07, + "loss": 0.0035, + "reward": 1.7861003875732422, + "reward_std": 0.06693215668201447, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7861004769802094, + "step": 3325 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.140625, + "epoch": 1.62451171875, + "grad_norm": 0.6981453405142553, + "kl": 0.0657958984375, + "learning_rate": 5.93994140625e-07, + "loss": 0.0026, + "reward": 1.9097455143928528, + "reward_std": 0.02694264892488718, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9097454845905304, + "step": 3326 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.75, + "epoch": 1.625, + "grad_norm": 1.7535640438867004, + "kl": 0.076904296875, + "learning_rate": 5.938720703124999e-07, + "loss": 0.0031, + "reward": 1.8061844110488892, + "reward_std": 0.04717784374952316, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8061844110488892, + "step": 3327 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.1015625, + "epoch": 1.62548828125, + "grad_norm": 1.2336422112092555, + "kl": 0.064697265625, + "learning_rate": 5.937499999999999e-07, + "loss": 0.0026, + "reward": 1.7810336351394653, + "reward_std": 0.07749061286449432, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7888461053371429, + "step": 3328 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.09375, + "epoch": 1.6259765625, + "grad_norm": 3.487351497648713, + "kl": 0.06494140625, + "learning_rate": 5.936279296875e-07, + "loss": 0.0026, + "reward": 1.6700169444084167, + "reward_std": 0.17180902510881424, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.693454384803772, + "step": 3329 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.1875, + "epoch": 1.62646484375, + "grad_norm": 13.359609968705223, + "kl": 0.08935546875, + "learning_rate": 5.93505859375e-07, + "loss": 0.0036, + "reward": 1.6575063467025757, + "reward_std": 0.055701796896755695, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6575063467025757, + "step": 3330 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.3984375, + "epoch": 1.626953125, + "grad_norm": 0.6032534098055211, + "kl": 0.06396484375, + "learning_rate": 5.933837890625e-07, + "loss": 0.0026, + "reward": 1.8520901799201965, + "reward_std": 0.07921074330806732, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8599026799201965, + "step": 3331 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.078125, + "epoch": 1.62744140625, + "grad_norm": 1.2697462288963357, + "kl": 0.08154296875, + "learning_rate": 5.9326171875e-07, + "loss": 0.0033, + "reward": 1.7715474963188171, + "reward_std": 0.06629283353686333, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7715475261211395, + "step": 3332 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.8515625, + "epoch": 1.6279296875, + "grad_norm": 2.641032389095305, + "kl": 0.0672607421875, + "learning_rate": 5.931396484375e-07, + "loss": 0.0027, + "reward": 1.69329833984375, + "reward_std": 0.06569128856062889, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6932983696460724, + "step": 3333 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.78125, + "epoch": 1.62841796875, + "grad_norm": 2.5623974388990454, + "kl": 0.0908203125, + "learning_rate": 5.930175781249999e-07, + "loss": 0.0036, + "reward": 1.8435781002044678, + "reward_std": 0.0874359430745244, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8435779809951782, + "step": 3334 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.828125, + "epoch": 1.62890625, + "grad_norm": 1.5883862067873453, + "kl": 0.088623046875, + "learning_rate": 5.928955078125e-07, + "loss": 0.0035, + "reward": 1.7597174644470215, + "reward_std": 0.08109994605183601, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7675299346446991, + "step": 3335 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.65625, + "epoch": 1.62939453125, + "grad_norm": 1.0616997874872647, + "kl": 0.06640625, + "learning_rate": 5.927734375e-07, + "loss": 0.0027, + "reward": 1.7667133212089539, + "reward_std": 0.1313837133347988, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7901508212089539, + "step": 3336 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.1875, + "epoch": 1.6298828125, + "grad_norm": 2.9226107751812354, + "kl": 0.1103515625, + "learning_rate": 5.926513671875e-07, + "loss": 0.0044, + "reward": 1.6865645051002502, + "reward_std": 0.06128368899226189, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6865644752979279, + "step": 3337 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.9765625, + "epoch": 1.63037109375, + "grad_norm": 1.4254524637894548, + "kl": 0.0645751953125, + "learning_rate": 5.92529296875e-07, + "loss": 0.0026, + "reward": 1.7799670696258545, + "reward_std": 0.02988600544631481, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7799670994281769, + "step": 3338 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.34375, + "epoch": 1.630859375, + "grad_norm": 0.9006275035038049, + "kl": 0.0548095703125, + "learning_rate": 5.924072265625e-07, + "loss": 0.0022, + "reward": 1.8364945650100708, + "reward_std": 0.03155016852542758, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8364945650100708, + "step": 3339 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.203125, + "epoch": 1.63134765625, + "grad_norm": 1.9607130935646655, + "kl": 0.080322265625, + "learning_rate": 5.922851562499999e-07, + "loss": 0.0032, + "reward": 1.77052640914917, + "reward_std": 0.06949007511138916, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7705264091491699, + "step": 3340 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.375, + "epoch": 1.6318359375, + "grad_norm": 4.07493900518628, + "kl": 0.076416015625, + "learning_rate": 5.921630859374999e-07, + "loss": 0.003, + "reward": 1.8116753101348877, + "reward_std": 0.11965424194931984, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8116753697395325, + "step": 3341 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.09375, + "epoch": 1.63232421875, + "grad_norm": 3.3284633422339027, + "kl": 0.06884765625, + "learning_rate": 5.92041015625e-07, + "loss": 0.0028, + "reward": 1.7393649220466614, + "reward_std": 0.11131243035197258, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7628024816513062, + "step": 3342 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.328125, + "epoch": 1.6328125, + "grad_norm": 1.6845731379939248, + "kl": 0.076171875, + "learning_rate": 5.919189453125e-07, + "loss": 0.003, + "reward": 1.7558993101119995, + "reward_std": 0.03900916501879692, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7558992505073547, + "step": 3343 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.03125, + "epoch": 1.63330078125, + "grad_norm": 1.3128493226455236, + "kl": 0.06005859375, + "learning_rate": 5.91796875e-07, + "loss": 0.0024, + "reward": 1.6177734732627869, + "reward_std": 0.0996141117066145, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6490235030651093, + "step": 3344 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.984375, + "epoch": 1.6337890625, + "grad_norm": 1.7117136465741267, + "kl": 0.086669921875, + "learning_rate": 5.916748046875e-07, + "loss": 0.0035, + "reward": 1.555152177810669, + "reward_std": 0.10387120954692364, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.555152177810669, + "step": 3345 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.2265625, + "epoch": 1.63427734375, + "grad_norm": 1.4566254667516192, + "kl": 0.072021484375, + "learning_rate": 5.91552734375e-07, + "loss": 0.0029, + "reward": 1.7571306228637695, + "reward_std": 0.05150624364614487, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7571305930614471, + "step": 3346 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.4140625, + "epoch": 1.634765625, + "grad_norm": 2.543802498479339, + "kl": 0.082275390625, + "learning_rate": 5.914306640624999e-07, + "loss": 0.0033, + "reward": 1.732638418674469, + "reward_std": 0.11016843095421791, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7404508292675018, + "step": 3347 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.0078125, + "epoch": 1.63525390625, + "grad_norm": 7.467359428277764, + "kl": 0.0714111328125, + "learning_rate": 5.913085937499999e-07, + "loss": 0.0029, + "reward": 1.9553462266921997, + "reward_std": 0.07758795842528343, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9553462266921997, + "step": 3348 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.03125, + "epoch": 1.6357421875, + "grad_norm": 1.1941702843950022, + "kl": 0.058837890625, + "learning_rate": 5.911865234375e-07, + "loss": 0.0024, + "reward": 1.6709920763969421, + "reward_std": 0.05428230203688145, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6709920763969421, + "step": 3349 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.3125, + "epoch": 1.63623046875, + "grad_norm": 1.2102935214629125, + "kl": 0.0623779296875, + "learning_rate": 5.91064453125e-07, + "loss": 0.0025, + "reward": 1.8021827936172485, + "reward_std": 0.03311594016849995, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8021828532218933, + "step": 3350 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.0546875, + "epoch": 1.63671875, + "grad_norm": 9.421608837634526, + "kl": 0.08251953125, + "learning_rate": 5.909423828125e-07, + "loss": 0.0033, + "reward": 1.809500515460968, + "reward_std": 0.09658823721110821, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.809500515460968, + "step": 3351 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.3828125, + "epoch": 1.63720703125, + "grad_norm": 2.2569011847158373, + "kl": 0.0986328125, + "learning_rate": 5.908203125e-07, + "loss": 0.0039, + "reward": 1.721911609172821, + "reward_std": 0.039531731978058815, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7219116389751434, + "step": 3352 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.0078125, + "epoch": 1.6376953125, + "grad_norm": 4.408187685160172, + "kl": 0.0643310546875, + "learning_rate": 5.906982421874999e-07, + "loss": 0.0026, + "reward": 1.8004092574119568, + "reward_std": 0.10856766253709793, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.823846697807312, + "step": 3353 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.71875, + "epoch": 1.63818359375, + "grad_norm": 1.482450859345647, + "kl": 0.0640869140625, + "learning_rate": 5.905761718749999e-07, + "loss": 0.0026, + "reward": 1.8073206543922424, + "reward_std": 0.09571165032684803, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8073206543922424, + "step": 3354 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.2890625, + "epoch": 1.638671875, + "grad_norm": 1.1455791471480024, + "kl": 0.0482177734375, + "learning_rate": 5.904541015625e-07, + "loss": 0.0019, + "reward": 1.8967827558517456, + "reward_std": 0.06931715365499258, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.9045952558517456, + "step": 3355 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.8359375, + "epoch": 1.63916015625, + "grad_norm": 2.129032656060852, + "kl": 0.068359375, + "learning_rate": 5.9033203125e-07, + "loss": 0.0027, + "reward": 1.7991633415222168, + "reward_std": 0.09995237179100513, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.814788281917572, + "step": 3356 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.140625, + "epoch": 1.6396484375, + "grad_norm": 2.146074812890639, + "kl": 0.060302734375, + "learning_rate": 5.902099609375e-07, + "loss": 0.0024, + "reward": 1.8512172102928162, + "reward_std": 0.03298699017614126, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8512172102928162, + "step": 3357 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.2578125, + "epoch": 1.64013671875, + "grad_norm": 3.1698472003026805, + "kl": 0.124755859375, + "learning_rate": 5.90087890625e-07, + "loss": 0.005, + "reward": 1.7051687836647034, + "reward_std": 0.05394227243959904, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7051687836647034, + "step": 3358 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.15625, + "epoch": 1.640625, + "grad_norm": 1.3803953318171671, + "kl": 0.0633544921875, + "learning_rate": 5.899658203125e-07, + "loss": 0.0025, + "reward": 1.8232309818267822, + "reward_std": 0.1542208231985569, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8466684818267822, + "step": 3359 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.1171875, + "epoch": 1.64111328125, + "grad_norm": 1.7556288084155496, + "kl": 0.084228515625, + "learning_rate": 5.898437499999999e-07, + "loss": 0.0034, + "reward": 1.815483808517456, + "reward_std": 0.07215754687786102, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.823296308517456, + "step": 3360 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.1796875, + "epoch": 1.6416015625, + "grad_norm": 0.8320346859160097, + "kl": 0.074462890625, + "learning_rate": 5.897216796874999e-07, + "loss": 0.003, + "reward": 1.7806763648986816, + "reward_std": 0.0820821225643158, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.788488894701004, + "step": 3361 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.1796875, + "epoch": 1.64208984375, + "grad_norm": 1.6450666905858577, + "kl": 0.094482421875, + "learning_rate": 5.89599609375e-07, + "loss": 0.0038, + "reward": 1.786492109298706, + "reward_std": 0.05385753884911537, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7864921987056732, + "step": 3362 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.4921875, + "epoch": 1.642578125, + "grad_norm": 1.0348454721040237, + "kl": 0.08251953125, + "learning_rate": 5.894775390625e-07, + "loss": 0.0033, + "reward": 1.7476333379745483, + "reward_std": 0.061420466750860214, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.755445808172226, + "step": 3363 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.7578125, + "epoch": 1.64306640625, + "grad_norm": 1.2261159636339791, + "kl": 0.0654296875, + "learning_rate": 5.8935546875e-07, + "loss": 0.0026, + "reward": 1.8361621499061584, + "reward_std": 0.11280067265033722, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8517871201038361, + "step": 3364 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.296875, + "epoch": 1.6435546875, + "grad_norm": 1.7651610785496405, + "kl": 0.06787109375, + "learning_rate": 5.892333984375e-07, + "loss": 0.0027, + "reward": 1.8495672345161438, + "reward_std": 0.08414103463292122, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8495671451091766, + "step": 3365 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.8359375, + "epoch": 1.64404296875, + "grad_norm": 0.7490996025535948, + "kl": 0.074951171875, + "learning_rate": 5.891113281249999e-07, + "loss": 0.003, + "reward": 1.8907862901687622, + "reward_std": 0.01694483682513237, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.890786349773407, + "step": 3366 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.5546875, + "epoch": 1.64453125, + "grad_norm": 0.7273295456840305, + "kl": 0.08544921875, + "learning_rate": 5.889892578124999e-07, + "loss": 0.0034, + "reward": 1.7350443005561829, + "reward_std": 0.048879725858569145, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7350443005561829, + "step": 3367 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.21875, + "epoch": 1.64501953125, + "grad_norm": 7.736603835381099, + "kl": 0.095947265625, + "learning_rate": 5.888671875e-07, + "loss": 0.0038, + "reward": 1.761667251586914, + "reward_std": 0.04048959631472826, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7616672217845917, + "step": 3368 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.4609375, + "epoch": 1.6455078125, + "grad_norm": 1.4724329943645882, + "kl": 0.0556640625, + "learning_rate": 5.887451171875e-07, + "loss": 0.0022, + "reward": 1.8380178213119507, + "reward_std": 0.04197421669960022, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8380178809165955, + "step": 3369 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.5625, + "epoch": 1.64599609375, + "grad_norm": 1.4415743842970918, + "kl": 0.07470703125, + "learning_rate": 5.88623046875e-07, + "loss": 0.003, + "reward": 1.811439573764801, + "reward_std": 0.06903266906738281, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8114396035671234, + "step": 3370 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.71875, + "epoch": 1.646484375, + "grad_norm": 1.5866478649672215, + "kl": 0.072265625, + "learning_rate": 5.885009765625e-07, + "loss": 0.0029, + "reward": 1.8194407224655151, + "reward_std": 0.10099057853221893, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8194407522678375, + "step": 3371 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.9765625, + "epoch": 1.64697265625, + "grad_norm": 2.5414429477301455, + "kl": 0.064697265625, + "learning_rate": 5.8837890625e-07, + "loss": 0.0026, + "reward": 1.785763442516327, + "reward_std": 0.029020313173532486, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7857634425163269, + "step": 3372 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.46875, + "epoch": 1.6474609375, + "grad_norm": 1.1864726934683525, + "kl": 0.08349609375, + "learning_rate": 5.882568359374999e-07, + "loss": 0.0033, + "reward": 1.7326732277870178, + "reward_std": 0.045171596109867096, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7326732873916626, + "step": 3373 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.390625, + "epoch": 1.64794921875, + "grad_norm": 1.1727755992557876, + "kl": 0.0673828125, + "learning_rate": 5.881347656249999e-07, + "loss": 0.0027, + "reward": 1.9291696548461914, + "reward_std": 0.05488063208758831, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.929169625043869, + "step": 3374 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.8203125, + "epoch": 1.6484375, + "grad_norm": 1.188974789867218, + "kl": 0.07373046875, + "learning_rate": 5.880126953125e-07, + "loss": 0.003, + "reward": 1.7475911974906921, + "reward_std": 0.03924562409520149, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7475912272930145, + "step": 3375 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.7109375, + "epoch": 1.64892578125, + "grad_norm": 2.8855723689123254, + "kl": 0.069580078125, + "learning_rate": 5.87890625e-07, + "loss": 0.0028, + "reward": 1.8286888599395752, + "reward_std": 0.05414394848048687, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8286888897418976, + "step": 3376 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.1171875, + "epoch": 1.6494140625, + "grad_norm": 1.348637874083575, + "kl": 0.0787353515625, + "learning_rate": 5.877685546875e-07, + "loss": 0.0032, + "reward": 1.7663710117340088, + "reward_std": 0.03424928430467844, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7663710117340088, + "step": 3377 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.328125, + "epoch": 1.64990234375, + "grad_norm": 1.9070692767765587, + "kl": 0.0706787109375, + "learning_rate": 5.87646484375e-07, + "loss": 0.0028, + "reward": 1.792538821697235, + "reward_std": 0.06923755258321762, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7925387620925903, + "step": 3378 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.46875, + "epoch": 1.650390625, + "grad_norm": 1.65866315067269, + "kl": 0.08740234375, + "learning_rate": 5.875244140625e-07, + "loss": 0.0035, + "reward": 1.7584347128868103, + "reward_std": 0.0411848658695817, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7584347426891327, + "step": 3379 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.25, + "epoch": 1.65087890625, + "grad_norm": 2.196175009298372, + "kl": 0.08349609375, + "learning_rate": 5.874023437499999e-07, + "loss": 0.0033, + "reward": 1.621177852153778, + "reward_std": 0.11486036516726017, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6289903223514557, + "step": 3380 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.328125, + "epoch": 1.6513671875, + "grad_norm": 1.6329994027998105, + "kl": 0.088134765625, + "learning_rate": 5.872802734375e-07, + "loss": 0.0035, + "reward": 1.76973557472229, + "reward_std": 0.053606728091835976, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7697354555130005, + "step": 3381 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.5859375, + "epoch": 1.65185546875, + "grad_norm": 1.2486372473820984, + "kl": 0.075927734375, + "learning_rate": 5.87158203125e-07, + "loss": 0.003, + "reward": 1.9062875509262085, + "reward_std": 0.02893537748605013, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9062875807285309, + "step": 3382 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.53125, + "epoch": 1.65234375, + "grad_norm": 1.5306926158806675, + "kl": 0.081787109375, + "learning_rate": 5.870361328125e-07, + "loss": 0.0033, + "reward": 1.7200778126716614, + "reward_std": 0.053463514894247055, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.720077782869339, + "step": 3383 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.15625, + "epoch": 1.65283203125, + "grad_norm": 1.9605699591720807, + "kl": 0.086181640625, + "learning_rate": 5.869140625e-07, + "loss": 0.0034, + "reward": 1.7785995602607727, + "reward_std": 0.05440284963697195, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7785995900630951, + "step": 3384 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.3671875, + "epoch": 1.6533203125, + "grad_norm": 1.1186012819207778, + "kl": 0.08154296875, + "learning_rate": 5.867919921875e-07, + "loss": 0.0033, + "reward": 1.7977607250213623, + "reward_std": 0.04878430813550949, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7977607250213623, + "step": 3385 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.8125, + "epoch": 1.65380859375, + "grad_norm": 1.4994976766626942, + "kl": 0.0703125, + "learning_rate": 5.866699218749999e-07, + "loss": 0.0028, + "reward": 1.7266179919242859, + "reward_std": 0.044663604348897934, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7266179323196411, + "step": 3386 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.7421875, + "epoch": 1.654296875, + "grad_norm": 2.908450555481079, + "kl": 0.093017578125, + "learning_rate": 5.865478515624999e-07, + "loss": 0.0037, + "reward": 1.7329715490341187, + "reward_std": 0.1744391992688179, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7485965490341187, + "step": 3387 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.203125, + "epoch": 1.65478515625, + "grad_norm": 1.221054748601646, + "kl": 0.102294921875, + "learning_rate": 5.8642578125e-07, + "loss": 0.0041, + "reward": 1.7703983783721924, + "reward_std": 0.06643011048436165, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7703984379768372, + "step": 3388 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.8671875, + "epoch": 1.6552734375, + "grad_norm": 6.847886742373634, + "kl": 0.093017578125, + "learning_rate": 5.863037109375e-07, + "loss": 0.0037, + "reward": 1.8793238401412964, + "reward_std": 0.09725763648748398, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8793238401412964, + "step": 3389 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.265625, + "epoch": 1.65576171875, + "grad_norm": 1.6334940052830407, + "kl": 0.07373046875, + "learning_rate": 5.86181640625e-07, + "loss": 0.0029, + "reward": 1.754858374595642, + "reward_std": 0.13046734035015106, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7626709043979645, + "step": 3390 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.203125, + "epoch": 1.65625, + "grad_norm": 2.4516828308421816, + "kl": 0.0673828125, + "learning_rate": 5.860595703125e-07, + "loss": 0.0027, + "reward": 1.6885485649108887, + "reward_std": 0.12255653738975525, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6963610649108887, + "step": 3391 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.9453125, + "epoch": 1.65673828125, + "grad_norm": 1.0973530080385832, + "kl": 0.06591796875, + "learning_rate": 5.859375e-07, + "loss": 0.0026, + "reward": 1.7706849575042725, + "reward_std": 0.08537603169679642, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.8097475171089172, + "step": 3392 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.03125, + "epoch": 1.6572265625, + "grad_norm": 1.3434826786542489, + "kl": 0.08056640625, + "learning_rate": 5.858154296874999e-07, + "loss": 0.0032, + "reward": 1.769907832145691, + "reward_std": 0.02592490427196026, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7699077725410461, + "step": 3393 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.578125, + "epoch": 1.65771484375, + "grad_norm": 1.8016472051718078, + "kl": 0.091552734375, + "learning_rate": 5.85693359375e-07, + "loss": 0.0037, + "reward": 1.7779169082641602, + "reward_std": 0.08736255019903183, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7779169082641602, + "step": 3394 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.875, + "epoch": 1.658203125, + "grad_norm": 2.5785595078259833, + "kl": 0.080322265625, + "learning_rate": 5.855712890625e-07, + "loss": 0.0032, + "reward": 1.7090917825698853, + "reward_std": 0.027815474197268486, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7090917825698853, + "step": 3395 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.3828125, + "epoch": 1.65869140625, + "grad_norm": 1.007748533466306, + "kl": 0.078369140625, + "learning_rate": 5.8544921875e-07, + "loss": 0.0031, + "reward": 1.722628891468048, + "reward_std": 0.026624855119735003, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7226289510726929, + "step": 3396 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.3359375, + "epoch": 1.6591796875, + "grad_norm": 0.6947502588037848, + "kl": 0.06640625, + "learning_rate": 5.853271484375e-07, + "loss": 0.0027, + "reward": 1.8273064494132996, + "reward_std": 0.02322842739522457, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8273063898086548, + "step": 3397 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.484375, + "epoch": 1.65966796875, + "grad_norm": 1.357728234815542, + "kl": 0.0673828125, + "learning_rate": 5.85205078125e-07, + "loss": 0.0027, + "reward": 1.777342975139618, + "reward_std": 0.0818490230012685, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7929678857326508, + "step": 3398 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.1640625, + "epoch": 1.66015625, + "grad_norm": 1.5073163826811782, + "kl": 0.088623046875, + "learning_rate": 5.850830078124999e-07, + "loss": 0.0035, + "reward": 1.8505135774612427, + "reward_std": 0.036812907084822655, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8505135774612427, + "step": 3399 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.6640625, + "epoch": 1.66064453125, + "grad_norm": 0.5187163531397792, + "kl": 0.05615234375, + "learning_rate": 5.849609374999999e-07, + "loss": 0.0022, + "reward": 1.7177002429962158, + "reward_std": 0.039677318185567856, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7177002131938934, + "step": 3400 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.9765625, + "epoch": 1.6611328125, + "grad_norm": 0.7388474136721533, + "kl": 0.0703125, + "learning_rate": 5.848388671875e-07, + "loss": 0.0028, + "reward": 1.793430507183075, + "reward_std": 0.12719424441456795, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.816868007183075, + "step": 3401 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.7578125, + "epoch": 1.66162109375, + "grad_norm": 0.9275778304188466, + "kl": 0.0556640625, + "learning_rate": 5.84716796875e-07, + "loss": 0.0022, + "reward": 1.8390734195709229, + "reward_std": 0.07613059133291245, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8468858897686005, + "step": 3402 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.3359375, + "epoch": 1.662109375, + "grad_norm": 1.1797462013382214, + "kl": 0.0712890625, + "learning_rate": 5.845947265625e-07, + "loss": 0.0029, + "reward": 1.7845313549041748, + "reward_std": 0.056141434237360954, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.78453129529953, + "step": 3403 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.3046875, + "epoch": 1.66259765625, + "grad_norm": 1.3006085053369527, + "kl": 0.074951171875, + "learning_rate": 5.8447265625e-07, + "loss": 0.003, + "reward": 1.7825297117233276, + "reward_std": 0.07007915712893009, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7903422117233276, + "step": 3404 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.7734375, + "epoch": 1.6630859375, + "grad_norm": 1.0770527330394377, + "kl": 0.0595703125, + "learning_rate": 5.843505859375e-07, + "loss": 0.0024, + "reward": 1.9472978711128235, + "reward_std": 0.053828125819563866, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9472978115081787, + "step": 3405 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.84375, + "epoch": 1.66357421875, + "grad_norm": 0.940289877722008, + "kl": 0.0732421875, + "learning_rate": 5.842285156249999e-07, + "loss": 0.0029, + "reward": 1.5709947347640991, + "reward_std": 0.10499111982062459, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.5944323241710663, + "step": 3406 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.3203125, + "epoch": 1.6640625, + "grad_norm": 1.5937163992276968, + "kl": 0.0614013671875, + "learning_rate": 5.841064453125e-07, + "loss": 0.0025, + "reward": 1.8672499656677246, + "reward_std": 0.04773255158215761, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8672499656677246, + "step": 3407 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.6875, + "epoch": 1.66455078125, + "grad_norm": 1.0772380721292933, + "kl": 0.063232421875, + "learning_rate": 5.83984375e-07, + "loss": 0.0025, + "reward": 1.7862460613250732, + "reward_std": 0.03994133323431015, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7862460613250732, + "step": 3408 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3515625, + "epoch": 1.6650390625, + "grad_norm": 2.3302586200745905, + "kl": 0.072021484375, + "learning_rate": 5.838623046875e-07, + "loss": 0.0029, + "reward": 1.8306609988212585, + "reward_std": 0.06606091558933258, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8306609988212585, + "step": 3409 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.609375, + "epoch": 1.66552734375, + "grad_norm": 1.5068422166925504, + "kl": 0.087158203125, + "learning_rate": 5.83740234375e-07, + "loss": 0.0035, + "reward": 1.7843865156173706, + "reward_std": 0.03198308777064085, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.784386545419693, + "step": 3410 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.96875, + "epoch": 1.666015625, + "grad_norm": 3.0485685011397536, + "kl": 0.06689453125, + "learning_rate": 5.836181640625e-07, + "loss": 0.0027, + "reward": 1.8677841424942017, + "reward_std": 0.04165232554078102, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8677841424942017, + "step": 3411 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.03125, + "epoch": 1.66650390625, + "grad_norm": 1.5582087332286472, + "kl": 0.0908203125, + "learning_rate": 5.834960937499999e-07, + "loss": 0.0036, + "reward": 1.750693678855896, + "reward_std": 0.14407047256827354, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7741312682628632, + "step": 3412 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.6875, + "epoch": 1.6669921875, + "grad_norm": 1.3780651239867965, + "kl": 0.0771484375, + "learning_rate": 5.833740234374999e-07, + "loss": 0.0031, + "reward": 1.774366855621338, + "reward_std": 0.03671616315841675, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7743669152259827, + "step": 3413 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.7734375, + "epoch": 1.66748046875, + "grad_norm": 2.4401285404057287, + "kl": 0.09814453125, + "learning_rate": 5.83251953125e-07, + "loss": 0.0039, + "reward": 1.7291316986083984, + "reward_std": 0.06025635078549385, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7291316986083984, + "step": 3414 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.96875, + "epoch": 1.66796875, + "grad_norm": 0.9632974422175405, + "kl": 0.069091796875, + "learning_rate": 5.831298828125e-07, + "loss": 0.0028, + "reward": 1.7597804069519043, + "reward_std": 0.04982480686157942, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7597803771495819, + "step": 3415 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.0, + "epoch": 1.66845703125, + "grad_norm": 2.137948451017365, + "kl": 0.078857421875, + "learning_rate": 5.830078125e-07, + "loss": 0.0032, + "reward": 1.7445058226585388, + "reward_std": 0.05879105068743229, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7445058226585388, + "step": 3416 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.1953125, + "epoch": 1.6689453125, + "grad_norm": 0.916829410323994, + "kl": 0.0849609375, + "learning_rate": 5.828857421875e-07, + "loss": 0.0034, + "reward": 1.767207384109497, + "reward_std": 0.021022816188633442, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7672074139118195, + "step": 3417 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.265625, + "epoch": 1.66943359375, + "grad_norm": 1.8069522939932925, + "kl": 0.076171875, + "learning_rate": 5.82763671875e-07, + "loss": 0.003, + "reward": 1.7938191294670105, + "reward_std": 0.06862248852849007, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7938191592693329, + "step": 3418 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.125, + "epoch": 1.669921875, + "grad_norm": 1.5981682786493596, + "kl": 0.06591796875, + "learning_rate": 5.826416015624999e-07, + "loss": 0.0026, + "reward": 1.8110138773918152, + "reward_std": 0.017725080251693726, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8110139071941376, + "step": 3419 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.6171875, + "epoch": 1.67041015625, + "grad_norm": 2.2101743301776504, + "kl": 0.098388671875, + "learning_rate": 5.8251953125e-07, + "loss": 0.0039, + "reward": 1.7247655987739563, + "reward_std": 0.06738665699958801, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7247655987739563, + "step": 3420 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.0703125, + "epoch": 1.6708984375, + "grad_norm": 5.321656124557536, + "kl": 0.068115234375, + "learning_rate": 5.823974609375e-07, + "loss": 0.0027, + "reward": 1.7655808925628662, + "reward_std": 0.06156047061085701, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7655808329582214, + "step": 3421 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.8984375, + "epoch": 1.67138671875, + "grad_norm": 2.8203241658105247, + "kl": 0.082763671875, + "learning_rate": 5.82275390625e-07, + "loss": 0.0033, + "reward": 2.0598042607307434, + "reward_std": 0.057430900633335114, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 1.0598042011260986, + "step": 3422 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.484375, + "epoch": 1.671875, + "grad_norm": 1.8127244648228247, + "kl": 0.07421875, + "learning_rate": 5.821533203125e-07, + "loss": 0.003, + "reward": 1.7353619933128357, + "reward_std": 0.11851292103528976, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7509869635105133, + "step": 3423 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.9375, + "epoch": 1.67236328125, + "grad_norm": 4.191171905834963, + "kl": 0.0677490234375, + "learning_rate": 5.8203125e-07, + "loss": 0.0027, + "reward": 1.8721721768379211, + "reward_std": 0.03585383854806423, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8721721768379211, + "step": 3424 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.265625, + "epoch": 1.6728515625, + "grad_norm": 3.1746721322821356, + "kl": 0.077392578125, + "learning_rate": 5.819091796874999e-07, + "loss": 0.0031, + "reward": 1.7025277614593506, + "reward_std": 0.07956914976239204, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7025277316570282, + "step": 3425 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.7890625, + "epoch": 1.67333984375, + "grad_norm": 0.934350590497789, + "kl": 0.063720703125, + "learning_rate": 5.817871093749999e-07, + "loss": 0.0025, + "reward": 1.8146753311157227, + "reward_std": 0.03896358422935009, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8146752715110779, + "step": 3426 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.8359375, + "epoch": 1.673828125, + "grad_norm": 2.2976808756968197, + "kl": 0.075439453125, + "learning_rate": 5.816650390625e-07, + "loss": 0.003, + "reward": 1.7204577922821045, + "reward_std": 0.10786120407283306, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7204578518867493, + "step": 3427 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.703125, + "epoch": 1.67431640625, + "grad_norm": 5.209395887612558, + "kl": 0.0859375, + "learning_rate": 5.8154296875e-07, + "loss": 0.0034, + "reward": 1.7612760663032532, + "reward_std": 0.14266540855169296, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7612760663032532, + "step": 3428 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.4921875, + "epoch": 1.6748046875, + "grad_norm": 2.974078254850948, + "kl": 0.0771484375, + "learning_rate": 5.814208984375e-07, + "loss": 0.0031, + "reward": 1.7587260007858276, + "reward_std": 0.045107051730155945, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.75872603058815, + "step": 3429 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.3671875, + "epoch": 1.67529296875, + "grad_norm": 2.1974943162800957, + "kl": 0.062744140625, + "learning_rate": 5.81298828125e-07, + "loss": 0.0025, + "reward": 1.7304607629776, + "reward_std": 0.07229996286332607, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7304607033729553, + "step": 3430 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.3515625, + "epoch": 1.67578125, + "grad_norm": 0.8396091238176443, + "kl": 0.0738525390625, + "learning_rate": 5.811767578125e-07, + "loss": 0.003, + "reward": 1.747983455657959, + "reward_std": 0.02914267312735319, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7479834854602814, + "step": 3431 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.1640625, + "epoch": 1.67626953125, + "grad_norm": 2.6575284914917816, + "kl": 0.069580078125, + "learning_rate": 5.810546874999999e-07, + "loss": 0.0028, + "reward": 1.8400204181671143, + "reward_std": 0.04621163569390774, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8400204181671143, + "step": 3432 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.296875, + "epoch": 1.6767578125, + "grad_norm": 2.4310818517899753, + "kl": 0.0673828125, + "learning_rate": 5.809326171875e-07, + "loss": 0.0027, + "reward": 1.8287239074707031, + "reward_std": 0.04241657070815563, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8287239372730255, + "step": 3433 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.03125, + "epoch": 1.67724609375, + "grad_norm": 1.4441990565870806, + "kl": 0.0732421875, + "learning_rate": 5.80810546875e-07, + "loss": 0.0029, + "reward": 1.8703618049621582, + "reward_std": 0.09191784635186195, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8859868347644806, + "step": 3434 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.84375, + "epoch": 1.677734375, + "grad_norm": 4.354556191413624, + "kl": 0.070068359375, + "learning_rate": 5.806884765625e-07, + "loss": 0.0028, + "reward": 1.6289713382720947, + "reward_std": 0.10028214752674103, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6367838084697723, + "step": 3435 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.3515625, + "epoch": 1.67822265625, + "grad_norm": 3.14226423674001, + "kl": 0.063720703125, + "learning_rate": 5.8056640625e-07, + "loss": 0.0025, + "reward": 1.8268967866897583, + "reward_std": 0.1756245121359825, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8503343164920807, + "step": 3436 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.421875, + "epoch": 1.6787109375, + "grad_norm": 1.6468385086214168, + "kl": 0.0556640625, + "learning_rate": 5.804443359375e-07, + "loss": 0.0022, + "reward": 1.8361674547195435, + "reward_std": 0.07743523456156254, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8439799845218658, + "step": 3437 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.828125, + "epoch": 1.67919921875, + "grad_norm": 1.236516691039877, + "kl": 0.08056640625, + "learning_rate": 5.803222656249999e-07, + "loss": 0.0032, + "reward": 1.8144738674163818, + "reward_std": 0.03578588366508484, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8144738674163818, + "step": 3438 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.125, + "epoch": 1.6796875, + "grad_norm": 7.342318015197428, + "kl": 0.0582275390625, + "learning_rate": 5.802001953124999e-07, + "loss": 0.0023, + "reward": 1.7566466927528381, + "reward_std": 0.057324403896927834, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7566466629505157, + "step": 3439 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.5, + "epoch": 1.68017578125, + "grad_norm": 1.5126156935058757, + "kl": 0.056884765625, + "learning_rate": 5.80078125e-07, + "loss": 0.0023, + "reward": 1.8134649991989136, + "reward_std": 0.07411767356097698, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8212774991989136, + "step": 3440 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.8984375, + "epoch": 1.6806640625, + "grad_norm": 2.8029832705522795, + "kl": 0.0721435546875, + "learning_rate": 5.799560546875e-07, + "loss": 0.0029, + "reward": 1.7705180048942566, + "reward_std": 0.14411171525716782, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.786143034696579, + "step": 3441 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.0546875, + "epoch": 1.68115234375, + "grad_norm": 2.075903401049284, + "kl": 0.060791015625, + "learning_rate": 5.79833984375e-07, + "loss": 0.0024, + "reward": 1.7683696746826172, + "reward_std": 0.06466953456401825, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7683696448802948, + "step": 3442 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.984375, + "epoch": 1.681640625, + "grad_norm": 1.6509715387223451, + "kl": 0.086181640625, + "learning_rate": 5.797119140625e-07, + "loss": 0.0035, + "reward": 1.7627912759780884, + "reward_std": 0.034193447791039944, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7627912759780884, + "step": 3443 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.921875, + "epoch": 1.68212890625, + "grad_norm": 1.1378399236168264, + "kl": 0.0523681640625, + "learning_rate": 5.7958984375e-07, + "loss": 0.0021, + "reward": 1.7294188141822815, + "reward_std": 0.13451597094535828, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7372312545776367, + "step": 3444 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.703125, + "epoch": 1.6826171875, + "grad_norm": 5.502362650100543, + "kl": 0.0650634765625, + "learning_rate": 5.794677734374999e-07, + "loss": 0.0026, + "reward": 1.8440684080123901, + "reward_std": 0.08081773668527603, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8440684378147125, + "step": 3445 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.9375, + "epoch": 1.68310546875, + "grad_norm": 1.2259288505339647, + "kl": 0.0706787109375, + "learning_rate": 5.79345703125e-07, + "loss": 0.0028, + "reward": 1.7710025310516357, + "reward_std": 0.12468947097659111, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7866275310516357, + "step": 3446 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.9921875, + "epoch": 1.68359375, + "grad_norm": 1.4159167368823977, + "kl": 0.0728759765625, + "learning_rate": 5.792236328125e-07, + "loss": 0.0029, + "reward": 1.782673954963684, + "reward_std": 0.025566712021827698, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7826739549636841, + "step": 3447 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.953125, + "epoch": 1.68408203125, + "grad_norm": 1.2888654953284708, + "kl": 0.057373046875, + "learning_rate": 5.791015625e-07, + "loss": 0.0023, + "reward": 1.7656115293502808, + "reward_std": 0.07896413654088974, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7812364995479584, + "step": 3448 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.4453125, + "epoch": 1.6845703125, + "grad_norm": 0.8051007000694611, + "kl": 0.069091796875, + "learning_rate": 5.789794921875e-07, + "loss": 0.0028, + "reward": 1.8307116031646729, + "reward_std": 0.03494404815137386, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8307116329669952, + "step": 3449 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.03125, + "epoch": 1.68505859375, + "grad_norm": 1.6248548960001123, + "kl": 0.057373046875, + "learning_rate": 5.78857421875e-07, + "loss": 0.0023, + "reward": 1.6238124370574951, + "reward_std": 0.10186551045626402, + "rewards/format_reward": 0.96875, + "rewards/ocr_reward": 0.6550624966621399, + "step": 3450 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.6875, + "epoch": 1.685546875, + "grad_norm": 1.8883437681183317, + "kl": 0.0675048828125, + "learning_rate": 5.787353515624999e-07, + "loss": 0.0027, + "reward": 1.7342004776000977, + "reward_std": 0.11742651090025902, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7498254776000977, + "step": 3451 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.34375, + "epoch": 1.68603515625, + "grad_norm": 0.854071632997255, + "kl": 0.06298828125, + "learning_rate": 5.786132812499999e-07, + "loss": 0.0025, + "reward": 1.8311820030212402, + "reward_std": 0.05601404421031475, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.831182062625885, + "step": 3452 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.0546875, + "epoch": 1.6865234375, + "grad_norm": 0.4515732246038297, + "kl": 0.0604248046875, + "learning_rate": 5.784912109375e-07, + "loss": 0.0024, + "reward": 1.7703008651733398, + "reward_std": 0.07104413863271475, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7859258651733398, + "step": 3453 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.140625, + "epoch": 1.68701171875, + "grad_norm": 2.068456057453586, + "kl": 0.08349609375, + "learning_rate": 5.78369140625e-07, + "loss": 0.0033, + "reward": 1.7976149916648865, + "reward_std": 0.04534151777625084, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7976149916648865, + "step": 3454 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.375, + "epoch": 1.6875, + "grad_norm": 1.7355862712462304, + "kl": 0.0626220703125, + "learning_rate": 5.782470703125e-07, + "loss": 0.0025, + "reward": 1.8032622337341309, + "reward_std": 0.053254470229148865, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8032622635364532, + "step": 3455 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.1328125, + "epoch": 1.68798828125, + "grad_norm": 2.871369071419689, + "kl": 0.0594482421875, + "learning_rate": 5.78125e-07, + "loss": 0.0024, + "reward": 1.8522000908851624, + "reward_std": 0.043344199657440186, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8522000908851624, + "step": 3456 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.984375, + "epoch": 1.6884765625, + "grad_norm": 1.0797110346192287, + "kl": 0.066650390625, + "learning_rate": 5.780029296875e-07, + "loss": 0.0027, + "reward": 1.718557059764862, + "reward_std": 0.09980412572622299, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7341820597648621, + "step": 3457 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.3046875, + "epoch": 1.68896484375, + "grad_norm": 2.010061312368978, + "kl": 0.071533203125, + "learning_rate": 5.778808593749999e-07, + "loss": 0.0029, + "reward": 1.7197965383529663, + "reward_std": 0.1209321841597557, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.7588590979576111, + "step": 3458 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.7265625, + "epoch": 1.689453125, + "grad_norm": 1.7529446065841696, + "kl": 0.0675048828125, + "learning_rate": 5.777587890624999e-07, + "loss": 0.0027, + "reward": 1.6905794739723206, + "reward_std": 0.13847313076257706, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7062044143676758, + "step": 3459 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.6796875, + "epoch": 1.68994140625, + "grad_norm": 2.0487912470077925, + "kl": 0.0673828125, + "learning_rate": 5.7763671875e-07, + "loss": 0.0027, + "reward": 1.770250141620636, + "reward_std": 0.027558826841413975, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.770250141620636, + "step": 3460 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.375, + "epoch": 1.6904296875, + "grad_norm": 1.607076612787755, + "kl": 0.0579833984375, + "learning_rate": 5.775146484375e-07, + "loss": 0.0023, + "reward": 1.7800695300102234, + "reward_std": 0.07028440106660128, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7878820598125458, + "step": 3461 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.96875, + "epoch": 1.69091796875, + "grad_norm": 3.802817161929801, + "kl": 0.064453125, + "learning_rate": 5.77392578125e-07, + "loss": 0.0026, + "reward": 1.7009736895561218, + "reward_std": 0.033372608944773674, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7009736895561218, + "step": 3462 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.640625, + "epoch": 1.69140625, + "grad_norm": 0.8747808849685156, + "kl": 0.0589599609375, + "learning_rate": 5.772705078125e-07, + "loss": 0.0024, + "reward": 1.7267315983772278, + "reward_std": 0.12720267474651337, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7423565685749054, + "step": 3463 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.4296875, + "epoch": 1.69189453125, + "grad_norm": 2.0193595822393293, + "kl": 0.0712890625, + "learning_rate": 5.771484374999999e-07, + "loss": 0.0029, + "reward": 1.708345651626587, + "reward_std": 0.03282461129128933, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7083457112312317, + "step": 3464 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.96875, + "epoch": 1.6923828125, + "grad_norm": 1.7772356569257048, + "kl": 0.0599365234375, + "learning_rate": 5.770263671874999e-07, + "loss": 0.0024, + "reward": 1.7624672055244446, + "reward_std": 0.11228394508361816, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7624672055244446, + "step": 3465 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.515625, + "epoch": 1.69287109375, + "grad_norm": 5.082897937319996, + "kl": 0.084716796875, + "learning_rate": 5.76904296875e-07, + "loss": 0.0034, + "reward": 1.7134467959403992, + "reward_std": 0.12474965304136276, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7134467661380768, + "step": 3466 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.734375, + "epoch": 1.693359375, + "grad_norm": 7.706459266928283, + "kl": 0.07421875, + "learning_rate": 5.767822265625e-07, + "loss": 0.003, + "reward": 1.8310195803642273, + "reward_std": 0.04999265819787979, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8310195505619049, + "step": 3467 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.515625, + "epoch": 1.69384765625, + "grad_norm": 1.7616788714824219, + "kl": 0.09423828125, + "learning_rate": 5.7666015625e-07, + "loss": 0.0038, + "reward": 1.7039409279823303, + "reward_std": 0.12659362703561783, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7117533683776855, + "step": 3468 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.171875, + "epoch": 1.6943359375, + "grad_norm": 1.3000730711438913, + "kl": 0.07666015625, + "learning_rate": 5.765380859375e-07, + "loss": 0.0031, + "reward": 1.7594356536865234, + "reward_std": 0.02355903387069702, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.759435623884201, + "step": 3469 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.6796875, + "epoch": 1.69482421875, + "grad_norm": 1.5447040369180085, + "kl": 0.0908203125, + "learning_rate": 5.76416015625e-07, + "loss": 0.0036, + "reward": 1.7440487742424011, + "reward_std": 0.09019343182444572, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7440488040447235, + "step": 3470 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.65625, + "epoch": 1.6953125, + "grad_norm": 1.207797652928656, + "kl": 0.0830078125, + "learning_rate": 5.762939453124999e-07, + "loss": 0.0033, + "reward": 1.6612102389335632, + "reward_std": 0.023191725835204124, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6612102389335632, + "step": 3471 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.8359375, + "epoch": 1.69580078125, + "grad_norm": 1.7416107299485089, + "kl": 0.060791015625, + "learning_rate": 5.761718749999999e-07, + "loss": 0.0024, + "reward": 1.8417921662330627, + "reward_std": 0.034474316984415054, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.841792106628418, + "step": 3472 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.234375, + "epoch": 1.6962890625, + "grad_norm": 1.4354809097592818, + "kl": 0.0771484375, + "learning_rate": 5.760498046875e-07, + "loss": 0.0031, + "reward": 1.5817983150482178, + "reward_std": 0.12967222556471825, + "rewards/format_reward": 0.9609375, + "rewards/ocr_reward": 0.6208608150482178, + "step": 3473 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.0, + "epoch": 1.69677734375, + "grad_norm": 3.4907537874766117, + "kl": 0.0516357421875, + "learning_rate": 5.75927734375e-07, + "loss": 0.0021, + "reward": 1.85581374168396, + "reward_std": 0.04456772096455097, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8558137714862823, + "step": 3474 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.53125, + "epoch": 1.697265625, + "grad_norm": 0.9069326314470321, + "kl": 0.064697265625, + "learning_rate": 5.758056640625e-07, + "loss": 0.0026, + "reward": 1.777282476425171, + "reward_std": 0.07658272795379162, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7772825360298157, + "step": 3475 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.0625, + "epoch": 1.69775390625, + "grad_norm": 4.59511333562533, + "kl": 0.0643310546875, + "learning_rate": 5.7568359375e-07, + "loss": 0.0026, + "reward": 1.7993093729019165, + "reward_std": 0.06882397923618555, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7993092834949493, + "step": 3476 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.0078125, + "epoch": 1.6982421875, + "grad_norm": 1.3731053289621549, + "kl": 0.071533203125, + "learning_rate": 5.755615234375e-07, + "loss": 0.0029, + "reward": 1.8083871006965637, + "reward_std": 0.0703160697594285, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8083871304988861, + "step": 3477 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.2734375, + "epoch": 1.69873046875, + "grad_norm": 1.629812946645828, + "kl": 0.0693359375, + "learning_rate": 5.754394531249999e-07, + "loss": 0.0028, + "reward": 1.7725472450256348, + "reward_std": 0.017249885015189648, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7725472748279572, + "step": 3478 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.375, + "epoch": 1.69921875, + "grad_norm": 2.215999100156812, + "kl": 0.0732421875, + "learning_rate": 5.753173828125e-07, + "loss": 0.0029, + "reward": 1.7830110788345337, + "reward_std": 0.05324237793684006, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7830111086368561, + "step": 3479 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.8046875, + "epoch": 1.69970703125, + "grad_norm": 1.6611927604495773, + "kl": 0.0601806640625, + "learning_rate": 5.751953125e-07, + "loss": 0.0024, + "reward": 1.7691839337348938, + "reward_std": 0.042867109179496765, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7691839039325714, + "step": 3480 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.765625, + "epoch": 1.7001953125, + "grad_norm": 1.528033156409457, + "kl": 0.068115234375, + "learning_rate": 5.750732421875e-07, + "loss": 0.0027, + "reward": 1.7416942119598389, + "reward_std": 0.07591928541660309, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7416941821575165, + "step": 3481 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.1640625, + "epoch": 1.70068359375, + "grad_norm": 1.1833895285281955, + "kl": 0.06005859375, + "learning_rate": 5.74951171875e-07, + "loss": 0.0024, + "reward": 1.7779241800308228, + "reward_std": 0.1391547992825508, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.8013616800308228, + "step": 3482 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.90625, + "epoch": 1.701171875, + "grad_norm": 1.761189142899964, + "kl": 0.0662841796875, + "learning_rate": 5.748291015625e-07, + "loss": 0.0027, + "reward": 1.8467693328857422, + "reward_std": 0.06352511048316956, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8467693328857422, + "step": 3483 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.2890625, + "epoch": 1.70166015625, + "grad_norm": 1.5902592871580774, + "kl": 0.08154296875, + "learning_rate": 5.747070312499999e-07, + "loss": 0.0033, + "reward": 1.7309446930885315, + "reward_std": 0.07680136896669865, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7465696930885315, + "step": 3484 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.6953125, + "epoch": 1.7021484375, + "grad_norm": 10.920362143170516, + "kl": 0.0516357421875, + "learning_rate": 5.745849609374999e-07, + "loss": 0.0021, + "reward": 1.6752318739891052, + "reward_std": 0.09153604693710804, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6752318441867828, + "step": 3485 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.6640625, + "epoch": 1.70263671875, + "grad_norm": 1.5465562693732942, + "kl": 0.0650634765625, + "learning_rate": 5.74462890625e-07, + "loss": 0.0026, + "reward": 1.8406208753585815, + "reward_std": 0.03312433697283268, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8406208753585815, + "step": 3486 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.4375, + "epoch": 1.703125, + "grad_norm": 1.025623413376822, + "kl": 0.072998046875, + "learning_rate": 5.743408203125e-07, + "loss": 0.0029, + "reward": 1.8625024557113647, + "reward_std": 0.07112840935587883, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8703149855136871, + "step": 3487 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.1640625, + "epoch": 1.70361328125, + "grad_norm": 1.0215455164365324, + "kl": 0.067138671875, + "learning_rate": 5.7421875e-07, + "loss": 0.0027, + "reward": 1.6743749380111694, + "reward_std": 0.1329372152686119, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.6899998188018799, + "step": 3488 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.6484375, + "epoch": 1.7041015625, + "grad_norm": 1.5182995549340972, + "kl": 0.078125, + "learning_rate": 5.740966796875e-07, + "loss": 0.0031, + "reward": 1.6961557269096375, + "reward_std": 0.05626895558089018, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6961557865142822, + "step": 3489 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.34375, + "epoch": 1.70458984375, + "grad_norm": 1.501814717810848, + "kl": 0.068603515625, + "learning_rate": 5.73974609375e-07, + "loss": 0.0027, + "reward": 1.8220676183700562, + "reward_std": 0.0694145429879427, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8220676481723785, + "step": 3490 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.234375, + "epoch": 1.705078125, + "grad_norm": 8.211844024368538, + "kl": 0.090576171875, + "learning_rate": 5.738525390624999e-07, + "loss": 0.0036, + "reward": 1.8239883780479431, + "reward_std": 0.062240034341812134, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8239883780479431, + "step": 3491 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.4765625, + "epoch": 1.70556640625, + "grad_norm": 0.8810007015149223, + "kl": 0.05517578125, + "learning_rate": 5.7373046875e-07, + "loss": 0.0022, + "reward": 1.8131248354911804, + "reward_std": 0.04876277968287468, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8131248354911804, + "step": 3492 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.046875, + "epoch": 1.7060546875, + "grad_norm": 1.3082128636150807, + "kl": 0.087890625, + "learning_rate": 5.736083984375e-07, + "loss": 0.0035, + "reward": 1.8320286870002747, + "reward_std": 0.16402263939380646, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8476536571979523, + "step": 3493 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.2734375, + "epoch": 1.70654296875, + "grad_norm": 0.7233338633250107, + "kl": 0.079833984375, + "learning_rate": 5.73486328125e-07, + "loss": 0.0032, + "reward": 1.809335172176361, + "reward_std": 0.07215743651613593, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8171476721763611, + "step": 3494 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.9453125, + "epoch": 1.70703125, + "grad_norm": 1.8643023308951008, + "kl": 0.083984375, + "learning_rate": 5.733642578125e-07, + "loss": 0.0034, + "reward": 1.7340399026870728, + "reward_std": 0.03040897147729993, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7340399026870728, + "step": 3495 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.078125, + "epoch": 1.70751953125, + "grad_norm": 1.5851306415379258, + "kl": 0.0732421875, + "learning_rate": 5.732421875e-07, + "loss": 0.0029, + "reward": 1.7629672288894653, + "reward_std": 0.053599401377141476, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7629671692848206, + "step": 3496 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.859375, + "epoch": 1.7080078125, + "grad_norm": 1.5268802541107716, + "kl": 0.072021484375, + "learning_rate": 5.731201171874999e-07, + "loss": 0.0029, + "reward": 1.8078510761260986, + "reward_std": 0.049535930156707764, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8078510463237762, + "step": 3497 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.421875, + "epoch": 1.70849609375, + "grad_norm": 3.351870958415951, + "kl": 0.0635986328125, + "learning_rate": 5.729980468749999e-07, + "loss": 0.0025, + "reward": 1.794227421283722, + "reward_std": 0.09365762025117874, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7942274212837219, + "step": 3498 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.359375, + "epoch": 1.708984375, + "grad_norm": 2.7206709323249125, + "kl": 0.067626953125, + "learning_rate": 5.728759765625e-07, + "loss": 0.0027, + "reward": 1.7822973728179932, + "reward_std": 0.09830936044454575, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7822974026203156, + "step": 3499 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.0546875, + "epoch": 1.70947265625, + "grad_norm": 1.0980561954495642, + "kl": 0.069580078125, + "learning_rate": 5.7275390625e-07, + "loss": 0.0028, + "reward": 1.879291832447052, + "reward_std": 0.05373461917042732, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8871042728424072, + "step": 3500 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.4921875, + "epoch": 1.7099609375, + "grad_norm": 1.7798040760161173, + "kl": 0.07958984375, + "learning_rate": 5.726318359375e-07, + "loss": 0.0032, + "reward": 1.7494339346885681, + "reward_std": 0.07848425209522247, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7494339644908905, + "step": 3501 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.703125, + "epoch": 1.71044921875, + "grad_norm": 1.0485894953158654, + "kl": 0.0657958984375, + "learning_rate": 5.72509765625e-07, + "loss": 0.0026, + "reward": 1.7808015942573547, + "reward_std": 0.01619276311248541, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7808016836643219, + "step": 3502 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.21875, + "epoch": 1.7109375, + "grad_norm": 1.2363428348153052, + "kl": 0.0810546875, + "learning_rate": 5.723876953125e-07, + "loss": 0.0032, + "reward": 1.6847857236862183, + "reward_std": 0.052163584157824516, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6847857236862183, + "step": 3503 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.5, + "epoch": 1.71142578125, + "grad_norm": 1.0241359134056272, + "kl": 0.04931640625, + "learning_rate": 5.722656249999999e-07, + "loss": 0.002, + "reward": 1.8333210349082947, + "reward_std": 0.06691450020298362, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8411335349082947, + "step": 3504 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.8125, + "epoch": 1.7119140625, + "grad_norm": 1.0599995273767435, + "kl": 0.07568359375, + "learning_rate": 5.721435546875e-07, + "loss": 0.003, + "reward": 1.7699226140975952, + "reward_std": 0.04928914085030556, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7699226140975952, + "step": 3505 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.0390625, + "epoch": 1.71240234375, + "grad_norm": 0.9453226329364856, + "kl": 0.0645751953125, + "learning_rate": 5.72021484375e-07, + "loss": 0.0026, + "reward": 1.8773809671401978, + "reward_std": 0.028110843151807785, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8773809373378754, + "step": 3506 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.484375, + "epoch": 1.712890625, + "grad_norm": 1.6786568278471627, + "kl": 0.0679931640625, + "learning_rate": 5.718994140625e-07, + "loss": 0.0027, + "reward": 1.7743658423423767, + "reward_std": 0.03792189992964268, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7743658423423767, + "step": 3507 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.359375, + "epoch": 1.71337890625, + "grad_norm": 1.370461486241404, + "kl": 0.075927734375, + "learning_rate": 5.7177734375e-07, + "loss": 0.003, + "reward": 1.744931399822235, + "reward_std": 0.09244660288095474, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7449314296245575, + "step": 3508 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.1171875, + "epoch": 1.7138671875, + "grad_norm": 2.7346113141185033, + "kl": 0.069091796875, + "learning_rate": 5.716552734375e-07, + "loss": 0.0028, + "reward": 1.7291991710662842, + "reward_std": 0.04934484884142876, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7291992008686066, + "step": 3509 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.6171875, + "epoch": 1.71435546875, + "grad_norm": 1.4661207261525067, + "kl": 0.0947265625, + "learning_rate": 5.715332031249999e-07, + "loss": 0.0038, + "reward": 1.9682253003120422, + "reward_std": 0.05683219991624355, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.9760377407073975, + "step": 3510 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.1328125, + "epoch": 1.71484375, + "grad_norm": 2.5955313987666124, + "kl": 0.095703125, + "learning_rate": 5.714111328124999e-07, + "loss": 0.0038, + "reward": 1.7580629587173462, + "reward_std": 0.04326807055622339, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.758063018321991, + "step": 3511 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.90625, + "epoch": 1.71533203125, + "grad_norm": 3.7538484706080677, + "kl": 0.078369140625, + "learning_rate": 5.712890625e-07, + "loss": 0.0031, + "reward": 1.8174352049827576, + "reward_std": 0.07365524023771286, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8252476751804352, + "step": 3512 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.0234375, + "epoch": 1.7158203125, + "grad_norm": 2.4016719264444077, + "kl": 0.0732421875, + "learning_rate": 5.711669921875e-07, + "loss": 0.0029, + "reward": 1.7750000953674316, + "reward_std": 0.08003316074609756, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7750000655651093, + "step": 3513 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.546875, + "epoch": 1.71630859375, + "grad_norm": 1.4411311752581444, + "kl": 0.0673828125, + "learning_rate": 5.71044921875e-07, + "loss": 0.0027, + "reward": 1.8372459411621094, + "reward_std": 0.05695566162467003, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8372458517551422, + "step": 3514 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.1953125, + "epoch": 1.716796875, + "grad_norm": 1.6044404016784177, + "kl": 0.068359375, + "learning_rate": 5.709228515625e-07, + "loss": 0.0027, + "reward": 1.7338838577270508, + "reward_std": 0.08176321163773537, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7338838577270508, + "step": 3515 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.3203125, + "epoch": 1.71728515625, + "grad_norm": 2.0755482906159606, + "kl": 0.0858154296875, + "learning_rate": 5.7080078125e-07, + "loss": 0.0034, + "reward": 1.7466081380844116, + "reward_std": 0.032042115926742554, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7466080784797668, + "step": 3516 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.8671875, + "epoch": 1.7177734375, + "grad_norm": 1.7631315601889124, + "kl": 0.085693359375, + "learning_rate": 5.706787109374999e-07, + "loss": 0.0034, + "reward": 1.7691351175308228, + "reward_std": 0.05385134369134903, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7691351771354675, + "step": 3517 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.7734375, + "epoch": 1.71826171875, + "grad_norm": 1.0110099202676348, + "kl": 0.092041015625, + "learning_rate": 5.70556640625e-07, + "loss": 0.0037, + "reward": 1.6459341049194336, + "reward_std": 0.09330805763602257, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6537465453147888, + "step": 3518 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.8046875, + "epoch": 1.71875, + "grad_norm": 1.0560140334835186, + "kl": 0.083251953125, + "learning_rate": 5.704345703125e-07, + "loss": 0.0033, + "reward": 1.9419002532958984, + "reward_std": 0.08785379119217396, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.9419002532958984, + "step": 3519 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.8671875, + "epoch": 1.71923828125, + "grad_norm": 1.1441030012568587, + "kl": 0.058837890625, + "learning_rate": 5.703125e-07, + "loss": 0.0024, + "reward": 1.8219019174575806, + "reward_std": 0.0309375561773777, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8219019174575806, + "step": 3520 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.03125, + "epoch": 1.7197265625, + "grad_norm": 2.3512579540857153, + "kl": 0.08154296875, + "learning_rate": 5.701904296875e-07, + "loss": 0.0033, + "reward": 1.7773525714874268, + "reward_std": 0.04743030574172735, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7773525714874268, + "step": 3521 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.078125, + "epoch": 1.72021484375, + "grad_norm": 0.82600864833063, + "kl": 0.069580078125, + "learning_rate": 5.70068359375e-07, + "loss": 0.0028, + "reward": 1.7188506722450256, + "reward_std": 0.06316574104130268, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.726663202047348, + "step": 3522 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.0625, + "epoch": 1.720703125, + "grad_norm": 2.0136715514754693, + "kl": 0.092529296875, + "learning_rate": 5.699462890624999e-07, + "loss": 0.0037, + "reward": 1.729398787021637, + "reward_std": 0.07341841980814934, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7372111976146698, + "step": 3523 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.6171875, + "epoch": 1.72119140625, + "grad_norm": 1.2203906346471134, + "kl": 0.0908203125, + "learning_rate": 5.698242187499999e-07, + "loss": 0.0036, + "reward": 1.7553092241287231, + "reward_std": 0.05260470602661371, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7553092241287231, + "step": 3524 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.890625, + "epoch": 1.7216796875, + "grad_norm": 0.9548802375780792, + "kl": 0.10009765625, + "learning_rate": 5.697021484375e-07, + "loss": 0.004, + "reward": 1.7244818210601807, + "reward_std": 0.079419358051382, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7401067912578583, + "step": 3525 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.7890625, + "epoch": 1.72216796875, + "grad_norm": 1.743456302936007, + "kl": 0.083984375, + "learning_rate": 5.69580078125e-07, + "loss": 0.0034, + "reward": 1.7955304980278015, + "reward_std": 0.028726408258080482, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7955304086208344, + "step": 3526 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.8828125, + "epoch": 1.72265625, + "grad_norm": 2.8631334970459283, + "kl": 0.107421875, + "learning_rate": 5.694580078125e-07, + "loss": 0.0043, + "reward": 1.710024654865265, + "reward_std": 0.07605608738958836, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7100247442722321, + "step": 3527 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.09375, + "epoch": 1.72314453125, + "grad_norm": 11.37817101239002, + "kl": 0.116455078125, + "learning_rate": 5.693359375e-07, + "loss": 0.0047, + "reward": 1.753280758857727, + "reward_std": 0.07618452608585358, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7532808184623718, + "step": 3528 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.734375, + "epoch": 1.7236328125, + "grad_norm": 0.8179633518643423, + "kl": 0.0869140625, + "learning_rate": 5.692138671875e-07, + "loss": 0.0035, + "reward": 1.7533798813819885, + "reward_std": 0.024872629903256893, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7533798813819885, + "step": 3529 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.1953125, + "epoch": 1.72412109375, + "grad_norm": 1.6417406207916294, + "kl": 0.072021484375, + "learning_rate": 5.690917968749999e-07, + "loss": 0.0029, + "reward": 1.7788927555084229, + "reward_std": 0.025608118914533406, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7788927257061005, + "step": 3530 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.4140625, + "epoch": 1.724609375, + "grad_norm": 0.9017859503866928, + "kl": 0.073486328125, + "learning_rate": 5.689697265625e-07, + "loss": 0.003, + "reward": 1.8279852867126465, + "reward_std": 0.059508029371500015, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8279853165149689, + "step": 3531 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.515625, + "epoch": 1.72509765625, + "grad_norm": 2.5888358981187016, + "kl": 0.0849609375, + "learning_rate": 5.6884765625e-07, + "loss": 0.0034, + "reward": 1.8510370254516602, + "reward_std": 0.018395755905658007, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8510370254516602, + "step": 3532 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.40625, + "epoch": 1.7255859375, + "grad_norm": 1.0428518646158351, + "kl": 0.093017578125, + "learning_rate": 5.687255859375e-07, + "loss": 0.0037, + "reward": 1.8244857788085938, + "reward_std": 0.0313012283295393, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8244858086109161, + "step": 3533 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.140625, + "epoch": 1.72607421875, + "grad_norm": 2.3145569600287557, + "kl": 0.0787353515625, + "learning_rate": 5.68603515625e-07, + "loss": 0.0031, + "reward": 1.901319682598114, + "reward_std": 0.029859434813261032, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.901319682598114, + "step": 3534 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.8828125, + "epoch": 1.7265625, + "grad_norm": 1.9450424606133498, + "kl": 0.09033203125, + "learning_rate": 5.684814453125e-07, + "loss": 0.0036, + "reward": 1.7189557552337646, + "reward_std": 0.11504796147346497, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.7345808148384094, + "step": 3535 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.59375, + "epoch": 1.72705078125, + "grad_norm": 1.9911963089475972, + "kl": 0.072265625, + "learning_rate": 5.683593749999999e-07, + "loss": 0.0029, + "reward": 1.7401865720748901, + "reward_std": 0.10161124914884567, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7401866316795349, + "step": 3536 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.9453125, + "epoch": 1.7275390625, + "grad_norm": 1.1839095765959295, + "kl": 0.071044921875, + "learning_rate": 5.682373046874999e-07, + "loss": 0.0028, + "reward": 1.889032244682312, + "reward_std": 0.02803337760269642, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8890321254730225, + "step": 3537 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.671875, + "epoch": 1.72802734375, + "grad_norm": 1.7911653432373396, + "kl": 0.09521484375, + "learning_rate": 5.68115234375e-07, + "loss": 0.0038, + "reward": 1.6901207566261292, + "reward_std": 0.03867476247251034, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6901208162307739, + "step": 3538 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.234375, + "epoch": 1.728515625, + "grad_norm": 1.9111408087116626, + "kl": 0.088623046875, + "learning_rate": 5.679931640625e-07, + "loss": 0.0036, + "reward": 1.7439436316490173, + "reward_std": 0.078775430098176, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7439436316490173, + "step": 3539 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.4140625, + "epoch": 1.72900390625, + "grad_norm": 1.5678708925275149, + "kl": 0.06494140625, + "learning_rate": 5.6787109375e-07, + "loss": 0.0026, + "reward": 1.7854658365249634, + "reward_std": 0.03933623246848583, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7854658663272858, + "step": 3540 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.328125, + "epoch": 1.7294921875, + "grad_norm": 14.526695575847024, + "kl": 0.07275390625, + "learning_rate": 5.677490234375e-07, + "loss": 0.0029, + "reward": 1.7927899956703186, + "reward_std": 0.08604315388947725, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8006025552749634, + "step": 3541 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.2890625, + "epoch": 1.72998046875, + "grad_norm": 0.9340497276184674, + "kl": 0.05615234375, + "learning_rate": 5.67626953125e-07, + "loss": 0.0022, + "reward": 1.8815443515777588, + "reward_std": 0.03566223941743374, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.881544291973114, + "step": 3542 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.09375, + "epoch": 1.73046875, + "grad_norm": 1.4170493602251024, + "kl": 0.06884765625, + "learning_rate": 5.675048828124999e-07, + "loss": 0.0028, + "reward": 1.8988550901412964, + "reward_std": 0.02946687676012516, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.898855060338974, + "step": 3543 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.2109375, + "epoch": 1.73095703125, + "grad_norm": 1.6780326561924064, + "kl": 0.07470703125, + "learning_rate": 5.673828125e-07, + "loss": 0.003, + "reward": 1.7607104778289795, + "reward_std": 0.05685322359204292, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7607105076313019, + "step": 3544 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.078125, + "epoch": 1.7314453125, + "grad_norm": 1.265920665539601, + "kl": 0.07958984375, + "learning_rate": 5.672607421875e-07, + "loss": 0.0032, + "reward": 1.660174310207367, + "reward_std": 0.09357069805264473, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6679868102073669, + "step": 3545 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.890625, + "epoch": 1.73193359375, + "grad_norm": 1.3387333409709177, + "kl": 0.084228515625, + "learning_rate": 5.67138671875e-07, + "loss": 0.0034, + "reward": 1.723749816417694, + "reward_std": 0.08172390796244144, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7237497866153717, + "step": 3546 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.3984375, + "epoch": 1.732421875, + "grad_norm": 6.581068369525777, + "kl": 0.070556640625, + "learning_rate": 5.670166015625e-07, + "loss": 0.0028, + "reward": 1.8678494691848755, + "reward_std": 0.027833457104861736, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8678494393825531, + "step": 3547 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.8203125, + "epoch": 1.73291015625, + "grad_norm": 0.7849499263584183, + "kl": 0.0703125, + "learning_rate": 5.6689453125e-07, + "loss": 0.0028, + "reward": 1.8659499883651733, + "reward_std": 0.0487942174077034, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8659500479698181, + "step": 3548 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.6640625, + "epoch": 1.7333984375, + "grad_norm": 0.8256068854796749, + "kl": 0.054443359375, + "learning_rate": 5.667724609374999e-07, + "loss": 0.0022, + "reward": 1.8360854387283325, + "reward_std": 0.028709974139928818, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8360854387283325, + "step": 3549 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.9609375, + "epoch": 1.73388671875, + "grad_norm": 2.5622246174809873, + "kl": 0.104736328125, + "learning_rate": 5.666503906249999e-07, + "loss": 0.0042, + "reward": 1.6949216723442078, + "reward_std": 0.07603111118078232, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6949216425418854, + "step": 3550 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.46875, + "epoch": 1.734375, + "grad_norm": 1.885269304797715, + "kl": 0.0487060546875, + "learning_rate": 5.665283203125e-07, + "loss": 0.002, + "reward": 1.800632357597351, + "reward_std": 0.09131154417991638, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8006323575973511, + "step": 3551 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.890625, + "epoch": 1.73486328125, + "grad_norm": 1.2590360439221413, + "kl": 0.071533203125, + "learning_rate": 5.6640625e-07, + "loss": 0.0029, + "reward": 1.747345209121704, + "reward_std": 0.024599829223006964, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7473451793193817, + "step": 3552 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.5859375, + "epoch": 1.7353515625, + "grad_norm": 1.7621451023351664, + "kl": 0.0531005859375, + "learning_rate": 5.662841796875e-07, + "loss": 0.0021, + "reward": 1.8149150013923645, + "reward_std": 0.06236854917369783, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8149150013923645, + "step": 3553 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.03125, + "epoch": 1.73583984375, + "grad_norm": 1.9104865022494395, + "kl": 0.06640625, + "learning_rate": 5.66162109375e-07, + "loss": 0.0027, + "reward": 1.6441280841827393, + "reward_std": 0.02519212942570448, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6441280245780945, + "step": 3554 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.5, + "epoch": 1.736328125, + "grad_norm": 1.2578790013200032, + "kl": 0.063720703125, + "learning_rate": 5.660400390625e-07, + "loss": 0.0025, + "reward": 1.6923622488975525, + "reward_std": 0.08163776621222496, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7001748085021973, + "step": 3555 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.7734375, + "epoch": 1.73681640625, + "grad_norm": 2.277411615256479, + "kl": 0.08447265625, + "learning_rate": 5.659179687499999e-07, + "loss": 0.0034, + "reward": 1.6454498767852783, + "reward_std": 0.05983481742441654, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6454498171806335, + "step": 3556 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.59375, + "epoch": 1.7373046875, + "grad_norm": 12.33623189322276, + "kl": 0.0657958984375, + "learning_rate": 5.657958984374999e-07, + "loss": 0.0026, + "reward": 1.7834393978118896, + "reward_std": 0.042173080146312714, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7834393680095673, + "step": 3557 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.6015625, + "epoch": 1.73779296875, + "grad_norm": 1.2785230001853094, + "kl": 0.066162109375, + "learning_rate": 5.65673828125e-07, + "loss": 0.0026, + "reward": 1.8725386261940002, + "reward_std": 0.038782306015491486, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8725386261940002, + "step": 3558 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.859375, + "epoch": 1.73828125, + "grad_norm": 1.4368144722786214, + "kl": 0.0693359375, + "learning_rate": 5.655517578125e-07, + "loss": 0.0028, + "reward": 1.828158974647522, + "reward_std": 0.07489650882780552, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.828158974647522, + "step": 3559 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.4375, + "epoch": 1.73876953125, + "grad_norm": 7.317107973230845, + "kl": 0.055908203125, + "learning_rate": 5.654296875e-07, + "loss": 0.0022, + "reward": 1.8085799813270569, + "reward_std": 0.03719876706600189, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8085800111293793, + "step": 3560 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.8359375, + "epoch": 1.7392578125, + "grad_norm": 1.1775016795529225, + "kl": 0.1083984375, + "learning_rate": 5.653076171875e-07, + "loss": 0.0043, + "reward": 1.7262591123580933, + "reward_std": 0.037734927609562874, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.726259171962738, + "step": 3561 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.421875, + "epoch": 1.73974609375, + "grad_norm": 1.7178106445382675, + "kl": 0.082763671875, + "learning_rate": 5.65185546875e-07, + "loss": 0.0033, + "reward": 1.6607239246368408, + "reward_std": 0.06611571833491325, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6607239246368408, + "step": 3562 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.4375, + "epoch": 1.740234375, + "grad_norm": 3.1487378984733274, + "kl": 0.091064453125, + "learning_rate": 5.650634765624999e-07, + "loss": 0.0036, + "reward": 1.8339160084724426, + "reward_std": 0.03272883594036102, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8339160084724426, + "step": 3563 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.1328125, + "epoch": 1.74072265625, + "grad_norm": 1.7524697236338946, + "kl": 0.0654296875, + "learning_rate": 5.6494140625e-07, + "loss": 0.0026, + "reward": 1.8101386427879333, + "reward_std": 0.03559943049913272, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8101385533809662, + "step": 3564 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.9609375, + "epoch": 1.7412109375, + "grad_norm": 1.0956422342478667, + "kl": 0.072509765625, + "learning_rate": 5.648193359375e-07, + "loss": 0.0029, + "reward": 1.8012146949768066, + "reward_std": 0.020907348953187466, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8012146949768066, + "step": 3565 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.9375, + "epoch": 1.74169921875, + "grad_norm": 1.1328692001884817, + "kl": 0.08349609375, + "learning_rate": 5.64697265625e-07, + "loss": 0.0033, + "reward": 1.7395102381706238, + "reward_std": 0.046815380454063416, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7395102381706238, + "step": 3566 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.6875, + "epoch": 1.7421875, + "grad_norm": 8.860759098488048, + "kl": 0.0732421875, + "learning_rate": 5.645751953125e-07, + "loss": 0.0029, + "reward": 1.7368816137313843, + "reward_std": 0.061663146945647895, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7368816435337067, + "step": 3567 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.5625, + "epoch": 1.74267578125, + "grad_norm": 5.322487155817771, + "kl": 0.058349609375, + "learning_rate": 5.64453125e-07, + "loss": 0.0023, + "reward": 1.7705302238464355, + "reward_std": 0.09227291122078896, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7783427238464355, + "step": 3568 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.828125, + "epoch": 1.7431640625, + "grad_norm": 0.9319050653167747, + "kl": 0.0540771484375, + "learning_rate": 5.643310546874999e-07, + "loss": 0.0022, + "reward": 1.7691416144371033, + "reward_std": 0.027585056610405445, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7691416144371033, + "step": 3569 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.6015625, + "epoch": 1.74365234375, + "grad_norm": 1.2001691656551983, + "kl": 0.06689453125, + "learning_rate": 5.642089843749999e-07, + "loss": 0.0027, + "reward": 1.7046304941177368, + "reward_std": 0.06601490080356598, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7046305537223816, + "step": 3570 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.859375, + "epoch": 1.744140625, + "grad_norm": 1.88195678513803, + "kl": 0.068359375, + "learning_rate": 5.640869140625e-07, + "loss": 0.0027, + "reward": 1.8015184998512268, + "reward_std": 0.11520305648446083, + "rewards/format_reward": 0.984375, + "rewards/ocr_reward": 0.8171434998512268, + "step": 3571 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.1484375, + "epoch": 1.74462890625, + "grad_norm": 1.3393368677542685, + "kl": 0.0760498046875, + "learning_rate": 5.6396484375e-07, + "loss": 0.003, + "reward": 1.6661378145217896, + "reward_std": 0.028218165040016174, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6661378145217896, + "step": 3572 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.4296875, + "epoch": 1.7451171875, + "grad_norm": 1.3605222672595276, + "kl": 0.079833984375, + "learning_rate": 5.638427734375e-07, + "loss": 0.0032, + "reward": 1.737298607826233, + "reward_std": 0.07310641929507256, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7372985780239105, + "step": 3573 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.9921875, + "epoch": 1.74560546875, + "grad_norm": 1.103577603969494, + "kl": 0.07373046875, + "learning_rate": 5.63720703125e-07, + "loss": 0.003, + "reward": 1.732740879058838, + "reward_std": 0.06374066509306431, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7327408790588379, + "step": 3574 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.5546875, + "epoch": 1.74609375, + "grad_norm": 1.31741473565297, + "kl": 0.074462890625, + "learning_rate": 5.635986328125e-07, + "loss": 0.003, + "reward": 1.8524270057678223, + "reward_std": 0.041487690061330795, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8524269759654999, + "step": 3575 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.203125, + "epoch": 1.74658203125, + "grad_norm": 1.5223059068844398, + "kl": 0.083984375, + "learning_rate": 5.634765624999999e-07, + "loss": 0.0034, + "reward": 1.6632152795791626, + "reward_std": 0.053207699209451675, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6632152497768402, + "step": 3576 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.7421875, + "epoch": 1.7470703125, + "grad_norm": 3.2437260416145666, + "kl": 0.0511474609375, + "learning_rate": 5.633544921875e-07, + "loss": 0.002, + "reward": 1.8188891410827637, + "reward_std": 0.09965669736266136, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8188891410827637, + "step": 3577 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.2734375, + "epoch": 1.74755859375, + "grad_norm": 8.572121114129786, + "kl": 0.04833984375, + "learning_rate": 5.63232421875e-07, + "loss": 0.0019, + "reward": 1.895998477935791, + "reward_std": 0.02911460120230913, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.895998477935791, + "step": 3578 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.984375, + "epoch": 1.748046875, + "grad_norm": 1.1357685130160955, + "kl": 0.058349609375, + "learning_rate": 5.631103515625e-07, + "loss": 0.0023, + "reward": 1.8604804277420044, + "reward_std": 0.03513455484062433, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.860480397939682, + "step": 3579 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.7265625, + "epoch": 1.74853515625, + "grad_norm": 1.7682231539196884, + "kl": 0.05908203125, + "learning_rate": 5.6298828125e-07, + "loss": 0.0024, + "reward": 1.8791704773902893, + "reward_std": 0.033109684474766254, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8791704773902893, + "step": 3580 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.234375, + "epoch": 1.7490234375, + "grad_norm": 17.460259222379793, + "kl": 0.076171875, + "learning_rate": 5.628662109375e-07, + "loss": 0.003, + "reward": 1.7239627838134766, + "reward_std": 0.05568823218345642, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7239627540111542, + "step": 3581 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.59375, + "epoch": 1.74951171875, + "grad_norm": 10.60035752233288, + "kl": 0.07958984375, + "learning_rate": 5.627441406249999e-07, + "loss": 0.0032, + "reward": 1.8284227848052979, + "reward_std": 0.09722843207418919, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.8362353146076202, + "step": 3582 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.375, + "epoch": 1.75, + "grad_norm": 1.4466112179411692, + "kl": 0.0595703125, + "learning_rate": 5.626220703124999e-07, + "loss": 0.0024, + "reward": 1.7760446071624756, + "reward_std": 0.0447351299226284, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7760446071624756, + "step": 3583 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.5234375, + "epoch": 1.75048828125, + "grad_norm": 1.2053351036061268, + "kl": 0.083740234375, + "learning_rate": 5.625e-07, + "loss": 0.0033, + "reward": 1.8212696313858032, + "reward_std": 0.08149140700697899, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.821269690990448, + "step": 3584 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.5859375, + "epoch": 1.7509765625, + "grad_norm": 4.851214185716174, + "kl": 0.083984375, + "learning_rate": 5.623779296875e-07, + "loss": 0.0034, + "reward": 1.7441505193710327, + "reward_std": 0.1517154574394226, + "rewards/format_reward": 0.9765625, + "rewards/ocr_reward": 0.7675879597663879, + "step": 3585 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.5625, + "epoch": 1.75146484375, + "grad_norm": 1.179914259413158, + "kl": 0.07763671875, + "learning_rate": 5.62255859375e-07, + "loss": 0.0031, + "reward": 1.7303178310394287, + "reward_std": 0.047688692808151245, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7303178906440735, + "step": 3586 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.140625, + "epoch": 1.751953125, + "grad_norm": 1.8342388609879379, + "kl": 0.0589599609375, + "learning_rate": 5.621337890625e-07, + "loss": 0.0024, + "reward": 1.7912532687187195, + "reward_std": 0.0980726070702076, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7990657389163971, + "step": 3587 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.1875, + "epoch": 1.75244140625, + "grad_norm": 1.7620888034734943, + "kl": 0.071533203125, + "learning_rate": 5.6201171875e-07, + "loss": 0.0029, + "reward": 1.7349724173545837, + "reward_std": 0.14423664659261703, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.742784857749939, + "step": 3588 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.65625, + "epoch": 1.7529296875, + "grad_norm": 0.9893845914550516, + "kl": 0.0682373046875, + "learning_rate": 5.618896484374999e-07, + "loss": 0.0027, + "reward": 1.697411596775055, + "reward_std": 0.023635744117200375, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6974115371704102, + "step": 3589 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.0390625, + "epoch": 1.75341796875, + "grad_norm": 1.3535344923489823, + "kl": 0.0555419921875, + "learning_rate": 5.61767578125e-07, + "loss": 0.0022, + "reward": 1.799051284790039, + "reward_std": 0.01989690400660038, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7990512847900391, + "step": 3590 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.390625, + "epoch": 1.75390625, + "grad_norm": 1.3581378000044912, + "kl": 0.063720703125, + "learning_rate": 5.616455078125e-07, + "loss": 0.0025, + "reward": 1.6952258944511414, + "reward_std": 0.03106315340846777, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.6952258944511414, + "step": 3591 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.6171875, + "epoch": 1.75439453125, + "grad_norm": 2.5388820892946122, + "kl": 0.08154296875, + "learning_rate": 5.615234375e-07, + "loss": 0.0033, + "reward": 1.7009756565093994, + "reward_std": 0.09824825078248978, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.7087881565093994, + "step": 3592 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.53125, + "epoch": 1.7548828125, + "grad_norm": 1.7898213356726147, + "kl": 0.0576171875, + "learning_rate": 5.614013671875e-07, + "loss": 0.0023, + "reward": 1.7667632102966309, + "reward_std": 0.038947849068790674, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7667632400989532, + "step": 3593 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.9140625, + "epoch": 1.75537109375, + "grad_norm": 1.4385952936986826, + "kl": 0.0592041015625, + "learning_rate": 5.61279296875e-07, + "loss": 0.0024, + "reward": 1.7931809425354004, + "reward_std": 0.05069575086236, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7931809425354004, + "step": 3594 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.03125, + "epoch": 1.755859375, + "grad_norm": 2.1819507976548147, + "kl": 0.074951171875, + "learning_rate": 5.611572265624999e-07, + "loss": 0.003, + "reward": 1.8132346272468567, + "reward_std": 0.040440889075398445, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8132346570491791, + "step": 3595 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.1328125, + "epoch": 1.75634765625, + "grad_norm": 1.2064973111450872, + "kl": 0.078857421875, + "learning_rate": 5.610351562499999e-07, + "loss": 0.0031, + "reward": 1.7098599076271057, + "reward_std": 0.0605672225356102, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7098599076271057, + "step": 3596 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.921875, + "epoch": 1.7568359375, + "grad_norm": 1.7526125144836613, + "kl": 0.076171875, + "learning_rate": 5.609130859375e-07, + "loss": 0.003, + "reward": 1.747939109802246, + "reward_std": 0.07001195242628455, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7479391098022461, + "step": 3597 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.046875, + "epoch": 1.75732421875, + "grad_norm": 1.0693339957223436, + "kl": 0.0582275390625, + "learning_rate": 5.60791015625e-07, + "loss": 0.0023, + "reward": 1.6757773160934448, + "reward_std": 0.08137864619493484, + "rewards/format_reward": 0.9921875, + "rewards/ocr_reward": 0.6835898458957672, + "step": 3598 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.8984375, + "epoch": 1.7578125, + "grad_norm": 0.9296572610348767, + "kl": 0.071533203125, + "learning_rate": 5.606689453125e-07, + "loss": 0.0029, + "reward": 1.8186118006706238, + "reward_std": 0.07801494561135769, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.8186118006706238, + "step": 3599 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.2578125, + "epoch": 1.75830078125, + "grad_norm": 1.7734580005970337, + "kl": 0.070068359375, + "learning_rate": 5.60546875e-07, + "loss": 0.0028, + "reward": 1.757921278476715, + "reward_std": 0.05061543360352516, + "rewards/format_reward": 1.0, + "rewards/ocr_reward": 0.7579212486743927, + "step": 3600 + } + ], + "logging_steps": 1.0, + "max_steps": 8192, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}