{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3333333333333333, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantages": -2.384185791015625e-07, "completion_length": 101.0, "epoch": 0.0003333333333333333, "grad_norm": 2.5402183532714844, "kl": 0.000659942626953125, "learning_rate": 9.996666666666667e-07, "loss": 0.0, "reward": 0.6115533113479614, "reward_mean": 0.6115533113479614, "reward_std": 0.04060140997171402, "rewards/a_meteor_reward": 0.6115533113479614, "step": 1 }, { "advantages": 4.842877388000488e-08, "completion_length": 14.1875, "epoch": 0.0006666666666666666, "grad_norm": 13.973031044006348, "kl": 0.00018596649169921875, "learning_rate": 9.993333333333333e-07, "loss": 0.0, "reward": 0.16142894327640533, "reward_mean": 0.16142894327640533, "reward_std": 0.09404676407575607, "rewards/v_meteor_reward": 0.16142894327640533, "step": 2 }, { "advantages": -1.3969838619232178e-08, "completion_length": 33.5625, "epoch": 0.001, "grad_norm": 9.751057624816895, "kl": 0.0018463134765625, "learning_rate": 9.989999999999999e-07, "loss": 0.0001, "reward": 0.38357752561569214, "reward_mean": 0.38357752561569214, "reward_std": 0.08058954775333405, "rewards/a_meteor_reward": 0.38357752561569214, "step": 3 }, { "advantages": 1.646578311920166e-06, "completion_length": 76.5625, "epoch": 0.0013333333333333333, "grad_norm": 9.374955177307129, "kl": 0.00128173828125, "learning_rate": 9.986666666666667e-07, "loss": 0.0, "reward": 0.1374811828136444, "reward_mean": 0.1374811828136444, "reward_std": 0.026278557255864143, "rewards/v_meteor_reward": 0.1374811828136444, "step": 4 }, { "advantages": 4.6566128730773926e-09, "completion_length": 15.5625, "epoch": 0.0016666666666666668, "grad_norm": 10.907198905944824, "kl": 6.4849853515625e-05, "learning_rate": 9.983333333333332e-07, "loss": 0.0, "reward": 0.1199445128440857, "reward_mean": 0.1199445128440857, "reward_std": 0.0758294016122818, "rewards/v_meteor_reward": 0.1199445128440857, "step": 5 }, { "advantages": 0.0, "completion_length": 10.1875, "epoch": 0.002, "grad_norm": 11.182469367980957, "kl": 0.0003871917724609375, "learning_rate": 9.98e-07, "loss": 0.0, "reward": 0.125, "reward_mean": 0.125, "reward_std": 0.14433756470680237, "rewards/iou_timestamp_reward": 0.0, "rewards/t_format_reward": 0.125, "step": 6 }, { "advantages": -1.1175870895385742e-08, "completion_length": 11.9375, "epoch": 0.0023333333333333335, "grad_norm": 14.401700973510742, "kl": 0.004852294921875, "learning_rate": 9.976666666666666e-07, "loss": 0.0002, "reward": 0.3069257140159607, "reward_mean": 0.3069257140159607, "reward_std": 0.30791395902633667, "rewards/iou_timestamp_reward": 0.0569256953895092, "rewards/t_format_reward": 0.25, "step": 7 }, { "advantages": 2.60770320892334e-08, "completion_length": 281.0, "epoch": 0.0026666666666666666, "grad_norm": 10.16336441040039, "kl": 0.0013580322265625, "learning_rate": 9.973333333333332e-07, "loss": 0.0001, "reward": 0.36273080110549927, "reward_mean": 0.36273080110549927, "reward_std": 0.14579002559185028, "rewards/a_meteor_reward": 0.36273080110549927, "step": 8 }, { "advantages": 6.332993507385254e-08, "completion_length": 14.8125, "epoch": 0.003, "grad_norm": 26.37580680847168, "kl": 0.06298828125, "learning_rate": 9.97e-07, "loss": 0.0025, "reward": 1.0411381721496582, "reward_mean": 1.0411381721496582, "reward_std": 0.6881328821182251, "rewards/iou_timestamp_reward": 0.2911382019519806, "rewards/t_format_reward": 0.75, "step": 9 }, { "advantages": -8.530914783477783e-07, "completion_length": 87.3125, "epoch": 0.0033333333333333335, "grad_norm": 4.604795455932617, "kl": 0.0004558563232421875, "learning_rate": 9.966666666666667e-07, "loss": 0.0, "reward": 0.4058172106742859, "reward_mean": 0.4058172106742859, "reward_std": 0.03221432492136955, "rewards/a_meteor_reward": 0.4058172106742859, "step": 10 }, { "advantages": -8.195638656616211e-08, "completion_length": 14.75, "epoch": 0.0036666666666666666, "grad_norm": 7.5022969245910645, "kl": 0.064453125, "learning_rate": 9.963333333333333e-07, "loss": 0.0026, "reward": 1.2750389575958252, "reward_mean": 1.2750389575958252, "reward_std": 0.3316720426082611, "rewards/iou_timestamp_reward": 0.3375389575958252, "rewards/t_format_reward": 0.9375, "step": 11 }, { "advantages": -3.427267074584961e-07, "completion_length": 33.1875, "epoch": 0.004, "grad_norm": 6.995570659637451, "kl": 0.000690460205078125, "learning_rate": 9.959999999999999e-07, "loss": 0.0, "reward": 0.3956291079521179, "reward_mean": 0.3956291079521179, "reward_std": 0.023739833384752274, "rewards/a_meteor_reward": 0.3956291079521179, "step": 12 }, { "advantages": 4.842877388000488e-08, "completion_length": 155.3125, "epoch": 0.004333333333333333, "grad_norm": 14.567049026489258, "kl": 0.005615234375, "learning_rate": 9.956666666666666e-07, "loss": 0.0002, "reward": 0.1286456286907196, "reward_mean": 0.1286456286907196, "reward_std": 0.10959772020578384, "rewards/a_meteor_reward": 0.1286456286907196, "step": 13 }, { "advantages": -4.852190613746643e-07, "completion_length": 14.4375, "epoch": 0.004666666666666667, "grad_norm": 12.244415283203125, "kl": 0.0634765625, "learning_rate": 9.953333333333332e-07, "loss": 0.0026, "reward": 1.200040578842163, "reward_mean": 1.200040578842163, "reward_std": 0.08909125626087189, "rewards/iou_timestamp_reward": 0.20004045963287354, "rewards/t_format_reward": 1.0, "step": 14 }, { "advantages": -1.862645149230957e-07, "completion_length": 15.3125, "epoch": 0.005, "grad_norm": 7.899139881134033, "kl": 0.02978515625, "learning_rate": 9.95e-07, "loss": 0.0012, "reward": 1.2055667638778687, "reward_mean": 1.2055667638778687, "reward_std": 0.12087348848581314, "rewards/iou_timestamp_reward": 0.20556670427322388, "rewards/t_format_reward": 1.0, "step": 15 }, { "advantages": -3.3527612686157227e-08, "completion_length": 18.8125, "epoch": 0.005333333333333333, "grad_norm": 13.358997344970703, "kl": 0.00738525390625, "learning_rate": 9.946666666666666e-07, "loss": 0.0003, "reward": 0.1260966956615448, "reward_mean": 0.1260966956615448, "reward_std": 0.10400636494159698, "rewards/v_meteor_reward": 0.1260966956615448, "step": 16 }, { "advantages": 3.3527612686157227e-08, "completion_length": 25.8125, "epoch": 0.005666666666666667, "grad_norm": 11.948675155639648, "kl": 0.00823974609375, "learning_rate": 9.943333333333331e-07, "loss": 0.0003, "reward": 0.10011281073093414, "reward_mean": 0.10011281073093414, "reward_std": 0.04693843424320221, "rewards/v_meteor_reward": 0.10011281073093414, "step": 17 }, { "advantages": 8.754432201385498e-08, "completion_length": 40.125, "epoch": 0.006, "grad_norm": 9.123346328735352, "kl": 0.0069580078125, "learning_rate": 9.94e-07, "loss": 0.0003, "reward": 0.15726304054260254, "reward_mean": 0.15726304054260254, "reward_std": 0.06641571968793869, "rewards/v_meteor_reward": 0.15726304054260254, "step": 18 }, { "advantages": -1.8998980522155762e-07, "completion_length": 15.625, "epoch": 0.006333333333333333, "grad_norm": 6.107767105102539, "kl": 0.0634765625, "learning_rate": 9.936666666666667e-07, "loss": 0.0025, "reward": 1.5461912155151367, "reward_mean": 1.5461912155151367, "reward_std": 0.2295360565185547, "rewards/iou_timestamp_reward": 0.5461910963058472, "rewards/t_format_reward": 1.0, "step": 19 }, { "advantages": 1.2479722499847412e-07, "completion_length": 16.0625, "epoch": 0.006666666666666667, "grad_norm": 4.278800010681152, "kl": 0.05615234375, "learning_rate": 9.933333333333333e-07, "loss": 0.0022, "reward": 1.1681230068206787, "reward_mean": 1.1681230068206787, "reward_std": 0.12982547283172607, "rewards/iou_timestamp_reward": 0.1681230366230011, "rewards/t_format_reward": 1.0, "step": 20 }, { "advantages": 0.0, "completion_length": 14.1875, "epoch": 0.007, "grad_norm": 7.504348278045654, "kl": 0.08740234375, "learning_rate": 9.929999999999999e-07, "loss": 0.0035, "reward": 1.2965755462646484, "reward_mean": 1.2965755462646484, "reward_std": 0.19274276494979858, "rewards/iou_timestamp_reward": 0.2965756058692932, "rewards/t_format_reward": 1.0, "step": 21 }, { "advantages": -9.313225746154785e-08, "completion_length": 282.9375, "epoch": 0.007333333333333333, "grad_norm": 2.931668281555176, "kl": 0.00286865234375, "learning_rate": 9.926666666666666e-07, "loss": 0.0001, "reward": 0.3277899920940399, "reward_mean": 0.3277899920940399, "reward_std": 0.11091656237840652, "rewards/a_meteor_reward": 0.3277899920940399, "step": 22 }, { "advantages": -1.2293457984924316e-07, "completion_length": 53.0625, "epoch": 0.007666666666666666, "grad_norm": 3.7461183071136475, "kl": 0.01141357421875, "learning_rate": 9.923333333333332e-07, "loss": 0.0005, "reward": 0.06904295086860657, "reward_mean": 0.06904295086860657, "reward_std": 0.006471453700214624, "rewards/v_meteor_reward": 0.06904295086860657, "step": 23 }, { "advantages": -3.725290298461914e-08, "completion_length": 15.125, "epoch": 0.008, "grad_norm": 5.493320465087891, "kl": 0.060791015625, "learning_rate": 9.92e-07, "loss": 0.0024, "reward": 1.1666371822357178, "reward_mean": 1.1666371822357178, "reward_std": 0.1544974446296692, "rewards/iou_timestamp_reward": 0.16663724184036255, "rewards/t_format_reward": 1.0, "step": 24 }, { "advantages": 7.35744833946228e-08, "completion_length": 15.1875, "epoch": 0.008333333333333333, "grad_norm": 7.952383518218994, "kl": 0.04052734375, "learning_rate": 9.916666666666666e-07, "loss": 0.0016, "reward": 1.3359739780426025, "reward_mean": 1.3359739780426025, "reward_std": 0.1132359504699707, "rewards/iou_timestamp_reward": 0.3359740078449249, "rewards/t_format_reward": 1.0, "step": 25 }, { "advantages": 1.341104507446289e-07, "completion_length": 41.1875, "epoch": 0.008666666666666666, "grad_norm": 6.880733013153076, "kl": 0.00897216796875, "learning_rate": 9.913333333333333e-07, "loss": 0.0004, "reward": 0.08443792164325714, "reward_mean": 0.08443792164325714, "reward_std": 0.010595927014946938, "rewards/v_meteor_reward": 0.08443792164325714, "step": 26 }, { "advantages": 3.598630428314209e-06, "completion_length": 90.0, "epoch": 0.009, "grad_norm": 6.438241004943848, "kl": 0.0013427734375, "learning_rate": 9.91e-07, "loss": 0.0, "reward": 0.4687597453594208, "reward_mean": 0.4687597453594208, "reward_std": 0.02755456045269966, "rewards/a_meteor_reward": 0.4687597453594208, "step": 27 }, { "advantages": 1.1175870895385742e-08, "completion_length": 179.5, "epoch": 0.009333333333333334, "grad_norm": 4.246849060058594, "kl": 0.016845703125, "learning_rate": 9.906666666666667e-07, "loss": 0.0007, "reward": 0.3106178641319275, "reward_mean": 0.3106178641319275, "reward_std": 0.13001520931720734, "rewards/a_meteor_reward": 0.3106178641319275, "step": 28 }, { "advantages": 2.2351741790771484e-08, "completion_length": 261.1875, "epoch": 0.009666666666666667, "grad_norm": 2.6727449893951416, "kl": 0.011962890625, "learning_rate": 9.903333333333333e-07, "loss": 0.0005, "reward": 0.34186387062072754, "reward_mean": 0.34186387062072754, "reward_std": 0.034738704562187195, "rewards/a_meteor_reward": 0.34186387062072754, "step": 29 }, { "advantages": -9.723007678985596e-07, "completion_length": 239.875, "epoch": 0.01, "grad_norm": 3.132370948791504, "kl": 0.00058746337890625, "learning_rate": 9.9e-07, "loss": 0.0, "reward": 0.5484164357185364, "reward_mean": 0.5484164357185364, "reward_std": 0.014626091346144676, "rewards/a_meteor_reward": 0.5484164357185364, "step": 30 }, { "advantages": -5.029141902923584e-08, "completion_length": 62.875, "epoch": 0.010333333333333333, "grad_norm": 8.629778861999512, "kl": 0.033203125, "learning_rate": 9.896666666666666e-07, "loss": 0.0013, "reward": 0.1254253387451172, "reward_mean": 0.1254253387451172, "reward_std": 0.06880996376276016, "rewards/v_meteor_reward": 0.1254253387451172, "step": 31 }, { "advantages": -5.587935447692871e-08, "completion_length": 324.125, "epoch": 0.010666666666666666, "grad_norm": 5.252091884613037, "kl": 0.020751953125, "learning_rate": 9.893333333333332e-07, "loss": 0.0008, "reward": 0.3805185854434967, "reward_mean": 0.3805185854434967, "reward_std": 0.11986806243658066, "rewards/a_meteor_reward": 0.3805185854434967, "step": 32 }, { "advantages": -1.1175870895385742e-08, "completion_length": 81.875, "epoch": 0.011, "grad_norm": 7.561497211456299, "kl": 0.0311279296875, "learning_rate": 9.89e-07, "loss": 0.0012, "reward": 0.17030546069145203, "reward_mean": 0.17030546069145203, "reward_std": 0.05711165815591812, "rewards/a_meteor_reward": 0.17030546069145203, "step": 33 }, { "advantages": 1.6763806343078613e-07, "completion_length": 54.625, "epoch": 0.011333333333333334, "grad_norm": 8.652045249938965, "kl": 0.0208740234375, "learning_rate": 9.886666666666665e-07, "loss": 0.0008, "reward": 0.15539142489433289, "reward_mean": 0.15539142489433289, "reward_std": 0.04058125615119934, "rewards/v_meteor_reward": 0.15539142489433289, "step": 34 }, { "advantages": -4.0978193283081055e-08, "completion_length": 66.0625, "epoch": 0.011666666666666667, "grad_norm": 13.000102996826172, "kl": 0.06005859375, "learning_rate": 9.883333333333333e-07, "loss": 0.0024, "reward": 0.18552859127521515, "reward_mean": 0.18552859127521515, "reward_std": 0.09346804022789001, "rewards/v_meteor_reward": 0.18552859127521515, "step": 35 }, { "advantages": 5.774199962615967e-08, "completion_length": 16.5625, "epoch": 0.012, "grad_norm": 4.7450127601623535, "kl": 0.07177734375, "learning_rate": 9.88e-07, "loss": 0.0029, "reward": 1.1357805728912354, "reward_mean": 1.1357805728912354, "reward_std": 0.054469890892505646, "rewards/iou_timestamp_reward": 0.1357804834842682, "rewards/t_format_reward": 1.0, "step": 36 }, { "advantages": 3.725290298461914e-08, "completion_length": 55.6875, "epoch": 0.012333333333333333, "grad_norm": 8.798715591430664, "kl": 0.07958984375, "learning_rate": 9.876666666666667e-07, "loss": 0.0032, "reward": 0.20064634084701538, "reward_mean": 0.20064634084701538, "reward_std": 0.1137518659234047, "rewards/v_meteor_reward": 0.20064634084701538, "step": 37 }, { "advantages": 6.705522537231445e-08, "completion_length": 36.125, "epoch": 0.012666666666666666, "grad_norm": 10.945947647094727, "kl": 0.08056640625, "learning_rate": 9.873333333333333e-07, "loss": 0.0032, "reward": 0.18528231978416443, "reward_mean": 0.18528231978416443, "reward_std": 0.08586443960666656, "rewards/v_meteor_reward": 0.18528231978416443, "step": 38 }, { "advantages": 1.471489667892456e-07, "completion_length": 16.1875, "epoch": 0.013, "grad_norm": 5.416378021240234, "kl": 0.1103515625, "learning_rate": 9.87e-07, "loss": 0.0044, "reward": 1.3143255710601807, "reward_mean": 1.3143255710601807, "reward_std": 0.1600990742444992, "rewards/iou_timestamp_reward": 0.3143255114555359, "rewards/t_format_reward": 1.0, "step": 39 }, { "advantages": -3.390014171600342e-07, "completion_length": 16.0, "epoch": 0.013333333333333334, "grad_norm": 7.088622570037842, "kl": 0.08837890625, "learning_rate": 9.866666666666666e-07, "loss": 0.0035, "reward": 1.5420372486114502, "reward_mean": 1.5420372486114502, "reward_std": 0.17091229557991028, "rewards/iou_timestamp_reward": 0.5420371294021606, "rewards/t_format_reward": 1.0, "step": 40 }, { "advantages": -2.3096799850463867e-07, "completion_length": 16.3125, "epoch": 0.013666666666666667, "grad_norm": 6.0945258140563965, "kl": 0.0849609375, "learning_rate": 9.863333333333332e-07, "loss": 0.0034, "reward": 1.3367459774017334, "reward_mean": 1.3367459774017334, "reward_std": 0.2646312713623047, "rewards/iou_timestamp_reward": 0.336745947599411, "rewards/t_format_reward": 1.0, "step": 41 }, { "advantages": 1.0058283805847168e-07, "completion_length": 15.0625, "epoch": 0.014, "grad_norm": 6.868780136108398, "kl": 0.1103515625, "learning_rate": 9.86e-07, "loss": 0.0044, "reward": 1.3071861267089844, "reward_mean": 1.3071861267089844, "reward_std": 0.20395562052726746, "rewards/iou_timestamp_reward": 0.30718615651130676, "rewards/t_format_reward": 1.0, "step": 42 }, { "advantages": -4.0978193283081055e-08, "completion_length": 120.9375, "epoch": 0.014333333333333333, "grad_norm": 9.675359725952148, "kl": 0.0220947265625, "learning_rate": 9.856666666666667e-07, "loss": 0.0009, "reward": 0.1490117907524109, "reward_mean": 0.1490117907524109, "reward_std": 0.07224567234516144, "rewards/v_meteor_reward": 0.1490117907524109, "step": 43 }, { "advantages": -2.421438694000244e-07, "completion_length": 23.5625, "epoch": 0.014666666666666666, "grad_norm": 10.192291259765625, "kl": 0.0184326171875, "learning_rate": 9.853333333333333e-07, "loss": 0.0007, "reward": 0.05797901749610901, "reward_mean": 0.05797901749610901, "reward_std": 0.02320011705160141, "rewards/v_meteor_reward": 0.05797901749610901, "step": 44 }, { "advantages": 1.8067657947540283e-07, "completion_length": 14.8125, "epoch": 0.015, "grad_norm": 6.661442756652832, "kl": 0.07373046875, "learning_rate": 9.849999999999999e-07, "loss": 0.0029, "reward": 1.652003288269043, "reward_mean": 1.652003288269043, "reward_std": 0.15927281975746155, "rewards/iou_timestamp_reward": 0.6520033478736877, "rewards/t_format_reward": 1.0, "step": 45 }, { "advantages": 1.1175870895385742e-08, "completion_length": 15.1875, "epoch": 0.015333333333333332, "grad_norm": 4.517682075500488, "kl": 0.1103515625, "learning_rate": 9.846666666666667e-07, "loss": 0.0044, "reward": 1.6551637649536133, "reward_mean": 1.6551637649536133, "reward_std": 0.18847060203552246, "rewards/iou_timestamp_reward": 0.6551637649536133, "rewards/t_format_reward": 1.0, "step": 46 }, { "advantages": 8.940696716308594e-08, "completion_length": 14.75, "epoch": 0.015666666666666666, "grad_norm": 5.813600063323975, "kl": 0.0888671875, "learning_rate": 9.843333333333332e-07, "loss": 0.0036, "reward": 1.6079601049423218, "reward_mean": 1.6079601049423218, "reward_std": 0.17017459869384766, "rewards/iou_timestamp_reward": 0.6079601645469666, "rewards/t_format_reward": 1.0, "step": 47 }, { "advantages": 3.1329691410064697e-06, "completion_length": 15.3125, "epoch": 0.016, "grad_norm": 6.380448818206787, "kl": 0.08056640625, "learning_rate": 9.84e-07, "loss": 0.0032, "reward": 1.354614019393921, "reward_mean": 1.354614019393921, "reward_std": 0.1052875965833664, "rewards/iou_timestamp_reward": 0.3546140193939209, "rewards/t_format_reward": 1.0, "step": 48 }, { "advantages": -1.6763806343078613e-08, "completion_length": 37.25, "epoch": 0.01633333333333333, "grad_norm": 8.184245109558105, "kl": 0.11083984375, "learning_rate": 9.836666666666666e-07, "loss": 0.0044, "reward": 0.2494458258152008, "reward_mean": 0.2494458258152008, "reward_std": 0.07414299249649048, "rewards/v_meteor_reward": 0.2494458258152008, "step": 49 }, { "advantages": 2.421438694000244e-08, "completion_length": 15.8125, "epoch": 0.016666666666666666, "grad_norm": 12.26054573059082, "kl": 0.06591796875, "learning_rate": 9.833333333333332e-07, "loss": 0.0026, "reward": 1.4482377767562866, "reward_mean": 1.4482377767562866, "reward_std": 0.26251208782196045, "rewards/iou_timestamp_reward": 0.4482377767562866, "rewards/t_format_reward": 1.0, "step": 50 }, { "advantages": -3.7066638469696045e-07, "completion_length": 35.5, "epoch": 0.017, "grad_norm": 5.7489542961120605, "kl": 0.004730224609375, "learning_rate": 9.83e-07, "loss": 0.0002, "reward": 0.5013377666473389, "reward_mean": 0.5013377666473389, "reward_std": 0.04430757090449333, "rewards/a_meteor_reward": 0.5013377666473389, "step": 51 }, { "advantages": -5.21540641784668e-08, "completion_length": 601.75, "epoch": 0.017333333333333333, "grad_norm": 1.627846598625183, "kl": 0.0032196044921875, "learning_rate": 9.826666666666667e-07, "loss": 0.0001, "reward": 0.32330283522605896, "reward_mean": 0.32330283522605896, "reward_std": 0.08750564604997635, "rewards/a_meteor_reward": 0.32330283522605896, "step": 52 }, { "advantages": -2.2351741790771484e-08, "completion_length": 191.875, "epoch": 0.017666666666666667, "grad_norm": 5.513625144958496, "kl": 0.041748046875, "learning_rate": 9.823333333333333e-07, "loss": 0.0017, "reward": 0.29467225074768066, "reward_mean": 0.29467225074768066, "reward_std": 0.1500266194343567, "rewards/a_meteor_reward": 0.29467225074768066, "step": 53 }, { "advantages": 4.172325134277344e-07, "completion_length": 236.9375, "epoch": 0.018, "grad_norm": 4.818164348602295, "kl": 0.0908203125, "learning_rate": 9.819999999999999e-07, "loss": 0.0036, "reward": 0.4115750789642334, "reward_mean": 0.4115750789642334, "reward_std": 0.10925880819559097, "rewards/a_meteor_reward": 0.4115750789642334, "step": 54 }, { "advantages": -1.4901161193847656e-08, "completion_length": 74.9375, "epoch": 0.018333333333333333, "grad_norm": 10.010245323181152, "kl": 0.10302734375, "learning_rate": 9.816666666666667e-07, "loss": 0.0041, "reward": 0.21779310703277588, "reward_mean": 0.21779310703277588, "reward_std": 0.0911189615726471, "rewards/v_meteor_reward": 0.21779310703277588, "step": 55 }, { "advantages": 7.450580596923828e-09, "completion_length": 49.3125, "epoch": 0.018666666666666668, "grad_norm": 9.415597915649414, "kl": 0.10693359375, "learning_rate": 9.813333333333332e-07, "loss": 0.0043, "reward": 0.21439437568187714, "reward_mean": 0.21439437568187714, "reward_std": 0.11204023659229279, "rewards/v_meteor_reward": 0.21439437568187714, "step": 56 }, { "advantages": 2.2351741790771484e-08, "completion_length": 44.25, "epoch": 0.019, "grad_norm": 12.997027397155762, "kl": 0.45703125, "learning_rate": 9.81e-07, "loss": 0.0183, "reward": 0.2595212161540985, "reward_mean": 0.2595212161540985, "reward_std": 0.09712166339159012, "rewards/v_meteor_reward": 0.2595212161540985, "step": 57 }, { "advantages": 0.0, "completion_length": 39.8125, "epoch": 0.019333333333333334, "grad_norm": 9.700669288635254, "kl": 0.036376953125, "learning_rate": 9.806666666666666e-07, "loss": 0.0015, "reward": 0.2057061493396759, "reward_mean": 0.2057061493396759, "reward_std": 0.11181198060512543, "rewards/v_meteor_reward": 0.2057061493396759, "step": 58 }, { "advantages": -2.0489096641540527e-07, "completion_length": 15.125, "epoch": 0.019666666666666666, "grad_norm": 7.792709827423096, "kl": 0.115234375, "learning_rate": 9.803333333333332e-07, "loss": 0.0046, "reward": 1.392256259918213, "reward_mean": 1.392256259918213, "reward_std": 0.13876548409461975, "rewards/iou_timestamp_reward": 0.39225631952285767, "rewards/t_format_reward": 1.0, "step": 59 }, { "advantages": 2.2724270820617676e-07, "completion_length": 16.375, "epoch": 0.02, "grad_norm": 5.400968074798584, "kl": 0.1044921875, "learning_rate": 9.8e-07, "loss": 0.0042, "reward": 1.3651189804077148, "reward_mean": 1.3651189804077148, "reward_std": 0.14512097835540771, "rewards/iou_timestamp_reward": 0.36511898040771484, "rewards/t_format_reward": 1.0, "step": 60 }, { "advantages": -9.033828973770142e-08, "completion_length": 16.0625, "epoch": 0.02033333333333333, "grad_norm": 5.843118190765381, "kl": 0.08935546875, "learning_rate": 9.796666666666667e-07, "loss": 0.0036, "reward": 1.6153221130371094, "reward_mean": 1.6153221130371094, "reward_std": 0.2675389051437378, "rewards/iou_timestamp_reward": 0.6153220534324646, "rewards/t_format_reward": 1.0, "step": 61 }, { "advantages": -3.46451997756958e-07, "completion_length": 16.0, "epoch": 0.020666666666666667, "grad_norm": 5.339211463928223, "kl": 0.177734375, "learning_rate": 9.793333333333333e-07, "loss": 0.0071, "reward": 1.4900802373886108, "reward_mean": 1.4900802373886108, "reward_std": 0.24816764891147614, "rewards/iou_timestamp_reward": 0.49008020758628845, "rewards/t_format_reward": 1.0, "step": 62 }, { "advantages": -5.699694156646729e-07, "completion_length": 57.3125, "epoch": 0.021, "grad_norm": 8.780570983886719, "kl": 0.061767578125, "learning_rate": 9.789999999999999e-07, "loss": 0.0025, "reward": 0.14250430464744568, "reward_mean": 0.14250430464744568, "reward_std": 0.04000229015946388, "rewards/v_meteor_reward": 0.14250430464744568, "step": 63 }, { "advantages": 2.60770320892334e-08, "completion_length": 23.0, "epoch": 0.021333333333333333, "grad_norm": 10.809118270874023, "kl": 0.166015625, "learning_rate": 9.786666666666666e-07, "loss": 0.0066, "reward": 0.15076887607574463, "reward_mean": 0.15076887607574463, "reward_std": 0.07303221523761749, "rewards/v_meteor_reward": 0.15076887607574463, "step": 64 }, { "advantages": -2.2351741790771484e-08, "completion_length": 39.125, "epoch": 0.021666666666666667, "grad_norm": 7.282024383544922, "kl": 0.126953125, "learning_rate": 9.783333333333334e-07, "loss": 0.0051, "reward": 0.24037489295005798, "reward_mean": 0.24037489295005798, "reward_std": 0.07491183280944824, "rewards/v_meteor_reward": 0.24037489295005798, "step": 65 }, { "advantages": -3.632158041000366e-07, "completion_length": 15.375, "epoch": 0.022, "grad_norm": 10.100847244262695, "kl": 0.119140625, "learning_rate": 9.78e-07, "loss": 0.0047, "reward": 1.2945668697357178, "reward_mean": 1.2945668697357178, "reward_std": 0.07659132778644562, "rewards/iou_timestamp_reward": 0.2945667803287506, "rewards/t_format_reward": 1.0, "step": 66 }, { "advantages": -1.862645149230957e-08, "completion_length": 367.9375, "epoch": 0.022333333333333334, "grad_norm": 1.4799838066101074, "kl": 0.012451171875, "learning_rate": 9.776666666666666e-07, "loss": 0.0005, "reward": 0.4855964481830597, "reward_mean": 0.4855964481830597, "reward_std": 0.16220612823963165, "rewards/a_meteor_reward": 0.4855964481830597, "step": 67 }, { "advantages": 1.9185245037078857e-07, "completion_length": 31.5, "epoch": 0.02266666666666667, "grad_norm": 5.611546993255615, "kl": 0.0174560546875, "learning_rate": 9.773333333333333e-07, "loss": 0.0007, "reward": 0.65589439868927, "reward_mean": 0.65589439868927, "reward_std": 0.018078040331602097, "rewards/a_meteor_reward": 0.65589439868927, "step": 68 }, { "advantages": 1.3224780559539795e-07, "completion_length": 63.0625, "epoch": 0.023, "grad_norm": 7.750249862670898, "kl": 0.00830078125, "learning_rate": 9.77e-07, "loss": 0.0003, "reward": 0.4357224106788635, "reward_mean": 0.4357224106788635, "reward_std": 0.0916471928358078, "rewards/a_meteor_reward": 0.4357224106788635, "step": 69 }, { "advantages": 3.041699528694153e-06, "completion_length": 14.9375, "epoch": 0.023333333333333334, "grad_norm": 8.70356273651123, "kl": 0.08837890625, "learning_rate": 9.766666666666667e-07, "loss": 0.0035, "reward": 1.4397003650665283, "reward_mean": 1.4397003650665283, "reward_std": 0.09882628172636032, "rewards/iou_timestamp_reward": 0.4397004544734955, "rewards/t_format_reward": 1.0, "step": 70 }, { "advantages": -4.284083843231201e-08, "completion_length": 41.75, "epoch": 0.023666666666666666, "grad_norm": 8.626035690307617, "kl": 0.15234375, "learning_rate": 9.763333333333333e-07, "loss": 0.0061, "reward": 0.24369995296001434, "reward_mean": 0.24369995296001434, "reward_std": 0.085405632853508, "rewards/v_meteor_reward": 0.24369995296001434, "step": 71 }, { "advantages": -4.284083843231201e-08, "completion_length": 100.5625, "epoch": 0.024, "grad_norm": 10.16545581817627, "kl": 0.08203125, "learning_rate": 9.759999999999998e-07, "loss": 0.0033, "reward": 0.19306381046772003, "reward_mean": 0.19306381046772003, "reward_std": 0.08142146468162537, "rewards/v_meteor_reward": 0.19306381046772003, "step": 72 }, { "advantages": -1.5273690223693848e-07, "completion_length": 15.5, "epoch": 0.024333333333333332, "grad_norm": 6.949907302856445, "kl": 0.150390625, "learning_rate": 9.756666666666666e-07, "loss": 0.006, "reward": 1.6330397129058838, "reward_mean": 1.6330397129058838, "reward_std": 0.1594752073287964, "rewards/iou_timestamp_reward": 0.6330397725105286, "rewards/t_format_reward": 1.0, "step": 73 }, { "advantages": 1.773238182067871e-06, "completion_length": 16.0, "epoch": 0.024666666666666667, "grad_norm": 4.653798580169678, "kl": 0.08935546875, "learning_rate": 9.753333333333334e-07, "loss": 0.0036, "reward": 1.3714673519134521, "reward_mean": 1.3714673519134521, "reward_std": 0.062404565513134, "rewards/iou_timestamp_reward": 0.37146735191345215, "rewards/t_format_reward": 1.0, "step": 74 }, { "advantages": -2.1420419216156006e-08, "completion_length": 27.25, "epoch": 0.025, "grad_norm": 10.945286750793457, "kl": 0.05908203125, "learning_rate": 9.75e-07, "loss": 0.0024, "reward": 0.11463064700365067, "reward_mean": 0.11463064700365067, "reward_std": 0.032095007598400116, "rewards/a_meteor_reward": 0.11463064700365067, "step": 75 }, { "advantages": 1.1175870895385742e-07, "completion_length": 16.0, "epoch": 0.025333333333333333, "grad_norm": 5.327963352203369, "kl": 0.134765625, "learning_rate": 9.746666666666666e-07, "loss": 0.0054, "reward": 1.8515534400939941, "reward_mean": 1.8515534400939941, "reward_std": 0.07561706006526947, "rewards/iou_timestamp_reward": 0.8515534400939941, "rewards/t_format_reward": 1.0, "step": 76 }, { "advantages": -4.470348358154297e-08, "completion_length": 75.6875, "epoch": 0.025666666666666667, "grad_norm": 5.755127906799316, "kl": 0.08544921875, "learning_rate": 9.743333333333333e-07, "loss": 0.0034, "reward": 0.31156125664711, "reward_mean": 0.31156125664711, "reward_std": 0.10285411030054092, "rewards/v_meteor_reward": 0.31156125664711, "step": 77 }, { "advantages": -2.9802322387695312e-08, "completion_length": 102.125, "epoch": 0.026, "grad_norm": 5.3092451095581055, "kl": 0.125, "learning_rate": 9.74e-07, "loss": 0.005, "reward": 0.33842015266418457, "reward_mean": 0.33842015266418457, "reward_std": 0.06552913039922714, "rewards/v_meteor_reward": 0.33842015266418457, "step": 78 }, { "advantages": 1.2293457984924316e-07, "completion_length": 285.75, "epoch": 0.026333333333333334, "grad_norm": 2.1406848430633545, "kl": 0.033203125, "learning_rate": 9.736666666666667e-07, "loss": 0.0013, "reward": 0.49545618891716003, "reward_mean": 0.49545618891716003, "reward_std": 0.06459375470876694, "rewards/a_meteor_reward": 0.49545618891716003, "step": 79 }, { "advantages": 2.7194619178771973e-07, "completion_length": 16.5, "epoch": 0.02666666666666667, "grad_norm": 4.033078670501709, "kl": 0.0810546875, "learning_rate": 9.733333333333333e-07, "loss": 0.0033, "reward": 1.6211519241333008, "reward_mean": 1.6211519241333008, "reward_std": 0.14778578281402588, "rewards/iou_timestamp_reward": 0.6211519837379456, "rewards/t_format_reward": 1.0, "step": 80 }, { "advantages": -3.7997961044311523e-07, "completion_length": 17.0, "epoch": 0.027, "grad_norm": 5.206119537353516, "kl": 0.1220703125, "learning_rate": 9.729999999999998e-07, "loss": 0.0049, "reward": 1.326761245727539, "reward_mean": 1.326761245727539, "reward_std": 0.05654191970825195, "rewards/iou_timestamp_reward": 0.3267611563205719, "rewards/t_format_reward": 1.0, "step": 81 }, { "advantages": -7.450580596923828e-09, "completion_length": 162.0, "epoch": 0.027333333333333334, "grad_norm": 3.603849411010742, "kl": 0.04931640625, "learning_rate": 9.726666666666666e-07, "loss": 0.002, "reward": 0.597176730632782, "reward_mean": 0.597176730632782, "reward_std": 0.08925116062164307, "rewards/a_meteor_reward": 0.597176730632782, "step": 82 }, { "advantages": 1.7881393432617188e-07, "completion_length": 15.75, "epoch": 0.027666666666666666, "grad_norm": 6.882969856262207, "kl": 0.125, "learning_rate": 9.723333333333334e-07, "loss": 0.005, "reward": 1.691230058670044, "reward_mean": 1.691230058670044, "reward_std": 0.08068326115608215, "rewards/iou_timestamp_reward": 0.6912299990653992, "rewards/t_format_reward": 1.0, "step": 83 }, { "advantages": 0.0, "completion_length": 15.25, "epoch": 0.028, "grad_norm": 6.1427459716796875, "kl": 0.06689453125, "learning_rate": 9.72e-07, "loss": 0.0027, "reward": 1.3220338821411133, "reward_mean": 1.3220338821411133, "reward_std": 0.11557573080062866, "rewards/iou_timestamp_reward": 0.32203394174575806, "rewards/t_format_reward": 1.0, "step": 84 }, { "advantages": -1.862645149230957e-08, "completion_length": 73.4375, "epoch": 0.028333333333333332, "grad_norm": 6.435576438903809, "kl": 0.1015625, "learning_rate": 9.716666666666665e-07, "loss": 0.0041, "reward": 0.3587035536766052, "reward_mean": 0.3587035536766052, "reward_std": 0.14621688425540924, "rewards/v_meteor_reward": 0.3587035536766052, "step": 85 }, { "advantages": 2.3096799850463867e-07, "completion_length": 15.9375, "epoch": 0.028666666666666667, "grad_norm": 6.326730728149414, "kl": 0.09423828125, "learning_rate": 9.713333333333333e-07, "loss": 0.0038, "reward": 1.6387945413589478, "reward_mean": 1.6387945413589478, "reward_std": 0.17399975657463074, "rewards/iou_timestamp_reward": 0.6387945413589478, "rewards/t_format_reward": 1.0, "step": 86 }, { "advantages": 2.60770320892334e-08, "completion_length": 15.5625, "epoch": 0.029, "grad_norm": 9.553888320922852, "kl": 0.1767578125, "learning_rate": 9.709999999999999e-07, "loss": 0.0071, "reward": 1.516904592514038, "reward_mean": 1.516904592514038, "reward_std": 0.09101510792970657, "rewards/iou_timestamp_reward": 0.5169045925140381, "rewards/t_format_reward": 1.0, "step": 87 }, { "advantages": 1.4528632164001465e-07, "completion_length": 15.1875, "epoch": 0.029333333333333333, "grad_norm": 5.465631484985352, "kl": 0.11865234375, "learning_rate": 9.706666666666667e-07, "loss": 0.0048, "reward": 1.5967228412628174, "reward_mean": 1.5967228412628174, "reward_std": 0.08286619186401367, "rewards/iou_timestamp_reward": 0.5967228412628174, "rewards/t_format_reward": 1.0, "step": 88 }, { "advantages": -2.60770320892334e-08, "completion_length": 14.4375, "epoch": 0.029666666666666668, "grad_norm": 9.020552635192871, "kl": 0.1279296875, "learning_rate": 9.703333333333332e-07, "loss": 0.0051, "reward": 1.576202392578125, "reward_mean": 1.576202392578125, "reward_std": 0.23134055733680725, "rewards/iou_timestamp_reward": 0.5762023329734802, "rewards/t_format_reward": 1.0, "step": 89 }, { "advantages": 1.7136335372924805e-07, "completion_length": 16.1875, "epoch": 0.03, "grad_norm": 9.228470802307129, "kl": 0.08154296875, "learning_rate": 9.7e-07, "loss": 0.0033, "reward": 1.1671541929244995, "reward_mean": 1.1671541929244995, "reward_std": 0.15509943664073944, "rewards/iou_timestamp_reward": 0.1671541929244995, "rewards/t_format_reward": 1.0, "step": 90 }, { "advantages": 3.725290298461914e-08, "completion_length": 15.5, "epoch": 0.030333333333333334, "grad_norm": 8.227510452270508, "kl": 0.1591796875, "learning_rate": 9.696666666666666e-07, "loss": 0.0064, "reward": 1.6748456954956055, "reward_mean": 1.6748456954956055, "reward_std": 0.1766783893108368, "rewards/iou_timestamp_reward": 0.6748456358909607, "rewards/t_format_reward": 1.0, "step": 91 }, { "advantages": 2.8312206268310547e-07, "completion_length": 111.875, "epoch": 0.030666666666666665, "grad_norm": 11.094443321228027, "kl": 0.0260009765625, "learning_rate": 9.693333333333334e-07, "loss": 0.001, "reward": 0.39269232749938965, "reward_mean": 0.39269232749938965, "reward_std": 0.07297664880752563, "rewards/a_meteor_reward": 0.39269232749938965, "step": 92 }, { "advantages": -1.4528632164001465e-07, "completion_length": 15.75, "epoch": 0.031, "grad_norm": 8.377711296081543, "kl": 0.10595703125, "learning_rate": 9.69e-07, "loss": 0.0043, "reward": 1.6669204235076904, "reward_mean": 1.6669204235076904, "reward_std": 0.24539482593536377, "rewards/iou_timestamp_reward": 0.6669204235076904, "rewards/t_format_reward": 1.0, "step": 93 }, { "advantages": -3.650784492492676e-07, "completion_length": 73.1875, "epoch": 0.03133333333333333, "grad_norm": 9.034433364868164, "kl": 0.0244140625, "learning_rate": 9.686666666666667e-07, "loss": 0.001, "reward": 0.6095114946365356, "reward_mean": 0.6095114946365356, "reward_std": 0.07278476655483246, "rewards/a_meteor_reward": 0.6095114946365356, "step": 94 }, { "advantages": -7.636845111846924e-08, "completion_length": 81.9375, "epoch": 0.03166666666666667, "grad_norm": 6.3000993728637695, "kl": 0.115234375, "learning_rate": 9.683333333333333e-07, "loss": 0.0046, "reward": 0.30284059047698975, "reward_mean": 0.30284059047698975, "reward_std": 0.1016615480184555, "rewards/v_meteor_reward": 0.30284059047698975, "step": 95 }, { "advantages": 2.8312206268310547e-07, "completion_length": 335.3125, "epoch": 0.032, "grad_norm": 4.491758346557617, "kl": 0.057373046875, "learning_rate": 9.679999999999999e-07, "loss": 0.0023, "reward": 0.38683223724365234, "reward_mean": 0.38683223724365234, "reward_std": 0.11081382632255554, "rewards/a_meteor_reward": 0.38683223724365234, "step": 96 }, { "advantages": -2.4028122425079346e-07, "completion_length": 59.375, "epoch": 0.03233333333333333, "grad_norm": 7.140879154205322, "kl": 0.0966796875, "learning_rate": 9.676666666666667e-07, "loss": 0.0039, "reward": 0.31064194440841675, "reward_mean": 0.31064194440841675, "reward_std": 0.08246345818042755, "rewards/v_meteor_reward": 0.31064194440841675, "step": 97 }, { "advantages": 3.166496753692627e-08, "completion_length": 16.0, "epoch": 0.03266666666666666, "grad_norm": 5.591695785522461, "kl": 0.171875, "learning_rate": 9.673333333333332e-07, "loss": 0.0069, "reward": 1.6777211427688599, "reward_mean": 1.6777211427688599, "reward_std": 0.1325010508298874, "rewards/iou_timestamp_reward": 0.6777211427688599, "rewards/t_format_reward": 1.0, "step": 98 }, { "advantages": 8.568167686462402e-08, "completion_length": 47.5625, "epoch": 0.033, "grad_norm": 7.70865535736084, "kl": 0.119140625, "learning_rate": 9.67e-07, "loss": 0.0048, "reward": 0.272571861743927, "reward_mean": 0.272571861743927, "reward_std": 0.08858765661716461, "rewards/v_meteor_reward": 0.272571861743927, "step": 99 }, { "advantages": -1.1920928955078125e-07, "completion_length": 14.5625, "epoch": 0.03333333333333333, "grad_norm": 10.010721206665039, "kl": 0.22265625, "learning_rate": 9.666666666666666e-07, "loss": 0.0089, "reward": 1.5711581707000732, "reward_mean": 1.5711581707000732, "reward_std": 0.11216755211353302, "rewards/iou_timestamp_reward": 0.5711581707000732, "rewards/t_format_reward": 1.0, "step": 100 }, { "advantages": -6.295740604400635e-07, "completion_length": 15.5625, "epoch": 0.033666666666666664, "grad_norm": 8.376897811889648, "kl": 0.1806640625, "learning_rate": 9.663333333333334e-07, "loss": 0.0072, "reward": 1.9091110229492188, "reward_mean": 1.9091110229492188, "reward_std": 0.05105387419462204, "rewards/iou_timestamp_reward": 0.909110963344574, "rewards/t_format_reward": 1.0, "step": 101 }, { "advantages": -4.842877388000488e-08, "completion_length": 145.375, "epoch": 0.034, "grad_norm": 4.151526927947998, "kl": 0.078125, "learning_rate": 9.66e-07, "loss": 0.0031, "reward": 0.36579787731170654, "reward_mean": 0.36579787731170654, "reward_std": 0.13643667101860046, "rewards/a_meteor_reward": 0.36579787731170654, "step": 102 }, { "advantages": 7.450580596923828e-09, "completion_length": 50.5625, "epoch": 0.034333333333333334, "grad_norm": 9.529373168945312, "kl": 0.05419921875, "learning_rate": 9.656666666666667e-07, "loss": 0.0022, "reward": 0.31767547130584717, "reward_mean": 0.31767547130584717, "reward_std": 0.029892440885305405, "rewards/a_meteor_reward": 0.31767547130584717, "step": 103 }, { "advantages": -5.01750037074089e-07, "completion_length": 61.125, "epoch": 0.034666666666666665, "grad_norm": 6.028570652008057, "kl": 0.0164794921875, "learning_rate": 9.653333333333333e-07, "loss": 0.0007, "reward": 0.5155657529830933, "reward_mean": 0.5155657529830933, "reward_std": 0.022184547036886215, "rewards/a_meteor_reward": 0.5155657529830933, "step": 104 }, { "advantages": 3.129243850708008e-07, "completion_length": 385.25, "epoch": 0.035, "grad_norm": 1.9676549434661865, "kl": 0.0286865234375, "learning_rate": 9.649999999999999e-07, "loss": 0.0011, "reward": 0.5510239005088806, "reward_mean": 0.5510239005088806, "reward_std": 0.06231849640607834, "rewards/a_meteor_reward": 0.5510239005088806, "step": 105 }, { "advantages": 6.984919309616089e-08, "completion_length": 15.5, "epoch": 0.035333333333333335, "grad_norm": 10.02199935913086, "kl": 0.22265625, "learning_rate": 9.646666666666666e-07, "loss": 0.0089, "reward": 1.7212822437286377, "reward_mean": 1.7212822437286377, "reward_std": 0.11323533207178116, "rewards/iou_timestamp_reward": 0.7212823629379272, "rewards/t_format_reward": 1.0, "step": 106 }, { "advantages": 3.725290298461914e-09, "completion_length": 357.8125, "epoch": 0.035666666666666666, "grad_norm": 2.2444164752960205, "kl": 0.03662109375, "learning_rate": 9.643333333333334e-07, "loss": 0.0015, "reward": 0.3381531834602356, "reward_mean": 0.3381531834602356, "reward_std": 0.13471977412700653, "rewards/a_meteor_reward": 0.3381531834602356, "step": 107 }, { "advantages": 1.471489667892456e-07, "completion_length": 93.375, "epoch": 0.036, "grad_norm": 7.70637321472168, "kl": 0.1787109375, "learning_rate": 9.64e-07, "loss": 0.0071, "reward": 0.28107142448425293, "reward_mean": 0.28107142448425293, "reward_std": 0.08433818817138672, "rewards/v_meteor_reward": 0.28107142448425293, "step": 108 }, { "advantages": -1.257285475730896e-06, "completion_length": 15.1875, "epoch": 0.036333333333333336, "grad_norm": 9.160130500793457, "kl": 0.212890625, "learning_rate": 9.636666666666666e-07, "loss": 0.0085, "reward": 1.7501591444015503, "reward_mean": 1.7501591444015503, "reward_std": 0.10709328949451447, "rewards/iou_timestamp_reward": 0.7501592040061951, "rewards/t_format_reward": 1.0, "step": 109 }, { "advantages": -4.1350722312927246e-07, "completion_length": 56.0, "epoch": 0.03666666666666667, "grad_norm": 11.45932674407959, "kl": 0.0269775390625, "learning_rate": 9.633333333333334e-07, "loss": 0.0011, "reward": 0.5588961243629456, "reward_mean": 0.5588961243629456, "reward_std": 0.03106272965669632, "rewards/a_meteor_reward": 0.5588961243629456, "step": 110 }, { "advantages": 5.21540641784668e-08, "completion_length": 73.0625, "epoch": 0.037, "grad_norm": 8.036032676696777, "kl": 0.142578125, "learning_rate": 9.63e-07, "loss": 0.0057, "reward": 0.242688849568367, "reward_mean": 0.242688849568367, "reward_std": 0.09107036888599396, "rewards/v_meteor_reward": 0.242688849568367, "step": 111 }, { "advantages": -8.014030754566193e-07, "completion_length": 14.6875, "epoch": 0.037333333333333336, "grad_norm": 7.103935241699219, "kl": 0.1171875, "learning_rate": 9.626666666666667e-07, "loss": 0.0047, "reward": 1.3655147552490234, "reward_mean": 1.3655147552490234, "reward_std": 0.10877620428800583, "rewards/iou_timestamp_reward": 0.3655148148536682, "rewards/t_format_reward": 1.0, "step": 112 }, { "advantages": -2.2351741790771484e-08, "completion_length": 262.75, "epoch": 0.03766666666666667, "grad_norm": 1.336516261100769, "kl": 0.036376953125, "learning_rate": 9.623333333333333e-07, "loss": 0.0015, "reward": 0.3901206851005554, "reward_mean": 0.3901206851005554, "reward_std": 0.029694439843297005, "rewards/a_meteor_reward": 0.3901206851005554, "step": 113 }, { "advantages": -2.942979335784912e-07, "completion_length": 15.75, "epoch": 0.038, "grad_norm": 6.64762544631958, "kl": 0.267578125, "learning_rate": 9.619999999999999e-07, "loss": 0.0107, "reward": 1.7828259468078613, "reward_mean": 1.7828259468078613, "reward_std": 0.09520608186721802, "rewards/iou_timestamp_reward": 0.7828259468078613, "rewards/t_format_reward": 1.0, "step": 114 }, { "advantages": 6.51925802230835e-08, "completion_length": 145.25, "epoch": 0.03833333333333333, "grad_norm": 4.987364292144775, "kl": 0.12890625, "learning_rate": 9.616666666666666e-07, "loss": 0.0051, "reward": 0.3403514623641968, "reward_mean": 0.3403514623641968, "reward_std": 0.09397107362747192, "rewards/v_meteor_reward": 0.3403514623641968, "step": 115 }, { "advantages": 9.424984455108643e-07, "completion_length": 15.4375, "epoch": 0.03866666666666667, "grad_norm": 9.328133583068848, "kl": 0.150390625, "learning_rate": 9.613333333333334e-07, "loss": 0.006, "reward": 1.6889970302581787, "reward_mean": 1.6889970302581787, "reward_std": 0.09184948354959488, "rewards/iou_timestamp_reward": 0.6889971494674683, "rewards/t_format_reward": 1.0, "step": 116 }, { "advantages": -2.3245811462402344e-06, "completion_length": 528.0625, "epoch": 0.039, "grad_norm": 2.1579296588897705, "kl": 0.0286865234375, "learning_rate": 9.61e-07, "loss": 0.0011, "reward": 0.42002779245376587, "reward_mean": 0.42002779245376587, "reward_std": 0.0364028662443161, "rewards/a_meteor_reward": 0.42002779245376587, "step": 117 }, { "advantages": -4.246830940246582e-07, "completion_length": 168.5, "epoch": 0.03933333333333333, "grad_norm": 4.850235462188721, "kl": 0.07568359375, "learning_rate": 9.606666666666666e-07, "loss": 0.003, "reward": 0.5040794610977173, "reward_mean": 0.5040794610977173, "reward_std": 0.07964249700307846, "rewards/a_meteor_reward": 0.5040794610977173, "step": 118 }, { "advantages": -7.82310962677002e-07, "completion_length": 218.125, "epoch": 0.03966666666666667, "grad_norm": 2.0448973178863525, "kl": 0.0693359375, "learning_rate": 9.603333333333333e-07, "loss": 0.0028, "reward": 0.3898804783821106, "reward_mean": 0.3898804783821106, "reward_std": 0.029736947268247604, "rewards/a_meteor_reward": 0.3898804783821106, "step": 119 }, { "advantages": -1.6763806343078613e-07, "completion_length": 34.25, "epoch": 0.04, "grad_norm": 11.406599044799805, "kl": 0.134765625, "learning_rate": 9.6e-07, "loss": 0.0054, "reward": 0.3834467828273773, "reward_mean": 0.3834467828273773, "reward_std": 0.07651173323392868, "rewards/a_meteor_reward": 0.3834467828273773, "step": 120 }, { "advantages": -2.868473529815674e-07, "completion_length": 122.5625, "epoch": 0.04033333333333333, "grad_norm": 7.683600425720215, "kl": 0.1083984375, "learning_rate": 9.596666666666667e-07, "loss": 0.0043, "reward": 0.6076045036315918, "reward_mean": 0.6076045036315918, "reward_std": 0.05977663770318031, "rewards/a_meteor_reward": 0.6076045036315918, "step": 121 }, { "advantages": 1.6763806343078613e-07, "completion_length": 38.625, "epoch": 0.04066666666666666, "grad_norm": 11.782561302185059, "kl": 0.306640625, "learning_rate": 9.593333333333333e-07, "loss": 0.0123, "reward": 0.4143635630607605, "reward_mean": 0.4143635630607605, "reward_std": 0.09519274532794952, "rewards/a_meteor_reward": 0.4143635630607605, "step": 122 }, { "advantages": -3.948807716369629e-07, "completion_length": 15.75, "epoch": 0.041, "grad_norm": 6.244944095611572, "kl": 0.1015625, "learning_rate": 9.589999999999998e-07, "loss": 0.0041, "reward": 1.6249700784683228, "reward_mean": 1.6249700784683228, "reward_std": 0.047293901443481445, "rewards/iou_timestamp_reward": 0.6249701380729675, "rewards/t_format_reward": 1.0, "step": 123 }, { "advantages": -4.0978193283081055e-08, "completion_length": 146.125, "epoch": 0.04133333333333333, "grad_norm": 10.830638885498047, "kl": 0.333984375, "learning_rate": 9.586666666666666e-07, "loss": 0.0134, "reward": 0.3417627811431885, "reward_mean": 0.3417627811431885, "reward_std": 0.10412581264972687, "rewards/a_meteor_reward": 0.3417627811431885, "step": 124 }, { "advantages": 4.991888999938965e-07, "completion_length": 151.125, "epoch": 0.041666666666666664, "grad_norm": 4.3537068367004395, "kl": 0.0830078125, "learning_rate": 9.583333333333334e-07, "loss": 0.0033, "reward": 0.37301206588745117, "reward_mean": 0.37301206588745117, "reward_std": 0.04253820329904556, "rewards/v_meteor_reward": 0.37301206588745117, "step": 125 }, { "advantages": -1.3476237654685974e-06, "completion_length": 25.3125, "epoch": 0.042, "grad_norm": 5.9638190269470215, "kl": 0.26953125, "learning_rate": 9.58e-07, "loss": 0.0108, "reward": 0.33645251393318176, "reward_mean": 0.33645251393318176, "reward_std": 0.03595134615898132, "rewards/a_meteor_reward": 0.33645251393318176, "step": 126 }, { "advantages": 3.725290298461914e-09, "completion_length": 15.25, "epoch": 0.042333333333333334, "grad_norm": 5.584634780883789, "kl": 0.1279296875, "learning_rate": 9.576666666666665e-07, "loss": 0.0051, "reward": 1.7488969564437866, "reward_mean": 1.7488969564437866, "reward_std": 0.1075374037027359, "rewards/iou_timestamp_reward": 0.7488969564437866, "rewards/t_format_reward": 1.0, "step": 127 }, { "advantages": -1.955777406692505e-07, "completion_length": 172.3125, "epoch": 0.042666666666666665, "grad_norm": 4.104102611541748, "kl": 0.12353515625, "learning_rate": 9.573333333333333e-07, "loss": 0.005, "reward": 0.3584914803504944, "reward_mean": 0.3584914803504944, "reward_std": 0.06407015025615692, "rewards/v_meteor_reward": 0.3584914803504944, "step": 128 }, { "advantages": -1.0617077350616455e-07, "completion_length": 16.0, "epoch": 0.043, "grad_norm": 9.165255546569824, "kl": 0.1884765625, "learning_rate": 9.57e-07, "loss": 0.0076, "reward": 1.6522139310836792, "reward_mean": 1.6522139310836792, "reward_std": 0.12564575672149658, "rewards/iou_timestamp_reward": 0.6522139310836792, "rewards/t_format_reward": 1.0, "step": 129 }, { "advantages": -2.980232238769531e-07, "completion_length": 107.5625, "epoch": 0.043333333333333335, "grad_norm": 4.67799186706543, "kl": 0.107421875, "learning_rate": 9.566666666666667e-07, "loss": 0.0043, "reward": 0.34709811210632324, "reward_mean": 0.34709811210632324, "reward_std": 0.053231969475746155, "rewards/v_meteor_reward": 0.34709811210632324, "step": 130 }, { "advantages": -1.695007085800171e-07, "completion_length": 120.5625, "epoch": 0.043666666666666666, "grad_norm": 5.082855224609375, "kl": 0.11376953125, "learning_rate": 9.563333333333333e-07, "loss": 0.0045, "reward": 0.3200334906578064, "reward_mean": 0.3200334906578064, "reward_std": 0.07295849919319153, "rewards/v_meteor_reward": 0.3200334906578064, "step": 131 }, { "advantages": -4.6566128730773926e-07, "completion_length": 206.9375, "epoch": 0.044, "grad_norm": 9.59042739868164, "kl": 0.1572265625, "learning_rate": 9.559999999999998e-07, "loss": 0.0063, "reward": 0.33664804697036743, "reward_mean": 0.33664804697036743, "reward_std": 0.05300241708755493, "rewards/a_meteor_reward": 0.33664804697036743, "step": 132 }, { "advantages": 1.8998980522155762e-07, "completion_length": 135.3125, "epoch": 0.044333333333333336, "grad_norm": 7.48255729675293, "kl": 0.2236328125, "learning_rate": 9.556666666666666e-07, "loss": 0.009, "reward": 0.2923402488231659, "reward_mean": 0.2923402488231659, "reward_std": 0.0723569393157959, "rewards/v_meteor_reward": 0.2923402488231659, "step": 133 }, { "advantages": -2.0302832126617432e-07, "completion_length": 59.75, "epoch": 0.04466666666666667, "grad_norm": 7.033110618591309, "kl": 0.373046875, "learning_rate": 9.553333333333334e-07, "loss": 0.0149, "reward": 0.5713991522789001, "reward_mean": 0.5713991522789001, "reward_std": 0.08644755184650421, "rewards/a_meteor_reward": 0.5713991522789001, "step": 134 }, { "advantages": 4.023313522338867e-07, "completion_length": 15.0, "epoch": 0.045, "grad_norm": 8.320035934448242, "kl": 0.162109375, "learning_rate": 9.55e-07, "loss": 0.0065, "reward": 1.8030287027359009, "reward_mean": 1.8030287027359009, "reward_std": 0.12482485175132751, "rewards/iou_timestamp_reward": 0.8030286431312561, "rewards/t_format_reward": 1.0, "step": 135 }, { "advantages": 1.043081283569336e-07, "completion_length": 197.0, "epoch": 0.04533333333333334, "grad_norm": 4.154504299163818, "kl": 0.13671875, "learning_rate": 9.546666666666665e-07, "loss": 0.0055, "reward": 0.3503352403640747, "reward_mean": 0.3503352403640747, "reward_std": 0.07537497580051422, "rewards/v_meteor_reward": 0.3503352403640747, "step": 136 }, { "advantages": 1.7881393432617188e-07, "completion_length": 16.4375, "epoch": 0.04566666666666667, "grad_norm": 10.59654712677002, "kl": 0.32421875, "learning_rate": 9.543333333333333e-07, "loss": 0.0129, "reward": 1.8501031398773193, "reward_mean": 1.8501031398773193, "reward_std": 0.07161524146795273, "rewards/iou_timestamp_reward": 0.8501031994819641, "rewards/t_format_reward": 1.0, "step": 137 }, { "advantages": 8.568167686462402e-08, "completion_length": 16.25, "epoch": 0.046, "grad_norm": 7.3680572509765625, "kl": 0.23828125, "learning_rate": 9.539999999999999e-07, "loss": 0.0095, "reward": 1.8512609004974365, "reward_mean": 1.8512609004974365, "reward_std": 0.06069695204496384, "rewards/iou_timestamp_reward": 0.8512609601020813, "rewards/t_format_reward": 1.0, "step": 138 }, { "advantages": -1.424923539161682e-07, "completion_length": 15.5, "epoch": 0.04633333333333333, "grad_norm": 5.227673053741455, "kl": 0.146484375, "learning_rate": 9.536666666666667e-07, "loss": 0.0058, "reward": 1.4599018096923828, "reward_mean": 1.4599018096923828, "reward_std": 0.12586817145347595, "rewards/iou_timestamp_reward": 0.45990175008773804, "rewards/t_format_reward": 1.0, "step": 139 }, { "advantages": 1.1175870895385742e-07, "completion_length": 440.0, "epoch": 0.04666666666666667, "grad_norm": 9.650946617126465, "kl": 0.203125, "learning_rate": 9.533333333333333e-07, "loss": 0.0081, "reward": 0.243077352643013, "reward_mean": 0.243077352643013, "reward_std": 0.047230400145053864, "rewards/a_meteor_reward": 0.243077352643013, "step": 140 }, { "advantages": -5.587935447692871e-08, "completion_length": 89.875, "epoch": 0.047, "grad_norm": 6.734220027923584, "kl": 0.1318359375, "learning_rate": 9.529999999999999e-07, "loss": 0.0053, "reward": 0.3107553720474243, "reward_mean": 0.3107553720474243, "reward_std": 0.0887354239821434, "rewards/v_meteor_reward": 0.3107553720474243, "step": 141 }, { "advantages": -3.3527612686157227e-08, "completion_length": 183.625, "epoch": 0.04733333333333333, "grad_norm": 4.381998062133789, "kl": 0.1142578125, "learning_rate": 9.526666666666666e-07, "loss": 0.0046, "reward": 0.3454071283340454, "reward_mean": 0.3454071283340454, "reward_std": 0.0629056841135025, "rewards/v_meteor_reward": 0.3454071283340454, "step": 142 }, { "advantages": -2.7567148208618164e-07, "completion_length": 107.6875, "epoch": 0.04766666666666667, "grad_norm": 6.866272449493408, "kl": 0.2021484375, "learning_rate": 9.523333333333333e-07, "loss": 0.0081, "reward": 0.46506205201148987, "reward_mean": 0.46506205201148987, "reward_std": 0.19587653875350952, "rewards/a_meteor_reward": 0.46506205201148987, "step": 143 }, { "advantages": 2.384185791015625e-07, "completion_length": 15.4375, "epoch": 0.048, "grad_norm": 10.104644775390625, "kl": 0.25, "learning_rate": 9.52e-07, "loss": 0.01, "reward": 1.5293771028518677, "reward_mean": 1.5293771028518677, "reward_std": 0.14130829274654388, "rewards/iou_timestamp_reward": 0.5293771624565125, "rewards/t_format_reward": 1.0, "step": 144 }, { "advantages": 2.4996697902679443e-06, "completion_length": 15.25, "epoch": 0.04833333333333333, "grad_norm": 8.065862655639648, "kl": 0.2294921875, "learning_rate": 9.516666666666666e-07, "loss": 0.0092, "reward": 1.908073902130127, "reward_mean": 1.908073902130127, "reward_std": 0.03945036977529526, "rewards/iou_timestamp_reward": 0.9080739617347717, "rewards/t_format_reward": 1.0, "step": 145 }, { "advantages": 1.1548399925231934e-07, "completion_length": 16.75, "epoch": 0.048666666666666664, "grad_norm": 10.470989227294922, "kl": 0.2021484375, "learning_rate": 9.513333333333333e-07, "loss": 0.0081, "reward": 1.3917040824890137, "reward_mean": 1.3917040824890137, "reward_std": 0.16038565337657928, "rewards/iou_timestamp_reward": 0.39170408248901367, "rewards/t_format_reward": 1.0, "step": 146 }, { "advantages": 1.30385160446167e-08, "completion_length": 138.25, "epoch": 0.049, "grad_norm": 5.09306001663208, "kl": 0.2412109375, "learning_rate": 9.509999999999999e-07, "loss": 0.0096, "reward": 0.2719792127609253, "reward_mean": 0.2719792127609253, "reward_std": 0.05485783517360687, "rewards/v_meteor_reward": 0.2719792127609253, "step": 147 }, { "advantages": -5.960464477539063e-08, "completion_length": 96.0, "epoch": 0.04933333333333333, "grad_norm": 5.434594631195068, "kl": 0.251953125, "learning_rate": 9.506666666666667e-07, "loss": 0.01, "reward": 0.33650946617126465, "reward_mean": 0.33650946617126465, "reward_std": 0.06964412331581116, "rewards/v_meteor_reward": 0.33650946617126465, "step": 148 }, { "advantages": 3.725290298461914e-08, "completion_length": 15.6875, "epoch": 0.049666666666666665, "grad_norm": 8.195295333862305, "kl": 0.158203125, "learning_rate": 9.503333333333333e-07, "loss": 0.0063, "reward": 1.5852638483047485, "reward_mean": 1.5852638483047485, "reward_std": 0.08470474183559418, "rewards/iou_timestamp_reward": 0.5852638483047485, "rewards/t_format_reward": 1.0, "step": 149 }, { "advantages": 1.601874828338623e-07, "completion_length": 65.0625, "epoch": 0.05, "grad_norm": 5.830376625061035, "kl": 0.453125, "learning_rate": 9.499999999999999e-07, "loss": 0.0181, "reward": 0.5064964294433594, "reward_mean": 0.5064964294433594, "reward_std": 0.02889607474207878, "rewards/a_meteor_reward": 0.5064964294433594, "step": 150 }, { "advantages": 2.9802322387695312e-08, "completion_length": 205.625, "epoch": 0.050333333333333334, "grad_norm": 2.661024570465088, "kl": 0.07861328125, "learning_rate": 9.496666666666666e-07, "loss": 0.0031, "reward": 0.3515915274620056, "reward_mean": 0.3515915274620056, "reward_std": 0.1484222412109375, "rewards/a_meteor_reward": 0.3515915274620056, "step": 151 }, { "advantages": 7.264316082000732e-08, "completion_length": 104.125, "epoch": 0.050666666666666665, "grad_norm": 4.986944675445557, "kl": 0.166015625, "learning_rate": 9.493333333333334e-07, "loss": 0.0066, "reward": 0.3497838079929352, "reward_mean": 0.3497838079929352, "reward_std": 0.06787674129009247, "rewards/v_meteor_reward": 0.3497838079929352, "step": 152 }, { "advantages": 1.6111880540847778e-07, "completion_length": 110.375, "epoch": 0.051, "grad_norm": 5.036022186279297, "kl": 0.1904296875, "learning_rate": 9.489999999999999e-07, "loss": 0.0076, "reward": 0.38326507806777954, "reward_mean": 0.38326507806777954, "reward_std": 0.04256826639175415, "rewards/v_meteor_reward": 0.38326507806777954, "step": 153 }, { "advantages": -7.450580596923828e-09, "completion_length": 108.875, "epoch": 0.051333333333333335, "grad_norm": 4.718959808349609, "kl": 0.1953125, "learning_rate": 9.486666666666666e-07, "loss": 0.0078, "reward": 0.3651476502418518, "reward_mean": 0.3651476502418518, "reward_std": 0.06869818270206451, "rewards/v_meteor_reward": 0.3651476502418518, "step": 154 }, { "advantages": 8.21426510810852e-07, "completion_length": 15.8125, "epoch": 0.051666666666666666, "grad_norm": 8.571718215942383, "kl": 0.1923828125, "learning_rate": 9.483333333333333e-07, "loss": 0.0077, "reward": 1.6531906127929688, "reward_mean": 1.6531906127929688, "reward_std": 0.14871498942375183, "rewards/iou_timestamp_reward": 0.6531907320022583, "rewards/t_format_reward": 1.0, "step": 155 }, { "advantages": -7.82310962677002e-08, "completion_length": 145.4375, "epoch": 0.052, "grad_norm": 4.320443153381348, "kl": 0.1201171875, "learning_rate": 9.479999999999999e-07, "loss": 0.0048, "reward": 0.33417075872421265, "reward_mean": 0.33417075872421265, "reward_std": 0.08785159140825272, "rewards/v_meteor_reward": 0.33417075872421265, "step": 156 }, { "advantages": 1.0244548320770264e-07, "completion_length": 16.125, "epoch": 0.052333333333333336, "grad_norm": 4.854378700256348, "kl": 0.134765625, "learning_rate": 9.476666666666666e-07, "loss": 0.0054, "reward": 1.5017434358596802, "reward_mean": 1.5017434358596802, "reward_std": 0.04548380523920059, "rewards/iou_timestamp_reward": 0.5017433762550354, "rewards/t_format_reward": 1.0, "step": 157 }, { "advantages": 8.940696716308594e-08, "completion_length": 130.4375, "epoch": 0.05266666666666667, "grad_norm": 4.786351680755615, "kl": 0.1142578125, "learning_rate": 9.473333333333333e-07, "loss": 0.0046, "reward": 0.3322998285293579, "reward_mean": 0.3322998285293579, "reward_std": 0.06792421638965607, "rewards/v_meteor_reward": 0.3322998285293579, "step": 158 }, { "advantages": -3.0547380447387695e-07, "completion_length": 99.375, "epoch": 0.053, "grad_norm": 5.84738302230835, "kl": 0.2021484375, "learning_rate": 9.469999999999999e-07, "loss": 0.0081, "reward": 0.36573007702827454, "reward_mean": 0.36573007702827454, "reward_std": 0.058926116675138474, "rewards/v_meteor_reward": 0.36573007702827454, "step": 159 }, { "advantages": 7.264316082000732e-08, "completion_length": 91.125, "epoch": 0.05333333333333334, "grad_norm": 5.669164180755615, "kl": 0.21875, "learning_rate": 9.466666666666666e-07, "loss": 0.0088, "reward": 0.34862929582595825, "reward_mean": 0.34862929582595825, "reward_std": 0.046043284237384796, "rewards/v_meteor_reward": 0.34862929582595825, "step": 160 }, { "advantages": 1.4901161193847656e-08, "completion_length": 108.8125, "epoch": 0.05366666666666667, "grad_norm": 4.848283290863037, "kl": 0.1748046875, "learning_rate": 9.463333333333334e-07, "loss": 0.007, "reward": 0.3031328022480011, "reward_mean": 0.3031328022480011, "reward_std": 0.048524558544158936, "rewards/v_meteor_reward": 0.3031328022480011, "step": 161 }, { "advantages": 1.1920928955078125e-07, "completion_length": 55.75, "epoch": 0.054, "grad_norm": 5.544229984283447, "kl": 0.314453125, "learning_rate": 9.459999999999999e-07, "loss": 0.0126, "reward": 0.6716249585151672, "reward_mean": 0.6716249585151672, "reward_std": 0.03897134214639664, "rewards/a_meteor_reward": 0.6716249585151672, "step": 162 }, { "advantages": 1.1175870895385742e-07, "completion_length": 155.3125, "epoch": 0.05433333333333333, "grad_norm": 4.005749225616455, "kl": 0.1162109375, "learning_rate": 9.456666666666666e-07, "loss": 0.0047, "reward": 0.39268988370895386, "reward_mean": 0.39268988370895386, "reward_std": 0.10236059129238129, "rewards/v_meteor_reward": 0.39268988370895386, "step": 163 }, { "advantages": 1.8067657947540283e-07, "completion_length": 124.0, "epoch": 0.05466666666666667, "grad_norm": 4.652139663696289, "kl": 0.09521484375, "learning_rate": 9.453333333333333e-07, "loss": 0.0038, "reward": 0.3237873911857605, "reward_mean": 0.3237873911857605, "reward_std": 0.05899618938565254, "rewards/v_meteor_reward": 0.3237873911857605, "step": 164 }, { "advantages": 1.4901161193847656e-07, "completion_length": 101.5625, "epoch": 0.055, "grad_norm": 6.533298015594482, "kl": 0.20703125, "learning_rate": 9.45e-07, "loss": 0.0083, "reward": 0.34651291370391846, "reward_mean": 0.34651291370391846, "reward_std": 0.06518127024173737, "rewards/v_meteor_reward": 0.34651291370391846, "step": 165 }, { "advantages": 3.67872416973114e-07, "completion_length": 36.0, "epoch": 0.05533333333333333, "grad_norm": 8.040228843688965, "kl": 0.38671875, "learning_rate": 9.446666666666666e-07, "loss": 0.0155, "reward": 0.3199371099472046, "reward_mean": 0.3199371099472046, "reward_std": 0.07617528736591339, "rewards/a_meteor_reward": 0.3199371099472046, "step": 166 }, { "advantages": -3.3527612686157227e-08, "completion_length": 138.6875, "epoch": 0.05566666666666667, "grad_norm": 4.693573951721191, "kl": 0.12060546875, "learning_rate": 9.443333333333333e-07, "loss": 0.0048, "reward": 0.3104463815689087, "reward_mean": 0.3104463815689087, "reward_std": 0.08182478696107864, "rewards/v_meteor_reward": 0.3104463815689087, "step": 167 }, { "advantages": 4.0512531995773315e-07, "completion_length": 15.9375, "epoch": 0.056, "grad_norm": 6.816257476806641, "kl": 0.2080078125, "learning_rate": 9.439999999999999e-07, "loss": 0.0083, "reward": 1.5904431343078613, "reward_mean": 1.5904431343078613, "reward_std": 0.06587842106819153, "rewards/iou_timestamp_reward": 0.5904431343078613, "rewards/t_format_reward": 1.0, "step": 168 }, { "advantages": 9.872019290924072e-08, "completion_length": 128.625, "epoch": 0.05633333333333333, "grad_norm": 4.523577690124512, "kl": 0.09912109375, "learning_rate": 9.436666666666667e-07, "loss": 0.004, "reward": 0.36655426025390625, "reward_mean": 0.36655426025390625, "reward_std": 0.06727494299411774, "rewards/v_meteor_reward": 0.36655426025390625, "step": 169 }, { "advantages": 1.210719347000122e-07, "completion_length": 89.9375, "epoch": 0.056666666666666664, "grad_norm": 5.404072284698486, "kl": 0.2041015625, "learning_rate": 9.433333333333333e-07, "loss": 0.0082, "reward": 0.32131993770599365, "reward_mean": 0.32131993770599365, "reward_std": 0.04615428298711777, "rewards/v_meteor_reward": 0.32131993770599365, "step": 170 }, { "advantages": -9.313225746154785e-08, "completion_length": 16.0, "epoch": 0.057, "grad_norm": 14.049200057983398, "kl": 0.146484375, "learning_rate": 9.429999999999999e-07, "loss": 0.0059, "reward": 1.606938123703003, "reward_mean": 1.606938123703003, "reward_std": 0.19581793248653412, "rewards/iou_timestamp_reward": 0.6069381833076477, "rewards/t_format_reward": 1.0, "step": 171 }, { "advantages": -1.0058283805847168e-06, "completion_length": 15.3125, "epoch": 0.05733333333333333, "grad_norm": 6.53110933303833, "kl": 0.146484375, "learning_rate": 9.426666666666666e-07, "loss": 0.0059, "reward": 1.7110098600387573, "reward_mean": 1.7110098600387573, "reward_std": 0.08147981017827988, "rewards/iou_timestamp_reward": 0.7110098004341125, "rewards/t_format_reward": 1.0, "step": 172 }, { "advantages": 1.2293457984924316e-07, "completion_length": 106.6875, "epoch": 0.057666666666666665, "grad_norm": 5.3037428855896, "kl": 0.2080078125, "learning_rate": 9.423333333333333e-07, "loss": 0.0083, "reward": 0.34091711044311523, "reward_mean": 0.34091711044311523, "reward_std": 0.054847199469804764, "rewards/v_meteor_reward": 0.34091711044311523, "step": 173 }, { "advantages": -9.387731552124023e-07, "completion_length": 14.9375, "epoch": 0.058, "grad_norm": 13.017971992492676, "kl": 0.1552734375, "learning_rate": 9.419999999999999e-07, "loss": 0.0062, "reward": 1.581139087677002, "reward_mean": 1.581139087677002, "reward_std": 0.1964217722415924, "rewards/iou_timestamp_reward": 0.5811392068862915, "rewards/t_format_reward": 1.0, "step": 174 }, { "advantages": 7.003545761108398e-07, "completion_length": 15.5, "epoch": 0.058333333333333334, "grad_norm": 4.955448150634766, "kl": 0.201171875, "learning_rate": 9.416666666666666e-07, "loss": 0.008, "reward": 1.4787129163742065, "reward_mean": 1.4787129163742065, "reward_std": 0.0437820702791214, "rewards/iou_timestamp_reward": 0.47871291637420654, "rewards/t_format_reward": 1.0, "step": 175 }, { "advantages": -1.471489667892456e-07, "completion_length": 82.5625, "epoch": 0.058666666666666666, "grad_norm": 5.676736354827881, "kl": 0.19140625, "learning_rate": 9.413333333333333e-07, "loss": 0.0076, "reward": 0.3530689477920532, "reward_mean": 0.3530689477920532, "reward_std": 0.07129685580730438, "rewards/v_meteor_reward": 0.3530689477920532, "step": 176 }, { "advantages": -9.872019290924072e-08, "completion_length": 541.25, "epoch": 0.059, "grad_norm": 1.5729377269744873, "kl": 0.0301513671875, "learning_rate": 9.409999999999999e-07, "loss": 0.0012, "reward": 0.38659876585006714, "reward_mean": 0.38659876585006714, "reward_std": 0.061690423637628555, "rewards/a_meteor_reward": 0.38659876585006714, "step": 177 }, { "advantages": -2.086162567138672e-07, "completion_length": 107.75, "epoch": 0.059333333333333335, "grad_norm": 4.422677516937256, "kl": 0.294921875, "learning_rate": 9.406666666666666e-07, "loss": 0.0118, "reward": 0.7729659080505371, "reward_mean": 0.7729659080505371, "reward_std": 0.05877571180462837, "rewards/a_meteor_reward": 0.7729659080505371, "step": 178 }, { "advantages": 9.685754776000977e-08, "completion_length": 608.75, "epoch": 0.059666666666666666, "grad_norm": 2.90602970123291, "kl": 0.04833984375, "learning_rate": 9.403333333333333e-07, "loss": 0.0019, "reward": 0.4457953870296478, "reward_mean": 0.4457953870296478, "reward_std": 0.21669965982437134, "rewards/a_meteor_reward": 0.4457953870296478, "step": 179 }, { "advantages": -1.1175870895385742e-07, "completion_length": 144.3125, "epoch": 0.06, "grad_norm": 5.941070556640625, "kl": 0.263671875, "learning_rate": 9.399999999999999e-07, "loss": 0.0105, "reward": 0.5886591672897339, "reward_mean": 0.5886591672897339, "reward_std": 0.08937446027994156, "rewards/a_meteor_reward": 0.5886591672897339, "step": 180 }, { "advantages": 1.5273690223693848e-07, "completion_length": 83.8125, "epoch": 0.060333333333333336, "grad_norm": 5.636475563049316, "kl": 0.1318359375, "learning_rate": 9.396666666666666e-07, "loss": 0.0053, "reward": 0.33770453929901123, "reward_mean": 0.33770453929901123, "reward_std": 0.0778227373957634, "rewards/v_meteor_reward": 0.33770453929901123, "step": 181 }, { "advantages": 7.82310962677002e-08, "completion_length": 14.25, "epoch": 0.06066666666666667, "grad_norm": 4.859750747680664, "kl": 0.11083984375, "learning_rate": 9.393333333333334e-07, "loss": 0.0044, "reward": 1.873721718788147, "reward_mean": 1.873721718788147, "reward_std": 0.03481683507561684, "rewards/iou_timestamp_reward": 0.8737218379974365, "rewards/t_format_reward": 1.0, "step": 182 }, { "advantages": -1.471489667892456e-07, "completion_length": 101.125, "epoch": 0.061, "grad_norm": 4.730010032653809, "kl": 0.2001953125, "learning_rate": 9.389999999999999e-07, "loss": 0.008, "reward": 0.37742435932159424, "reward_mean": 0.37742435932159424, "reward_std": 0.04855173081159592, "rewards/v_meteor_reward": 0.37742435932159424, "step": 183 }, { "advantages": 4.991888999938965e-07, "completion_length": 15.9375, "epoch": 0.06133333333333333, "grad_norm": 8.059599876403809, "kl": 0.197265625, "learning_rate": 9.386666666666666e-07, "loss": 0.0079, "reward": 1.6294739246368408, "reward_mean": 1.6294739246368408, "reward_std": 0.10038585960865021, "rewards/iou_timestamp_reward": 0.6294739246368408, "rewards/t_format_reward": 1.0, "step": 184 }, { "advantages": -1.825392246246338e-07, "completion_length": 186.8125, "epoch": 0.06166666666666667, "grad_norm": 2.6471893787384033, "kl": 0.07373046875, "learning_rate": 9.383333333333333e-07, "loss": 0.0029, "reward": 0.4596899747848511, "reward_mean": 0.4596899747848511, "reward_std": 0.13111940026283264, "rewards/a_meteor_reward": 0.4596899747848511, "step": 185 }, { "advantages": 1.0058283805847168e-07, "completion_length": 100.875, "epoch": 0.062, "grad_norm": 5.5908660888671875, "kl": 0.123046875, "learning_rate": 9.379999999999998e-07, "loss": 0.0049, "reward": 0.36619120836257935, "reward_mean": 0.36619120836257935, "reward_std": 0.05109809339046478, "rewards/v_meteor_reward": 0.36619120836257935, "step": 186 }, { "advantages": -4.246830940246582e-07, "completion_length": 16.125, "epoch": 0.06233333333333333, "grad_norm": 8.923288345336914, "kl": 0.2060546875, "learning_rate": 9.376666666666666e-07, "loss": 0.0082, "reward": 1.462788462638855, "reward_mean": 1.462788462638855, "reward_std": 0.10602619498968124, "rewards/iou_timestamp_reward": 0.4627884328365326, "rewards/t_format_reward": 1.0, "step": 187 }, { "advantages": 3.203749656677246e-07, "completion_length": 15.5, "epoch": 0.06266666666666666, "grad_norm": 12.374619483947754, "kl": 0.22265625, "learning_rate": 9.373333333333333e-07, "loss": 0.0089, "reward": 1.6300444602966309, "reward_mean": 1.6300444602966309, "reward_std": 0.0986478179693222, "rewards/iou_timestamp_reward": 0.6300444006919861, "rewards/t_format_reward": 1.0, "step": 188 }, { "advantages": -9.98377799987793e-07, "completion_length": 16.0, "epoch": 0.063, "grad_norm": 9.772567749023438, "kl": 0.10498046875, "learning_rate": 9.37e-07, "loss": 0.0042, "reward": 1.5678999423980713, "reward_mean": 1.5678999423980713, "reward_std": 0.1467738300561905, "rewards/iou_timestamp_reward": 0.5678998827934265, "rewards/t_format_reward": 1.0, "step": 189 }, { "advantages": 9.872019290924072e-08, "completion_length": 386.125, "epoch": 0.06333333333333334, "grad_norm": 2.9738035202026367, "kl": 0.07763671875, "learning_rate": 9.366666666666666e-07, "loss": 0.0031, "reward": 0.48542651534080505, "reward_mean": 0.48542651534080505, "reward_std": 0.1676703244447708, "rewards/a_meteor_reward": 0.48542651534080505, "step": 190 }, { "advantages": 5.587935447692871e-09, "completion_length": 70.9375, "epoch": 0.06366666666666666, "grad_norm": 7.478069305419922, "kl": 0.439453125, "learning_rate": 9.363333333333333e-07, "loss": 0.0175, "reward": 0.5349234342575073, "reward_mean": 0.5349234342575073, "reward_std": 0.1873595267534256, "rewards/a_meteor_reward": 0.5349234342575073, "step": 191 }, { "advantages": 2.60770320892334e-08, "completion_length": 93.5625, "epoch": 0.064, "grad_norm": 6.404247283935547, "kl": 0.1796875, "learning_rate": 9.36e-07, "loss": 0.0072, "reward": 0.4388931691646576, "reward_mean": 0.4388931691646576, "reward_std": 0.09346583485603333, "rewards/v_meteor_reward": 0.4388931691646576, "step": 192 }, { "advantages": 3.129243850708008e-07, "completion_length": 220.4375, "epoch": 0.06433333333333334, "grad_norm": 5.198151111602783, "kl": 0.080078125, "learning_rate": 9.356666666666666e-07, "loss": 0.0032, "reward": 0.4569103717803955, "reward_mean": 0.4569103717803955, "reward_std": 0.10219883918762207, "rewards/a_meteor_reward": 0.4569103717803955, "step": 193 }, { "advantages": 2.8312206268310547e-07, "completion_length": 130.9375, "epoch": 0.06466666666666666, "grad_norm": 3.3771634101867676, "kl": 0.09619140625, "learning_rate": 9.353333333333333e-07, "loss": 0.0039, "reward": 0.344923734664917, "reward_mean": 0.344923734664917, "reward_std": 0.07583391666412354, "rewards/v_meteor_reward": 0.344923734664917, "step": 194 }, { "advantages": 1.0989606380462646e-07, "completion_length": 14.5, "epoch": 0.065, "grad_norm": 7.972990989685059, "kl": 0.1796875, "learning_rate": 9.35e-07, "loss": 0.0072, "reward": 1.783461570739746, "reward_mean": 1.783461570739746, "reward_std": 0.11473490297794342, "rewards/iou_timestamp_reward": 0.7834615707397461, "rewards/t_format_reward": 1.0, "step": 195 }, { "advantages": -3.3527612686157227e-07, "completion_length": 90.5, "epoch": 0.06533333333333333, "grad_norm": 5.025396347045898, "kl": 0.189453125, "learning_rate": 9.346666666666666e-07, "loss": 0.0076, "reward": 0.3656533360481262, "reward_mean": 0.3656533360481262, "reward_std": 0.06314139813184738, "rewards/v_meteor_reward": 0.3656533360481262, "step": 196 }, { "advantages": -4.209578037261963e-07, "completion_length": 159.0, "epoch": 0.06566666666666666, "grad_norm": 2.062155246734619, "kl": 0.11376953125, "learning_rate": 9.343333333333333e-07, "loss": 0.0046, "reward": 0.8587131500244141, "reward_mean": 0.8587131500244141, "reward_std": 0.03078605979681015, "rewards/a_meteor_reward": 0.8587131500244141, "step": 197 }, { "advantages": 1.601874828338623e-07, "completion_length": 367.5625, "epoch": 0.066, "grad_norm": 1.7891137599945068, "kl": 0.0634765625, "learning_rate": 9.34e-07, "loss": 0.0025, "reward": 0.43649405241012573, "reward_mean": 0.43649405241012573, "reward_std": 0.04834435135126114, "rewards/a_meteor_reward": 0.43649405241012573, "step": 198 }, { "advantages": 5.587935447692871e-08, "completion_length": 15.5, "epoch": 0.06633333333333333, "grad_norm": 11.901440620422363, "kl": 0.1953125, "learning_rate": 9.336666666666666e-07, "loss": 0.0078, "reward": 1.4632906913757324, "reward_mean": 1.4632906913757324, "reward_std": 0.12326391041278839, "rewards/iou_timestamp_reward": 0.46329060196876526, "rewards/t_format_reward": 1.0, "step": 199 }, { "advantages": -1.1175870895385742e-07, "completion_length": 102.5625, "epoch": 0.06666666666666667, "grad_norm": 5.600762367248535, "kl": 0.11572265625, "learning_rate": 9.333333333333333e-07, "loss": 0.0046, "reward": 0.3488306701183319, "reward_mean": 0.3488306701183319, "reward_std": 0.056952208280563354, "rewards/v_meteor_reward": 0.3488306701183319, "step": 200 }, { "advantages": -2.60770320892334e-08, "completion_length": 456.6875, "epoch": 0.067, "grad_norm": 2.1802213191986084, "kl": 0.06494140625, "learning_rate": 9.33e-07, "loss": 0.0026, "reward": 0.3059713840484619, "reward_mean": 0.3059713840484619, "reward_std": 0.08998338878154755, "rewards/a_meteor_reward": 0.3059713840484619, "step": 201 }, { "advantages": 2.60770320892334e-08, "completion_length": 82.125, "epoch": 0.06733333333333333, "grad_norm": 5.23966121673584, "kl": 0.166015625, "learning_rate": 9.326666666666666e-07, "loss": 0.0066, "reward": 0.42426949739456177, "reward_mean": 0.42426949739456177, "reward_std": 0.06834129989147186, "rewards/v_meteor_reward": 0.42426949739456177, "step": 202 }, { "advantages": 1.1548399925231934e-07, "completion_length": 252.6875, "epoch": 0.06766666666666667, "grad_norm": 5.3964619636535645, "kl": 0.251953125, "learning_rate": 9.323333333333334e-07, "loss": 0.01, "reward": 0.5853011012077332, "reward_mean": 0.5853011012077332, "reward_std": 0.19028408825397491, "rewards/a_meteor_reward": 0.5853011012077332, "step": 203 }, { "advantages": -3.8370490074157715e-07, "completion_length": 15.375, "epoch": 0.068, "grad_norm": 10.376399040222168, "kl": 0.1875, "learning_rate": 9.32e-07, "loss": 0.0075, "reward": 1.644266128540039, "reward_mean": 1.644266128540039, "reward_std": 0.123634472489357, "rewards/iou_timestamp_reward": 0.6442660689353943, "rewards/t_format_reward": 1.0, "step": 204 }, { "advantages": 3.725290298461914e-07, "completion_length": 16.5, "epoch": 0.06833333333333333, "grad_norm": 7.484046459197998, "kl": 0.189453125, "learning_rate": 9.316666666666666e-07, "loss": 0.0076, "reward": 1.7539336681365967, "reward_mean": 1.7539336681365967, "reward_std": 0.08392991125583649, "rewards/iou_timestamp_reward": 0.7539336681365967, "rewards/t_format_reward": 1.0, "step": 205 }, { "advantages": 1.471489667892456e-07, "completion_length": 124.0625, "epoch": 0.06866666666666667, "grad_norm": 4.914553642272949, "kl": 0.1357421875, "learning_rate": 9.313333333333333e-07, "loss": 0.0054, "reward": 0.5079240798950195, "reward_mean": 0.5079240798950195, "reward_std": 0.16682039201259613, "rewards/a_meteor_reward": 0.5079240798950195, "step": 206 }, { "advantages": 1.4528632164001465e-07, "completion_length": 195.6875, "epoch": 0.069, "grad_norm": 6.388495922088623, "kl": 0.361328125, "learning_rate": 9.31e-07, "loss": 0.0145, "reward": 0.6598904132843018, "reward_mean": 0.6598904132843018, "reward_std": 0.06138555333018303, "rewards/a_meteor_reward": 0.6598904132843018, "step": 207 }, { "advantages": -1.4901161193847656e-08, "completion_length": 90.75, "epoch": 0.06933333333333333, "grad_norm": 6.111006259918213, "kl": 0.255859375, "learning_rate": 9.306666666666666e-07, "loss": 0.0103, "reward": 0.41709619760513306, "reward_mean": 0.41709619760513306, "reward_std": 0.08290216326713562, "rewards/v_meteor_reward": 0.41709619760513306, "step": 208 }, { "advantages": -3.725290298461914e-08, "completion_length": 142.75, "epoch": 0.06966666666666667, "grad_norm": 5.462996959686279, "kl": 0.1279296875, "learning_rate": 9.303333333333333e-07, "loss": 0.0051, "reward": 0.33555829524993896, "reward_mean": 0.33555829524993896, "reward_std": 0.0727539211511612, "rewards/v_meteor_reward": 0.33555829524993896, "step": 209 }, { "advantages": 9.685754776000977e-08, "completion_length": 299.3125, "epoch": 0.07, "grad_norm": 2.9405696392059326, "kl": 0.06591796875, "learning_rate": 9.3e-07, "loss": 0.0026, "reward": 0.43430423736572266, "reward_mean": 0.43430423736572266, "reward_std": 0.08675377070903778, "rewards/a_meteor_reward": 0.43430423736572266, "step": 210 }, { "advantages": -9.12696123123169e-08, "completion_length": 75.25, "epoch": 0.07033333333333333, "grad_norm": 6.375410556793213, "kl": 0.15234375, "learning_rate": 9.296666666666666e-07, "loss": 0.0061, "reward": 0.4311317205429077, "reward_mean": 0.4311317205429077, "reward_std": 0.12065474689006805, "rewards/v_meteor_reward": 0.4311317205429077, "step": 211 }, { "advantages": -1.2516975402832031e-06, "completion_length": 14.6875, "epoch": 0.07066666666666667, "grad_norm": 5.423073768615723, "kl": 0.208984375, "learning_rate": 9.293333333333333e-07, "loss": 0.0084, "reward": 1.6344020366668701, "reward_mean": 1.6344020366668701, "reward_std": 0.021865515038371086, "rewards/iou_timestamp_reward": 0.6344020366668701, "rewards/t_format_reward": 1.0, "step": 212 }, { "advantages": 3.5455450415611267e-06, "completion_length": 15.25, "epoch": 0.071, "grad_norm": 5.7610368728637695, "kl": 0.3046875, "learning_rate": 9.29e-07, "loss": 0.0122, "reward": 1.8776636123657227, "reward_mean": 1.8776636123657227, "reward_std": 0.02486364357173443, "rewards/iou_timestamp_reward": 0.8776636123657227, "rewards/t_format_reward": 1.0, "step": 213 }, { "advantages": -7.450580596923828e-08, "completion_length": 499.25, "epoch": 0.07133333333333333, "grad_norm": 1.805978775024414, "kl": 0.05859375, "learning_rate": 9.286666666666666e-07, "loss": 0.0024, "reward": 0.3604682683944702, "reward_mean": 0.3604682683944702, "reward_std": 0.12854740023612976, "rewards/a_meteor_reward": 0.3604682683944702, "step": 214 }, { "advantages": -1.8998980522155762e-07, "completion_length": 77.9375, "epoch": 0.07166666666666667, "grad_norm": 5.606757164001465, "kl": 0.1669921875, "learning_rate": 9.283333333333333e-07, "loss": 0.0067, "reward": 0.3658486008644104, "reward_mean": 0.3658486008644104, "reward_std": 0.06844094395637512, "rewards/v_meteor_reward": 0.3658486008644104, "step": 215 }, { "advantages": 4.5821070671081543e-07, "completion_length": 127.25, "epoch": 0.072, "grad_norm": 3.294691801071167, "kl": 0.2265625, "learning_rate": 9.28e-07, "loss": 0.0091, "reward": 0.7274866700172424, "reward_mean": 0.7274866700172424, "reward_std": 0.1506788432598114, "rewards/a_meteor_reward": 0.7274866700172424, "step": 216 }, { "advantages": 1.0803341865539551e-07, "completion_length": 65.125, "epoch": 0.07233333333333333, "grad_norm": 6.756781578063965, "kl": 0.271484375, "learning_rate": 9.276666666666666e-07, "loss": 0.0109, "reward": 0.34833472967147827, "reward_mean": 0.34833472967147827, "reward_std": 0.10317523777484894, "rewards/v_meteor_reward": 0.34833472967147827, "step": 217 }, { "advantages": 3.501772880554199e-07, "completion_length": 98.375, "epoch": 0.07266666666666667, "grad_norm": 4.425882339477539, "kl": 0.11328125, "learning_rate": 9.273333333333333e-07, "loss": 0.0045, "reward": 0.366863876581192, "reward_mean": 0.366863876581192, "reward_std": 0.040785521268844604, "rewards/v_meteor_reward": 0.366863876581192, "step": 218 }, { "advantages": -7.450580596923828e-08, "completion_length": 58.8125, "epoch": 0.073, "grad_norm": 6.464948654174805, "kl": 0.296875, "learning_rate": 9.27e-07, "loss": 0.0119, "reward": 0.6157374382019043, "reward_mean": 0.6157374382019043, "reward_std": 0.15406450629234314, "rewards/a_meteor_reward": 0.6157374382019043, "step": 219 }, { "advantages": 9.8496675491333e-06, "completion_length": 16.25, "epoch": 0.07333333333333333, "grad_norm": 6.565788745880127, "kl": 0.275390625, "learning_rate": 9.266666666666665e-07, "loss": 0.011, "reward": 1.6243677139282227, "reward_mean": 1.6243677139282227, "reward_std": 0.06572499871253967, "rewards/iou_timestamp_reward": 0.6243677139282227, "rewards/t_format_reward": 1.0, "step": 220 }, { "advantages": 3.1851232051849365e-07, "completion_length": 15.5, "epoch": 0.07366666666666667, "grad_norm": 7.425717353820801, "kl": 0.193359375, "learning_rate": 9.263333333333333e-07, "loss": 0.0078, "reward": 1.6702316999435425, "reward_mean": 1.6702316999435425, "reward_std": 0.14717936515808105, "rewards/iou_timestamp_reward": 0.6702316999435425, "rewards/t_format_reward": 1.0, "step": 221 }, { "advantages": 7.078051567077637e-08, "completion_length": 16.0, "epoch": 0.074, "grad_norm": 15.771302223205566, "kl": 0.19921875, "learning_rate": 9.26e-07, "loss": 0.008, "reward": 1.8032572269439697, "reward_mean": 1.8032572269439697, "reward_std": 0.11691711843013763, "rewards/iou_timestamp_reward": 0.8032572269439697, "rewards/t_format_reward": 1.0, "step": 222 }, { "advantages": 1.1175870895385742e-07, "completion_length": 187.125, "epoch": 0.07433333333333333, "grad_norm": 4.032297134399414, "kl": 0.1689453125, "learning_rate": 9.256666666666666e-07, "loss": 0.0068, "reward": 0.6872379779815674, "reward_mean": 0.6872379779815674, "reward_std": 0.11287400126457214, "rewards/a_meteor_reward": 0.6872379779815674, "step": 223 }, { "advantages": 4.284083843231201e-07, "completion_length": 151.625, "epoch": 0.07466666666666667, "grad_norm": 5.106250762939453, "kl": 0.24609375, "learning_rate": 9.253333333333333e-07, "loss": 0.0099, "reward": 0.754265546798706, "reward_mean": 0.754265546798706, "reward_std": 0.059052422642707825, "rewards/a_meteor_reward": 0.754265546798706, "step": 224 }, { "advantages": 4.3213367462158203e-07, "completion_length": 15.5, "epoch": 0.075, "grad_norm": 8.278326034545898, "kl": 0.1884765625, "learning_rate": 9.25e-07, "loss": 0.0075, "reward": 1.8006435632705688, "reward_mean": 1.8006435632705688, "reward_std": 0.05766880884766579, "rewards/iou_timestamp_reward": 0.8006436228752136, "rewards/t_format_reward": 1.0, "step": 225 }, { "advantages": 3.4458935260772705e-07, "completion_length": 168.5625, "epoch": 0.07533333333333334, "grad_norm": 3.5070271492004395, "kl": 0.0849609375, "learning_rate": 9.246666666666666e-07, "loss": 0.0034, "reward": 0.5579768419265747, "reward_mean": 0.5579768419265747, "reward_std": 0.06925159692764282, "rewards/a_meteor_reward": 0.5579768419265747, "step": 226 }, { "advantages": -1.1175870895385742e-08, "completion_length": 130.3125, "epoch": 0.07566666666666666, "grad_norm": 6.069686412811279, "kl": 0.341796875, "learning_rate": 9.243333333333333e-07, "loss": 0.0137, "reward": 0.49303966760635376, "reward_mean": 0.49303966760635376, "reward_std": 0.11376556754112244, "rewards/a_meteor_reward": 0.49303966760635376, "step": 227 }, { "advantages": 1.1455267667770386e-06, "completion_length": 173.25, "epoch": 0.076, "grad_norm": 9.042088508605957, "kl": 0.400390625, "learning_rate": 9.24e-07, "loss": 0.0161, "reward": 0.7823003530502319, "reward_mean": 0.7823003530502319, "reward_std": 0.02690923772752285, "rewards/a_meteor_reward": 0.7823003530502319, "step": 228 }, { "advantages": 8.642673492431641e-07, "completion_length": 15.5, "epoch": 0.07633333333333334, "grad_norm": 5.8272929191589355, "kl": 0.224609375, "learning_rate": 9.236666666666666e-07, "loss": 0.009, "reward": 1.5649354457855225, "reward_mean": 1.5649354457855225, "reward_std": 0.026487130671739578, "rewards/iou_timestamp_reward": 0.5649354457855225, "rewards/t_format_reward": 1.0, "step": 229 }, { "advantages": 1.862645149230957e-07, "completion_length": 16.25, "epoch": 0.07666666666666666, "grad_norm": 12.853278160095215, "kl": 0.275390625, "learning_rate": 9.233333333333333e-07, "loss": 0.011, "reward": 1.698009729385376, "reward_mean": 1.698009729385376, "reward_std": 0.054210275411605835, "rewards/iou_timestamp_reward": 0.698009729385376, "rewards/t_format_reward": 1.0, "step": 230 }, { "advantages": -3.725290298461914e-08, "completion_length": 93.3125, "epoch": 0.077, "grad_norm": 5.630387783050537, "kl": 0.1640625, "learning_rate": 9.23e-07, "loss": 0.0066, "reward": 0.479880690574646, "reward_mean": 0.479880690574646, "reward_std": 0.047382015734910965, "rewards/v_meteor_reward": 0.479880690574646, "step": 231 }, { "advantages": 5.178153514862061e-07, "completion_length": 15.625, "epoch": 0.07733333333333334, "grad_norm": 10.003639221191406, "kl": 0.259765625, "learning_rate": 9.226666666666666e-07, "loss": 0.0104, "reward": 1.5301299095153809, "reward_mean": 1.5301299095153809, "reward_std": 0.05076393485069275, "rewards/iou_timestamp_reward": 0.5301299691200256, "rewards/t_format_reward": 1.0, "step": 232 }, { "advantages": 2.384185791015625e-07, "completion_length": 15.9375, "epoch": 0.07766666666666666, "grad_norm": 10.636816024780273, "kl": 0.2578125, "learning_rate": 9.223333333333333e-07, "loss": 0.0103, "reward": 1.3321356773376465, "reward_mean": 1.3321356773376465, "reward_std": 0.12244856357574463, "rewards/iou_timestamp_reward": 0.3321356773376465, "rewards/t_format_reward": 1.0, "step": 233 }, { "advantages": -2.7008354663848877e-07, "completion_length": 14.75, "epoch": 0.078, "grad_norm": 11.62215518951416, "kl": 0.169921875, "learning_rate": 9.22e-07, "loss": 0.0068, "reward": 1.7483896017074585, "reward_mean": 1.7483896017074585, "reward_std": 0.10887445509433746, "rewards/iou_timestamp_reward": 0.7483896017074585, "rewards/t_format_reward": 1.0, "step": 234 }, { "advantages": -7.078051567077637e-08, "completion_length": 84.875, "epoch": 0.07833333333333334, "grad_norm": 8.138920783996582, "kl": 0.1455078125, "learning_rate": 9.216666666666666e-07, "loss": 0.0058, "reward": 0.34677648544311523, "reward_mean": 0.34677648544311523, "reward_std": 0.08910583704710007, "rewards/v_meteor_reward": 0.34677648544311523, "step": 235 }, { "advantages": 5.960464477539063e-08, "completion_length": 75.3125, "epoch": 0.07866666666666666, "grad_norm": 5.55446720123291, "kl": 0.18359375, "learning_rate": 9.213333333333333e-07, "loss": 0.0074, "reward": 0.302579402923584, "reward_mean": 0.302579402923584, "reward_std": 0.06211242079734802, "rewards/v_meteor_reward": 0.302579402923584, "step": 236 }, { "advantages": 2.0489096641540527e-07, "completion_length": 66.1875, "epoch": 0.079, "grad_norm": 9.218978881835938, "kl": 0.52734375, "learning_rate": 9.21e-07, "loss": 0.0211, "reward": 0.7169442176818848, "reward_mean": 0.7169442176818848, "reward_std": 0.1348247230052948, "rewards/a_meteor_reward": 0.7169442176818848, "step": 237 }, { "advantages": 2.5564804673194885e-07, "completion_length": 78.0, "epoch": 0.07933333333333334, "grad_norm": 4.956727027893066, "kl": 0.25390625, "learning_rate": 9.206666666666666e-07, "loss": 0.0101, "reward": 0.41432368755340576, "reward_mean": 0.41432368755340576, "reward_std": 0.050833672285079956, "rewards/v_meteor_reward": 0.41432368755340576, "step": 238 }, { "advantages": 7.450580596923828e-09, "completion_length": 109.75, "epoch": 0.07966666666666666, "grad_norm": 9.658778190612793, "kl": 0.205078125, "learning_rate": 9.203333333333333e-07, "loss": 0.0082, "reward": 0.390242338180542, "reward_mean": 0.390242338180542, "reward_std": 0.11946467310190201, "rewards/a_meteor_reward": 0.390242338180542, "step": 239 }, { "advantages": -2.60770320892334e-08, "completion_length": 15.75, "epoch": 0.08, "grad_norm": 14.017374038696289, "kl": 0.1728515625, "learning_rate": 9.2e-07, "loss": 0.0069, "reward": 1.5120313167572021, "reward_mean": 1.5120313167572021, "reward_std": 0.1569957286119461, "rewards/iou_timestamp_reward": 0.5120313763618469, "rewards/t_format_reward": 1.0, "step": 240 }, { "advantages": 6.556510925292969e-07, "completion_length": 15.0, "epoch": 0.08033333333333334, "grad_norm": 5.733087539672852, "kl": 0.26953125, "learning_rate": 9.196666666666666e-07, "loss": 0.0108, "reward": 1.6980397701263428, "reward_mean": 1.6980397701263428, "reward_std": 0.055907510221004486, "rewards/iou_timestamp_reward": 0.698039710521698, "rewards/t_format_reward": 1.0, "step": 241 }, { "advantages": -7.450580596923828e-09, "completion_length": 63.6875, "epoch": 0.08066666666666666, "grad_norm": 6.284910202026367, "kl": 0.2890625, "learning_rate": 9.193333333333333e-07, "loss": 0.0116, "reward": 0.47604459524154663, "reward_mean": 0.47604459524154663, "reward_std": 0.09315034747123718, "rewards/v_meteor_reward": 0.47604459524154663, "step": 242 }, { "advantages": -2.384185791015625e-07, "completion_length": 15.5625, "epoch": 0.081, "grad_norm": 9.408245086669922, "kl": 0.251953125, "learning_rate": 9.19e-07, "loss": 0.0101, "reward": 1.6924891471862793, "reward_mean": 1.6924891471862793, "reward_std": 0.08916769921779633, "rewards/iou_timestamp_reward": 0.6924892663955688, "rewards/t_format_reward": 1.0, "step": 243 }, { "advantages": -5.140900611877441e-07, "completion_length": 43.625, "epoch": 0.08133333333333333, "grad_norm": 6.918147563934326, "kl": 0.486328125, "learning_rate": 9.186666666666666e-07, "loss": 0.0195, "reward": 0.6548071503639221, "reward_mean": 0.6548071503639221, "reward_std": 0.059837706387043, "rewards/a_meteor_reward": 0.6548071503639221, "step": 244 }, { "advantages": 1.0244548320770264e-07, "completion_length": 75.1875, "epoch": 0.08166666666666667, "grad_norm": 5.264615535736084, "kl": 0.30078125, "learning_rate": 9.183333333333333e-07, "loss": 0.012, "reward": 0.42913302779197693, "reward_mean": 0.42913302779197693, "reward_std": 0.04991994798183441, "rewards/v_meteor_reward": 0.42913302779197693, "step": 245 }, { "advantages": -1.7881393432617188e-07, "completion_length": 91.0625, "epoch": 0.082, "grad_norm": 5.717381477355957, "kl": 0.2158203125, "learning_rate": 9.18e-07, "loss": 0.0086, "reward": 0.40714865922927856, "reward_mean": 0.40714865922927856, "reward_std": 0.06179865449666977, "rewards/v_meteor_reward": 0.40714865922927856, "step": 246 }, { "advantages": 3.725290298461914e-08, "completion_length": 207.3125, "epoch": 0.08233333333333333, "grad_norm": 4.0288472175598145, "kl": 0.1005859375, "learning_rate": 9.176666666666666e-07, "loss": 0.004, "reward": 0.48714253306388855, "reward_mean": 0.48714253306388855, "reward_std": 0.1342807114124298, "rewards/a_meteor_reward": 0.48714253306388855, "step": 247 }, { "advantages": 3.91155481338501e-08, "completion_length": 250.9375, "epoch": 0.08266666666666667, "grad_norm": 3.030146837234497, "kl": 0.0927734375, "learning_rate": 9.173333333333333e-07, "loss": 0.0037, "reward": 0.5504482388496399, "reward_mean": 0.5504482388496399, "reward_std": 0.07322737574577332, "rewards/a_meteor_reward": 0.5504482388496399, "step": 248 }, { "advantages": -4.3585896492004395e-07, "completion_length": 15.6875, "epoch": 0.083, "grad_norm": 9.898754119873047, "kl": 0.228515625, "learning_rate": 9.17e-07, "loss": 0.0092, "reward": 1.4164183139801025, "reward_mean": 1.4164183139801025, "reward_std": 0.08886722475290298, "rewards/iou_timestamp_reward": 0.4164182245731354, "rewards/t_format_reward": 1.0, "step": 249 }, { "advantages": 1.4901161193847656e-08, "completion_length": 67.375, "epoch": 0.08333333333333333, "grad_norm": 6.3251166343688965, "kl": 0.296875, "learning_rate": 9.166666666666665e-07, "loss": 0.0119, "reward": 0.3844689428806305, "reward_mean": 0.3844689428806305, "reward_std": 0.07891203463077545, "rewards/v_meteor_reward": 0.3844689428806305, "step": 250 }, { "advantages": 4.0978193283081055e-08, "completion_length": 116.0, "epoch": 0.08366666666666667, "grad_norm": 3.161395311355591, "kl": 0.259765625, "learning_rate": 9.163333333333333e-07, "loss": 0.0104, "reward": 0.8061105012893677, "reward_mean": 0.8061105012893677, "reward_std": 0.0526382252573967, "rewards/a_meteor_reward": 0.8061105012893677, "step": 251 }, { "advantages": 2.1439045667648315e-06, "completion_length": 15.5, "epoch": 0.084, "grad_norm": 5.385815620422363, "kl": 0.267578125, "learning_rate": 9.16e-07, "loss": 0.0107, "reward": 1.8004518747329712, "reward_mean": 1.8004518747329712, "reward_std": 0.035702772438526154, "rewards/iou_timestamp_reward": 0.8004518747329712, "rewards/t_format_reward": 1.0, "step": 252 }, { "advantages": 2.0209699869155884e-07, "completion_length": 83.75, "epoch": 0.08433333333333333, "grad_norm": 5.232234001159668, "kl": 0.232421875, "learning_rate": 9.156666666666666e-07, "loss": 0.0093, "reward": 0.4111520051956177, "reward_mean": 0.4111520051956177, "reward_std": 0.07946176826953888, "rewards/v_meteor_reward": 0.4111520051956177, "step": 253 }, { "advantages": 1.9371509552001953e-07, "completion_length": 107.125, "epoch": 0.08466666666666667, "grad_norm": 2.8821778297424316, "kl": 0.1142578125, "learning_rate": 9.153333333333332e-07, "loss": 0.0046, "reward": 0.5713457465171814, "reward_mean": 0.5713457465171814, "reward_std": 0.10096725821495056, "rewards/a_meteor_reward": 0.5713457465171814, "step": 254 }, { "advantages": 4.842877388000488e-08, "completion_length": 189.8125, "epoch": 0.085, "grad_norm": 2.4827847480773926, "kl": 0.1171875, "learning_rate": 9.15e-07, "loss": 0.0047, "reward": 0.49495941400527954, "reward_mean": 0.49495941400527954, "reward_std": 0.05110243707895279, "rewards/a_meteor_reward": 0.49495941400527954, "step": 255 }, { "advantages": -3.725290298461914e-09, "completion_length": 61.5625, "epoch": 0.08533333333333333, "grad_norm": 5.319590091705322, "kl": 0.388671875, "learning_rate": 9.146666666666666e-07, "loss": 0.0156, "reward": 0.72675621509552, "reward_mean": 0.72675621509552, "reward_std": 0.05776664614677429, "rewards/a_meteor_reward": 0.72675621509552, "step": 256 }, { "advantages": 7.830094546079636e-07, "completion_length": 15.0, "epoch": 0.08566666666666667, "grad_norm": 7.111326694488525, "kl": 0.2294921875, "learning_rate": 9.143333333333333e-07, "loss": 0.0092, "reward": 1.762721061706543, "reward_mean": 1.762721061706543, "reward_std": 0.06280451267957687, "rewards/iou_timestamp_reward": 0.7627211213111877, "rewards/t_format_reward": 1.0, "step": 257 }, { "advantages": 2.868473529815674e-07, "completion_length": 92.6875, "epoch": 0.086, "grad_norm": 5.261801719665527, "kl": 0.2021484375, "learning_rate": 9.14e-07, "loss": 0.0081, "reward": 0.46607887744903564, "reward_mean": 0.46607887744903564, "reward_std": 0.10977263748645782, "rewards/v_meteor_reward": 0.46607887744903564, "step": 258 }, { "advantages": 4.0978193283081055e-08, "completion_length": 15.5, "epoch": 0.08633333333333333, "grad_norm": 8.274084091186523, "kl": 0.220703125, "learning_rate": 9.136666666666666e-07, "loss": 0.0088, "reward": 1.6332216262817383, "reward_mean": 1.6332216262817383, "reward_std": 0.10214142501354218, "rewards/iou_timestamp_reward": 0.6332216262817383, "rewards/t_format_reward": 1.0, "step": 259 }, { "advantages": 2.5443732738494873e-06, "completion_length": 15.625, "epoch": 0.08666666666666667, "grad_norm": 12.37807846069336, "kl": 0.298828125, "learning_rate": 9.133333333333333e-07, "loss": 0.012, "reward": 1.2143453359603882, "reward_mean": 1.2143453359603882, "reward_std": 0.04954046383500099, "rewards/iou_timestamp_reward": 0.21434536576271057, "rewards/t_format_reward": 1.0, "step": 260 }, { "advantages": 5.960464477539062e-07, "completion_length": 16.0, "epoch": 0.087, "grad_norm": 5.096524238586426, "kl": 0.2060546875, "learning_rate": 9.13e-07, "loss": 0.0082, "reward": 1.8133461475372314, "reward_mean": 1.8133461475372314, "reward_std": 0.0301095861941576, "rewards/iou_timestamp_reward": 0.813346266746521, "rewards/t_format_reward": 1.0, "step": 261 }, { "advantages": -4.507601261138916e-07, "completion_length": 69.4375, "epoch": 0.08733333333333333, "grad_norm": 4.634082794189453, "kl": 0.3515625, "learning_rate": 9.126666666666666e-07, "loss": 0.0141, "reward": 0.67677903175354, "reward_mean": 0.67677903175354, "reward_std": 0.07359357178211212, "rewards/a_meteor_reward": 0.67677903175354, "step": 262 }, { "advantages": 3.6135315895080566e-07, "completion_length": 120.1875, "epoch": 0.08766666666666667, "grad_norm": 4.794086933135986, "kl": 0.251953125, "learning_rate": 9.123333333333333e-07, "loss": 0.0101, "reward": 0.6355336904525757, "reward_mean": 0.6355336904525757, "reward_std": 0.12106330692768097, "rewards/a_meteor_reward": 0.6355336904525757, "step": 263 }, { "advantages": -1.2293457984924316e-07, "completion_length": 15.25, "epoch": 0.088, "grad_norm": 9.887167930603027, "kl": 0.171875, "learning_rate": 9.12e-07, "loss": 0.0069, "reward": 1.3667504787445068, "reward_mean": 1.3667504787445068, "reward_std": 0.09573734551668167, "rewards/iou_timestamp_reward": 0.36675044894218445, "rewards/t_format_reward": 1.0, "step": 264 }, { "advantages": 1.1175870895385742e-08, "completion_length": 82.1875, "epoch": 0.08833333333333333, "grad_norm": 4.999013900756836, "kl": 0.14453125, "learning_rate": 9.116666666666666e-07, "loss": 0.0058, "reward": 0.4991932809352875, "reward_mean": 0.4991932809352875, "reward_std": 0.06710423529148102, "rewards/v_meteor_reward": 0.4991932809352875, "step": 265 }, { "advantages": -8.670613169670105e-07, "completion_length": 164.25, "epoch": 0.08866666666666667, "grad_norm": 2.1935908794403076, "kl": 0.08349609375, "learning_rate": 9.113333333333333e-07, "loss": 0.0033, "reward": 0.6674792766571045, "reward_mean": 0.6674792766571045, "reward_std": 0.051882438361644745, "rewards/a_meteor_reward": 0.6674792766571045, "step": 266 }, { "advantages": -5.345791578292847e-07, "completion_length": 15.0, "epoch": 0.089, "grad_norm": 8.304143905639648, "kl": 0.1767578125, "learning_rate": 9.109999999999999e-07, "loss": 0.0071, "reward": 1.78111732006073, "reward_mean": 1.78111732006073, "reward_std": 0.06319838017225266, "rewards/iou_timestamp_reward": 0.78111732006073, "rewards/t_format_reward": 1.0, "step": 267 }, { "advantages": 1.7695128917694092e-07, "completion_length": 114.875, "epoch": 0.08933333333333333, "grad_norm": 5.128224849700928, "kl": 0.208984375, "learning_rate": 9.106666666666666e-07, "loss": 0.0084, "reward": 0.7186999320983887, "reward_mean": 0.7186999320983887, "reward_std": 0.04554865136742592, "rewards/a_meteor_reward": 0.7186999320983887, "step": 268 }, { "advantages": -3.1851232051849365e-07, "completion_length": 16.0, "epoch": 0.08966666666666667, "grad_norm": 11.123666763305664, "kl": 0.2578125, "learning_rate": 9.103333333333333e-07, "loss": 0.0103, "reward": 1.6106586456298828, "reward_mean": 1.6106586456298828, "reward_std": 0.07349641621112823, "rewards/iou_timestamp_reward": 0.610658586025238, "rewards/t_format_reward": 1.0, "step": 269 }, { "advantages": 3.46451997756958e-07, "completion_length": 15.75, "epoch": 0.09, "grad_norm": 9.504988670349121, "kl": 0.26953125, "learning_rate": 9.1e-07, "loss": 0.0108, "reward": 1.8124570846557617, "reward_mean": 1.8124570846557617, "reward_std": 0.03306771069765091, "rewards/iou_timestamp_reward": 0.8124571442604065, "rewards/t_format_reward": 1.0, "step": 270 }, { "advantages": 6.742775440216064e-07, "completion_length": 15.6875, "epoch": 0.09033333333333333, "grad_norm": 7.610354900360107, "kl": 0.1982421875, "learning_rate": 9.096666666666665e-07, "loss": 0.0079, "reward": 1.4061007499694824, "reward_mean": 1.4061007499694824, "reward_std": 0.03468426316976547, "rewards/iou_timestamp_reward": 0.4061008095741272, "rewards/t_format_reward": 1.0, "step": 271 }, { "advantages": -3.1311064958572388e-06, "completion_length": 15.75, "epoch": 0.09066666666666667, "grad_norm": 11.547009468078613, "kl": 0.255859375, "learning_rate": 9.093333333333333e-07, "loss": 0.0102, "reward": 1.6173896789550781, "reward_mean": 1.6173896789550781, "reward_std": 0.07738161087036133, "rewards/iou_timestamp_reward": 0.6173896789550781, "rewards/t_format_reward": 1.0, "step": 272 }, { "advantages": 5.587935447692871e-09, "completion_length": 222.125, "epoch": 0.091, "grad_norm": 2.137578248977661, "kl": 0.07568359375, "learning_rate": 9.09e-07, "loss": 0.003, "reward": 0.47939372062683105, "reward_mean": 0.47939372062683105, "reward_std": 0.13272616267204285, "rewards/a_meteor_reward": 0.47939372062683105, "step": 273 }, { "advantages": -1.2442469596862793e-06, "completion_length": 98.375, "epoch": 0.09133333333333334, "grad_norm": 5.9495768547058105, "kl": 0.318359375, "learning_rate": 9.086666666666666e-07, "loss": 0.0127, "reward": 0.7139204740524292, "reward_mean": 0.7139204740524292, "reward_std": 0.06265301257371902, "rewards/a_meteor_reward": 0.7139204740524292, "step": 274 }, { "advantages": 7.897615432739258e-07, "completion_length": 16.25, "epoch": 0.09166666666666666, "grad_norm": 16.091691970825195, "kl": 0.25390625, "learning_rate": 9.083333333333332e-07, "loss": 0.0102, "reward": 1.6614896059036255, "reward_mean": 1.6614896059036255, "reward_std": 0.059912510216236115, "rewards/iou_timestamp_reward": 0.6614896655082703, "rewards/t_format_reward": 1.0, "step": 275 }, { "advantages": 2.3469328880310059e-07, "completion_length": 81.0625, "epoch": 0.092, "grad_norm": 6.28101110458374, "kl": 0.251953125, "learning_rate": 9.08e-07, "loss": 0.0101, "reward": 0.7628781199455261, "reward_mean": 0.7628781199455261, "reward_std": 0.08816389739513397, "rewards/a_meteor_reward": 0.7628781199455261, "step": 276 }, { "advantages": -2.644956111907959e-07, "completion_length": 16.0, "epoch": 0.09233333333333334, "grad_norm": 8.454538345336914, "kl": 0.326171875, "learning_rate": 9.076666666666666e-07, "loss": 0.013, "reward": 1.397200345993042, "reward_mean": 1.397200345993042, "reward_std": 0.06599199026823044, "rewards/iou_timestamp_reward": 0.397200345993042, "rewards/t_format_reward": 1.0, "step": 277 }, { "advantages": -1.5273690223693848e-07, "completion_length": 118.4375, "epoch": 0.09266666666666666, "grad_norm": 7.562681198120117, "kl": 0.33984375, "learning_rate": 9.073333333333333e-07, "loss": 0.0136, "reward": 0.6210438013076782, "reward_mean": 0.6210438013076782, "reward_std": 0.0653066337108612, "rewards/a_meteor_reward": 0.6210438013076782, "step": 278 }, { "advantages": 1.7136335372924805e-07, "completion_length": 144.5625, "epoch": 0.093, "grad_norm": 5.136427402496338, "kl": 0.1572265625, "learning_rate": 9.07e-07, "loss": 0.0063, "reward": 0.2550972104072571, "reward_mean": 0.2550972104072571, "reward_std": 0.04984430596232414, "rewards/v_meteor_reward": 0.2550972104072571, "step": 279 }, { "advantages": -4.600733518600464e-07, "completion_length": 132.0, "epoch": 0.09333333333333334, "grad_norm": 4.513078689575195, "kl": 0.3125, "learning_rate": 9.066666666666665e-07, "loss": 0.0125, "reward": 0.5923042297363281, "reward_mean": 0.5923042297363281, "reward_std": 0.04273742809891701, "rewards/a_meteor_reward": 0.5923042297363281, "step": 280 }, { "advantages": 2.421438694000244e-07, "completion_length": 233.5, "epoch": 0.09366666666666666, "grad_norm": 1.3806575536727905, "kl": 0.0595703125, "learning_rate": 9.063333333333333e-07, "loss": 0.0024, "reward": 0.4868759512901306, "reward_mean": 0.4868759512901306, "reward_std": 0.04184426739811897, "rewards/a_meteor_reward": 0.4868759512901306, "step": 281 }, { "advantages": -6.05359673500061e-08, "completion_length": 70.75, "epoch": 0.094, "grad_norm": 6.465293884277344, "kl": 0.28125, "learning_rate": 9.06e-07, "loss": 0.0112, "reward": 0.5952208638191223, "reward_mean": 0.5952208638191223, "reward_std": 0.03992389142513275, "rewards/a_meteor_reward": 0.5952208638191223, "step": 282 }, { "advantages": 1.2814998626708984e-06, "completion_length": 139.0, "epoch": 0.09433333333333334, "grad_norm": 2.529193162918091, "kl": 0.11279296875, "learning_rate": 9.056666666666666e-07, "loss": 0.0045, "reward": 0.703646183013916, "reward_mean": 0.703646183013916, "reward_std": 0.06034284085035324, "rewards/a_meteor_reward": 0.703646183013916, "step": 283 }, { "advantages": -1.952052116394043e-06, "completion_length": 14.5, "epoch": 0.09466666666666666, "grad_norm": 10.243638038635254, "kl": 0.2490234375, "learning_rate": 9.053333333333332e-07, "loss": 0.01, "reward": 1.6606011390686035, "reward_mean": 1.6606011390686035, "reward_std": 0.0792248472571373, "rewards/iou_timestamp_reward": 0.6606011390686035, "rewards/t_format_reward": 1.0, "step": 284 }, { "advantages": -7.078051567077637e-07, "completion_length": 175.75, "epoch": 0.095, "grad_norm": 2.7919864654541016, "kl": 0.1787109375, "learning_rate": 9.05e-07, "loss": 0.0072, "reward": 0.8790655732154846, "reward_mean": 0.8790655732154846, "reward_std": 0.044031091034412384, "rewards/a_meteor_reward": 0.8790655732154846, "step": 285 }, { "advantages": -2.738088369369507e-07, "completion_length": 15.625, "epoch": 0.09533333333333334, "grad_norm": 11.644107818603516, "kl": 0.205078125, "learning_rate": 9.046666666666666e-07, "loss": 0.0082, "reward": 1.5939452648162842, "reward_mean": 1.5939452648162842, "reward_std": 0.13681641221046448, "rewards/iou_timestamp_reward": 0.593945324420929, "rewards/t_format_reward": 1.0, "step": 286 }, { "advantages": -3.725290298461914e-09, "completion_length": 73.0, "epoch": 0.09566666666666666, "grad_norm": 6.851900100708008, "kl": 0.2734375, "learning_rate": 9.043333333333333e-07, "loss": 0.0109, "reward": 0.40309739112854004, "reward_mean": 0.40309739112854004, "reward_std": 0.11617565155029297, "rewards/v_meteor_reward": 0.40309739112854004, "step": 287 }, { "advantages": 6.705522537231445e-08, "completion_length": 97.875, "epoch": 0.096, "grad_norm": 5.308900356292725, "kl": 0.1806640625, "learning_rate": 9.039999999999999e-07, "loss": 0.0072, "reward": 0.37150847911834717, "reward_mean": 0.37150847911834717, "reward_std": 0.07315857708454132, "rewards/v_meteor_reward": 0.37150847911834717, "step": 288 }, { "advantages": 4.805624485015869e-07, "completion_length": 15.8125, "epoch": 0.09633333333333334, "grad_norm": 7.771056175231934, "kl": 0.251953125, "learning_rate": 9.036666666666666e-07, "loss": 0.0101, "reward": 1.5399010181427002, "reward_mean": 1.5399010181427002, "reward_std": 0.0268818661570549, "rewards/iou_timestamp_reward": 0.5399009585380554, "rewards/t_format_reward": 1.0, "step": 289 }, { "advantages": -2.9802322387695312e-08, "completion_length": 29.125, "epoch": 0.09666666666666666, "grad_norm": 9.123204231262207, "kl": 0.62890625, "learning_rate": 9.033333333333333e-07, "loss": 0.0251, "reward": 0.5202202200889587, "reward_mean": 0.5202202200889587, "reward_std": 0.1131030023097992, "rewards/a_meteor_reward": 0.5202202200889587, "step": 290 }, { "advantages": -1.30385160446167e-07, "completion_length": 181.0625, "epoch": 0.097, "grad_norm": 2.6846866607666016, "kl": 0.10302734375, "learning_rate": 9.03e-07, "loss": 0.0041, "reward": 0.7239788770675659, "reward_mean": 0.7239788770675659, "reward_std": 0.0640561580657959, "rewards/a_meteor_reward": 0.7239788770675659, "step": 291 }, { "advantages": -2.0675361156463623e-07, "completion_length": 58.6875, "epoch": 0.09733333333333333, "grad_norm": 5.441228866577148, "kl": 0.34765625, "learning_rate": 9.026666666666665e-07, "loss": 0.0139, "reward": 0.729315996170044, "reward_mean": 0.729315996170044, "reward_std": 0.06628341972827911, "rewards/a_meteor_reward": 0.729315996170044, "step": 292 }, { "advantages": 2.0079314708709717e-06, "completion_length": 16.0, "epoch": 0.09766666666666667, "grad_norm": 10.64461898803711, "kl": 0.2890625, "learning_rate": 9.023333333333333e-07, "loss": 0.0115, "reward": 1.7669801712036133, "reward_mean": 1.7669801712036133, "reward_std": 0.06640756875276566, "rewards/iou_timestamp_reward": 0.7669802904129028, "rewards/t_format_reward": 1.0, "step": 293 }, { "advantages": -4.380941390991211e-06, "completion_length": 15.0, "epoch": 0.098, "grad_norm": 9.31991958618164, "kl": 0.232421875, "learning_rate": 9.02e-07, "loss": 0.0093, "reward": 1.6886277198791504, "reward_mean": 1.6886277198791504, "reward_std": 0.015501173213124275, "rewards/iou_timestamp_reward": 0.6886276006698608, "rewards/t_format_reward": 1.0, "step": 294 }, { "advantages": -5.587935447692871e-08, "completion_length": 79.6875, "epoch": 0.09833333333333333, "grad_norm": 6.343013763427734, "kl": 0.1826171875, "learning_rate": 9.016666666666666e-07, "loss": 0.0073, "reward": 0.37936294078826904, "reward_mean": 0.37936294078826904, "reward_std": 0.09278326481580734, "rewards/v_meteor_reward": 0.37936294078826904, "step": 295 }, { "advantages": -3.725290298461914e-09, "completion_length": 86.6875, "epoch": 0.09866666666666667, "grad_norm": 5.406430244445801, "kl": 0.15625, "learning_rate": 9.013333333333333e-07, "loss": 0.0062, "reward": 0.32385706901550293, "reward_mean": 0.32385706901550293, "reward_std": 0.07772045582532883, "rewards/v_meteor_reward": 0.32385706901550293, "step": 296 }, { "advantages": -2.9802322387695312e-08, "completion_length": 92.0625, "epoch": 0.099, "grad_norm": 5.831965446472168, "kl": 0.1572265625, "learning_rate": 9.01e-07, "loss": 0.0063, "reward": 0.34471675753593445, "reward_mean": 0.34471675753593445, "reward_std": 0.06440328061580658, "rewards/v_meteor_reward": 0.34471675753593445, "step": 297 }, { "advantages": -5.494803190231323e-08, "completion_length": 101.875, "epoch": 0.09933333333333333, "grad_norm": 5.301065444946289, "kl": 0.171875, "learning_rate": 9.006666666666666e-07, "loss": 0.0069, "reward": 0.4547102451324463, "reward_mean": 0.4547102451324463, "reward_std": 0.09205565601587296, "rewards/v_meteor_reward": 0.4547102451324463, "step": 298 }, { "advantages": 7.860362529754639e-07, "completion_length": 14.5, "epoch": 0.09966666666666667, "grad_norm": 12.130569458007812, "kl": 0.28125, "learning_rate": 9.003333333333333e-07, "loss": 0.0112, "reward": 1.5610377788543701, "reward_mean": 1.5610377788543701, "reward_std": 0.03920230641961098, "rewards/iou_timestamp_reward": 0.5610377788543701, "rewards/t_format_reward": 1.0, "step": 299 }, { "advantages": -8.940696716308594e-08, "completion_length": 149.6875, "epoch": 0.1, "grad_norm": 2.1052677631378174, "kl": 0.10986328125, "learning_rate": 9e-07, "loss": 0.0044, "reward": 0.7866935729980469, "reward_mean": 0.7866935729980469, "reward_std": 0.023218370974063873, "rewards/a_meteor_reward": 0.7866935729980469, "step": 300 }, { "advantages": 1.1920928955078125e-07, "completion_length": 97.125, "epoch": 0.10033333333333333, "grad_norm": 6.952653884887695, "kl": 0.357421875, "learning_rate": 8.996666666666665e-07, "loss": 0.0143, "reward": 0.6042121052742004, "reward_mean": 0.6042121052742004, "reward_std": 0.07144323736429214, "rewards/a_meteor_reward": 0.6042121052742004, "step": 301 }, { "advantages": 2.551823854446411e-07, "completion_length": 15.1875, "epoch": 0.10066666666666667, "grad_norm": 17.036930084228516, "kl": 0.30859375, "learning_rate": 8.993333333333333e-07, "loss": 0.0123, "reward": 1.255386471748352, "reward_mean": 1.255386471748352, "reward_std": 0.15244929492473602, "rewards/iou_timestamp_reward": 0.25538644194602966, "rewards/t_format_reward": 1.0, "step": 302 }, { "advantages": -4.0978193283081055e-08, "completion_length": 87.25, "epoch": 0.101, "grad_norm": 5.927923679351807, "kl": 0.234375, "learning_rate": 8.99e-07, "loss": 0.0094, "reward": 0.3465459942817688, "reward_mean": 0.3465459942817688, "reward_std": 0.06658230721950531, "rewards/v_meteor_reward": 0.3465459942817688, "step": 303 }, { "advantages": -3.3527612686157227e-08, "completion_length": 149.0, "epoch": 0.10133333333333333, "grad_norm": 5.349584579467773, "kl": 0.162109375, "learning_rate": 8.986666666666666e-07, "loss": 0.0065, "reward": 0.6379706859588623, "reward_mean": 0.6379706859588623, "reward_std": 0.11043699085712433, "rewards/a_meteor_reward": 0.6379706859588623, "step": 304 }, { "advantages": 5.029141902923584e-08, "completion_length": 47.1875, "epoch": 0.10166666666666667, "grad_norm": 7.831838607788086, "kl": 0.4453125, "learning_rate": 8.983333333333332e-07, "loss": 0.0178, "reward": 0.38552194833755493, "reward_mean": 0.38552194833755493, "reward_std": 0.07513781636953354, "rewards/v_meteor_reward": 0.38552194833755493, "step": 305 }, { "advantages": 3.5762786865234375e-07, "completion_length": 84.4375, "epoch": 0.102, "grad_norm": 5.7606024742126465, "kl": 0.3046875, "learning_rate": 8.98e-07, "loss": 0.0122, "reward": 0.680375874042511, "reward_mean": 0.680375874042511, "reward_std": 0.1464424580335617, "rewards/a_meteor_reward": 0.680375874042511, "step": 306 }, { "advantages": -1.6205012798309326e-07, "completion_length": 146.9375, "epoch": 0.10233333333333333, "grad_norm": 4.908440589904785, "kl": 0.28125, "learning_rate": 8.976666666666666e-07, "loss": 0.0113, "reward": 0.6811435222625732, "reward_mean": 0.6811435222625732, "reward_std": 0.05242307484149933, "rewards/a_meteor_reward": 0.6811435222625732, "step": 307 }, { "advantages": 5.774199962615967e-07, "completion_length": 325.375, "epoch": 0.10266666666666667, "grad_norm": 2.6991305351257324, "kl": 0.109375, "learning_rate": 8.973333333333333e-07, "loss": 0.0044, "reward": 0.5960113406181335, "reward_mean": 0.5960113406181335, "reward_std": 0.024249427020549774, "rewards/a_meteor_reward": 0.5960113406181335, "step": 308 }, { "advantages": -3.427267074584961e-07, "completion_length": 108.625, "epoch": 0.103, "grad_norm": 5.508455753326416, "kl": 0.359375, "learning_rate": 8.969999999999999e-07, "loss": 0.0144, "reward": 0.6772817373275757, "reward_mean": 0.6772817373275757, "reward_std": 0.05800691619515419, "rewards/a_meteor_reward": 0.6772817373275757, "step": 309 }, { "advantages": -1.862645149230957e-08, "completion_length": 60.5625, "epoch": 0.10333333333333333, "grad_norm": 6.735232830047607, "kl": 0.1884765625, "learning_rate": 8.966666666666666e-07, "loss": 0.0075, "reward": 0.4111091196537018, "reward_mean": 0.4111091196537018, "reward_std": 0.07136540114879608, "rewards/v_meteor_reward": 0.4111091196537018, "step": 310 }, { "advantages": 1.7881393432617188e-07, "completion_length": 45.75, "epoch": 0.10366666666666667, "grad_norm": 7.271182060241699, "kl": 0.376953125, "learning_rate": 8.963333333333333e-07, "loss": 0.0151, "reward": 0.4096001982688904, "reward_mean": 0.4096001982688904, "reward_std": 0.0682385265827179, "rewards/v_meteor_reward": 0.4096001982688904, "step": 311 }, { "advantages": 1.996755599975586e-06, "completion_length": 15.25, "epoch": 0.104, "grad_norm": 16.09444236755371, "kl": 0.294921875, "learning_rate": 8.96e-07, "loss": 0.0118, "reward": 1.532212257385254, "reward_mean": 1.532212257385254, "reward_std": 0.06681720167398453, "rewards/iou_timestamp_reward": 0.5322123765945435, "rewards/t_format_reward": 1.0, "step": 312 }, { "advantages": 3.166496753692627e-08, "completion_length": 66.9375, "epoch": 0.10433333333333333, "grad_norm": 6.6974029541015625, "kl": 0.138671875, "learning_rate": 8.956666666666667e-07, "loss": 0.0056, "reward": 0.3487696349620819, "reward_mean": 0.3487696349620819, "reward_std": 0.07501904666423798, "rewards/v_meteor_reward": 0.3487696349620819, "step": 313 }, { "advantages": -1.7415732145309448e-07, "completion_length": 198.125, "epoch": 0.10466666666666667, "grad_norm": 2.103353261947632, "kl": 0.09619140625, "learning_rate": 8.953333333333332e-07, "loss": 0.0038, "reward": 0.5382170081138611, "reward_mean": 0.5382170081138611, "reward_std": 0.056871213018894196, "rewards/a_meteor_reward": 0.5382170081138611, "step": 314 }, { "advantages": -1.4426186680793762e-06, "completion_length": 14.0, "epoch": 0.105, "grad_norm": 18.082101821899414, "kl": 0.197265625, "learning_rate": 8.95e-07, "loss": 0.0079, "reward": 1.7176992893218994, "reward_mean": 1.7176992893218994, "reward_std": 0.09550988674163818, "rewards/iou_timestamp_reward": 0.7176992297172546, "rewards/t_format_reward": 1.0, "step": 315 }, { "advantages": -2.7194619178771973e-07, "completion_length": 15.0, "epoch": 0.10533333333333333, "grad_norm": 11.390405654907227, "kl": 0.16796875, "learning_rate": 8.946666666666667e-07, "loss": 0.0067, "reward": 1.6334500312805176, "reward_mean": 1.6334500312805176, "reward_std": 0.041434433311223984, "rewards/iou_timestamp_reward": 0.6334500312805176, "rewards/t_format_reward": 1.0, "step": 316 }, { "advantages": 2.2351741790771484e-08, "completion_length": 54.9375, "epoch": 0.10566666666666667, "grad_norm": 7.881657600402832, "kl": 0.173828125, "learning_rate": 8.943333333333333e-07, "loss": 0.007, "reward": 0.35311973094940186, "reward_mean": 0.35311973094940186, "reward_std": 0.05284072458744049, "rewards/v_meteor_reward": 0.35311973094940186, "step": 317 }, { "advantages": -4.0978193283081055e-08, "completion_length": 188.0625, "epoch": 0.106, "grad_norm": 3.957944393157959, "kl": 0.1416015625, "learning_rate": 8.939999999999999e-07, "loss": 0.0057, "reward": 0.47446733713150024, "reward_mean": 0.47446733713150024, "reward_std": 0.0627574548125267, "rewards/a_meteor_reward": 0.47446733713150024, "step": 318 }, { "advantages": -1.2665987014770508e-07, "completion_length": 56.0, "epoch": 0.10633333333333334, "grad_norm": 6.095126152038574, "kl": 0.373046875, "learning_rate": 8.936666666666667e-07, "loss": 0.0149, "reward": 0.4184494912624359, "reward_mean": 0.4184494912624359, "reward_std": 0.09909875690937042, "rewards/v_meteor_reward": 0.4184494912624359, "step": 319 }, { "advantages": 1.0803341865539551e-07, "completion_length": 162.875, "epoch": 0.10666666666666667, "grad_norm": 4.797267436981201, "kl": 0.1396484375, "learning_rate": 8.933333333333333e-07, "loss": 0.0056, "reward": 0.6469359397888184, "reward_mean": 0.6469359397888184, "reward_std": 0.09621260315179825, "rewards/a_meteor_reward": 0.6469359397888184, "step": 320 }, { "advantages": -1.862645149230957e-09, "completion_length": 69.5625, "epoch": 0.107, "grad_norm": 6.288244247436523, "kl": 0.1728515625, "learning_rate": 8.93e-07, "loss": 0.0069, "reward": 0.3703422546386719, "reward_mean": 0.3703422546386719, "reward_std": 0.0634663999080658, "rewards/v_meteor_reward": 0.3703422546386719, "step": 321 }, { "advantages": 4.991888999938965e-07, "completion_length": 16.5, "epoch": 0.10733333333333334, "grad_norm": 9.20387077331543, "kl": 0.27734375, "learning_rate": 8.926666666666666e-07, "loss": 0.0111, "reward": 1.667521357536316, "reward_mean": 1.667521357536316, "reward_std": 0.05917797237634659, "rewards/iou_timestamp_reward": 0.6675214171409607, "rewards/t_format_reward": 1.0, "step": 322 }, { "advantages": -3.7997961044311523e-07, "completion_length": 168.8125, "epoch": 0.10766666666666666, "grad_norm": 3.5032765865325928, "kl": 0.142578125, "learning_rate": 8.923333333333333e-07, "loss": 0.0057, "reward": 0.6042474508285522, "reward_mean": 0.6042474508285522, "reward_std": 0.02732779085636139, "rewards/a_meteor_reward": 0.6042474508285522, "step": 323 }, { "advantages": -5.550682544708252e-07, "completion_length": 157.375, "epoch": 0.108, "grad_norm": 2.9922189712524414, "kl": 0.1171875, "learning_rate": 8.92e-07, "loss": 0.0047, "reward": 0.5011246204376221, "reward_mean": 0.5011246204376221, "reward_std": 0.06860694289207458, "rewards/a_meteor_reward": 0.5011246204376221, "step": 324 }, { "advantages": 4.284083843231201e-07, "completion_length": 79.8125, "epoch": 0.10833333333333334, "grad_norm": 4.959620475769043, "kl": 0.275390625, "learning_rate": 8.916666666666667e-07, "loss": 0.011, "reward": 0.781048059463501, "reward_mean": 0.781048059463501, "reward_std": 0.05344407260417938, "rewards/a_meteor_reward": 0.781048059463501, "step": 325 }, { "advantages": 6.332993507385254e-08, "completion_length": 51.3125, "epoch": 0.10866666666666666, "grad_norm": 8.160191535949707, "kl": 0.2001953125, "learning_rate": 8.913333333333332e-07, "loss": 0.008, "reward": 0.3556007742881775, "reward_mean": 0.3556007742881775, "reward_std": 0.06602829694747925, "rewards/v_meteor_reward": 0.3556007742881775, "step": 326 }, { "advantages": 1.862645149230957e-07, "completion_length": 82.875, "epoch": 0.109, "grad_norm": 8.974308967590332, "kl": 0.2431640625, "learning_rate": 8.91e-07, "loss": 0.0097, "reward": 0.4185582995414734, "reward_mean": 0.4185582995414734, "reward_std": 0.08331790566444397, "rewards/v_meteor_reward": 0.4185582995414734, "step": 327 }, { "advantages": -1.601874828338623e-07, "completion_length": 68.0, "epoch": 0.10933333333333334, "grad_norm": 5.632379531860352, "kl": 0.333984375, "learning_rate": 8.906666666666667e-07, "loss": 0.0134, "reward": 0.32356855273246765, "reward_mean": 0.32356855273246765, "reward_std": 0.0665157288312912, "rewards/v_meteor_reward": 0.32356855273246765, "step": 328 }, { "advantages": -1.862645149230957e-08, "completion_length": 69.625, "epoch": 0.10966666666666666, "grad_norm": 5.999619960784912, "kl": 0.294921875, "learning_rate": 8.903333333333333e-07, "loss": 0.0118, "reward": 0.30482929944992065, "reward_mean": 0.30482929944992065, "reward_std": 0.06696130335330963, "rewards/v_meteor_reward": 0.30482929944992065, "step": 329 }, { "advantages": 1.8291175365447998e-06, "completion_length": 14.75, "epoch": 0.11, "grad_norm": 7.805632591247559, "kl": 0.2578125, "learning_rate": 8.9e-07, "loss": 0.0103, "reward": 1.838677167892456, "reward_mean": 1.838677167892456, "reward_std": 0.029479648917913437, "rewards/iou_timestamp_reward": 0.8386772274971008, "rewards/t_format_reward": 1.0, "step": 330 }, { "advantages": 2.1047890186309814e-06, "completion_length": 15.75, "epoch": 0.11033333333333334, "grad_norm": 11.323892593383789, "kl": 0.318359375, "learning_rate": 8.896666666666666e-07, "loss": 0.0127, "reward": 1.9247452020645142, "reward_mean": 1.9247452020645142, "reward_std": 0.02209809422492981, "rewards/iou_timestamp_reward": 0.9247453212738037, "rewards/t_format_reward": 1.0, "step": 331 }, { "advantages": -1.7136335372924805e-07, "completion_length": 15.5, "epoch": 0.11066666666666666, "grad_norm": 8.655220985412598, "kl": 0.2734375, "learning_rate": 8.893333333333333e-07, "loss": 0.0109, "reward": 1.733642578125, "reward_mean": 1.733642578125, "reward_std": 0.05369095504283905, "rewards/iou_timestamp_reward": 0.733642578125, "rewards/t_format_reward": 1.0, "step": 332 }, { "advantages": -2.2351741790771484e-07, "completion_length": 165.75, "epoch": 0.111, "grad_norm": 2.4204933643341064, "kl": 0.08203125, "learning_rate": 8.89e-07, "loss": 0.0033, "reward": 0.520375669002533, "reward_mean": 0.520375669002533, "reward_std": 0.07777003943920135, "rewards/a_meteor_reward": 0.520375669002533, "step": 333 }, { "advantages": 1.1175870895385742e-08, "completion_length": 128.0625, "epoch": 0.11133333333333334, "grad_norm": 5.206874847412109, "kl": 0.255859375, "learning_rate": 8.886666666666667e-07, "loss": 0.0102, "reward": 0.4714474081993103, "reward_mean": 0.4714474081993103, "reward_std": 0.03993505612015724, "rewards/a_meteor_reward": 0.4714474081993103, "step": 334 }, { "advantages": 3.3527612686157227e-07, "completion_length": 155.8125, "epoch": 0.11166666666666666, "grad_norm": 3.0799665451049805, "kl": 0.1328125, "learning_rate": 8.883333333333332e-07, "loss": 0.0053, "reward": 0.719544529914856, "reward_mean": 0.719544529914856, "reward_std": 0.0775858610868454, "rewards/a_meteor_reward": 0.719544529914856, "step": 335 }, { "advantages": 5.587935447692871e-08, "completion_length": 159.125, "epoch": 0.112, "grad_norm": 2.141324520111084, "kl": 0.09033203125, "learning_rate": 8.88e-07, "loss": 0.0036, "reward": 0.5259382724761963, "reward_mean": 0.5259382724761963, "reward_std": 0.07925127446651459, "rewards/a_meteor_reward": 0.5259382724761963, "step": 336 }, { "advantages": 1.2386590242385864e-07, "completion_length": 107.375, "epoch": 0.11233333333333333, "grad_norm": 4.71900749206543, "kl": 0.1796875, "learning_rate": 8.876666666666667e-07, "loss": 0.0072, "reward": 0.5021330118179321, "reward_mean": 0.5021330118179321, "reward_std": 0.1164492666721344, "rewards/a_meteor_reward": 0.5021330118179321, "step": 337 }, { "advantages": 2.384185791015625e-07, "completion_length": 206.1875, "epoch": 0.11266666666666666, "grad_norm": 3.83905029296875, "kl": 0.1015625, "learning_rate": 8.873333333333333e-07, "loss": 0.0041, "reward": 0.4734407067298889, "reward_mean": 0.4734407067298889, "reward_std": 0.08214977383613586, "rewards/a_meteor_reward": 0.4734407067298889, "step": 338 }, { "advantages": 1.8905848264694214e-07, "completion_length": 153.0, "epoch": 0.113, "grad_norm": 2.2874791622161865, "kl": 0.11181640625, "learning_rate": 8.869999999999999e-07, "loss": 0.0045, "reward": 0.6050254702568054, "reward_mean": 0.6050254702568054, "reward_std": 0.062094707041978836, "rewards/a_meteor_reward": 0.6050254702568054, "step": 339 }, { "advantages": 6.658956408500671e-08, "completion_length": 203.75, "epoch": 0.11333333333333333, "grad_norm": 5.015377044677734, "kl": 0.138671875, "learning_rate": 8.866666666666667e-07, "loss": 0.0055, "reward": 0.59549880027771, "reward_mean": 0.59549880027771, "reward_std": 0.10875111818313599, "rewards/a_meteor_reward": 0.59549880027771, "step": 340 }, { "advantages": 2.644956111907959e-07, "completion_length": 111.9375, "epoch": 0.11366666666666667, "grad_norm": 2.260815143585205, "kl": 0.1181640625, "learning_rate": 8.863333333333333e-07, "loss": 0.0047, "reward": 0.5617192983627319, "reward_mean": 0.5617192983627319, "reward_std": 0.054143164306879044, "rewards/a_meteor_reward": 0.5617192983627319, "step": 341 }, { "advantages": 2.8312206268310547e-07, "completion_length": 64.375, "epoch": 0.114, "grad_norm": 4.273818492889404, "kl": 0.3046875, "learning_rate": 8.86e-07, "loss": 0.0122, "reward": 0.7719604969024658, "reward_mean": 0.7719604969024658, "reward_std": 0.04438149929046631, "rewards/a_meteor_reward": 0.7719604969024658, "step": 342 }, { "advantages": 1.3783574104309082e-07, "completion_length": 73.1875, "epoch": 0.11433333333333333, "grad_norm": 5.714700222015381, "kl": 0.138671875, "learning_rate": 8.856666666666666e-07, "loss": 0.0055, "reward": 0.42620038986206055, "reward_mean": 0.42620038986206055, "reward_std": 0.06276571750640869, "rewards/v_meteor_reward": 0.42620038986206055, "step": 343 }, { "advantages": 3.725290298461914e-07, "completion_length": 83.3125, "epoch": 0.11466666666666667, "grad_norm": 5.606874942779541, "kl": 0.177734375, "learning_rate": 8.853333333333332e-07, "loss": 0.0071, "reward": 0.42054587602615356, "reward_mean": 0.42054587602615356, "reward_std": 0.05243799835443497, "rewards/v_meteor_reward": 0.42054587602615356, "step": 344 }, { "advantages": 3.5762786865234375e-07, "completion_length": 175.625, "epoch": 0.115, "grad_norm": 2.9721455574035645, "kl": 0.09521484375, "learning_rate": 8.85e-07, "loss": 0.0038, "reward": 0.5709729790687561, "reward_mean": 0.5709729790687561, "reward_std": 0.07296385616064072, "rewards/a_meteor_reward": 0.5709729790687561, "step": 345 }, { "advantages": -1.6763806343078613e-08, "completion_length": 41.125, "epoch": 0.11533333333333333, "grad_norm": 7.051071643829346, "kl": 0.47265625, "learning_rate": 8.846666666666667e-07, "loss": 0.019, "reward": 0.6513578295707703, "reward_mean": 0.6513578295707703, "reward_std": 0.1478959321975708, "rewards/a_meteor_reward": 0.6513578295707703, "step": 346 }, { "advantages": -1.043081283569336e-06, "completion_length": 85.75, "epoch": 0.11566666666666667, "grad_norm": 4.402799129486084, "kl": 0.37109375, "learning_rate": 8.843333333333332e-07, "loss": 0.0149, "reward": 0.724353551864624, "reward_mean": 0.724353551864624, "reward_std": 0.04191334545612335, "rewards/a_meteor_reward": 0.724353551864624, "step": 347 }, { "advantages": -6.034970283508301e-07, "completion_length": 15.5, "epoch": 0.116, "grad_norm": 11.08935832977295, "kl": 0.228515625, "learning_rate": 8.839999999999999e-07, "loss": 0.0092, "reward": 1.6454495191574097, "reward_mean": 1.6454495191574097, "reward_std": 0.03940470516681671, "rewards/iou_timestamp_reward": 0.6454495191574097, "rewards/t_format_reward": 1.0, "step": 348 }, { "advantages": -5.476176738739014e-07, "completion_length": 57.8125, "epoch": 0.11633333333333333, "grad_norm": 7.421530723571777, "kl": 0.359375, "learning_rate": 8.836666666666667e-07, "loss": 0.0144, "reward": 0.6884969472885132, "reward_mean": 0.6884969472885132, "reward_std": 0.08752787113189697, "rewards/a_meteor_reward": 0.6884969472885132, "step": 349 }, { "advantages": 4.842877388000488e-08, "completion_length": 138.25, "epoch": 0.11666666666666667, "grad_norm": 3.8197929859161377, "kl": 0.1845703125, "learning_rate": 8.833333333333333e-07, "loss": 0.0074, "reward": 0.45993053913116455, "reward_mean": 0.45993053913116455, "reward_std": 0.08795689046382904, "rewards/a_meteor_reward": 0.45993053913116455, "step": 350 }, { "advantages": 1.6391277313232422e-07, "completion_length": 110.1875, "epoch": 0.117, "grad_norm": 3.0333986282348633, "kl": 0.1435546875, "learning_rate": 8.83e-07, "loss": 0.0057, "reward": 0.4948909282684326, "reward_mean": 0.4948909282684326, "reward_std": 0.052590832114219666, "rewards/a_meteor_reward": 0.4948909282684326, "step": 351 }, { "advantages": -9.220093488693237e-08, "completion_length": 46.6875, "epoch": 0.11733333333333333, "grad_norm": 5.3001885414123535, "kl": 0.39453125, "learning_rate": 8.826666666666666e-07, "loss": 0.0157, "reward": 0.6959344148635864, "reward_mean": 0.6959344148635864, "reward_std": 0.04604612663388252, "rewards/a_meteor_reward": 0.6959344148635864, "step": 352 }, { "advantages": -7.078051567077637e-08, "completion_length": 92.5, "epoch": 0.11766666666666667, "grad_norm": 2.857538938522339, "kl": 0.1572265625, "learning_rate": 8.823333333333333e-07, "loss": 0.0063, "reward": 0.5875868797302246, "reward_mean": 0.5875868797302246, "reward_std": 0.05038633197546005, "rewards/a_meteor_reward": 0.5875868797302246, "step": 353 }, { "advantages": -7.450580596923828e-09, "completion_length": 111.5625, "epoch": 0.118, "grad_norm": 5.185698509216309, "kl": 0.197265625, "learning_rate": 8.82e-07, "loss": 0.0079, "reward": 0.3631853461265564, "reward_mean": 0.3631853461265564, "reward_std": 0.09029058367013931, "rewards/v_meteor_reward": 0.3631853461265564, "step": 354 }, { "advantages": 7.450580596923828e-09, "completion_length": 88.625, "epoch": 0.11833333333333333, "grad_norm": 4.940003395080566, "kl": 0.2021484375, "learning_rate": 8.816666666666667e-07, "loss": 0.0081, "reward": 0.4644373655319214, "reward_mean": 0.4644373655319214, "reward_std": 0.06608481705188751, "rewards/v_meteor_reward": 0.4644373655319214, "step": 355 }, { "advantages": 8.512288331985474e-07, "completion_length": 14.75, "epoch": 0.11866666666666667, "grad_norm": 8.494638442993164, "kl": 0.255859375, "learning_rate": 8.813333333333332e-07, "loss": 0.0103, "reward": 1.8151524066925049, "reward_mean": 1.8151524066925049, "reward_std": 0.0354289636015892, "rewards/iou_timestamp_reward": 0.8151522874832153, "rewards/t_format_reward": 1.0, "step": 356 }, { "advantages": -1.0207295417785645e-06, "completion_length": 16.25, "epoch": 0.119, "grad_norm": 9.114838600158691, "kl": 0.298828125, "learning_rate": 8.81e-07, "loss": 0.0119, "reward": 1.546210527420044, "reward_mean": 1.546210527420044, "reward_std": 0.031782642006874084, "rewards/iou_timestamp_reward": 0.546210527420044, "rewards/t_format_reward": 1.0, "step": 357 }, { "advantages": 2.2724270820617676e-06, "completion_length": 15.25, "epoch": 0.11933333333333333, "grad_norm": 11.251458168029785, "kl": 0.294921875, "learning_rate": 8.806666666666667e-07, "loss": 0.0118, "reward": 1.8321483135223389, "reward_mean": 1.8321483135223389, "reward_std": 0.027968470007181168, "rewards/iou_timestamp_reward": 0.8321484327316284, "rewards/t_format_reward": 1.0, "step": 358 }, { "advantages": -8.195638656616211e-08, "completion_length": 38.0, "epoch": 0.11966666666666667, "grad_norm": 12.616169929504395, "kl": 0.48046875, "learning_rate": 8.803333333333333e-07, "loss": 0.0193, "reward": 0.5552762150764465, "reward_mean": 0.5552762150764465, "reward_std": 0.07057227939367294, "rewards/a_meteor_reward": 0.5552762150764465, "step": 359 }, { "advantages": -5.699694156646729e-07, "completion_length": 49.625, "epoch": 0.12, "grad_norm": 5.401580810546875, "kl": 0.40625, "learning_rate": 8.799999999999999e-07, "loss": 0.0162, "reward": 0.8189531564712524, "reward_mean": 0.8189531564712524, "reward_std": 0.028221510350704193, "rewards/a_meteor_reward": 0.8189531564712524, "step": 360 }, { "advantages": -3.46451997756958e-07, "completion_length": 40.5625, "epoch": 0.12033333333333333, "grad_norm": 7.984609127044678, "kl": 0.4140625, "learning_rate": 8.796666666666666e-07, "loss": 0.0166, "reward": 0.6047461032867432, "reward_mean": 0.6047461032867432, "reward_std": 0.07390522956848145, "rewards/a_meteor_reward": 0.6047461032867432, "step": 361 }, { "advantages": 7.450580596923828e-08, "completion_length": 86.125, "epoch": 0.12066666666666667, "grad_norm": 5.915136814117432, "kl": 0.314453125, "learning_rate": 8.793333333333333e-07, "loss": 0.0126, "reward": 0.4425250291824341, "reward_mean": 0.4425250291824341, "reward_std": 0.07172109186649323, "rewards/v_meteor_reward": 0.4425250291824341, "step": 362 }, { "advantages": -1.4761462807655334e-07, "completion_length": 85.8125, "epoch": 0.121, "grad_norm": 5.847969055175781, "kl": 0.140625, "learning_rate": 8.79e-07, "loss": 0.0056, "reward": 0.34082716703414917, "reward_mean": 0.34082716703414917, "reward_std": 0.0712624341249466, "rewards/v_meteor_reward": 0.34082716703414917, "step": 363 }, { "advantages": 2.7939677238464355e-08, "completion_length": 60.875, "epoch": 0.12133333333333333, "grad_norm": 8.9639892578125, "kl": 0.224609375, "learning_rate": 8.786666666666666e-07, "loss": 0.009, "reward": 0.3585376441478729, "reward_mean": 0.3585376441478729, "reward_std": 0.0644926130771637, "rewards/v_meteor_reward": 0.3585376441478729, "step": 364 }, { "advantages": -9.12696123123169e-08, "completion_length": 71.3125, "epoch": 0.12166666666666667, "grad_norm": 6.88213586807251, "kl": 0.310546875, "learning_rate": 8.783333333333332e-07, "loss": 0.0124, "reward": 0.3724985718727112, "reward_mean": 0.3724985718727112, "reward_std": 0.08955898880958557, "rewards/v_meteor_reward": 0.3724985718727112, "step": 365 }, { "advantages": 1.7136335372924805e-07, "completion_length": 71.0, "epoch": 0.122, "grad_norm": 6.1270527839660645, "kl": 0.15234375, "learning_rate": 8.78e-07, "loss": 0.0061, "reward": 0.28166118264198303, "reward_mean": 0.28166118264198303, "reward_std": 0.05546601116657257, "rewards/v_meteor_reward": 0.28166118264198303, "step": 366 }, { "advantages": -9.313225746154785e-08, "completion_length": 54.5, "epoch": 0.12233333333333334, "grad_norm": 7.251705169677734, "kl": 0.28515625, "learning_rate": 8.776666666666667e-07, "loss": 0.0114, "reward": 0.40426215529441833, "reward_mean": 0.40426215529441833, "reward_std": 0.08352222293615341, "rewards/v_meteor_reward": 0.40426215529441833, "step": 367 }, { "advantages": -2.421438694000244e-08, "completion_length": 15.4375, "epoch": 0.12266666666666666, "grad_norm": 20.803001403808594, "kl": 0.267578125, "learning_rate": 8.773333333333332e-07, "loss": 0.0107, "reward": 1.5442235469818115, "reward_mean": 1.5442235469818115, "reward_std": 0.09503461420536041, "rewards/iou_timestamp_reward": 0.5442234873771667, "rewards/t_format_reward": 1.0, "step": 368 }, { "advantages": 6.705522537231445e-08, "completion_length": 15.0, "epoch": 0.123, "grad_norm": 14.887096405029297, "kl": 0.162109375, "learning_rate": 8.769999999999999e-07, "loss": 0.0065, "reward": 1.7359035015106201, "reward_mean": 1.7359035015106201, "reward_std": 0.06549666821956635, "rewards/iou_timestamp_reward": 0.7359035015106201, "rewards/t_format_reward": 1.0, "step": 369 }, { "advantages": 1.8067657947540283e-07, "completion_length": 145.6875, "epoch": 0.12333333333333334, "grad_norm": 4.057080268859863, "kl": 0.1484375, "learning_rate": 8.766666666666667e-07, "loss": 0.0059, "reward": 0.3879455626010895, "reward_mean": 0.3879455626010895, "reward_std": 0.0544019490480423, "rewards/a_meteor_reward": 0.3879455626010895, "step": 370 }, { "advantages": -3.1813979148864746e-06, "completion_length": 16.25, "epoch": 0.12366666666666666, "grad_norm": 7.6623992919921875, "kl": 0.361328125, "learning_rate": 8.763333333333333e-07, "loss": 0.0145, "reward": 1.9155139923095703, "reward_mean": 1.9155139923095703, "reward_std": 0.01717974618077278, "rewards/iou_timestamp_reward": 0.9155138731002808, "rewards/t_format_reward": 1.0, "step": 371 }, { "advantages": 5.774199962615967e-07, "completion_length": 15.75, "epoch": 0.124, "grad_norm": 22.635831832885742, "kl": 0.38671875, "learning_rate": 8.76e-07, "loss": 0.0154, "reward": 1.5255228281021118, "reward_mean": 1.5255228281021118, "reward_std": 0.0887971818447113, "rewards/iou_timestamp_reward": 0.5255228281021118, "rewards/t_format_reward": 1.0, "step": 372 }, { "advantages": 1.043081283569336e-07, "completion_length": 84.1875, "epoch": 0.12433333333333334, "grad_norm": 6.142969131469727, "kl": 0.232421875, "learning_rate": 8.756666666666666e-07, "loss": 0.0093, "reward": 0.36115580797195435, "reward_mean": 0.36115580797195435, "reward_std": 0.10790325701236725, "rewards/v_meteor_reward": 0.36115580797195435, "step": 373 }, { "advantages": -4.842877388000488e-08, "completion_length": 69.1875, "epoch": 0.12466666666666666, "grad_norm": 6.284999370574951, "kl": 0.1484375, "learning_rate": 8.753333333333332e-07, "loss": 0.006, "reward": 0.33787935972213745, "reward_mean": 0.33787935972213745, "reward_std": 0.06941648572683334, "rewards/v_meteor_reward": 0.33787935972213745, "step": 374 }, { "advantages": 2.041459083557129e-06, "completion_length": 14.75, "epoch": 0.125, "grad_norm": 4.943756103515625, "kl": 0.263671875, "learning_rate": 8.75e-07, "loss": 0.0106, "reward": 1.6765477657318115, "reward_mean": 1.6765477657318115, "reward_std": 0.006416885182261467, "rewards/iou_timestamp_reward": 0.6765477657318115, "rewards/t_format_reward": 1.0, "step": 375 }, { "advantages": -1.2945383787155151e-06, "completion_length": 15.5, "epoch": 0.12533333333333332, "grad_norm": 5.972479343414307, "kl": 0.291015625, "learning_rate": 8.746666666666667e-07, "loss": 0.0116, "reward": 1.895212173461914, "reward_mean": 1.895212173461914, "reward_std": 0.04216558486223221, "rewards/iou_timestamp_reward": 0.8952121734619141, "rewards/t_format_reward": 1.0, "step": 376 }, { "advantages": -1.3317912817001343e-07, "completion_length": 99.8125, "epoch": 0.12566666666666668, "grad_norm": 4.591060638427734, "kl": 0.2158203125, "learning_rate": 8.743333333333332e-07, "loss": 0.0086, "reward": 0.7264018058776855, "reward_mean": 0.7264018058776855, "reward_std": 0.04010023921728134, "rewards/a_meteor_reward": 0.7264018058776855, "step": 377 }, { "advantages": -9.387731552124023e-07, "completion_length": 16.0, "epoch": 0.126, "grad_norm": 8.458571434020996, "kl": 0.318359375, "learning_rate": 8.739999999999999e-07, "loss": 0.0127, "reward": 1.7128148078918457, "reward_mean": 1.7128148078918457, "reward_std": 0.03810898959636688, "rewards/iou_timestamp_reward": 0.7128148674964905, "rewards/t_format_reward": 1.0, "step": 378 }, { "advantages": -6.407499313354492e-07, "completion_length": 14.5, "epoch": 0.12633333333333333, "grad_norm": 11.413023948669434, "kl": 0.3203125, "learning_rate": 8.736666666666667e-07, "loss": 0.0128, "reward": 1.9477016925811768, "reward_mean": 1.9477016925811768, "reward_std": 0.016262924298644066, "rewards/iou_timestamp_reward": 0.9477018117904663, "rewards/t_format_reward": 1.0, "step": 379 }, { "advantages": -1.8719583749771118e-07, "completion_length": 71.6875, "epoch": 0.12666666666666668, "grad_norm": 5.683255195617676, "kl": 0.130859375, "learning_rate": 8.733333333333333e-07, "loss": 0.0052, "reward": 0.3362624943256378, "reward_mean": 0.3362624943256378, "reward_std": 0.05595378205180168, "rewards/v_meteor_reward": 0.3362624943256378, "step": 380 }, { "advantages": 6.146728992462158e-08, "completion_length": 62.625, "epoch": 0.127, "grad_norm": 7.246224880218506, "kl": 0.240234375, "learning_rate": 8.729999999999999e-07, "loss": 0.0096, "reward": 0.40315794944763184, "reward_mean": 0.40315794944763184, "reward_std": 0.07833754271268845, "rewards/v_meteor_reward": 0.40315794944763184, "step": 381 }, { "advantages": 1.1362135410308838e-07, "completion_length": 70.8125, "epoch": 0.12733333333333333, "grad_norm": 6.817910194396973, "kl": 0.1552734375, "learning_rate": 8.726666666666666e-07, "loss": 0.0062, "reward": 0.4020038843154907, "reward_mean": 0.4020038843154907, "reward_std": 0.05700753629207611, "rewards/v_meteor_reward": 0.4020038843154907, "step": 382 }, { "advantages": 1.7881393432617188e-07, "completion_length": 65.125, "epoch": 0.12766666666666668, "grad_norm": 5.7200927734375, "kl": 0.248046875, "learning_rate": 8.723333333333333e-07, "loss": 0.0099, "reward": 0.48375755548477173, "reward_mean": 0.48375755548477173, "reward_std": 0.08051072061061859, "rewards/v_meteor_reward": 0.48375755548477173, "step": 383 }, { "advantages": -8.195638656616211e-08, "completion_length": 66.625, "epoch": 0.128, "grad_norm": 6.729382038116455, "kl": 0.205078125, "learning_rate": 8.72e-07, "loss": 0.0082, "reward": 0.42416346073150635, "reward_mean": 0.42416346073150635, "reward_std": 0.11439232528209686, "rewards/v_meteor_reward": 0.42416346073150635, "step": 384 }, { "advantages": -2.3134052753448486e-06, "completion_length": 15.75, "epoch": 0.12833333333333333, "grad_norm": 7.859336853027344, "kl": 0.23046875, "learning_rate": 8.716666666666667e-07, "loss": 0.0092, "reward": 1.9627703428268433, "reward_mean": 1.9627703428268433, "reward_std": 0.014390189200639725, "rewards/iou_timestamp_reward": 0.9627702832221985, "rewards/t_format_reward": 1.0, "step": 385 }, { "advantages": -7.022172212600708e-07, "completion_length": 15.5, "epoch": 0.12866666666666668, "grad_norm": 16.73173713684082, "kl": 0.220703125, "learning_rate": 8.713333333333332e-07, "loss": 0.0088, "reward": 1.5285565853118896, "reward_mean": 1.5285565853118896, "reward_std": 0.06645043939352036, "rewards/iou_timestamp_reward": 0.5285565853118896, "rewards/t_format_reward": 1.0, "step": 386 }, { "advantages": -1.564621925354004e-07, "completion_length": 81.625, "epoch": 0.129, "grad_norm": 5.282042980194092, "kl": 0.3984375, "learning_rate": 8.71e-07, "loss": 0.016, "reward": 0.8288697004318237, "reward_mean": 0.8288697004318237, "reward_std": 0.04231568053364754, "rewards/a_meteor_reward": 0.8288697004318237, "step": 387 }, { "advantages": -9.685754776000977e-08, "completion_length": 87.0, "epoch": 0.12933333333333333, "grad_norm": 6.052459716796875, "kl": 0.1708984375, "learning_rate": 8.706666666666667e-07, "loss": 0.0068, "reward": 0.4815748631954193, "reward_mean": 0.4815748631954193, "reward_std": 0.04548005014657974, "rewards/a_meteor_reward": 0.4815748631954193, "step": 388 }, { "advantages": 3.166496753692627e-08, "completion_length": 100.0625, "epoch": 0.12966666666666668, "grad_norm": 3.8211185932159424, "kl": 0.15234375, "learning_rate": 8.703333333333333e-07, "loss": 0.0061, "reward": 0.4809826910495758, "reward_mean": 0.4809826910495758, "reward_std": 0.08624415099620819, "rewards/a_meteor_reward": 0.4809826910495758, "step": 389 }, { "advantages": 3.5762786865234375e-07, "completion_length": 87.75, "epoch": 0.13, "grad_norm": 5.295890808105469, "kl": 0.154296875, "learning_rate": 8.699999999999999e-07, "loss": 0.0062, "reward": 0.36085739731788635, "reward_mean": 0.36085739731788635, "reward_std": 0.0569886639714241, "rewards/v_meteor_reward": 0.36085739731788635, "step": 390 }, { "advantages": 9.546056389808655e-08, "completion_length": 44.0, "epoch": 0.13033333333333333, "grad_norm": 8.370195388793945, "kl": 0.5703125, "learning_rate": 8.696666666666667e-07, "loss": 0.0229, "reward": 0.6749313473701477, "reward_mean": 0.6749313473701477, "reward_std": 0.0802539810538292, "rewards/a_meteor_reward": 0.6749313473701477, "step": 391 }, { "advantages": -1.8440186977386475e-07, "completion_length": 65.4375, "epoch": 0.13066666666666665, "grad_norm": 5.92170524597168, "kl": 0.171875, "learning_rate": 8.693333333333333e-07, "loss": 0.0069, "reward": 0.370177686214447, "reward_mean": 0.370177686214447, "reward_std": 0.08095911145210266, "rewards/v_meteor_reward": 0.370177686214447, "step": 392 }, { "advantages": 1.1548399925231934e-06, "completion_length": 16.0, "epoch": 0.131, "grad_norm": 7.055375099182129, "kl": 0.2265625, "learning_rate": 8.69e-07, "loss": 0.0091, "reward": 1.6923909187316895, "reward_mean": 1.6923909187316895, "reward_std": 0.025903185829520226, "rewards/iou_timestamp_reward": 0.692391037940979, "rewards/t_format_reward": 1.0, "step": 393 }, { "advantages": 6.183981895446777e-07, "completion_length": 15.75, "epoch": 0.13133333333333333, "grad_norm": 9.641589164733887, "kl": 0.25, "learning_rate": 8.686666666666666e-07, "loss": 0.01, "reward": 1.8716516494750977, "reward_mean": 1.8716516494750977, "reward_std": 0.04421761631965637, "rewards/iou_timestamp_reward": 0.8716515898704529, "rewards/t_format_reward": 1.0, "step": 394 }, { "advantages": 1.564621925354004e-07, "completion_length": 198.5, "epoch": 0.13166666666666665, "grad_norm": 2.3430168628692627, "kl": 0.087890625, "learning_rate": 8.683333333333332e-07, "loss": 0.0035, "reward": 0.6752159595489502, "reward_mean": 0.6752159595489502, "reward_std": 0.1331263929605484, "rewards/a_meteor_reward": 0.6752159595489502, "step": 395 }, { "advantages": 4.842877388000488e-08, "completion_length": 70.875, "epoch": 0.132, "grad_norm": 6.295318126678467, "kl": 0.150390625, "learning_rate": 8.68e-07, "loss": 0.006, "reward": 0.3618727922439575, "reward_mean": 0.3618727922439575, "reward_std": 0.10361625999212265, "rewards/v_meteor_reward": 0.3618727922439575, "step": 396 }, { "advantages": 6.109476089477539e-07, "completion_length": 61.1875, "epoch": 0.13233333333333333, "grad_norm": 4.2453718185424805, "kl": 0.33203125, "learning_rate": 8.676666666666667e-07, "loss": 0.0133, "reward": 0.7283474206924438, "reward_mean": 0.7283474206924438, "reward_std": 0.04738365486264229, "rewards/a_meteor_reward": 0.7283474206924438, "step": 397 }, { "advantages": -5.103647708892822e-07, "completion_length": 15.5, "epoch": 0.13266666666666665, "grad_norm": 9.072973251342773, "kl": 0.32421875, "learning_rate": 8.673333333333332e-07, "loss": 0.0129, "reward": 1.8589081764221191, "reward_mean": 1.8589081764221191, "reward_std": 0.03561907261610031, "rewards/iou_timestamp_reward": 0.8589081168174744, "rewards/t_format_reward": 1.0, "step": 398 }, { "advantages": -2.1792948246002197e-07, "completion_length": 260.3125, "epoch": 0.133, "grad_norm": 2.959134817123413, "kl": 0.1279296875, "learning_rate": 8.669999999999999e-07, "loss": 0.0051, "reward": 0.7392145991325378, "reward_mean": 0.7392145991325378, "reward_std": 0.04429839923977852, "rewards/a_meteor_reward": 0.7392145991325378, "step": 399 }, { "advantages": 2.421438694000244e-07, "completion_length": 84.75, "epoch": 0.13333333333333333, "grad_norm": 5.187649726867676, "kl": 0.171875, "learning_rate": 8.666666666666667e-07, "loss": 0.0069, "reward": 0.44561731815338135, "reward_mean": 0.44561731815338135, "reward_std": 0.07872200012207031, "rewards/v_meteor_reward": 0.44561731815338135, "step": 400 }, { "advantages": -4.805624485015869e-07, "completion_length": 15.9375, "epoch": 0.13366666666666666, "grad_norm": 9.76257610321045, "kl": 0.431640625, "learning_rate": 8.663333333333333e-07, "loss": 0.0173, "reward": 1.501516342163086, "reward_mean": 1.501516342163086, "reward_std": 0.07762297242879868, "rewards/iou_timestamp_reward": 0.5015162825584412, "rewards/t_format_reward": 1.0, "step": 401 }, { "advantages": -1.4640390872955322e-06, "completion_length": 16.75, "epoch": 0.134, "grad_norm": 13.60025405883789, "kl": 0.328125, "learning_rate": 8.659999999999999e-07, "loss": 0.0131, "reward": 1.8051435947418213, "reward_mean": 1.8051435947418213, "reward_std": 0.07289563119411469, "rewards/iou_timestamp_reward": 0.8051435947418213, "rewards/t_format_reward": 1.0, "step": 402 }, { "advantages": 4.842877388000488e-08, "completion_length": 64.5, "epoch": 0.13433333333333333, "grad_norm": 4.350046157836914, "kl": 0.1806640625, "learning_rate": 8.656666666666666e-07, "loss": 0.0072, "reward": 0.6228093504905701, "reward_mean": 0.6228093504905701, "reward_std": 0.059131406247615814, "rewards/a_meteor_reward": 0.6228093504905701, "step": 403 }, { "advantages": 3.5390257835388184e-08, "completion_length": 119.3125, "epoch": 0.13466666666666666, "grad_norm": 5.089149475097656, "kl": 0.1318359375, "learning_rate": 8.653333333333333e-07, "loss": 0.0053, "reward": 0.3434523344039917, "reward_mean": 0.3434523344039917, "reward_std": 0.0742306113243103, "rewards/v_meteor_reward": 0.3434523344039917, "step": 404 }, { "advantages": 1.475214958190918e-06, "completion_length": 15.5, "epoch": 0.135, "grad_norm": 12.145588874816895, "kl": 0.271484375, "learning_rate": 8.65e-07, "loss": 0.0109, "reward": 1.6689558029174805, "reward_mean": 1.6689558029174805, "reward_std": 0.05203702673316002, "rewards/iou_timestamp_reward": 0.6689557433128357, "rewards/t_format_reward": 1.0, "step": 405 }, { "advantages": -1.862645149230957e-08, "completion_length": 61.0, "epoch": 0.13533333333333333, "grad_norm": 6.196192264556885, "kl": 0.27734375, "learning_rate": 8.646666666666667e-07, "loss": 0.0111, "reward": 0.3547646999359131, "reward_mean": 0.3547646999359131, "reward_std": 0.11123979836702347, "rewards/v_meteor_reward": 0.3547646999359131, "step": 406 }, { "advantages": -1.1622905731201172e-06, "completion_length": 14.75, "epoch": 0.13566666666666666, "grad_norm": 13.998571395874023, "kl": 0.375, "learning_rate": 8.643333333333332e-07, "loss": 0.015, "reward": 1.8812363147735596, "reward_mean": 1.8812363147735596, "reward_std": 0.05969054251909256, "rewards/iou_timestamp_reward": 0.8812363147735596, "rewards/t_format_reward": 1.0, "step": 407 }, { "advantages": -4.813075065612793e-06, "completion_length": 15.25, "epoch": 0.136, "grad_norm": 10.604145050048828, "kl": 0.275390625, "learning_rate": 8.639999999999999e-07, "loss": 0.011, "reward": 1.8794538974761963, "reward_mean": 1.8794538974761963, "reward_std": 0.04065141826868057, "rewards/iou_timestamp_reward": 0.8794539570808411, "rewards/t_format_reward": 1.0, "step": 408 }, { "advantages": 4.842877388000488e-08, "completion_length": 154.625, "epoch": 0.13633333333333333, "grad_norm": 2.3966691493988037, "kl": 0.10791015625, "learning_rate": 8.636666666666667e-07, "loss": 0.0043, "reward": 0.4799809455871582, "reward_mean": 0.4799809455871582, "reward_std": 0.17770390212535858, "rewards/a_meteor_reward": 0.4799809455871582, "step": 409 }, { "advantages": 1.30385160446167e-07, "completion_length": 68.5625, "epoch": 0.13666666666666666, "grad_norm": 5.917040824890137, "kl": 0.158203125, "learning_rate": 8.633333333333333e-07, "loss": 0.0063, "reward": 0.4371931850910187, "reward_mean": 0.4371931850910187, "reward_std": 0.04741670936346054, "rewards/v_meteor_reward": 0.4371931850910187, "step": 410 }, { "advantages": -1.0542571544647217e-06, "completion_length": 16.1875, "epoch": 0.137, "grad_norm": 10.655323028564453, "kl": 0.25390625, "learning_rate": 8.629999999999999e-07, "loss": 0.0102, "reward": 1.635863184928894, "reward_mean": 1.635863184928894, "reward_std": 0.045937083661556244, "rewards/iou_timestamp_reward": 0.6358631253242493, "rewards/t_format_reward": 1.0, "step": 411 }, { "advantages": -2.8721988201141357e-06, "completion_length": 15.75, "epoch": 0.13733333333333334, "grad_norm": 12.565261840820312, "kl": 0.31640625, "learning_rate": 8.626666666666666e-07, "loss": 0.0127, "reward": 1.3310413360595703, "reward_mean": 1.3310413360595703, "reward_std": 0.08718497306108475, "rewards/iou_timestamp_reward": 0.33104127645492554, "rewards/t_format_reward": 1.0, "step": 412 }, { "advantages": 8.344650268554688e-07, "completion_length": 16.125, "epoch": 0.13766666666666666, "grad_norm": 6.253748893737793, "kl": 0.1923828125, "learning_rate": 8.623333333333333e-07, "loss": 0.0077, "reward": 1.5673727989196777, "reward_mean": 1.5673727989196777, "reward_std": 0.09211936593055725, "rewards/iou_timestamp_reward": 0.5673727989196777, "rewards/t_format_reward": 1.0, "step": 413 }, { "advantages": -2.8777867555618286e-07, "completion_length": 177.25, "epoch": 0.138, "grad_norm": 7.646405220031738, "kl": 0.4921875, "learning_rate": 8.62e-07, "loss": 0.0197, "reward": 0.4960624575614929, "reward_mean": 0.4960624575614929, "reward_std": 0.05842657387256622, "rewards/a_meteor_reward": 0.4960624575614929, "step": 414 }, { "advantages": -1.0803341865539551e-07, "completion_length": 81.375, "epoch": 0.13833333333333334, "grad_norm": 5.21417760848999, "kl": 0.169921875, "learning_rate": 8.616666666666666e-07, "loss": 0.0068, "reward": 0.41000741720199585, "reward_mean": 0.41000741720199585, "reward_std": 0.08371071517467499, "rewards/v_meteor_reward": 0.41000741720199585, "step": 415 }, { "advantages": -8.940696716308594e-07, "completion_length": 87.0625, "epoch": 0.13866666666666666, "grad_norm": 10.66010570526123, "kl": 0.470703125, "learning_rate": 8.613333333333332e-07, "loss": 0.0188, "reward": 0.62529456615448, "reward_mean": 0.62529456615448, "reward_std": 0.09328899532556534, "rewards/a_meteor_reward": 0.62529456615448, "step": 416 }, { "advantages": 1.1734664440155029e-07, "completion_length": 82.75, "epoch": 0.139, "grad_norm": 5.215280532836914, "kl": 0.18359375, "learning_rate": 8.61e-07, "loss": 0.0073, "reward": 0.45057475566864014, "reward_mean": 0.45057475566864014, "reward_std": 0.061213523149490356, "rewards/v_meteor_reward": 0.45057475566864014, "step": 417 }, { "advantages": 1.0617077350616455e-07, "completion_length": 60.875, "epoch": 0.13933333333333334, "grad_norm": 6.735771179199219, "kl": 0.318359375, "learning_rate": 8.606666666666667e-07, "loss": 0.0128, "reward": 0.3364852964878082, "reward_mean": 0.3364852964878082, "reward_std": 0.04575827717781067, "rewards/v_meteor_reward": 0.3364852964878082, "step": 418 }, { "advantages": 2.1420419216156006e-07, "completion_length": 237.75, "epoch": 0.13966666666666666, "grad_norm": 2.4223015308380127, "kl": 0.109375, "learning_rate": 8.603333333333332e-07, "loss": 0.0044, "reward": 0.6726592779159546, "reward_mean": 0.6726592779159546, "reward_std": 0.04512732848525047, "rewards/a_meteor_reward": 0.6726592779159546, "step": 419 }, { "advantages": -9.313225746154785e-08, "completion_length": 16.0, "epoch": 0.14, "grad_norm": 9.444098472595215, "kl": 0.173828125, "learning_rate": 8.599999999999999e-07, "loss": 0.0069, "reward": 1.6549913883209229, "reward_mean": 1.6549913883209229, "reward_std": 0.07804711163043976, "rewards/iou_timestamp_reward": 0.6549912691116333, "rewards/t_format_reward": 1.0, "step": 420 }, { "advantages": -1.3709068298339844e-06, "completion_length": 15.5, "epoch": 0.14033333333333334, "grad_norm": 11.43238353729248, "kl": 0.255859375, "learning_rate": 8.596666666666667e-07, "loss": 0.0103, "reward": 1.8455003499984741, "reward_mean": 1.8455003499984741, "reward_std": 0.053449757397174835, "rewards/iou_timestamp_reward": 0.8455002307891846, "rewards/t_format_reward": 1.0, "step": 421 }, { "advantages": -3.647059202194214e-06, "completion_length": 15.5, "epoch": 0.14066666666666666, "grad_norm": 9.261083602905273, "kl": 0.24609375, "learning_rate": 8.593333333333333e-07, "loss": 0.0099, "reward": 1.678217887878418, "reward_mean": 1.678217887878418, "reward_std": 0.028118852525949478, "rewards/iou_timestamp_reward": 0.678217887878418, "rewards/t_format_reward": 1.0, "step": 422 }, { "advantages": -1.0244548320770264e-06, "completion_length": 15.25, "epoch": 0.141, "grad_norm": 12.662087440490723, "kl": 0.216796875, "learning_rate": 8.59e-07, "loss": 0.0087, "reward": 1.6165802478790283, "reward_mean": 1.6165802478790283, "reward_std": 0.05329296737909317, "rewards/iou_timestamp_reward": 0.6165802478790283, "rewards/t_format_reward": 1.0, "step": 423 }, { "advantages": -3.725290298461914e-08, "completion_length": 84.6875, "epoch": 0.14133333333333334, "grad_norm": 5.477326393127441, "kl": 0.2197265625, "learning_rate": 8.586666666666666e-07, "loss": 0.0088, "reward": 0.4643705487251282, "reward_mean": 0.4643705487251282, "reward_std": 0.08559716492891312, "rewards/v_meteor_reward": 0.4643705487251282, "step": 424 }, { "advantages": -3.8463622331619263e-07, "completion_length": 131.625, "epoch": 0.14166666666666666, "grad_norm": 3.558806896209717, "kl": 0.119140625, "learning_rate": 8.583333333333332e-07, "loss": 0.0048, "reward": 0.6032675504684448, "reward_mean": 0.6032675504684448, "reward_std": 0.06156185641884804, "rewards/a_meteor_reward": 0.6032675504684448, "step": 425 }, { "advantages": -6.332993507385254e-08, "completion_length": 213.75, "epoch": 0.142, "grad_norm": 2.417961359024048, "kl": 0.1015625, "learning_rate": 8.58e-07, "loss": 0.0041, "reward": 0.5675121545791626, "reward_mean": 0.5675121545791626, "reward_std": 0.06664060056209564, "rewards/a_meteor_reward": 0.5675121545791626, "step": 426 }, { "advantages": -1.1548399925231934e-07, "completion_length": 105.3125, "epoch": 0.14233333333333334, "grad_norm": 2.4740593433380127, "kl": 0.1318359375, "learning_rate": 8.576666666666667e-07, "loss": 0.0053, "reward": 0.6548553705215454, "reward_mean": 0.6548553705215454, "reward_std": 0.04034842550754547, "rewards/a_meteor_reward": 0.6548553705215454, "step": 427 }, { "advantages": -5.401670932769775e-07, "completion_length": 15.25, "epoch": 0.14266666666666666, "grad_norm": 8.098332405090332, "kl": 0.267578125, "learning_rate": 8.573333333333332e-07, "loss": 0.0107, "reward": 1.8054379224777222, "reward_mean": 1.8054379224777222, "reward_std": 0.04607955738902092, "rewards/iou_timestamp_reward": 0.8054379224777222, "rewards/t_format_reward": 1.0, "step": 428 }, { "advantages": 7.934868335723877e-07, "completion_length": 15.5, "epoch": 0.143, "grad_norm": 10.197552680969238, "kl": 0.28125, "learning_rate": 8.569999999999999e-07, "loss": 0.0113, "reward": 1.6847952604293823, "reward_mean": 1.6847952604293823, "reward_std": 0.061729829758405685, "rewards/iou_timestamp_reward": 0.6847953200340271, "rewards/t_format_reward": 1.0, "step": 429 }, { "advantages": 1.210719347000122e-07, "completion_length": 108.25, "epoch": 0.14333333333333334, "grad_norm": 5.246685028076172, "kl": 0.1650390625, "learning_rate": 8.566666666666667e-07, "loss": 0.0066, "reward": 0.41124287247657776, "reward_mean": 0.41124287247657776, "reward_std": 0.03930448740720749, "rewards/v_meteor_reward": 0.41124287247657776, "step": 430 }, { "advantages": -1.0244548320770264e-08, "completion_length": 257.875, "epoch": 0.14366666666666666, "grad_norm": 2.3045008182525635, "kl": 0.11083984375, "learning_rate": 8.563333333333333e-07, "loss": 0.0044, "reward": 0.6242561340332031, "reward_mean": 0.6242561340332031, "reward_std": 0.08688556402921677, "rewards/a_meteor_reward": 0.6242561340332031, "step": 431 }, { "advantages": 4.0978193283081055e-08, "completion_length": 57.3125, "epoch": 0.144, "grad_norm": 6.654321670532227, "kl": 0.2109375, "learning_rate": 8.559999999999999e-07, "loss": 0.0084, "reward": 0.38596752285957336, "reward_mean": 0.38596752285957336, "reward_std": 0.07701711356639862, "rewards/v_meteor_reward": 0.38596752285957336, "step": 432 }, { "advantages": -4.697591066360474e-06, "completion_length": 16.0, "epoch": 0.14433333333333334, "grad_norm": 34.13024139404297, "kl": 0.2373046875, "learning_rate": 8.556666666666666e-07, "loss": 0.0095, "reward": 1.6286848783493042, "reward_mean": 1.6286848783493042, "reward_std": 0.08064456284046173, "rewards/iou_timestamp_reward": 0.6286849975585938, "rewards/t_format_reward": 1.0, "step": 433 }, { "advantages": 7.264316082000732e-08, "completion_length": 106.125, "epoch": 0.14466666666666667, "grad_norm": 4.91023588180542, "kl": 0.193359375, "learning_rate": 8.553333333333333e-07, "loss": 0.0077, "reward": 0.3381608724594116, "reward_mean": 0.3381608724594116, "reward_std": 0.04574884474277496, "rewards/v_meteor_reward": 0.3381608724594116, "step": 434 }, { "advantages": -3.9301812648773193e-07, "completion_length": 103.875, "epoch": 0.145, "grad_norm": 4.3554158210754395, "kl": 0.197265625, "learning_rate": 8.55e-07, "loss": 0.0079, "reward": 0.40586400032043457, "reward_mean": 0.40586400032043457, "reward_std": 0.03452424332499504, "rewards/v_meteor_reward": 0.40586400032043457, "step": 435 }, { "advantages": 1.2293457984924316e-06, "completion_length": 16.5, "epoch": 0.14533333333333334, "grad_norm": 6.003942489624023, "kl": 0.236328125, "learning_rate": 8.546666666666666e-07, "loss": 0.0095, "reward": 1.6842951774597168, "reward_mean": 1.6842951774597168, "reward_std": 0.02776217833161354, "rewards/iou_timestamp_reward": 0.684295117855072, "rewards/t_format_reward": 1.0, "step": 436 }, { "advantages": -5.21540641784668e-08, "completion_length": 62.3125, "epoch": 0.14566666666666667, "grad_norm": 5.775354385375977, "kl": 0.173828125, "learning_rate": 8.543333333333332e-07, "loss": 0.0069, "reward": 0.34570279717445374, "reward_mean": 0.34570279717445374, "reward_std": 0.036784008145332336, "rewards/v_meteor_reward": 0.34570279717445374, "step": 437 }, { "advantages": -5.066394805908203e-07, "completion_length": 15.75, "epoch": 0.146, "grad_norm": 10.637430191040039, "kl": 0.2099609375, "learning_rate": 8.539999999999999e-07, "loss": 0.0084, "reward": 1.4779419898986816, "reward_mean": 1.4779419898986816, "reward_std": 0.010258219204843044, "rewards/iou_timestamp_reward": 0.47794196009635925, "rewards/t_format_reward": 1.0, "step": 438 }, { "advantages": -2.0228326320648193e-06, "completion_length": 415.75, "epoch": 0.14633333333333334, "grad_norm": 1.9490766525268555, "kl": 0.0673828125, "learning_rate": 8.536666666666667e-07, "loss": 0.0027, "reward": 0.5663048624992371, "reward_mean": 0.5663048624992371, "reward_std": 0.053142622113227844, "rewards/a_meteor_reward": 0.5663048624992371, "step": 439 }, { "advantages": -3.725290298461914e-09, "completion_length": 85.5, "epoch": 0.14666666666666667, "grad_norm": 4.7609076499938965, "kl": 0.1455078125, "learning_rate": 8.533333333333334e-07, "loss": 0.0058, "reward": 0.41326066851615906, "reward_mean": 0.41326066851615906, "reward_std": 0.07478078454732895, "rewards/v_meteor_reward": 0.41326066851615906, "step": 440 }, { "advantages": 1.9185245037078857e-07, "completion_length": 91.5, "epoch": 0.147, "grad_norm": 6.293208599090576, "kl": 0.2412109375, "learning_rate": 8.529999999999999e-07, "loss": 0.0097, "reward": 0.3228587508201599, "reward_mean": 0.3228587508201599, "reward_std": 0.04995673522353172, "rewards/v_meteor_reward": 0.3228587508201599, "step": 441 }, { "advantages": 1.4901161193847656e-08, "completion_length": 77.25, "epoch": 0.14733333333333334, "grad_norm": 5.2798357009887695, "kl": 0.1630859375, "learning_rate": 8.526666666666666e-07, "loss": 0.0065, "reward": 0.4100263714790344, "reward_mean": 0.4100263714790344, "reward_std": 0.1004585474729538, "rewards/v_meteor_reward": 0.4100263714790344, "step": 442 }, { "advantages": -1.5832483768463135e-07, "completion_length": 89.125, "epoch": 0.14766666666666667, "grad_norm": 5.410758972167969, "kl": 0.244140625, "learning_rate": 8.523333333333334e-07, "loss": 0.0098, "reward": 0.4079280495643616, "reward_mean": 0.4079280495643616, "reward_std": 0.07979464530944824, "rewards/v_meteor_reward": 0.4079280495643616, "step": 443 }, { "advantages": 6.016343832015991e-07, "completion_length": 105.625, "epoch": 0.148, "grad_norm": 3.8908233642578125, "kl": 0.259765625, "learning_rate": 8.52e-07, "loss": 0.0104, "reward": 0.7456333637237549, "reward_mean": 0.7456333637237549, "reward_std": 0.04227818548679352, "rewards/a_meteor_reward": 0.7456333637237549, "step": 444 }, { "advantages": 2.514570951461792e-06, "completion_length": 15.75, "epoch": 0.14833333333333334, "grad_norm": 23.232980728149414, "kl": 0.271484375, "learning_rate": 8.516666666666666e-07, "loss": 0.0108, "reward": 1.804418921470642, "reward_mean": 1.804418921470642, "reward_std": 0.07605786621570587, "rewards/iou_timestamp_reward": 0.8044189810752869, "rewards/t_format_reward": 1.0, "step": 445 }, { "advantages": 5.923211574554443e-07, "completion_length": 159.8125, "epoch": 0.14866666666666667, "grad_norm": 3.283461093902588, "kl": 0.2158203125, "learning_rate": 8.513333333333333e-07, "loss": 0.0087, "reward": 0.8691999912261963, "reward_mean": 0.8691999912261963, "reward_std": 0.029546715319156647, "rewards/a_meteor_reward": 0.8691999912261963, "step": 446 }, { "advantages": -1.043081283569336e-07, "completion_length": 91.4375, "epoch": 0.149, "grad_norm": 5.533822536468506, "kl": 0.1533203125, "learning_rate": 8.51e-07, "loss": 0.0061, "reward": 0.32561907172203064, "reward_mean": 0.32561907172203064, "reward_std": 0.09206882864236832, "rewards/v_meteor_reward": 0.32561907172203064, "step": 447 }, { "advantages": -2.0060688257217407e-06, "completion_length": 15.0, "epoch": 0.14933333333333335, "grad_norm": 6.936544895172119, "kl": 0.318359375, "learning_rate": 8.506666666666667e-07, "loss": 0.0128, "reward": 1.8826426267623901, "reward_mean": 1.8826426267623901, "reward_std": 0.021893423050642014, "rewards/iou_timestamp_reward": 0.8826426863670349, "rewards/t_format_reward": 1.0, "step": 448 }, { "advantages": -2.1606683731079102e-07, "completion_length": 36.9375, "epoch": 0.14966666666666667, "grad_norm": 7.930682182312012, "kl": 0.5625, "learning_rate": 8.503333333333333e-07, "loss": 0.0226, "reward": 0.6810884475708008, "reward_mean": 0.6810884475708008, "reward_std": 0.04771989956498146, "rewards/a_meteor_reward": 0.6810884475708008, "step": 449 }, { "advantages": 1.564621925354004e-07, "completion_length": 93.9375, "epoch": 0.15, "grad_norm": 5.239786148071289, "kl": 0.1806640625, "learning_rate": 8.499999999999999e-07, "loss": 0.0072, "reward": 0.4109438955783844, "reward_mean": 0.4109438955783844, "reward_std": 0.07206615805625916, "rewards/v_meteor_reward": 0.4109438955783844, "step": 450 }, { "advantages": 1.30385160446167e-07, "completion_length": 16.0, "epoch": 0.15033333333333335, "grad_norm": 9.00416374206543, "kl": 0.2734375, "learning_rate": 8.496666666666667e-07, "loss": 0.011, "reward": 1.8222689628601074, "reward_mean": 1.8222689628601074, "reward_std": 0.05056701600551605, "rewards/iou_timestamp_reward": 0.8222688436508179, "rewards/t_format_reward": 1.0, "step": 451 }, { "advantages": -1.5906989574432373e-06, "completion_length": 15.75, "epoch": 0.15066666666666667, "grad_norm": 7.568958759307861, "kl": 0.2734375, "learning_rate": 8.493333333333334e-07, "loss": 0.0109, "reward": 1.8202013969421387, "reward_mean": 1.8202013969421387, "reward_std": 0.00872770044952631, "rewards/iou_timestamp_reward": 0.8202014565467834, "rewards/t_format_reward": 1.0, "step": 452 }, { "advantages": 9.685754776000977e-08, "completion_length": 100.875, "epoch": 0.151, "grad_norm": 4.295027732849121, "kl": 0.11474609375, "learning_rate": 8.489999999999999e-07, "loss": 0.0046, "reward": 0.5521038174629211, "reward_mean": 0.5521038174629211, "reward_std": 0.07034406810998917, "rewards/a_meteor_reward": 0.5521038174629211, "step": 453 }, { "advantages": 1.9613653421401978e-06, "completion_length": 16.0, "epoch": 0.15133333333333332, "grad_norm": 6.161564350128174, "kl": 0.328125, "learning_rate": 8.486666666666666e-07, "loss": 0.0131, "reward": 1.361774206161499, "reward_mean": 1.361774206161499, "reward_std": 0.01789630576968193, "rewards/iou_timestamp_reward": 0.3617742657661438, "rewards/t_format_reward": 1.0, "step": 454 }, { "advantages": -4.470348358154297e-08, "completion_length": 15.375, "epoch": 0.15166666666666667, "grad_norm": 14.956618309020996, "kl": 0.259765625, "learning_rate": 8.483333333333333e-07, "loss": 0.0104, "reward": 1.7974071502685547, "reward_mean": 1.7974071502685547, "reward_std": 0.04600508511066437, "rewards/iou_timestamp_reward": 0.7974070310592651, "rewards/t_format_reward": 1.0, "step": 455 }, { "advantages": 7.096678018569946e-07, "completion_length": 15.75, "epoch": 0.152, "grad_norm": 21.39028549194336, "kl": 0.2314453125, "learning_rate": 8.48e-07, "loss": 0.0093, "reward": 1.6001003980636597, "reward_mean": 1.6001003980636597, "reward_std": 0.05184166505932808, "rewards/iou_timestamp_reward": 0.6001003980636597, "rewards/t_format_reward": 1.0, "step": 456 }, { "advantages": 4.842877388000488e-08, "completion_length": 62.1875, "epoch": 0.15233333333333332, "grad_norm": 5.7560715675354, "kl": 0.177734375, "learning_rate": 8.476666666666666e-07, "loss": 0.0071, "reward": 0.4251399636268616, "reward_mean": 0.4251399636268616, "reward_std": 0.049595825374126434, "rewards/v_meteor_reward": 0.4251399636268616, "step": 457 }, { "advantages": 9.313225746154785e-08, "completion_length": 65.9375, "epoch": 0.15266666666666667, "grad_norm": 6.408620357513428, "kl": 0.2255859375, "learning_rate": 8.473333333333333e-07, "loss": 0.009, "reward": 0.3138352930545807, "reward_mean": 0.3138352930545807, "reward_std": 0.09050948917865753, "rewards/v_meteor_reward": 0.3138352930545807, "step": 458 }, { "advantages": -6.183981895446777e-07, "completion_length": 15.3125, "epoch": 0.153, "grad_norm": 13.685544967651367, "kl": 0.265625, "learning_rate": 8.469999999999999e-07, "loss": 0.0106, "reward": 1.5985195636749268, "reward_mean": 1.5985195636749268, "reward_std": 0.05624012649059296, "rewards/iou_timestamp_reward": 0.5985195636749268, "rewards/t_format_reward": 1.0, "step": 459 }, { "advantages": 1.1976808309555054e-06, "completion_length": 15.75, "epoch": 0.15333333333333332, "grad_norm": 6.046235084533691, "kl": 0.40625, "learning_rate": 8.466666666666667e-07, "loss": 0.0163, "reward": 1.936330795288086, "reward_mean": 1.936330795288086, "reward_std": 0.01625414937734604, "rewards/iou_timestamp_reward": 0.9363308548927307, "rewards/t_format_reward": 1.0, "step": 460 }, { "advantages": -1.2293457984924316e-07, "completion_length": 163.8125, "epoch": 0.15366666666666667, "grad_norm": 2.7963826656341553, "kl": 0.1533203125, "learning_rate": 8.463333333333334e-07, "loss": 0.0061, "reward": 0.5016416907310486, "reward_mean": 0.5016416907310486, "reward_std": 0.10746964812278748, "rewards/a_meteor_reward": 0.5016416907310486, "step": 461 }, { "advantages": 1.955777406692505e-08, "completion_length": 100.0625, "epoch": 0.154, "grad_norm": 4.496992588043213, "kl": 0.177734375, "learning_rate": 8.459999999999999e-07, "loss": 0.0071, "reward": 0.5018336176872253, "reward_mean": 0.5018336176872253, "reward_std": 0.07087819278240204, "rewards/v_meteor_reward": 0.5018336176872253, "step": 462 }, { "advantages": 1.955777406692505e-07, "completion_length": 71.6875, "epoch": 0.15433333333333332, "grad_norm": 6.718770503997803, "kl": 0.212890625, "learning_rate": 8.456666666666666e-07, "loss": 0.0085, "reward": 0.34575533866882324, "reward_mean": 0.34575533866882324, "reward_std": 0.06804229319095612, "rewards/v_meteor_reward": 0.34575533866882324, "step": 463 }, { "advantages": 2.086162567138672e-07, "completion_length": 93.625, "epoch": 0.15466666666666667, "grad_norm": 5.126475811004639, "kl": 0.1923828125, "learning_rate": 8.453333333333334e-07, "loss": 0.0077, "reward": 0.3563383221626282, "reward_mean": 0.3563383221626282, "reward_std": 0.08918845653533936, "rewards/v_meteor_reward": 0.3563383221626282, "step": 464 }, { "advantages": -4.284083843231201e-06, "completion_length": 15.75, "epoch": 0.155, "grad_norm": 8.503064155578613, "kl": 0.255859375, "learning_rate": 8.45e-07, "loss": 0.0103, "reward": 1.911846399307251, "reward_mean": 1.911846399307251, "reward_std": 0.017851687967777252, "rewards/iou_timestamp_reward": 0.911846399307251, "rewards/t_format_reward": 1.0, "step": 465 }, { "advantages": -5.085021257400513e-07, "completion_length": 14.6875, "epoch": 0.15533333333333332, "grad_norm": 14.144426345825195, "kl": 0.228515625, "learning_rate": 8.446666666666666e-07, "loss": 0.0091, "reward": 1.821690559387207, "reward_mean": 1.821690559387207, "reward_std": 0.04826091229915619, "rewards/iou_timestamp_reward": 0.8216904997825623, "rewards/t_format_reward": 1.0, "step": 466 }, { "advantages": -7.078051567077637e-08, "completion_length": 196.5, "epoch": 0.15566666666666668, "grad_norm": 4.852541923522949, "kl": 0.16015625, "learning_rate": 8.443333333333333e-07, "loss": 0.0064, "reward": 0.5878129005432129, "reward_mean": 0.5878129005432129, "reward_std": 0.14504940807819366, "rewards/a_meteor_reward": 0.5878129005432129, "step": 467 }, { "advantages": 1.1827796697616577e-07, "completion_length": 82.3125, "epoch": 0.156, "grad_norm": 5.018570899963379, "kl": 0.150390625, "learning_rate": 8.439999999999999e-07, "loss": 0.006, "reward": 0.3615458905696869, "reward_mean": 0.3615458905696869, "reward_std": 0.05135630816221237, "rewards/v_meteor_reward": 0.3615458905696869, "step": 468 }, { "advantages": 8.195638656616211e-08, "completion_length": 203.9375, "epoch": 0.15633333333333332, "grad_norm": 2.5232183933258057, "kl": 0.1298828125, "learning_rate": 8.436666666666667e-07, "loss": 0.0052, "reward": 0.4793699383735657, "reward_mean": 0.4793699383735657, "reward_std": 0.0732998251914978, "rewards/a_meteor_reward": 0.4793699383735657, "step": 469 }, { "advantages": 2.1792948246002197e-07, "completion_length": 15.5, "epoch": 0.15666666666666668, "grad_norm": 12.298480987548828, "kl": 0.19921875, "learning_rate": 8.433333333333333e-07, "loss": 0.008, "reward": 1.622817873954773, "reward_mean": 1.622817873954773, "reward_std": 0.05319179594516754, "rewards/iou_timestamp_reward": 0.6228179335594177, "rewards/t_format_reward": 1.0, "step": 470 }, { "advantages": -1.5273690223693848e-06, "completion_length": 16.0, "epoch": 0.157, "grad_norm": 10.833212852478027, "kl": 0.3203125, "learning_rate": 8.429999999999999e-07, "loss": 0.0128, "reward": 1.8574228286743164, "reward_mean": 1.8574228286743164, "reward_std": 0.024094339460134506, "rewards/iou_timestamp_reward": 0.8574228286743164, "rewards/t_format_reward": 1.0, "step": 471 }, { "advantages": 3.725290298461914e-08, "completion_length": 16.0, "epoch": 0.15733333333333333, "grad_norm": 16.497568130493164, "kl": 0.27734375, "learning_rate": 8.426666666666666e-07, "loss": 0.0111, "reward": 1.4156112670898438, "reward_mean": 1.4156112670898438, "reward_std": 0.0721508264541626, "rewards/iou_timestamp_reward": 0.41561126708984375, "rewards/t_format_reward": 1.0, "step": 472 }, { "advantages": 1.3783574104309082e-07, "completion_length": 85.375, "epoch": 0.15766666666666668, "grad_norm": 5.205262660980225, "kl": 0.271484375, "learning_rate": 8.423333333333334e-07, "loss": 0.0108, "reward": 0.6345682144165039, "reward_mean": 0.6345682144165039, "reward_std": 0.0989941731095314, "rewards/a_meteor_reward": 0.6345682144165039, "step": 473 }, { "advantages": -1.1175870895385742e-08, "completion_length": 78.375, "epoch": 0.158, "grad_norm": 6.658729076385498, "kl": 0.1640625, "learning_rate": 8.419999999999999e-07, "loss": 0.0066, "reward": 0.40192627906799316, "reward_mean": 0.40192627906799316, "reward_std": 0.08111090958118439, "rewards/v_meteor_reward": 0.40192627906799316, "step": 474 }, { "advantages": 9.313225746154785e-08, "completion_length": 29.0, "epoch": 0.15833333333333333, "grad_norm": 7.462789058685303, "kl": 0.5078125, "learning_rate": 8.416666666666666e-07, "loss": 0.0202, "reward": 0.5661401152610779, "reward_mean": 0.5661401152610779, "reward_std": 0.0658406913280487, "rewards/a_meteor_reward": 0.5661401152610779, "step": 475 }, { "advantages": 6.593763828277588e-07, "completion_length": 15.5, "epoch": 0.15866666666666668, "grad_norm": 7.8821024894714355, "kl": 0.314453125, "learning_rate": 8.413333333333333e-07, "loss": 0.0125, "reward": 1.6720080375671387, "reward_mean": 1.6720080375671387, "reward_std": 0.06527311354875565, "rewards/iou_timestamp_reward": 0.6720080375671387, "rewards/t_format_reward": 1.0, "step": 476 }, { "advantages": -3.5390257835388184e-08, "completion_length": 91.625, "epoch": 0.159, "grad_norm": 5.545376777648926, "kl": 0.255859375, "learning_rate": 8.41e-07, "loss": 0.0102, "reward": 0.4125930964946747, "reward_mean": 0.4125930964946747, "reward_std": 0.08295712620019913, "rewards/v_meteor_reward": 0.4125930964946747, "step": 477 }, { "advantages": -8.195638656616211e-08, "completion_length": 16.0, "epoch": 0.15933333333333333, "grad_norm": 15.914656639099121, "kl": 0.171875, "learning_rate": 8.406666666666667e-07, "loss": 0.0069, "reward": 1.775702953338623, "reward_mean": 1.775702953338623, "reward_std": 0.0477575957775116, "rewards/iou_timestamp_reward": 0.775702953338623, "rewards/t_format_reward": 1.0, "step": 478 }, { "advantages": 1.564621925354004e-07, "completion_length": 269.9375, "epoch": 0.15966666666666668, "grad_norm": 2.871934652328491, "kl": 0.15234375, "learning_rate": 8.403333333333333e-07, "loss": 0.0061, "reward": 0.456388384103775, "reward_mean": 0.456388384103775, "reward_std": 0.11525219678878784, "rewards/a_meteor_reward": 0.456388384103775, "step": 479 }, { "advantages": 7.82310962677002e-08, "completion_length": 100.125, "epoch": 0.16, "grad_norm": 4.626733303070068, "kl": 0.166015625, "learning_rate": 8.399999999999999e-07, "loss": 0.0067, "reward": 0.4464868903160095, "reward_mean": 0.4464868903160095, "reward_std": 0.03646330535411835, "rewards/v_meteor_reward": 0.4464868903160095, "step": 480 }, { "advantages": -1.30385160446167e-07, "completion_length": 93.625, "epoch": 0.16033333333333333, "grad_norm": 4.459719181060791, "kl": 0.126953125, "learning_rate": 8.396666666666667e-07, "loss": 0.0051, "reward": 0.4349040687084198, "reward_mean": 0.4349040687084198, "reward_std": 0.05996791273355484, "rewards/v_meteor_reward": 0.4349040687084198, "step": 481 }, { "advantages": -1.389533281326294e-06, "completion_length": 15.875, "epoch": 0.16066666666666668, "grad_norm": 11.985993385314941, "kl": 0.205078125, "learning_rate": 8.393333333333334e-07, "loss": 0.0082, "reward": 1.8864169120788574, "reward_mean": 1.8864169120788574, "reward_std": 0.03952454403042793, "rewards/iou_timestamp_reward": 0.8864167928695679, "rewards/t_format_reward": 1.0, "step": 482 }, { "advantages": 4.842877388000488e-08, "completion_length": 163.3125, "epoch": 0.161, "grad_norm": 2.4037415981292725, "kl": 0.1123046875, "learning_rate": 8.389999999999999e-07, "loss": 0.0045, "reward": 0.4772765040397644, "reward_mean": 0.4772765040397644, "reward_std": 0.09340770542621613, "rewards/a_meteor_reward": 0.4772765040397644, "step": 483 }, { "advantages": -7.059425115585327e-07, "completion_length": 53.125, "epoch": 0.16133333333333333, "grad_norm": 7.901736259460449, "kl": 0.3046875, "learning_rate": 8.386666666666666e-07, "loss": 0.0122, "reward": 0.36313962936401367, "reward_mean": 0.36313962936401367, "reward_std": 0.0766587182879448, "rewards/v_meteor_reward": 0.36313962936401367, "step": 484 }, { "advantages": 3.7997961044311523e-07, "completion_length": 83.9375, "epoch": 0.16166666666666665, "grad_norm": 5.776797294616699, "kl": 0.1875, "learning_rate": 8.383333333333334e-07, "loss": 0.0075, "reward": 0.4208107590675354, "reward_mean": 0.4208107590675354, "reward_std": 0.03301607072353363, "rewards/v_meteor_reward": 0.4208107590675354, "step": 485 }, { "advantages": 6.146728992462158e-08, "completion_length": 196.0625, "epoch": 0.162, "grad_norm": 3.477459192276001, "kl": 0.12890625, "learning_rate": 8.38e-07, "loss": 0.0052, "reward": 0.5820612907409668, "reward_mean": 0.5820612907409668, "reward_std": 0.07887408137321472, "rewards/a_meteor_reward": 0.5820612907409668, "step": 486 }, { "advantages": 8.568167686462402e-08, "completion_length": 92.25, "epoch": 0.16233333333333333, "grad_norm": 5.186706066131592, "kl": 0.181640625, "learning_rate": 8.376666666666666e-07, "loss": 0.0073, "reward": 0.3234841823577881, "reward_mean": 0.3234841823577881, "reward_std": 0.08788863569498062, "rewards/a_meteor_reward": 0.3234841823577881, "step": 487 }, { "advantages": 9.313225746154785e-09, "completion_length": 59.375, "epoch": 0.16266666666666665, "grad_norm": 6.921597957611084, "kl": 0.35546875, "learning_rate": 8.373333333333333e-07, "loss": 0.0142, "reward": 0.38592779636383057, "reward_mean": 0.38592779636383057, "reward_std": 0.07113415002822876, "rewards/v_meteor_reward": 0.38592779636383057, "step": 488 }, { "advantages": 3.725290298461914e-09, "completion_length": 76.0625, "epoch": 0.163, "grad_norm": 5.975680828094482, "kl": 0.15234375, "learning_rate": 8.369999999999999e-07, "loss": 0.0061, "reward": 0.4142968952655792, "reward_mean": 0.4142968952655792, "reward_std": 0.08450037240982056, "rewards/v_meteor_reward": 0.4142968952655792, "step": 489 }, { "advantages": 4.98257577419281e-08, "completion_length": 141.0, "epoch": 0.16333333333333333, "grad_norm": 3.037839412689209, "kl": 0.11865234375, "learning_rate": 8.366666666666667e-07, "loss": 0.0047, "reward": 0.6904536485671997, "reward_mean": 0.6904536485671997, "reward_std": 0.08892332017421722, "rewards/a_meteor_reward": 0.6904536485671997, "step": 490 }, { "advantages": -7.105991244316101e-07, "completion_length": 98.6875, "epoch": 0.16366666666666665, "grad_norm": 3.806877851486206, "kl": 0.189453125, "learning_rate": 8.363333333333333e-07, "loss": 0.0076, "reward": 0.6082212924957275, "reward_mean": 0.6082212924957275, "reward_std": 0.07709494233131409, "rewards/a_meteor_reward": 0.6082212924957275, "step": 491 }, { "advantages": 1.6167759895324707e-06, "completion_length": 15.5, "epoch": 0.164, "grad_norm": 8.189621925354004, "kl": 0.33984375, "learning_rate": 8.359999999999999e-07, "loss": 0.0136, "reward": 1.774922251701355, "reward_mean": 1.774922251701355, "reward_std": 0.054442793130874634, "rewards/iou_timestamp_reward": 0.774922251701355, "rewards/t_format_reward": 1.0, "step": 492 }, { "advantages": -6.705522537231445e-08, "completion_length": 95.3125, "epoch": 0.16433333333333333, "grad_norm": 5.315469264984131, "kl": 0.23046875, "learning_rate": 8.356666666666666e-07, "loss": 0.0092, "reward": 0.47548753023147583, "reward_mean": 0.47548753023147583, "reward_std": 0.09822823852300644, "rewards/v_meteor_reward": 0.47548753023147583, "step": 493 }, { "advantages": 5.178153514862061e-07, "completion_length": 15.0, "epoch": 0.16466666666666666, "grad_norm": 13.078065872192383, "kl": 0.2138671875, "learning_rate": 8.353333333333334e-07, "loss": 0.0085, "reward": 1.6270856857299805, "reward_mean": 1.6270856857299805, "reward_std": 0.06451466679573059, "rewards/iou_timestamp_reward": 0.6270857453346252, "rewards/t_format_reward": 1.0, "step": 494 }, { "advantages": -4.284083843231201e-08, "completion_length": 57.25, "epoch": 0.165, "grad_norm": 4.755226135253906, "kl": 0.4453125, "learning_rate": 8.349999999999999e-07, "loss": 0.0178, "reward": 0.7142922282218933, "reward_mean": 0.7142922282218933, "reward_std": 0.04916740208864212, "rewards/a_meteor_reward": 0.7142922282218933, "step": 495 }, { "advantages": -1.341104507446289e-07, "completion_length": 69.875, "epoch": 0.16533333333333333, "grad_norm": 5.750853538513184, "kl": 0.158203125, "learning_rate": 8.346666666666666e-07, "loss": 0.0063, "reward": 0.38156384229660034, "reward_mean": 0.38156384229660034, "reward_std": 0.04034440964460373, "rewards/v_meteor_reward": 0.38156384229660034, "step": 496 }, { "advantages": -2.9802322387695312e-08, "completion_length": 249.6875, "epoch": 0.16566666666666666, "grad_norm": 3.2629668712615967, "kl": 0.16015625, "learning_rate": 8.343333333333333e-07, "loss": 0.0064, "reward": 0.7212510108947754, "reward_mean": 0.7212510108947754, "reward_std": 0.10952048003673553, "rewards/a_meteor_reward": 0.7212510108947754, "step": 497 }, { "advantages": 3.110617399215698e-07, "completion_length": 58.375, "epoch": 0.166, "grad_norm": 5.030550956726074, "kl": 0.27734375, "learning_rate": 8.34e-07, "loss": 0.0111, "reward": 0.3973839282989502, "reward_mean": 0.3973839282989502, "reward_std": 0.04650750011205673, "rewards/a_meteor_reward": 0.3973839282989502, "step": 498 }, { "advantages": -9.648501873016357e-07, "completion_length": 106.3125, "epoch": 0.16633333333333333, "grad_norm": 4.999449729919434, "kl": 0.326171875, "learning_rate": 8.336666666666667e-07, "loss": 0.013, "reward": 0.785520076751709, "reward_mean": 0.785520076751709, "reward_std": 0.04099607467651367, "rewards/a_meteor_reward": 0.785520076751709, "step": 499 }, { "advantages": -1.1604279279708862e-06, "completion_length": 16.75, "epoch": 0.16666666666666666, "grad_norm": 6.873377799987793, "kl": 0.267578125, "learning_rate": 8.333333333333333e-07, "loss": 0.0107, "reward": 1.6652846336364746, "reward_mean": 1.6652846336364746, "reward_std": 0.031159212812781334, "rewards/iou_timestamp_reward": 0.6652846932411194, "rewards/t_format_reward": 1.0, "step": 500 }, { "advantages": 1.3550743460655212e-07, "completion_length": 69.0, "epoch": 0.167, "grad_norm": 5.719926357269287, "kl": 0.2109375, "learning_rate": 8.329999999999999e-07, "loss": 0.0085, "reward": 0.4685244858264923, "reward_mean": 0.4685244858264923, "reward_std": 0.044342849403619766, "rewards/v_meteor_reward": 0.4685244858264923, "step": 501 }, { "advantages": -1.1045485734939575e-06, "completion_length": 70.375, "epoch": 0.16733333333333333, "grad_norm": 5.666744709014893, "kl": 0.435546875, "learning_rate": 8.326666666666666e-07, "loss": 0.0175, "reward": 0.6640211939811707, "reward_mean": 0.6640211939811707, "reward_std": 0.03633186221122742, "rewards/a_meteor_reward": 0.6640211939811707, "step": 502 }, { "advantages": 6.705522537231445e-07, "completion_length": 15.5, "epoch": 0.16766666666666666, "grad_norm": 11.445030212402344, "kl": 0.34375, "learning_rate": 8.323333333333334e-07, "loss": 0.0138, "reward": 1.8884830474853516, "reward_mean": 1.8884830474853516, "reward_std": 0.058739569038152695, "rewards/iou_timestamp_reward": 0.888482928276062, "rewards/t_format_reward": 1.0, "step": 503 }, { "advantages": -9.313225746154785e-08, "completion_length": 76.375, "epoch": 0.168, "grad_norm": 5.845485687255859, "kl": 0.1904296875, "learning_rate": 8.319999999999999e-07, "loss": 0.0076, "reward": 0.3458072543144226, "reward_mean": 0.3458072543144226, "reward_std": 0.050908785313367844, "rewards/v_meteor_reward": 0.3458072543144226, "step": 504 }, { "advantages": -7.078051567077637e-08, "completion_length": 90.3125, "epoch": 0.16833333333333333, "grad_norm": 10.444978713989258, "kl": 0.3984375, "learning_rate": 8.316666666666666e-07, "loss": 0.0159, "reward": 0.5221906900405884, "reward_mean": 0.5221906900405884, "reward_std": 0.08562339842319489, "rewards/a_meteor_reward": 0.5221906900405884, "step": 505 }, { "advantages": -1.1175870895385742e-08, "completion_length": 15.1875, "epoch": 0.16866666666666666, "grad_norm": 38.64601516723633, "kl": 0.17578125, "learning_rate": 8.313333333333333e-07, "loss": 0.007, "reward": 1.7725896835327148, "reward_mean": 1.7725896835327148, "reward_std": 0.1382352113723755, "rewards/iou_timestamp_reward": 0.7725896239280701, "rewards/t_format_reward": 1.0, "step": 506 }, { "advantages": -1.1362135410308838e-07, "completion_length": 85.4375, "epoch": 0.169, "grad_norm": 4.892390251159668, "kl": 0.1943359375, "learning_rate": 8.31e-07, "loss": 0.0078, "reward": 0.3874978721141815, "reward_mean": 0.3874978721141815, "reward_std": 0.09315089881420135, "rewards/v_meteor_reward": 0.3874978721141815, "step": 507 }, { "advantages": 1.4901161193847656e-07, "completion_length": 101.5, "epoch": 0.16933333333333334, "grad_norm": 2.6657662391662598, "kl": 0.138671875, "learning_rate": 8.306666666666666e-07, "loss": 0.0055, "reward": 0.7143365740776062, "reward_mean": 0.7143365740776062, "reward_std": 0.0738392174243927, "rewards/a_meteor_reward": 0.7143365740776062, "step": 508 }, { "advantages": 3.3527612686157227e-08, "completion_length": 56.0, "epoch": 0.16966666666666666, "grad_norm": 6.204608917236328, "kl": 0.1962890625, "learning_rate": 8.303333333333333e-07, "loss": 0.0079, "reward": 0.4367736279964447, "reward_mean": 0.4367736279964447, "reward_std": 0.09892702102661133, "rewards/v_meteor_reward": 0.4367736279964447, "step": 509 }, { "advantages": 1.0542571544647217e-06, "completion_length": 15.75, "epoch": 0.17, "grad_norm": 63.61357498168945, "kl": 0.248046875, "learning_rate": 8.299999999999999e-07, "loss": 0.0099, "reward": 1.4191381931304932, "reward_mean": 1.4191381931304932, "reward_std": 0.1322384774684906, "rewards/iou_timestamp_reward": 0.4191383123397827, "rewards/t_format_reward": 1.0, "step": 510 }, { "advantages": 6.044283509254456e-07, "completion_length": 92.6875, "epoch": 0.17033333333333334, "grad_norm": 5.153968811035156, "kl": 0.1689453125, "learning_rate": 8.296666666666667e-07, "loss": 0.0068, "reward": 0.5528182983398438, "reward_mean": 0.5528182983398438, "reward_std": 0.12406514585018158, "rewards/a_meteor_reward": 0.5528182983398438, "step": 511 }, { "advantages": 1.0468065738677979e-06, "completion_length": 16.0, "epoch": 0.17066666666666666, "grad_norm": 14.040021896362305, "kl": 0.2734375, "learning_rate": 8.293333333333333e-07, "loss": 0.0109, "reward": 1.8547897338867188, "reward_mean": 1.8547897338867188, "reward_std": 0.03300733119249344, "rewards/iou_timestamp_reward": 0.8547897934913635, "rewards/t_format_reward": 1.0, "step": 512 }, { "advantages": -9.350478649139404e-07, "completion_length": 15.0, "epoch": 0.171, "grad_norm": 11.411669731140137, "kl": 0.1533203125, "learning_rate": 8.289999999999999e-07, "loss": 0.0061, "reward": 1.5957889556884766, "reward_mean": 1.5957889556884766, "reward_std": 0.050636596977710724, "rewards/iou_timestamp_reward": 0.5957889556884766, "rewards/t_format_reward": 1.0, "step": 513 }, { "advantages": 7.227063179016113e-07, "completion_length": 16.375, "epoch": 0.17133333333333334, "grad_norm": 7.871614456176758, "kl": 0.265625, "learning_rate": 8.286666666666666e-07, "loss": 0.0107, "reward": 1.664292573928833, "reward_mean": 1.664292573928833, "reward_std": 0.026425033807754517, "rewards/iou_timestamp_reward": 0.664292573928833, "rewards/t_format_reward": 1.0, "step": 514 }, { "advantages": -3.0137598514556885e-06, "completion_length": 15.0, "epoch": 0.17166666666666666, "grad_norm": 11.95697021484375, "kl": 0.283203125, "learning_rate": 8.283333333333334e-07, "loss": 0.0113, "reward": 1.8815515041351318, "reward_mean": 1.8815515041351318, "reward_std": 0.01747399941086769, "rewards/iou_timestamp_reward": 0.8815514445304871, "rewards/t_format_reward": 1.0, "step": 515 }, { "advantages": 4.5821070671081543e-07, "completion_length": 16.25, "epoch": 0.172, "grad_norm": 9.809446334838867, "kl": 0.1982421875, "learning_rate": 8.28e-07, "loss": 0.0079, "reward": 1.7557218074798584, "reward_mean": 1.7557218074798584, "reward_std": 0.04255734384059906, "rewards/iou_timestamp_reward": 0.7557218074798584, "rewards/t_format_reward": 1.0, "step": 516 }, { "advantages": -3.5390257835388184e-07, "completion_length": 16.0, "epoch": 0.17233333333333334, "grad_norm": 14.190924644470215, "kl": 0.28515625, "learning_rate": 8.276666666666666e-07, "loss": 0.0114, "reward": 1.8854451179504395, "reward_mean": 1.8854451179504395, "reward_std": 0.0704183280467987, "rewards/iou_timestamp_reward": 0.8854451179504395, "rewards/t_format_reward": 1.0, "step": 517 }, { "advantages": -4.4330954551696777e-07, "completion_length": 123.5, "epoch": 0.17266666666666666, "grad_norm": 5.0829243659973145, "kl": 0.2236328125, "learning_rate": 8.273333333333333e-07, "loss": 0.009, "reward": 0.5555451512336731, "reward_mean": 0.5555451512336731, "reward_std": 0.0802421122789383, "rewards/a_meteor_reward": 0.5555451512336731, "step": 518 }, { "advantages": 1.7024576663970947e-06, "completion_length": 92.9375, "epoch": 0.173, "grad_norm": 4.841123580932617, "kl": 0.16015625, "learning_rate": 8.269999999999999e-07, "loss": 0.0064, "reward": 0.439178466796875, "reward_mean": 0.439178466796875, "reward_std": 0.05043286085128784, "rewards/v_meteor_reward": 0.439178466796875, "step": 519 }, { "advantages": 1.7136335372924805e-07, "completion_length": 15.75, "epoch": 0.17333333333333334, "grad_norm": 24.638887405395508, "kl": 0.35546875, "learning_rate": 8.266666666666667e-07, "loss": 0.0143, "reward": 1.4402347803115845, "reward_mean": 1.4402347803115845, "reward_std": 0.0385943166911602, "rewards/iou_timestamp_reward": 0.4402347505092621, "rewards/t_format_reward": 1.0, "step": 520 }, { "advantages": -1.4603137969970703e-06, "completion_length": 15.5, "epoch": 0.17366666666666666, "grad_norm": 10.355154991149902, "kl": 0.2001953125, "learning_rate": 8.263333333333333e-07, "loss": 0.008, "reward": 1.3973705768585205, "reward_mean": 1.3973705768585205, "reward_std": 0.07367601990699768, "rewards/iou_timestamp_reward": 0.3973705768585205, "rewards/t_format_reward": 1.0, "step": 521 }, { "advantages": -7.450580596923828e-08, "completion_length": 64.6875, "epoch": 0.174, "grad_norm": 6.527217864990234, "kl": 0.2392578125, "learning_rate": 8.259999999999999e-07, "loss": 0.0096, "reward": 0.42712485790252686, "reward_mean": 0.42712485790252686, "reward_std": 0.05587443709373474, "rewards/v_meteor_reward": 0.42712485790252686, "step": 522 }, { "advantages": 2.7939677238464355e-09, "completion_length": 76.625, "epoch": 0.17433333333333334, "grad_norm": 4.933520317077637, "kl": 0.1767578125, "learning_rate": 8.256666666666666e-07, "loss": 0.0071, "reward": 0.35208019614219666, "reward_mean": 0.35208019614219666, "reward_std": 0.05421566218137741, "rewards/v_meteor_reward": 0.35208019614219666, "step": 523 }, { "advantages": -5.21540641784668e-08, "completion_length": 82.4375, "epoch": 0.17466666666666666, "grad_norm": 6.089928150177002, "kl": 0.1796875, "learning_rate": 8.253333333333334e-07, "loss": 0.0072, "reward": 0.41657567024230957, "reward_mean": 0.41657567024230957, "reward_std": 0.0929170697927475, "rewards/v_meteor_reward": 0.41657567024230957, "step": 524 }, { "advantages": 4.842877388000488e-08, "completion_length": 14.875, "epoch": 0.175, "grad_norm": 8.498848915100098, "kl": 0.1787109375, "learning_rate": 8.249999999999999e-07, "loss": 0.0071, "reward": 1.5131535530090332, "reward_mean": 1.5131535530090332, "reward_std": 0.03320374712347984, "rewards/iou_timestamp_reward": 0.5131536722183228, "rewards/t_format_reward": 1.0, "step": 525 }, { "advantages": 8.009374141693115e-08, "completion_length": 61.6875, "epoch": 0.17533333333333334, "grad_norm": 8.373775482177734, "kl": 0.21484375, "learning_rate": 8.246666666666666e-07, "loss": 0.0086, "reward": 0.33565789461135864, "reward_mean": 0.33565789461135864, "reward_std": 0.08662083745002747, "rewards/v_meteor_reward": 0.33565789461135864, "step": 526 }, { "advantages": -3.725290298461914e-09, "completion_length": 143.875, "epoch": 0.17566666666666667, "grad_norm": 4.489406585693359, "kl": 0.251953125, "learning_rate": 8.243333333333333e-07, "loss": 0.0101, "reward": 0.8134461641311646, "reward_mean": 0.8134461641311646, "reward_std": 0.04160070791840553, "rewards/a_meteor_reward": 0.8134461641311646, "step": 527 }, { "advantages": -1.6577541828155518e-07, "completion_length": 62.5, "epoch": 0.176, "grad_norm": 7.099466323852539, "kl": 0.23828125, "learning_rate": 8.24e-07, "loss": 0.0095, "reward": 0.40798887610435486, "reward_mean": 0.40798887610435486, "reward_std": 0.08802825212478638, "rewards/v_meteor_reward": 0.40798887610435486, "step": 528 }, { "advantages": 5.066394805908203e-07, "completion_length": 15.3125, "epoch": 0.17633333333333334, "grad_norm": 18.050811767578125, "kl": 0.283203125, "learning_rate": 8.236666666666666e-07, "loss": 0.0113, "reward": 1.5934619903564453, "reward_mean": 1.5934619903564453, "reward_std": 0.05200813710689545, "rewards/iou_timestamp_reward": 0.5934619307518005, "rewards/t_format_reward": 1.0, "step": 529 }, { "advantages": 2.0489096641540527e-08, "completion_length": 16.75, "epoch": 0.17666666666666667, "grad_norm": 9.832855224609375, "kl": 0.228515625, "learning_rate": 8.233333333333333e-07, "loss": 0.0091, "reward": 1.4238654375076294, "reward_mean": 1.4238654375076294, "reward_std": 0.09994375705718994, "rewards/iou_timestamp_reward": 0.4238654375076294, "rewards/t_format_reward": 1.0, "step": 530 }, { "advantages": -2.60770320892334e-08, "completion_length": 86.0, "epoch": 0.177, "grad_norm": 3.8348143100738525, "kl": 0.1962890625, "learning_rate": 8.229999999999999e-07, "loss": 0.0079, "reward": 0.7184687852859497, "reward_mean": 0.7184687852859497, "reward_std": 0.0833221822977066, "rewards/a_meteor_reward": 0.7184687852859497, "step": 531 }, { "advantages": -1.1995434761047363e-06, "completion_length": 15.125, "epoch": 0.17733333333333334, "grad_norm": 8.98647403717041, "kl": 0.1982421875, "learning_rate": 8.226666666666666e-07, "loss": 0.0079, "reward": 1.4683747291564941, "reward_mean": 1.4683747291564941, "reward_std": 0.07959958910942078, "rewards/iou_timestamp_reward": 0.468374639749527, "rewards/t_format_reward": 1.0, "step": 532 }, { "advantages": 2.1979212760925293e-07, "completion_length": 119.5, "epoch": 0.17766666666666667, "grad_norm": 4.152987957000732, "kl": 0.10498046875, "learning_rate": 8.223333333333334e-07, "loss": 0.0042, "reward": 0.39364323019981384, "reward_mean": 0.39364323019981384, "reward_std": 0.05390506982803345, "rewards/v_meteor_reward": 0.39364323019981384, "step": 533 }, { "advantages": 9.5367431640625e-07, "completion_length": 150.25, "epoch": 0.178, "grad_norm": 2.7512786388397217, "kl": 0.150390625, "learning_rate": 8.219999999999999e-07, "loss": 0.006, "reward": 0.5350244641304016, "reward_mean": 0.5350244641304016, "reward_std": 0.06900075078010559, "rewards/a_meteor_reward": 0.5350244641304016, "step": 534 }, { "advantages": -1.2665987014770508e-07, "completion_length": 78.25, "epoch": 0.17833333333333334, "grad_norm": 5.538064479827881, "kl": 0.146484375, "learning_rate": 8.216666666666666e-07, "loss": 0.0059, "reward": 0.46444034576416016, "reward_mean": 0.46444034576416016, "reward_std": 0.05627264827489853, "rewards/v_meteor_reward": 0.46444034576416016, "step": 535 }, { "advantages": -1.0468065738677979e-06, "completion_length": 15.75, "epoch": 0.17866666666666667, "grad_norm": 17.551233291625977, "kl": 0.205078125, "learning_rate": 8.213333333333333e-07, "loss": 0.0082, "reward": 1.8126271963119507, "reward_mean": 1.8126271963119507, "reward_std": 0.030827026814222336, "rewards/iou_timestamp_reward": 0.8126271963119507, "rewards/t_format_reward": 1.0, "step": 536 }, { "advantages": -1.1175870895385742e-08, "completion_length": 16.0, "epoch": 0.179, "grad_norm": 7.447608947753906, "kl": 0.294921875, "learning_rate": 8.21e-07, "loss": 0.0118, "reward": 1.7157893180847168, "reward_mean": 1.7157893180847168, "reward_std": 0.020587928593158722, "rewards/iou_timestamp_reward": 0.715789258480072, "rewards/t_format_reward": 1.0, "step": 537 }, { "advantages": 8.381903171539307e-07, "completion_length": 15.25, "epoch": 0.17933333333333334, "grad_norm": 9.398547172546387, "kl": 0.1689453125, "learning_rate": 8.206666666666666e-07, "loss": 0.0067, "reward": 1.90016770362854, "reward_mean": 1.90016770362854, "reward_std": 0.019742708653211594, "rewards/iou_timestamp_reward": 0.90016770362854, "rewards/t_format_reward": 1.0, "step": 538 }, { "advantages": -2.421438694000244e-08, "completion_length": 112.875, "epoch": 0.17966666666666667, "grad_norm": 5.223174095153809, "kl": 0.1376953125, "learning_rate": 8.203333333333333e-07, "loss": 0.0055, "reward": 0.33129560947418213, "reward_mean": 0.33129560947418213, "reward_std": 0.0485013872385025, "rewards/v_meteor_reward": 0.33129560947418213, "step": 539 }, { "advantages": 5.029141902923584e-08, "completion_length": 100.0, "epoch": 0.18, "grad_norm": 4.616864204406738, "kl": 0.14453125, "learning_rate": 8.199999999999999e-07, "loss": 0.0058, "reward": 0.4368554651737213, "reward_mean": 0.4368554651737213, "reward_std": 0.0779755711555481, "rewards/v_meteor_reward": 0.4368554651737213, "step": 540 }, { "advantages": 2.0489096641540527e-06, "completion_length": 15.5, "epoch": 0.18033333333333335, "grad_norm": 13.460594177246094, "kl": 0.3125, "learning_rate": 8.196666666666667e-07, "loss": 0.0125, "reward": 1.6471084356307983, "reward_mean": 1.6471084356307983, "reward_std": 0.08152349293231964, "rewards/iou_timestamp_reward": 0.6471085548400879, "rewards/t_format_reward": 1.0, "step": 541 }, { "advantages": -4.0978193283081055e-08, "completion_length": 77.0625, "epoch": 0.18066666666666667, "grad_norm": 6.601118564605713, "kl": 0.1640625, "learning_rate": 8.193333333333333e-07, "loss": 0.0065, "reward": 0.4272722005844116, "reward_mean": 0.4272722005844116, "reward_std": 0.10047949850559235, "rewards/v_meteor_reward": 0.4272722005844116, "step": 542 }, { "advantages": -1.3485550880432129e-06, "completion_length": 14.625, "epoch": 0.181, "grad_norm": 8.957409858703613, "kl": 0.26171875, "learning_rate": 8.189999999999999e-07, "loss": 0.0105, "reward": 1.673655390739441, "reward_mean": 1.673655390739441, "reward_std": 0.0342106819152832, "rewards/iou_timestamp_reward": 0.6736552715301514, "rewards/t_format_reward": 1.0, "step": 543 }, { "advantages": -4.5262277126312256e-07, "completion_length": 90.375, "epoch": 0.18133333333333335, "grad_norm": 4.833371162414551, "kl": 0.185546875, "learning_rate": 8.186666666666666e-07, "loss": 0.0074, "reward": 0.6097884178161621, "reward_mean": 0.6097884178161621, "reward_std": 0.07986613363027573, "rewards/a_meteor_reward": 0.6097884178161621, "step": 544 }, { "advantages": -4.665926098823547e-06, "completion_length": 16.25, "epoch": 0.18166666666666667, "grad_norm": 6.571645259857178, "kl": 0.26953125, "learning_rate": 8.183333333333334e-07, "loss": 0.0108, "reward": 1.8462421894073486, "reward_mean": 1.8462421894073486, "reward_std": 0.017336763441562653, "rewards/iou_timestamp_reward": 0.8462421894073486, "rewards/t_format_reward": 1.0, "step": 545 }, { "advantages": 4.470348358154297e-08, "completion_length": 93.1875, "epoch": 0.182, "grad_norm": 5.240266799926758, "kl": 0.1376953125, "learning_rate": 8.179999999999999e-07, "loss": 0.0055, "reward": 0.36493557691574097, "reward_mean": 0.36493557691574097, "reward_std": 0.0639754980802536, "rewards/v_meteor_reward": 0.36493557691574097, "step": 546 }, { "advantages": -5.21540641784668e-08, "completion_length": 140.0, "epoch": 0.18233333333333332, "grad_norm": 3.0422873497009277, "kl": 0.1484375, "learning_rate": 8.176666666666666e-07, "loss": 0.0059, "reward": 0.3720993995666504, "reward_mean": 0.3720993995666504, "reward_std": 0.08238929510116577, "rewards/a_meteor_reward": 0.3720993995666504, "step": 547 }, { "advantages": -5.8294739574193954e-05, "completion_length": 15.0, "epoch": 0.18266666666666667, "grad_norm": 7.4085588455200195, "kl": 0.1591796875, "learning_rate": 8.173333333333333e-07, "loss": 0.0065, "reward": 1.9611880779266357, "reward_mean": 1.9611880779266357, "reward_std": 0.013082440011203289, "rewards/iou_timestamp_reward": 0.9611879587173462, "rewards/t_format_reward": 1.0, "step": 548 }, { "advantages": 9.313225746154785e-08, "completion_length": 17.0, "epoch": 0.183, "grad_norm": 24.338838577270508, "kl": 0.248046875, "learning_rate": 8.169999999999999e-07, "loss": 0.0099, "reward": 1.6163660287857056, "reward_mean": 1.6163660287857056, "reward_std": 0.1603071689605713, "rewards/iou_timestamp_reward": 0.6163660287857056, "rewards/t_format_reward": 1.0, "step": 549 }, { "advantages": -5.21540641784668e-08, "completion_length": 101.9375, "epoch": 0.18333333333333332, "grad_norm": 6.775302886962891, "kl": 0.154296875, "learning_rate": 8.166666666666666e-07, "loss": 0.0062, "reward": 0.39217787981033325, "reward_mean": 0.39217787981033325, "reward_std": 0.06430089473724365, "rewards/v_meteor_reward": 0.39217787981033325, "step": 550 }, { "advantages": 2.0489096641540527e-07, "completion_length": 55.4375, "epoch": 0.18366666666666667, "grad_norm": 4.408350467681885, "kl": 0.47265625, "learning_rate": 8.163333333333333e-07, "loss": 0.0189, "reward": 0.6902073621749878, "reward_mean": 0.6902073621749878, "reward_std": 0.042835790663957596, "rewards/a_meteor_reward": 0.6902073621749878, "step": 551 }, { "advantages": 1.7695128917694092e-07, "completion_length": 98.8125, "epoch": 0.184, "grad_norm": 4.636231899261475, "kl": 0.1494140625, "learning_rate": 8.159999999999999e-07, "loss": 0.006, "reward": 0.34288179874420166, "reward_mean": 0.34288179874420166, "reward_std": 0.05348343402147293, "rewards/v_meteor_reward": 0.34288179874420166, "step": 552 }, { "advantages": -5.4016709327697754e-08, "completion_length": 278.9375, "epoch": 0.18433333333333332, "grad_norm": 5.741393089294434, "kl": 0.26953125, "learning_rate": 8.156666666666666e-07, "loss": 0.0108, "reward": 0.5934747457504272, "reward_mean": 0.5934747457504272, "reward_std": 0.07953640818595886, "rewards/a_meteor_reward": 0.5934747457504272, "step": 553 }, { "advantages": -1.816079020500183e-07, "completion_length": 91.375, "epoch": 0.18466666666666667, "grad_norm": 6.546193599700928, "kl": 0.2158203125, "learning_rate": 8.153333333333334e-07, "loss": 0.0086, "reward": 0.3742319643497467, "reward_mean": 0.3742319643497467, "reward_std": 0.04547739401459694, "rewards/v_meteor_reward": 0.3742319643497467, "step": 554 }, { "advantages": -4.842877388000488e-08, "completion_length": 69.625, "epoch": 0.185, "grad_norm": 4.2292304039001465, "kl": 0.326171875, "learning_rate": 8.149999999999999e-07, "loss": 0.013, "reward": 0.7986037731170654, "reward_mean": 0.7986037731170654, "reward_std": 0.049482375383377075, "rewards/a_meteor_reward": 0.7986037731170654, "step": 555 }, { "advantages": 2.60770320892334e-08, "completion_length": 111.5625, "epoch": 0.18533333333333332, "grad_norm": 5.045796871185303, "kl": 0.1630859375, "learning_rate": 8.146666666666666e-07, "loss": 0.0065, "reward": 0.34126758575439453, "reward_mean": 0.34126758575439453, "reward_std": 0.029349634423851967, "rewards/v_meteor_reward": 0.34126758575439453, "step": 556 }, { "advantages": -1.0989606380462646e-07, "completion_length": 96.1875, "epoch": 0.18566666666666667, "grad_norm": 5.949594974517822, "kl": 0.14453125, "learning_rate": 8.143333333333333e-07, "loss": 0.0058, "reward": 0.27618494629859924, "reward_mean": 0.27618494629859924, "reward_std": 0.029798634350299835, "rewards/v_meteor_reward": 0.27618494629859924, "step": 557 }, { "advantages": 1.1064112186431885e-06, "completion_length": 15.75, "epoch": 0.186, "grad_norm": 7.469447135925293, "kl": 0.283203125, "learning_rate": 8.14e-07, "loss": 0.0113, "reward": 1.6240215301513672, "reward_mean": 1.6240215301513672, "reward_std": 0.03711411729454994, "rewards/iou_timestamp_reward": 0.6240215301513672, "rewards/t_format_reward": 1.0, "step": 558 }, { "advantages": -1.0803341865539551e-07, "completion_length": 81.375, "epoch": 0.18633333333333332, "grad_norm": 5.288058757781982, "kl": 0.24609375, "learning_rate": 8.136666666666666e-07, "loss": 0.0098, "reward": 0.3307967185974121, "reward_mean": 0.3307967185974121, "reward_std": 0.03757268935441971, "rewards/v_meteor_reward": 0.3307967185974121, "step": 559 }, { "advantages": -8.195638656616211e-08, "completion_length": 78.125, "epoch": 0.18666666666666668, "grad_norm": 5.35267972946167, "kl": 0.2255859375, "learning_rate": 8.133333333333333e-07, "loss": 0.009, "reward": 0.37088024616241455, "reward_mean": 0.37088024616241455, "reward_std": 0.07345245033502579, "rewards/v_meteor_reward": 0.37088024616241455, "step": 560 }, { "advantages": -2.0489096641540527e-07, "completion_length": 35.3125, "epoch": 0.187, "grad_norm": 7.536660194396973, "kl": 0.5234375, "learning_rate": 8.129999999999999e-07, "loss": 0.021, "reward": 0.641446590423584, "reward_mean": 0.641446590423584, "reward_std": 0.11535373330116272, "rewards/a_meteor_reward": 0.641446590423584, "step": 561 }, { "advantages": 1.0989606380462646e-07, "completion_length": 139.625, "epoch": 0.18733333333333332, "grad_norm": 6.619078159332275, "kl": 0.302734375, "learning_rate": 8.126666666666666e-07, "loss": 0.0121, "reward": 0.6090068221092224, "reward_mean": 0.6090068221092224, "reward_std": 0.07572633028030396, "rewards/a_meteor_reward": 0.6090068221092224, "step": 562 }, { "advantages": -2.5331974029541016e-07, "completion_length": 15.5, "epoch": 0.18766666666666668, "grad_norm": 18.455101013183594, "kl": 0.2421875, "learning_rate": 8.123333333333333e-07, "loss": 0.0097, "reward": 1.5256643295288086, "reward_mean": 1.5256643295288086, "reward_std": 0.11571074277162552, "rewards/iou_timestamp_reward": 0.5256643295288086, "rewards/t_format_reward": 1.0, "step": 563 }, { "advantages": -3.3527612686157227e-08, "completion_length": 16.0, "epoch": 0.188, "grad_norm": 10.556632995605469, "kl": 0.24609375, "learning_rate": 8.12e-07, "loss": 0.0098, "reward": 1.5776832103729248, "reward_mean": 1.5776832103729248, "reward_std": 0.04119346663355827, "rewards/iou_timestamp_reward": 0.5776832699775696, "rewards/t_format_reward": 1.0, "step": 564 }, { "advantages": -6.94766640663147e-07, "completion_length": 15.75, "epoch": 0.18833333333333332, "grad_norm": 10.39317512512207, "kl": 0.294921875, "learning_rate": 8.116666666666666e-07, "loss": 0.0118, "reward": 1.780836582183838, "reward_mean": 1.780836582183838, "reward_std": 0.016539543867111206, "rewards/iou_timestamp_reward": 0.7808365821838379, "rewards/t_format_reward": 1.0, "step": 565 }, { "advantages": 1.210719347000122e-07, "completion_length": 193.875, "epoch": 0.18866666666666668, "grad_norm": 2.6314454078674316, "kl": 0.158203125, "learning_rate": 8.113333333333333e-07, "loss": 0.0063, "reward": 0.6951919794082642, "reward_mean": 0.6951919794082642, "reward_std": 0.1102418527007103, "rewards/a_meteor_reward": 0.6951919794082642, "step": 566 }, { "advantages": -3.203749656677246e-07, "completion_length": 16.5, "epoch": 0.189, "grad_norm": 8.569807052612305, "kl": 0.2578125, "learning_rate": 8.11e-07, "loss": 0.0103, "reward": 1.714277982711792, "reward_mean": 1.714277982711792, "reward_std": 0.03707485646009445, "rewards/iou_timestamp_reward": 0.7142779231071472, "rewards/t_format_reward": 1.0, "step": 567 }, { "advantages": 6.966292858123779e-07, "completion_length": 15.75, "epoch": 0.18933333333333333, "grad_norm": 10.550886154174805, "kl": 0.2578125, "learning_rate": 8.106666666666666e-07, "loss": 0.0103, "reward": 1.5691871643066406, "reward_mean": 1.5691871643066406, "reward_std": 0.04036330059170723, "rewards/iou_timestamp_reward": 0.5691872239112854, "rewards/t_format_reward": 1.0, "step": 568 }, { "advantages": 5.010515451431274e-07, "completion_length": 15.75, "epoch": 0.18966666666666668, "grad_norm": 12.459877967834473, "kl": 0.34375, "learning_rate": 8.103333333333333e-07, "loss": 0.0138, "reward": 1.4297935962677002, "reward_mean": 1.4297935962677002, "reward_std": 0.09123526513576508, "rewards/iou_timestamp_reward": 0.4297935962677002, "rewards/t_format_reward": 1.0, "step": 569 }, { "advantages": 9.164214134216309e-07, "completion_length": 16.5, "epoch": 0.19, "grad_norm": 12.325056076049805, "kl": 0.369140625, "learning_rate": 8.1e-07, "loss": 0.0148, "reward": 1.5564442873001099, "reward_mean": 1.5564442873001099, "reward_std": 0.019581202417612076, "rewards/iou_timestamp_reward": 0.5564442873001099, "rewards/t_format_reward": 1.0, "step": 570 }, { "advantages": -2.7939677238464355e-07, "completion_length": 59.5625, "epoch": 0.19033333333333333, "grad_norm": 4.821761608123779, "kl": 0.49609375, "learning_rate": 8.096666666666667e-07, "loss": 0.0198, "reward": 0.7573415040969849, "reward_mean": 0.7573415040969849, "reward_std": 0.025331465527415276, "rewards/a_meteor_reward": 0.7573415040969849, "step": 571 }, { "advantages": -2.60770320892334e-08, "completion_length": 151.5625, "epoch": 0.19066666666666668, "grad_norm": 3.900909423828125, "kl": 0.1953125, "learning_rate": 8.093333333333333e-07, "loss": 0.0078, "reward": 0.5707398653030396, "reward_mean": 0.5707398653030396, "reward_std": 0.14857330918312073, "rewards/a_meteor_reward": 0.5707398653030396, "step": 572 }, { "advantages": 1.1920928955078125e-07, "completion_length": 15.0, "epoch": 0.191, "grad_norm": 11.534671783447266, "kl": 0.2333984375, "learning_rate": 8.09e-07, "loss": 0.0093, "reward": 1.4710906744003296, "reward_mean": 1.4710906744003296, "reward_std": 0.12386453151702881, "rewards/iou_timestamp_reward": 0.4710906147956848, "rewards/t_format_reward": 1.0, "step": 573 }, { "advantages": 2.477318048477173e-07, "completion_length": 105.625, "epoch": 0.19133333333333333, "grad_norm": 4.454545497894287, "kl": 0.16796875, "learning_rate": 8.086666666666666e-07, "loss": 0.0067, "reward": 0.4118654131889343, "reward_mean": 0.4118654131889343, "reward_std": 0.061521925032138824, "rewards/v_meteor_reward": 0.4118654131889343, "step": 574 }, { "advantages": -1.525040715932846e-07, "completion_length": 15.5, "epoch": 0.19166666666666668, "grad_norm": 24.213462829589844, "kl": 0.306640625, "learning_rate": 8.083333333333334e-07, "loss": 0.0123, "reward": 1.7161095142364502, "reward_mean": 1.7161095142364502, "reward_std": 0.05916770547628403, "rewards/iou_timestamp_reward": 0.7161095142364502, "rewards/t_format_reward": 1.0, "step": 575 }, { "advantages": 1.2218952178955078e-06, "completion_length": 15.375, "epoch": 0.192, "grad_norm": 13.881576538085938, "kl": 0.2158203125, "learning_rate": 8.08e-07, "loss": 0.0086, "reward": 1.9315083026885986, "reward_mean": 1.9315083026885986, "reward_std": 0.01825626567006111, "rewards/iou_timestamp_reward": 0.9315084218978882, "rewards/t_format_reward": 1.0, "step": 576 }, { "advantages": 2.5760382413864136e-06, "completion_length": 14.9375, "epoch": 0.19233333333333333, "grad_norm": 11.383197784423828, "kl": 0.26953125, "learning_rate": 8.076666666666666e-07, "loss": 0.0108, "reward": 1.8865711688995361, "reward_mean": 1.8865711688995361, "reward_std": 0.049697328358888626, "rewards/iou_timestamp_reward": 0.8865711688995361, "rewards/t_format_reward": 1.0, "step": 577 }, { "advantages": -7.711350917816162e-07, "completion_length": 130.0625, "epoch": 0.19266666666666668, "grad_norm": 3.1635935306549072, "kl": 0.263671875, "learning_rate": 8.073333333333333e-07, "loss": 0.0106, "reward": 0.8463408946990967, "reward_mean": 0.8463408946990967, "reward_std": 0.03688797354698181, "rewards/a_meteor_reward": 0.8463408946990967, "step": 578 }, { "advantages": -3.246590495109558e-06, "completion_length": 16.5, "epoch": 0.193, "grad_norm": 6.943329811096191, "kl": 0.345703125, "learning_rate": 8.070000000000001e-07, "loss": 0.0138, "reward": 1.548109531402588, "reward_mean": 1.548109531402588, "reward_std": 0.019428417086601257, "rewards/iou_timestamp_reward": 0.5481094717979431, "rewards/t_format_reward": 1.0, "step": 579 }, { "advantages": 4.908069968223572e-07, "completion_length": 109.5625, "epoch": 0.19333333333333333, "grad_norm": 2.8688178062438965, "kl": 0.138671875, "learning_rate": 8.066666666666666e-07, "loss": 0.0055, "reward": 0.8003230094909668, "reward_mean": 0.8003230094909668, "reward_std": 0.029168717563152313, "rewards/a_meteor_reward": 0.8003230094909668, "step": 580 }, { "advantages": 7.82310962677002e-08, "completion_length": 103.625, "epoch": 0.19366666666666665, "grad_norm": 5.242727279663086, "kl": 0.205078125, "learning_rate": 8.063333333333333e-07, "loss": 0.0082, "reward": 0.5136655569076538, "reward_mean": 0.5136655569076538, "reward_std": 0.09248703718185425, "rewards/a_meteor_reward": 0.5136655569076538, "step": 581 }, { "advantages": 1.0803341865539551e-07, "completion_length": 88.6875, "epoch": 0.194, "grad_norm": 4.2216572761535645, "kl": 0.126953125, "learning_rate": 8.06e-07, "loss": 0.0051, "reward": 0.40292203426361084, "reward_mean": 0.40292203426361084, "reward_std": 0.07632257044315338, "rewards/v_meteor_reward": 0.40292203426361084, "step": 582 }, { "advantages": 4.6566128730773926e-08, "completion_length": 96.6875, "epoch": 0.19433333333333333, "grad_norm": 3.2855587005615234, "kl": 0.25390625, "learning_rate": 8.056666666666666e-07, "loss": 0.0101, "reward": 0.70721435546875, "reward_mean": 0.70721435546875, "reward_std": 0.03051060251891613, "rewards/a_meteor_reward": 0.70721435546875, "step": 583 }, { "advantages": 2.9802322387695312e-08, "completion_length": 77.4375, "epoch": 0.19466666666666665, "grad_norm": 5.284548759460449, "kl": 0.203125, "learning_rate": 8.053333333333333e-07, "loss": 0.0081, "reward": 0.3670760691165924, "reward_mean": 0.3670760691165924, "reward_std": 0.07759791612625122, "rewards/v_meteor_reward": 0.3670760691165924, "step": 584 }, { "advantages": 1.4901161193847656e-07, "completion_length": 16.25, "epoch": 0.195, "grad_norm": 18.08498764038086, "kl": 0.31640625, "learning_rate": 8.05e-07, "loss": 0.0127, "reward": 1.5334231853485107, "reward_mean": 1.5334231853485107, "reward_std": 0.0507931113243103, "rewards/iou_timestamp_reward": 0.5334231853485107, "rewards/t_format_reward": 1.0, "step": 585 }, { "advantages": -4.246830940246582e-07, "completion_length": 145.75, "epoch": 0.19533333333333333, "grad_norm": 4.70088529586792, "kl": 0.330078125, "learning_rate": 8.046666666666666e-07, "loss": 0.0132, "reward": 0.8154833316802979, "reward_mean": 0.8154833316802979, "reward_std": 0.04487649351358414, "rewards/a_meteor_reward": 0.8154833316802979, "step": 586 }, { "advantages": 4.284083843231201e-08, "completion_length": 16.0, "epoch": 0.19566666666666666, "grad_norm": 8.818635940551758, "kl": 0.2333984375, "learning_rate": 8.043333333333333e-07, "loss": 0.0093, "reward": 1.7585904598236084, "reward_mean": 1.7585904598236084, "reward_std": 0.0581134557723999, "rewards/iou_timestamp_reward": 0.7585904598236084, "rewards/t_format_reward": 1.0, "step": 587 }, { "advantages": -2.950429916381836e-06, "completion_length": 16.0, "epoch": 0.196, "grad_norm": 10.476907730102539, "kl": 0.287109375, "learning_rate": 8.04e-07, "loss": 0.0115, "reward": 1.827131748199463, "reward_mean": 1.827131748199463, "reward_std": 0.0396793931722641, "rewards/iou_timestamp_reward": 0.8271315693855286, "rewards/t_format_reward": 1.0, "step": 588 }, { "advantages": -1.9744038581848145e-07, "completion_length": 76.3125, "epoch": 0.19633333333333333, "grad_norm": 4.404415607452393, "kl": 0.205078125, "learning_rate": 8.036666666666666e-07, "loss": 0.0082, "reward": 0.7492482662200928, "reward_mean": 0.7492482662200928, "reward_std": 0.06093882769346237, "rewards/a_meteor_reward": 0.7492482662200928, "step": 589 }, { "advantages": -1.1920928955078125e-07, "completion_length": 95.0625, "epoch": 0.19666666666666666, "grad_norm": 5.233668804168701, "kl": 0.201171875, "learning_rate": 8.033333333333333e-07, "loss": 0.0081, "reward": 0.4528219699859619, "reward_mean": 0.4528219699859619, "reward_std": 0.11429522931575775, "rewards/v_meteor_reward": 0.4528219699859619, "step": 590 }, { "advantages": -1.4528632164001465e-07, "completion_length": 117.125, "epoch": 0.197, "grad_norm": 3.998581886291504, "kl": 0.15234375, "learning_rate": 8.03e-07, "loss": 0.0061, "reward": 0.4549980163574219, "reward_mean": 0.4549980163574219, "reward_std": 0.07376306504011154, "rewards/a_meteor_reward": 0.4549980163574219, "step": 591 }, { "advantages": 4.0046870708465576e-08, "completion_length": 121.375, "epoch": 0.19733333333333333, "grad_norm": 4.322912693023682, "kl": 0.1552734375, "learning_rate": 8.026666666666667e-07, "loss": 0.0062, "reward": 0.3200297951698303, "reward_mean": 0.3200297951698303, "reward_std": 0.05659855529665947, "rewards/v_meteor_reward": 0.3200297951698303, "step": 592 }, { "advantages": -2.3283064365386963e-07, "completion_length": 164.4375, "epoch": 0.19766666666666666, "grad_norm": 2.344141721725464, "kl": 0.1923828125, "learning_rate": 8.023333333333333e-07, "loss": 0.0077, "reward": 0.884859561920166, "reward_mean": 0.884859561920166, "reward_std": 0.03449128195643425, "rewards/a_meteor_reward": 0.884859561920166, "step": 593 }, { "advantages": -2.779066562652588e-06, "completion_length": 64.5, "epoch": 0.198, "grad_norm": 4.409121036529541, "kl": 0.400390625, "learning_rate": 8.02e-07, "loss": 0.016, "reward": 0.714792013168335, "reward_mean": 0.714792013168335, "reward_std": 0.03240291029214859, "rewards/a_meteor_reward": 0.714792013168335, "step": 594 }, { "advantages": 1.1175870895385742e-08, "completion_length": 93.125, "epoch": 0.19833333333333333, "grad_norm": 5.6013078689575195, "kl": 0.234375, "learning_rate": 8.016666666666666e-07, "loss": 0.0094, "reward": 0.35085874795913696, "reward_mean": 0.35085874795913696, "reward_std": 0.07603539526462555, "rewards/v_meteor_reward": 0.35085874795913696, "step": 595 }, { "advantages": -1.0989606380462646e-07, "completion_length": 16.0, "epoch": 0.19866666666666666, "grad_norm": 11.528850555419922, "kl": 0.318359375, "learning_rate": 8.013333333333333e-07, "loss": 0.0128, "reward": 1.8770169019699097, "reward_mean": 1.8770169019699097, "reward_std": 0.04361153766512871, "rewards/iou_timestamp_reward": 0.8770169019699097, "rewards/t_format_reward": 1.0, "step": 596 }, { "advantages": -1.7508864402770996e-07, "completion_length": 287.0625, "epoch": 0.199, "grad_norm": 3.831446647644043, "kl": 0.2138671875, "learning_rate": 8.01e-07, "loss": 0.0085, "reward": 0.8595179319381714, "reward_mean": 0.8595179319381714, "reward_std": 0.02985784225165844, "rewards/a_meteor_reward": 0.8595179319381714, "step": 597 }, { "advantages": -1.0468065738677979e-06, "completion_length": 16.25, "epoch": 0.19933333333333333, "grad_norm": 14.804408073425293, "kl": 0.25, "learning_rate": 8.006666666666666e-07, "loss": 0.01, "reward": 1.6007871627807617, "reward_mean": 1.6007871627807617, "reward_std": 0.019084494560956955, "rewards/iou_timestamp_reward": 0.6007871031761169, "rewards/t_format_reward": 1.0, "step": 598 }, { "advantages": 1.1548399925231934e-07, "completion_length": 98.9375, "epoch": 0.19966666666666666, "grad_norm": 4.277563571929932, "kl": 0.1298828125, "learning_rate": 8.003333333333333e-07, "loss": 0.0052, "reward": 0.3691234290599823, "reward_mean": 0.3691234290599823, "reward_std": 0.03125295788049698, "rewards/v_meteor_reward": 0.3691234290599823, "step": 599 }, { "advantages": -6.455229595303535e-08, "completion_length": 88.75, "epoch": 0.2, "grad_norm": 4.3518452644348145, "kl": 0.37890625, "learning_rate": 8e-07, "loss": 0.0152, "reward": 0.6425923109054565, "reward_mean": 0.6425923109054565, "reward_std": 0.027555137872695923, "rewards/a_meteor_reward": 0.6425923109054565, "step": 600 }, { "advantages": 7.82310962677002e-08, "completion_length": 144.9375, "epoch": 0.20033333333333334, "grad_norm": 4.116743564605713, "kl": 0.1376953125, "learning_rate": 7.996666666666666e-07, "loss": 0.0055, "reward": 0.3185270428657532, "reward_mean": 0.3185270428657532, "reward_std": 0.04729309678077698, "rewards/v_meteor_reward": 0.3185270428657532, "step": 601 }, { "advantages": -2.0116567611694336e-07, "completion_length": 138.8125, "epoch": 0.20066666666666666, "grad_norm": 3.6946468353271484, "kl": 0.125, "learning_rate": 7.993333333333333e-07, "loss": 0.005, "reward": 0.49804794788360596, "reward_mean": 0.49804794788360596, "reward_std": 0.12493899464607239, "rewards/a_meteor_reward": 0.49804794788360596, "step": 602 }, { "advantages": -3.2745301723480225e-06, "completion_length": 16.5, "epoch": 0.201, "grad_norm": 14.543778419494629, "kl": 0.287109375, "learning_rate": 7.99e-07, "loss": 0.0115, "reward": 1.7262531518936157, "reward_mean": 1.7262531518936157, "reward_std": 0.062062062323093414, "rewards/iou_timestamp_reward": 0.726253092288971, "rewards/t_format_reward": 1.0, "step": 603 }, { "advantages": 1.4156103134155273e-07, "completion_length": 95.25, "epoch": 0.20133333333333334, "grad_norm": 5.054253578186035, "kl": 0.2119140625, "learning_rate": 7.986666666666666e-07, "loss": 0.0085, "reward": 0.32010090351104736, "reward_mean": 0.32010090351104736, "reward_std": 0.058301325887441635, "rewards/v_meteor_reward": 0.32010090351104736, "step": 604 }, { "advantages": 5.438923835754395e-07, "completion_length": 15.25, "epoch": 0.20166666666666666, "grad_norm": 10.332256317138672, "kl": 0.240234375, "learning_rate": 7.983333333333333e-07, "loss": 0.0096, "reward": 1.6251192092895508, "reward_mean": 1.6251192092895508, "reward_std": 0.0869760811328888, "rewards/iou_timestamp_reward": 0.6251190900802612, "rewards/t_format_reward": 1.0, "step": 605 }, { "advantages": 1.1548399925231934e-06, "completion_length": 15.75, "epoch": 0.202, "grad_norm": 8.3735990524292, "kl": 0.166015625, "learning_rate": 7.98e-07, "loss": 0.0066, "reward": 1.8525385856628418, "reward_mean": 1.8525385856628418, "reward_std": 0.046368710696697235, "rewards/iou_timestamp_reward": 0.8525385856628418, "rewards/t_format_reward": 1.0, "step": 606 }, { "advantages": 1.0715797543525696e-05, "completion_length": 15.0, "epoch": 0.20233333333333334, "grad_norm": 7.76922082901001, "kl": 0.1708984375, "learning_rate": 7.976666666666666e-07, "loss": 0.0068, "reward": 1.4943251609802246, "reward_mean": 1.4943251609802246, "reward_std": 0.006052759476006031, "rewards/iou_timestamp_reward": 0.49432510137557983, "rewards/t_format_reward": 1.0, "step": 607 }, { "advantages": 3.8370490074157715e-07, "completion_length": 346.0, "epoch": 0.20266666666666666, "grad_norm": 2.6481523513793945, "kl": 0.1171875, "learning_rate": 7.973333333333333e-07, "loss": 0.0047, "reward": 0.5504317879676819, "reward_mean": 0.5504317879676819, "reward_std": 0.05553242564201355, "rewards/a_meteor_reward": 0.5504317879676819, "step": 608 }, { "advantages": -1.4901161193847656e-07, "completion_length": 109.5625, "epoch": 0.203, "grad_norm": 6.446979999542236, "kl": 0.205078125, "learning_rate": 7.970000000000001e-07, "loss": 0.0082, "reward": 0.37762078642845154, "reward_mean": 0.37762078642845154, "reward_std": 0.1031365767121315, "rewards/v_meteor_reward": 0.37762078642845154, "step": 609 }, { "advantages": -1.1920928955078125e-07, "completion_length": 28.0625, "epoch": 0.20333333333333334, "grad_norm": 9.113619804382324, "kl": 0.65625, "learning_rate": 7.966666666666666e-07, "loss": 0.0263, "reward": 0.6530382633209229, "reward_mean": 0.6530382633209229, "reward_std": 0.10067333281040192, "rewards/a_meteor_reward": 0.6530382633209229, "step": 610 }, { "advantages": -4.3213367462158203e-07, "completion_length": 15.3125, "epoch": 0.20366666666666666, "grad_norm": 19.7434139251709, "kl": 0.24609375, "learning_rate": 7.963333333333333e-07, "loss": 0.0099, "reward": 1.6470847129821777, "reward_mean": 1.6470847129821777, "reward_std": 0.07609723508358002, "rewards/iou_timestamp_reward": 0.6470847725868225, "rewards/t_format_reward": 1.0, "step": 611 }, { "advantages": 8.195638656616211e-08, "completion_length": 81.8125, "epoch": 0.204, "grad_norm": 6.343822479248047, "kl": 0.2060546875, "learning_rate": 7.96e-07, "loss": 0.0082, "reward": 0.35743480920791626, "reward_mean": 0.35743480920791626, "reward_std": 0.08779138326644897, "rewards/v_meteor_reward": 0.35743480920791626, "step": 612 }, { "advantages": 1.1734664440155029e-07, "completion_length": 38.25, "epoch": 0.20433333333333334, "grad_norm": 5.947166919708252, "kl": 0.4921875, "learning_rate": 7.956666666666666e-07, "loss": 0.0197, "reward": 0.6589776277542114, "reward_mean": 0.6589776277542114, "reward_std": 0.07498305290937424, "rewards/a_meteor_reward": 0.6589776277542114, "step": 613 }, { "advantages": 2.171844244003296e-06, "completion_length": 17.0, "epoch": 0.20466666666666666, "grad_norm": 12.142913818359375, "kl": 0.3359375, "learning_rate": 7.953333333333333e-07, "loss": 0.0135, "reward": 1.8306440114974976, "reward_mean": 1.8306440114974976, "reward_std": 0.020568447187542915, "rewards/iou_timestamp_reward": 0.8306440114974976, "rewards/t_format_reward": 1.0, "step": 614 }, { "advantages": -2.505257725715637e-07, "completion_length": 121.875, "epoch": 0.205, "grad_norm": 4.872692108154297, "kl": 0.193359375, "learning_rate": 7.95e-07, "loss": 0.0077, "reward": 0.3467659056186676, "reward_mean": 0.3467659056186676, "reward_std": 0.053526975214481354, "rewards/v_meteor_reward": 0.3467659056186676, "step": 615 }, { "advantages": -2.6263296604156494e-07, "completion_length": 66.875, "epoch": 0.20533333333333334, "grad_norm": 5.563328266143799, "kl": 0.212890625, "learning_rate": 7.946666666666666e-07, "loss": 0.0085, "reward": 0.3890882432460785, "reward_mean": 0.3890882432460785, "reward_std": 0.07292935997247696, "rewards/v_meteor_reward": 0.3890882432460785, "step": 616 }, { "advantages": -4.917383193969727e-07, "completion_length": 15.75, "epoch": 0.20566666666666666, "grad_norm": 18.65684700012207, "kl": 0.220703125, "learning_rate": 7.943333333333333e-07, "loss": 0.0088, "reward": 1.4516267776489258, "reward_mean": 1.4516267776489258, "reward_std": 0.0954328253865242, "rewards/iou_timestamp_reward": 0.45162689685821533, "rewards/t_format_reward": 1.0, "step": 617 }, { "advantages": 8.940696716308594e-08, "completion_length": 117.75, "epoch": 0.206, "grad_norm": 5.224288463592529, "kl": 0.2314453125, "learning_rate": 7.94e-07, "loss": 0.0093, "reward": 0.65116947889328, "reward_mean": 0.65116947889328, "reward_std": 0.07311993837356567, "rewards/a_meteor_reward": 0.65116947889328, "step": 618 }, { "advantages": -2.7194619178771973e-07, "completion_length": 98.3125, "epoch": 0.20633333333333334, "grad_norm": 5.2157111167907715, "kl": 0.232421875, "learning_rate": 7.936666666666666e-07, "loss": 0.0093, "reward": 0.3648647665977478, "reward_mean": 0.3648647665977478, "reward_std": 0.043528422713279724, "rewards/v_meteor_reward": 0.3648647665977478, "step": 619 }, { "advantages": 6.593763828277588e-07, "completion_length": 73.9375, "epoch": 0.20666666666666667, "grad_norm": 6.352459907531738, "kl": 0.298828125, "learning_rate": 7.933333333333333e-07, "loss": 0.0119, "reward": 0.71700519323349, "reward_mean": 0.71700519323349, "reward_std": 0.04213907569646835, "rewards/a_meteor_reward": 0.71700519323349, "step": 620 }, { "advantages": -1.4901161193847656e-08, "completion_length": 237.6875, "epoch": 0.207, "grad_norm": 2.802676200866699, "kl": 0.11083984375, "learning_rate": 7.93e-07, "loss": 0.0044, "reward": 0.5621645450592041, "reward_mean": 0.5621645450592041, "reward_std": 0.09990620613098145, "rewards/a_meteor_reward": 0.5621645450592041, "step": 621 }, { "advantages": 1.4565885066986084e-06, "completion_length": 15.5, "epoch": 0.20733333333333334, "grad_norm": 6.298768520355225, "kl": 0.26953125, "learning_rate": 7.926666666666666e-07, "loss": 0.0108, "reward": 1.8239507675170898, "reward_mean": 1.8239507675170898, "reward_std": 0.018991723656654358, "rewards/iou_timestamp_reward": 0.8239508867263794, "rewards/t_format_reward": 1.0, "step": 622 }, { "advantages": -4.470348358154297e-08, "completion_length": 84.6875, "epoch": 0.20766666666666667, "grad_norm": 6.47592306137085, "kl": 0.2138671875, "learning_rate": 7.923333333333333e-07, "loss": 0.0086, "reward": 0.3422720432281494, "reward_mean": 0.3422720432281494, "reward_std": 0.06652481853961945, "rewards/v_meteor_reward": 0.3422720432281494, "step": 623 }, { "advantages": -1.0803341865539551e-07, "completion_length": 16.0, "epoch": 0.208, "grad_norm": 10.73786735534668, "kl": 0.228515625, "learning_rate": 7.92e-07, "loss": 0.0092, "reward": 1.753670573234558, "reward_mean": 1.753670573234558, "reward_std": 0.026496410369873047, "rewards/iou_timestamp_reward": 0.7536705732345581, "rewards/t_format_reward": 1.0, "step": 624 }, { "advantages": 1.8440186977386475e-07, "completion_length": 109.3125, "epoch": 0.20833333333333334, "grad_norm": 5.798683166503906, "kl": 0.4765625, "learning_rate": 7.916666666666666e-07, "loss": 0.0191, "reward": 0.6478855609893799, "reward_mean": 0.6478855609893799, "reward_std": 0.07031069695949554, "rewards/a_meteor_reward": 0.6478855609893799, "step": 625 }, { "advantages": 1.776963472366333e-06, "completion_length": 15.25, "epoch": 0.20866666666666667, "grad_norm": 13.027702331542969, "kl": 0.234375, "learning_rate": 7.913333333333332e-07, "loss": 0.0094, "reward": 1.939704179763794, "reward_mean": 1.939704179763794, "reward_std": 0.027395736426115036, "rewards/iou_timestamp_reward": 0.9397042393684387, "rewards/t_format_reward": 1.0, "step": 626 }, { "advantages": 2.980232238769531e-07, "completion_length": 14.625, "epoch": 0.209, "grad_norm": 16.47467803955078, "kl": 0.19140625, "learning_rate": 7.91e-07, "loss": 0.0077, "reward": 1.7275110483169556, "reward_mean": 1.7275110483169556, "reward_std": 0.11372262239456177, "rewards/iou_timestamp_reward": 0.7275111675262451, "rewards/t_format_reward": 1.0, "step": 627 }, { "advantages": 1.8998980522155762e-07, "completion_length": 106.3125, "epoch": 0.20933333333333334, "grad_norm": 4.933012008666992, "kl": 0.1845703125, "learning_rate": 7.906666666666666e-07, "loss": 0.0074, "reward": 0.3403220772743225, "reward_mean": 0.3403220772743225, "reward_std": 0.05687690153717995, "rewards/v_meteor_reward": 0.3403220772743225, "step": 628 }, { "advantages": -1.4156103134155273e-07, "completion_length": 85.125, "epoch": 0.20966666666666667, "grad_norm": 4.933987617492676, "kl": 0.169921875, "learning_rate": 7.903333333333333e-07, "loss": 0.0068, "reward": 0.353900283575058, "reward_mean": 0.353900283575058, "reward_std": 0.05370844155550003, "rewards/v_meteor_reward": 0.353900283575058, "step": 629 }, { "advantages": 3.46451997756958e-07, "completion_length": 93.125, "epoch": 0.21, "grad_norm": 5.112491607666016, "kl": 0.16796875, "learning_rate": 7.9e-07, "loss": 0.0067, "reward": 0.3538914918899536, "reward_mean": 0.3538914918899536, "reward_std": 0.040435053408145905, "rewards/v_meteor_reward": 0.3538914918899536, "step": 630 }, { "advantages": -1.3783574104309082e-07, "completion_length": 108.4375, "epoch": 0.21033333333333334, "grad_norm": 5.877511024475098, "kl": 0.345703125, "learning_rate": 7.896666666666666e-07, "loss": 0.0138, "reward": 0.7484733462333679, "reward_mean": 0.7484733462333679, "reward_std": 0.050207287073135376, "rewards/a_meteor_reward": 0.7484733462333679, "step": 631 }, { "advantages": -3.725290298461914e-08, "completion_length": 28.8125, "epoch": 0.21066666666666667, "grad_norm": 8.121668815612793, "kl": 0.60546875, "learning_rate": 7.893333333333333e-07, "loss": 0.0242, "reward": 0.36740565299987793, "reward_mean": 0.36740565299987793, "reward_std": 0.06430971622467041, "rewards/a_meteor_reward": 0.36740565299987793, "step": 632 }, { "advantages": -1.30385160446167e-08, "completion_length": 79.1875, "epoch": 0.211, "grad_norm": 5.9121785163879395, "kl": 0.193359375, "learning_rate": 7.89e-07, "loss": 0.0077, "reward": 0.4016711413860321, "reward_mean": 0.4016711413860321, "reward_std": 0.09762128442525864, "rewards/v_meteor_reward": 0.4016711413860321, "step": 633 }, { "advantages": 1.2665987014770508e-07, "completion_length": 95.9375, "epoch": 0.21133333333333335, "grad_norm": 4.508252143859863, "kl": 0.1962890625, "learning_rate": 7.886666666666666e-07, "loss": 0.0078, "reward": 0.472261905670166, "reward_mean": 0.472261905670166, "reward_std": 0.07484094798564911, "rewards/v_meteor_reward": 0.472261905670166, "step": 634 }, { "advantages": -4.1909515857696533e-07, "completion_length": 14.5625, "epoch": 0.21166666666666667, "grad_norm": 15.345756530761719, "kl": 0.1953125, "learning_rate": 7.883333333333333e-07, "loss": 0.0078, "reward": 1.8745887279510498, "reward_mean": 1.8745887279510498, "reward_std": 0.03406809642910957, "rewards/iou_timestamp_reward": 0.8745887875556946, "rewards/t_format_reward": 1.0, "step": 635 }, { "advantages": 5.587935447692871e-07, "completion_length": 99.375, "epoch": 0.212, "grad_norm": 4.621163368225098, "kl": 0.158203125, "learning_rate": 7.88e-07, "loss": 0.0063, "reward": 0.45441973209381104, "reward_mean": 0.45441973209381104, "reward_std": 0.03852500766515732, "rewards/v_meteor_reward": 0.45441973209381104, "step": 636 }, { "advantages": -1.3373792171478271e-06, "completion_length": 15.375, "epoch": 0.21233333333333335, "grad_norm": 11.99035930633545, "kl": 0.244140625, "learning_rate": 7.876666666666666e-07, "loss": 0.0098, "reward": 1.5316121578216553, "reward_mean": 1.5316121578216553, "reward_std": 0.025103826075792313, "rewards/iou_timestamp_reward": 0.5316121578216553, "rewards/t_format_reward": 1.0, "step": 637 }, { "advantages": -8.568167686462402e-08, "completion_length": 309.5625, "epoch": 0.21266666666666667, "grad_norm": 1.3812575340270996, "kl": 0.08837890625, "learning_rate": 7.873333333333333e-07, "loss": 0.0035, "reward": 0.8741787075996399, "reward_mean": 0.8741787075996399, "reward_std": 0.03446812555193901, "rewards/a_meteor_reward": 0.8741787075996399, "step": 638 }, { "advantages": -1.4007091522216797e-06, "completion_length": 15.5625, "epoch": 0.213, "grad_norm": 10.442456245422363, "kl": 0.2451171875, "learning_rate": 7.87e-07, "loss": 0.0098, "reward": 1.7486037015914917, "reward_mean": 1.7486037015914917, "reward_std": 0.031992070376873016, "rewards/iou_timestamp_reward": 0.7486037015914917, "rewards/t_format_reward": 1.0, "step": 639 }, { "advantages": 1.5692785382270813e-07, "completion_length": 52.4375, "epoch": 0.21333333333333335, "grad_norm": 5.393824577331543, "kl": 0.3828125, "learning_rate": 7.866666666666666e-07, "loss": 0.0153, "reward": 0.5713068842887878, "reward_mean": 0.5713068842887878, "reward_std": 0.07101015001535416, "rewards/a_meteor_reward": 0.5713068842887878, "step": 640 }, { "advantages": -3.725290298461914e-09, "completion_length": 67.0, "epoch": 0.21366666666666667, "grad_norm": 6.5012030601501465, "kl": 0.294921875, "learning_rate": 7.863333333333333e-07, "loss": 0.0117, "reward": 0.28913670778274536, "reward_mean": 0.28913670778274536, "reward_std": 0.08050169050693512, "rewards/v_meteor_reward": 0.28913670778274536, "step": 641 }, { "advantages": -1.1548399925231934e-07, "completion_length": 27.75, "epoch": 0.214, "grad_norm": 7.54563045501709, "kl": 0.53125, "learning_rate": 7.86e-07, "loss": 0.0212, "reward": 0.4967079758644104, "reward_mean": 0.4967079758644104, "reward_std": 0.09257599711418152, "rewards/a_meteor_reward": 0.4967079758644104, "step": 642 }, { "advantages": 1.5683472156524658e-06, "completion_length": 16.0, "epoch": 0.21433333333333332, "grad_norm": 9.56205940246582, "kl": 0.1962890625, "learning_rate": 7.856666666666665e-07, "loss": 0.0078, "reward": 1.6954916715621948, "reward_mean": 1.6954916715621948, "reward_std": 0.023980913683772087, "rewards/iou_timestamp_reward": 0.6954916715621948, "rewards/t_format_reward": 1.0, "step": 643 }, { "advantages": 4.842877388000488e-07, "completion_length": 15.0, "epoch": 0.21466666666666667, "grad_norm": 23.835756301879883, "kl": 0.1513671875, "learning_rate": 7.853333333333333e-07, "loss": 0.0061, "reward": 1.705311894416809, "reward_mean": 1.705311894416809, "reward_std": 0.11156104505062103, "rewards/iou_timestamp_reward": 0.7053118944168091, "rewards/t_format_reward": 1.0, "step": 644 }, { "advantages": -5.960464477539063e-08, "completion_length": 101.9375, "epoch": 0.215, "grad_norm": 4.779202461242676, "kl": 0.12890625, "learning_rate": 7.85e-07, "loss": 0.0051, "reward": 0.41725897789001465, "reward_mean": 0.41725897789001465, "reward_std": 0.06132848188281059, "rewards/v_meteor_reward": 0.41725897789001465, "step": 645 }, { "advantages": 1.564621925354004e-07, "completion_length": 83.0, "epoch": 0.21533333333333332, "grad_norm": 6.22570276260376, "kl": 0.212890625, "learning_rate": 7.846666666666666e-07, "loss": 0.0085, "reward": 0.45018017292022705, "reward_mean": 0.45018017292022705, "reward_std": 0.07035762071609497, "rewards/v_meteor_reward": 0.45018017292022705, "step": 646 }, { "advantages": -6.146728992462158e-08, "completion_length": 116.6875, "epoch": 0.21566666666666667, "grad_norm": 2.2787206172943115, "kl": 0.1865234375, "learning_rate": 7.843333333333332e-07, "loss": 0.0075, "reward": 0.8215811252593994, "reward_mean": 0.8215811252593994, "reward_std": 0.0245538167655468, "rewards/a_meteor_reward": 0.8215811252593994, "step": 647 }, { "advantages": 4.842877388000488e-07, "completion_length": 16.0, "epoch": 0.216, "grad_norm": 8.84388542175293, "kl": 0.3515625, "learning_rate": 7.84e-07, "loss": 0.0141, "reward": 1.7653602361679077, "reward_mean": 1.7653602361679077, "reward_std": 0.026927147060632706, "rewards/iou_timestamp_reward": 0.7653602361679077, "rewards/t_format_reward": 1.0, "step": 648 }, { "advantages": -5.62518835067749e-07, "completion_length": 15.5, "epoch": 0.21633333333333332, "grad_norm": 10.2977933883667, "kl": 0.32421875, "learning_rate": 7.836666666666666e-07, "loss": 0.013, "reward": 1.8125474452972412, "reward_mean": 1.8125474452972412, "reward_std": 0.0500105544924736, "rewards/iou_timestamp_reward": 0.8125473856925964, "rewards/t_format_reward": 1.0, "step": 649 }, { "advantages": 2.644956111907959e-07, "completion_length": 175.5, "epoch": 0.21666666666666667, "grad_norm": 2.7017619609832764, "kl": 0.12255859375, "learning_rate": 7.833333333333333e-07, "loss": 0.0049, "reward": 0.6011068224906921, "reward_mean": 0.6011068224906921, "reward_std": 0.1299181878566742, "rewards/a_meteor_reward": 0.6011068224906921, "step": 650 }, { "advantages": -1.046457327902317e-06, "completion_length": 66.75, "epoch": 0.217, "grad_norm": 4.331597328186035, "kl": 0.28125, "learning_rate": 7.83e-07, "loss": 0.0113, "reward": 0.6995042562484741, "reward_mean": 0.6995042562484741, "reward_std": 0.02259618602693081, "rewards/a_meteor_reward": 0.6995042562484741, "step": 651 }, { "advantages": -6.258487701416016e-07, "completion_length": 48.0625, "epoch": 0.21733333333333332, "grad_norm": 8.304964065551758, "kl": 0.5078125, "learning_rate": 7.826666666666666e-07, "loss": 0.0203, "reward": 0.4151023030281067, "reward_mean": 0.4151023030281067, "reward_std": 0.07177278399467468, "rewards/a_meteor_reward": 0.4151023030281067, "step": 652 }, { "advantages": 2.7567148208618164e-07, "completion_length": 100.125, "epoch": 0.21766666666666667, "grad_norm": 4.8541083335876465, "kl": 0.1474609375, "learning_rate": 7.823333333333333e-07, "loss": 0.0059, "reward": 0.3250288665294647, "reward_mean": 0.3250288665294647, "reward_std": 0.0384698286652565, "rewards/v_meteor_reward": 0.3250288665294647, "step": 653 }, { "advantages": 1.601874828338623e-06, "completion_length": 247.4375, "epoch": 0.218, "grad_norm": 2.360539674758911, "kl": 0.11767578125, "learning_rate": 7.82e-07, "loss": 0.0047, "reward": 0.8337380886077881, "reward_mean": 0.8337380886077881, "reward_std": 0.029132023453712463, "rewards/a_meteor_reward": 0.8337380886077881, "step": 654 }, { "advantages": -3.166496753692627e-07, "completion_length": 16.0, "epoch": 0.21833333333333332, "grad_norm": 8.235990524291992, "kl": 0.2294921875, "learning_rate": 7.816666666666666e-07, "loss": 0.0092, "reward": 1.8230977058410645, "reward_mean": 1.8230977058410645, "reward_std": 0.02625545673072338, "rewards/iou_timestamp_reward": 0.8230976462364197, "rewards/t_format_reward": 1.0, "step": 655 }, { "advantages": 2.7567148208618164e-07, "completion_length": 96.125, "epoch": 0.21866666666666668, "grad_norm": 3.7148187160491943, "kl": 0.158203125, "learning_rate": 7.813333333333332e-07, "loss": 0.0063, "reward": 0.645971417427063, "reward_mean": 0.645971417427063, "reward_std": 0.06372880935668945, "rewards/a_meteor_reward": 0.645971417427063, "step": 656 }, { "advantages": -1.4528632164001465e-07, "completion_length": 168.9375, "epoch": 0.219, "grad_norm": 3.5884850025177, "kl": 0.19140625, "learning_rate": 7.81e-07, "loss": 0.0077, "reward": 0.6909002065658569, "reward_mean": 0.6909002065658569, "reward_std": 0.16198879480361938, "rewards/a_meteor_reward": 0.6909002065658569, "step": 657 }, { "advantages": -3.4794211387634277e-06, "completion_length": 15.5, "epoch": 0.21933333333333332, "grad_norm": 6.766848087310791, "kl": 0.2412109375, "learning_rate": 7.806666666666666e-07, "loss": 0.0097, "reward": 1.6401209831237793, "reward_mean": 1.6401209831237793, "reward_std": 0.02800721675157547, "rewards/iou_timestamp_reward": 0.6401209235191345, "rewards/t_format_reward": 1.0, "step": 658 }, { "advantages": 1.1175870895385742e-08, "completion_length": 82.875, "epoch": 0.21966666666666668, "grad_norm": 5.1811604499816895, "kl": 0.1650390625, "learning_rate": 7.803333333333333e-07, "loss": 0.0066, "reward": 0.3492392897605896, "reward_mean": 0.3492392897605896, "reward_std": 0.06984499096870422, "rewards/v_meteor_reward": 0.3492392897605896, "step": 659 }, { "advantages": 1.9781291484832764e-06, "completion_length": 16.0, "epoch": 0.22, "grad_norm": 9.04871654510498, "kl": 0.166015625, "learning_rate": 7.799999999999999e-07, "loss": 0.0066, "reward": 1.5062472820281982, "reward_mean": 1.5062472820281982, "reward_std": 0.04612359404563904, "rewards/iou_timestamp_reward": 0.506247341632843, "rewards/t_format_reward": 1.0, "step": 660 }, { "advantages": 2.3283064365386963e-07, "completion_length": 93.75, "epoch": 0.22033333333333333, "grad_norm": 2.9890499114990234, "kl": 0.19921875, "learning_rate": 7.796666666666666e-07, "loss": 0.008, "reward": 0.8433783054351807, "reward_mean": 0.8433783054351807, "reward_std": 0.03176786005496979, "rewards/a_meteor_reward": 0.8433783054351807, "step": 661 }, { "advantages": 6.51925802230835e-08, "completion_length": 84.75, "epoch": 0.22066666666666668, "grad_norm": 5.515537261962891, "kl": 0.228515625, "learning_rate": 7.793333333333333e-07, "loss": 0.0091, "reward": 0.44967973232269287, "reward_mean": 0.44967973232269287, "reward_std": 0.10811269283294678, "rewards/v_meteor_reward": 0.44967973232269287, "step": 662 }, { "advantages": -1.862645149230957e-07, "completion_length": 16.0, "epoch": 0.221, "grad_norm": 5.868802070617676, "kl": 0.35546875, "learning_rate": 7.79e-07, "loss": 0.0142, "reward": 1.6153388023376465, "reward_mean": 1.6153388023376465, "reward_std": 0.049297090619802475, "rewards/iou_timestamp_reward": 0.6153387427330017, "rewards/t_format_reward": 1.0, "step": 663 }, { "advantages": -1.6391277313232422e-07, "completion_length": 85.125, "epoch": 0.22133333333333333, "grad_norm": 5.14755916595459, "kl": 0.173828125, "learning_rate": 7.786666666666665e-07, "loss": 0.007, "reward": 0.3289879262447357, "reward_mean": 0.3289879262447357, "reward_std": 0.07261068373918533, "rewards/v_meteor_reward": 0.3289879262447357, "step": 664 }, { "advantages": 1.0989606380462646e-07, "completion_length": 95.875, "epoch": 0.22166666666666668, "grad_norm": 5.629847526550293, "kl": 0.201171875, "learning_rate": 7.783333333333333e-07, "loss": 0.008, "reward": 0.36205872893333435, "reward_mean": 0.36205872893333435, "reward_std": 0.07615582644939423, "rewards/v_meteor_reward": 0.36205872893333435, "step": 665 }, { "advantages": -2.9802322387695312e-08, "completion_length": 85.6875, "epoch": 0.222, "grad_norm": 4.770866870880127, "kl": 0.2138671875, "learning_rate": 7.78e-07, "loss": 0.0086, "reward": 0.33612728118896484, "reward_mean": 0.33612728118896484, "reward_std": 0.05759852007031441, "rewards/v_meteor_reward": 0.33612728118896484, "step": 666 }, { "advantages": -1.4528632164001465e-07, "completion_length": 199.625, "epoch": 0.22233333333333333, "grad_norm": 2.9682629108428955, "kl": 0.162109375, "learning_rate": 7.776666666666666e-07, "loss": 0.0065, "reward": 0.5880117416381836, "reward_mean": 0.5880117416381836, "reward_std": 0.08399669080972672, "rewards/a_meteor_reward": 0.5880117416381836, "step": 667 }, { "advantages": -9.778887033462524e-08, "completion_length": 255.9375, "epoch": 0.22266666666666668, "grad_norm": 3.3744993209838867, "kl": 0.13671875, "learning_rate": 7.773333333333333e-07, "loss": 0.0055, "reward": 0.8271226286888123, "reward_mean": 0.8271226286888123, "reward_std": 0.015963859856128693, "rewards/a_meteor_reward": 0.8271226286888123, "step": 668 }, { "advantages": -1.8067657947540283e-07, "completion_length": 15.5, "epoch": 0.223, "grad_norm": 13.09054183959961, "kl": 0.251953125, "learning_rate": 7.77e-07, "loss": 0.0101, "reward": 1.4697391986846924, "reward_mean": 1.4697391986846924, "reward_std": 0.06756982952356339, "rewards/iou_timestamp_reward": 0.4697391986846924, "rewards/t_format_reward": 1.0, "step": 669 }, { "advantages": 2.2351741790771484e-08, "completion_length": 149.0625, "epoch": 0.22333333333333333, "grad_norm": 7.468779563903809, "kl": 0.41796875, "learning_rate": 7.766666666666666e-07, "loss": 0.0167, "reward": 0.6966019868850708, "reward_mean": 0.6966019868850708, "reward_std": 0.05451729893684387, "rewards/a_meteor_reward": 0.6966019868850708, "step": 670 }, { "advantages": 3.725290298461914e-08, "completion_length": 74.3125, "epoch": 0.22366666666666668, "grad_norm": 5.355835437774658, "kl": 0.171875, "learning_rate": 7.763333333333333e-07, "loss": 0.0069, "reward": 0.37415868043899536, "reward_mean": 0.37415868043899536, "reward_std": 0.0814162939786911, "rewards/v_meteor_reward": 0.37415868043899536, "step": 671 }, { "advantages": -7.543712854385376e-08, "completion_length": 74.4375, "epoch": 0.224, "grad_norm": 5.709102153778076, "kl": 0.220703125, "learning_rate": 7.76e-07, "loss": 0.0088, "reward": 0.3398711085319519, "reward_mean": 0.3398711085319519, "reward_std": 0.042332325130701065, "rewards/v_meteor_reward": 0.3398711085319519, "step": 672 }, { "advantages": -1.1101365089416504e-06, "completion_length": 15.5, "epoch": 0.22433333333333333, "grad_norm": 20.679298400878906, "kl": 0.26953125, "learning_rate": 7.756666666666665e-07, "loss": 0.0108, "reward": 1.7068462371826172, "reward_mean": 1.7068462371826172, "reward_std": 0.10678645223379135, "rewards/iou_timestamp_reward": 0.7068462371826172, "rewards/t_format_reward": 1.0, "step": 673 }, { "advantages": -9.834766387939453e-07, "completion_length": 15.5, "epoch": 0.22466666666666665, "grad_norm": 18.058826446533203, "kl": 0.392578125, "learning_rate": 7.753333333333333e-07, "loss": 0.0157, "reward": 1.9302558898925781, "reward_mean": 1.9302558898925781, "reward_std": 0.02851530909538269, "rewards/iou_timestamp_reward": 0.9302558302879333, "rewards/t_format_reward": 1.0, "step": 674 }, { "advantages": 7.264316082000732e-07, "completion_length": 15.875, "epoch": 0.225, "grad_norm": 13.951688766479492, "kl": 0.279296875, "learning_rate": 7.75e-07, "loss": 0.0112, "reward": 1.5831117630004883, "reward_mean": 1.5831117630004883, "reward_std": 0.11207427829504013, "rewards/iou_timestamp_reward": 0.5831116437911987, "rewards/t_format_reward": 1.0, "step": 675 }, { "advantages": 8.735805749893188e-07, "completion_length": 15.5, "epoch": 0.22533333333333333, "grad_norm": 7.804253101348877, "kl": 0.220703125, "learning_rate": 7.746666666666666e-07, "loss": 0.0088, "reward": 1.5971291065216064, "reward_mean": 1.5971291065216064, "reward_std": 0.047428518533706665, "rewards/iou_timestamp_reward": 0.597129225730896, "rewards/t_format_reward": 1.0, "step": 676 }, { "advantages": -1.4901161193847656e-08, "completion_length": 107.6875, "epoch": 0.22566666666666665, "grad_norm": 4.723328113555908, "kl": 0.1640625, "learning_rate": 7.743333333333332e-07, "loss": 0.0065, "reward": 0.34111517667770386, "reward_mean": 0.34111517667770386, "reward_std": 0.036954306066036224, "rewards/v_meteor_reward": 0.34111517667770386, "step": 677 }, { "advantages": 2.289190888404846e-06, "completion_length": 16.0, "epoch": 0.226, "grad_norm": 11.729619026184082, "kl": 0.30078125, "learning_rate": 7.74e-07, "loss": 0.012, "reward": 1.4047517776489258, "reward_mean": 1.4047517776489258, "reward_std": 0.009431653656065464, "rewards/iou_timestamp_reward": 0.40475180745124817, "rewards/t_format_reward": 1.0, "step": 678 }, { "advantages": -8.493661880493164e-07, "completion_length": 16.25, "epoch": 0.22633333333333333, "grad_norm": 5.9131245613098145, "kl": 0.29296875, "learning_rate": 7.736666666666666e-07, "loss": 0.0117, "reward": 1.790055513381958, "reward_mean": 1.790055513381958, "reward_std": 0.041889432817697525, "rewards/iou_timestamp_reward": 0.7900555729866028, "rewards/t_format_reward": 1.0, "step": 679 }, { "advantages": -1.0803341865539551e-07, "completion_length": 388.875, "epoch": 0.22666666666666666, "grad_norm": 2.1026434898376465, "kl": 0.0791015625, "learning_rate": 7.733333333333333e-07, "loss": 0.0032, "reward": 0.6803286671638489, "reward_mean": 0.6803286671638489, "reward_std": 0.0448436439037323, "rewards/a_meteor_reward": 0.6803286671638489, "step": 680 }, { "advantages": 2.4121254682540894e-07, "completion_length": 56.75, "epoch": 0.227, "grad_norm": 4.702218055725098, "kl": 0.28515625, "learning_rate": 7.729999999999999e-07, "loss": 0.0114, "reward": 0.6043165922164917, "reward_mean": 0.6043165922164917, "reward_std": 0.05598846822977066, "rewards/a_meteor_reward": 0.6043165922164917, "step": 681 }, { "advantages": 2.9299408197402954e-06, "completion_length": 15.5, "epoch": 0.22733333333333333, "grad_norm": 15.853730201721191, "kl": 0.29296875, "learning_rate": 7.726666666666666e-07, "loss": 0.0117, "reward": 1.804813265800476, "reward_mean": 1.804813265800476, "reward_std": 0.041401494294404984, "rewards/iou_timestamp_reward": 0.8048133254051208, "rewards/t_format_reward": 1.0, "step": 682 }, { "advantages": -3.725290298461914e-08, "completion_length": 64.5625, "epoch": 0.22766666666666666, "grad_norm": 4.61552095413208, "kl": 0.34765625, "learning_rate": 7.723333333333333e-07, "loss": 0.0139, "reward": 0.7538237571716309, "reward_mean": 0.7538237571716309, "reward_std": 0.05200108140707016, "rewards/a_meteor_reward": 0.7538237571716309, "step": 683 }, { "advantages": -2.514570951461792e-07, "completion_length": 74.9375, "epoch": 0.228, "grad_norm": 5.283360958099365, "kl": 0.2265625, "learning_rate": 7.72e-07, "loss": 0.0091, "reward": 0.378223717212677, "reward_mean": 0.378223717212677, "reward_std": 0.07244960963726044, "rewards/v_meteor_reward": 0.378223717212677, "step": 684 }, { "advantages": 4.0978193283081055e-08, "completion_length": 68.0625, "epoch": 0.22833333333333333, "grad_norm": 6.337747097015381, "kl": 0.275390625, "learning_rate": 7.716666666666665e-07, "loss": 0.0111, "reward": 0.4573162794113159, "reward_mean": 0.4573162794113159, "reward_std": 0.08276009559631348, "rewards/v_meteor_reward": 0.4573162794113159, "step": 685 }, { "advantages": 1.1175870895385742e-07, "completion_length": 77.5625, "epoch": 0.22866666666666666, "grad_norm": 5.319462299346924, "kl": 0.169921875, "learning_rate": 7.713333333333333e-07, "loss": 0.0068, "reward": 0.37480151653289795, "reward_mean": 0.37480151653289795, "reward_std": 0.08307808637619019, "rewards/v_meteor_reward": 0.37480151653289795, "step": 686 }, { "advantages": 3.725290298461914e-07, "completion_length": 70.0625, "epoch": 0.229, "grad_norm": 5.895742416381836, "kl": 0.234375, "learning_rate": 7.71e-07, "loss": 0.0094, "reward": 0.47910022735595703, "reward_mean": 0.47910022735595703, "reward_std": 0.048991259187459946, "rewards/v_meteor_reward": 0.47910022735595703, "step": 687 }, { "advantages": -7.450580596923828e-09, "completion_length": 124.5625, "epoch": 0.22933333333333333, "grad_norm": 3.5147900581359863, "kl": 0.1337890625, "learning_rate": 7.706666666666667e-07, "loss": 0.0054, "reward": 0.6586074829101562, "reward_mean": 0.6586074829101562, "reward_std": 0.10110722482204437, "rewards/a_meteor_reward": 0.6586074829101562, "step": 688 }, { "advantages": -1.448206603527069e-07, "completion_length": 83.375, "epoch": 0.22966666666666666, "grad_norm": 5.785263538360596, "kl": 0.2109375, "learning_rate": 7.703333333333333e-07, "loss": 0.0084, "reward": 0.41979312896728516, "reward_mean": 0.41979312896728516, "reward_std": 0.059389758855104446, "rewards/v_meteor_reward": 0.41979312896728516, "step": 689 }, { "advantages": -1.1175870895385742e-07, "completion_length": 81.5, "epoch": 0.23, "grad_norm": 6.483829975128174, "kl": 0.212890625, "learning_rate": 7.699999999999999e-07, "loss": 0.0085, "reward": 0.4788403809070587, "reward_mean": 0.4788403809070587, "reward_std": 0.07527898252010345, "rewards/v_meteor_reward": 0.4788403809070587, "step": 690 }, { "advantages": 5.587935447692871e-08, "completion_length": 78.5, "epoch": 0.23033333333333333, "grad_norm": 5.118138313293457, "kl": 0.1435546875, "learning_rate": 7.696666666666667e-07, "loss": 0.0057, "reward": 0.3647388517856598, "reward_mean": 0.3647388517856598, "reward_std": 0.07490149885416031, "rewards/v_meteor_reward": 0.3647388517856598, "step": 691 }, { "advantages": -1.862645149230957e-08, "completion_length": 79.0, "epoch": 0.23066666666666666, "grad_norm": 5.121640682220459, "kl": 0.2490234375, "learning_rate": 7.693333333333333e-07, "loss": 0.01, "reward": 0.485495924949646, "reward_mean": 0.485495924949646, "reward_std": 0.09712748229503632, "rewards/v_meteor_reward": 0.485495924949646, "step": 692 }, { "advantages": 6.426125764846802e-07, "completion_length": 187.3125, "epoch": 0.231, "grad_norm": 4.281647682189941, "kl": 0.1640625, "learning_rate": 7.69e-07, "loss": 0.0066, "reward": 0.5629845857620239, "reward_mean": 0.5629845857620239, "reward_std": 0.052302286028862, "rewards/a_meteor_reward": 0.5629845857620239, "step": 693 }, { "advantages": -3.725290298461914e-08, "completion_length": 122.3125, "epoch": 0.23133333333333334, "grad_norm": 4.558494567871094, "kl": 0.1533203125, "learning_rate": 7.686666666666666e-07, "loss": 0.0061, "reward": 0.4032898247241974, "reward_mean": 0.4032898247241974, "reward_std": 0.06440785527229309, "rewards/v_meteor_reward": 0.4032898247241974, "step": 694 }, { "advantages": 4.414469003677368e-06, "completion_length": 16.0, "epoch": 0.23166666666666666, "grad_norm": 13.495662689208984, "kl": 0.33203125, "learning_rate": 7.683333333333333e-07, "loss": 0.0133, "reward": 1.7320489883422852, "reward_mean": 1.7320489883422852, "reward_std": 0.03597994148731232, "rewards/iou_timestamp_reward": 0.7320491075515747, "rewards/t_format_reward": 1.0, "step": 695 }, { "advantages": 8.568167686462402e-08, "completion_length": 88.375, "epoch": 0.232, "grad_norm": 5.224710464477539, "kl": 0.197265625, "learning_rate": 7.68e-07, "loss": 0.0079, "reward": 0.436146080493927, "reward_mean": 0.436146080493927, "reward_std": 0.07538889348506927, "rewards/v_meteor_reward": 0.436146080493927, "step": 696 }, { "advantages": 9.760260581970215e-07, "completion_length": 130.0, "epoch": 0.23233333333333334, "grad_norm": 5.081066608428955, "kl": 0.41796875, "learning_rate": 7.676666666666667e-07, "loss": 0.0167, "reward": 0.7911392450332642, "reward_mean": 0.7911392450332642, "reward_std": 0.046326085925102234, "rewards/a_meteor_reward": 0.7911392450332642, "step": 697 }, { "advantages": 1.434236764907837e-07, "completion_length": 16.1875, "epoch": 0.23266666666666666, "grad_norm": 35.764984130859375, "kl": 0.2421875, "learning_rate": 7.673333333333332e-07, "loss": 0.0097, "reward": 1.552461862564087, "reward_mean": 1.552461862564087, "reward_std": 0.11961062252521515, "rewards/iou_timestamp_reward": 0.5524618625640869, "rewards/t_format_reward": 1.0, "step": 698 }, { "advantages": -3.166496753692627e-07, "completion_length": 15.75, "epoch": 0.233, "grad_norm": 12.926369667053223, "kl": 0.1875, "learning_rate": 7.67e-07, "loss": 0.0075, "reward": 1.7443546056747437, "reward_mean": 1.7443546056747437, "reward_std": 0.05547779053449631, "rewards/iou_timestamp_reward": 0.7443546056747437, "rewards/t_format_reward": 1.0, "step": 699 }, { "advantages": 6.05359673500061e-08, "completion_length": 81.8125, "epoch": 0.23333333333333334, "grad_norm": 5.469788074493408, "kl": 0.2158203125, "learning_rate": 7.666666666666667e-07, "loss": 0.0086, "reward": 0.422529399394989, "reward_mean": 0.422529399394989, "reward_std": 0.08745817840099335, "rewards/v_meteor_reward": 0.422529399394989, "step": 700 }, { "advantages": -2.384185791015625e-07, "completion_length": 81.9375, "epoch": 0.23366666666666666, "grad_norm": 5.549681186676025, "kl": 0.244140625, "learning_rate": 7.663333333333333e-07, "loss": 0.0098, "reward": 0.38009506464004517, "reward_mean": 0.38009506464004517, "reward_std": 0.07391367852687836, "rewards/v_meteor_reward": 0.38009506464004517, "step": 701 }, { "advantages": 5.587935447692871e-08, "completion_length": 85.375, "epoch": 0.234, "grad_norm": 5.052911758422852, "kl": 0.1826171875, "learning_rate": 7.66e-07, "loss": 0.0073, "reward": 0.34527069330215454, "reward_mean": 0.34527069330215454, "reward_std": 0.06333577632904053, "rewards/v_meteor_reward": 0.34527069330215454, "step": 702 }, { "advantages": 2.246815711259842e-07, "completion_length": 94.375, "epoch": 0.23433333333333334, "grad_norm": 5.457962989807129, "kl": 0.16015625, "learning_rate": 7.656666666666667e-07, "loss": 0.0064, "reward": 0.48956388235092163, "reward_mean": 0.48956388235092163, "reward_std": 0.058606699109077454, "rewards/v_meteor_reward": 0.48956388235092163, "step": 703 }, { "advantages": 4.470348358154297e-08, "completion_length": 105.625, "epoch": 0.23466666666666666, "grad_norm": 4.762293815612793, "kl": 0.154296875, "learning_rate": 7.653333333333333e-07, "loss": 0.0061, "reward": 0.3508829176425934, "reward_mean": 0.3508829176425934, "reward_std": 0.07592898607254028, "rewards/v_meteor_reward": 0.3508829176425934, "step": 704 }, { "advantages": -5.420297384262085e-07, "completion_length": 58.375, "epoch": 0.235, "grad_norm": 3.165025234222412, "kl": 0.263671875, "learning_rate": 7.65e-07, "loss": 0.0105, "reward": 0.5586701035499573, "reward_mean": 0.5586701035499573, "reward_std": 0.045389093458652496, "rewards/a_meteor_reward": 0.5586701035499573, "step": 705 }, { "advantages": -2.430751919746399e-07, "completion_length": 104.25, "epoch": 0.23533333333333334, "grad_norm": 3.9104440212249756, "kl": 0.1669921875, "learning_rate": 7.646666666666667e-07, "loss": 0.0067, "reward": 0.6188258528709412, "reward_mean": 0.6188258528709412, "reward_std": 0.03144820034503937, "rewards/a_meteor_reward": 0.6188258528709412, "step": 706 }, { "advantages": 2.868473529815674e-07, "completion_length": 90.4375, "epoch": 0.23566666666666666, "grad_norm": 4.546730041503906, "kl": 0.310546875, "learning_rate": 7.643333333333332e-07, "loss": 0.0124, "reward": 0.7672648429870605, "reward_mean": 0.7672648429870605, "reward_std": 0.03559780865907669, "rewards/a_meteor_reward": 0.7672648429870605, "step": 707 }, { "advantages": 3.390014171600342e-07, "completion_length": 95.5, "epoch": 0.236, "grad_norm": 5.2616376876831055, "kl": 0.2158203125, "learning_rate": 7.64e-07, "loss": 0.0086, "reward": 0.40950366854667664, "reward_mean": 0.40950366854667664, "reward_std": 0.05555078387260437, "rewards/v_meteor_reward": 0.40950366854667664, "step": 708 }, { "advantages": 5.606561899185181e-07, "completion_length": 191.6875, "epoch": 0.23633333333333334, "grad_norm": 2.332364320755005, "kl": 0.1181640625, "learning_rate": 7.636666666666667e-07, "loss": 0.0047, "reward": 0.43988361954689026, "reward_mean": 0.43988361954689026, "reward_std": 0.04083241522312164, "rewards/a_meteor_reward": 0.43988361954689026, "step": 709 }, { "advantages": -3.725290298461914e-08, "completion_length": 75.25, "epoch": 0.23666666666666666, "grad_norm": 5.54289436340332, "kl": 0.1787109375, "learning_rate": 7.633333333333333e-07, "loss": 0.0071, "reward": 0.3591221570968628, "reward_mean": 0.3591221570968628, "reward_std": 0.0767032578587532, "rewards/v_meteor_reward": 0.3591221570968628, "step": 710 }, { "advantages": -8.568167686462402e-08, "completion_length": 103.125, "epoch": 0.237, "grad_norm": 3.322636127471924, "kl": 0.185546875, "learning_rate": 7.629999999999999e-07, "loss": 0.0074, "reward": 0.8175325989723206, "reward_mean": 0.8175325989723206, "reward_std": 0.056017715483903885, "rewards/a_meteor_reward": 0.8175325989723206, "step": 711 }, { "advantages": -5.21540641784668e-08, "completion_length": 79.25, "epoch": 0.23733333333333334, "grad_norm": 6.295307636260986, "kl": 0.169921875, "learning_rate": 7.626666666666667e-07, "loss": 0.0068, "reward": 0.25754427909851074, "reward_mean": 0.25754427909851074, "reward_std": 0.04309212416410446, "rewards/v_meteor_reward": 0.25754427909851074, "step": 712 }, { "advantages": 1.9371509552001953e-07, "completion_length": 79.9375, "epoch": 0.23766666666666666, "grad_norm": 4.77794885635376, "kl": 0.177734375, "learning_rate": 7.623333333333333e-07, "loss": 0.0071, "reward": 0.4346046447753906, "reward_mean": 0.4346046447753906, "reward_std": 0.045248351991176605, "rewards/v_meteor_reward": 0.4346046447753906, "step": 713 }, { "advantages": 1.4901161193847656e-07, "completion_length": 97.1875, "epoch": 0.238, "grad_norm": 4.946713924407959, "kl": 0.193359375, "learning_rate": 7.62e-07, "loss": 0.0077, "reward": 0.3729186952114105, "reward_mean": 0.3729186952114105, "reward_std": 0.0616484060883522, "rewards/v_meteor_reward": 0.3729186952114105, "step": 714 }, { "advantages": 5.558133125305176e-06, "completion_length": 16.25, "epoch": 0.23833333333333334, "grad_norm": 11.469893455505371, "kl": 0.265625, "learning_rate": 7.616666666666666e-07, "loss": 0.0106, "reward": 1.6236298084259033, "reward_mean": 1.6236298084259033, "reward_std": 0.038027357310056686, "rewards/iou_timestamp_reward": 0.6236298084259033, "rewards/t_format_reward": 1.0, "step": 715 }, { "advantages": -6.51925802230835e-08, "completion_length": 79.75, "epoch": 0.23866666666666667, "grad_norm": 5.71196985244751, "kl": 0.17578125, "learning_rate": 7.613333333333333e-07, "loss": 0.007, "reward": 0.4275064468383789, "reward_mean": 0.4275064468383789, "reward_std": 0.06925172358751297, "rewards/v_meteor_reward": 0.4275064468383789, "step": 716 }, { "advantages": -7.133930921554565e-07, "completion_length": 15.75, "epoch": 0.239, "grad_norm": 13.687564849853516, "kl": 0.35546875, "learning_rate": 7.61e-07, "loss": 0.0142, "reward": 1.8183561563491821, "reward_mean": 1.8183561563491821, "reward_std": 0.03768281266093254, "rewards/iou_timestamp_reward": 0.8183562159538269, "rewards/t_format_reward": 1.0, "step": 717 }, { "advantages": -1.5012919902801514e-06, "completion_length": 82.625, "epoch": 0.23933333333333334, "grad_norm": 3.368729829788208, "kl": 0.20703125, "learning_rate": 7.606666666666667e-07, "loss": 0.0083, "reward": 0.8587035536766052, "reward_mean": 0.8587035536766052, "reward_std": 0.02604309841990471, "rewards/a_meteor_reward": 0.8587035536766052, "step": 718 }, { "advantages": 3.241002559661865e-07, "completion_length": 89.0625, "epoch": 0.23966666666666667, "grad_norm": 5.038793563842773, "kl": 0.234375, "learning_rate": 7.603333333333332e-07, "loss": 0.0094, "reward": 0.3854299783706665, "reward_mean": 0.3854299783706665, "reward_std": 0.05692180246114731, "rewards/v_meteor_reward": 0.3854299783706665, "step": 719 }, { "advantages": -4.158820956945419e-06, "completion_length": 16.0, "epoch": 0.24, "grad_norm": 7.264834403991699, "kl": 0.33984375, "learning_rate": 7.599999999999999e-07, "loss": 0.0136, "reward": 1.9236184358596802, "reward_mean": 1.9236184358596802, "reward_std": 0.01124115101993084, "rewards/iou_timestamp_reward": 0.9236183166503906, "rewards/t_format_reward": 1.0, "step": 720 }, { "advantages": 1.0319054126739502e-06, "completion_length": 15.25, "epoch": 0.24033333333333334, "grad_norm": 11.323607444763184, "kl": 0.328125, "learning_rate": 7.596666666666667e-07, "loss": 0.0132, "reward": 1.9133801460266113, "reward_mean": 1.9133801460266113, "reward_std": 0.03327372670173645, "rewards/iou_timestamp_reward": 0.9133802056312561, "rewards/t_format_reward": 1.0, "step": 721 }, { "advantages": 2.738088369369507e-07, "completion_length": 106.8125, "epoch": 0.24066666666666667, "grad_norm": 4.855013847351074, "kl": 0.1728515625, "learning_rate": 7.593333333333333e-07, "loss": 0.0069, "reward": 0.459386944770813, "reward_mean": 0.459386944770813, "reward_std": 0.06587585806846619, "rewards/v_meteor_reward": 0.459386944770813, "step": 722 }, { "advantages": -5.932524800300598e-07, "completion_length": 16.0, "epoch": 0.241, "grad_norm": 8.450592994689941, "kl": 0.2373046875, "learning_rate": 7.59e-07, "loss": 0.0095, "reward": 1.7402660846710205, "reward_mean": 1.7402660846710205, "reward_std": 0.02357116900384426, "rewards/iou_timestamp_reward": 0.7402660846710205, "rewards/t_format_reward": 1.0, "step": 723 }, { "advantages": -6.146728992462158e-07, "completion_length": 57.1875, "epoch": 0.24133333333333334, "grad_norm": 6.561420917510986, "kl": 0.498046875, "learning_rate": 7.586666666666666e-07, "loss": 0.0199, "reward": 0.478323757648468, "reward_mean": 0.478323757648468, "reward_std": 0.03915631026029587, "rewards/a_meteor_reward": 0.478323757648468, "step": 724 }, { "advantages": -6.51925802230835e-08, "completion_length": 150.125, "epoch": 0.24166666666666667, "grad_norm": 2.750239372253418, "kl": 0.140625, "learning_rate": 7.583333333333333e-07, "loss": 0.0056, "reward": 0.7775278091430664, "reward_mean": 0.7775278091430664, "reward_std": 0.0888567864894867, "rewards/a_meteor_reward": 0.7775278091430664, "step": 725 }, { "advantages": 4.0978193283081055e-07, "completion_length": 250.8125, "epoch": 0.242, "grad_norm": 2.532073974609375, "kl": 0.10546875, "learning_rate": 7.58e-07, "loss": 0.0042, "reward": 0.46961161494255066, "reward_mean": 0.46961161494255066, "reward_std": 0.05985736846923828, "rewards/a_meteor_reward": 0.46961161494255066, "step": 726 }, { "advantages": -5.9604644775390625e-06, "completion_length": 16.3125, "epoch": 0.24233333333333335, "grad_norm": 13.571438789367676, "kl": 0.26171875, "learning_rate": 7.576666666666667e-07, "loss": 0.0105, "reward": 1.5525057315826416, "reward_mean": 1.5525057315826416, "reward_std": 0.056208811700344086, "rewards/iou_timestamp_reward": 0.5525057315826416, "rewards/t_format_reward": 1.0, "step": 727 }, { "advantages": 3.7439167499542236e-07, "completion_length": 195.1875, "epoch": 0.24266666666666667, "grad_norm": 4.325538635253906, "kl": 0.228515625, "learning_rate": 7.573333333333332e-07, "loss": 0.0091, "reward": 0.6156072020530701, "reward_mean": 0.6156072020530701, "reward_std": 0.06653063744306564, "rewards/a_meteor_reward": 0.6156072020530701, "step": 728 }, { "advantages": 1.0654330253601074e-06, "completion_length": 46.5625, "epoch": 0.243, "grad_norm": 9.35767650604248, "kl": 0.515625, "learning_rate": 7.57e-07, "loss": 0.0206, "reward": 0.6960612535476685, "reward_mean": 0.6960612535476685, "reward_std": 0.020815635100007057, "rewards/a_meteor_reward": 0.6960612535476685, "step": 729 }, { "advantages": 7.543712854385376e-08, "completion_length": 79.8125, "epoch": 0.24333333333333335, "grad_norm": 5.865809440612793, "kl": 0.193359375, "learning_rate": 7.566666666666667e-07, "loss": 0.0077, "reward": 0.3869869112968445, "reward_mean": 0.3869869112968445, "reward_std": 0.07255639135837555, "rewards/v_meteor_reward": 0.3869869112968445, "step": 730 }, { "advantages": -2.738088369369507e-07, "completion_length": 104.3125, "epoch": 0.24366666666666667, "grad_norm": 5.563388347625732, "kl": 0.18359375, "learning_rate": 7.563333333333333e-07, "loss": 0.0073, "reward": 0.3724134564399719, "reward_mean": 0.3724134564399719, "reward_std": 0.06504839658737183, "rewards/v_meteor_reward": 0.3724134564399719, "step": 731 }, { "advantages": 5.736947059631348e-07, "completion_length": 82.25, "epoch": 0.244, "grad_norm": 5.240825653076172, "kl": 0.15625, "learning_rate": 7.559999999999999e-07, "loss": 0.0063, "reward": 0.401014506816864, "reward_mean": 0.401014506816864, "reward_std": 0.056592442095279694, "rewards/v_meteor_reward": 0.401014506816864, "step": 732 }, { "advantages": 5.327165126800537e-07, "completion_length": 74.75, "epoch": 0.24433333333333335, "grad_norm": 7.404443264007568, "kl": 0.388671875, "learning_rate": 7.556666666666667e-07, "loss": 0.0155, "reward": 0.7000753879547119, "reward_mean": 0.7000753879547119, "reward_std": 0.04774368554353714, "rewards/a_meteor_reward": 0.7000753879547119, "step": 733 }, { "advantages": 2.384185791015625e-07, "completion_length": 164.6875, "epoch": 0.24466666666666667, "grad_norm": 5.3721394538879395, "kl": 0.359375, "learning_rate": 7.553333333333333e-07, "loss": 0.0144, "reward": 0.7130347490310669, "reward_mean": 0.7130347490310669, "reward_std": 0.09074271470308304, "rewards/a_meteor_reward": 0.7130347490310669, "step": 734 }, { "advantages": 5.2638351917266846e-06, "completion_length": 13.75, "epoch": 0.245, "grad_norm": 4.802180767059326, "kl": 0.234375, "learning_rate": 7.55e-07, "loss": 0.0094, "reward": 1.6488630771636963, "reward_mean": 1.6488630771636963, "reward_std": 0.011708758771419525, "rewards/iou_timestamp_reward": 0.6488631367683411, "rewards/t_format_reward": 1.0, "step": 735 }, { "advantages": -5.960464477539063e-08, "completion_length": 84.25, "epoch": 0.24533333333333332, "grad_norm": 6.334385871887207, "kl": 0.2119140625, "learning_rate": 7.546666666666666e-07, "loss": 0.0085, "reward": 0.5029623508453369, "reward_mean": 0.5029623508453369, "reward_std": 0.10716930031776428, "rewards/v_meteor_reward": 0.5029623508453369, "step": 736 }, { "advantages": 1.8457649275660515e-06, "completion_length": 15.5, "epoch": 0.24566666666666667, "grad_norm": 12.247471809387207, "kl": 0.228515625, "learning_rate": 7.543333333333332e-07, "loss": 0.0091, "reward": 1.6148422956466675, "reward_mean": 1.6148422956466675, "reward_std": 0.0245656818151474, "rewards/iou_timestamp_reward": 0.6148422956466675, "rewards/t_format_reward": 1.0, "step": 737 }, { "advantages": -4.0978193283081055e-08, "completion_length": 75.875, "epoch": 0.246, "grad_norm": 4.886992931365967, "kl": 0.1650390625, "learning_rate": 7.54e-07, "loss": 0.0066, "reward": 0.4379813075065613, "reward_mean": 0.4379813075065613, "reward_std": 0.08844590187072754, "rewards/v_meteor_reward": 0.4379813075065613, "step": 738 }, { "advantages": 2.60770320892334e-08, "completion_length": 84.25, "epoch": 0.24633333333333332, "grad_norm": 5.388879299163818, "kl": 0.2353515625, "learning_rate": 7.536666666666667e-07, "loss": 0.0094, "reward": 0.35390737652778625, "reward_mean": 0.35390737652778625, "reward_std": 0.04933437332510948, "rewards/v_meteor_reward": 0.35390737652778625, "step": 739 }, { "advantages": -1.0803341865539551e-07, "completion_length": 79.625, "epoch": 0.24666666666666667, "grad_norm": 5.650897979736328, "kl": 0.16796875, "learning_rate": 7.533333333333332e-07, "loss": 0.0067, "reward": 0.36976292729377747, "reward_mean": 0.36976292729377747, "reward_std": 0.07584340125322342, "rewards/v_meteor_reward": 0.36976292729377747, "step": 740 }, { "advantages": -4.842877388000488e-08, "completion_length": 61.6875, "epoch": 0.247, "grad_norm": 8.255349159240723, "kl": 0.423828125, "learning_rate": 7.529999999999999e-07, "loss": 0.017, "reward": 0.7545152902603149, "reward_mean": 0.7545152902603149, "reward_std": 0.057585205882787704, "rewards/a_meteor_reward": 0.7545152902603149, "step": 741 }, { "advantages": -4.842877388000488e-08, "completion_length": 15.5, "epoch": 0.24733333333333332, "grad_norm": 10.391411781311035, "kl": 0.322265625, "learning_rate": 7.526666666666667e-07, "loss": 0.0129, "reward": 1.8517098426818848, "reward_mean": 1.8517098426818848, "reward_std": 0.04161078482866287, "rewards/iou_timestamp_reward": 0.8517098426818848, "rewards/t_format_reward": 1.0, "step": 742 }, { "advantages": 3.3527612686157227e-08, "completion_length": 69.375, "epoch": 0.24766666666666667, "grad_norm": 6.3726630210876465, "kl": 0.28515625, "learning_rate": 7.523333333333333e-07, "loss": 0.0114, "reward": 0.42829418182373047, "reward_mean": 0.42829418182373047, "reward_std": 0.07716856151819229, "rewards/v_meteor_reward": 0.42829418182373047, "step": 743 }, { "advantages": 1.3783574104309082e-07, "completion_length": 79.9375, "epoch": 0.248, "grad_norm": 5.814966678619385, "kl": 0.2060546875, "learning_rate": 7.52e-07, "loss": 0.0082, "reward": 0.3815266191959381, "reward_mean": 0.3815266191959381, "reward_std": 0.07505191117525101, "rewards/v_meteor_reward": 0.3815266191959381, "step": 744 }, { "advantages": -6.556510925292969e-07, "completion_length": 136.8125, "epoch": 0.24833333333333332, "grad_norm": 7.292759895324707, "kl": 0.39453125, "learning_rate": 7.516666666666666e-07, "loss": 0.0158, "reward": 0.5993728637695312, "reward_mean": 0.5993728637695312, "reward_std": 0.05829927325248718, "rewards/a_meteor_reward": 0.5993728637695312, "step": 745 }, { "advantages": 4.76837158203125e-07, "completion_length": 204.0625, "epoch": 0.24866666666666667, "grad_norm": 4.927826404571533, "kl": 0.40625, "learning_rate": 7.513333333333333e-07, "loss": 0.0162, "reward": 0.7111108303070068, "reward_mean": 0.7111108303070068, "reward_std": 0.0629633292555809, "rewards/a_meteor_reward": 0.7111108303070068, "step": 746 }, { "advantages": -9.96515154838562e-08, "completion_length": 15.5625, "epoch": 0.249, "grad_norm": 15.331281661987305, "kl": 0.25, "learning_rate": 7.51e-07, "loss": 0.01, "reward": 1.5680075883865356, "reward_mean": 1.5680075883865356, "reward_std": 0.0597713440656662, "rewards/iou_timestamp_reward": 0.5680075883865356, "rewards/t_format_reward": 1.0, "step": 747 }, { "advantages": -5.21540641784668e-08, "completion_length": 26.125, "epoch": 0.24933333333333332, "grad_norm": 8.729666709899902, "kl": 0.7421875, "learning_rate": 7.506666666666667e-07, "loss": 0.0296, "reward": 0.5146862864494324, "reward_mean": 0.5146862864494324, "reward_std": 0.10576999187469482, "rewards/a_meteor_reward": 0.5146862864494324, "step": 748 }, { "advantages": 4.0978193283081055e-07, "completion_length": 15.5625, "epoch": 0.24966666666666668, "grad_norm": 12.37338924407959, "kl": 0.453125, "learning_rate": 7.503333333333332e-07, "loss": 0.0181, "reward": 1.8473758697509766, "reward_mean": 1.8473758697509766, "reward_std": 0.1015956923365593, "rewards/iou_timestamp_reward": 0.8473759293556213, "rewards/t_format_reward": 1.0, "step": 749 }, { "advantages": -9.313225746154785e-09, "completion_length": 97.25, "epoch": 0.25, "grad_norm": 4.863839626312256, "kl": 0.189453125, "learning_rate": 7.5e-07, "loss": 0.0076, "reward": 0.44907283782958984, "reward_mean": 0.44907283782958984, "reward_std": 0.0738733559846878, "rewards/v_meteor_reward": 0.44907283782958984, "step": 750 }, { "advantages": -2.1606683731079102e-07, "completion_length": 15.0, "epoch": 0.25033333333333335, "grad_norm": 10.802257537841797, "kl": 0.3984375, "learning_rate": 7.496666666666667e-07, "loss": 0.0159, "reward": 1.9439536333084106, "reward_mean": 1.9439536333084106, "reward_std": 0.024276288226246834, "rewards/iou_timestamp_reward": 0.9439536929130554, "rewards/t_format_reward": 1.0, "step": 751 }, { "advantages": -7.450580596923828e-09, "completion_length": 94.6875, "epoch": 0.25066666666666665, "grad_norm": 4.8800129890441895, "kl": 0.1787109375, "learning_rate": 7.493333333333333e-07, "loss": 0.0072, "reward": 0.3100375235080719, "reward_mean": 0.3100375235080719, "reward_std": 0.06746116280555725, "rewards/v_meteor_reward": 0.3100375235080719, "step": 752 }, { "advantages": 6.146728992462158e-08, "completion_length": 174.6875, "epoch": 0.251, "grad_norm": 4.497082710266113, "kl": 0.158203125, "learning_rate": 7.489999999999999e-07, "loss": 0.0063, "reward": 0.6387547254562378, "reward_mean": 0.6387547254562378, "reward_std": 0.1082429587841034, "rewards/a_meteor_reward": 0.6387547254562378, "step": 753 }, { "advantages": 1.4528632164001465e-07, "completion_length": 108.0, "epoch": 0.25133333333333335, "grad_norm": 4.4846510887146, "kl": 0.193359375, "learning_rate": 7.486666666666666e-07, "loss": 0.0077, "reward": 0.4983174800872803, "reward_mean": 0.4983174800872803, "reward_std": 0.08030781149864197, "rewards/v_meteor_reward": 0.4983174800872803, "step": 754 }, { "advantages": -7.487833499908447e-07, "completion_length": 15.25, "epoch": 0.25166666666666665, "grad_norm": 13.961764335632324, "kl": 0.216796875, "learning_rate": 7.483333333333333e-07, "loss": 0.0087, "reward": 1.7587274312973022, "reward_mean": 1.7587274312973022, "reward_std": 0.07732057571411133, "rewards/iou_timestamp_reward": 0.7587273716926575, "rewards/t_format_reward": 1.0, "step": 755 }, { "advantages": -5.21540641784668e-08, "completion_length": 56.1875, "epoch": 0.252, "grad_norm": 6.429347515106201, "kl": 0.220703125, "learning_rate": 7.48e-07, "loss": 0.0088, "reward": 0.34319955110549927, "reward_mean": 0.34319955110549927, "reward_std": 0.06458065658807755, "rewards/v_meteor_reward": 0.34319955110549927, "step": 756 }, { "advantages": 8.344650268554688e-07, "completion_length": 16.75, "epoch": 0.25233333333333335, "grad_norm": 8.60831356048584, "kl": 0.341796875, "learning_rate": 7.476666666666667e-07, "loss": 0.0137, "reward": 1.8414982557296753, "reward_mean": 1.8414982557296753, "reward_std": 0.02486182190477848, "rewards/iou_timestamp_reward": 0.8414982557296753, "rewards/t_format_reward": 1.0, "step": 757 }, { "advantages": -5.21540641784668e-08, "completion_length": 99.125, "epoch": 0.25266666666666665, "grad_norm": 7.58743953704834, "kl": 0.4765625, "learning_rate": 7.473333333333332e-07, "loss": 0.0191, "reward": 0.6127432584762573, "reward_mean": 0.6127432584762573, "reward_std": 0.0656830370426178, "rewards/a_meteor_reward": 0.6127432584762573, "step": 758 }, { "advantages": 2.644956111907959e-07, "completion_length": 199.0, "epoch": 0.253, "grad_norm": 3.874263048171997, "kl": 0.1875, "learning_rate": 7.47e-07, "loss": 0.0075, "reward": 0.5316753387451172, "reward_mean": 0.5316753387451172, "reward_std": 0.05123528093099594, "rewards/a_meteor_reward": 0.5316753387451172, "step": 759 }, { "advantages": -2.1606683731079102e-07, "completion_length": 79.9375, "epoch": 0.25333333333333335, "grad_norm": 5.19149112701416, "kl": 0.193359375, "learning_rate": 7.466666666666667e-07, "loss": 0.0077, "reward": 0.4552949070930481, "reward_mean": 0.4552949070930481, "reward_std": 0.09045971930027008, "rewards/v_meteor_reward": 0.4552949070930481, "step": 760 }, { "advantages": -1.5273690223693848e-07, "completion_length": 75.8125, "epoch": 0.25366666666666665, "grad_norm": 6.142044544219971, "kl": 0.177734375, "learning_rate": 7.463333333333333e-07, "loss": 0.0071, "reward": 0.42949360609054565, "reward_mean": 0.42949360609054565, "reward_std": 0.05893012136220932, "rewards/v_meteor_reward": 0.42949360609054565, "step": 761 }, { "advantages": 1.4156103134155273e-07, "completion_length": 61.375, "epoch": 0.254, "grad_norm": 6.563636302947998, "kl": 0.36328125, "learning_rate": 7.459999999999999e-07, "loss": 0.0145, "reward": 0.6210685968399048, "reward_mean": 0.6210685968399048, "reward_std": 0.20298351347446442, "rewards/a_meteor_reward": 0.6210685968399048, "step": 762 }, { "advantages": -1.1920928955078125e-07, "completion_length": 154.1875, "epoch": 0.25433333333333336, "grad_norm": 3.004624605178833, "kl": 0.1044921875, "learning_rate": 7.456666666666667e-07, "loss": 0.0042, "reward": 0.506687343120575, "reward_mean": 0.506687343120575, "reward_std": 0.14739292860031128, "rewards/a_meteor_reward": 0.506687343120575, "step": 763 }, { "advantages": 5.960464477539063e-08, "completion_length": 49.9375, "epoch": 0.25466666666666665, "grad_norm": 6.490357398986816, "kl": 0.453125, "learning_rate": 7.453333333333333e-07, "loss": 0.0181, "reward": 0.7694869041442871, "reward_mean": 0.7694869041442871, "reward_std": 0.05297207459807396, "rewards/a_meteor_reward": 0.7694869041442871, "step": 764 }, { "advantages": -8.605420589447021e-07, "completion_length": 15.5, "epoch": 0.255, "grad_norm": 13.032042503356934, "kl": 0.326171875, "learning_rate": 7.45e-07, "loss": 0.0131, "reward": 1.881135106086731, "reward_mean": 1.881135106086731, "reward_std": 0.040324870496988297, "rewards/iou_timestamp_reward": 0.881135106086731, "rewards/t_format_reward": 1.0, "step": 765 }, { "advantages": 1.2665987014770508e-07, "completion_length": 15.75, "epoch": 0.25533333333333336, "grad_norm": 17.713443756103516, "kl": 0.2158203125, "learning_rate": 7.446666666666666e-07, "loss": 0.0086, "reward": 1.610698938369751, "reward_mean": 1.610698938369751, "reward_std": 0.06552334874868393, "rewards/iou_timestamp_reward": 0.6106989979743958, "rewards/t_format_reward": 1.0, "step": 766 }, { "advantages": -8.940696716308594e-07, "completion_length": 15.6875, "epoch": 0.25566666666666665, "grad_norm": 42.33953094482422, "kl": 0.220703125, "learning_rate": 7.443333333333332e-07, "loss": 0.0088, "reward": 1.6316124200820923, "reward_mean": 1.6316124200820923, "reward_std": 0.10288399457931519, "rewards/iou_timestamp_reward": 0.6316123604774475, "rewards/t_format_reward": 1.0, "step": 767 }, { "advantages": 5.21540641784668e-08, "completion_length": 57.25, "epoch": 0.256, "grad_norm": 6.354194164276123, "kl": 0.193359375, "learning_rate": 7.44e-07, "loss": 0.0077, "reward": 0.4448409676551819, "reward_mean": 0.4448409676551819, "reward_std": 0.0629967525601387, "rewards/v_meteor_reward": 0.4448409676551819, "step": 768 }, { "advantages": -1.6985461115837097e-05, "completion_length": 15.6875, "epoch": 0.25633333333333336, "grad_norm": 11.826237678527832, "kl": 0.37109375, "learning_rate": 7.436666666666667e-07, "loss": 0.0149, "reward": 1.8090541362762451, "reward_mean": 1.8090541362762451, "reward_std": 0.03643172234296799, "rewards/iou_timestamp_reward": 0.8090541362762451, "rewards/t_format_reward": 1.0, "step": 769 }, { "advantages": -3.948807716369629e-07, "completion_length": 105.1875, "epoch": 0.25666666666666665, "grad_norm": 3.568575143814087, "kl": 0.189453125, "learning_rate": 7.433333333333332e-07, "loss": 0.0076, "reward": 0.6654664278030396, "reward_mean": 0.6654664278030396, "reward_std": 0.06396697461605072, "rewards/a_meteor_reward": 0.6654664278030396, "step": 770 }, { "advantages": 1.043081283569336e-07, "completion_length": 129.4375, "epoch": 0.257, "grad_norm": 7.302309513092041, "kl": 0.275390625, "learning_rate": 7.429999999999999e-07, "loss": 0.011, "reward": 0.366179883480072, "reward_mean": 0.366179883480072, "reward_std": 0.06797074526548386, "rewards/a_meteor_reward": 0.366179883480072, "step": 771 }, { "advantages": 0.0, "completion_length": 64.75, "epoch": 0.25733333333333336, "grad_norm": 6.136823654174805, "kl": 0.240234375, "learning_rate": 7.426666666666667e-07, "loss": 0.0096, "reward": 0.3762352168560028, "reward_mean": 0.3762352168560028, "reward_std": 0.09835556149482727, "rewards/v_meteor_reward": 0.3762352168560028, "step": 772 }, { "advantages": -6.631016731262207e-07, "completion_length": 15.0, "epoch": 0.25766666666666665, "grad_norm": 11.569782257080078, "kl": 0.25, "learning_rate": 7.423333333333333e-07, "loss": 0.01, "reward": 1.4100263118743896, "reward_mean": 1.4100263118743896, "reward_std": 0.04642147198319435, "rewards/iou_timestamp_reward": 0.4100264310836792, "rewards/t_format_reward": 1.0, "step": 773 }, { "advantages": 2.4586915969848633e-07, "completion_length": 78.3125, "epoch": 0.258, "grad_norm": 5.45579719543457, "kl": 0.1865234375, "learning_rate": 7.42e-07, "loss": 0.0075, "reward": 0.42369937896728516, "reward_mean": 0.42369937896728516, "reward_std": 0.07945692539215088, "rewards/v_meteor_reward": 0.42369937896728516, "step": 774 }, { "advantages": -1.1548399925231934e-07, "completion_length": 65.0625, "epoch": 0.25833333333333336, "grad_norm": 5.993736267089844, "kl": 0.294921875, "learning_rate": 7.416666666666666e-07, "loss": 0.0118, "reward": 0.392180860042572, "reward_mean": 0.392180860042572, "reward_std": 0.0840594619512558, "rewards/v_meteor_reward": 0.392180860042572, "step": 775 }, { "advantages": 6.370246410369873e-06, "completion_length": 15.25, "epoch": 0.25866666666666666, "grad_norm": 17.491575241088867, "kl": 0.3203125, "learning_rate": 7.413333333333333e-07, "loss": 0.0128, "reward": 1.94920015335083, "reward_mean": 1.94920015335083, "reward_std": 0.01640503667294979, "rewards/iou_timestamp_reward": 0.9492002129554749, "rewards/t_format_reward": 1.0, "step": 776 }, { "advantages": -6.705522537231445e-08, "completion_length": 67.875, "epoch": 0.259, "grad_norm": 8.797382354736328, "kl": 0.42578125, "learning_rate": 7.41e-07, "loss": 0.0171, "reward": 0.6679432988166809, "reward_mean": 0.6679432988166809, "reward_std": 0.09283535927534103, "rewards/a_meteor_reward": 0.6679432988166809, "step": 777 }, { "advantages": 2.8870999813079834e-08, "completion_length": 70.1875, "epoch": 0.25933333333333336, "grad_norm": 5.822179317474365, "kl": 0.169921875, "learning_rate": 7.406666666666667e-07, "loss": 0.0068, "reward": 0.43042558431625366, "reward_mean": 0.43042558431625366, "reward_std": 0.09165318310260773, "rewards/v_meteor_reward": 0.43042558431625366, "step": 778 }, { "advantages": 1.1175870895385742e-08, "completion_length": 89.875, "epoch": 0.25966666666666666, "grad_norm": 5.675052165985107, "kl": 0.205078125, "learning_rate": 7.403333333333332e-07, "loss": 0.0082, "reward": 0.3955617845058441, "reward_mean": 0.3955617845058441, "reward_std": 0.09827519208192825, "rewards/v_meteor_reward": 0.3955617845058441, "step": 779 }, { "advantages": -4.6566128730773926e-08, "completion_length": 87.6875, "epoch": 0.26, "grad_norm": 6.3987298011779785, "kl": 0.25, "learning_rate": 7.4e-07, "loss": 0.01, "reward": 0.3179548382759094, "reward_mean": 0.3179548382759094, "reward_std": 0.046754948794841766, "rewards/v_meteor_reward": 0.3179548382759094, "step": 780 }, { "advantages": 1.4705583453178406e-06, "completion_length": 15.75, "epoch": 0.26033333333333336, "grad_norm": 15.507343292236328, "kl": 0.248046875, "learning_rate": 7.396666666666667e-07, "loss": 0.01, "reward": 1.7246534824371338, "reward_mean": 1.7246534824371338, "reward_std": 0.030871905386447906, "rewards/iou_timestamp_reward": 0.7246534824371338, "rewards/t_format_reward": 1.0, "step": 781 }, { "advantages": 6.543472409248352e-06, "completion_length": 16.25, "epoch": 0.26066666666666666, "grad_norm": 8.308622360229492, "kl": 0.318359375, "learning_rate": 7.393333333333333e-07, "loss": 0.0127, "reward": 1.8934935331344604, "reward_mean": 1.8934935331344604, "reward_std": 0.02731732651591301, "rewards/iou_timestamp_reward": 0.8934935331344604, "rewards/t_format_reward": 1.0, "step": 782 }, { "advantages": 7.636845111846924e-07, "completion_length": 15.5, "epoch": 0.261, "grad_norm": 12.311226844787598, "kl": 0.228515625, "learning_rate": 7.389999999999999e-07, "loss": 0.0092, "reward": 1.6806578636169434, "reward_mean": 1.6806578636169434, "reward_std": 0.029374241828918457, "rewards/iou_timestamp_reward": 0.6806578040122986, "rewards/t_format_reward": 1.0, "step": 783 }, { "advantages": -1.214444637298584e-06, "completion_length": 15.5, "epoch": 0.2613333333333333, "grad_norm": 14.768660545349121, "kl": 0.265625, "learning_rate": 7.386666666666666e-07, "loss": 0.0106, "reward": 1.8155958652496338, "reward_mean": 1.8155958652496338, "reward_std": 0.050196342170238495, "rewards/iou_timestamp_reward": 0.8155958652496338, "rewards/t_format_reward": 1.0, "step": 784 }, { "advantages": 2.4586915969848633e-07, "completion_length": 75.375, "epoch": 0.26166666666666666, "grad_norm": 5.412941932678223, "kl": 0.1572265625, "learning_rate": 7.383333333333333e-07, "loss": 0.0063, "reward": 0.5236473083496094, "reward_mean": 0.5236473083496094, "reward_std": 0.08079646527767181, "rewards/v_meteor_reward": 0.5236473083496094, "step": 785 }, { "advantages": 8.530914783477783e-07, "completion_length": 14.5, "epoch": 0.262, "grad_norm": 7.134978771209717, "kl": 0.181640625, "learning_rate": 7.38e-07, "loss": 0.0072, "reward": 1.5869553089141846, "reward_mean": 1.5869553089141846, "reward_std": 0.021573757752776146, "rewards/iou_timestamp_reward": 0.5869553089141846, "rewards/t_format_reward": 1.0, "step": 786 }, { "advantages": -3.501772880554199e-07, "completion_length": 61.625, "epoch": 0.2623333333333333, "grad_norm": 6.055797100067139, "kl": 0.287109375, "learning_rate": 7.376666666666666e-07, "loss": 0.0115, "reward": 0.3792189657688141, "reward_mean": 0.3792189657688141, "reward_std": 0.04144527390599251, "rewards/v_meteor_reward": 0.3792189657688141, "step": 787 }, { "advantages": 4.842877388000488e-08, "completion_length": 71.0625, "epoch": 0.26266666666666666, "grad_norm": 5.766775131225586, "kl": 0.18359375, "learning_rate": 7.373333333333332e-07, "loss": 0.0074, "reward": 0.44995003938674927, "reward_mean": 0.44995003938674927, "reward_std": 0.09387801587581635, "rewards/v_meteor_reward": 0.44995003938674927, "step": 788 }, { "advantages": 8.009374141693115e-08, "completion_length": 54.3125, "epoch": 0.263, "grad_norm": 6.2840728759765625, "kl": 0.35546875, "learning_rate": 7.37e-07, "loss": 0.0142, "reward": 0.43437308073043823, "reward_mean": 0.43437308073043823, "reward_std": 0.09028761088848114, "rewards/v_meteor_reward": 0.43437308073043823, "step": 789 }, { "advantages": 3.91155481338501e-07, "completion_length": 126.375, "epoch": 0.2633333333333333, "grad_norm": 2.8109912872314453, "kl": 0.1240234375, "learning_rate": 7.366666666666667e-07, "loss": 0.005, "reward": 0.5349380373954773, "reward_mean": 0.5349380373954773, "reward_std": 0.07756427675485611, "rewards/a_meteor_reward": 0.5349380373954773, "step": 790 }, { "advantages": -1.0710209608078003e-06, "completion_length": 15.25, "epoch": 0.26366666666666666, "grad_norm": 9.173556327819824, "kl": 0.216796875, "learning_rate": 7.363333333333332e-07, "loss": 0.0087, "reward": 1.912003517150879, "reward_mean": 1.912003517150879, "reward_std": 0.022475706413388252, "rewards/iou_timestamp_reward": 0.9120034575462341, "rewards/t_format_reward": 1.0, "step": 791 }, { "advantages": -4.6566128730773926e-08, "completion_length": 165.125, "epoch": 0.264, "grad_norm": 6.2492876052856445, "kl": 0.2197265625, "learning_rate": 7.359999999999999e-07, "loss": 0.0088, "reward": 0.44131624698638916, "reward_mean": 0.44131624698638916, "reward_std": 0.16380628943443298, "rewards/a_meteor_reward": 0.44131624698638916, "step": 792 }, { "advantages": 4.3213367462158203e-07, "completion_length": 83.375, "epoch": 0.2643333333333333, "grad_norm": 5.76492977142334, "kl": 0.234375, "learning_rate": 7.356666666666667e-07, "loss": 0.0094, "reward": 0.43902909755706787, "reward_mean": 0.43902909755706787, "reward_std": 0.0596756637096405, "rewards/v_meteor_reward": 0.43902909755706787, "step": 793 }, { "advantages": -1.2032687664031982e-06, "completion_length": 16.0, "epoch": 0.26466666666666666, "grad_norm": 8.520907402038574, "kl": 0.2021484375, "learning_rate": 7.353333333333333e-07, "loss": 0.0081, "reward": 1.5401557683944702, "reward_mean": 1.5401557683944702, "reward_std": 0.0365874283015728, "rewards/iou_timestamp_reward": 0.5401557087898254, "rewards/t_format_reward": 1.0, "step": 794 }, { "advantages": 4.0978193283081055e-08, "completion_length": 64.9375, "epoch": 0.265, "grad_norm": 6.065752983093262, "kl": 0.1708984375, "learning_rate": 7.35e-07, "loss": 0.0069, "reward": 0.25223401188850403, "reward_mean": 0.25223401188850403, "reward_std": 0.07108373939990997, "rewards/v_meteor_reward": 0.25223401188850403, "step": 795 }, { "advantages": -3.203749656677246e-07, "completion_length": 15.5, "epoch": 0.2653333333333333, "grad_norm": 9.295875549316406, "kl": 0.32421875, "learning_rate": 7.346666666666666e-07, "loss": 0.0129, "reward": 1.8943781852722168, "reward_mean": 1.8943781852722168, "reward_std": 0.047265224158763885, "rewards/iou_timestamp_reward": 0.8943781852722168, "rewards/t_format_reward": 1.0, "step": 796 }, { "advantages": 2.3283064365386963e-08, "completion_length": 52.625, "epoch": 0.26566666666666666, "grad_norm": 7.1117262840271, "kl": 0.2431640625, "learning_rate": 7.343333333333332e-07, "loss": 0.0097, "reward": 0.35799646377563477, "reward_mean": 0.35799646377563477, "reward_std": 0.05081082880496979, "rewards/v_meteor_reward": 0.35799646377563477, "step": 797 }, { "advantages": 7.450580596923828e-08, "completion_length": 51.8125, "epoch": 0.266, "grad_norm": 7.168635845184326, "kl": 0.203125, "learning_rate": 7.34e-07, "loss": 0.0081, "reward": 0.2576674222946167, "reward_mean": 0.2576674222946167, "reward_std": 0.07535874843597412, "rewards/v_meteor_reward": 0.2576674222946167, "step": 798 }, { "advantages": -3.1478703022003174e-07, "completion_length": 50.0625, "epoch": 0.2663333333333333, "grad_norm": 4.847860336303711, "kl": 0.265625, "learning_rate": 7.336666666666667e-07, "loss": 0.0106, "reward": 0.529625415802002, "reward_mean": 0.529625415802002, "reward_std": 0.11338076740503311, "rewards/a_meteor_reward": 0.529625415802002, "step": 799 }, { "advantages": -1.0617077350616455e-07, "completion_length": 98.0, "epoch": 0.26666666666666666, "grad_norm": 6.466470718383789, "kl": 0.1787109375, "learning_rate": 7.333333333333332e-07, "loss": 0.0071, "reward": 0.36358046531677246, "reward_mean": 0.36358046531677246, "reward_std": 0.06805761158466339, "rewards/v_meteor_reward": 0.36358046531677246, "step": 800 }, { "advantages": -8.195638656616211e-08, "completion_length": 80.125, "epoch": 0.267, "grad_norm": 4.6386260986328125, "kl": 0.150390625, "learning_rate": 7.329999999999999e-07, "loss": 0.006, "reward": 0.5679866075515747, "reward_mean": 0.5679866075515747, "reward_std": 0.1109338030219078, "rewards/v_meteor_reward": 0.5679866075515747, "step": 801 }, { "advantages": 3.3490359783172607e-06, "completion_length": 15.25, "epoch": 0.2673333333333333, "grad_norm": 21.77832794189453, "kl": 0.2451171875, "learning_rate": 7.326666666666667e-07, "loss": 0.0098, "reward": 1.5632531642913818, "reward_mean": 1.5632531642913818, "reward_std": 0.07430481910705566, "rewards/iou_timestamp_reward": 0.5632531642913818, "rewards/t_format_reward": 1.0, "step": 802 }, { "advantages": -9.94652509689331e-07, "completion_length": 15.5, "epoch": 0.26766666666666666, "grad_norm": 14.40422248840332, "kl": 0.26171875, "learning_rate": 7.323333333333333e-07, "loss": 0.0105, "reward": 1.859336495399475, "reward_mean": 1.859336495399475, "reward_std": 0.04075521230697632, "rewards/iou_timestamp_reward": 0.8593365550041199, "rewards/t_format_reward": 1.0, "step": 803 }, { "advantages": 6.09084963798523e-07, "completion_length": 15.5, "epoch": 0.268, "grad_norm": 15.928074836730957, "kl": 0.2470703125, "learning_rate": 7.319999999999999e-07, "loss": 0.0099, "reward": 1.5145069360733032, "reward_mean": 1.5145069360733032, "reward_std": 0.017616454511880875, "rewards/iou_timestamp_reward": 0.5145069360733032, "rewards/t_format_reward": 1.0, "step": 804 }, { "advantages": -4.470348358154297e-08, "completion_length": 61.125, "epoch": 0.2683333333333333, "grad_norm": 6.615947723388672, "kl": 0.2041015625, "learning_rate": 7.316666666666666e-07, "loss": 0.0082, "reward": 0.39632290601730347, "reward_mean": 0.39632290601730347, "reward_std": 0.1068749949336052, "rewards/v_meteor_reward": 0.39632290601730347, "step": 805 }, { "advantages": -1.3764947652816772e-06, "completion_length": 15.5, "epoch": 0.26866666666666666, "grad_norm": 17.23617935180664, "kl": 0.208984375, "learning_rate": 7.313333333333333e-07, "loss": 0.0084, "reward": 1.7879064083099365, "reward_mean": 1.7879064083099365, "reward_std": 0.09094181656837463, "rewards/iou_timestamp_reward": 0.7879063487052917, "rewards/t_format_reward": 1.0, "step": 806 }, { "advantages": 9.182840585708618e-07, "completion_length": 15.4375, "epoch": 0.269, "grad_norm": 14.0964994430542, "kl": 0.2275390625, "learning_rate": 7.31e-07, "loss": 0.0091, "reward": 1.6644346714019775, "reward_mean": 1.6644346714019775, "reward_std": 0.03255660459399223, "rewards/iou_timestamp_reward": 0.6644346714019775, "rewards/t_format_reward": 1.0, "step": 807 }, { "advantages": -1.8272548913955688e-06, "completion_length": 14.75, "epoch": 0.2693333333333333, "grad_norm": 13.512198448181152, "kl": 0.236328125, "learning_rate": 7.306666666666666e-07, "loss": 0.0094, "reward": 1.7180126905441284, "reward_mean": 1.7180126905441284, "reward_std": 0.057608745992183685, "rewards/iou_timestamp_reward": 0.7180126309394836, "rewards/t_format_reward": 1.0, "step": 808 }, { "advantages": 3.8929283618927e-07, "completion_length": 55.5625, "epoch": 0.26966666666666667, "grad_norm": 14.670210838317871, "kl": 0.359375, "learning_rate": 7.303333333333332e-07, "loss": 0.0143, "reward": 0.6269164085388184, "reward_mean": 0.6269164085388184, "reward_std": 0.09415949136018753, "rewards/a_meteor_reward": 0.6269164085388184, "step": 809 }, { "advantages": -3.1758099794387817e-06, "completion_length": 15.75, "epoch": 0.27, "grad_norm": 18.484989166259766, "kl": 0.2060546875, "learning_rate": 7.3e-07, "loss": 0.0082, "reward": 1.3837168216705322, "reward_mean": 1.3837168216705322, "reward_std": 0.03002813458442688, "rewards/iou_timestamp_reward": 0.38371679186820984, "rewards/t_format_reward": 1.0, "step": 810 }, { "advantages": -8.698552846908569e-07, "completion_length": 16.5, "epoch": 0.2703333333333333, "grad_norm": 17.346132278442383, "kl": 0.21484375, "learning_rate": 7.296666666666667e-07, "loss": 0.0086, "reward": 1.7820876836776733, "reward_mean": 1.7820876836776733, "reward_std": 0.061167195439338684, "rewards/iou_timestamp_reward": 0.7820876836776733, "rewards/t_format_reward": 1.0, "step": 811 }, { "advantages": -4.2282044887542725e-07, "completion_length": 108.0, "epoch": 0.27066666666666667, "grad_norm": 4.32645320892334, "kl": 0.173828125, "learning_rate": 7.293333333333332e-07, "loss": 0.007, "reward": 0.5108822584152222, "reward_mean": 0.5108822584152222, "reward_std": 0.06914345175027847, "rewards/v_meteor_reward": 0.5108822584152222, "step": 812 }, { "advantages": 2.0954757928848267e-08, "completion_length": 61.5, "epoch": 0.271, "grad_norm": 6.47935152053833, "kl": 0.2890625, "learning_rate": 7.289999999999999e-07, "loss": 0.0115, "reward": 0.3912762701511383, "reward_mean": 0.3912762701511383, "reward_std": 0.10087195038795471, "rewards/v_meteor_reward": 0.3912762701511383, "step": 813 }, { "advantages": 1.1175870895385742e-07, "completion_length": 102.125, "epoch": 0.2713333333333333, "grad_norm": 4.200403690338135, "kl": 0.21484375, "learning_rate": 7.286666666666666e-07, "loss": 0.0086, "reward": 0.4296203553676605, "reward_mean": 0.4296203553676605, "reward_std": 0.11635768413543701, "rewards/a_meteor_reward": 0.4296203553676605, "step": 814 }, { "advantages": -1.0803341865539551e-07, "completion_length": 50.4375, "epoch": 0.27166666666666667, "grad_norm": 6.155479907989502, "kl": 0.330078125, "learning_rate": 7.283333333333334e-07, "loss": 0.0132, "reward": 0.29925012588500977, "reward_mean": 0.29925012588500977, "reward_std": 0.08936432003974915, "rewards/v_meteor_reward": 0.29925012588500977, "step": 815 }, { "advantages": 2.980232238769531e-07, "completion_length": 103.75, "epoch": 0.272, "grad_norm": 4.165347099304199, "kl": 0.2451171875, "learning_rate": 7.28e-07, "loss": 0.0098, "reward": 0.6147679090499878, "reward_mean": 0.6147679090499878, "reward_std": 0.06130220741033554, "rewards/a_meteor_reward": 0.6147679090499878, "step": 816 }, { "advantages": 0.0, "completion_length": 16.0, "epoch": 0.2723333333333333, "grad_norm": 2.705883026123047, "kl": 0.248046875, "learning_rate": 7.276666666666666e-07, "loss": 0.0099, "reward": 1.156572937965393, "reward_mean": 1.156572937965393, "reward_std": 0.02949574403464794, "rewards/iou_timestamp_reward": 0.15657296776771545, "rewards/t_format_reward": 1.0, "step": 817 }, { "advantages": 2.868473529815674e-07, "completion_length": 146.5, "epoch": 0.27266666666666667, "grad_norm": 5.63612699508667, "kl": 0.333984375, "learning_rate": 7.273333333333333e-07, "loss": 0.0134, "reward": 0.7343471646308899, "reward_mean": 0.7343471646308899, "reward_std": 0.0944993644952774, "rewards/a_meteor_reward": 0.7343471646308899, "step": 818 }, { "advantages": -1.0803341865539551e-07, "completion_length": 59.5625, "epoch": 0.273, "grad_norm": 5.299894332885742, "kl": 0.15625, "learning_rate": 7.27e-07, "loss": 0.0063, "reward": 0.3772536814212799, "reward_mean": 0.3772536814212799, "reward_std": 0.07457905262708664, "rewards/v_meteor_reward": 0.3772536814212799, "step": 819 }, { "advantages": 1.7136335372924805e-07, "completion_length": 15.0625, "epoch": 0.2733333333333333, "grad_norm": 27.541488647460938, "kl": 0.2451171875, "learning_rate": 7.266666666666667e-07, "loss": 0.0098, "reward": 1.7381243705749512, "reward_mean": 1.7381243705749512, "reward_std": 0.07543259859085083, "rewards/iou_timestamp_reward": 0.7381243109703064, "rewards/t_format_reward": 1.0, "step": 820 }, { "advantages": 1.825392246246338e-07, "completion_length": 80.8125, "epoch": 0.27366666666666667, "grad_norm": 5.811254978179932, "kl": 0.2216796875, "learning_rate": 7.263333333333333e-07, "loss": 0.0088, "reward": 0.32336491346359253, "reward_mean": 0.32336491346359253, "reward_std": 0.07711879909038544, "rewards/v_meteor_reward": 0.32336491346359253, "step": 821 }, { "advantages": 4.842877388000488e-08, "completion_length": 64.0, "epoch": 0.274, "grad_norm": 5.354650974273682, "kl": 0.25390625, "learning_rate": 7.259999999999999e-07, "loss": 0.0102, "reward": 0.4928993284702301, "reward_mean": 0.4928993284702301, "reward_std": 0.11842057853937149, "rewards/v_meteor_reward": 0.4928993284702301, "step": 822 }, { "advantages": 8.344650268554688e-07, "completion_length": 15.125, "epoch": 0.2743333333333333, "grad_norm": 15.435065269470215, "kl": 0.28125, "learning_rate": 7.256666666666667e-07, "loss": 0.0113, "reward": 1.5614848136901855, "reward_mean": 1.5614848136901855, "reward_std": 0.05494612455368042, "rewards/iou_timestamp_reward": 0.561484694480896, "rewards/t_format_reward": 1.0, "step": 823 }, { "advantages": 4.470348358154297e-07, "completion_length": 86.1875, "epoch": 0.27466666666666667, "grad_norm": 6.000172138214111, "kl": 0.5703125, "learning_rate": 7.253333333333334e-07, "loss": 0.0229, "reward": 0.6462867259979248, "reward_mean": 0.6462867259979248, "reward_std": 0.04585660248994827, "rewards/a_meteor_reward": 0.6462867259979248, "step": 824 }, { "advantages": -3.725290298461914e-09, "completion_length": 96.3125, "epoch": 0.275, "grad_norm": 4.860565662384033, "kl": 0.232421875, "learning_rate": 7.249999999999999e-07, "loss": 0.0093, "reward": 0.7430703639984131, "reward_mean": 0.7430703639984131, "reward_std": 0.05601152032613754, "rewards/a_meteor_reward": 0.7430703639984131, "step": 825 }, { "advantages": -2.775341272354126e-07, "completion_length": 66.9375, "epoch": 0.2753333333333333, "grad_norm": 6.971970558166504, "kl": 0.2177734375, "learning_rate": 7.246666666666666e-07, "loss": 0.0087, "reward": 0.3695503771305084, "reward_mean": 0.3695503771305084, "reward_std": 0.040732480585575104, "rewards/v_meteor_reward": 0.3695503771305084, "step": 826 }, { "advantages": 1.955777406692505e-08, "completion_length": 67.9375, "epoch": 0.27566666666666667, "grad_norm": 6.115758895874023, "kl": 0.228515625, "learning_rate": 7.243333333333334e-07, "loss": 0.0091, "reward": 0.37153854966163635, "reward_mean": 0.37153854966163635, "reward_std": 0.08127053081989288, "rewards/v_meteor_reward": 0.37153854966163635, "step": 827 }, { "advantages": -8.195638656616211e-08, "completion_length": 286.75, "epoch": 0.276, "grad_norm": 3.0516717433929443, "kl": 0.125, "learning_rate": 7.24e-07, "loss": 0.005, "reward": 0.5711506605148315, "reward_mean": 0.5711506605148315, "reward_std": 0.11825698614120483, "rewards/a_meteor_reward": 0.5711506605148315, "step": 828 }, { "advantages": 6.332993507385254e-08, "completion_length": 64.625, "epoch": 0.2763333333333333, "grad_norm": 5.762519836425781, "kl": 0.169921875, "learning_rate": 7.236666666666666e-07, "loss": 0.0068, "reward": 0.40614423155784607, "reward_mean": 0.40614423155784607, "reward_std": 0.10525675117969513, "rewards/v_meteor_reward": 0.40614423155784607, "step": 829 }, { "advantages": 1.6391277313232422e-07, "completion_length": 82.875, "epoch": 0.27666666666666667, "grad_norm": 3.9117188453674316, "kl": 0.2177734375, "learning_rate": 7.233333333333333e-07, "loss": 0.0087, "reward": 0.5714771747589111, "reward_mean": 0.5714771747589111, "reward_std": 0.040694985538721085, "rewards/a_meteor_reward": 0.5714771747589111, "step": 830 }, { "advantages": 3.725290298461914e-08, "completion_length": 14.5, "epoch": 0.277, "grad_norm": 15.096478462219238, "kl": 0.279296875, "learning_rate": 7.229999999999999e-07, "loss": 0.0112, "reward": 1.7127490043640137, "reward_mean": 1.7127490043640137, "reward_std": 0.022196292877197266, "rewards/iou_timestamp_reward": 0.7127490043640137, "rewards/t_format_reward": 1.0, "step": 831 }, { "advantages": -2.3283064365386963e-07, "completion_length": 101.4375, "epoch": 0.2773333333333333, "grad_norm": 2.8195784091949463, "kl": 0.30078125, "learning_rate": 7.226666666666667e-07, "loss": 0.0121, "reward": 0.8077709674835205, "reward_mean": 0.8077709674835205, "reward_std": 0.02681799978017807, "rewards/a_meteor_reward": 0.8077709674835205, "step": 832 }, { "advantages": -1.601874828338623e-07, "completion_length": 62.8125, "epoch": 0.2776666666666667, "grad_norm": 6.030210494995117, "kl": 0.1796875, "learning_rate": 7.223333333333334e-07, "loss": 0.0072, "reward": 0.4001312255859375, "reward_mean": 0.4001312255859375, "reward_std": 0.0969940572977066, "rewards/v_meteor_reward": 0.4001312255859375, "step": 833 }, { "advantages": 9.08970832824707e-07, "completion_length": 44.5, "epoch": 0.278, "grad_norm": 7.058896064758301, "kl": 0.5078125, "learning_rate": 7.219999999999999e-07, "loss": 0.0204, "reward": 0.6670373678207397, "reward_mean": 0.6670373678207397, "reward_std": 0.08055625855922699, "rewards/a_meteor_reward": 0.6670373678207397, "step": 834 }, { "advantages": 3.91155481338501e-07, "completion_length": 126.3125, "epoch": 0.2783333333333333, "grad_norm": 18.165775299072266, "kl": 0.2392578125, "learning_rate": 7.216666666666666e-07, "loss": 0.0096, "reward": 0.7172313928604126, "reward_mean": 0.7172313928604126, "reward_std": 0.16532593965530396, "rewards/a_meteor_reward": 0.7172313928604126, "step": 835 }, { "advantages": -3.8370490074157715e-07, "completion_length": 75.25, "epoch": 0.2786666666666667, "grad_norm": 3.6154205799102783, "kl": 0.296875, "learning_rate": 7.213333333333334e-07, "loss": 0.0119, "reward": 0.7147257328033447, "reward_mean": 0.7147257328033447, "reward_std": 0.03595642000436783, "rewards/a_meteor_reward": 0.7147257328033447, "step": 836 }, { "advantages": -1.6391277313232422e-07, "completion_length": 63.8125, "epoch": 0.279, "grad_norm": 5.627851486206055, "kl": 0.259765625, "learning_rate": 7.21e-07, "loss": 0.0104, "reward": 0.3942228853702545, "reward_mean": 0.3942228853702545, "reward_std": 0.08348652720451355, "rewards/v_meteor_reward": 0.3942228853702545, "step": 837 }, { "advantages": -2.339482307434082e-06, "completion_length": 15.375, "epoch": 0.2793333333333333, "grad_norm": 31.797771453857422, "kl": 0.25390625, "learning_rate": 7.206666666666666e-07, "loss": 0.0102, "reward": 1.5226949453353882, "reward_mean": 1.5226949453353882, "reward_std": 0.094478078186512, "rewards/iou_timestamp_reward": 0.5226948261260986, "rewards/t_format_reward": 1.0, "step": 838 }, { "advantages": 2.2426247596740723e-06, "completion_length": 14.75, "epoch": 0.2796666666666667, "grad_norm": 8.15959358215332, "kl": 0.287109375, "learning_rate": 7.203333333333333e-07, "loss": 0.0115, "reward": 1.619005799293518, "reward_mean": 1.619005799293518, "reward_std": 0.028176642954349518, "rewards/iou_timestamp_reward": 0.6190057992935181, "rewards/t_format_reward": 1.0, "step": 839 }, { "advantages": 3.725290298461914e-09, "completion_length": 84.9375, "epoch": 0.28, "grad_norm": 4.988943576812744, "kl": 0.2119140625, "learning_rate": 7.2e-07, "loss": 0.0085, "reward": 0.42524057626724243, "reward_mean": 0.42524057626724243, "reward_std": 0.06280829012393951, "rewards/v_meteor_reward": 0.42524057626724243, "step": 840 }, { "advantages": 1.5273690223693848e-07, "completion_length": 16.0, "epoch": 0.2803333333333333, "grad_norm": 7.92397403717041, "kl": 0.322265625, "learning_rate": 7.196666666666667e-07, "loss": 0.0129, "reward": 1.94032883644104, "reward_mean": 1.94032883644104, "reward_std": 0.023185374215245247, "rewards/iou_timestamp_reward": 0.9403289556503296, "rewards/t_format_reward": 1.0, "step": 841 }, { "advantages": 1.30385160446167e-08, "completion_length": 52.625, "epoch": 0.2806666666666667, "grad_norm": 3.7857861518859863, "kl": 0.333984375, "learning_rate": 7.193333333333333e-07, "loss": 0.0134, "reward": 0.7935339212417603, "reward_mean": 0.7935339212417603, "reward_std": 0.03336101025342941, "rewards/a_meteor_reward": 0.7935339212417603, "step": 842 }, { "advantages": 2.48197466135025e-07, "completion_length": 16.5, "epoch": 0.281, "grad_norm": 8.073721885681152, "kl": 0.2177734375, "learning_rate": 7.189999999999999e-07, "loss": 0.0087, "reward": 1.6496082544326782, "reward_mean": 1.6496082544326782, "reward_std": 0.02947895973920822, "rewards/iou_timestamp_reward": 0.649608314037323, "rewards/t_format_reward": 1.0, "step": 843 }, { "advantages": -1.564621925354004e-07, "completion_length": 74.1875, "epoch": 0.2813333333333333, "grad_norm": 5.634206771850586, "kl": 0.2177734375, "learning_rate": 7.186666666666667e-07, "loss": 0.0087, "reward": 0.36242663860321045, "reward_mean": 0.36242663860321045, "reward_std": 0.05418936163187027, "rewards/v_meteor_reward": 0.36242663860321045, "step": 844 }, { "advantages": 5.21540641784668e-07, "completion_length": 81.25, "epoch": 0.2816666666666667, "grad_norm": 3.006378650665283, "kl": 0.240234375, "learning_rate": 7.183333333333334e-07, "loss": 0.0096, "reward": 0.8903632760047913, "reward_mean": 0.8903632760047913, "reward_std": 0.025702524930238724, "rewards/a_meteor_reward": 0.8903632760047913, "step": 845 }, { "advantages": 1.862645149230957e-07, "completion_length": 16.5, "epoch": 0.282, "grad_norm": 16.667646408081055, "kl": 0.21484375, "learning_rate": 7.179999999999999e-07, "loss": 0.0086, "reward": 1.6266164779663086, "reward_mean": 1.6266164779663086, "reward_std": 0.10296182334423065, "rewards/iou_timestamp_reward": 0.6266164779663086, "rewards/t_format_reward": 1.0, "step": 846 }, { "advantages": 1.862645149230957e-08, "completion_length": 74.75, "epoch": 0.2823333333333333, "grad_norm": 5.187102317810059, "kl": 0.1923828125, "learning_rate": 7.176666666666666e-07, "loss": 0.0077, "reward": 0.3624017834663391, "reward_mean": 0.3624017834663391, "reward_std": 0.06466708332300186, "rewards/v_meteor_reward": 0.3624017834663391, "step": 847 }, { "advantages": -7.82310962677002e-08, "completion_length": 69.0625, "epoch": 0.2826666666666667, "grad_norm": 6.110282897949219, "kl": 0.244140625, "learning_rate": 7.173333333333333e-07, "loss": 0.0098, "reward": 0.41265642642974854, "reward_mean": 0.41265642642974854, "reward_std": 0.08330674469470978, "rewards/v_meteor_reward": 0.41265642642974854, "step": 848 }, { "advantages": -6.332993507385254e-08, "completion_length": 82.625, "epoch": 0.283, "grad_norm": 5.139657974243164, "kl": 0.171875, "learning_rate": 7.17e-07, "loss": 0.0069, "reward": 0.4548770487308502, "reward_mean": 0.4548770487308502, "reward_std": 0.083588607609272, "rewards/v_meteor_reward": 0.4548770487308502, "step": 849 }, { "advantages": -1.4528632164001465e-07, "completion_length": 340.9375, "epoch": 0.2833333333333333, "grad_norm": 3.078012466430664, "kl": 0.1708984375, "learning_rate": 7.166666666666667e-07, "loss": 0.0068, "reward": 0.5521346926689148, "reward_mean": 0.5521346926689148, "reward_std": 0.07727670669555664, "rewards/a_meteor_reward": 0.5521346926689148, "step": 850 }, { "advantages": 1.0058283805847168e-07, "completion_length": 67.5, "epoch": 0.2836666666666667, "grad_norm": 6.161296844482422, "kl": 0.2177734375, "learning_rate": 7.163333333333333e-07, "loss": 0.0087, "reward": 0.37762725353240967, "reward_mean": 0.37762725353240967, "reward_std": 0.05999867618083954, "rewards/v_meteor_reward": 0.37762725353240967, "step": 851 }, { "advantages": 3.557652235031128e-07, "completion_length": 101.75, "epoch": 0.284, "grad_norm": 4.067807197570801, "kl": 0.1650390625, "learning_rate": 7.159999999999999e-07, "loss": 0.0066, "reward": 0.8385440111160278, "reward_mean": 0.8385440111160278, "reward_std": 0.029863040894269943, "rewards/a_meteor_reward": 0.8385440111160278, "step": 852 }, { "advantages": 6.09084963798523e-07, "completion_length": 308.1875, "epoch": 0.2843333333333333, "grad_norm": 3.119899034500122, "kl": 0.14453125, "learning_rate": 7.156666666666667e-07, "loss": 0.0058, "reward": 0.5051380395889282, "reward_mean": 0.5051380395889282, "reward_std": 0.021042753010988235, "rewards/a_meteor_reward": 0.5051380395889282, "step": 853 }, { "advantages": 4.284083843231201e-08, "completion_length": 68.75, "epoch": 0.2846666666666667, "grad_norm": 5.108089923858643, "kl": 0.236328125, "learning_rate": 7.153333333333334e-07, "loss": 0.0095, "reward": 0.4037701487541199, "reward_mean": 0.4037701487541199, "reward_std": 0.068926602602005, "rewards/v_meteor_reward": 0.4037701487541199, "step": 854 }, { "advantages": -2.980232238769531e-07, "completion_length": 165.125, "epoch": 0.285, "grad_norm": 4.7268757820129395, "kl": 0.1416015625, "learning_rate": 7.149999999999999e-07, "loss": 0.0057, "reward": 0.4627142548561096, "reward_mean": 0.4627142548561096, "reward_std": 0.12470446527004242, "rewards/a_meteor_reward": 0.4627142548561096, "step": 855 }, { "advantages": -1.4528632164001465e-07, "completion_length": 95.625, "epoch": 0.2853333333333333, "grad_norm": 5.181899547576904, "kl": 0.12890625, "learning_rate": 7.146666666666666e-07, "loss": 0.0052, "reward": 0.3801027834415436, "reward_mean": 0.3801027834415436, "reward_std": 0.07229979336261749, "rewards/v_meteor_reward": 0.3801027834415436, "step": 856 }, { "advantages": -9.760260581970215e-07, "completion_length": 15.5, "epoch": 0.2856666666666667, "grad_norm": 9.362540245056152, "kl": 0.3203125, "learning_rate": 7.143333333333334e-07, "loss": 0.0129, "reward": 1.8103363513946533, "reward_mean": 1.8103363513946533, "reward_std": 0.023999523371458054, "rewards/iou_timestamp_reward": 0.8103364109992981, "rewards/t_format_reward": 1.0, "step": 857 }, { "advantages": -4.842877388000488e-08, "completion_length": 15.25, "epoch": 0.286, "grad_norm": 20.88606834411621, "kl": 0.20703125, "learning_rate": 7.14e-07, "loss": 0.0082, "reward": 1.7449538707733154, "reward_mean": 1.7449538707733154, "reward_std": 0.062299344688653946, "rewards/iou_timestamp_reward": 0.7449537515640259, "rewards/t_format_reward": 1.0, "step": 858 }, { "advantages": -8.684583008289337e-07, "completion_length": 15.75, "epoch": 0.28633333333333333, "grad_norm": 17.278121948242188, "kl": 0.2177734375, "learning_rate": 7.136666666666666e-07, "loss": 0.0087, "reward": 1.5635942220687866, "reward_mean": 1.5635942220687866, "reward_std": 0.055825818330049515, "rewards/iou_timestamp_reward": 0.5635942220687866, "rewards/t_format_reward": 1.0, "step": 859 }, { "advantages": 5.21540641784668e-08, "completion_length": 104.9375, "epoch": 0.2866666666666667, "grad_norm": 5.528946399688721, "kl": 0.216796875, "learning_rate": 7.133333333333333e-07, "loss": 0.0087, "reward": 0.3946717381477356, "reward_mean": 0.3946717381477356, "reward_std": 0.06806223839521408, "rewards/v_meteor_reward": 0.3946717381477356, "step": 860 }, { "advantages": -5.811452865600586e-07, "completion_length": 16.75, "epoch": 0.287, "grad_norm": 12.84229564666748, "kl": 0.251953125, "learning_rate": 7.129999999999999e-07, "loss": 0.0101, "reward": 1.53755784034729, "reward_mean": 1.53755784034729, "reward_std": 0.03733318671584129, "rewards/iou_timestamp_reward": 0.53755784034729, "rewards/t_format_reward": 1.0, "step": 861 }, { "advantages": 6.629154086112976e-06, "completion_length": 15.3125, "epoch": 0.28733333333333333, "grad_norm": 8.67032527923584, "kl": 0.31640625, "learning_rate": 7.126666666666667e-07, "loss": 0.0127, "reward": 1.7668859958648682, "reward_mean": 1.7668859958648682, "reward_std": 0.10222085565328598, "rewards/iou_timestamp_reward": 0.7668861150741577, "rewards/t_format_reward": 1.0, "step": 862 }, { "advantages": 6.593763828277588e-07, "completion_length": 15.25, "epoch": 0.2876666666666667, "grad_norm": 39.7585563659668, "kl": 0.2099609375, "learning_rate": 7.123333333333333e-07, "loss": 0.0084, "reward": 1.493345856666565, "reward_mean": 1.493345856666565, "reward_std": 0.11660285294055939, "rewards/iou_timestamp_reward": 0.49334585666656494, "rewards/t_format_reward": 1.0, "step": 863 }, { "advantages": 5.960464477539063e-08, "completion_length": 70.4375, "epoch": 0.288, "grad_norm": 6.290071964263916, "kl": 0.2138671875, "learning_rate": 7.119999999999999e-07, "loss": 0.0086, "reward": 0.30903881788253784, "reward_mean": 0.30903881788253784, "reward_std": 0.07766211777925491, "rewards/v_meteor_reward": 0.30903881788253784, "step": 864 }, { "advantages": 1.1548399925231934e-07, "completion_length": 89.25, "epoch": 0.28833333333333333, "grad_norm": 3.1552515029907227, "kl": 0.185546875, "learning_rate": 7.116666666666666e-07, "loss": 0.0074, "reward": 0.6301735043525696, "reward_mean": 0.6301735043525696, "reward_std": 0.057552166283130646, "rewards/a_meteor_reward": 0.6301735043525696, "step": 865 }, { "advantages": 7.450580596923828e-08, "completion_length": 282.25, "epoch": 0.2886666666666667, "grad_norm": 3.225432872772217, "kl": 0.1357421875, "learning_rate": 7.113333333333334e-07, "loss": 0.0054, "reward": 0.42896297574043274, "reward_mean": 0.42896297574043274, "reward_std": 0.07140098512172699, "rewards/a_meteor_reward": 0.42896297574043274, "step": 866 }, { "advantages": -5.289912223815918e-07, "completion_length": 90.875, "epoch": 0.289, "grad_norm": 4.505661487579346, "kl": 0.1962890625, "learning_rate": 7.11e-07, "loss": 0.0078, "reward": 0.4134146571159363, "reward_mean": 0.4134146571159363, "reward_std": 0.060977086424827576, "rewards/a_meteor_reward": 0.4134146571159363, "step": 867 }, { "advantages": 4.0978193283081055e-07, "completion_length": 99.9375, "epoch": 0.28933333333333333, "grad_norm": 4.784788608551025, "kl": 0.140625, "learning_rate": 7.106666666666666e-07, "loss": 0.0056, "reward": 0.4558601975440979, "reward_mean": 0.4558601975440979, "reward_std": 0.06960069388151169, "rewards/v_meteor_reward": 0.4558601975440979, "step": 868 }, { "advantages": -4.578381776809692e-06, "completion_length": 15.75, "epoch": 0.2896666666666667, "grad_norm": 8.916399002075195, "kl": 0.359375, "learning_rate": 7.103333333333333e-07, "loss": 0.0144, "reward": 1.976631760597229, "reward_mean": 1.976631760597229, "reward_std": 0.006499145179986954, "rewards/iou_timestamp_reward": 0.976631760597229, "rewards/t_format_reward": 1.0, "step": 869 }, { "advantages": 1.862645149230957e-07, "completion_length": 98.5625, "epoch": 0.29, "grad_norm": 3.5364644527435303, "kl": 0.236328125, "learning_rate": 7.1e-07, "loss": 0.0095, "reward": 0.6834560036659241, "reward_mean": 0.6834560036659241, "reward_std": 0.04164502024650574, "rewards/a_meteor_reward": 0.6834560036659241, "step": 870 }, { "advantages": -3.8370490074157715e-07, "completion_length": 254.6875, "epoch": 0.29033333333333333, "grad_norm": 3.1139512062072754, "kl": 0.1630859375, "learning_rate": 7.096666666666667e-07, "loss": 0.0065, "reward": 0.5325478911399841, "reward_mean": 0.5325478911399841, "reward_std": 0.09425602853298187, "rewards/a_meteor_reward": 0.5325478911399841, "step": 871 }, { "advantages": 2.2351741790771484e-08, "completion_length": 74.4375, "epoch": 0.2906666666666667, "grad_norm": 5.210302829742432, "kl": 0.21875, "learning_rate": 7.093333333333333e-07, "loss": 0.0087, "reward": 0.4773675203323364, "reward_mean": 0.4773675203323364, "reward_std": 0.06940285116434097, "rewards/v_meteor_reward": 0.4773675203323364, "step": 872 }, { "advantages": -2.2351741790771484e-07, "completion_length": 91.8125, "epoch": 0.291, "grad_norm": 5.136337757110596, "kl": 0.140625, "learning_rate": 7.089999999999999e-07, "loss": 0.0056, "reward": 0.27506738901138306, "reward_mean": 0.27506738901138306, "reward_std": 0.054339293390512466, "rewards/v_meteor_reward": 0.27506738901138306, "step": 873 }, { "advantages": 1.4528632164001465e-07, "completion_length": 63.75, "epoch": 0.29133333333333333, "grad_norm": 4.929111003875732, "kl": 0.294921875, "learning_rate": 7.086666666666667e-07, "loss": 0.0118, "reward": 0.7281419038772583, "reward_mean": 0.7281419038772583, "reward_std": 0.026202717795968056, "rewards/a_meteor_reward": 0.7281419038772583, "step": 874 }, { "advantages": 9.313225746154785e-08, "completion_length": 91.375, "epoch": 0.2916666666666667, "grad_norm": 5.717706680297852, "kl": 0.2236328125, "learning_rate": 7.083333333333334e-07, "loss": 0.009, "reward": 0.4365052580833435, "reward_mean": 0.4365052580833435, "reward_std": 0.07254481315612793, "rewards/v_meteor_reward": 0.4365052580833435, "step": 875 }, { "advantages": 3.166496753692627e-07, "completion_length": 15.0625, "epoch": 0.292, "grad_norm": 10.5963716506958, "kl": 0.24609375, "learning_rate": 7.079999999999999e-07, "loss": 0.0098, "reward": 1.7847239971160889, "reward_mean": 1.7847239971160889, "reward_std": 0.11692966520786285, "rewards/iou_timestamp_reward": 0.7847240567207336, "rewards/t_format_reward": 1.0, "step": 876 }, { "advantages": 6.48200511932373e-07, "completion_length": 40.625, "epoch": 0.29233333333333333, "grad_norm": 6.803757190704346, "kl": 0.470703125, "learning_rate": 7.076666666666666e-07, "loss": 0.0188, "reward": 0.7565659284591675, "reward_mean": 0.7565659284591675, "reward_std": 0.048331499099731445, "rewards/a_meteor_reward": 0.7565659284591675, "step": 877 }, { "advantages": -1.6614794731140137e-06, "completion_length": 15.625, "epoch": 0.2926666666666667, "grad_norm": 10.915912628173828, "kl": 0.208984375, "learning_rate": 7.073333333333333e-07, "loss": 0.0084, "reward": 1.7204573154449463, "reward_mean": 1.7204573154449463, "reward_std": 0.04538553208112717, "rewards/iou_timestamp_reward": 0.7204573154449463, "rewards/t_format_reward": 1.0, "step": 878 }, { "advantages": 1.4901161193847656e-08, "completion_length": 71.5625, "epoch": 0.293, "grad_norm": 5.742271423339844, "kl": 0.2080078125, "learning_rate": 7.07e-07, "loss": 0.0083, "reward": 0.3481258153915405, "reward_mean": 0.3481258153915405, "reward_std": 0.06961505115032196, "rewards/v_meteor_reward": 0.3481258153915405, "step": 879 }, { "advantages": 4.2654573917388916e-07, "completion_length": 42.375, "epoch": 0.29333333333333333, "grad_norm": 3.9568490982055664, "kl": 0.388671875, "learning_rate": 7.066666666666666e-07, "loss": 0.0155, "reward": 0.7685115337371826, "reward_mean": 0.7685115337371826, "reward_std": 0.06275540590286255, "rewards/a_meteor_reward": 0.7685115337371826, "step": 880 }, { "advantages": 1.8998980522155762e-07, "completion_length": 65.8125, "epoch": 0.2936666666666667, "grad_norm": 6.743844985961914, "kl": 0.326171875, "learning_rate": 7.063333333333333e-07, "loss": 0.013, "reward": 0.4101196527481079, "reward_mean": 0.4101196527481079, "reward_std": 0.05307517573237419, "rewards/v_meteor_reward": 0.4101196527481079, "step": 881 }, { "advantages": -9.313225746154785e-09, "completion_length": 84.3125, "epoch": 0.294, "grad_norm": 5.884131908416748, "kl": 0.26171875, "learning_rate": 7.059999999999999e-07, "loss": 0.0105, "reward": 0.3848494291305542, "reward_mean": 0.3848494291305542, "reward_std": 0.09045091271400452, "rewards/v_meteor_reward": 0.3848494291305542, "step": 882 }, { "advantages": 3.501772880554199e-07, "completion_length": 16.0, "epoch": 0.29433333333333334, "grad_norm": 16.331941604614258, "kl": 0.2578125, "learning_rate": 7.056666666666667e-07, "loss": 0.0103, "reward": 1.90606689453125, "reward_mean": 1.90606689453125, "reward_std": 0.049428924918174744, "rewards/iou_timestamp_reward": 0.9060668349266052, "rewards/t_format_reward": 1.0, "step": 883 }, { "advantages": 2.9802322387695312e-08, "completion_length": 89.0625, "epoch": 0.2946666666666667, "grad_norm": 4.560796737670898, "kl": 0.201171875, "learning_rate": 7.053333333333333e-07, "loss": 0.008, "reward": 0.4614853262901306, "reward_mean": 0.4614853262901306, "reward_std": 0.07393045723438263, "rewards/v_meteor_reward": 0.4614853262901306, "step": 884 }, { "advantages": 9.499490261077881e-08, "completion_length": 71.3125, "epoch": 0.295, "grad_norm": 6.073352336883545, "kl": 0.2080078125, "learning_rate": 7.049999999999999e-07, "loss": 0.0083, "reward": 0.35627666115760803, "reward_mean": 0.35627666115760803, "reward_std": 0.08980299532413483, "rewards/v_meteor_reward": 0.35627666115760803, "step": 885 }, { "advantages": 7.82310962677002e-08, "completion_length": 76.8125, "epoch": 0.29533333333333334, "grad_norm": 5.435654640197754, "kl": 0.177734375, "learning_rate": 7.046666666666666e-07, "loss": 0.0071, "reward": 0.35794633626937866, "reward_mean": 0.35794633626937866, "reward_std": 0.08652491122484207, "rewards/v_meteor_reward": 0.35794633626937866, "step": 886 }, { "advantages": -2.5331974029541016e-07, "completion_length": 203.625, "epoch": 0.2956666666666667, "grad_norm": 2.9327425956726074, "kl": 0.23828125, "learning_rate": 7.043333333333334e-07, "loss": 0.0096, "reward": 0.7850555181503296, "reward_mean": 0.7850555181503296, "reward_std": 0.03286363184452057, "rewards/a_meteor_reward": 0.7850555181503296, "step": 887 }, { "advantages": 5.941838026046753e-07, "completion_length": 67.75, "epoch": 0.296, "grad_norm": 5.555049896240234, "kl": 0.25, "learning_rate": 7.04e-07, "loss": 0.01, "reward": 0.5085921287536621, "reward_mean": 0.5085921287536621, "reward_std": 0.054425738751888275, "rewards/v_meteor_reward": 0.5085921287536621, "step": 888 }, { "advantages": -1.6205012798309326e-07, "completion_length": 97.75, "epoch": 0.29633333333333334, "grad_norm": 5.204848289489746, "kl": 0.177734375, "learning_rate": 7.036666666666666e-07, "loss": 0.0071, "reward": 0.35419970750808716, "reward_mean": 0.35419970750808716, "reward_std": 0.07981809973716736, "rewards/v_meteor_reward": 0.35419970750808716, "step": 889 }, { "advantages": -1.0021030902862549e-06, "completion_length": 15.75, "epoch": 0.2966666666666667, "grad_norm": 13.594868659973145, "kl": 0.265625, "learning_rate": 7.033333333333333e-07, "loss": 0.0106, "reward": 1.8783153295516968, "reward_mean": 1.8783153295516968, "reward_std": 0.021626746281981468, "rewards/iou_timestamp_reward": 0.8783153295516968, "rewards/t_format_reward": 1.0, "step": 890 }, { "advantages": 7.82310962677002e-08, "completion_length": 81.875, "epoch": 0.297, "grad_norm": 6.033674716949463, "kl": 0.2138671875, "learning_rate": 7.029999999999999e-07, "loss": 0.0085, "reward": 0.3725980818271637, "reward_mean": 0.3725980818271637, "reward_std": 0.05270881950855255, "rewards/v_meteor_reward": 0.3725980818271637, "step": 891 }, { "advantages": 5.21540641784668e-08, "completion_length": 16.5, "epoch": 0.29733333333333334, "grad_norm": 14.870192527770996, "kl": 0.2197265625, "learning_rate": 7.026666666666667e-07, "loss": 0.0088, "reward": 1.5585174560546875, "reward_mean": 1.5585174560546875, "reward_std": 0.08661697804927826, "rewards/iou_timestamp_reward": 0.5585174560546875, "rewards/t_format_reward": 1.0, "step": 892 }, { "advantages": -1.3746321201324463e-06, "completion_length": 15.5, "epoch": 0.2976666666666667, "grad_norm": 6.104940891265869, "kl": 0.255859375, "learning_rate": 7.023333333333333e-07, "loss": 0.0103, "reward": 1.7723350524902344, "reward_mean": 1.7723350524902344, "reward_std": 0.026556042954325676, "rewards/iou_timestamp_reward": 0.7723351120948792, "rewards/t_format_reward": 1.0, "step": 893 }, { "advantages": 1.3113021850585938e-06, "completion_length": 152.875, "epoch": 0.298, "grad_norm": 4.116837501525879, "kl": 0.20703125, "learning_rate": 7.019999999999999e-07, "loss": 0.0083, "reward": 0.6350308656692505, "reward_mean": 0.6350308656692505, "reward_std": 0.12489189207553864, "rewards/a_meteor_reward": 0.6350308656692505, "step": 894 }, { "advantages": -1.1622905731201172e-06, "completion_length": 184.6875, "epoch": 0.29833333333333334, "grad_norm": 3.8358967304229736, "kl": 0.173828125, "learning_rate": 7.016666666666666e-07, "loss": 0.0069, "reward": 0.7880898714065552, "reward_mean": 0.7880898714065552, "reward_std": 0.03411146625876427, "rewards/a_meteor_reward": 0.7880898714065552, "step": 895 }, { "advantages": 1.5832483768463135e-07, "completion_length": 315.0625, "epoch": 0.2986666666666667, "grad_norm": 3.040588855743408, "kl": 0.1318359375, "learning_rate": 7.013333333333334e-07, "loss": 0.0053, "reward": 0.5497584342956543, "reward_mean": 0.5497584342956543, "reward_std": 0.07624027878046036, "rewards/a_meteor_reward": 0.5497584342956543, "step": 896 }, { "advantages": -2.5704503059387207e-07, "completion_length": 90.3125, "epoch": 0.299, "grad_norm": 5.536390781402588, "kl": 0.435546875, "learning_rate": 7.009999999999999e-07, "loss": 0.0174, "reward": 0.5465837717056274, "reward_mean": 0.5465837717056274, "reward_std": 0.07107613235712051, "rewards/a_meteor_reward": 0.5465837717056274, "step": 897 }, { "advantages": -1.471489667892456e-07, "completion_length": 85.4375, "epoch": 0.29933333333333334, "grad_norm": 5.61518669128418, "kl": 0.17578125, "learning_rate": 7.006666666666666e-07, "loss": 0.007, "reward": 0.3085041046142578, "reward_mean": 0.3085041046142578, "reward_std": 0.05963798984885216, "rewards/v_meteor_reward": 0.3085041046142578, "step": 898 }, { "advantages": 3.948807716369629e-07, "completion_length": 291.4375, "epoch": 0.2996666666666667, "grad_norm": 2.2916107177734375, "kl": 0.130859375, "learning_rate": 7.003333333333333e-07, "loss": 0.0052, "reward": 0.6411904096603394, "reward_mean": 0.6411904096603394, "reward_std": 0.04731711372733116, "rewards/a_meteor_reward": 0.6411904096603394, "step": 899 }, { "advantages": 1.0766088962554932e-06, "completion_length": 100.25, "epoch": 0.3, "grad_norm": 2.6309165954589844, "kl": 0.32421875, "learning_rate": 7e-07, "loss": 0.013, "reward": 0.9187983870506287, "reward_mean": 0.9187983870506287, "reward_std": 0.020042143762111664, "rewards/a_meteor_reward": 0.9187983870506287, "step": 900 }, { "advantages": 4.079192876815796e-07, "completion_length": 14.9375, "epoch": 0.30033333333333334, "grad_norm": 10.165332794189453, "kl": 0.263671875, "learning_rate": 6.996666666666666e-07, "loss": 0.0106, "reward": 1.65091872215271, "reward_mean": 1.65091872215271, "reward_std": 0.020323285833001137, "rewards/iou_timestamp_reward": 0.65091872215271, "rewards/t_format_reward": 1.0, "step": 901 }, { "advantages": -1.864507794380188e-06, "completion_length": 15.75, "epoch": 0.3006666666666667, "grad_norm": 9.238846778869629, "kl": 0.21875, "learning_rate": 6.993333333333333e-07, "loss": 0.0088, "reward": 1.693800926208496, "reward_mean": 1.693800926208496, "reward_std": 0.037087779492139816, "rewards/iou_timestamp_reward": 0.6938008069992065, "rewards/t_format_reward": 1.0, "step": 902 }, { "advantages": -2.7939677238464355e-07, "completion_length": 110.25, "epoch": 0.301, "grad_norm": 4.872712135314941, "kl": 0.181640625, "learning_rate": 6.989999999999999e-07, "loss": 0.0073, "reward": 0.400676429271698, "reward_mean": 0.400676429271698, "reward_std": 0.03949558734893799, "rewards/v_meteor_reward": 0.400676429271698, "step": 903 }, { "advantages": 3.725290298461914e-09, "completion_length": 76.1875, "epoch": 0.30133333333333334, "grad_norm": 5.25930118560791, "kl": 0.1474609375, "learning_rate": 6.986666666666667e-07, "loss": 0.0059, "reward": 0.3141935169696808, "reward_mean": 0.3141935169696808, "reward_std": 0.05760233476758003, "rewards/v_meteor_reward": 0.3141935169696808, "step": 904 }, { "advantages": -4.10713255405426e-07, "completion_length": 178.375, "epoch": 0.3016666666666667, "grad_norm": 3.1058735847473145, "kl": 0.1640625, "learning_rate": 6.983333333333334e-07, "loss": 0.0065, "reward": 0.4665875732898712, "reward_mean": 0.4665875732898712, "reward_std": 0.040687866508960724, "rewards/a_meteor_reward": 0.4665875732898712, "step": 905 }, { "advantages": -4.470348358154297e-08, "completion_length": 174.1875, "epoch": 0.302, "grad_norm": 2.90929913520813, "kl": 0.140625, "learning_rate": 6.979999999999999e-07, "loss": 0.0056, "reward": 0.521239161491394, "reward_mean": 0.521239161491394, "reward_std": 0.05670526996254921, "rewards/a_meteor_reward": 0.521239161491394, "step": 906 }, { "advantages": 1.5832483768463135e-08, "completion_length": 292.6875, "epoch": 0.30233333333333334, "grad_norm": 3.3440754413604736, "kl": 0.1865234375, "learning_rate": 6.976666666666666e-07, "loss": 0.0074, "reward": 0.38448554277420044, "reward_mean": 0.38448554277420044, "reward_std": 0.10233575105667114, "rewards/a_meteor_reward": 0.38448554277420044, "step": 907 }, { "advantages": 2.942979335784912e-07, "completion_length": 91.9375, "epoch": 0.30266666666666664, "grad_norm": 3.3291778564453125, "kl": 0.306640625, "learning_rate": 6.973333333333333e-07, "loss": 0.0123, "reward": 0.7360678911209106, "reward_mean": 0.7360678911209106, "reward_std": 0.05591065064072609, "rewards/a_meteor_reward": 0.7360678911209106, "step": 908 }, { "advantages": -5.029141902923584e-08, "completion_length": 80.25, "epoch": 0.303, "grad_norm": 4.741859436035156, "kl": 0.166015625, "learning_rate": 6.97e-07, "loss": 0.0066, "reward": 0.4256303310394287, "reward_mean": 0.4256303310394287, "reward_std": 0.08280780911445618, "rewards/v_meteor_reward": 0.4256303310394287, "step": 909 }, { "advantages": -1.0803341865539551e-07, "completion_length": 85.0, "epoch": 0.30333333333333334, "grad_norm": 5.694271564483643, "kl": 0.22265625, "learning_rate": 6.966666666666666e-07, "loss": 0.0089, "reward": 0.3833066523075104, "reward_mean": 0.3833066523075104, "reward_std": 0.0633745938539505, "rewards/v_meteor_reward": 0.3833066523075104, "step": 910 }, { "advantages": 3.46451997756958e-07, "completion_length": 26.6875, "epoch": 0.30366666666666664, "grad_norm": 9.418280601501465, "kl": 0.6328125, "learning_rate": 6.963333333333333e-07, "loss": 0.0252, "reward": 0.43673983216285706, "reward_mean": 0.43673983216285706, "reward_std": 0.05037819594144821, "rewards/a_meteor_reward": 0.43673983216285706, "step": 911 }, { "advantages": -1.5027821063995361e-05, "completion_length": 16.25, "epoch": 0.304, "grad_norm": 13.147588729858398, "kl": 0.265625, "learning_rate": 6.959999999999999e-07, "loss": 0.0107, "reward": 1.621587872505188, "reward_mean": 1.621587872505188, "reward_std": 0.013505075126886368, "rewards/iou_timestamp_reward": 0.621587872505188, "rewards/t_format_reward": 1.0, "step": 912 }, { "advantages": -1.0654330253601074e-06, "completion_length": 14.6875, "epoch": 0.30433333333333334, "grad_norm": 10.435384750366211, "kl": 0.345703125, "learning_rate": 6.956666666666667e-07, "loss": 0.0138, "reward": 1.7736421823501587, "reward_mean": 1.7736421823501587, "reward_std": 0.02889927104115486, "rewards/iou_timestamp_reward": 0.7736422419548035, "rewards/t_format_reward": 1.0, "step": 913 }, { "advantages": 2.60770320892334e-08, "completion_length": 249.8125, "epoch": 0.30466666666666664, "grad_norm": 4.121401786804199, "kl": 0.1875, "learning_rate": 6.953333333333333e-07, "loss": 0.0075, "reward": 0.6851102709770203, "reward_mean": 0.6851102709770203, "reward_std": 0.07675985991954803, "rewards/a_meteor_reward": 0.6851102709770203, "step": 914 }, { "advantages": -1.3746321201324463e-06, "completion_length": 16.25, "epoch": 0.305, "grad_norm": 3.7818491458892822, "kl": 0.193359375, "learning_rate": 6.949999999999999e-07, "loss": 0.0077, "reward": 1.6677534580230713, "reward_mean": 1.6677534580230713, "reward_std": 0.003074660897254944, "rewards/iou_timestamp_reward": 0.6677533984184265, "rewards/t_format_reward": 1.0, "step": 915 }, { "advantages": 3.0174851417541504e-07, "completion_length": 75.9375, "epoch": 0.30533333333333335, "grad_norm": 5.74137544631958, "kl": 0.2470703125, "learning_rate": 6.946666666666666e-07, "loss": 0.0099, "reward": 0.3136899471282959, "reward_mean": 0.3136899471282959, "reward_std": 0.038336776196956635, "rewards/v_meteor_reward": 0.3136899471282959, "step": 916 }, { "advantages": 3.986060619354248e-07, "completion_length": 15.0, "epoch": 0.30566666666666664, "grad_norm": 10.784875869750977, "kl": 0.173828125, "learning_rate": 6.943333333333334e-07, "loss": 0.007, "reward": 1.6973129510879517, "reward_mean": 1.6973129510879517, "reward_std": 0.04304630681872368, "rewards/iou_timestamp_reward": 0.6973130702972412, "rewards/t_format_reward": 1.0, "step": 917 }, { "advantages": -9.57399606704712e-07, "completion_length": 15.25, "epoch": 0.306, "grad_norm": 11.2430419921875, "kl": 0.26953125, "learning_rate": 6.939999999999999e-07, "loss": 0.0108, "reward": 1.8415824174880981, "reward_mean": 1.8415824174880981, "reward_std": 0.02211601287126541, "rewards/iou_timestamp_reward": 0.8415824174880981, "rewards/t_format_reward": 1.0, "step": 918 }, { "advantages": 1.9297003746032715e-06, "completion_length": 16.75, "epoch": 0.30633333333333335, "grad_norm": 11.821949005126953, "kl": 0.22265625, "learning_rate": 6.936666666666666e-07, "loss": 0.0089, "reward": 1.4893577098846436, "reward_mean": 1.4893577098846436, "reward_std": 0.051465246826410294, "rewards/iou_timestamp_reward": 0.48935773968696594, "rewards/t_format_reward": 1.0, "step": 919 }, { "advantages": -9.685754776000977e-08, "completion_length": 96.8125, "epoch": 0.30666666666666664, "grad_norm": 4.579200744628906, "kl": 0.2041015625, "learning_rate": 6.933333333333333e-07, "loss": 0.0082, "reward": 0.40577778220176697, "reward_mean": 0.40577778220176697, "reward_std": 0.08912242949008942, "rewards/v_meteor_reward": 0.40577778220176697, "step": 920 }, { "advantages": 1.1548399925231934e-07, "completion_length": 73.375, "epoch": 0.307, "grad_norm": 5.858115196228027, "kl": 0.212890625, "learning_rate": 6.929999999999999e-07, "loss": 0.0085, "reward": 0.4020790755748749, "reward_mean": 0.4020790755748749, "reward_std": 0.057061731815338135, "rewards/v_meteor_reward": 0.4020790755748749, "step": 921 }, { "advantages": 1.2665987014770508e-07, "completion_length": 153.4375, "epoch": 0.30733333333333335, "grad_norm": 3.1328604221343994, "kl": 0.171875, "learning_rate": 6.926666666666666e-07, "loss": 0.0069, "reward": 0.5962611436843872, "reward_mean": 0.5962611436843872, "reward_std": 0.08026918768882751, "rewards/a_meteor_reward": 0.5962611436843872, "step": 922 }, { "advantages": 4.0978193283081055e-08, "completion_length": 124.375, "epoch": 0.30766666666666664, "grad_norm": 4.559597492218018, "kl": 0.291015625, "learning_rate": 6.923333333333333e-07, "loss": 0.0116, "reward": 0.7285445332527161, "reward_mean": 0.7285445332527161, "reward_std": 0.04339500516653061, "rewards/a_meteor_reward": 0.7285445332527161, "step": 923 }, { "advantages": -3.6247074604034424e-06, "completion_length": 15.0, "epoch": 0.308, "grad_norm": 9.637309074401855, "kl": 0.263671875, "learning_rate": 6.919999999999999e-07, "loss": 0.0106, "reward": 1.7199983596801758, "reward_mean": 1.7199983596801758, "reward_std": 0.01211484894156456, "rewards/iou_timestamp_reward": 0.719998300075531, "rewards/t_format_reward": 1.0, "step": 924 }, { "advantages": -8.642673492431641e-07, "completion_length": 15.25, "epoch": 0.30833333333333335, "grad_norm": 11.714004516601562, "kl": 0.349609375, "learning_rate": 6.916666666666666e-07, "loss": 0.014, "reward": 1.8818202018737793, "reward_mean": 1.8818202018737793, "reward_std": 0.017751172184944153, "rewards/iou_timestamp_reward": 0.8818202614784241, "rewards/t_format_reward": 1.0, "step": 925 }, { "advantages": 5.997717380523682e-07, "completion_length": 15.5, "epoch": 0.30866666666666664, "grad_norm": 12.207188606262207, "kl": 0.26171875, "learning_rate": 6.913333333333334e-07, "loss": 0.0104, "reward": 1.8182001113891602, "reward_mean": 1.8182001113891602, "reward_std": 0.0941404402256012, "rewards/iou_timestamp_reward": 0.8182001113891602, "rewards/t_format_reward": 1.0, "step": 926 }, { "advantages": -2.2910535335540771e-07, "completion_length": 65.3125, "epoch": 0.309, "grad_norm": 5.8587775230407715, "kl": 0.2734375, "learning_rate": 6.909999999999999e-07, "loss": 0.011, "reward": 0.3363291025161743, "reward_mean": 0.3363291025161743, "reward_std": 0.08327844738960266, "rewards/v_meteor_reward": 0.3363291025161743, "step": 927 }, { "advantages": -1.1175870895385742e-08, "completion_length": 152.25, "epoch": 0.30933333333333335, "grad_norm": 2.100902557373047, "kl": 0.1337890625, "learning_rate": 6.906666666666666e-07, "loss": 0.0054, "reward": 0.81089186668396, "reward_mean": 0.81089186668396, "reward_std": 0.048101745545864105, "rewards/a_meteor_reward": 0.81089186668396, "step": 928 }, { "advantages": 2.5331974029541016e-07, "completion_length": 32.0625, "epoch": 0.30966666666666665, "grad_norm": 10.05424976348877, "kl": 0.69140625, "learning_rate": 6.903333333333333e-07, "loss": 0.0277, "reward": 0.5245980024337769, "reward_mean": 0.5245980024337769, "reward_std": 0.0722542479634285, "rewards/a_meteor_reward": 0.5245980024337769, "step": 929 }, { "advantages": 1.1883676052093506e-06, "completion_length": 16.5, "epoch": 0.31, "grad_norm": 12.939717292785645, "kl": 0.2470703125, "learning_rate": 6.9e-07, "loss": 0.0099, "reward": 1.622229814529419, "reward_mean": 1.622229814529419, "reward_std": 0.09129417687654495, "rewards/iou_timestamp_reward": 0.6222298741340637, "rewards/t_format_reward": 1.0, "step": 930 }, { "advantages": 8.381903171539307e-08, "completion_length": 86.125, "epoch": 0.31033333333333335, "grad_norm": 4.580420017242432, "kl": 0.2275390625, "learning_rate": 6.896666666666666e-07, "loss": 0.0091, "reward": 0.4570806324481964, "reward_mean": 0.4570806324481964, "reward_std": 0.047846533358097076, "rewards/v_meteor_reward": 0.4570806324481964, "step": 931 }, { "advantages": -1.642853021621704e-06, "completion_length": 15.4375, "epoch": 0.31066666666666665, "grad_norm": 15.691985130310059, "kl": 0.3046875, "learning_rate": 6.893333333333333e-07, "loss": 0.0122, "reward": 1.6828874349594116, "reward_mean": 1.6828874349594116, "reward_std": 0.05803588777780533, "rewards/iou_timestamp_reward": 0.6828873753547668, "rewards/t_format_reward": 1.0, "step": 932 }, { "advantages": 2.5741755962371826e-06, "completion_length": 16.25, "epoch": 0.311, "grad_norm": 10.583964347839355, "kl": 0.244140625, "learning_rate": 6.889999999999999e-07, "loss": 0.0098, "reward": 1.8157312870025635, "reward_mean": 1.8157312870025635, "reward_std": 0.030876513570547104, "rewards/iou_timestamp_reward": 0.815731406211853, "rewards/t_format_reward": 1.0, "step": 933 }, { "advantages": 9.126961231231689e-07, "completion_length": 311.0, "epoch": 0.31133333333333335, "grad_norm": 2.500781774520874, "kl": 0.2216796875, "learning_rate": 6.886666666666667e-07, "loss": 0.0089, "reward": 0.7473336458206177, "reward_mean": 0.7473336458206177, "reward_std": 0.024103263393044472, "rewards/a_meteor_reward": 0.7473336458206177, "step": 934 }, { "advantages": 1.9185245037078857e-06, "completion_length": 15.25, "epoch": 0.31166666666666665, "grad_norm": 13.343868255615234, "kl": 0.27734375, "learning_rate": 6.883333333333333e-07, "loss": 0.0111, "reward": 1.7669107913970947, "reward_mean": 1.7669107913970947, "reward_std": 0.029874522238969803, "rewards/iou_timestamp_reward": 0.7669109106063843, "rewards/t_format_reward": 1.0, "step": 935 }, { "advantages": -4.3213367462158203e-07, "completion_length": 15.0, "epoch": 0.312, "grad_norm": 5.512158393859863, "kl": 0.203125, "learning_rate": 6.879999999999999e-07, "loss": 0.0081, "reward": 1.4418749809265137, "reward_mean": 1.4418749809265137, "reward_std": 0.010317395441234112, "rewards/iou_timestamp_reward": 0.44187498092651367, "rewards/t_format_reward": 1.0, "step": 936 }, { "advantages": 5.4016709327697754e-08, "completion_length": 75.3125, "epoch": 0.31233333333333335, "grad_norm": 5.732391357421875, "kl": 0.171875, "learning_rate": 6.876666666666666e-07, "loss": 0.0069, "reward": 0.2969972491264343, "reward_mean": 0.2969972491264343, "reward_std": 0.08506885915994644, "rewards/v_meteor_reward": 0.2969972491264343, "step": 937 }, { "advantages": 1.6763806343078613e-07, "completion_length": 76.5625, "epoch": 0.31266666666666665, "grad_norm": 5.773232936859131, "kl": 0.2314453125, "learning_rate": 6.873333333333334e-07, "loss": 0.0093, "reward": 0.3846186399459839, "reward_mean": 0.3846186399459839, "reward_std": 0.0866735428571701, "rewards/v_meteor_reward": 0.3846186399459839, "step": 938 }, { "advantages": 4.861503839492798e-07, "completion_length": 109.75, "epoch": 0.313, "grad_norm": 3.9448249340057373, "kl": 0.2421875, "learning_rate": 6.87e-07, "loss": 0.0097, "reward": 0.5463488101959229, "reward_mean": 0.5463488101959229, "reward_std": 0.10262935608625412, "rewards/a_meteor_reward": 0.5463488101959229, "step": 939 }, { "advantages": -1.0058283805847168e-07, "completion_length": 90.75, "epoch": 0.31333333333333335, "grad_norm": 5.473631858825684, "kl": 0.224609375, "learning_rate": 6.866666666666666e-07, "loss": 0.009, "reward": 0.42653268575668335, "reward_mean": 0.42653268575668335, "reward_std": 0.06846630573272705, "rewards/v_meteor_reward": 0.42653268575668335, "step": 940 }, { "advantages": 3.5390257835388184e-08, "completion_length": 232.75, "epoch": 0.31366666666666665, "grad_norm": 3.950439691543579, "kl": 0.2265625, "learning_rate": 6.863333333333333e-07, "loss": 0.009, "reward": 0.4602825939655304, "reward_mean": 0.4602825939655304, "reward_std": 0.07245359569787979, "rewards/a_meteor_reward": 0.4602825939655304, "step": 941 }, { "advantages": 2.0489096641540527e-08, "completion_length": 80.125, "epoch": 0.314, "grad_norm": 5.686972141265869, "kl": 0.244140625, "learning_rate": 6.86e-07, "loss": 0.0098, "reward": 0.34000879526138306, "reward_mean": 0.34000879526138306, "reward_std": 0.06459502875804901, "rewards/v_meteor_reward": 0.34000879526138306, "step": 942 }, { "advantages": 9.238719940185547e-07, "completion_length": 16.625, "epoch": 0.31433333333333335, "grad_norm": 20.742887496948242, "kl": 0.3828125, "learning_rate": 6.856666666666667e-07, "loss": 0.0153, "reward": 1.803245186805725, "reward_mean": 1.803245186805725, "reward_std": 0.10862436890602112, "rewards/iou_timestamp_reward": 0.8032451868057251, "rewards/t_format_reward": 1.0, "step": 943 }, { "advantages": 2.466142177581787e-06, "completion_length": 15.6875, "epoch": 0.31466666666666665, "grad_norm": 11.589685440063477, "kl": 0.2470703125, "learning_rate": 6.853333333333333e-07, "loss": 0.0099, "reward": 1.646568775177002, "reward_mean": 1.646568775177002, "reward_std": 0.022255681455135345, "rewards/iou_timestamp_reward": 0.6465688347816467, "rewards/t_format_reward": 1.0, "step": 944 }, { "advantages": 4.209578037261963e-07, "completion_length": 81.125, "epoch": 0.315, "grad_norm": 4.037908554077148, "kl": 0.283203125, "learning_rate": 6.85e-07, "loss": 0.0113, "reward": 0.7708792686462402, "reward_mean": 0.7708792686462402, "reward_std": 0.06078391149640083, "rewards/a_meteor_reward": 0.7708792686462402, "step": 945 }, { "advantages": -2.514570951461792e-08, "completion_length": 72.0, "epoch": 0.31533333333333335, "grad_norm": 5.4714460372924805, "kl": 0.189453125, "learning_rate": 6.846666666666666e-07, "loss": 0.0076, "reward": 0.34707802534103394, "reward_mean": 0.34707802534103394, "reward_std": 0.05319035053253174, "rewards/v_meteor_reward": 0.34707802534103394, "step": 946 }, { "advantages": -1.1548399925231934e-07, "completion_length": 82.9375, "epoch": 0.31566666666666665, "grad_norm": 5.219618797302246, "kl": 0.19921875, "learning_rate": 6.843333333333334e-07, "loss": 0.008, "reward": 0.4330146908760071, "reward_mean": 0.4330146908760071, "reward_std": 0.04958713799715042, "rewards/v_meteor_reward": 0.4330146908760071, "step": 947 }, { "advantages": -1.1920928955078125e-07, "completion_length": 67.1875, "epoch": 0.316, "grad_norm": 8.313613891601562, "kl": 0.3828125, "learning_rate": 6.84e-07, "loss": 0.0153, "reward": 0.3702094852924347, "reward_mean": 0.3702094852924347, "reward_std": 0.07267993688583374, "rewards/v_meteor_reward": 0.3702094852924347, "step": 948 }, { "advantages": 7.059425115585327e-07, "completion_length": 16.0, "epoch": 0.31633333333333336, "grad_norm": 14.168096542358398, "kl": 0.150390625, "learning_rate": 6.836666666666666e-07, "loss": 0.006, "reward": 1.819765567779541, "reward_mean": 1.819765567779541, "reward_std": 0.04263624548912048, "rewards/iou_timestamp_reward": 0.819765567779541, "rewards/t_format_reward": 1.0, "step": 949 }, { "advantages": -4.284083843231201e-07, "completion_length": 71.625, "epoch": 0.31666666666666665, "grad_norm": 5.1521124839782715, "kl": 0.361328125, "learning_rate": 6.833333333333333e-07, "loss": 0.0145, "reward": 0.8481192588806152, "reward_mean": 0.8481192588806152, "reward_std": 0.03073916956782341, "rewards/a_meteor_reward": 0.8481192588806152, "step": 950 }, { "advantages": 1.1548399925231934e-07, "completion_length": 91.5, "epoch": 0.317, "grad_norm": 5.446624279022217, "kl": 0.2275390625, "learning_rate": 6.830000000000001e-07, "loss": 0.0091, "reward": 0.3636730909347534, "reward_mean": 0.3636730909347534, "reward_std": 0.059629641473293304, "rewards/v_meteor_reward": 0.3636730909347534, "step": 951 }, { "advantages": -1.1511147022247314e-06, "completion_length": 16.5, "epoch": 0.31733333333333336, "grad_norm": 95.37347412109375, "kl": 0.421875, "learning_rate": 6.826666666666666e-07, "loss": 0.0168, "reward": 1.6160776615142822, "reward_mean": 1.6160776615142822, "reward_std": 0.05078674852848053, "rewards/iou_timestamp_reward": 0.616077721118927, "rewards/t_format_reward": 1.0, "step": 952 }, { "advantages": 9.080395102500916e-07, "completion_length": 16.5, "epoch": 0.31766666666666665, "grad_norm": 36.40129852294922, "kl": 0.2890625, "learning_rate": 6.823333333333333e-07, "loss": 0.0116, "reward": 1.7656797170639038, "reward_mean": 1.7656797170639038, "reward_std": 0.05595595762133598, "rewards/iou_timestamp_reward": 0.7656798362731934, "rewards/t_format_reward": 1.0, "step": 953 }, { "advantages": -3.427267074584961e-07, "completion_length": 15.75, "epoch": 0.318, "grad_norm": 7.249070644378662, "kl": 0.27734375, "learning_rate": 6.82e-07, "loss": 0.0111, "reward": 1.5763912200927734, "reward_mean": 1.5763912200927734, "reward_std": 0.014255214482545853, "rewards/iou_timestamp_reward": 0.5763912200927734, "rewards/t_format_reward": 1.0, "step": 954 }, { "advantages": -2.60770320892334e-07, "completion_length": 185.625, "epoch": 0.31833333333333336, "grad_norm": 3.3160860538482666, "kl": 0.166015625, "learning_rate": 6.816666666666666e-07, "loss": 0.0066, "reward": 0.5037153363227844, "reward_mean": 0.5037153363227844, "reward_std": 0.11776046454906464, "rewards/a_meteor_reward": 0.5037153363227844, "step": 955 }, { "advantages": 1.5795230865478516e-06, "completion_length": 15.75, "epoch": 0.31866666666666665, "grad_norm": 16.198463439941406, "kl": 0.298828125, "learning_rate": 6.813333333333333e-07, "loss": 0.0119, "reward": 1.8422157764434814, "reward_mean": 1.8422157764434814, "reward_std": 0.03796825185418129, "rewards/iou_timestamp_reward": 0.8422157764434814, "rewards/t_format_reward": 1.0, "step": 956 }, { "advantages": 1.862645149230957e-09, "completion_length": 15.1875, "epoch": 0.319, "grad_norm": 15.5491304397583, "kl": 0.1796875, "learning_rate": 6.81e-07, "loss": 0.0072, "reward": 1.6074602603912354, "reward_mean": 1.6074602603912354, "reward_std": 0.05464610084891319, "rewards/iou_timestamp_reward": 0.6074602603912354, "rewards/t_format_reward": 1.0, "step": 957 }, { "advantages": -1.1920928955078125e-07, "completion_length": 197.375, "epoch": 0.31933333333333336, "grad_norm": 4.553712844848633, "kl": 0.1796875, "learning_rate": 6.806666666666666e-07, "loss": 0.0072, "reward": 0.6235862970352173, "reward_mean": 0.6235862970352173, "reward_std": 0.06801548600196838, "rewards/a_meteor_reward": 0.6235862970352173, "step": 958 }, { "advantages": 3.5800039768218994e-06, "completion_length": 15.75, "epoch": 0.31966666666666665, "grad_norm": 25.484163284301758, "kl": 0.2578125, "learning_rate": 6.803333333333333e-07, "loss": 0.0103, "reward": 1.8683040142059326, "reward_mean": 1.8683040142059326, "reward_std": 0.11071425676345825, "rewards/iou_timestamp_reward": 0.8683041334152222, "rewards/t_format_reward": 1.0, "step": 959 }, { "advantages": -1.1175870895385742e-08, "completion_length": 87.75, "epoch": 0.32, "grad_norm": 4.861374855041504, "kl": 0.26953125, "learning_rate": 6.800000000000001e-07, "loss": 0.0108, "reward": 0.45387327671051025, "reward_mean": 0.45387327671051025, "reward_std": 0.0676662027835846, "rewards/v_meteor_reward": 0.45387327671051025, "step": 960 }, { "advantages": 1.3299286365509033e-06, "completion_length": 16.25, "epoch": 0.32033333333333336, "grad_norm": 21.81098747253418, "kl": 0.291015625, "learning_rate": 6.796666666666666e-07, "loss": 0.0116, "reward": 1.6896190643310547, "reward_mean": 1.6896190643310547, "reward_std": 0.07346543669700623, "rewards/iou_timestamp_reward": 0.6896191835403442, "rewards/t_format_reward": 1.0, "step": 961 }, { "advantages": 1.4901161193847656e-08, "completion_length": 53.25, "epoch": 0.32066666666666666, "grad_norm": 7.093238353729248, "kl": 0.2578125, "learning_rate": 6.793333333333333e-07, "loss": 0.0103, "reward": 0.4083383083343506, "reward_mean": 0.4083383083343506, "reward_std": 0.08475343883037567, "rewards/v_meteor_reward": 0.4083383083343506, "step": 962 }, { "advantages": -1.2665987014770508e-07, "completion_length": 80.625, "epoch": 0.321, "grad_norm": 5.463585376739502, "kl": 0.15625, "learning_rate": 6.79e-07, "loss": 0.0062, "reward": 0.3766896426677704, "reward_mean": 0.3766896426677704, "reward_std": 0.05124882608652115, "rewards/v_meteor_reward": 0.3766896426677704, "step": 963 }, { "advantages": -3.427267074584961e-07, "completion_length": 15.5, "epoch": 0.32133333333333336, "grad_norm": 7.535172939300537, "kl": 0.2421875, "learning_rate": 6.786666666666667e-07, "loss": 0.0097, "reward": 1.4437285661697388, "reward_mean": 1.4437285661697388, "reward_std": 0.02484966069459915, "rewards/iou_timestamp_reward": 0.44372856616973877, "rewards/t_format_reward": 1.0, "step": 964 }, { "advantages": -2.682209014892578e-07, "completion_length": 71.375, "epoch": 0.32166666666666666, "grad_norm": 7.0787787437438965, "kl": 0.578125, "learning_rate": 6.783333333333333e-07, "loss": 0.0231, "reward": 0.5652780532836914, "reward_mean": 0.5652780532836914, "reward_std": 0.07430665194988251, "rewards/a_meteor_reward": 0.5652780532836914, "step": 965 }, { "advantages": 1.2665987014770508e-07, "completion_length": 15.3125, "epoch": 0.322, "grad_norm": 14.194844245910645, "kl": 0.1923828125, "learning_rate": 6.78e-07, "loss": 0.0077, "reward": 1.9304561614990234, "reward_mean": 1.9304561614990234, "reward_std": 0.014306925237178802, "rewards/iou_timestamp_reward": 0.9304562211036682, "rewards/t_format_reward": 1.0, "step": 966 }, { "advantages": -5.271285772323608e-07, "completion_length": 16.75, "epoch": 0.32233333333333336, "grad_norm": 19.933320999145508, "kl": 0.25, "learning_rate": 6.776666666666666e-07, "loss": 0.0101, "reward": 1.5324115753173828, "reward_mean": 1.5324115753173828, "reward_std": 0.04669696092605591, "rewards/iou_timestamp_reward": 0.5324116349220276, "rewards/t_format_reward": 1.0, "step": 967 }, { "advantages": 1.1175870895385742e-07, "completion_length": 83.25, "epoch": 0.32266666666666666, "grad_norm": 5.872437477111816, "kl": 0.279296875, "learning_rate": 6.773333333333334e-07, "loss": 0.0112, "reward": 0.36221790313720703, "reward_mean": 0.36221790313720703, "reward_std": 0.07993382215499878, "rewards/v_meteor_reward": 0.36221790313720703, "step": 968 }, { "advantages": 3.632158041000366e-07, "completion_length": 89.125, "epoch": 0.323, "grad_norm": 6.466512203216553, "kl": 0.439453125, "learning_rate": 6.77e-07, "loss": 0.0176, "reward": 0.7150428295135498, "reward_mean": 0.7150428295135498, "reward_std": 0.04289707541465759, "rewards/a_meteor_reward": 0.7150428295135498, "step": 969 }, { "advantages": 8.568167686462402e-08, "completion_length": 88.5, "epoch": 0.3233333333333333, "grad_norm": 3.5718417167663574, "kl": 0.42578125, "learning_rate": 6.766666666666666e-07, "loss": 0.017, "reward": 0.7613562345504761, "reward_mean": 0.7613562345504761, "reward_std": 0.03836316615343094, "rewards/a_meteor_reward": 0.7613562345504761, "step": 970 }, { "advantages": -2.868473529815674e-07, "completion_length": 16.0, "epoch": 0.32366666666666666, "grad_norm": 16.171663284301758, "kl": 0.216796875, "learning_rate": 6.763333333333333e-07, "loss": 0.0087, "reward": 1.6778191328048706, "reward_mean": 1.6778191328048706, "reward_std": 0.14798519015312195, "rewards/iou_timestamp_reward": 0.6778191924095154, "rewards/t_format_reward": 1.0, "step": 971 }, { "advantages": 1.825392246246338e-07, "completion_length": 136.4375, "epoch": 0.324, "grad_norm": 4.895210266113281, "kl": 0.23828125, "learning_rate": 6.76e-07, "loss": 0.0095, "reward": 0.5354483723640442, "reward_mean": 0.5354483723640442, "reward_std": 0.14049826562404633, "rewards/a_meteor_reward": 0.5354483723640442, "step": 972 }, { "advantages": -6.07222318649292e-07, "completion_length": 15.75, "epoch": 0.3243333333333333, "grad_norm": 18.759124755859375, "kl": 0.1875, "learning_rate": 6.756666666666666e-07, "loss": 0.0075, "reward": 1.8442010879516602, "reward_mean": 1.8442010879516602, "reward_std": 0.03219473361968994, "rewards/iou_timestamp_reward": 0.8442010879516602, "rewards/t_format_reward": 1.0, "step": 973 }, { "advantages": -6.891787052154541e-08, "completion_length": 15.25, "epoch": 0.32466666666666666, "grad_norm": 13.99242115020752, "kl": 0.287109375, "learning_rate": 6.753333333333333e-07, "loss": 0.0115, "reward": 1.6960338354110718, "reward_mean": 1.6960338354110718, "reward_std": 0.10040191560983658, "rewards/iou_timestamp_reward": 0.696033775806427, "rewards/t_format_reward": 1.0, "step": 974 }, { "advantages": 1.7508864402770996e-07, "completion_length": 58.5625, "epoch": 0.325, "grad_norm": 6.202629089355469, "kl": 0.31640625, "learning_rate": 6.75e-07, "loss": 0.0127, "reward": 0.3805946707725525, "reward_mean": 0.3805946707725525, "reward_std": 0.0781036987900734, "rewards/v_meteor_reward": 0.3805946707725525, "step": 975 }, { "advantages": -9.313225746154785e-07, "completion_length": 15.3125, "epoch": 0.3253333333333333, "grad_norm": 23.036540985107422, "kl": 0.185546875, "learning_rate": 6.746666666666666e-07, "loss": 0.0074, "reward": 1.7370260953903198, "reward_mean": 1.7370260953903198, "reward_std": 0.03938998281955719, "rewards/iou_timestamp_reward": 0.7370260953903198, "rewards/t_format_reward": 1.0, "step": 976 }, { "advantages": -8.568167686462402e-08, "completion_length": 69.125, "epoch": 0.32566666666666666, "grad_norm": 7.340576171875, "kl": 0.39453125, "learning_rate": 6.743333333333333e-07, "loss": 0.0158, "reward": 0.8455600738525391, "reward_mean": 0.8455600738525391, "reward_std": 0.05348903685808182, "rewards/a_meteor_reward": 0.8455600738525391, "step": 977 }, { "advantages": -3.9301812648773193e-07, "completion_length": 15.5, "epoch": 0.326, "grad_norm": 6.837684631347656, "kl": 0.2451171875, "learning_rate": 6.74e-07, "loss": 0.0098, "reward": 1.648923635482788, "reward_mean": 1.648923635482788, "reward_std": 0.024312686175107956, "rewards/iou_timestamp_reward": 0.6489236354827881, "rewards/t_format_reward": 1.0, "step": 978 }, { "advantages": -3.725290298461914e-08, "completion_length": 76.9375, "epoch": 0.3263333333333333, "grad_norm": 5.471406936645508, "kl": 0.1962890625, "learning_rate": 6.736666666666666e-07, "loss": 0.0078, "reward": 0.3097687363624573, "reward_mean": 0.3097687363624573, "reward_std": 0.05711360275745392, "rewards/v_meteor_reward": 0.3097687363624573, "step": 979 }, { "advantages": -2.603977918624878e-06, "completion_length": 15.5, "epoch": 0.32666666666666666, "grad_norm": 8.67215347290039, "kl": 0.232421875, "learning_rate": 6.733333333333333e-07, "loss": 0.0093, "reward": 1.695502519607544, "reward_mean": 1.695502519607544, "reward_std": 0.01637156307697296, "rewards/iou_timestamp_reward": 0.6955026388168335, "rewards/t_format_reward": 1.0, "step": 980 }, { "advantages": -2.3469328880310059e-07, "completion_length": 76.625, "epoch": 0.327, "grad_norm": 3.8620574474334717, "kl": 0.25390625, "learning_rate": 6.730000000000001e-07, "loss": 0.0102, "reward": 0.6487276554107666, "reward_mean": 0.6487276554107666, "reward_std": 0.05783923715353012, "rewards/a_meteor_reward": 0.6487276554107666, "step": 981 }, { "advantages": 2.7567148208618164e-07, "completion_length": 15.125, "epoch": 0.3273333333333333, "grad_norm": 11.559318542480469, "kl": 0.30859375, "learning_rate": 6.726666666666666e-07, "loss": 0.0123, "reward": 1.787630558013916, "reward_mean": 1.787630558013916, "reward_std": 0.1530149281024933, "rewards/iou_timestamp_reward": 0.787630558013916, "rewards/t_format_reward": 1.0, "step": 982 }, { "advantages": -1.4901161193847656e-08, "completion_length": 97.5625, "epoch": 0.32766666666666666, "grad_norm": 4.592036724090576, "kl": 0.2470703125, "learning_rate": 6.723333333333333e-07, "loss": 0.0099, "reward": 0.6444642543792725, "reward_mean": 0.6444642543792725, "reward_std": 0.1021534726023674, "rewards/a_meteor_reward": 0.6444642543792725, "step": 983 }, { "advantages": 5.364418029785156e-07, "completion_length": 15.75, "epoch": 0.328, "grad_norm": 13.76136589050293, "kl": 0.232421875, "learning_rate": 6.72e-07, "loss": 0.0093, "reward": 1.7707041501998901, "reward_mean": 1.7707041501998901, "reward_std": 0.04544331505894661, "rewards/iou_timestamp_reward": 0.7707042098045349, "rewards/t_format_reward": 1.0, "step": 984 }, { "advantages": 2.3674219846725464e-06, "completion_length": 16.1875, "epoch": 0.3283333333333333, "grad_norm": 11.038873672485352, "kl": 0.2412109375, "learning_rate": 6.716666666666666e-07, "loss": 0.0097, "reward": 1.5478296279907227, "reward_mean": 1.5478296279907227, "reward_std": 0.033013686537742615, "rewards/iou_timestamp_reward": 0.5478296279907227, "rewards/t_format_reward": 1.0, "step": 985 }, { "advantages": -1.862645149230957e-08, "completion_length": 67.9375, "epoch": 0.32866666666666666, "grad_norm": 6.388054847717285, "kl": 0.296875, "learning_rate": 6.713333333333333e-07, "loss": 0.0118, "reward": 0.33248406648635864, "reward_mean": 0.33248406648635864, "reward_std": 0.06830742210149765, "rewards/v_meteor_reward": 0.33248406648635864, "step": 986 }, { "advantages": 5.918554961681366e-07, "completion_length": 66.5625, "epoch": 0.329, "grad_norm": 5.3667402267456055, "kl": 0.28515625, "learning_rate": 6.71e-07, "loss": 0.0115, "reward": 0.36214035749435425, "reward_mean": 0.36214035749435425, "reward_std": 0.06366724520921707, "rewards/v_meteor_reward": 0.36214035749435425, "step": 987 }, { "advantages": -2.9802322387695312e-08, "completion_length": 92.5625, "epoch": 0.3293333333333333, "grad_norm": 5.176222801208496, "kl": 0.208984375, "learning_rate": 6.706666666666666e-07, "loss": 0.0083, "reward": 0.38530054688453674, "reward_mean": 0.38530054688453674, "reward_std": 0.09331310540437698, "rewards/v_meteor_reward": 0.38530054688453674, "step": 988 }, { "advantages": -1.4528632164001465e-07, "completion_length": 51.3125, "epoch": 0.32966666666666666, "grad_norm": 4.891565799713135, "kl": 0.4609375, "learning_rate": 6.703333333333333e-07, "loss": 0.0185, "reward": 0.7967113256454468, "reward_mean": 0.7967113256454468, "reward_std": 0.05439518392086029, "rewards/a_meteor_reward": 0.7967113256454468, "step": 989 }, { "advantages": -1.0691583156585693e-06, "completion_length": 15.0, "epoch": 0.33, "grad_norm": 12.214875221252441, "kl": 0.2890625, "learning_rate": 6.7e-07, "loss": 0.0116, "reward": 1.8973770141601562, "reward_mean": 1.8973770141601562, "reward_std": 0.02159426361322403, "rewards/iou_timestamp_reward": 0.8973768949508667, "rewards/t_format_reward": 1.0, "step": 990 }, { "advantages": -1.5534460544586182e-06, "completion_length": 15.125, "epoch": 0.3303333333333333, "grad_norm": 9.688263893127441, "kl": 0.255859375, "learning_rate": 6.696666666666666e-07, "loss": 0.0103, "reward": 1.8325083255767822, "reward_mean": 1.8325083255767822, "reward_std": 0.03716354817152023, "rewards/iou_timestamp_reward": 0.8325082063674927, "rewards/t_format_reward": 1.0, "step": 991 }, { "advantages": -1.862645149230957e-07, "completion_length": 106.3125, "epoch": 0.33066666666666666, "grad_norm": 3.419588327407837, "kl": 0.1708984375, "learning_rate": 6.693333333333333e-07, "loss": 0.0068, "reward": 0.66767418384552, "reward_mean": 0.66767418384552, "reward_std": 0.08243440091609955, "rewards/a_meteor_reward": 0.66767418384552, "step": 992 }, { "advantages": -5.960464477539063e-08, "completion_length": 112.625, "epoch": 0.331, "grad_norm": 4.7214508056640625, "kl": 0.25, "learning_rate": 6.69e-07, "loss": 0.01, "reward": 0.5386037826538086, "reward_mean": 0.5386037826538086, "reward_std": 0.09456141293048859, "rewards/a_meteor_reward": 0.5386037826538086, "step": 993 }, { "advantages": 3.725290298461914e-09, "completion_length": 55.3125, "epoch": 0.3313333333333333, "grad_norm": 6.961907863616943, "kl": 0.26171875, "learning_rate": 6.686666666666666e-07, "loss": 0.0104, "reward": 0.32136833667755127, "reward_mean": 0.32136833667755127, "reward_std": 0.07450974732637405, "rewards/v_meteor_reward": 0.32136833667755127, "step": 994 }, { "advantages": 6.658956408500671e-06, "completion_length": 15.25, "epoch": 0.33166666666666667, "grad_norm": 15.203697204589844, "kl": 0.2373046875, "learning_rate": 6.683333333333333e-07, "loss": 0.0095, "reward": 1.8712260723114014, "reward_mean": 1.8712260723114014, "reward_std": 0.06623378396034241, "rewards/iou_timestamp_reward": 0.8712261319160461, "rewards/t_format_reward": 1.0, "step": 995 }, { "advantages": 2.421438694000244e-07, "completion_length": 82.9375, "epoch": 0.332, "grad_norm": 5.952075004577637, "kl": 0.181640625, "learning_rate": 6.68e-07, "loss": 0.0073, "reward": 0.35795313119888306, "reward_mean": 0.35795313119888306, "reward_std": 0.06435751914978027, "rewards/v_meteor_reward": 0.35795313119888306, "step": 996 }, { "advantages": 8.046627044677734e-07, "completion_length": 135.375, "epoch": 0.3323333333333333, "grad_norm": 6.696615219116211, "kl": 0.4609375, "learning_rate": 6.676666666666666e-07, "loss": 0.0185, "reward": 0.7549718618392944, "reward_mean": 0.7549718618392944, "reward_std": 0.03229795768857002, "rewards/a_meteor_reward": 0.7549718618392944, "step": 997 }, { "advantages": -1.903623342514038e-06, "completion_length": 15.8125, "epoch": 0.33266666666666667, "grad_norm": 18.94729232788086, "kl": 0.2353515625, "learning_rate": 6.673333333333334e-07, "loss": 0.0094, "reward": 1.5502386093139648, "reward_mean": 1.5502386093139648, "reward_std": 0.06595120579004288, "rewards/iou_timestamp_reward": 0.5502386093139648, "rewards/t_format_reward": 1.0, "step": 998 }, { "advantages": -1.7434358596801758e-06, "completion_length": 15.75, "epoch": 0.333, "grad_norm": 12.704301834106445, "kl": 0.380859375, "learning_rate": 6.67e-07, "loss": 0.0152, "reward": 1.957871675491333, "reward_mean": 1.957871675491333, "reward_std": 0.013820129446685314, "rewards/iou_timestamp_reward": 0.957871675491333, "rewards/t_format_reward": 1.0, "step": 999 }, { "advantages": 5.960464477539063e-08, "completion_length": 76.4375, "epoch": 0.3333333333333333, "grad_norm": 5.288830280303955, "kl": 0.265625, "learning_rate": 6.666666666666666e-07, "loss": 0.0106, "reward": 0.35228097438812256, "reward_mean": 0.35228097438812256, "reward_std": 0.06187451630830765, "rewards/v_meteor_reward": 0.35228097438812256, "step": 1000 } ], "logging_steps": 1.0, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }