diff --git "a/checkpoint-3138/trainer_state.json" "b/checkpoint-3138/trainer_state.json" deleted file mode 100644--- "a/checkpoint-3138/trainer_state.json" +++ /dev/null @@ -1,43965 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 3138, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "completion_length": 108.203125, - "epoch": 0.00031867431485022306, - "grad_norm": 29.355905532836914, - "kl": 0.0, - "learning_rate": 9.996813256851498e-07, - "loss": 0.0, - "reward": 0.990570604801178, - "reward_std": 0.5569084882736206, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 0.71875, - "rewards/iou_glue_reward": 0.13119560480117798, - "step": 1 - }, - { - "completion_length": 130.765625, - "epoch": 0.0006373486297004461, - "grad_norm": 54.51008987426758, - "kl": 0.00274658203125, - "learning_rate": 9.993626513702996e-07, - "loss": 0.0001, - "reward": 0.6362947225570679, - "reward_std": 0.46545886993408203, - "rewards/format_reward_tg": 0.484375, - "rewards/iou_timestamp_reward": 0.15191972255706787, - "rewards/pad": 0.0, - "step": 2 - }, - { - "completion_length": 106.125, - "epoch": 0.0009560229445506692, - "grad_norm": 27.533252716064453, - "kl": 0.006683349609375, - "learning_rate": 9.990439770554494e-07, - "loss": 0.0003, - "reward": 1.0761747360229492, - "reward_std": 0.4438322186470032, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 0.796875, - "rewards/tracking_iou_reward": 0.232424795627594, - "step": 3 - }, - { - "completion_length": 18.84375, - "epoch": 0.0012746972594008922, - "grad_norm": 27.014707565307617, - "kl": 0.00341796875, - "learning_rate": 9.98725302740599e-07, - "loss": 0.0001, - "reward": 1.1961885690689087, - "reward_std": 0.2257140427827835, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.2274385243654251, - "rewards/pad": 0.0, - "step": 4 - }, - { - "completion_length": 63.953125, - "epoch": 0.0015933715742511153, - "grad_norm": 45.08222579956055, - "kl": 0.0023345947265625, - "learning_rate": 9.984066284257488e-07, - "loss": 0.0001, - "reward": 1.518608570098877, - "reward_std": 0.3346985876560211, - "rewards/answer_reward": 0.328125, - "rewards/format_reward_gqa": 0.90625, - "rewards/iou_glue_reward": 0.2842335104942322, - "step": 5 - }, - { - "completion_length": 80.90625, - "epoch": 0.0019120458891013384, - "grad_norm": 14.024462699890137, - "kl": 0.0037841796875, - "learning_rate": 9.980879541108986e-07, - "loss": 0.0002, - "reward": 1.2835232019424438, - "reward_std": 0.21698495745658875, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 0.9375, - "rewards/tracking_iou_reward": 0.20539820194244385, - "step": 6 - }, - { - "completion_length": 74.609375, - "epoch": 0.0022307202039515616, - "grad_norm": 24.76462745666504, - "kl": 0.004638671875, - "learning_rate": 9.977692797960484e-07, - "loss": 0.0002, - "reward": 1.3342989683151245, - "reward_std": 0.33252787590026855, - "rewards/format_reward_tg": 0.953125, - "rewards/iou_timestamp_reward": 0.17804892361164093, - "rewards/pad": 0.203125, - "step": 7 - }, - { - "completion_length": 94.6875, - "epoch": 0.0025493945188017845, - "grad_norm": 111.181884765625, - "kl": 0.00567626953125, - "learning_rate": 9.974506054811982e-07, - "loss": 0.0002, - "reward": 1.2611994743347168, - "reward_std": 0.356869637966156, - "rewards/format_reward_tg": 0.890625, - "rewards/iou_timestamp_reward": 0.37057459354400635, - "rewards/pad": 0.0, - "step": 8 - }, - { - "completion_length": 92.203125, - "epoch": 0.0028680688336520078, - "grad_norm": 30.41423988342285, - "kl": 0.017578125, - "learning_rate": 9.97131931166348e-07, - "loss": 0.0007, - "reward": 1.3457612991333008, - "reward_std": 0.3058345317840576, - "rewards/format_reward_tg": 0.875, - "rewards/iou_timestamp_reward": 0.36138632893562317, - "rewards/pad": 0.109375, - "step": 9 - }, - { - "completion_length": 104.0625, - "epoch": 0.0031867431485022306, - "grad_norm": 44.20106887817383, - "kl": 0.006317138671875, - "learning_rate": 9.968132568514978e-07, - "loss": 0.0003, - "reward": 1.3545455932617188, - "reward_std": 0.3300294280052185, - "rewards/answer_reward": 0.1875, - "rewards/format_reward_gqa": 0.921875, - "rewards/iou_glue_reward": 0.2451706826686859, - "step": 10 - }, - { - "completion_length": 120.953125, - "epoch": 0.003505417463352454, - "grad_norm": 63.420772552490234, - "kl": 0.01275634765625, - "learning_rate": 9.964945825366476e-07, - "loss": 0.0005, - "reward": 1.15547776222229, - "reward_std": 0.2632618248462677, - "rewards/format_reward_tg": 0.90625, - "rewards/iou_timestamp_reward": 0.24922770261764526, - "rewards/pad": 0.0, - "step": 11 - }, - { - "completion_length": 124.828125, - "epoch": 0.0038240917782026767, - "grad_norm": 7.296823978424072, - "kl": 0.004913330078125, - "learning_rate": 9.961759082217972e-07, - "loss": 0.0002, - "reward": 1.311105728149414, - "reward_std": 0.36636123061180115, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 0.953125, - "rewards/tracking_iou_reward": 0.32673075795173645, - "step": 12 - }, - { - "completion_length": 75.421875, - "epoch": 0.0041427660930529, - "grad_norm": 32.08348083496094, - "kl": 0.017578125, - "learning_rate": 9.95857233906947e-07, - "loss": 0.0007, - "reward": 1.307035207748413, - "reward_std": 0.2346232533454895, - "rewards/format_reward_tg": 0.9375, - "rewards/iou_timestamp_reward": 0.27578526735305786, - "rewards/pad": 0.09375, - "step": 13 - }, - { - "completion_length": 120.078125, - "epoch": 0.004461440407903123, - "grad_norm": 17.275251388549805, - "kl": 0.01446533203125, - "learning_rate": 9.955385595920968e-07, - "loss": 0.0006, - "reward": 1.2805306911468506, - "reward_std": 0.2489503175020218, - "rewards/format_reward_tg": 0.953125, - "rewards/iou_timestamp_reward": 0.32740581035614014, - "rewards/pad": 0.0, - "step": 14 - }, - { - "completion_length": 99.359375, - "epoch": 0.004780114722753346, - "grad_norm": 17.721607208251953, - "kl": 0.009521484375, - "learning_rate": 9.952198852772466e-07, - "loss": 0.0004, - "reward": 1.4107003211975098, - "reward_std": 0.3282482922077179, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 0.96875, - "rewards/iou_glue_reward": 0.33257538080215454, - "step": 15 - }, - { - "completion_length": 41.578125, - "epoch": 0.005098789037603569, - "grad_norm": 17.887115478515625, - "kl": 0.0361328125, - "learning_rate": 9.949012109623964e-07, - "loss": 0.0014, - "reward": 1.4330815076828003, - "reward_std": 0.26210516691207886, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.3393315076828003, - "rewards/pad": 0.125, - "step": 16 - }, - { - "completion_length": 93.46875, - "epoch": 0.005417463352453792, - "grad_norm": 49.0717658996582, - "kl": 0.019287109375, - "learning_rate": 9.945825366475462e-07, - "loss": 0.0008, - "reward": 1.46000337600708, - "reward_std": 0.2627638578414917, - "rewards/format_reward_tg": 0.953125, - "rewards/iou_timestamp_reward": 0.2725033462047577, - "rewards/pad": 0.234375, - "step": 17 - }, - { - "completion_length": 128.984375, - "epoch": 0.0057361376673040155, - "grad_norm": 16.0430965423584, - "kl": 0.018798828125, - "learning_rate": 9.94263862332696e-07, - "loss": 0.0008, - "reward": 1.2777396440505981, - "reward_std": 0.3498499095439911, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 0.90625, - "rewards/tracking_iou_reward": 0.262114554643631, - "step": 18 - }, - { - "completion_length": 120.3125, - "epoch": 0.006054811982154238, - "grad_norm": 21.588294982910156, - "kl": 0.0830078125, - "learning_rate": 9.939451880178459e-07, - "loss": 0.0033, - "reward": 1.2359671592712402, - "reward_std": 0.22133731842041016, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.2672172784805298, - "rewards/pad": 0.0, - "step": 19 - }, - { - "completion_length": 117.171875, - "epoch": 0.006373486297004461, - "grad_norm": 9.763358116149902, - "kl": 0.0220947265625, - "learning_rate": 9.936265137029955e-07, - "loss": 0.0009, - "reward": 1.3035460710525513, - "reward_std": 0.17414405941963196, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.3191710114479065, - "rewards/pad": 0.0, - "step": 20 - }, - { - "completion_length": 143.859375, - "epoch": 0.006692160611854685, - "grad_norm": 22.552978515625, - "kl": 0.030517578125, - "learning_rate": 9.933078393881453e-07, - "loss": 0.0012, - "reward": 1.4276816844940186, - "reward_std": 0.2420690506696701, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.96875, - "rewards/tracking_iou_reward": 0.45893165469169617, - "step": 21 - }, - { - "completion_length": 116.59375, - "epoch": 0.007010834926704908, - "grad_norm": 19.098342895507812, - "kl": 0.04345703125, - "learning_rate": 9.92989165073295e-07, - "loss": 0.0017, - "reward": 1.4236531257629395, - "reward_std": 0.24127241969108582, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.31427815556526184, - "step": 22 - }, - { - "completion_length": 124.640625, - "epoch": 0.007329509241555131, - "grad_norm": 13.978249549865723, - "kl": 0.0301513671875, - "learning_rate": 9.926704907584449e-07, - "loss": 0.0012, - "reward": 1.263721227645874, - "reward_std": 0.2416178286075592, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.2637212872505188, - "step": 23 - }, - { - "completion_length": 95.109375, - "epoch": 0.0076481835564053535, - "grad_norm": 18.538372039794922, - "kl": 0.034912109375, - "learning_rate": 9.923518164435945e-07, - "loss": 0.0014, - "reward": 1.498464822769165, - "reward_std": 0.18835774064064026, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2640899121761322, - "rewards/pad": 0.234375, - "step": 24 - }, - { - "completion_length": 139.75, - "epoch": 0.007966857871255577, - "grad_norm": 15.361570358276367, - "kl": 0.017822265625, - "learning_rate": 9.920331421287443e-07, - "loss": 0.0007, - "reward": 1.37433922290802, - "reward_std": 0.13456077873706818, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37433922290802, - "step": 25 - }, - { - "completion_length": 71.703125, - "epoch": 0.0082855321861058, - "grad_norm": 192.22190856933594, - "kl": 0.0279541015625, - "learning_rate": 9.91714467813894e-07, - "loss": 0.0011, - "reward": 1.4138457775115967, - "reward_std": 0.19462621212005615, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2732207179069519, - "step": 26 - }, - { - "completion_length": 114.0, - "epoch": 0.008604206500956023, - "grad_norm": 39.867950439453125, - "kl": 0.05029296875, - "learning_rate": 9.91395793499044e-07, - "loss": 0.002, - "reward": 1.3192813396453857, - "reward_std": 0.20698723196983337, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.33490636944770813, - "step": 27 - }, - { - "completion_length": 69.296875, - "epoch": 0.008922880815806247, - "grad_norm": 25.988426208496094, - "kl": 0.06640625, - "learning_rate": 9.910771191841937e-07, - "loss": 0.0026, - "reward": 1.4162707328796387, - "reward_std": 0.16801214218139648, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41627073287963867, - "step": 28 - }, - { - "completion_length": 95.875, - "epoch": 0.009241555130656469, - "grad_norm": 11.73469352722168, - "kl": 0.04638671875, - "learning_rate": 9.907584448693435e-07, - "loss": 0.0019, - "reward": 1.5614410638809204, - "reward_std": 0.2746657133102417, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.3426910638809204, - "step": 29 - }, - { - "completion_length": 92.140625, - "epoch": 0.009560229445506692, - "grad_norm": 27.682270050048828, - "kl": 0.06494140625, - "learning_rate": 9.904397705544933e-07, - "loss": 0.0026, - "reward": 1.2468478679656982, - "reward_std": 0.19129014015197754, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.27809786796569824, - "rewards/pad": 0.0, - "step": 30 - }, - { - "completion_length": 94.25, - "epoch": 0.009878903760356916, - "grad_norm": 35.90079116821289, - "kl": 0.043212890625, - "learning_rate": 9.90121096239643e-07, - "loss": 0.0017, - "reward": 1.382009744644165, - "reward_std": 0.19762729108333588, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3038847744464874, - "step": 31 - }, - { - "completion_length": 118.140625, - "epoch": 0.010197578075207138, - "grad_norm": 15.5087251663208, - "kl": 0.0191650390625, - "learning_rate": 9.898024219247927e-07, - "loss": 0.0008, - "reward": 1.4223272800445557, - "reward_std": 0.18307654559612274, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.2973273694515228, - "step": 32 - }, - { - "completion_length": 94.015625, - "epoch": 0.010516252390057362, - "grad_norm": 18.41666030883789, - "kl": 0.0654296875, - "learning_rate": 9.894837476099425e-07, - "loss": 0.0026, - "reward": 1.390166163444519, - "reward_std": 0.11873508989810944, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39016619324684143, - "step": 33 - }, - { - "completion_length": 144.484375, - "epoch": 0.010834926704907584, - "grad_norm": 23.03544807434082, - "kl": 0.049560546875, - "learning_rate": 9.891650732950923e-07, - "loss": 0.002, - "reward": 1.2460135221481323, - "reward_std": 0.13103874027729034, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.26163846254348755, - "step": 34 - }, - { - "completion_length": 43.703125, - "epoch": 0.011153601019757807, - "grad_norm": 74.67973327636719, - "kl": 0.2021484375, - "learning_rate": 9.888463989802421e-07, - "loss": 0.0081, - "reward": 1.3883543014526367, - "reward_std": 0.2638249099254608, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4039793610572815, - "rewards/pad": 0.0, - "step": 35 - }, - { - "completion_length": 72.484375, - "epoch": 0.011472275334608031, - "grad_norm": 18.90091323852539, - "kl": 0.034912109375, - "learning_rate": 9.88527724665392e-07, - "loss": 0.0014, - "reward": 1.5052143335342407, - "reward_std": 0.19993306696414948, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.36458930373191833, - "step": 36 - }, - { - "completion_length": 93.5, - "epoch": 0.011790949649458253, - "grad_norm": 42.74220657348633, - "kl": 0.04931640625, - "learning_rate": 9.882090503505418e-07, - "loss": 0.002, - "reward": 1.364863634109497, - "reward_std": 0.1607465147972107, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.27111363410949707, - "rewards/pad": 0.09375, - "step": 37 - }, - { - "completion_length": 121.765625, - "epoch": 0.012109623964308477, - "grad_norm": 12.138788223266602, - "kl": 0.0322265625, - "learning_rate": 9.878903760356916e-07, - "loss": 0.0013, - "reward": 1.5847129821777344, - "reward_std": 0.12323808670043945, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3347129821777344, - "step": 38 - }, - { - "completion_length": 45.453125, - "epoch": 0.0124282982791587, - "grad_norm": 94.58944702148438, - "kl": 0.126953125, - "learning_rate": 9.875717017208412e-07, - "loss": 0.0051, - "reward": 1.4467828273773193, - "reward_std": 0.22572225332260132, - "rewards/answer_reward": 0.046875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.39990779757499695, - "step": 39 - }, - { - "completion_length": 93.390625, - "epoch": 0.012746972594008922, - "grad_norm": 26.145246505737305, - "kl": 0.053466796875, - "learning_rate": 9.87253027405991e-07, - "loss": 0.0021, - "reward": 1.4706435203552246, - "reward_std": 0.1406538188457489, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.470643550157547, - "step": 40 - }, - { - "completion_length": 45.984375, - "epoch": 0.013065646908859146, - "grad_norm": 76.75983428955078, - "kl": 0.0732421875, - "learning_rate": 9.869343530911408e-07, - "loss": 0.0029, - "reward": 1.4271475076675415, - "reward_std": 0.16546779870986938, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3021474778652191, - "rewards/pad": 0.125, - "step": 41 - }, - { - "completion_length": 73.953125, - "epoch": 0.01338432122370937, - "grad_norm": 21.900846481323242, - "kl": 0.058349609375, - "learning_rate": 9.866156787762906e-07, - "loss": 0.0023, - "reward": 1.5581902265548706, - "reward_std": 0.10484491288661957, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.30819016695022583, - "step": 42 - }, - { - "completion_length": 71.515625, - "epoch": 0.013702995538559592, - "grad_norm": 36.57977294921875, - "kl": 0.06689453125, - "learning_rate": 9.862970044614404e-07, - "loss": 0.0027, - "reward": 1.5820021629333496, - "reward_std": 0.21441596746444702, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.36325210332870483, - "step": 43 - }, - { - "completion_length": 146.640625, - "epoch": 0.014021669853409816, - "grad_norm": 18.229549407958984, - "kl": 0.0289306640625, - "learning_rate": 9.859783301465902e-07, - "loss": 0.0012, - "reward": 1.2833337783813477, - "reward_std": 0.10058476030826569, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.28333374857902527, - "rewards/pad": 0.0, - "step": 44 - }, - { - "completion_length": 70.28125, - "epoch": 0.014340344168260038, - "grad_norm": 15.984444618225098, - "kl": 0.06640625, - "learning_rate": 9.8565965583174e-07, - "loss": 0.0026, - "reward": 1.5217663049697876, - "reward_std": 0.21203118562698364, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.537391185760498, - "rewards/pad": 0.0, - "step": 45 - }, - { - "completion_length": 70.953125, - "epoch": 0.014659018483110261, - "grad_norm": 24.116294860839844, - "kl": 0.054443359375, - "learning_rate": 9.853409815168898e-07, - "loss": 0.0022, - "reward": 1.4242897033691406, - "reward_std": 0.1250244379043579, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3149147033691406, - "rewards/pad": 0.109375, - "step": 46 - }, - { - "completion_length": 96.171875, - "epoch": 0.014977692797960485, - "grad_norm": 10.824599266052246, - "kl": 0.04931640625, - "learning_rate": 9.850223072020394e-07, - "loss": 0.002, - "reward": 1.3301568031311035, - "reward_std": 0.09323126077651978, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3301568627357483, - "rewards/pad": 0.0, - "step": 47 - }, - { - "completion_length": 69.328125, - "epoch": 0.015296367112810707, - "grad_norm": 54.09326171875, - "kl": 0.0859375, - "learning_rate": 9.847036328871892e-07, - "loss": 0.0034, - "reward": 1.5022802352905273, - "reward_std": 0.11983383446931839, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5022802352905273, - "rewards/pad": 0.0, - "step": 48 - }, - { - "completion_length": 94.390625, - "epoch": 0.01561504142766093, - "grad_norm": 20.41078758239746, - "kl": 0.04345703125, - "learning_rate": 9.84384958572339e-07, - "loss": 0.0017, - "reward": 1.5264580249786377, - "reward_std": 0.11588997393846512, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5264579653739929, - "step": 49 - }, - { - "completion_length": 70.96875, - "epoch": 0.015933715742511154, - "grad_norm": 11.669675827026367, - "kl": 0.052001953125, - "learning_rate": 9.840662842574888e-07, - "loss": 0.0021, - "reward": 1.5469558238983154, - "reward_std": 0.1315864473581314, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5469557642936707, - "rewards/pad": 0.0, - "step": 50 - }, - { - "completion_length": 44.578125, - "epoch": 0.016252390057361378, - "grad_norm": 49.088687896728516, - "kl": 0.05322265625, - "learning_rate": 9.837476099426386e-07, - "loss": 0.0021, - "reward": 1.5957996845245361, - "reward_std": 0.13811209797859192, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47079968452453613, - "rewards/pad": 0.125, - "step": 51 - }, - { - "completion_length": 146.546875, - "epoch": 0.0165710643722116, - "grad_norm": 7.562353610992432, - "kl": 0.02490234375, - "learning_rate": 9.834289356277884e-07, - "loss": 0.001, - "reward": 1.329128623008728, - "reward_std": 0.17861336469650269, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.360378623008728, - "rewards/pad": 0.0, - "step": 52 - }, - { - "completion_length": 120.109375, - "epoch": 0.016889738687061822, - "grad_norm": 20.31233024597168, - "kl": 0.048583984375, - "learning_rate": 9.831102613129382e-07, - "loss": 0.0019, - "reward": 1.5360966920852661, - "reward_std": 0.195622980594635, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4735966622829437, - "step": 53 - }, - { - "completion_length": 95.796875, - "epoch": 0.017208413001912046, - "grad_norm": 18.229900360107422, - "kl": 0.048583984375, - "learning_rate": 9.82791586998088e-07, - "loss": 0.0019, - "reward": 1.4456595182418823, - "reward_std": 0.1510319858789444, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.38315948843955994, - "rewards/pad": 0.0625, - "step": 54 - }, - { - "completion_length": 71.78125, - "epoch": 0.01752708731676227, - "grad_norm": 41.68214416503906, - "kl": 0.1240234375, - "learning_rate": 9.824729126832376e-07, - "loss": 0.0049, - "reward": 1.5753334760665894, - "reward_std": 0.12371576577425003, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4503334164619446, - "rewards/pad": 0.125, - "step": 55 - }, - { - "completion_length": 45.34375, - "epoch": 0.017845761631612493, - "grad_norm": 19.551204681396484, - "kl": 0.0927734375, - "learning_rate": 9.821542383683875e-07, - "loss": 0.0037, - "reward": 1.3601016998291016, - "reward_std": 0.09858176857233047, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.36010172963142395, - "rewards/pad": 0.0, - "step": 56 - }, - { - "completion_length": 132.375, - "epoch": 0.018164435946462717, - "grad_norm": 25.195838928222656, - "kl": 0.039794921875, - "learning_rate": 9.818355640535373e-07, - "loss": 0.0016, - "reward": 1.487829566001892, - "reward_std": 0.11461742967367172, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3784545361995697, - "step": 57 - }, - { - "completion_length": 97.546875, - "epoch": 0.018483110261312937, - "grad_norm": 38.37260818481445, - "kl": 0.0478515625, - "learning_rate": 9.81516889738687e-07, - "loss": 0.0019, - "reward": 1.5829166173934937, - "reward_std": 0.1692945510149002, - "rewards/pad": 0.15625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4266665577888489, - "step": 58 - }, - { - "completion_length": 97.65625, - "epoch": 0.01880178457616316, - "grad_norm": 28.56045150756836, - "kl": 0.0625, - "learning_rate": 9.811982154238367e-07, - "loss": 0.0025, - "reward": 1.3472189903259277, - "reward_std": 0.22873595356941223, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.31596893072128296, - "step": 59 - }, - { - "completion_length": 150.375, - "epoch": 0.019120458891013385, - "grad_norm": 9.740062713623047, - "kl": 0.05224609375, - "learning_rate": 9.808795411089865e-07, - "loss": 0.0021, - "reward": 1.4982421398162842, - "reward_std": 0.14206960797309875, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.40449216961860657, - "step": 60 - }, - { - "completion_length": 149.0, - "epoch": 0.019439133205863608, - "grad_norm": 13.651617050170898, - "kl": 0.037841796875, - "learning_rate": 9.805608667941363e-07, - "loss": 0.0015, - "reward": 1.5308611392974854, - "reward_std": 0.1005527451634407, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43711116909980774, - "step": 61 - }, - { - "completion_length": 96.15625, - "epoch": 0.019757807520713832, - "grad_norm": 17.998046875, - "kl": 0.0810546875, - "learning_rate": 9.80242192479286e-07, - "loss": 0.0032, - "reward": 1.4157941341400146, - "reward_std": 0.08607807010412216, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41579413414001465, - "rewards/pad": 0.0, - "step": 62 - }, - { - "completion_length": 71.890625, - "epoch": 0.020076481835564052, - "grad_norm": 137.67347717285156, - "kl": 0.1318359375, - "learning_rate": 9.799235181644359e-07, - "loss": 0.0053, - "reward": 1.411577582359314, - "reward_std": 0.1643928587436676, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3803275525569916, - "rewards/pad": 0.03125, - "step": 63 - }, - { - "completion_length": 95.40625, - "epoch": 0.020395156150414276, - "grad_norm": 12.385643005371094, - "kl": 0.059814453125, - "learning_rate": 9.796048438495857e-07, - "loss": 0.0024, - "reward": 1.5682718753814697, - "reward_std": 0.17435622215270996, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4901469349861145, - "step": 64 - }, - { - "completion_length": 97.96875, - "epoch": 0.0207138304652645, - "grad_norm": 29.116153717041016, - "kl": 0.10791015625, - "learning_rate": 9.792861695347355e-07, - "loss": 0.0043, - "reward": 1.6022427082061768, - "reward_std": 0.10109306126832962, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47724267840385437, - "rewards/pad": 0.125, - "step": 65 - }, - { - "completion_length": 96.171875, - "epoch": 0.021032504780114723, - "grad_norm": 24.117246627807617, - "kl": 0.09423828125, - "learning_rate": 9.78967495219885e-07, - "loss": 0.0038, - "reward": 1.3648128509521484, - "reward_std": 0.09639433771371841, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3648129105567932, - "rewards/pad": 0.0, - "step": 66 - }, - { - "completion_length": 69.65625, - "epoch": 0.021351179094964947, - "grad_norm": 89.97595977783203, - "kl": 0.09228515625, - "learning_rate": 9.78648820905035e-07, - "loss": 0.0037, - "reward": 1.284343957901001, - "reward_std": 0.14161378145217896, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.2999690771102905, - "step": 67 - }, - { - "completion_length": 74.125, - "epoch": 0.021669853409815167, - "grad_norm": 28.639371871948242, - "kl": 0.0859375, - "learning_rate": 9.783301465901847e-07, - "loss": 0.0034, - "reward": 1.4897210597991943, - "reward_std": 0.1362878680229187, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47409600019454956, - "step": 68 - }, - { - "completion_length": 97.53125, - "epoch": 0.02198852772466539, - "grad_norm": 39.00709533691406, - "kl": 0.09814453125, - "learning_rate": 9.780114722753345e-07, - "loss": 0.0039, - "reward": 1.5572547912597656, - "reward_std": 0.0959487184882164, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5572547912597656, - "rewards/pad": 0.0, - "step": 69 - }, - { - "completion_length": 147.46875, - "epoch": 0.022307202039515615, - "grad_norm": 24.02517318725586, - "kl": 0.0537109375, - "learning_rate": 9.776927979604843e-07, - "loss": 0.0022, - "reward": 1.6853044033050537, - "reward_std": 0.06498823314905167, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4353044033050537, - "step": 70 - }, - { - "completion_length": 124.421875, - "epoch": 0.02262587635436584, - "grad_norm": 12.759849548339844, - "kl": 0.0771484375, - "learning_rate": 9.773741236456341e-07, - "loss": 0.0031, - "reward": 1.4843802452087402, - "reward_std": 0.07213138788938522, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4843802750110626, - "rewards/pad": 0.0, - "step": 71 - }, - { - "completion_length": 125.28125, - "epoch": 0.022944550669216062, - "grad_norm": 33.39959716796875, - "kl": 0.0859375, - "learning_rate": 9.77055449330784e-07, - "loss": 0.0035, - "reward": 1.5397720336914062, - "reward_std": 0.07022380828857422, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4147719740867615, - "step": 72 - }, - { - "completion_length": 121.890625, - "epoch": 0.023263224984066286, - "grad_norm": 17.097454071044922, - "kl": 0.0537109375, - "learning_rate": 9.767367750159337e-07, - "loss": 0.0022, - "reward": 1.3611290454864502, - "reward_std": 0.07813698053359985, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3611290752887726, - "step": 73 - }, - { - "completion_length": 73.34375, - "epoch": 0.023581899298916506, - "grad_norm": 32.879371643066406, - "kl": 0.07177734375, - "learning_rate": 9.764181007010833e-07, - "loss": 0.0029, - "reward": 1.70439875125885, - "reward_std": 0.1615179181098938, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4543987810611725, - "step": 74 - }, - { - "completion_length": 178.828125, - "epoch": 0.02390057361376673, - "grad_norm": 42.670570373535156, - "kl": 0.031982421875, - "learning_rate": 9.760994263862331e-07, - "loss": 0.0013, - "reward": 1.333021879196167, - "reward_std": 0.05501440539956093, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.33302196860313416, - "step": 75 - }, - { - "completion_length": 147.0625, - "epoch": 0.024219247928616953, - "grad_norm": 7.011380195617676, - "kl": 0.057861328125, - "learning_rate": 9.75780752071383e-07, - "loss": 0.0023, - "reward": 1.564239740371704, - "reward_std": 0.06192938610911369, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4392397403717041, - "step": 76 - }, - { - "completion_length": 98.046875, - "epoch": 0.024537922243467177, - "grad_norm": 18.060924530029297, - "kl": 0.0810546875, - "learning_rate": 9.754620777565328e-07, - "loss": 0.0032, - "reward": 1.668859839439392, - "reward_std": 0.1618225872516632, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4344848394393921, - "step": 77 - }, - { - "completion_length": 72.84375, - "epoch": 0.0248565965583174, - "grad_norm": 108.19387817382812, - "kl": 0.0908203125, - "learning_rate": 9.751434034416826e-07, - "loss": 0.0036, - "reward": 1.6406117677688599, - "reward_std": 0.10990151762962341, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40623676776885986, - "rewards/pad": 0.234375, - "step": 78 - }, - { - "completion_length": 18.921875, - "epoch": 0.02517527087316762, - "grad_norm": 44.06772232055664, - "kl": 0.1240234375, - "learning_rate": 9.748247291268324e-07, - "loss": 0.005, - "reward": 1.5604881048202515, - "reward_std": 0.13976165652275085, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43548810482025146, - "rewards/pad": 0.125, - "step": 79 - }, - { - "completion_length": 46.390625, - "epoch": 0.025493945188017845, - "grad_norm": 43.962501525878906, - "kl": 0.09326171875, - "learning_rate": 9.745060548119822e-07, - "loss": 0.0037, - "reward": 1.5611708164215088, - "reward_std": 0.10798977315425873, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43617090582847595, - "rewards/pad": 0.125, - "step": 80 - }, - { - "completion_length": 18.734375, - "epoch": 0.02581261950286807, - "grad_norm": 21.721975326538086, - "kl": 0.08203125, - "learning_rate": 9.74187380497132e-07, - "loss": 0.0033, - "reward": 1.5381839275360107, - "reward_std": 0.10050734877586365, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5381839871406555, - "step": 81 - }, - { - "completion_length": 70.0, - "epoch": 0.026131293817718292, - "grad_norm": 57.080387115478516, - "kl": 0.09912109375, - "learning_rate": 9.738687061822816e-07, - "loss": 0.004, - "reward": 1.4988303184509277, - "reward_std": 0.10620397329330444, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4988303780555725, - "rewards/pad": 0.0, - "step": 82 - }, - { - "completion_length": 125.953125, - "epoch": 0.026449968132568516, - "grad_norm": 89.73050689697266, - "kl": 0.0615234375, - "learning_rate": 9.735500318674314e-07, - "loss": 0.0025, - "reward": 1.4511033296585083, - "reward_std": 0.06983766704797745, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4511033296585083, - "rewards/pad": 0.0, - "step": 83 - }, - { - "completion_length": 68.9375, - "epoch": 0.02676864244741874, - "grad_norm": 52.88443374633789, - "kl": 0.1171875, - "learning_rate": 9.732313575525812e-07, - "loss": 0.0047, - "reward": 1.5206553936004639, - "reward_std": 0.07981610298156738, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3956553339958191, - "step": 84 - }, - { - "completion_length": 124.8125, - "epoch": 0.02708731676226896, - "grad_norm": 30.62926483154297, - "kl": 0.06982421875, - "learning_rate": 9.72912683237731e-07, - "loss": 0.0028, - "reward": 1.3461110591888428, - "reward_std": 0.07935292273759842, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34611111879348755, - "rewards/pad": 0.0, - "step": 85 - }, - { - "completion_length": 97.4375, - "epoch": 0.027405991077119184, - "grad_norm": 21.218219757080078, - "kl": 0.09326171875, - "learning_rate": 9.725940089228808e-07, - "loss": 0.0037, - "reward": 1.3955249786376953, - "reward_std": 0.06659087538719177, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.39552485942840576, - "rewards/pad": 0.0, - "step": 86 - }, - { - "completion_length": 45.296875, - "epoch": 0.027724665391969407, - "grad_norm": 34.96338653564453, - "kl": 0.14453125, - "learning_rate": 9.722753346080306e-07, - "loss": 0.0058, - "reward": 1.6848323345184326, - "reward_std": 0.09776704013347626, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6848323941230774, - "step": 87 - }, - { - "completion_length": 43.28125, - "epoch": 0.02804333970681963, - "grad_norm": 39.88020324707031, - "kl": 0.1259765625, - "learning_rate": 9.719566602931804e-07, - "loss": 0.005, - "reward": 1.414764404296875, - "reward_std": 0.12384898960590363, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4147644639015198, - "rewards/pad": 0.0, - "step": 88 - }, - { - "completion_length": 72.03125, - "epoch": 0.028362014021669855, - "grad_norm": 75.94140625, - "kl": 0.1044921875, - "learning_rate": 9.716379859783302e-07, - "loss": 0.0042, - "reward": 1.4549014568328857, - "reward_std": 0.10878540575504303, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4549015164375305, - "step": 89 - }, - { - "completion_length": 45.140625, - "epoch": 0.028680688336520075, - "grad_norm": 31.220703125, - "kl": 0.1328125, - "learning_rate": 9.713193116634798e-07, - "loss": 0.0053, - "reward": 1.4554842710494995, - "reward_std": 0.12406035512685776, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4554842710494995, - "step": 90 - }, - { - "completion_length": 96.9375, - "epoch": 0.0289993626513703, - "grad_norm": 17.68728256225586, - "kl": 0.1064453125, - "learning_rate": 9.710006373486296e-07, - "loss": 0.0043, - "reward": 1.5634052753448486, - "reward_std": 0.12161899358034134, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5634053349494934, - "rewards/pad": 0.0, - "step": 91 - }, - { - "completion_length": 73.71875, - "epoch": 0.029318036966220522, - "grad_norm": 32.400978088378906, - "kl": 0.138671875, - "learning_rate": 9.706819630337794e-07, - "loss": 0.0056, - "reward": 1.432898759841919, - "reward_std": 0.17899563908576965, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35477375984191895, - "rewards/pad": 0.078125, - "step": 92 - }, - { - "completion_length": 125.6875, - "epoch": 0.029636711281070746, - "grad_norm": 47.505828857421875, - "kl": 0.08740234375, - "learning_rate": 9.703632887189293e-07, - "loss": 0.0035, - "reward": 1.3663368225097656, - "reward_std": 0.02895917370915413, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3663369119167328, - "rewards/pad": 0.0, - "step": 93 - }, - { - "completion_length": 72.234375, - "epoch": 0.02995538559592097, - "grad_norm": 66.74494934082031, - "kl": 0.10791015625, - "learning_rate": 9.70044614404079e-07, - "loss": 0.0043, - "reward": 1.5021812915802002, - "reward_std": 0.048981934785842896, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.37718141078948975, - "step": 94 - }, - { - "completion_length": 70.796875, - "epoch": 0.030274059910771194, - "grad_norm": 99.29576110839844, - "kl": 0.0859375, - "learning_rate": 9.697259400892289e-07, - "loss": 0.0034, - "reward": 1.5479052066802979, - "reward_std": 0.14039847254753113, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5479051470756531, - "rewards/pad": 0.0, - "step": 95 - }, - { - "completion_length": 18.96875, - "epoch": 0.030592734225621414, - "grad_norm": 39.55213165283203, - "kl": 0.1103515625, - "learning_rate": 9.694072657743787e-07, - "loss": 0.0044, - "reward": 1.5478798151016235, - "reward_std": 0.14118030667304993, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43850481510162354, - "rewards/pad": 0.109375, - "step": 96 - }, - { - "completion_length": 98.5625, - "epoch": 0.030911408540471638, - "grad_norm": 14.295442581176758, - "kl": 0.0986328125, - "learning_rate": 9.690885914595283e-07, - "loss": 0.004, - "reward": 1.566028118133545, - "reward_std": 0.06352254748344421, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44102805852890015, - "step": 97 - }, - { - "completion_length": 43.828125, - "epoch": 0.03123008285532186, - "grad_norm": 18.267284393310547, - "kl": 0.1201171875, - "learning_rate": 9.68769917144678e-07, - "loss": 0.0048, - "reward": 1.4908885955810547, - "reward_std": 0.0876583606004715, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49088847637176514, - "rewards/pad": 0.0, - "step": 98 - }, - { - "completion_length": 125.5625, - "epoch": 0.03154875717017208, - "grad_norm": 92.24208068847656, - "kl": 0.0576171875, - "learning_rate": 9.684512428298279e-07, - "loss": 0.0023, - "reward": 1.5934380292892456, - "reward_std": 0.05329665541648865, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4684380292892456, - "rewards/pad": 0.125, - "step": 99 - }, - { - "completion_length": 45.984375, - "epoch": 0.03186743148502231, - "grad_norm": 182.80059814453125, - "kl": 0.1005859375, - "learning_rate": 9.681325685149777e-07, - "loss": 0.004, - "reward": 1.5487216711044312, - "reward_std": 0.16690734028816223, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5018466711044312, - "rewards/pad": 0.046875, - "step": 100 - }, - { - "completion_length": 96.59375, - "epoch": 0.03218610579987253, - "grad_norm": 47.996280670166016, - "kl": 0.08544921875, - "learning_rate": 9.678138942001273e-07, - "loss": 0.0034, - "reward": 1.4153481721878052, - "reward_std": 0.11429637670516968, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.415348082780838, - "rewards/pad": 0.0, - "step": 101 - }, - { - "completion_length": 73.4375, - "epoch": 0.032504780114722756, - "grad_norm": 129.4193878173828, - "kl": 0.09765625, - "learning_rate": 9.67495219885277e-07, - "loss": 0.0039, - "reward": 1.408902883529663, - "reward_std": 0.10345868021249771, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2995278239250183, - "rewards/pad": 0.109375, - "step": 102 - }, - { - "completion_length": 98.625, - "epoch": 0.032823454429572976, - "grad_norm": 26.774145126342773, - "kl": 0.072265625, - "learning_rate": 9.67176545570427e-07, - "loss": 0.0029, - "reward": 1.6206049919128418, - "reward_std": 0.0844387486577034, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4956050515174866, - "step": 103 - }, - { - "completion_length": 96.5625, - "epoch": 0.0331421287444232, - "grad_norm": 105.86109161376953, - "kl": 0.08984375, - "learning_rate": 9.668578712555767e-07, - "loss": 0.0036, - "reward": 1.3795945644378662, - "reward_std": 0.06641143560409546, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.25459450483322144, - "step": 104 - }, - { - "completion_length": 99.796875, - "epoch": 0.033460803059273424, - "grad_norm": 13.890254020690918, - "kl": 0.08984375, - "learning_rate": 9.665391969407265e-07, - "loss": 0.0036, - "reward": 1.606649398803711, - "reward_std": 0.13339820504188538, - "rewards/answer_reward": 0.15625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.45039939880371094, - "step": 105 - }, - { - "completion_length": 147.921875, - "epoch": 0.033779477374123644, - "grad_norm": 7.908030033111572, - "kl": 0.05419921875, - "learning_rate": 9.662205226258763e-07, - "loss": 0.0022, - "reward": 1.4531539678573608, - "reward_std": 0.027786817401647568, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4531539976596832, - "step": 106 - }, - { - "completion_length": 72.859375, - "epoch": 0.03409815168897387, - "grad_norm": 32.176780700683594, - "kl": 0.173828125, - "learning_rate": 9.659018483110261e-07, - "loss": 0.0069, - "reward": 1.6003332138061523, - "reward_std": 0.11983513832092285, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47533318400382996, - "rewards/pad": 0.125, - "step": 107 - }, - { - "completion_length": 151.421875, - "epoch": 0.03441682600382409, - "grad_norm": 50.097869873046875, - "kl": 0.054443359375, - "learning_rate": 9.65583173996176e-07, - "loss": 0.0022, - "reward": 1.3763339519500732, - "reward_std": 0.08023016154766083, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3763338625431061, - "rewards/pad": 0.0, - "step": 108 - }, - { - "completion_length": 74.28125, - "epoch": 0.03473550031867431, - "grad_norm": 54.777732849121094, - "kl": 0.06689453125, - "learning_rate": 9.652644996813255e-07, - "loss": 0.0027, - "reward": 1.739059329032898, - "reward_std": 0.11556351184844971, - "rewards/answer_reward": 0.265625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.47343435883522034, - "step": 109 - }, - { - "completion_length": 45.03125, - "epoch": 0.03505417463352454, - "grad_norm": 51.98408508300781, - "kl": 0.1953125, - "learning_rate": 9.649458253664753e-07, - "loss": 0.0078, - "reward": 1.497527837753296, - "reward_std": 0.11776410043239594, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4975278973579407, - "rewards/pad": 0.0, - "step": 110 - }, - { - "completion_length": 97.546875, - "epoch": 0.03537284894837476, - "grad_norm": 193.61141967773438, - "kl": 0.10791015625, - "learning_rate": 9.646271510516251e-07, - "loss": 0.0043, - "reward": 1.5598293542861938, - "reward_std": 0.07324188947677612, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5598292946815491, - "step": 111 - }, - { - "completion_length": 47.328125, - "epoch": 0.035691523263224986, - "grad_norm": 27.857454299926758, - "kl": 0.10595703125, - "learning_rate": 9.64308476736775e-07, - "loss": 0.0042, - "reward": 1.9212465286254883, - "reward_std": 0.12968027591705322, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5462466478347778, - "step": 112 - }, - { - "completion_length": 124.796875, - "epoch": 0.036010197578075206, - "grad_norm": 18.940677642822266, - "kl": 0.0732421875, - "learning_rate": 9.639898024219248e-07, - "loss": 0.0029, - "reward": 1.4641155004501343, - "reward_std": 0.09638328850269318, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3391154706478119, - "step": 113 - }, - { - "completion_length": 97.8125, - "epoch": 0.036328871892925434, - "grad_norm": 31.145307540893555, - "kl": 0.0908203125, - "learning_rate": 9.636711281070746e-07, - "loss": 0.0036, - "reward": 1.6159067153930664, - "reward_std": 0.07679077982902527, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6159066557884216, - "rewards/pad": 0.0, - "step": 114 - }, - { - "completion_length": 100.859375, - "epoch": 0.036647546207775654, - "grad_norm": 26.970640182495117, - "kl": 0.0849609375, - "learning_rate": 9.633524537922244e-07, - "loss": 0.0034, - "reward": 1.561607003211975, - "reward_std": 0.1283750832080841, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43660691380500793, - "step": 115 - }, - { - "completion_length": 73.5, - "epoch": 0.036966220522625874, - "grad_norm": 19.1359920501709, - "kl": 0.0927734375, - "learning_rate": 9.630337794773742e-07, - "loss": 0.0037, - "reward": 1.2844882011413574, - "reward_std": 0.15992632508277893, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.22198832035064697, - "step": 116 - }, - { - "completion_length": 118.59375, - "epoch": 0.0372848948374761, - "grad_norm": 30.564661026000977, - "kl": 0.0966796875, - "learning_rate": 9.627151051625238e-07, - "loss": 0.0039, - "reward": 1.3892366886138916, - "reward_std": 0.09500478953123093, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.389236718416214, - "rewards/pad": 0.0, - "step": 117 - }, - { - "completion_length": 97.96875, - "epoch": 0.03760356915232632, - "grad_norm": 38.80213928222656, - "kl": 0.1904296875, - "learning_rate": 9.623964308476736e-07, - "loss": 0.0076, - "reward": 1.5466018915176392, - "reward_std": 0.06538740545511246, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4216019809246063, - "step": 118 - }, - { - "completion_length": 100.90625, - "epoch": 0.03792224346717655, - "grad_norm": 219.59561157226562, - "kl": 0.08837890625, - "learning_rate": 9.620777565328234e-07, - "loss": 0.0035, - "reward": 1.541412591934204, - "reward_std": 0.08481577038764954, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4164126217365265, - "step": 119 - }, - { - "completion_length": 121.953125, - "epoch": 0.03824091778202677, - "grad_norm": 21.892406463623047, - "kl": 0.1416015625, - "learning_rate": 9.617590822179732e-07, - "loss": 0.0057, - "reward": 1.3950331211090088, - "reward_std": 0.04945141077041626, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39503324031829834, - "step": 120 - }, - { - "completion_length": 70.25, - "epoch": 0.03855959209687699, - "grad_norm": 25.756412506103516, - "kl": 0.08154296875, - "learning_rate": 9.61440407903123e-07, - "loss": 0.0033, - "reward": 1.7107679843902588, - "reward_std": 0.13024044036865234, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4607679545879364, - "rewards/pad": 0.25, - "step": 121 - }, - { - "completion_length": 98.953125, - "epoch": 0.038878266411727216, - "grad_norm": 58.248538970947266, - "kl": 0.119140625, - "learning_rate": 9.611217335882728e-07, - "loss": 0.0047, - "reward": 1.3422318696975708, - "reward_std": 0.08788175880908966, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3266068398952484, - "step": 122 - }, - { - "completion_length": 125.6875, - "epoch": 0.03919694072657744, - "grad_norm": 21.87851905822754, - "kl": 0.08056640625, - "learning_rate": 9.608030592734226e-07, - "loss": 0.0032, - "reward": 1.7041547298431396, - "reward_std": 0.10210521519184113, - "rewards/answer_reward": 0.359375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3447796404361725, - "step": 123 - }, - { - "completion_length": 73.796875, - "epoch": 0.039515615041427664, - "grad_norm": 15.746590614318848, - "kl": 0.09765625, - "learning_rate": 9.604843849585724e-07, - "loss": 0.0039, - "reward": 1.4994181394577026, - "reward_std": 0.0809524804353714, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.24941813945770264, - "step": 124 - }, - { - "completion_length": 44.90625, - "epoch": 0.039834289356277884, - "grad_norm": 103.32615661621094, - "kl": 0.1845703125, - "learning_rate": 9.60165710643722e-07, - "loss": 0.0074, - "reward": 1.421347737312317, - "reward_std": 0.07847784459590912, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4213476777076721, - "rewards/pad": 0.0, - "step": 125 - }, - { - "completion_length": 18.03125, - "epoch": 0.040152963671128104, - "grad_norm": 58.665218353271484, - "kl": 0.1748046875, - "learning_rate": 9.598470363288718e-07, - "loss": 0.007, - "reward": 1.4241862297058105, - "reward_std": 0.15667256712913513, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42418625950813293, - "rewards/pad": 0.0, - "step": 126 - }, - { - "completion_length": 74.640625, - "epoch": 0.04047163798597833, - "grad_norm": 21.96946907043457, - "kl": 0.09716796875, - "learning_rate": 9.595283620140216e-07, - "loss": 0.0039, - "reward": 1.6786143779754639, - "reward_std": 0.14680366218090057, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3348643481731415, - "rewards/pad": 0.34375, - "step": 127 - }, - { - "completion_length": 124.390625, - "epoch": 0.04079031230082855, - "grad_norm": 92.82318115234375, - "kl": 0.09912109375, - "learning_rate": 9.592096876991714e-07, - "loss": 0.004, - "reward": 1.4549717903137207, - "reward_std": 0.05895674601197243, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4549717903137207, - "rewards/pad": 0.0, - "step": 128 - }, - { - "completion_length": 98.109375, - "epoch": 0.04110898661567878, - "grad_norm": 46.42023849487305, - "kl": 0.150390625, - "learning_rate": 9.588910133843212e-07, - "loss": 0.006, - "reward": 1.5591964721679688, - "reward_std": 0.056670114398002625, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5591964721679688, - "rewards/pad": 0.0, - "step": 129 - }, - { - "completion_length": 123.140625, - "epoch": 0.041427660930529, - "grad_norm": 23.843793869018555, - "kl": 0.0849609375, - "learning_rate": 9.58572339069471e-07, - "loss": 0.0034, - "reward": 1.438868761062622, - "reward_std": 0.07457967847585678, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43886876106262207, - "step": 130 - }, - { - "completion_length": 124.53125, - "epoch": 0.04174633524537922, - "grad_norm": 34.21272659301758, - "kl": 0.06494140625, - "learning_rate": 9.582536647546209e-07, - "loss": 0.0026, - "reward": 1.582020878791809, - "reward_std": 0.09043914824724197, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4570208489894867, - "rewards/pad": 0.125, - "step": 131 - }, - { - "completion_length": 72.96875, - "epoch": 0.04206500956022945, - "grad_norm": 24.28638458251953, - "kl": 0.10546875, - "learning_rate": 9.579349904397705e-07, - "loss": 0.0042, - "reward": 1.610738754272461, - "reward_std": 0.10264147818088531, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4857388436794281, - "rewards/pad": 0.125, - "step": 132 - }, - { - "completion_length": 98.0625, - "epoch": 0.04238368387507967, - "grad_norm": 23.711889266967773, - "kl": 0.091796875, - "learning_rate": 9.576163161249203e-07, - "loss": 0.0037, - "reward": 1.6860997676849365, - "reward_std": 0.06541076302528381, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5610998272895813, - "step": 133 - }, - { - "completion_length": 126.109375, - "epoch": 0.042702358189929894, - "grad_norm": 18.623653411865234, - "kl": 0.06689453125, - "learning_rate": 9.5729764181007e-07, - "loss": 0.0027, - "reward": 1.5307762622833252, - "reward_std": 0.030945513397455215, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4057762920856476, - "step": 134 - }, - { - "completion_length": 125.59375, - "epoch": 0.043021032504780114, - "grad_norm": 17.338851928710938, - "kl": 0.09765625, - "learning_rate": 9.569789674952199e-07, - "loss": 0.0039, - "reward": 1.5449299812316895, - "reward_std": 0.032470062375068665, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5449299812316895, - "step": 135 - }, - { - "completion_length": 44.5, - "epoch": 0.043339706819630335, - "grad_norm": 58.93782043457031, - "kl": 0.208984375, - "learning_rate": 9.566602931803697e-07, - "loss": 0.0084, - "reward": 1.482414722442627, - "reward_std": 0.11973986029624939, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.49803969264030457, - "step": 136 - }, - { - "completion_length": 124.65625, - "epoch": 0.04365838113448056, - "grad_norm": 68.23857879638672, - "kl": 0.1171875, - "learning_rate": 9.563416188655195e-07, - "loss": 0.0047, - "reward": 1.4399811029434204, - "reward_std": 0.03828233852982521, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.31498122215270996, - "step": 137 - }, - { - "completion_length": 45.65625, - "epoch": 0.04397705544933078, - "grad_norm": 124.44922637939453, - "kl": 0.14453125, - "learning_rate": 9.56022944550669e-07, - "loss": 0.0058, - "reward": 1.6428006887435913, - "reward_std": 0.07791727781295776, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5178006887435913, - "rewards/pad": 0.125, - "step": 138 - }, - { - "completion_length": 98.3125, - "epoch": 0.04429572976418101, - "grad_norm": 15.683511734008789, - "kl": 0.09619140625, - "learning_rate": 9.557042702358189e-07, - "loss": 0.0038, - "reward": 1.4232029914855957, - "reward_std": 0.06578314304351807, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2982029318809509, - "step": 139 - }, - { - "completion_length": 101.09375, - "epoch": 0.04461440407903123, - "grad_norm": 46.468170166015625, - "kl": 0.0947265625, - "learning_rate": 9.553855959209687e-07, - "loss": 0.0038, - "reward": 1.7962223291397095, - "reward_std": 0.06880475580692291, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42122238874435425, - "step": 140 - }, - { - "completion_length": 99.25, - "epoch": 0.044933078393881457, - "grad_norm": 24.893571853637695, - "kl": 0.08447265625, - "learning_rate": 9.550669216061185e-07, - "loss": 0.0034, - "reward": 1.7566449642181396, - "reward_std": 0.07449847459793091, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5066449642181396, - "step": 141 - }, - { - "completion_length": 126.484375, - "epoch": 0.04525175270873168, - "grad_norm": 47.23504638671875, - "kl": 0.05615234375, - "learning_rate": 9.547482472912683e-07, - "loss": 0.0022, - "reward": 1.7445024251937866, - "reward_std": 0.08240990340709686, - "rewards/pad": 0.328125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4163774847984314, - "step": 142 - }, - { - "completion_length": 151.34375, - "epoch": 0.0455704270235819, - "grad_norm": 24.97036361694336, - "kl": 0.07275390625, - "learning_rate": 9.544295729764181e-07, - "loss": 0.0029, - "reward": 1.490628957748413, - "reward_std": 0.03826358914375305, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4906289279460907, - "step": 143 - }, - { - "completion_length": 150.546875, - "epoch": 0.045889101338432124, - "grad_norm": 1052.7979736328125, - "kl": 0.099609375, - "learning_rate": 9.541108986615677e-07, - "loss": 0.004, - "reward": 1.446044921875, - "reward_std": 0.054064881056547165, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4460448920726776, - "rewards/pad": 0.0, - "step": 144 - }, - { - "completion_length": 123.046875, - "epoch": 0.046207775653282344, - "grad_norm": 26.211153030395508, - "kl": 0.1005859375, - "learning_rate": 9.537922243467175e-07, - "loss": 0.004, - "reward": 1.42512047290802, - "reward_std": 0.034479159861803055, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4251205325126648, - "step": 145 - }, - { - "completion_length": 71.53125, - "epoch": 0.04652644996813257, - "grad_norm": 69.134033203125, - "kl": 0.1298828125, - "learning_rate": 9.534735500318673e-07, - "loss": 0.0052, - "reward": 1.553114652633667, - "reward_std": 0.12183529883623123, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45936471223831177, - "rewards/pad": 0.09375, - "step": 146 - }, - { - "completion_length": 97.984375, - "epoch": 0.04684512428298279, - "grad_norm": 56.753135681152344, - "kl": 0.115234375, - "learning_rate": 9.531548757170171e-07, - "loss": 0.0046, - "reward": 1.6361443996429443, - "reward_std": 0.056340161710977554, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5111443996429443, - "step": 147 - }, - { - "completion_length": 46.15625, - "epoch": 0.04716379859783301, - "grad_norm": 90.41157531738281, - "kl": 1.4453125, - "learning_rate": 9.528362014021669e-07, - "loss": 0.0577, - "reward": 1.532861590385437, - "reward_std": 0.14235638082027435, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.4234865605831146, - "step": 148 - }, - { - "completion_length": 45.6875, - "epoch": 0.04748247291268324, - "grad_norm": 108.94004821777344, - "kl": 0.11865234375, - "learning_rate": 9.525175270873167e-07, - "loss": 0.0048, - "reward": 1.6035866737365723, - "reward_std": 0.09949560463428497, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4785866141319275, - "step": 149 - }, - { - "completion_length": 71.78125, - "epoch": 0.04780114722753346, - "grad_norm": 50.10005187988281, - "kl": 0.1318359375, - "learning_rate": 9.521988527724664e-07, - "loss": 0.0053, - "reward": 1.529174566268921, - "reward_std": 0.09229524433612823, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5291745662689209, - "step": 150 - }, - { - "completion_length": 71.578125, - "epoch": 0.04811982154238369, - "grad_norm": 42.62626266479492, - "kl": 0.11572265625, - "learning_rate": 9.518801784576163e-07, - "loss": 0.0046, - "reward": 1.4902132749557495, - "reward_std": 0.03644484281539917, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4902133345603943, - "rewards/pad": 0.0, - "step": 151 - }, - { - "completion_length": 72.5, - "epoch": 0.04843849585723391, - "grad_norm": 37.46217346191406, - "kl": 0.1318359375, - "learning_rate": 9.515615041427661e-07, - "loss": 0.0053, - "reward": 1.7063305377960205, - "reward_std": 0.08680470287799835, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5813304781913757, - "rewards/pad": 0.125, - "step": 152 - }, - { - "completion_length": 98.09375, - "epoch": 0.04875717017208413, - "grad_norm": 27.439014434814453, - "kl": 0.0849609375, - "learning_rate": 9.512428298279159e-07, - "loss": 0.0034, - "reward": 1.634913444519043, - "reward_std": 0.08088289201259613, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.509913444519043, - "step": 153 - }, - { - "completion_length": 175.203125, - "epoch": 0.049075844486934354, - "grad_norm": 22.50582504272461, - "kl": 0.046630859375, - "learning_rate": 9.509241555130656e-07, - "loss": 0.0019, - "reward": 1.4140303134918213, - "reward_std": 0.0470597930252552, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41403043270111084, - "step": 154 - }, - { - "completion_length": 72.328125, - "epoch": 0.049394518801784575, - "grad_norm": 165.1951446533203, - "kl": 0.1865234375, - "learning_rate": 9.506054811982154e-07, - "loss": 0.0075, - "reward": 1.6961311101913452, - "reward_std": 0.06627112627029419, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5711310505867004, - "rewards/pad": 0.125, - "step": 155 - }, - { - "completion_length": 149.5, - "epoch": 0.0497131931166348, - "grad_norm": 165.7172393798828, - "kl": 0.1474609375, - "learning_rate": 9.502868068833652e-07, - "loss": 0.0059, - "reward": 1.3272875547409058, - "reward_std": 0.04778099060058594, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.32728755474090576, - "step": 156 - }, - { - "completion_length": 71.9375, - "epoch": 0.05003186743148502, - "grad_norm": 56.9672737121582, - "kl": 0.1044921875, - "learning_rate": 9.499681325685149e-07, - "loss": 0.0042, - "reward": 1.6385527849197388, - "reward_std": 0.055156029760837555, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5135527849197388, - "rewards/pad": 0.125, - "step": 157 - }, - { - "completion_length": 70.9375, - "epoch": 0.05035054174633524, - "grad_norm": 18.57289695739746, - "kl": 0.12158203125, - "learning_rate": 9.496494582536647e-07, - "loss": 0.0049, - "reward": 1.642876148223877, - "reward_std": 0.07265563309192657, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6428762078285217, - "rewards/pad": 0.0, - "step": 158 - }, - { - "completion_length": 49.21875, - "epoch": 0.05066921606118547, - "grad_norm": 31.87525749206543, - "kl": 0.1357421875, - "learning_rate": 9.493307839388145e-07, - "loss": 0.0054, - "reward": 1.5862581729888916, - "reward_std": 0.11294609308242798, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.586258053779602, - "rewards/pad": 0.0, - "step": 159 - }, - { - "completion_length": 120.8125, - "epoch": 0.05098789037603569, - "grad_norm": 28.032197952270508, - "kl": 0.0830078125, - "learning_rate": 9.490121096239643e-07, - "loss": 0.0033, - "reward": 1.4823583364486694, - "reward_std": 0.056141920387744904, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48235827684402466, - "rewards/pad": 0.0, - "step": 160 - }, - { - "completion_length": 68.859375, - "epoch": 0.05130656469088592, - "grad_norm": 130.83055114746094, - "kl": 0.0751953125, - "learning_rate": 9.48693435309114e-07, - "loss": 0.003, - "reward": 1.520409345626831, - "reward_std": 0.10488361120223999, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.520409345626831, - "step": 161 - }, - { - "completion_length": 68.625, - "epoch": 0.05162523900573614, - "grad_norm": 38.07346725463867, - "kl": 0.1494140625, - "learning_rate": 9.483747609942638e-07, - "loss": 0.006, - "reward": 1.66463303565979, - "reward_std": 0.13315197825431824, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.5552579760551453, - "step": 162 - }, - { - "completion_length": 72.765625, - "epoch": 0.05194391332058636, - "grad_norm": 17.296497344970703, - "kl": 0.1904296875, - "learning_rate": 9.480560866794136e-07, - "loss": 0.0076, - "reward": 1.6184468269348145, - "reward_std": 0.07699354737997055, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.36844679713249207, - "step": 163 - }, - { - "completion_length": 98.1875, - "epoch": 0.052262587635436585, - "grad_norm": 30.80031394958496, - "kl": 0.09814453125, - "learning_rate": 9.477374123645634e-07, - "loss": 0.0039, - "reward": 1.640599012374878, - "reward_std": 0.12581594288349152, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5312240123748779, - "rewards/pad": 0.109375, - "step": 164 - }, - { - "completion_length": 148.9375, - "epoch": 0.052581261950286805, - "grad_norm": 63.38698959350586, - "kl": 0.0615234375, - "learning_rate": 9.474187380497131e-07, - "loss": 0.0025, - "reward": 1.4976701736450195, - "reward_std": 0.08813410997390747, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41954517364501953, - "rewards/pad": 0.078125, - "step": 165 - }, - { - "completion_length": 95.671875, - "epoch": 0.05289993626513703, - "grad_norm": 24.988746643066406, - "kl": 0.09228515625, - "learning_rate": 9.471000637348629e-07, - "loss": 0.0037, - "reward": 1.6525627374649048, - "reward_std": 0.06571772694587708, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.52756267786026, - "step": 166 - }, - { - "completion_length": 205.3125, - "epoch": 0.05321861057998725, - "grad_norm": 7.888558864593506, - "kl": 0.0306396484375, - "learning_rate": 9.467813894200127e-07, - "loss": 0.0012, - "reward": 1.3351364135742188, - "reward_std": 0.07257430255413055, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.35076141357421875, - "step": 167 - }, - { - "completion_length": 72.03125, - "epoch": 0.05353728489483748, - "grad_norm": 531.3428955078125, - "kl": 0.10791015625, - "learning_rate": 9.464627151051625e-07, - "loss": 0.0043, - "reward": 1.7654680013656616, - "reward_std": 0.13501885533332825, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5310930013656616, - "step": 168 - }, - { - "completion_length": 18.78125, - "epoch": 0.0538559592096877, - "grad_norm": 35.41008758544922, - "kl": 0.1513671875, - "learning_rate": 9.461440407903123e-07, - "loss": 0.0061, - "reward": 1.6094670295715332, - "reward_std": 0.08094550669193268, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4844670295715332, - "rewards/pad": 0.125, - "step": 169 - }, - { - "completion_length": 68.234375, - "epoch": 0.05417463352453792, - "grad_norm": 80.35694122314453, - "kl": 0.10693359375, - "learning_rate": 9.458253664754621e-07, - "loss": 0.0043, - "reward": 1.7217336893081665, - "reward_std": 0.15385891497135162, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.6123586297035217, - "rewards/pad": 0.125, - "step": 170 - }, - { - "completion_length": 71.3125, - "epoch": 0.05449330783938815, - "grad_norm": 43.178199768066406, - "kl": 0.10400390625, - "learning_rate": 9.455066921606119e-07, - "loss": 0.0042, - "reward": 1.353318214416504, - "reward_std": 0.03510623052716255, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.35331809520721436, - "step": 171 - }, - { - "completion_length": 96.90625, - "epoch": 0.05481198215423837, - "grad_norm": 71.26564025878906, - "kl": 0.08642578125, - "learning_rate": 9.451880178457617e-07, - "loss": 0.0034, - "reward": 1.468902826309204, - "reward_std": 0.06541343033313751, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46890294551849365, - "rewards/pad": 0.0, - "step": 172 - }, - { - "completion_length": 150.1875, - "epoch": 0.055130656469088594, - "grad_norm": 77.41221618652344, - "kl": 0.06884765625, - "learning_rate": 9.448693435309114e-07, - "loss": 0.0028, - "reward": 1.3299999237060547, - "reward_std": 0.07897648215293884, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.34562498331069946, - "rewards/pad": 0.0, - "step": 173 - }, - { - "completion_length": 45.5, - "epoch": 0.055449330783938815, - "grad_norm": 30.494190216064453, - "kl": 0.119140625, - "learning_rate": 9.445506692160612e-07, - "loss": 0.0048, - "reward": 1.527397871017456, - "reward_std": 0.12671810388565063, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.41802287101745605, - "rewards/pad": 0.125, - "step": 174 - }, - { - "completion_length": 19.90625, - "epoch": 0.055768005098789035, - "grad_norm": 29.110815048217773, - "kl": 0.1376953125, - "learning_rate": 9.44231994901211e-07, - "loss": 0.0055, - "reward": 1.4674365520477295, - "reward_std": 0.08331962674856186, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4674365222454071, - "rewards/pad": 0.0, - "step": 175 - }, - { - "completion_length": 123.265625, - "epoch": 0.05608667941363926, - "grad_norm": 35.95558547973633, - "kl": 0.0791015625, - "learning_rate": 9.439133205863608e-07, - "loss": 0.0032, - "reward": 1.4634273052215576, - "reward_std": 0.06282143294811249, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46342724561691284, - "step": 176 - }, - { - "completion_length": 97.890625, - "epoch": 0.05640535372848948, - "grad_norm": 26.46843719482422, - "kl": 0.0810546875, - "learning_rate": 9.435946462715104e-07, - "loss": 0.0032, - "reward": 1.5546252727508545, - "reward_std": 0.09996479004621506, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4452502727508545, - "rewards/pad": 0.109375, - "step": 177 - }, - { - "completion_length": 176.015625, - "epoch": 0.05672402804333971, - "grad_norm": 23.104013442993164, - "kl": 0.07373046875, - "learning_rate": 9.432759719566602e-07, - "loss": 0.0029, - "reward": 1.4256776571273804, - "reward_std": 0.038105182349681854, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42567768692970276, - "step": 178 - }, - { - "completion_length": 72.109375, - "epoch": 0.05704270235818993, - "grad_norm": 152.2291717529297, - "kl": 0.103515625, - "learning_rate": 9.4295729764181e-07, - "loss": 0.0041, - "reward": 1.5971485376358032, - "reward_std": 0.111331045627594, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.425273597240448, - "rewards/pad": 0.171875, - "step": 179 - }, - { - "completion_length": 123.390625, - "epoch": 0.05736137667304015, - "grad_norm": 62.74386978149414, - "kl": 0.146484375, - "learning_rate": 9.426386233269598e-07, - "loss": 0.0059, - "reward": 1.5698823928833008, - "reward_std": 0.08544135093688965, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5698824524879456, - "step": 180 - }, - { - "completion_length": 97.421875, - "epoch": 0.05768005098789038, - "grad_norm": 21.25946807861328, - "kl": 0.0966796875, - "learning_rate": 9.423199490121095e-07, - "loss": 0.0039, - "reward": 1.7297738790512085, - "reward_std": 0.04847279191017151, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47977393865585327, - "rewards/pad": 0.25, - "step": 181 - }, - { - "completion_length": 19.953125, - "epoch": 0.0579987253027406, - "grad_norm": 72.88169860839844, - "kl": 0.1396484375, - "learning_rate": 9.420012746972593e-07, - "loss": 0.0056, - "reward": 1.551754117012024, - "reward_std": 0.09995436668395996, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3017541766166687, - "rewards/pad": 0.25, - "step": 182 - }, - { - "completion_length": 98.6875, - "epoch": 0.058317399617590825, - "grad_norm": 82.24500274658203, - "kl": 0.07470703125, - "learning_rate": 9.416826003824091e-07, - "loss": 0.003, - "reward": 1.7466256618499756, - "reward_std": 0.0803009569644928, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49662575125694275, - "rewards/pad": 0.25, - "step": 183 - }, - { - "completion_length": 123.078125, - "epoch": 0.058636073932441045, - "grad_norm": 58.43217849731445, - "kl": 0.09033203125, - "learning_rate": 9.413639260675588e-07, - "loss": 0.0036, - "reward": 1.549285888671875, - "reward_std": 0.05893456190824509, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.549285888671875, - "step": 184 - }, - { - "completion_length": 97.734375, - "epoch": 0.058954748247291265, - "grad_norm": 34.839447021484375, - "kl": 0.1396484375, - "learning_rate": 9.410452517527086e-07, - "loss": 0.0056, - "reward": 1.7135741710662842, - "reward_std": 0.08446040004491806, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5885741710662842, - "step": 185 - }, - { - "completion_length": 71.25, - "epoch": 0.05927342256214149, - "grad_norm": 27.027359008789062, - "kl": 0.125, - "learning_rate": 9.407265774378584e-07, - "loss": 0.005, - "reward": 1.5394039154052734, - "reward_std": 0.10799270868301392, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4300287961959839, - "rewards/pad": 0.109375, - "step": 186 - }, - { - "completion_length": 123.328125, - "epoch": 0.05959209687699171, - "grad_norm": 115.98084259033203, - "kl": 0.12451171875, - "learning_rate": 9.404079031230082e-07, - "loss": 0.005, - "reward": 1.323099136352539, - "reward_std": 0.05522124096751213, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.32309919595718384, - "rewards/pad": 0.0, - "step": 187 - }, - { - "completion_length": 97.640625, - "epoch": 0.05991077119184194, - "grad_norm": 51.958438873291016, - "kl": 0.09619140625, - "learning_rate": 9.40089228808158e-07, - "loss": 0.0039, - "reward": 1.541533350944519, - "reward_std": 0.11370637267827988, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.557158350944519, - "rewards/pad": 0.0, - "step": 188 - }, - { - "completion_length": 148.828125, - "epoch": 0.06022944550669216, - "grad_norm": 33.664268493652344, - "kl": 0.04931640625, - "learning_rate": 9.397705544933078e-07, - "loss": 0.002, - "reward": 1.4180933237075806, - "reward_std": 0.05330347269773483, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41809332370758057, - "rewards/pad": 0.0, - "step": 189 - }, - { - "completion_length": 97.421875, - "epoch": 0.06054811982154239, - "grad_norm": 138.2771453857422, - "kl": 0.203125, - "learning_rate": 9.394518801784576e-07, - "loss": 0.0081, - "reward": 1.3894553184509277, - "reward_std": 0.11522915214300156, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.34258025884628296, - "step": 190 - }, - { - "completion_length": 123.796875, - "epoch": 0.06086679413639261, - "grad_norm": 100.51148223876953, - "kl": 0.048095703125, - "learning_rate": 9.391332058636074e-07, - "loss": 0.0019, - "reward": 1.4818546772003174, - "reward_std": 0.034188855439424515, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.35685479640960693, - "step": 191 - }, - { - "completion_length": 46.296875, - "epoch": 0.06118546845124283, - "grad_norm": 30.46354866027832, - "kl": 0.126953125, - "learning_rate": 9.388145315487571e-07, - "loss": 0.0051, - "reward": 1.6870028972625732, - "reward_std": 0.13461649417877197, - "rewards/answer_reward": 0.171875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.515127956867218, - "step": 192 - }, - { - "completion_length": 94.734375, - "epoch": 0.061504142766093055, - "grad_norm": 79.05770111083984, - "kl": 0.1982421875, - "learning_rate": 9.384958572339069e-07, - "loss": 0.0079, - "reward": 1.3827359676361084, - "reward_std": 0.04828489571809769, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3827360272407532, - "step": 193 - }, - { - "completion_length": 46.5, - "epoch": 0.061822817080943275, - "grad_norm": 31.767784118652344, - "kl": 0.2177734375, - "learning_rate": 9.381771829190567e-07, - "loss": 0.0087, - "reward": 1.6183278560638428, - "reward_std": 0.06483359634876251, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4933279752731323, - "rewards/pad": 0.125, - "step": 194 - }, - { - "completion_length": 122.140625, - "epoch": 0.0621414913957935, - "grad_norm": 55.397586822509766, - "kl": 0.10791015625, - "learning_rate": 9.378585086042065e-07, - "loss": 0.0043, - "reward": 1.411421537399292, - "reward_std": 0.042458243668079376, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4114214777946472, - "step": 195 - }, - { - "completion_length": 70.1875, - "epoch": 0.06246016571064372, - "grad_norm": 21.96168327331543, - "kl": 0.24609375, - "learning_rate": 9.375398342893562e-07, - "loss": 0.0098, - "reward": 1.587853193283081, - "reward_std": 0.05139303579926491, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4628531038761139, - "rewards/pad": 0.125, - "step": 196 - }, - { - "completion_length": 73.34375, - "epoch": 0.06277884002549394, - "grad_norm": 138.9192657470703, - "kl": 0.0830078125, - "learning_rate": 9.37221159974506e-07, - "loss": 0.0033, - "reward": 1.7415766716003418, - "reward_std": 0.08817744255065918, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6322016716003418, - "rewards/pad": 0.109375, - "step": 197 - }, - { - "completion_length": 72.1875, - "epoch": 0.06309751434034416, - "grad_norm": 27.619943618774414, - "kl": 0.1103515625, - "learning_rate": 9.369024856596558e-07, - "loss": 0.0044, - "reward": 1.859155535697937, - "reward_std": 0.13565590977668762, - "rewards/answer_reward": 0.40625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.452905535697937, - "step": 198 - }, - { - "completion_length": 125.484375, - "epoch": 0.0634161886551944, - "grad_norm": 17.163469314575195, - "kl": 0.07080078125, - "learning_rate": 9.365838113448056e-07, - "loss": 0.0028, - "reward": 1.4651894569396973, - "reward_std": 0.04208006337285042, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.34018948674201965, - "step": 199 - }, - { - "completion_length": 96.28125, - "epoch": 0.06373486297004462, - "grad_norm": 76.47855377197266, - "kl": 0.0849609375, - "learning_rate": 9.362651370299553e-07, - "loss": 0.0034, - "reward": 1.6315760612487793, - "reward_std": 0.03873869776725769, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5065759420394897, - "step": 200 - }, - { - "completion_length": 150.71875, - "epoch": 0.06405353728489484, - "grad_norm": 33.26072311401367, - "kl": 0.037109375, - "learning_rate": 9.359464627151051e-07, - "loss": 0.0015, - "reward": 1.3809221982955933, - "reward_std": 0.11550474166870117, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.25592225790023804, - "step": 201 - }, - { - "completion_length": 70.5, - "epoch": 0.06437221159974506, - "grad_norm": 43.25547409057617, - "kl": 0.1689453125, - "learning_rate": 9.356277884002549e-07, - "loss": 0.0067, - "reward": 1.5856120586395264, - "reward_std": 0.0652899295091629, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46061214804649353, - "rewards/pad": 0.125, - "step": 202 - }, - { - "completion_length": 148.84375, - "epoch": 0.06469088591459528, - "grad_norm": 43.30719757080078, - "kl": 0.060302734375, - "learning_rate": 9.353091140854047e-07, - "loss": 0.0024, - "reward": 1.3250983953475952, - "reward_std": 0.06848461925983429, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3250983655452728, - "rewards/pad": 0.0, - "step": 203 - }, - { - "completion_length": 96.03125, - "epoch": 0.06500956022944551, - "grad_norm": 14.09665584564209, - "kl": 0.220703125, - "learning_rate": 9.349904397705544e-07, - "loss": 0.0088, - "reward": 1.6423754692077637, - "reward_std": 0.05685172975063324, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5173755288124084, - "rewards/pad": 0.125, - "step": 204 - }, - { - "completion_length": 72.328125, - "epoch": 0.06532823454429573, - "grad_norm": 30.157978057861328, - "kl": 0.1015625, - "learning_rate": 9.346717654557042e-07, - "loss": 0.0041, - "reward": 1.7073540687561035, - "reward_std": 0.03902032971382141, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5823540687561035, - "step": 205 - }, - { - "completion_length": 46.40625, - "epoch": 0.06564690885914595, - "grad_norm": 518.1132202148438, - "kl": 0.24609375, - "learning_rate": 9.34353091140854e-07, - "loss": 0.0098, - "reward": 1.7617124319076538, - "reward_std": 0.0738602951169014, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5117124915122986, - "rewards/pad": 0.25, - "step": 206 - }, - { - "completion_length": 45.890625, - "epoch": 0.06596558317399617, - "grad_norm": 57.30687713623047, - "kl": 0.154296875, - "learning_rate": 9.340344168260039e-07, - "loss": 0.0062, - "reward": 1.6535007953643799, - "reward_std": 0.07546480000019073, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5285007953643799, - "step": 207 - }, - { - "completion_length": 72.3125, - "epoch": 0.0662842574888464, - "grad_norm": 134.83474731445312, - "kl": 0.275390625, - "learning_rate": 9.337157425111536e-07, - "loss": 0.011, - "reward": 1.4936914443969727, - "reward_std": 0.08497796952724457, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4936913847923279, - "rewards/pad": 0.0, - "step": 208 - }, - { - "completion_length": 150.125, - "epoch": 0.06660293180369663, - "grad_norm": 63.739009857177734, - "kl": 0.0751953125, - "learning_rate": 9.333970681963034e-07, - "loss": 0.003, - "reward": 1.506711483001709, - "reward_std": 0.03748561441898346, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5067115426063538, - "step": 209 - }, - { - "completion_length": 97.9375, - "epoch": 0.06692160611854685, - "grad_norm": 61.181392669677734, - "kl": 0.08447265625, - "learning_rate": 9.330783938814532e-07, - "loss": 0.0034, - "reward": 1.534330129623413, - "reward_std": 0.09595834463834763, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42495498061180115, - "rewards/pad": 0.109375, - "step": 210 - }, - { - "completion_length": 72.296875, - "epoch": 0.06724028043339707, - "grad_norm": 139.2161102294922, - "kl": 0.1494140625, - "learning_rate": 9.32759719566603e-07, - "loss": 0.006, - "reward": 1.6207249164581299, - "reward_std": 0.07159844040870667, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4957249164581299, - "rewards/pad": 0.125, - "step": 211 - }, - { - "completion_length": 99.875, - "epoch": 0.06755895474824729, - "grad_norm": 38.422767639160156, - "kl": 0.07763671875, - "learning_rate": 9.324410452517527e-07, - "loss": 0.0031, - "reward": 1.711874008178711, - "reward_std": 0.11063874512910843, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49312400817871094, - "rewards/pad": 0.21875, - "step": 212 - }, - { - "completion_length": 96.15625, - "epoch": 0.06787762906309751, - "grad_norm": 18.608835220336914, - "kl": 0.1279296875, - "learning_rate": 9.321223709369025e-07, - "loss": 0.0051, - "reward": 1.4139230251312256, - "reward_std": 0.06470437347888947, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4139229357242584, - "rewards/pad": 0.0, - "step": 213 - }, - { - "completion_length": 70.296875, - "epoch": 0.06819630337794774, - "grad_norm": 31.750450134277344, - "kl": 0.134765625, - "learning_rate": 9.318036966220523e-07, - "loss": 0.0054, - "reward": 1.3242194652557373, - "reward_std": 0.058769889175891876, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.32421940565109253, - "rewards/pad": 0.0, - "step": 214 - }, - { - "completion_length": 125.625, - "epoch": 0.06851497769279796, - "grad_norm": 88.57303619384766, - "kl": 0.04736328125, - "learning_rate": 9.314850223072021e-07, - "loss": 0.0019, - "reward": 1.5565061569213867, - "reward_std": 0.17865237593650818, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4627561569213867, - "step": 215 - }, - { - "completion_length": 99.765625, - "epoch": 0.06883365200764818, - "grad_norm": 37.419498443603516, - "kl": 0.0751953125, - "learning_rate": 9.311663479923517e-07, - "loss": 0.003, - "reward": 1.623964786529541, - "reward_std": 0.15740472078323364, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4520898163318634, - "rewards/pad": 0.171875, - "step": 216 - }, - { - "completion_length": 93.921875, - "epoch": 0.0691523263224984, - "grad_norm": 32.91117858886719, - "kl": 0.09716796875, - "learning_rate": 9.308476736775015e-07, - "loss": 0.0039, - "reward": 1.4654639959335327, - "reward_std": 0.09401823580265045, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34046393632888794, - "rewards/pad": 0.125, - "step": 217 - }, - { - "completion_length": 98.546875, - "epoch": 0.06947100063734862, - "grad_norm": 32.001346588134766, - "kl": 0.1025390625, - "learning_rate": 9.305289993626513e-07, - "loss": 0.0042, - "reward": 1.5083110332489014, - "reward_std": 0.06683140993118286, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49268612265586853, - "rewards/pad": 0.015625, - "step": 218 - }, - { - "completion_length": 151.0, - "epoch": 0.06978967495219886, - "grad_norm": 9.118590354919434, - "kl": 0.041015625, - "learning_rate": 9.30210325047801e-07, - "loss": 0.0016, - "reward": 1.685126781463623, - "reward_std": 0.04756775498390198, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.560126781463623, - "step": 219 - }, - { - "completion_length": 71.046875, - "epoch": 0.07010834926704908, - "grad_norm": 16.421247482299805, - "kl": 0.0849609375, - "learning_rate": 9.298916507329508e-07, - "loss": 0.0034, - "reward": 1.5211446285247803, - "reward_std": 0.0653223991394043, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3961445093154907, - "step": 220 - }, - { - "completion_length": 98.21875, - "epoch": 0.0704270235818993, - "grad_norm": 16.440828323364258, - "kl": 0.08154296875, - "learning_rate": 9.295729764181006e-07, - "loss": 0.0033, - "reward": 1.4838128089904785, - "reward_std": 0.061297886073589325, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4838128089904785, - "step": 221 - }, - { - "completion_length": 73.234375, - "epoch": 0.07074569789674952, - "grad_norm": 138.84011840820312, - "kl": 0.08544921875, - "learning_rate": 9.292543021032504e-07, - "loss": 0.0034, - "reward": 1.5097116231918335, - "reward_std": 0.2135867327451706, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4315866231918335, - "step": 222 - }, - { - "completion_length": 125.265625, - "epoch": 0.07106437221159974, - "grad_norm": 72.04967498779297, - "kl": 0.0859375, - "learning_rate": 9.289356277884001e-07, - "loss": 0.0034, - "reward": 1.4920474290847778, - "reward_std": 0.07400809228420258, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3670474886894226, - "step": 223 - }, - { - "completion_length": 150.515625, - "epoch": 0.07138304652644997, - "grad_norm": 44.25281524658203, - "kl": 0.06689453125, - "learning_rate": 9.286169534735499e-07, - "loss": 0.0027, - "reward": 1.4957704544067383, - "reward_std": 0.07273541390895844, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3863954544067383, - "step": 224 - }, - { - "completion_length": 71.5625, - "epoch": 0.07170172084130019, - "grad_norm": 22.029436111450195, - "kl": 0.09912109375, - "learning_rate": 9.282982791586997e-07, - "loss": 0.004, - "reward": 1.446871280670166, - "reward_std": 0.08580408990383148, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4468713700771332, - "rewards/pad": 0.0, - "step": 225 - }, - { - "completion_length": 122.59375, - "epoch": 0.07202039515615041, - "grad_norm": 50.02418899536133, - "kl": 0.0859375, - "learning_rate": 9.279796048438496e-07, - "loss": 0.0034, - "reward": 1.314070701599121, - "reward_std": 0.024542158469557762, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.31407079100608826, - "step": 226 - }, - { - "completion_length": 98.921875, - "epoch": 0.07233906947100063, - "grad_norm": 26.701772689819336, - "kl": 0.099609375, - "learning_rate": 9.276609305289993e-07, - "loss": 0.004, - "reward": 1.5902478694915771, - "reward_std": 0.14807233214378357, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37149783968925476, - "step": 227 - }, - { - "completion_length": 121.953125, - "epoch": 0.07265774378585087, - "grad_norm": 51.00769805908203, - "kl": 0.083984375, - "learning_rate": 9.273422562141491e-07, - "loss": 0.0034, - "reward": 1.341005802154541, - "reward_std": 0.06682395935058594, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.341005802154541, - "rewards/pad": 0.0, - "step": 228 - }, - { - "completion_length": 69.484375, - "epoch": 0.07297641810070109, - "grad_norm": 31.971534729003906, - "kl": 0.11767578125, - "learning_rate": 9.270235818992989e-07, - "loss": 0.0047, - "reward": 1.5693368911743164, - "reward_std": 0.07823251187801361, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5693368911743164, - "rewards/pad": 0.0, - "step": 229 - }, - { - "completion_length": 122.03125, - "epoch": 0.07329509241555131, - "grad_norm": 31.826183319091797, - "kl": 0.0673828125, - "learning_rate": 9.267049075844487e-07, - "loss": 0.0027, - "reward": 1.4567553997039795, - "reward_std": 0.035778652876615524, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4567553699016571, - "rewards/pad": 0.0, - "step": 230 - }, - { - "completion_length": 93.734375, - "epoch": 0.07361376673040153, - "grad_norm": 124.7455062866211, - "kl": 0.08251953125, - "learning_rate": 9.263862332695984e-07, - "loss": 0.0033, - "reward": 1.3886239528656006, - "reward_std": 0.06762949377298355, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.38862404227256775, - "rewards/pad": 0.0, - "step": 231 - }, - { - "completion_length": 70.890625, - "epoch": 0.07393244104525175, - "grad_norm": 50.17414093017578, - "kl": 0.283203125, - "learning_rate": 9.260675589547482e-07, - "loss": 0.0114, - "reward": 1.7248307466506958, - "reward_std": 0.08726703375577927, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.599830687046051, - "step": 232 - }, - { - "completion_length": 71.4375, - "epoch": 0.07425111536010198, - "grad_norm": 56.714786529541016, - "kl": 0.13671875, - "learning_rate": 9.25748884639898e-07, - "loss": 0.0055, - "reward": 1.726217269897461, - "reward_std": 0.05578962340950966, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4762171804904938, - "rewards/pad": 0.25, - "step": 233 - }, - { - "completion_length": 18.359375, - "epoch": 0.0745697896749522, - "grad_norm": 53.17715835571289, - "kl": 0.228515625, - "learning_rate": 9.254302103250478e-07, - "loss": 0.0091, - "reward": 1.6254773139953613, - "reward_std": 0.10757525265216827, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6254771947860718, - "rewards/pad": 0.0, - "step": 234 - }, - { - "completion_length": 47.0, - "epoch": 0.07488846398980242, - "grad_norm": 25.17940902709961, - "kl": 0.1826171875, - "learning_rate": 9.251115360101975e-07, - "loss": 0.0073, - "reward": 1.8453280925750732, - "reward_std": 0.15831027925014496, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5015780329704285, - "rewards/pad": 0.34375, - "step": 235 - }, - { - "completion_length": 71.421875, - "epoch": 0.07520713830465264, - "grad_norm": 19.95438003540039, - "kl": 0.1611328125, - "learning_rate": 9.247928616953473e-07, - "loss": 0.0064, - "reward": 1.5744729042053223, - "reward_std": 0.04170050472021103, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44947290420532227, - "rewards/pad": 0.125, - "step": 236 - }, - { - "completion_length": 74.15625, - "epoch": 0.07552581261950286, - "grad_norm": 28.376657485961914, - "kl": 0.107421875, - "learning_rate": 9.244741873804971e-07, - "loss": 0.0043, - "reward": 1.4814774990081787, - "reward_std": 0.06581274420022964, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3564775884151459, - "step": 237 - }, - { - "completion_length": 72.015625, - "epoch": 0.0758444869343531, - "grad_norm": 34.720123291015625, - "kl": 0.11279296875, - "learning_rate": 9.241555130656469e-07, - "loss": 0.0045, - "reward": 1.7763080596923828, - "reward_std": 0.10871845483779907, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5419331192970276, - "step": 238 - }, - { - "completion_length": 69.546875, - "epoch": 0.07616316124920332, - "grad_norm": 125.89852905273438, - "kl": 0.1767578125, - "learning_rate": 9.238368387507966e-07, - "loss": 0.0071, - "reward": 1.3550302982330322, - "reward_std": 0.10732068866491318, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35503026843070984, - "rewards/pad": 0.0, - "step": 239 - }, - { - "completion_length": 124.578125, - "epoch": 0.07648183556405354, - "grad_norm": 10.14435863494873, - "kl": 0.07275390625, - "learning_rate": 9.235181644359464e-07, - "loss": 0.0029, - "reward": 1.3843462467193604, - "reward_std": 0.03742659091949463, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38434627652168274, - "step": 240 - }, - { - "completion_length": 47.15625, - "epoch": 0.07680050987890376, - "grad_norm": 72.89696502685547, - "kl": 0.11962890625, - "learning_rate": 9.231994901210962e-07, - "loss": 0.0048, - "reward": 1.5216596126556396, - "reward_std": 0.09107674658298492, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3966595530509949, - "step": 241 - }, - { - "completion_length": 74.703125, - "epoch": 0.07711918419375398, - "grad_norm": 30.012723922729492, - "kl": 0.21875, - "learning_rate": 9.22880815806246e-07, - "loss": 0.0088, - "reward": 1.3861610889434814, - "reward_std": 0.038876067847013474, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.26116111874580383, - "step": 242 - }, - { - "completion_length": 98.4375, - "epoch": 0.07743785850860421, - "grad_norm": 32.56050109863281, - "kl": 0.2421875, - "learning_rate": 9.225621414913957e-07, - "loss": 0.0097, - "reward": 1.386494755744934, - "reward_std": 0.06979185342788696, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3864947557449341, - "step": 243 - }, - { - "completion_length": 71.796875, - "epoch": 0.07775653282345443, - "grad_norm": 24.898000717163086, - "kl": 0.236328125, - "learning_rate": 9.222434671765456e-07, - "loss": 0.0094, - "reward": 1.5179388523101807, - "reward_std": 0.07019379734992981, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5023138523101807, - "rewards/pad": 0.015625, - "step": 244 - }, - { - "completion_length": 98.328125, - "epoch": 0.07807520713830465, - "grad_norm": 44.98802185058594, - "kl": 0.259765625, - "learning_rate": 9.219247928616954e-07, - "loss": 0.0104, - "reward": 1.4779443740844727, - "reward_std": 0.05034922808408737, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35294443368911743, - "rewards/pad": 0.125, - "step": 245 - }, - { - "completion_length": 47.515625, - "epoch": 0.07839388145315487, - "grad_norm": 58.65922546386719, - "kl": 0.26953125, - "learning_rate": 9.216061185468452e-07, - "loss": 0.0108, - "reward": 1.9427738189697266, - "reward_std": 0.10847378522157669, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5833987593650818, - "rewards/pad": 0.359375, - "step": 246 - }, - { - "completion_length": 97.5625, - "epoch": 0.0787125557680051, - "grad_norm": 48.99842834472656, - "kl": 0.244140625, - "learning_rate": 9.212874442319949e-07, - "loss": 0.0097, - "reward": 1.519799828529358, - "reward_std": 0.06081816181540489, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5197998285293579, - "step": 247 - }, - { - "completion_length": 123.5625, - "epoch": 0.07903123008285533, - "grad_norm": 29.7005558013916, - "kl": 0.279296875, - "learning_rate": 9.209687699171447e-07, - "loss": 0.0112, - "reward": 1.5789108276367188, - "reward_std": 0.10013554990291595, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.469535768032074, - "step": 248 - }, - { - "completion_length": 124.953125, - "epoch": 0.07934990439770555, - "grad_norm": 57.411460876464844, - "kl": 0.09228515625, - "learning_rate": 9.206500956022945e-07, - "loss": 0.0037, - "reward": 1.602877140045166, - "reward_std": 0.0730300098657608, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3528771698474884, - "step": 249 - }, - { - "completion_length": 98.1875, - "epoch": 0.07966857871255577, - "grad_norm": 22.777734756469727, - "kl": 0.357421875, - "learning_rate": 9.203314212874442e-07, - "loss": 0.0143, - "reward": 1.448513388633728, - "reward_std": 0.03580144792795181, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.32351335883140564, - "rewards/pad": 0.125, - "step": 250 - }, - { - "completion_length": 70.28125, - "epoch": 0.07998725302740599, - "grad_norm": 53.4788703918457, - "kl": 0.37109375, - "learning_rate": 9.20012746972594e-07, - "loss": 0.0149, - "reward": 1.553734302520752, - "reward_std": 0.13574323058128357, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.33498427271842957, - "rewards/pad": 0.234375, - "step": 251 - }, - { - "completion_length": 152.71875, - "epoch": 0.08030592734225621, - "grad_norm": 20.498197555541992, - "kl": 0.0693359375, - "learning_rate": 9.196940726577438e-07, - "loss": 0.0027, - "reward": 1.369762659072876, - "reward_std": 0.054935187101364136, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36976274847984314, - "step": 252 - }, - { - "completion_length": 71.046875, - "epoch": 0.08062460165710644, - "grad_norm": 47.005611419677734, - "kl": 0.208984375, - "learning_rate": 9.193753983428936e-07, - "loss": 0.0084, - "reward": 1.6958516836166382, - "reward_std": 0.10709372162818909, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5708516836166382, - "step": 253 - }, - { - "completion_length": 124.765625, - "epoch": 0.08094327597195666, - "grad_norm": 23.856901168823242, - "kl": 0.06884765625, - "learning_rate": 9.190567240280433e-07, - "loss": 0.0028, - "reward": 1.4443206787109375, - "reward_std": 0.03854493796825409, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3193206787109375, - "rewards/pad": 0.125, - "step": 254 - }, - { - "completion_length": 97.53125, - "epoch": 0.08126195028680688, - "grad_norm": 22.07203483581543, - "kl": 0.12451171875, - "learning_rate": 9.18738049713193e-07, - "loss": 0.005, - "reward": 1.414333462715149, - "reward_std": 0.04794395714998245, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41433337330818176, - "rewards/pad": 0.0, - "step": 255 - }, - { - "completion_length": 71.53125, - "epoch": 0.0815806246016571, - "grad_norm": 33.607948303222656, - "kl": 0.103515625, - "learning_rate": 9.184193753983428e-07, - "loss": 0.0041, - "reward": 1.5641847848892212, - "reward_std": 0.06226586550474167, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4391847550868988, - "rewards/pad": 0.125, - "step": 256 - }, - { - "completion_length": 123.53125, - "epoch": 0.08189929891650732, - "grad_norm": 11.251544952392578, - "kl": 0.1015625, - "learning_rate": 9.181007010834926e-07, - "loss": 0.0041, - "reward": 1.3956583738327026, - "reward_std": 0.10355216264724731, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.41128334403038025, - "step": 257 - }, - { - "completion_length": 123.890625, - "epoch": 0.08221797323135756, - "grad_norm": 32.35093307495117, - "kl": 0.07421875, - "learning_rate": 9.177820267686423e-07, - "loss": 0.003, - "reward": 1.4286943674087524, - "reward_std": 0.04091978818178177, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4286944270133972, - "step": 258 - }, - { - "completion_length": 73.65625, - "epoch": 0.08253664754620778, - "grad_norm": 42.930885314941406, - "kl": 0.138671875, - "learning_rate": 9.174633524537921e-07, - "loss": 0.0055, - "reward": 1.6769955158233643, - "reward_std": 0.13682706654071808, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5207455158233643, - "rewards/pad": 0.15625, - "step": 259 - }, - { - "completion_length": 45.46875, - "epoch": 0.082855321861058, - "grad_norm": 73.59136199951172, - "kl": 0.5703125, - "learning_rate": 9.171446781389419e-07, - "loss": 0.0229, - "reward": 1.7261182069778442, - "reward_std": 0.13216665387153625, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4761181175708771, - "step": 260 - }, - { - "completion_length": 99.265625, - "epoch": 0.08317399617590822, - "grad_norm": 50.60124969482422, - "kl": 0.123046875, - "learning_rate": 9.168260038240917e-07, - "loss": 0.0049, - "reward": 1.5830243825912476, - "reward_std": 0.06387736648321152, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47364941239356995, - "rewards/pad": 0.109375, - "step": 261 - }, - { - "completion_length": 95.359375, - "epoch": 0.08349267049075844, - "grad_norm": 40.418121337890625, - "kl": 0.1484375, - "learning_rate": 9.165073295092414e-07, - "loss": 0.0059, - "reward": 1.3880648612976074, - "reward_std": 0.09246179461479187, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3880648612976074, - "step": 262 - }, - { - "completion_length": 146.625, - "epoch": 0.08381134480560867, - "grad_norm": 34.34090805053711, - "kl": 0.0859375, - "learning_rate": 9.161886551943912e-07, - "loss": 0.0034, - "reward": 1.482377052307129, - "reward_std": 0.03902214393019676, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48237714171409607, - "step": 263 - }, - { - "completion_length": 124.390625, - "epoch": 0.0841300191204589, - "grad_norm": 19.643157958984375, - "kl": 0.11572265625, - "learning_rate": 9.158699808795411e-07, - "loss": 0.0046, - "reward": 1.3950953483581543, - "reward_std": 0.0343056246638298, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3950953483581543, - "rewards/pad": 0.0, - "step": 264 - }, - { - "completion_length": 45.71875, - "epoch": 0.08444869343530911, - "grad_norm": 109.31044006347656, - "kl": 0.216796875, - "learning_rate": 9.155513065646909e-07, - "loss": 0.0086, - "reward": 1.6760146617889404, - "reward_std": 0.10303567349910736, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5510146021842957, - "step": 265 - }, - { - "completion_length": 71.390625, - "epoch": 0.08476736775015933, - "grad_norm": 78.38106536865234, - "kl": 0.1689453125, - "learning_rate": 9.152326322498406e-07, - "loss": 0.0068, - "reward": 1.678178071975708, - "reward_std": 0.0742669403553009, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.553178071975708, - "rewards/pad": 0.125, - "step": 266 - }, - { - "completion_length": 121.96875, - "epoch": 0.08508604206500955, - "grad_norm": 56.864837646484375, - "kl": 0.095703125, - "learning_rate": 9.149139579349904e-07, - "loss": 0.0038, - "reward": 1.3991233110427856, - "reward_std": 0.035200487822294235, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3991233706474304, - "step": 267 - }, - { - "completion_length": 71.0625, - "epoch": 0.08540471637985979, - "grad_norm": 30.662776947021484, - "kl": 0.1484375, - "learning_rate": 9.145952836201402e-07, - "loss": 0.0059, - "reward": 1.4969425201416016, - "reward_std": 0.13425540924072266, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.38756754994392395, - "rewards/pad": 0.125, - "step": 268 - }, - { - "completion_length": 97.734375, - "epoch": 0.08572339069471001, - "grad_norm": 57.5777587890625, - "kl": 0.138671875, - "learning_rate": 9.1427660930529e-07, - "loss": 0.0056, - "reward": 1.4289636611938477, - "reward_std": 0.05716238170862198, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42896372079849243, - "rewards/pad": 0.0, - "step": 269 - }, - { - "completion_length": 175.0625, - "epoch": 0.08604206500956023, - "grad_norm": 39.821189880371094, - "kl": 0.06982421875, - "learning_rate": 9.139579349904397e-07, - "loss": 0.0028, - "reward": 1.4713307619094849, - "reward_std": 0.03898857533931732, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47133076190948486, - "step": 270 - }, - { - "completion_length": 102.734375, - "epoch": 0.08636073932441045, - "grad_norm": 37.10630416870117, - "kl": 0.1181640625, - "learning_rate": 9.136392606755895e-07, - "loss": 0.0047, - "reward": 1.4957199096679688, - "reward_std": 0.05200393870472908, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37071990966796875, - "step": 271 - }, - { - "completion_length": 73.59375, - "epoch": 0.08667941363926067, - "grad_norm": 135.67630004882812, - "kl": 0.11865234375, - "learning_rate": 9.133205863607393e-07, - "loss": 0.0047, - "reward": 1.598374843597412, - "reward_std": 0.15692463517189026, - "rewards/answer_reward": 0.0625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5358746647834778, - "step": 272 - }, - { - "completion_length": 71.78125, - "epoch": 0.0869980879541109, - "grad_norm": 41.58750915527344, - "kl": 0.2001953125, - "learning_rate": 9.130019120458891e-07, - "loss": 0.008, - "reward": 1.4793663024902344, - "reward_std": 0.07295481860637665, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4793663024902344, - "rewards/pad": 0.0, - "step": 273 - }, - { - "completion_length": 44.828125, - "epoch": 0.08731676226896112, - "grad_norm": 62.157081604003906, - "kl": 0.138671875, - "learning_rate": 9.126832377310388e-07, - "loss": 0.0056, - "reward": 1.5699678659439087, - "reward_std": 0.09194605052471161, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4918428659439087, - "rewards/pad": 0.078125, - "step": 274 - }, - { - "completion_length": 96.859375, - "epoch": 0.08763543658381134, - "grad_norm": 34.54340744018555, - "kl": 0.298828125, - "learning_rate": 9.123645634161886e-07, - "loss": 0.0119, - "reward": 1.5665819644927979, - "reward_std": 0.07807460427284241, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44158199429512024, - "step": 275 - }, - { - "completion_length": 96.1875, - "epoch": 0.08795411089866156, - "grad_norm": 72.8243179321289, - "kl": 0.09521484375, - "learning_rate": 9.120458891013384e-07, - "loss": 0.0038, - "reward": 1.6681749820709229, - "reward_std": 0.051725488156080246, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5431750416755676, - "rewards/pad": 0.125, - "step": 276 - }, - { - "completion_length": 122.734375, - "epoch": 0.08827278521351178, - "grad_norm": 25.338285446166992, - "kl": 0.361328125, - "learning_rate": 9.117272147864882e-07, - "loss": 0.0144, - "reward": 1.3991707563400269, - "reward_std": 0.07661792635917664, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.39917081594467163, - "rewards/pad": 0.0, - "step": 277 - }, - { - "completion_length": 20.0625, - "epoch": 0.08859145952836202, - "grad_norm": 56.99957275390625, - "kl": 0.1826171875, - "learning_rate": 9.114085404716379e-07, - "loss": 0.0073, - "reward": 1.7226518392562866, - "reward_std": 0.11980736255645752, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6132767796516418, - "rewards/pad": 0.109375, - "step": 278 - }, - { - "completion_length": 124.9375, - "epoch": 0.08891013384321224, - "grad_norm": 16.744503021240234, - "kl": 0.0654296875, - "learning_rate": 9.110898661567877e-07, - "loss": 0.0026, - "reward": 1.5589927434921265, - "reward_std": 0.042084500193595886, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4339926838874817, - "step": 279 - }, - { - "completion_length": 95.1875, - "epoch": 0.08922880815806246, - "grad_norm": 39.232322692871094, - "kl": 0.08740234375, - "learning_rate": 9.107711918419375e-07, - "loss": 0.0035, - "reward": 1.452876329421997, - "reward_std": 0.17438113689422607, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.46850135922431946, - "step": 280 - }, - { - "completion_length": 123.921875, - "epoch": 0.08954748247291268, - "grad_norm": 31.64417839050293, - "kl": 0.12353515625, - "learning_rate": 9.104525175270872e-07, - "loss": 0.005, - "reward": 1.5177637338638306, - "reward_std": 0.055521003901958466, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5177636742591858, - "step": 281 - }, - { - "completion_length": 19.5, - "epoch": 0.08986615678776291, - "grad_norm": 32.218997955322266, - "kl": 0.29296875, - "learning_rate": 9.10133843212237e-07, - "loss": 0.0117, - "reward": 1.470993995666504, - "reward_std": 0.12481606006622314, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4397439956665039, - "rewards/pad": 0.03125, - "step": 282 - }, - { - "completion_length": 46.0625, - "epoch": 0.09018483110261313, - "grad_norm": 43.10451126098633, - "kl": 0.1357421875, - "learning_rate": 9.098151688973869e-07, - "loss": 0.0054, - "reward": 1.586465835571289, - "reward_std": 0.15732800960540771, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4302159249782562, - "rewards/pad": 0.15625, - "step": 283 - }, - { - "completion_length": 97.484375, - "epoch": 0.09050350541746335, - "grad_norm": 19.141231536865234, - "kl": 0.09228515625, - "learning_rate": 9.094964945825367e-07, - "loss": 0.0037, - "reward": 1.7354822158813477, - "reward_std": 0.07660651952028275, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5011070966720581, - "rewards/pad": 0.234375, - "step": 284 - }, - { - "completion_length": 98.140625, - "epoch": 0.09082217973231357, - "grad_norm": 14.528425216674805, - "kl": 0.09765625, - "learning_rate": 9.091778202676864e-07, - "loss": 0.0039, - "reward": 1.5818281173706055, - "reward_std": 0.05947490036487579, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45682817697525024, - "rewards/pad": 0.125, - "step": 285 - }, - { - "completion_length": 98.359375, - "epoch": 0.0911408540471638, - "grad_norm": 15.328421592712402, - "kl": 0.07861328125, - "learning_rate": 9.088591459528362e-07, - "loss": 0.0031, - "reward": 1.688610553741455, - "reward_std": 0.05649499222636223, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4386104941368103, - "step": 286 - }, - { - "completion_length": 124.65625, - "epoch": 0.09145952836201403, - "grad_norm": 10.835719108581543, - "kl": 0.0810546875, - "learning_rate": 9.08540471637986e-07, - "loss": 0.0032, - "reward": 1.665027141571045, - "reward_std": 0.10295233130455017, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5556520819664001, - "step": 287 - }, - { - "completion_length": 150.796875, - "epoch": 0.09177820267686425, - "grad_norm": 14.098941802978516, - "kl": 0.07568359375, - "learning_rate": 9.082217973231358e-07, - "loss": 0.0031, - "reward": 1.432703971862793, - "reward_std": 0.05922875553369522, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43270397186279297, - "step": 288 - }, - { - "completion_length": 46.125, - "epoch": 0.09209687699171447, - "grad_norm": 80.2524185180664, - "kl": 0.216796875, - "learning_rate": 9.079031230082855e-07, - "loss": 0.0087, - "reward": 1.7530198097229004, - "reward_std": 0.10651280730962753, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6280196905136108, - "rewards/pad": 0.125, - "step": 289 - }, - { - "completion_length": 97.125, - "epoch": 0.09241555130656469, - "grad_norm": 15.310797691345215, - "kl": 0.166015625, - "learning_rate": 9.075844486934353e-07, - "loss": 0.0066, - "reward": 1.7720351219177246, - "reward_std": 0.13050448894500732, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5376600623130798, - "step": 290 - }, - { - "completion_length": 121.671875, - "epoch": 0.09273422562141491, - "grad_norm": 15.590791702270508, - "kl": 0.09033203125, - "learning_rate": 9.072657743785851e-07, - "loss": 0.0036, - "reward": 1.4074218273162842, - "reward_std": 0.04259195923805237, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4074217677116394, - "step": 291 - }, - { - "completion_length": 18.265625, - "epoch": 0.09305289993626514, - "grad_norm": 36.518375396728516, - "kl": 0.55859375, - "learning_rate": 9.069471000637349e-07, - "loss": 0.0224, - "reward": 1.3181414604187012, - "reward_std": 0.08523581176996231, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.31814149022102356, - "rewards/pad": 0.0, - "step": 292 - }, - { - "completion_length": 97.609375, - "epoch": 0.09337157425111536, - "grad_norm": 25.183494567871094, - "kl": 0.08984375, - "learning_rate": 9.066284257488846e-07, - "loss": 0.0036, - "reward": 1.4201927185058594, - "reward_std": 0.04629743844270706, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4201928973197937, - "step": 293 - }, - { - "completion_length": 72.8125, - "epoch": 0.09369024856596558, - "grad_norm": 23.733295440673828, - "kl": 0.24609375, - "learning_rate": 9.063097514340344e-07, - "loss": 0.0098, - "reward": 1.611457347869873, - "reward_std": 0.08153215050697327, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48645731806755066, - "rewards/pad": 0.125, - "step": 294 - }, - { - "completion_length": 99.671875, - "epoch": 0.0940089228808158, - "grad_norm": 27.60891342163086, - "kl": 0.2109375, - "learning_rate": 9.059910771191841e-07, - "loss": 0.0084, - "reward": 1.6462310552597046, - "reward_std": 0.08268588036298752, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5368560552597046, - "rewards/pad": 0.125, - "step": 295 - }, - { - "completion_length": 97.9375, - "epoch": 0.09432759719566602, - "grad_norm": 60.23105239868164, - "kl": 0.1171875, - "learning_rate": 9.056724028043339e-07, - "loss": 0.0047, - "reward": 1.507036805152893, - "reward_std": 0.24251246452331543, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 0.96875, - "rewards/tracking_iou_reward": 0.4757867455482483, - "step": 296 - }, - { - "completion_length": 122.328125, - "epoch": 0.09464627151051626, - "grad_norm": 31.52186393737793, - "kl": 0.0869140625, - "learning_rate": 9.053537284894836e-07, - "loss": 0.0035, - "reward": 1.488356113433838, - "reward_std": 0.062449708580970764, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4883561432361603, - "rewards/pad": 0.0, - "step": 297 - }, - { - "completion_length": 97.96875, - "epoch": 0.09496494582536648, - "grad_norm": 13.065885543823242, - "kl": 0.10498046875, - "learning_rate": 9.050350541746334e-07, - "loss": 0.0042, - "reward": 1.5660672187805176, - "reward_std": 0.03638705611228943, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4410672187805176, - "rewards/pad": 0.125, - "step": 298 - }, - { - "completion_length": 96.75, - "epoch": 0.0952836201402167, - "grad_norm": 38.61499786376953, - "kl": 0.10107421875, - "learning_rate": 9.047163798597832e-07, - "loss": 0.004, - "reward": 1.4196743965148926, - "reward_std": 0.06692565977573395, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4196743071079254, - "rewards/pad": 0.0, - "step": 299 - }, - { - "completion_length": 99.8125, - "epoch": 0.09560229445506692, - "grad_norm": 31.40985679626465, - "kl": 0.07177734375, - "learning_rate": 9.04397705544933e-07, - "loss": 0.0029, - "reward": 1.7245066165924072, - "reward_std": 0.05942245572805405, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4745066165924072, - "step": 300 - }, - { - "completion_length": 99.34375, - "epoch": 0.09592096876991714, - "grad_norm": 13.066228866577148, - "kl": 0.1318359375, - "learning_rate": 9.040790312300827e-07, - "loss": 0.0053, - "reward": 1.365534782409668, - "reward_std": 0.09436441212892532, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.38115984201431274, - "rewards/pad": 0.0, - "step": 301 - }, - { - "completion_length": 97.21875, - "epoch": 0.09623964308476737, - "grad_norm": 45.45714569091797, - "kl": 0.083984375, - "learning_rate": 9.037603569152326e-07, - "loss": 0.0034, - "reward": 1.6654975414276123, - "reward_std": 0.15218521654605865, - "rewards/answer_reward": 0.203125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46237245202064514, - "step": 302 - }, - { - "completion_length": 123.859375, - "epoch": 0.0965583173996176, - "grad_norm": 62.40977096557617, - "kl": 0.08349609375, - "learning_rate": 9.034416826003824e-07, - "loss": 0.0033, - "reward": 1.310499668121338, - "reward_std": 0.06527701765298843, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3104996085166931, - "step": 303 - }, - { - "completion_length": 70.203125, - "epoch": 0.09687699171446781, - "grad_norm": 78.83379364013672, - "kl": 0.080078125, - "learning_rate": 9.031230082855322e-07, - "loss": 0.0032, - "reward": 1.51714289188385, - "reward_std": 0.09799807518720627, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5171428918838501, - "step": 304 - }, - { - "completion_length": 148.671875, - "epoch": 0.09719566602931803, - "grad_norm": 7.507280349731445, - "kl": 0.05908203125, - "learning_rate": 9.028043339706819e-07, - "loss": 0.0024, - "reward": 1.4250938892364502, - "reward_std": 0.02290337160229683, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42509400844573975, - "rewards/pad": 0.0, - "step": 305 - }, - { - "completion_length": 44.546875, - "epoch": 0.09751434034416825, - "grad_norm": 109.25292205810547, - "kl": 0.1220703125, - "learning_rate": 9.024856596558317e-07, - "loss": 0.0049, - "reward": 1.4947134256362915, - "reward_std": 0.08180755376815796, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49471336603164673, - "rewards/pad": 0.0, - "step": 306 - }, - { - "completion_length": 48.046875, - "epoch": 0.09783301465901849, - "grad_norm": 135.5470428466797, - "kl": 0.125, - "learning_rate": 9.021669853409815e-07, - "loss": 0.005, - "reward": 1.6899664402008057, - "reward_std": 0.14552678167819977, - "rewards/answer_reward": 0.3125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3774664103984833, - "step": 307 - }, - { - "completion_length": 71.75, - "epoch": 0.09815168897386871, - "grad_norm": 19.311742782592773, - "kl": 0.1279296875, - "learning_rate": 9.018483110261312e-07, - "loss": 0.0051, - "reward": 1.4897065162658691, - "reward_std": 0.054242588579654694, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48970645666122437, - "rewards/pad": 0.0, - "step": 308 - }, - { - "completion_length": 97.0625, - "epoch": 0.09847036328871893, - "grad_norm": 14.59001636505127, - "kl": 0.1982421875, - "learning_rate": 9.01529636711281e-07, - "loss": 0.0079, - "reward": 1.43793523311615, - "reward_std": 0.05904996767640114, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4379352629184723, - "step": 309 - }, - { - "completion_length": 123.28125, - "epoch": 0.09878903760356915, - "grad_norm": 42.15071487426758, - "kl": 0.039794921875, - "learning_rate": 9.012109623964308e-07, - "loss": 0.0016, - "reward": 1.7498985528945923, - "reward_std": 0.04076165705919266, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4998985528945923, - "step": 310 - }, - { - "completion_length": 74.34375, - "epoch": 0.09910771191841937, - "grad_norm": 77.68610382080078, - "kl": 0.126953125, - "learning_rate": 9.008922880815806e-07, - "loss": 0.0051, - "reward": 1.4720065593719482, - "reward_std": 0.10089914500713348, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4720064103603363, - "rewards/pad": 0.0, - "step": 311 - }, - { - "completion_length": 176.0625, - "epoch": 0.0994263862332696, - "grad_norm": 46.3254508972168, - "kl": 0.036376953125, - "learning_rate": 9.005736137667303e-07, - "loss": 0.0015, - "reward": 1.5705065727233887, - "reward_std": 0.030933646485209465, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44550663232803345, - "step": 312 - }, - { - "completion_length": 71.703125, - "epoch": 0.09974506054811982, - "grad_norm": 50.289485931396484, - "kl": 0.380859375, - "learning_rate": 9.002549394518801e-07, - "loss": 0.0153, - "reward": 1.5453498363494873, - "reward_std": 0.053975529968738556, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42034971714019775, - "step": 313 - }, - { - "completion_length": 97.25, - "epoch": 0.10006373486297004, - "grad_norm": 96.94417572021484, - "kl": 0.8125, - "learning_rate": 8.999362651370299e-07, - "loss": 0.0326, - "reward": 1.7055671215057373, - "reward_std": 0.07332450896501541, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5805671215057373, - "step": 314 - }, - { - "completion_length": 95.4375, - "epoch": 0.10038240917782026, - "grad_norm": 81.17061614990234, - "kl": 0.07861328125, - "learning_rate": 8.996175908221797e-07, - "loss": 0.0031, - "reward": 1.5590391159057617, - "reward_std": 0.14300386607646942, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48091405630111694, - "step": 315 - }, - { - "completion_length": 72.828125, - "epoch": 0.10070108349267048, - "grad_norm": 88.43734741210938, - "kl": 0.79296875, - "learning_rate": 8.992989165073294e-07, - "loss": 0.0318, - "reward": 1.6209162473678589, - "reward_std": 0.1531776785850525, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4959162473678589, - "rewards/pad": 0.125, - "step": 316 - }, - { - "completion_length": 68.109375, - "epoch": 0.10101975780752072, - "grad_norm": 31.90052032470703, - "kl": 0.12890625, - "learning_rate": 8.989802421924792e-07, - "loss": 0.0052, - "reward": 1.5254950523376465, - "reward_std": 0.07256342470645905, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5254950523376465, - "rewards/pad": 0.0, - "step": 317 - }, - { - "completion_length": 71.765625, - "epoch": 0.10133843212237094, - "grad_norm": 27.011852264404297, - "kl": 0.25390625, - "learning_rate": 8.98661567877629e-07, - "loss": 0.0101, - "reward": 1.567828893661499, - "reward_std": 0.12586727738380432, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.567828893661499, - "step": 318 - }, - { - "completion_length": 124.890625, - "epoch": 0.10165710643722116, - "grad_norm": 28.799516677856445, - "kl": 0.09716796875, - "learning_rate": 8.983428935627788e-07, - "loss": 0.0039, - "reward": 1.7499445676803589, - "reward_std": 0.04650557041168213, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4999445080757141, - "step": 319 - }, - { - "completion_length": 98.84375, - "epoch": 0.10197578075207138, - "grad_norm": 22.20235252380371, - "kl": 0.10400390625, - "learning_rate": 8.980242192479286e-07, - "loss": 0.0041, - "reward": 1.466164231300354, - "reward_std": 0.0942983478307724, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4661642909049988, - "step": 320 - }, - { - "completion_length": 98.28125, - "epoch": 0.1022944550669216, - "grad_norm": 35.46802520751953, - "kl": 0.1015625, - "learning_rate": 8.977055449330784e-07, - "loss": 0.004, - "reward": 1.5338505506515503, - "reward_std": 0.050848715007305145, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.2838504910469055, - "step": 321 - }, - { - "completion_length": 76.0, - "epoch": 0.10261312938177183, - "grad_norm": 28.48044776916504, - "kl": 0.1181640625, - "learning_rate": 8.973868706182282e-07, - "loss": 0.0047, - "reward": 1.3551535606384277, - "reward_std": 0.09504449367523193, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3551536798477173, - "rewards/pad": 0.0, - "step": 322 - }, - { - "completion_length": 45.03125, - "epoch": 0.10293180369662205, - "grad_norm": 20.255050659179688, - "kl": 0.1435546875, - "learning_rate": 8.97068196303378e-07, - "loss": 0.0058, - "reward": 1.4209308624267578, - "reward_std": 0.0825200229883194, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42093077301979065, - "rewards/pad": 0.0, - "step": 323 - }, - { - "completion_length": 127.28125, - "epoch": 0.10325047801147227, - "grad_norm": 31.418140411376953, - "kl": 0.06640625, - "learning_rate": 8.967495219885277e-07, - "loss": 0.0027, - "reward": 1.5614705085754395, - "reward_std": 0.08935748040676117, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4208453893661499, - "step": 324 - }, - { - "completion_length": 70.859375, - "epoch": 0.1035691523263225, - "grad_norm": 46.53847122192383, - "kl": 0.09814453125, - "learning_rate": 8.964308476736775e-07, - "loss": 0.0039, - "reward": 1.7592271566390991, - "reward_std": 0.10191182047128677, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.3998522162437439, - "rewards/pad": 0.375, - "step": 325 - }, - { - "completion_length": 45.0625, - "epoch": 0.10388782664117271, - "grad_norm": 63.19196701049805, - "kl": 0.1142578125, - "learning_rate": 8.961121733588273e-07, - "loss": 0.0046, - "reward": 1.588505506515503, - "reward_std": 0.10087063908576965, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4635055661201477, - "rewards/pad": 0.125, - "step": 326 - }, - { - "completion_length": 177.09375, - "epoch": 0.10420650095602295, - "grad_norm": 14.488212585449219, - "kl": 0.032958984375, - "learning_rate": 8.957934990439771e-07, - "loss": 0.0013, - "reward": 1.3769497871398926, - "reward_std": 0.03301223739981651, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3769497275352478, - "step": 327 - }, - { - "completion_length": 72.4375, - "epoch": 0.10452517527087317, - "grad_norm": 28.563243865966797, - "kl": 0.1337890625, - "learning_rate": 8.954748247291268e-07, - "loss": 0.0053, - "reward": 1.5478661060333252, - "reward_std": 0.12154059112071991, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3603661060333252, - "rewards/pad": 0.1875, - "step": 328 - }, - { - "completion_length": 125.125, - "epoch": 0.10484384958572339, - "grad_norm": 199.4341583251953, - "kl": 0.07958984375, - "learning_rate": 8.951561504142766e-07, - "loss": 0.0032, - "reward": 1.3509695529937744, - "reward_std": 0.12035353481769562, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.21034449338912964, - "step": 329 - }, - { - "completion_length": 18.8125, - "epoch": 0.10516252390057361, - "grad_norm": 626.1193237304688, - "kl": 0.197265625, - "learning_rate": 8.948374760994264e-07, - "loss": 0.0079, - "reward": 1.6512441635131836, - "reward_std": 0.1490609496831894, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6512441635131836, - "rewards/pad": 0.0, - "step": 330 - }, - { - "completion_length": 122.71875, - "epoch": 0.10548119821542384, - "grad_norm": 24.436813354492188, - "kl": 0.1044921875, - "learning_rate": 8.945188017845762e-07, - "loss": 0.0042, - "reward": 1.5120811462402344, - "reward_std": 0.03990060091018677, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5120812058448792, - "step": 331 - }, - { - "completion_length": 73.984375, - "epoch": 0.10579987253027406, - "grad_norm": 24.41294288635254, - "kl": 0.130859375, - "learning_rate": 8.942001274697259e-07, - "loss": 0.0052, - "reward": 1.5127191543579102, - "reward_std": 0.06821781396865845, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3877192735671997, - "rewards/pad": 0.125, - "step": 332 - }, - { - "completion_length": 121.734375, - "epoch": 0.10611854684512428, - "grad_norm": 22.645463943481445, - "kl": 0.0732421875, - "learning_rate": 8.938814531548757e-07, - "loss": 0.003, - "reward": 1.3383376598358154, - "reward_std": 0.027271658182144165, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.33833763003349304, - "rewards/pad": 0.0, - "step": 333 - }, - { - "completion_length": 149.6875, - "epoch": 0.1064372211599745, - "grad_norm": 15.65821361541748, - "kl": 0.039794921875, - "learning_rate": 8.935627788400254e-07, - "loss": 0.0016, - "reward": 1.5990335941314697, - "reward_std": 0.04202403873205185, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4740336537361145, - "step": 334 - }, - { - "completion_length": 178.015625, - "epoch": 0.10675589547482472, - "grad_norm": 25.23544692993164, - "kl": 0.0439453125, - "learning_rate": 8.932441045251752e-07, - "loss": 0.0018, - "reward": 1.4337772130966187, - "reward_std": 0.08441469818353653, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4494021236896515, - "step": 335 - }, - { - "completion_length": 144.8125, - "epoch": 0.10707456978967496, - "grad_norm": 22.692462921142578, - "kl": 0.0703125, - "learning_rate": 8.929254302103249e-07, - "loss": 0.0028, - "reward": 1.4097086191177368, - "reward_std": 0.05583653971552849, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40970855951309204, - "step": 336 - }, - { - "completion_length": 170.28125, - "epoch": 0.10739324410452518, - "grad_norm": 44.01387405395508, - "kl": 0.038818359375, - "learning_rate": 8.926067558954747e-07, - "loss": 0.0016, - "reward": 1.407758116722107, - "reward_std": 0.05321196839213371, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.28275805711746216, - "step": 337 - }, - { - "completion_length": 98.15625, - "epoch": 0.1077119184193754, - "grad_norm": 72.85901641845703, - "kl": 0.0966796875, - "learning_rate": 8.922880815806245e-07, - "loss": 0.0039, - "reward": 1.5778709650039673, - "reward_std": 0.08505690097808838, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45287102460861206, - "rewards/pad": 0.125, - "step": 338 - }, - { - "completion_length": 98.4375, - "epoch": 0.10803059273422562, - "grad_norm": 18.919353485107422, - "kl": 0.0966796875, - "learning_rate": 8.919694072657742e-07, - "loss": 0.0039, - "reward": 1.6748125553131104, - "reward_std": 0.04199599474668503, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5498126149177551, - "rewards/pad": 0.125, - "step": 339 - }, - { - "completion_length": 95.484375, - "epoch": 0.10834926704907584, - "grad_norm": 17.899627685546875, - "kl": 0.1201171875, - "learning_rate": 8.916507329509241e-07, - "loss": 0.0048, - "reward": 1.4470590353012085, - "reward_std": 0.06309215724468231, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3220589756965637, - "rewards/pad": 0.125, - "step": 340 - }, - { - "completion_length": 45.546875, - "epoch": 0.10866794136392607, - "grad_norm": 34.16611099243164, - "kl": 0.1611328125, - "learning_rate": 8.913320586360739e-07, - "loss": 0.0064, - "reward": 1.5854992866516113, - "reward_std": 0.10462392866611481, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49174928665161133, - "step": 341 - }, - { - "completion_length": 73.15625, - "epoch": 0.1089866156787763, - "grad_norm": 30.530794143676758, - "kl": 0.177734375, - "learning_rate": 8.910133843212237e-07, - "loss": 0.0071, - "reward": 1.880784511566162, - "reward_std": 0.06079376861453056, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5057844519615173, - "step": 342 - }, - { - "completion_length": 72.015625, - "epoch": 0.10930528999362651, - "grad_norm": 19.3212833404541, - "kl": 0.142578125, - "learning_rate": 8.906947100063734e-07, - "loss": 0.0057, - "reward": 1.5231934785842896, - "reward_std": 0.05362851545214653, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.39819350838661194, - "rewards/pad": 0.125, - "step": 343 - }, - { - "completion_length": 44.890625, - "epoch": 0.10962396430847673, - "grad_norm": 113.36170196533203, - "kl": 0.671875, - "learning_rate": 8.903760356915232e-07, - "loss": 0.0269, - "reward": 1.7440569400787354, - "reward_std": 0.13982772827148438, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6190569400787354, - "step": 344 - }, - { - "completion_length": 94.1875, - "epoch": 0.10994263862332695, - "grad_norm": 84.85638427734375, - "kl": 0.10986328125, - "learning_rate": 8.90057361376673e-07, - "loss": 0.0044, - "reward": 1.4598510265350342, - "reward_std": 0.09140007197856903, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3504760265350342, - "step": 345 - }, - { - "completion_length": 124.046875, - "epoch": 0.11026131293817719, - "grad_norm": 106.96361541748047, - "kl": 0.10595703125, - "learning_rate": 8.897386870618228e-07, - "loss": 0.0042, - "reward": 1.5757009983062744, - "reward_std": 0.11959008872509003, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5132008790969849, - "step": 346 - }, - { - "completion_length": 98.046875, - "epoch": 0.11057998725302741, - "grad_norm": 62.11159133911133, - "kl": 0.404296875, - "learning_rate": 8.894200127469725e-07, - "loss": 0.0162, - "reward": 1.4973745346069336, - "reward_std": 0.09182165563106537, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35674965381622314, - "rewards/pad": 0.140625, - "step": 347 - }, - { - "completion_length": 70.578125, - "epoch": 0.11089866156787763, - "grad_norm": 28.104806900024414, - "kl": 0.1357421875, - "learning_rate": 8.891013384321223e-07, - "loss": 0.0054, - "reward": 1.7881293296813965, - "reward_std": 0.10459479689598083, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.538129448890686, - "step": 348 - }, - { - "completion_length": 124.671875, - "epoch": 0.11121733588272785, - "grad_norm": 39.63365936279297, - "kl": 0.08544921875, - "learning_rate": 8.887826641172721e-07, - "loss": 0.0034, - "reward": 1.4937341213226318, - "reward_std": 0.04106447100639343, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49373406171798706, - "rewards/pad": 0.0, - "step": 349 - }, - { - "completion_length": 96.65625, - "epoch": 0.11153601019757807, - "grad_norm": 13.17743968963623, - "kl": 0.115234375, - "learning_rate": 8.884639898024219e-07, - "loss": 0.0046, - "reward": 1.3785258531570435, - "reward_std": 0.05048896372318268, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.37852585315704346, - "rewards/pad": 0.0, - "step": 350 - }, - { - "completion_length": 150.3125, - "epoch": 0.1118546845124283, - "grad_norm": 10.41886043548584, - "kl": 0.0703125, - "learning_rate": 8.881453154875716e-07, - "loss": 0.0029, - "reward": 1.6018152236938477, - "reward_std": 0.023036912083625793, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47681528329849243, - "step": 351 - }, - { - "completion_length": 98.09375, - "epoch": 0.11217335882727852, - "grad_norm": 24.771432876586914, - "kl": 0.1845703125, - "learning_rate": 8.878266411727214e-07, - "loss": 0.0074, - "reward": 1.751751184463501, - "reward_std": 0.07713186740875244, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.501751184463501, - "step": 352 - }, - { - "completion_length": 95.296875, - "epoch": 0.11249203314212874, - "grad_norm": 108.3392333984375, - "kl": 0.09619140625, - "learning_rate": 8.875079668578712e-07, - "loss": 0.0038, - "reward": 1.344058871269226, - "reward_std": 0.06936553120613098, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.34405890107154846, - "step": 353 - }, - { - "completion_length": 121.9375, - "epoch": 0.11281070745697896, - "grad_norm": 16.075292587280273, - "kl": 0.1689453125, - "learning_rate": 8.87189292543021e-07, - "loss": 0.0068, - "reward": 1.3336937427520752, - "reward_std": 0.05201172083616257, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.33369380235671997, - "rewards/pad": 0.0, - "step": 354 - }, - { - "completion_length": 96.609375, - "epoch": 0.11312938177182918, - "grad_norm": 57.56117248535156, - "kl": 0.0966796875, - "learning_rate": 8.868706182281707e-07, - "loss": 0.0039, - "reward": 1.5197579860687256, - "reward_std": 0.040330007672309875, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5197579860687256, - "rewards/pad": 0.0, - "step": 355 - }, - { - "completion_length": 70.921875, - "epoch": 0.11344805608667942, - "grad_norm": 53.66902160644531, - "kl": 0.1826171875, - "learning_rate": 8.865519439133205e-07, - "loss": 0.0073, - "reward": 1.4107165336608887, - "reward_std": 0.20225141942501068, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3482164740562439, - "rewards/pad": 0.0625, - "step": 356 - }, - { - "completion_length": 124.359375, - "epoch": 0.11376673040152964, - "grad_norm": 23.252378463745117, - "kl": 0.0732421875, - "learning_rate": 8.862332695984703e-07, - "loss": 0.0029, - "reward": 1.4586352109909058, - "reward_std": 0.09969377517700195, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.474260151386261, - "step": 357 - }, - { - "completion_length": 177.578125, - "epoch": 0.11408540471637986, - "grad_norm": 25.044593811035156, - "kl": 0.05126953125, - "learning_rate": 8.859145952836202e-07, - "loss": 0.0021, - "reward": 1.4991048574447632, - "reward_std": 0.023005153983831406, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.37410491704940796, - "step": 358 - }, - { - "completion_length": 94.765625, - "epoch": 0.11440407903123008, - "grad_norm": 24.907939910888672, - "kl": 0.123046875, - "learning_rate": 8.855959209687699e-07, - "loss": 0.0049, - "reward": 1.4080394506454468, - "reward_std": 0.03388051688671112, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40803948044776917, - "rewards/pad": 0.0, - "step": 359 - }, - { - "completion_length": 72.53125, - "epoch": 0.1147227533460803, - "grad_norm": 41.2052116394043, - "kl": 0.126953125, - "learning_rate": 8.852772466539197e-07, - "loss": 0.0051, - "reward": 1.5547771453857422, - "reward_std": 0.07238291949033737, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4297771453857422, - "rewards/pad": 0.125, - "step": 360 - }, - { - "completion_length": 98.53125, - "epoch": 0.11504142766093053, - "grad_norm": 47.04529571533203, - "kl": 0.11279296875, - "learning_rate": 8.849585723390695e-07, - "loss": 0.0045, - "reward": 1.6135101318359375, - "reward_std": 0.07768786698579788, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4885101318359375, - "step": 361 - }, - { - "completion_length": 125.328125, - "epoch": 0.11536010197578075, - "grad_norm": 124.77279663085938, - "kl": 0.0673828125, - "learning_rate": 8.846398980242193e-07, - "loss": 0.0027, - "reward": 1.5826597213745117, - "reward_std": 0.09398077428340912, - "rewards/answer_reward": 0.15625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.42640984058380127, - "step": 362 - }, - { - "completion_length": 22.109375, - "epoch": 0.11567877629063097, - "grad_norm": 55.29496383666992, - "kl": 0.11767578125, - "learning_rate": 8.84321223709369e-07, - "loss": 0.0047, - "reward": 1.9375572204589844, - "reward_std": 0.19804233312606812, - "rewards/answer_reward": 0.515625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.421932190656662, - "step": 363 - }, - { - "completion_length": 97.78125, - "epoch": 0.1159974506054812, - "grad_norm": 23.788480758666992, - "kl": 0.080078125, - "learning_rate": 8.840025493945188e-07, - "loss": 0.0032, - "reward": 1.5006022453308105, - "reward_std": 0.04466657713055611, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5006022453308105, - "rewards/pad": 0.0, - "step": 364 - }, - { - "completion_length": 146.4375, - "epoch": 0.11631612492033142, - "grad_norm": 20.29625129699707, - "kl": 0.0888671875, - "learning_rate": 8.836838750796686e-07, - "loss": 0.0036, - "reward": 1.5072904825210571, - "reward_std": 0.04846058040857315, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38229045271873474, - "step": 365 - }, - { - "completion_length": 70.609375, - "epoch": 0.11663479923518165, - "grad_norm": 64.4805679321289, - "kl": 0.1962890625, - "learning_rate": 8.833652007648184e-07, - "loss": 0.0078, - "reward": 1.693876028060913, - "reward_std": 0.14755964279174805, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5688760876655579, - "rewards/pad": 0.125, - "step": 366 - }, - { - "completion_length": 69.875, - "epoch": 0.11695347355003187, - "grad_norm": 35.21623229980469, - "kl": 0.1298828125, - "learning_rate": 8.830465264499681e-07, - "loss": 0.0052, - "reward": 1.5406925678253174, - "reward_std": 0.15816015005111694, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5094425678253174, - "rewards/pad": 0.03125, - "step": 367 - }, - { - "completion_length": 98.671875, - "epoch": 0.11727214786488209, - "grad_norm": 257.2796936035156, - "kl": 0.08935546875, - "learning_rate": 8.827278521351179e-07, - "loss": 0.0036, - "reward": 1.5796436071395874, - "reward_std": 0.14178229868412018, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5327686667442322, - "rewards/pad": 0.046875, - "step": 368 - }, - { - "completion_length": 121.796875, - "epoch": 0.11759082217973231, - "grad_norm": 48.446983337402344, - "kl": 0.083984375, - "learning_rate": 8.824091778202677e-07, - "loss": 0.0033, - "reward": 1.512686014175415, - "reward_std": 0.06343154609203339, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38768601417541504, - "step": 369 - }, - { - "completion_length": 71.015625, - "epoch": 0.11790949649458253, - "grad_norm": 26.253490447998047, - "kl": 0.125, - "learning_rate": 8.820905035054175e-07, - "loss": 0.005, - "reward": 1.541078805923462, - "reward_std": 0.09650516510009766, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5410787463188171, - "rewards/pad": 0.0, - "step": 370 - }, - { - "completion_length": 47.359375, - "epoch": 0.11822817080943276, - "grad_norm": 41.3243408203125, - "kl": 0.1650390625, - "learning_rate": 8.817718291905672e-07, - "loss": 0.0066, - "reward": 1.6651489734649658, - "reward_std": 0.1509116291999817, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43077388405799866, - "step": 371 - }, - { - "completion_length": 70.703125, - "epoch": 0.11854684512428298, - "grad_norm": 55.84326934814453, - "kl": 0.1875, - "learning_rate": 8.81453154875717e-07, - "loss": 0.0075, - "reward": 1.609081745147705, - "reward_std": 0.12055587768554688, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49970677495002747, - "step": 372 - }, - { - "completion_length": 45.640625, - "epoch": 0.1188655194391332, - "grad_norm": 151.74977111816406, - "kl": 0.1435546875, - "learning_rate": 8.811344805608667e-07, - "loss": 0.0057, - "reward": 1.4652079343795776, - "reward_std": 0.09197273850440979, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4652078449726105, - "step": 373 - }, - { - "completion_length": 44.625, - "epoch": 0.11918419375398343, - "grad_norm": 45.7737922668457, - "kl": 0.1513671875, - "learning_rate": 8.808158062460164e-07, - "loss": 0.0061, - "reward": 1.5594871044158936, - "reward_std": 0.15570798516273499, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5594871640205383, - "rewards/pad": 0.0, - "step": 374 - }, - { - "completion_length": 70.296875, - "epoch": 0.11950286806883365, - "grad_norm": 15.35860824584961, - "kl": 0.1669921875, - "learning_rate": 8.804971319311662e-07, - "loss": 0.0067, - "reward": 1.7419081926345825, - "reward_std": 0.06085880100727081, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6169082522392273, - "rewards/pad": 0.125, - "step": 375 - }, - { - "completion_length": 73.140625, - "epoch": 0.11982154238368388, - "grad_norm": 43.59029006958008, - "kl": 0.11181640625, - "learning_rate": 8.80178457616316e-07, - "loss": 0.0045, - "reward": 1.6957318782806396, - "reward_std": 0.09901833534240723, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4457317590713501, - "rewards/pad": 0.25, - "step": 376 - }, - { - "completion_length": 151.546875, - "epoch": 0.1201402166985341, - "grad_norm": 44.84366989135742, - "kl": 0.05126953125, - "learning_rate": 8.798597833014659e-07, - "loss": 0.0021, - "reward": 1.5476773977279663, - "reward_std": 0.061405330896377563, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4226773977279663, - "step": 377 - }, - { - "completion_length": 74.0625, - "epoch": 0.12045889101338432, - "grad_norm": 33.01712417602539, - "kl": 0.09521484375, - "learning_rate": 8.795411089866156e-07, - "loss": 0.0038, - "reward": 1.4716873168945312, - "reward_std": 0.13285109400749207, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3779374063014984, - "step": 378 - }, - { - "completion_length": 46.515625, - "epoch": 0.12077756532823454, - "grad_norm": 36.0592041015625, - "kl": 0.1064453125, - "learning_rate": 8.792224346717654e-07, - "loss": 0.0042, - "reward": 1.7025115489959717, - "reward_std": 0.18063417077064514, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5306365489959717, - "rewards/pad": 0.171875, - "step": 379 - }, - { - "completion_length": 97.40625, - "epoch": 0.12109623964308477, - "grad_norm": 63.695777893066406, - "kl": 0.0859375, - "learning_rate": 8.789037603569152e-07, - "loss": 0.0034, - "reward": 1.573782205581665, - "reward_std": 0.11549675464630127, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.44878220558166504, - "step": 380 - }, - { - "completion_length": 97.625, - "epoch": 0.121414913957935, - "grad_norm": 45.81637954711914, - "kl": 0.326171875, - "learning_rate": 8.78585086042065e-07, - "loss": 0.013, - "reward": 1.4872021675109863, - "reward_std": 0.10220817476511002, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4872022271156311, - "rewards/pad": 0.0, - "step": 381 - }, - { - "completion_length": 97.0, - "epoch": 0.12173358827278521, - "grad_norm": 79.03022766113281, - "kl": 0.08935546875, - "learning_rate": 8.782664117272147e-07, - "loss": 0.0036, - "reward": 1.592256784439087, - "reward_std": 0.09763297438621521, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46725672483444214, - "step": 382 - }, - { - "completion_length": 98.640625, - "epoch": 0.12205226258763544, - "grad_norm": 81.4788818359375, - "kl": 0.0947265625, - "learning_rate": 8.779477374123645e-07, - "loss": 0.0038, - "reward": 1.4392238855361938, - "reward_std": 0.10192390531301498, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.31422388553619385, - "rewards/pad": 0.125, - "step": 383 - }, - { - "completion_length": 123.609375, - "epoch": 0.12237093690248566, - "grad_norm": 61.001285552978516, - "kl": 0.08740234375, - "learning_rate": 8.776290630975143e-07, - "loss": 0.0035, - "reward": 1.4689092636108398, - "reward_std": 0.057621996849775314, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3439093828201294, - "rewards/pad": 0.125, - "step": 384 - }, - { - "completion_length": 98.40625, - "epoch": 0.12268961121733589, - "grad_norm": 28.01368522644043, - "kl": 0.12890625, - "learning_rate": 8.773103887826641e-07, - "loss": 0.0051, - "reward": 1.4256947040557861, - "reward_std": 0.08606939017772675, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.42569467425346375, - "step": 385 - }, - { - "completion_length": 71.609375, - "epoch": 0.12300828553218611, - "grad_norm": 29.05324363708496, - "kl": 0.1259765625, - "learning_rate": 8.769917144678138e-07, - "loss": 0.005, - "reward": 1.6970813274383545, - "reward_std": 0.06371637433767319, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5720812082290649, - "rewards/pad": 0.125, - "step": 386 - }, - { - "completion_length": 46.125, - "epoch": 0.12332695984703633, - "grad_norm": 37.76282501220703, - "kl": 0.1279296875, - "learning_rate": 8.766730401529636e-07, - "loss": 0.0051, - "reward": 1.5817197561264038, - "reward_std": 0.12376982718706131, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4567198157310486, - "rewards/pad": 0.125, - "step": 387 - }, - { - "completion_length": 74.953125, - "epoch": 0.12364563416188655, - "grad_norm": 29.852304458618164, - "kl": 0.09228515625, - "learning_rate": 8.763543658381134e-07, - "loss": 0.0037, - "reward": 1.7793254852294922, - "reward_std": 0.10431215912103653, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4355755150318146, - "rewards/pad": 0.34375, - "step": 388 - }, - { - "completion_length": 123.40625, - "epoch": 0.12396430847673677, - "grad_norm": 95.40129852294922, - "kl": 0.06103515625, - "learning_rate": 8.760356915232632e-07, - "loss": 0.0024, - "reward": 1.5170669555664062, - "reward_std": 0.19634395837783813, - "rewards/pad": 0.171875, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3608168363571167, - "step": 389 - }, - { - "completion_length": 151.34375, - "epoch": 0.124282982791587, - "grad_norm": 14.677362442016602, - "kl": 0.087890625, - "learning_rate": 8.757170172084129e-07, - "loss": 0.0035, - "reward": 1.4010517597198486, - "reward_std": 0.02272605523467064, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2760518193244934, - "step": 390 - }, - { - "completion_length": 70.28125, - "epoch": 0.12460165710643722, - "grad_norm": 32.54584884643555, - "kl": 0.150390625, - "learning_rate": 8.753983428935627e-07, - "loss": 0.006, - "reward": 1.6120429039001465, - "reward_std": 0.08093911409378052, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48704293370246887, - "step": 391 - }, - { - "completion_length": 148.484375, - "epoch": 0.12492033142128744, - "grad_norm": 22.513427734375, - "kl": 0.06298828125, - "learning_rate": 8.750796685787125e-07, - "loss": 0.0025, - "reward": 1.3614397048950195, - "reward_std": 0.05563504248857498, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3614397644996643, - "step": 392 - }, - { - "completion_length": 124.734375, - "epoch": 0.12523900573613767, - "grad_norm": 27.51471519470215, - "kl": 0.0791015625, - "learning_rate": 8.747609942638623e-07, - "loss": 0.0032, - "reward": 1.738678216934204, - "reward_std": 0.1270914524793625, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.5043032169342041, - "step": 393 - }, - { - "completion_length": 124.046875, - "epoch": 0.12555768005098789, - "grad_norm": 78.34660339355469, - "kl": 0.06201171875, - "learning_rate": 8.74442319949012e-07, - "loss": 0.0025, - "reward": 1.4761624336242676, - "reward_std": 0.08410072326660156, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3667874038219452, - "step": 394 - }, - { - "completion_length": 177.921875, - "epoch": 0.1258763543658381, - "grad_norm": 38.84751892089844, - "kl": 0.1650390625, - "learning_rate": 8.741236456341619e-07, - "loss": 0.0066, - "reward": 1.2327470779418945, - "reward_std": 0.019952615723013878, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.23274700343608856, - "step": 395 - }, - { - "completion_length": 46.65625, - "epoch": 0.12619502868068833, - "grad_norm": 29.15982437133789, - "kl": 0.1494140625, - "learning_rate": 8.738049713193117e-07, - "loss": 0.0059, - "reward": 1.751692533493042, - "reward_std": 0.07417924702167511, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5016926527023315, - "step": 396 - }, - { - "completion_length": 45.5, - "epoch": 0.12651370299553855, - "grad_norm": 53.9080696105957, - "kl": 0.0947265625, - "learning_rate": 8.734862970044615e-07, - "loss": 0.0038, - "reward": 1.654681921005249, - "reward_std": 0.14040318131446838, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48280686140060425, - "rewards/pad": 0.171875, - "step": 397 - }, - { - "completion_length": 124.953125, - "epoch": 0.1268323773103888, - "grad_norm": 11.394405364990234, - "kl": 0.064453125, - "learning_rate": 8.731676226896112e-07, - "loss": 0.0026, - "reward": 1.7598766088485718, - "reward_std": 0.08118850737810135, - "rewards/answer_reward": 0.359375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4005016088485718, - "step": 398 - }, - { - "completion_length": 121.28125, - "epoch": 0.12715105162523901, - "grad_norm": 107.51454162597656, - "kl": 0.07958984375, - "learning_rate": 8.72848948374761e-07, - "loss": 0.0032, - "reward": 1.3193341493606567, - "reward_std": 0.06677798926830292, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3193341791629791, - "step": 399 - }, - { - "completion_length": 19.828125, - "epoch": 0.12746972594008923, - "grad_norm": 31.021102905273438, - "kl": 0.2021484375, - "learning_rate": 8.725302740599108e-07, - "loss": 0.0081, - "reward": 1.7511768341064453, - "reward_std": 0.14500945806503296, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5168018937110901, - "step": 400 - }, - { - "completion_length": 148.140625, - "epoch": 0.12778840025493945, - "grad_norm": 16.07240104675293, - "kl": 0.09375, - "learning_rate": 8.722115997450606e-07, - "loss": 0.0038, - "reward": 1.3646976947784424, - "reward_std": 0.03278578072786331, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36469757556915283, - "step": 401 - }, - { - "completion_length": 45.4375, - "epoch": 0.12810707456978968, - "grad_norm": 40.273704528808594, - "kl": 0.15234375, - "learning_rate": 8.718929254302103e-07, - "loss": 0.0061, - "reward": 1.6111754179000854, - "reward_std": 0.09451623260974884, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4861753582954407, - "step": 402 - }, - { - "completion_length": 123.953125, - "epoch": 0.1284257488846399, - "grad_norm": 25.872350692749023, - "kl": 0.07275390625, - "learning_rate": 8.715742511153601e-07, - "loss": 0.0029, - "reward": 1.568497657775879, - "reward_std": 0.07783734798431396, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44349759817123413, - "rewards/pad": 0.125, - "step": 403 - }, - { - "completion_length": 68.53125, - "epoch": 0.12874442319949012, - "grad_norm": 25.39286231994629, - "kl": 0.1826171875, - "learning_rate": 8.712555768005099e-07, - "loss": 0.0073, - "reward": 1.4752670526504517, - "reward_std": 0.15198847651481628, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.49089211225509644, - "rewards/pad": 0.0, - "step": 404 - }, - { - "completion_length": 71.8125, - "epoch": 0.12906309751434034, - "grad_norm": 177.87518310546875, - "kl": 0.1103515625, - "learning_rate": 8.709369024856596e-07, - "loss": 0.0044, - "reward": 1.634433627128601, - "reward_std": 0.10617288947105408, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5250586271286011, - "rewards/pad": 0.109375, - "step": 405 - }, - { - "completion_length": 100.03125, - "epoch": 0.12938177182919056, - "grad_norm": 33.5173454284668, - "kl": 0.158203125, - "learning_rate": 8.706182281708094e-07, - "loss": 0.0063, - "reward": 1.6183446645736694, - "reward_std": 0.12125556170940399, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47771957516670227, - "step": 406 - }, - { - "completion_length": 73.484375, - "epoch": 0.1297004461440408, - "grad_norm": 101.80476379394531, - "kl": 0.162109375, - "learning_rate": 8.702995538559592e-07, - "loss": 0.0065, - "reward": 1.7018132209777832, - "reward_std": 0.13162967562675476, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4830631613731384, - "step": 407 - }, - { - "completion_length": 95.15625, - "epoch": 0.13001912045889102, - "grad_norm": 36.46905517578125, - "kl": 0.125, - "learning_rate": 8.69980879541109e-07, - "loss": 0.005, - "reward": 1.3658957481384277, - "reward_std": 0.06615029275417328, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36589574813842773, - "step": 408 - }, - { - "completion_length": 174.375, - "epoch": 0.13033779477374124, - "grad_norm": 30.588741302490234, - "kl": 0.58203125, - "learning_rate": 8.696622052262587e-07, - "loss": 0.0233, - "reward": 1.4599390029907227, - "reward_std": 0.09191317856311798, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45993906259536743, - "step": 409 - }, - { - "completion_length": 72.21875, - "epoch": 0.13065646908859146, - "grad_norm": 28.82553482055664, - "kl": 0.12890625, - "learning_rate": 8.693435309114085e-07, - "loss": 0.0052, - "reward": 1.7069793939590454, - "reward_std": 0.10922467708587646, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4569793939590454, - "step": 410 - }, - { - "completion_length": 20.109375, - "epoch": 0.13097514340344169, - "grad_norm": 31.949430465698242, - "kl": 0.15234375, - "learning_rate": 8.690248565965583e-07, - "loss": 0.0061, - "reward": 1.583141565322876, - "reward_std": 0.07516056299209595, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.458141565322876, - "rewards/pad": 0.125, - "step": 411 - }, - { - "completion_length": 96.875, - "epoch": 0.1312938177182919, - "grad_norm": 21.5670223236084, - "kl": 0.09375, - "learning_rate": 8.68706182281708e-07, - "loss": 0.0037, - "reward": 1.437939167022705, - "reward_std": 0.10327853262424469, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4379391074180603, - "step": 412 - }, - { - "completion_length": 100.40625, - "epoch": 0.13161249203314213, - "grad_norm": 55.93552017211914, - "kl": 0.0859375, - "learning_rate": 8.683875079668577e-07, - "loss": 0.0034, - "reward": 1.4741472005844116, - "reward_std": 0.05575428530573845, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34914714097976685, - "rewards/pad": 0.125, - "step": 413 - }, - { - "completion_length": 98.203125, - "epoch": 0.13193116634799235, - "grad_norm": 121.39154052734375, - "kl": 0.1279296875, - "learning_rate": 8.680688336520075e-07, - "loss": 0.0051, - "reward": 1.4627301692962646, - "reward_std": 0.09106460213661194, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.33773019909858704, - "step": 414 - }, - { - "completion_length": 98.90625, - "epoch": 0.13224984066284257, - "grad_norm": 37.05792999267578, - "kl": 0.0830078125, - "learning_rate": 8.677501593371574e-07, - "loss": 0.0033, - "reward": 1.5707786083221436, - "reward_std": 0.11219216883182526, - "rewards/answer_reward": 0.0625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5082787275314331, - "step": 415 - }, - { - "completion_length": 46.234375, - "epoch": 0.1325685149776928, - "grad_norm": 26.515222549438477, - "kl": 0.146484375, - "learning_rate": 8.674314850223072e-07, - "loss": 0.0059, - "reward": 1.4971379041671753, - "reward_std": 0.09204782545566559, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4971378445625305, - "step": 416 - }, - { - "completion_length": 97.828125, - "epoch": 0.13288718929254303, - "grad_norm": 48.868499755859375, - "kl": 0.07080078125, - "learning_rate": 8.671128107074569e-07, - "loss": 0.0028, - "reward": 1.7335339784622192, - "reward_std": 0.12875020503997803, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.499159038066864, - "rewards/pad": 0.234375, - "step": 417 - }, - { - "completion_length": 70.671875, - "epoch": 0.13320586360739325, - "grad_norm": 33.53970718383789, - "kl": 0.1474609375, - "learning_rate": 8.667941363926067e-07, - "loss": 0.0059, - "reward": 1.6477628946304321, - "reward_std": 0.11864393204450607, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5383878946304321, - "rewards/pad": 0.109375, - "step": 418 - }, - { - "completion_length": 94.390625, - "epoch": 0.13352453792224347, - "grad_norm": 44.05862045288086, - "kl": 0.11328125, - "learning_rate": 8.664754620777565e-07, - "loss": 0.0045, - "reward": 1.4214290380477905, - "reward_std": 0.08193930238485336, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42142897844314575, - "step": 419 - }, - { - "completion_length": 70.859375, - "epoch": 0.1338432122370937, - "grad_norm": 16.737979888916016, - "kl": 0.12451171875, - "learning_rate": 8.661567877629063e-07, - "loss": 0.005, - "reward": 1.6469398736953735, - "reward_std": 0.07638402283191681, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6469398736953735, - "rewards/pad": 0.0, - "step": 420 - }, - { - "completion_length": 120.421875, - "epoch": 0.13416188655194392, - "grad_norm": 26.583585739135742, - "kl": 0.2138671875, - "learning_rate": 8.65838113448056e-07, - "loss": 0.0086, - "reward": 1.4169089794158936, - "reward_std": 0.07294677197933197, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41690900921821594, - "rewards/pad": 0.0, - "step": 421 - }, - { - "completion_length": 97.40625, - "epoch": 0.13448056086679414, - "grad_norm": 21.15988540649414, - "kl": 0.068359375, - "learning_rate": 8.655194391332058e-07, - "loss": 0.0027, - "reward": 1.4524857997894287, - "reward_std": 0.054185282438993454, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4524857997894287, - "rewards/pad": 0.0, - "step": 422 - }, - { - "completion_length": 46.53125, - "epoch": 0.13479923518164436, - "grad_norm": 90.01542663574219, - "kl": 0.123046875, - "learning_rate": 8.652007648183556e-07, - "loss": 0.0049, - "reward": 1.6122829914093018, - "reward_std": 0.11555016785860062, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48728299140930176, - "rewards/pad": 0.125, - "step": 423 - }, - { - "completion_length": 94.4375, - "epoch": 0.13511790949649458, - "grad_norm": 30.43792724609375, - "kl": 0.0908203125, - "learning_rate": 8.648820905035054e-07, - "loss": 0.0036, - "reward": 1.5253713130950928, - "reward_std": 0.06750549376010895, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5253711938858032, - "rewards/pad": 0.0, - "step": 424 - }, - { - "completion_length": 98.25, - "epoch": 0.1354365838113448, - "grad_norm": 50.584659576416016, - "kl": 0.125, - "learning_rate": 8.645634161886551e-07, - "loss": 0.005, - "reward": 1.4811164140701294, - "reward_std": 0.04695218801498413, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4811164438724518, - "step": 425 - }, - { - "completion_length": 174.84375, - "epoch": 0.13575525812619502, - "grad_norm": 26.733871459960938, - "kl": 0.053466796875, - "learning_rate": 8.642447418738049e-07, - "loss": 0.0021, - "reward": 1.3371973037719727, - "reward_std": 0.024800507351756096, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3371972441673279, - "rewards/pad": 0.0, - "step": 426 - }, - { - "completion_length": 98.34375, - "epoch": 0.13607393244104526, - "grad_norm": 36.893760681152344, - "kl": 0.11962890625, - "learning_rate": 8.639260675589547e-07, - "loss": 0.0048, - "reward": 1.6948579549789429, - "reward_std": 0.10014330595731735, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5698579549789429, - "rewards/pad": 0.125, - "step": 427 - }, - { - "completion_length": 122.75, - "epoch": 0.13639260675589548, - "grad_norm": 152.30303955078125, - "kl": 0.12158203125, - "learning_rate": 8.636073932441045e-07, - "loss": 0.0049, - "reward": 1.5262490510940552, - "reward_std": 0.07107570022344589, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4012489914894104, - "step": 428 - }, - { - "completion_length": 96.6875, - "epoch": 0.1367112810707457, - "grad_norm": 40.88010787963867, - "kl": 0.12890625, - "learning_rate": 8.632887189292542e-07, - "loss": 0.0051, - "reward": 1.2539267539978027, - "reward_std": 0.09933836758136749, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2539267838001251, - "rewards/pad": 0.0, - "step": 429 - }, - { - "completion_length": 72.09375, - "epoch": 0.13702995538559593, - "grad_norm": 58.52267074584961, - "kl": 0.162109375, - "learning_rate": 8.62970044614404e-07, - "loss": 0.0065, - "reward": 1.5780236721038818, - "reward_std": 0.05143863707780838, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.32802361249923706, - "step": 430 - }, - { - "completion_length": 45.765625, - "epoch": 0.13734862970044615, - "grad_norm": 31.6570987701416, - "kl": 0.1318359375, - "learning_rate": 8.626513702995538e-07, - "loss": 0.0053, - "reward": 1.4814674854278564, - "reward_std": 0.05928094685077667, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35646751523017883, - "rewards/pad": 0.125, - "step": 431 - }, - { - "completion_length": 153.484375, - "epoch": 0.13766730401529637, - "grad_norm": 109.1246337890625, - "kl": 0.09521484375, - "learning_rate": 8.623326959847035e-07, - "loss": 0.0038, - "reward": 1.4952130317687988, - "reward_std": 0.0844242125749588, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49521297216415405, - "rewards/pad": 0.0, - "step": 432 - }, - { - "completion_length": 47.296875, - "epoch": 0.13798597833014659, - "grad_norm": 52.604557037353516, - "kl": 0.14453125, - "learning_rate": 8.620140216698534e-07, - "loss": 0.0058, - "reward": 1.4703024625778198, - "reward_std": 0.16446053981781006, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3453024625778198, - "rewards/pad": 0.125, - "step": 433 - }, - { - "completion_length": 175.828125, - "epoch": 0.1383046526449968, - "grad_norm": 21.99553108215332, - "kl": 0.0400390625, - "learning_rate": 8.616953473550032e-07, - "loss": 0.0016, - "reward": 1.3667819499969482, - "reward_std": 0.03961355984210968, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.36678197979927063, - "rewards/pad": 0.0, - "step": 434 - }, - { - "completion_length": 97.375, - "epoch": 0.13862332695984703, - "grad_norm": 69.48381042480469, - "kl": 0.11376953125, - "learning_rate": 8.61376673040153e-07, - "loss": 0.0045, - "reward": 1.5398564338684082, - "reward_std": 0.08691239356994629, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41485652327537537, - "step": 435 - }, - { - "completion_length": 120.765625, - "epoch": 0.13894200127469725, - "grad_norm": 52.82097625732422, - "kl": 0.15625, - "learning_rate": 8.610579987253027e-07, - "loss": 0.0062, - "reward": 1.4473316669464111, - "reward_std": 0.09470543265342712, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44733160734176636, - "step": 436 - }, - { - "completion_length": 124.859375, - "epoch": 0.1392606755895475, - "grad_norm": 14.809861183166504, - "kl": 0.1123046875, - "learning_rate": 8.607393244104525e-07, - "loss": 0.0045, - "reward": 1.487265706062317, - "reward_std": 0.06385494023561478, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2372657209634781, - "step": 437 - }, - { - "completion_length": 43.8125, - "epoch": 0.13957934990439771, - "grad_norm": 24.490432739257812, - "kl": 0.14453125, - "learning_rate": 8.604206500956023e-07, - "loss": 0.0058, - "reward": 1.5618922710418701, - "reward_std": 0.08179674297571182, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5618922710418701, - "rewards/pad": 0.0, - "step": 438 - }, - { - "completion_length": 45.671875, - "epoch": 0.13989802421924794, - "grad_norm": 37.8846549987793, - "kl": 0.13671875, - "learning_rate": 8.601019757807521e-07, - "loss": 0.0055, - "reward": 1.68092942237854, - "reward_std": 0.16623428463935852, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4465544819831848, - "step": 439 - }, - { - "completion_length": 94.625, - "epoch": 0.14021669853409816, - "grad_norm": 27.170732498168945, - "kl": 0.09130859375, - "learning_rate": 8.597833014659018e-07, - "loss": 0.0037, - "reward": 1.446689248085022, - "reward_std": 0.089576356112957, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.446689248085022, - "step": 440 - }, - { - "completion_length": 69.03125, - "epoch": 0.14053537284894838, - "grad_norm": 51.69449234008789, - "kl": 0.1025390625, - "learning_rate": 8.594646271510516e-07, - "loss": 0.0041, - "reward": 1.4989007711410522, - "reward_std": 0.08558598160743713, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.37390074133872986, - "rewards/pad": 0.125, - "step": 441 - }, - { - "completion_length": 122.75, - "epoch": 0.1408540471637986, - "grad_norm": 20.52138900756836, - "kl": 0.1396484375, - "learning_rate": 8.591459528362014e-07, - "loss": 0.0056, - "reward": 1.5021915435791016, - "reward_std": 0.08817140758037567, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5021916031837463, - "rewards/pad": 0.0, - "step": 442 - }, - { - "completion_length": 69.15625, - "epoch": 0.14117272147864882, - "grad_norm": 69.19669342041016, - "kl": 0.2158203125, - "learning_rate": 8.588272785213512e-07, - "loss": 0.0086, - "reward": 1.5405244827270508, - "reward_std": 0.07173141837120056, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4155244529247284, - "rewards/pad": 0.125, - "step": 443 - }, - { - "completion_length": 122.1875, - "epoch": 0.14149139579349904, - "grad_norm": 26.565298080444336, - "kl": 0.078125, - "learning_rate": 8.585086042065009e-07, - "loss": 0.0031, - "reward": 1.5234211683273315, - "reward_std": 0.044328056275844574, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5234212279319763, - "step": 444 - }, - { - "completion_length": 72.453125, - "epoch": 0.14181007010834926, - "grad_norm": 26.43044090270996, - "kl": 0.103515625, - "learning_rate": 8.581899298916507e-07, - "loss": 0.0041, - "reward": 1.720982313156128, - "reward_std": 0.05353473871946335, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5959822535514832, - "rewards/pad": 0.125, - "step": 445 - }, - { - "completion_length": 123.890625, - "epoch": 0.14212874442319948, - "grad_norm": 25.059282302856445, - "kl": 0.09130859375, - "learning_rate": 8.578712555768005e-07, - "loss": 0.0037, - "reward": 1.4826295375823975, - "reward_std": 0.06169476360082626, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4826295077800751, - "step": 446 - }, - { - "completion_length": 151.625, - "epoch": 0.14244741873804972, - "grad_norm": 17.98177719116211, - "kl": 0.068359375, - "learning_rate": 8.575525812619503e-07, - "loss": 0.0027, - "reward": 1.389360785484314, - "reward_std": 0.04530121758580208, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3893607258796692, - "step": 447 - }, - { - "completion_length": 71.0, - "epoch": 0.14276609305289995, - "grad_norm": 25.77447509765625, - "kl": 0.1806640625, - "learning_rate": 8.572339069471e-07, - "loss": 0.0072, - "reward": 1.6751841306686401, - "reward_std": 0.057732950896024704, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6751840710639954, - "rewards/pad": 0.0, - "step": 448 - }, - { - "completion_length": 151.703125, - "epoch": 0.14308476736775017, - "grad_norm": 17.309431076049805, - "kl": 0.11669921875, - "learning_rate": 8.569152326322498e-07, - "loss": 0.0047, - "reward": 1.6690850257873535, - "reward_std": 0.058293867856264114, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5440850257873535, - "step": 449 - }, - { - "completion_length": 67.96875, - "epoch": 0.14340344168260039, - "grad_norm": 43.26914978027344, - "kl": 0.13671875, - "learning_rate": 8.565965583173996e-07, - "loss": 0.0055, - "reward": 1.5374586582183838, - "reward_std": 0.07611675560474396, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5374586582183838, - "rewards/pad": 0.0, - "step": 450 - }, - { - "completion_length": 49.984375, - "epoch": 0.1437221159974506, - "grad_norm": 38.74765396118164, - "kl": 0.0830078125, - "learning_rate": 8.562778840025495e-07, - "loss": 0.0033, - "reward": 1.665922999382019, - "reward_std": 0.1814975142478943, - "rewards/answer_reward": 0.28125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.38467299938201904, - "step": 451 - }, - { - "completion_length": 72.046875, - "epoch": 0.14404079031230083, - "grad_norm": 183.16456604003906, - "kl": 0.1533203125, - "learning_rate": 8.55959209687699e-07, - "loss": 0.0061, - "reward": 1.5401326417922974, - "reward_std": 0.05094554275274277, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.41513270139694214, - "step": 452 - }, - { - "completion_length": 72.515625, - "epoch": 0.14435946462715105, - "grad_norm": 242.4665069580078, - "kl": 0.11669921875, - "learning_rate": 8.556405353728489e-07, - "loss": 0.0047, - "reward": 1.6754543781280518, - "reward_std": 0.0814475268125534, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4254542887210846, - "rewards/pad": 0.25, - "step": 453 - }, - { - "completion_length": 176.53125, - "epoch": 0.14467813894200127, - "grad_norm": 62.67466354370117, - "kl": 0.0625, - "learning_rate": 8.553218610579987e-07, - "loss": 0.0025, - "reward": 1.4424443244934082, - "reward_std": 0.05421048775315285, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.442444384098053, - "step": 454 - }, - { - "completion_length": 124.0625, - "epoch": 0.1449968132568515, - "grad_norm": 27.845863342285156, - "kl": 0.107421875, - "learning_rate": 8.550031867431485e-07, - "loss": 0.0043, - "reward": 1.4571428298950195, - "reward_std": 0.07115037739276886, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45714280009269714, - "rewards/pad": 0.0, - "step": 455 - }, - { - "completion_length": 99.859375, - "epoch": 0.14531548757170173, - "grad_norm": 18.424503326416016, - "kl": 0.10791015625, - "learning_rate": 8.546845124282982e-07, - "loss": 0.0043, - "reward": 1.5426874160766602, - "reward_std": 0.04801858961582184, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41768738627433777, - "rewards/pad": 0.125, - "step": 456 - }, - { - "completion_length": 99.546875, - "epoch": 0.14563416188655195, - "grad_norm": 42.82892608642578, - "kl": 0.2041015625, - "learning_rate": 8.54365838113448e-07, - "loss": 0.0082, - "reward": 1.4242668151855469, - "reward_std": 0.07840073853731155, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4242668151855469, - "step": 457 - }, - { - "completion_length": 47.53125, - "epoch": 0.14595283620140218, - "grad_norm": 83.61685180664062, - "kl": 0.1259765625, - "learning_rate": 8.540471637985978e-07, - "loss": 0.005, - "reward": 1.5576732158660889, - "reward_std": 0.1142008900642395, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43267327547073364, - "rewards/pad": 0.125, - "step": 458 - }, - { - "completion_length": 72.328125, - "epoch": 0.1462715105162524, - "grad_norm": 36.45909881591797, - "kl": 0.1318359375, - "learning_rate": 8.537284894837476e-07, - "loss": 0.0053, - "reward": 1.6129789352416992, - "reward_std": 0.06982880085706711, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.612978994846344, - "rewards/pad": 0.0, - "step": 459 - }, - { - "completion_length": 74.953125, - "epoch": 0.14659018483110262, - "grad_norm": 43.895931243896484, - "kl": 0.1328125, - "learning_rate": 8.534098151688973e-07, - "loss": 0.0053, - "reward": 1.4296586513519287, - "reward_std": 0.136674702167511, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3984086215496063, - "step": 460 - }, - { - "completion_length": 73.40625, - "epoch": 0.14690885914595284, - "grad_norm": 453.3140869140625, - "kl": 0.1181640625, - "learning_rate": 8.530911408540471e-07, - "loss": 0.0047, - "reward": 1.6739246845245361, - "reward_std": 0.11186471581459045, - "rewards/answer_reward": 0.171875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5020498037338257, - "step": 461 - }, - { - "completion_length": 99.703125, - "epoch": 0.14722753346080306, - "grad_norm": 98.9237060546875, - "kl": 0.09423828125, - "learning_rate": 8.527724665391969e-07, - "loss": 0.0038, - "reward": 1.5939929485321045, - "reward_std": 0.06156596168875694, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4689928889274597, - "step": 462 - }, - { - "completion_length": 177.03125, - "epoch": 0.14754620777565328, - "grad_norm": 12.542116165161133, - "kl": 0.06201171875, - "learning_rate": 8.524537922243466e-07, - "loss": 0.0025, - "reward": 1.4433605670928955, - "reward_std": 0.04455003887414932, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4433605670928955, - "step": 463 - }, - { - "completion_length": 179.28125, - "epoch": 0.1478648820905035, - "grad_norm": 31.899578094482422, - "kl": 0.052734375, - "learning_rate": 8.521351179094964e-07, - "loss": 0.0021, - "reward": 1.3155629634857178, - "reward_std": 0.07355151325464249, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3155629336833954, - "step": 464 - }, - { - "completion_length": 45.609375, - "epoch": 0.14818355640535372, - "grad_norm": 77.76405334472656, - "kl": 0.1689453125, - "learning_rate": 8.518164435946462e-07, - "loss": 0.0068, - "reward": 1.4881693124771118, - "reward_std": 0.09249794483184814, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4881693124771118, - "rewards/pad": 0.0, - "step": 465 - }, - { - "completion_length": 73.953125, - "epoch": 0.14850223072020396, - "grad_norm": 73.53346252441406, - "kl": 0.35546875, - "learning_rate": 8.51497769279796e-07, - "loss": 0.0142, - "reward": 1.422756314277649, - "reward_std": 0.1399630308151245, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2821313142776489, - "rewards/pad": 0.140625, - "step": 466 - }, - { - "completion_length": 70.421875, - "epoch": 0.14882090503505419, - "grad_norm": 33.81299591064453, - "kl": 0.125, - "learning_rate": 8.511790949649457e-07, - "loss": 0.005, - "reward": 1.705796718597412, - "reward_std": 0.07710770517587662, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5807967782020569, - "rewards/pad": 0.125, - "step": 467 - }, - { - "completion_length": 125.328125, - "epoch": 0.1491395793499044, - "grad_norm": 191.01551818847656, - "kl": 0.09765625, - "learning_rate": 8.508604206500955e-07, - "loss": 0.0039, - "reward": 1.4011149406433105, - "reward_std": 0.10568197071552277, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4167398512363434, - "rewards/pad": 0.0, - "step": 468 - }, - { - "completion_length": 101.484375, - "epoch": 0.14945825366475463, - "grad_norm": 21.767581939697266, - "kl": 0.291015625, - "learning_rate": 8.505417463352453e-07, - "loss": 0.0116, - "reward": 1.6511127948760986, - "reward_std": 0.2230065017938614, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.44798779487609863, - "step": 469 - }, - { - "completion_length": 77.421875, - "epoch": 0.14977692797960485, - "grad_norm": 54.641815185546875, - "kl": 0.1357421875, - "learning_rate": 8.502230720203951e-07, - "loss": 0.0054, - "reward": 1.6336212158203125, - "reward_std": 0.18447574973106384, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3992462754249573, - "step": 470 - }, - { - "completion_length": 99.40625, - "epoch": 0.15009560229445507, - "grad_norm": 21.85428810119629, - "kl": 0.10888671875, - "learning_rate": 8.499043977055449e-07, - "loss": 0.0044, - "reward": 1.6280136108398438, - "reward_std": 0.13695479929447174, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5186384916305542, - "rewards/pad": 0.125, - "step": 471 - }, - { - "completion_length": 44.84375, - "epoch": 0.1504142766093053, - "grad_norm": 31.97594451904297, - "kl": 0.1357421875, - "learning_rate": 8.495857233906947e-07, - "loss": 0.0054, - "reward": 1.5573021173477173, - "reward_std": 0.051747020334005356, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5573021769523621, - "rewards/pad": 0.0, - "step": 472 - }, - { - "completion_length": 128.71875, - "epoch": 0.1507329509241555, - "grad_norm": 43.0109977722168, - "kl": 0.1357421875, - "learning_rate": 8.492670490758445e-07, - "loss": 0.0054, - "reward": 1.5580388307571411, - "reward_std": 0.11918258666992188, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4174138903617859, - "rewards/pad": 0.140625, - "step": 473 - }, - { - "completion_length": 99.21875, - "epoch": 0.15105162523900573, - "grad_norm": 43.45140838623047, - "kl": 0.091796875, - "learning_rate": 8.489483747609943e-07, - "loss": 0.0037, - "reward": 1.6057767868041992, - "reward_std": 0.08008240908384323, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6057767868041992, - "step": 474 - }, - { - "completion_length": 124.359375, - "epoch": 0.15137029955385595, - "grad_norm": 113.9756088256836, - "kl": 0.08642578125, - "learning_rate": 8.48629700446144e-07, - "loss": 0.0035, - "reward": 1.5617237091064453, - "reward_std": 0.061002567410469055, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5617236495018005, - "step": 475 - }, - { - "completion_length": 102.765625, - "epoch": 0.1516889738687062, - "grad_norm": 57.76559829711914, - "kl": 0.1328125, - "learning_rate": 8.483110261312938e-07, - "loss": 0.0053, - "reward": 1.4934542179107666, - "reward_std": 0.07229776680469513, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38407933712005615, - "step": 476 - }, - { - "completion_length": 72.296875, - "epoch": 0.15200764818355642, - "grad_norm": 21.623546600341797, - "kl": 0.10595703125, - "learning_rate": 8.479923518164436e-07, - "loss": 0.0042, - "reward": 1.6081228256225586, - "reward_std": 0.08545694500207901, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4831227958202362, - "step": 477 - }, - { - "completion_length": 101.859375, - "epoch": 0.15232632249840664, - "grad_norm": 116.93062591552734, - "kl": 0.11279296875, - "learning_rate": 8.476736775015934e-07, - "loss": 0.0045, - "reward": 1.6542677879333496, - "reward_std": 0.14931340515613556, - "rewards/answer_reward": 0.171875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.482392817735672, - "step": 478 - }, - { - "completion_length": 125.34375, - "epoch": 0.15264499681325686, - "grad_norm": 107.00728607177734, - "kl": 0.142578125, - "learning_rate": 8.473550031867431e-07, - "loss": 0.0057, - "reward": 1.472907304763794, - "reward_std": 0.06427513062953949, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4729074239730835, - "step": 479 - }, - { - "completion_length": 102.4375, - "epoch": 0.15296367112810708, - "grad_norm": 18.603273391723633, - "kl": 0.1142578125, - "learning_rate": 8.470363288718929e-07, - "loss": 0.0046, - "reward": 1.703479528427124, - "reward_std": 0.09505253285169601, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4534795582294464, - "step": 480 - }, - { - "completion_length": 155.15625, - "epoch": 0.1532823454429573, - "grad_norm": 12.259832382202148, - "kl": 0.14453125, - "learning_rate": 8.467176545570427e-07, - "loss": 0.0058, - "reward": 1.3880560398101807, - "reward_std": 0.08335339277982712, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38805609941482544, - "step": 481 - }, - { - "completion_length": 99.59375, - "epoch": 0.15360101975780752, - "grad_norm": 20.600967407226562, - "kl": 0.1025390625, - "learning_rate": 8.463989802421925e-07, - "loss": 0.0041, - "reward": 1.613484501838684, - "reward_std": 0.05416692793369293, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4884844720363617, - "step": 482 - }, - { - "completion_length": 127.203125, - "epoch": 0.15391969407265774, - "grad_norm": 53.93398666381836, - "kl": 0.05517578125, - "learning_rate": 8.460803059273422e-07, - "loss": 0.0022, - "reward": 1.697056531906128, - "reward_std": 0.09304764866828918, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5720565319061279, - "step": 483 - }, - { - "completion_length": 181.8125, - "epoch": 0.15423836838750796, - "grad_norm": 48.73480987548828, - "kl": 0.052490234375, - "learning_rate": 8.45761631612492e-07, - "loss": 0.0021, - "reward": 1.4899718761444092, - "reward_std": 0.025554783642292023, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36497190594673157, - "step": 484 - }, - { - "completion_length": 129.859375, - "epoch": 0.15455704270235818, - "grad_norm": 30.73780059814453, - "kl": 0.07470703125, - "learning_rate": 8.454429572976418e-07, - "loss": 0.003, - "reward": 1.611334204673767, - "reward_std": 0.10449053347110748, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.37695926427841187, - "rewards/pad": 0.234375, - "step": 485 - }, - { - "completion_length": 100.640625, - "epoch": 0.15487571701720843, - "grad_norm": 57.61213684082031, - "kl": 0.1435546875, - "learning_rate": 8.451242829827916e-07, - "loss": 0.0057, - "reward": 1.4112203121185303, - "reward_std": 0.16161048412322998, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3799702823162079, - "rewards/pad": 0.03125, - "step": 486 - }, - { - "completion_length": 154.90625, - "epoch": 0.15519439133205865, - "grad_norm": 16.694082260131836, - "kl": 0.095703125, - "learning_rate": 8.448056086679413e-07, - "loss": 0.0038, - "reward": 1.5090348720550537, - "reward_std": 0.040350381284952164, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5090348124504089, - "step": 487 - }, - { - "completion_length": 126.96875, - "epoch": 0.15551306564690887, - "grad_norm": 17.563032150268555, - "kl": 0.1181640625, - "learning_rate": 8.444869343530911e-07, - "loss": 0.0047, - "reward": 1.538849115371704, - "reward_std": 0.09288837015628815, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.30447399616241455, - "step": 488 - }, - { - "completion_length": 45.3125, - "epoch": 0.15583173996175909, - "grad_norm": 19.666879653930664, - "kl": 0.181640625, - "learning_rate": 8.44168260038241e-07, - "loss": 0.0073, - "reward": 1.4959934949874878, - "reward_std": 0.151860311627388, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49599355459213257, - "rewards/pad": 0.0, - "step": 489 - }, - { - "completion_length": 73.28125, - "epoch": 0.1561504142766093, - "grad_norm": 33.24127960205078, - "kl": 0.259765625, - "learning_rate": 8.438495857233908e-07, - "loss": 0.0104, - "reward": 1.5356709957122803, - "reward_std": 0.11758964508771896, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5356709361076355, - "step": 490 - }, - { - "completion_length": 152.703125, - "epoch": 0.15646908859145953, - "grad_norm": 29.862022399902344, - "kl": 0.30859375, - "learning_rate": 8.435309114085404e-07, - "loss": 0.0123, - "reward": 1.3644418716430664, - "reward_std": 0.02914932370185852, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3644419312477112, - "rewards/pad": 0.0, - "step": 491 - }, - { - "completion_length": 100.78125, - "epoch": 0.15678776290630975, - "grad_norm": 147.37142944335938, - "kl": 0.0986328125, - "learning_rate": 8.432122370936902e-07, - "loss": 0.004, - "reward": 1.4690436124801636, - "reward_std": 0.11149085313081741, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3752935826778412, - "rewards/pad": 0.09375, - "step": 492 - }, - { - "completion_length": 46.59375, - "epoch": 0.15710643722115997, - "grad_norm": 169.15550231933594, - "kl": 0.1357421875, - "learning_rate": 8.4289356277884e-07, - "loss": 0.0054, - "reward": 1.700716495513916, - "reward_std": 0.11945337802171707, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5757165551185608, - "step": 493 - }, - { - "completion_length": 72.765625, - "epoch": 0.1574251115360102, - "grad_norm": 58.808162689208984, - "kl": 0.1904296875, - "learning_rate": 8.425748884639897e-07, - "loss": 0.0076, - "reward": 1.3045260906219482, - "reward_std": 0.1511131078004837, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.30452603101730347, - "step": 494 - }, - { - "completion_length": 47.03125, - "epoch": 0.1577437858508604, - "grad_norm": 43.41145706176758, - "kl": 0.1767578125, - "learning_rate": 8.422562141491395e-07, - "loss": 0.0071, - "reward": 1.5499234199523926, - "reward_std": 0.07011061161756516, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4249233901500702, - "rewards/pad": 0.125, - "step": 495 - }, - { - "completion_length": 98.609375, - "epoch": 0.15806246016571066, - "grad_norm": 45.90449905395508, - "kl": 0.09765625, - "learning_rate": 8.419375398342893e-07, - "loss": 0.0039, - "reward": 1.5362162590026855, - "reward_std": 0.08368147164583206, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5362161993980408, - "rewards/pad": 0.0, - "step": 496 - }, - { - "completion_length": 125.5, - "epoch": 0.15838113448056088, - "grad_norm": 26.423377990722656, - "kl": 0.1240234375, - "learning_rate": 8.416188655194391e-07, - "loss": 0.005, - "reward": 1.3670811653137207, - "reward_std": 0.13680918514728546, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3827061653137207, - "step": 497 - }, - { - "completion_length": 72.828125, - "epoch": 0.1586998087954111, - "grad_norm": 27.449508666992188, - "kl": 0.28515625, - "learning_rate": 8.413001912045888e-07, - "loss": 0.0114, - "reward": 1.7771075963974, - "reward_std": 0.08483612537384033, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5271075963973999, - "rewards/pad": 0.25, - "step": 498 - }, - { - "completion_length": 47.578125, - "epoch": 0.15901848311026132, - "grad_norm": 22.990766525268555, - "kl": 0.142578125, - "learning_rate": 8.409815168897386e-07, - "loss": 0.0057, - "reward": 1.6906874179840088, - "reward_std": 0.07498195022344589, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4406873881816864, - "rewards/pad": 0.25, - "step": 499 - }, - { - "completion_length": 152.90625, - "epoch": 0.15933715742511154, - "grad_norm": 13.581183433532715, - "kl": 0.07666015625, - "learning_rate": 8.406628425748884e-07, - "loss": 0.0031, - "reward": 1.4057791233062744, - "reward_std": 0.03723069280385971, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.405779093503952, - "step": 500 - }, - { - "completion_length": 98.0, - "epoch": 0.15965583173996176, - "grad_norm": 29.467369079589844, - "kl": 0.0927734375, - "learning_rate": 8.403441682600382e-07, - "loss": 0.0037, - "reward": 1.6264564990997314, - "reward_std": 0.1466260701417923, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.5170813798904419, - "step": 501 - }, - { - "completion_length": 125.875, - "epoch": 0.15997450605481198, - "grad_norm": 75.3805923461914, - "kl": 0.5625, - "learning_rate": 8.400254939451879e-07, - "loss": 0.0224, - "reward": 1.4467419385910034, - "reward_std": 0.17177796363830566, - "rewards/answer_reward": 0.078125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.3842419385910034, - "step": 502 - }, - { - "completion_length": 98.953125, - "epoch": 0.1602931803696622, - "grad_norm": 17.550413131713867, - "kl": 0.1767578125, - "learning_rate": 8.397068196303377e-07, - "loss": 0.0071, - "reward": 1.3917758464813232, - "reward_std": 0.090950608253479, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39177584648132324, - "step": 503 - }, - { - "completion_length": 124.84375, - "epoch": 0.16061185468451242, - "grad_norm": 18.48185920715332, - "kl": 0.0888671875, - "learning_rate": 8.393881453154875e-07, - "loss": 0.0036, - "reward": 1.5010762214660645, - "reward_std": 0.1343594491481781, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3917011618614197, - "rewards/pad": 0.109375, - "step": 504 - }, - { - "completion_length": 175.859375, - "epoch": 0.16093052899936264, - "grad_norm": 10.59714412689209, - "kl": 0.06494140625, - "learning_rate": 8.390694710006373e-07, - "loss": 0.0026, - "reward": 1.4233787059783936, - "reward_std": 0.051402267068624496, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4233787953853607, - "step": 505 - }, - { - "completion_length": 155.640625, - "epoch": 0.16124920331421289, - "grad_norm": 11.212116241455078, - "kl": 0.06640625, - "learning_rate": 8.38750796685787e-07, - "loss": 0.0026, - "reward": 1.450839877128601, - "reward_std": 0.039273642003536224, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3258398175239563, - "step": 506 - }, - { - "completion_length": 70.5, - "epoch": 0.1615678776290631, - "grad_norm": 19.700626373291016, - "kl": 0.1455078125, - "learning_rate": 8.384321223709368e-07, - "loss": 0.0058, - "reward": 1.5653252601623535, - "reward_std": 0.13670827448368073, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44032523036003113, - "rewards/pad": 0.125, - "step": 507 - }, - { - "completion_length": 98.8125, - "epoch": 0.16188655194391333, - "grad_norm": 24.49850845336914, - "kl": 0.07080078125, - "learning_rate": 8.381134480560866e-07, - "loss": 0.0028, - "reward": 1.6112509965896606, - "reward_std": 0.12817604839801788, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39250099658966064, - "step": 508 - }, - { - "completion_length": 132.0625, - "epoch": 0.16220522625876355, - "grad_norm": 15.46005916595459, - "kl": 0.0615234375, - "learning_rate": 8.377947737412365e-07, - "loss": 0.0025, - "reward": 1.5222032070159912, - "reward_std": 0.05216310918331146, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.397203266620636, - "step": 509 - }, - { - "completion_length": 179.265625, - "epoch": 0.16252390057361377, - "grad_norm": 8.991511344909668, - "kl": 0.039794921875, - "learning_rate": 8.374760994263862e-07, - "loss": 0.0016, - "reward": 1.5481150150299072, - "reward_std": 0.05589091405272484, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42311498522758484, - "step": 510 - }, - { - "completion_length": 102.59375, - "epoch": 0.162842574888464, - "grad_norm": 60.91361618041992, - "kl": 0.10400390625, - "learning_rate": 8.37157425111536e-07, - "loss": 0.0042, - "reward": 1.682870864868164, - "reward_std": 0.07718949019908905, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5578707456588745, - "rewards/pad": 0.125, - "step": 511 - }, - { - "completion_length": 176.59375, - "epoch": 0.1631612492033142, - "grad_norm": 8.942208290100098, - "kl": 0.039794921875, - "learning_rate": 8.368387507966858e-07, - "loss": 0.0016, - "reward": 1.4696202278137207, - "reward_std": 0.033947329968214035, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4696202874183655, - "step": 512 - }, - { - "completion_length": 124.28125, - "epoch": 0.16347992351816443, - "grad_norm": 21.55902862548828, - "kl": 0.353515625, - "learning_rate": 8.365200764818356e-07, - "loss": 0.0141, - "reward": 1.493517518043518, - "reward_std": 0.057740386575460434, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49351754784584045, - "rewards/pad": 0.0, - "step": 513 - }, - { - "completion_length": 71.59375, - "epoch": 0.16379859783301465, - "grad_norm": 17.975238800048828, - "kl": 0.10693359375, - "learning_rate": 8.362014021669853e-07, - "loss": 0.0043, - "reward": 1.531064510345459, - "reward_std": 0.18845269083976746, - "rewards/answer_reward": 0.09375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4373144805431366, - "step": 514 - }, - { - "completion_length": 45.078125, - "epoch": 0.1641172721478649, - "grad_norm": 19.34892463684082, - "kl": 0.12890625, - "learning_rate": 8.358827278521351e-07, - "loss": 0.0052, - "reward": 1.394120216369629, - "reward_std": 0.09514320641756058, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3941202461719513, - "rewards/pad": 0.0, - "step": 515 - }, - { - "completion_length": 99.84375, - "epoch": 0.16443594646271512, - "grad_norm": 43.57005310058594, - "kl": 0.0859375, - "learning_rate": 8.355640535372849e-07, - "loss": 0.0034, - "reward": 1.5611262321472168, - "reward_std": 0.10545508563518524, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5142512321472168, - "step": 516 - }, - { - "completion_length": 99.75, - "epoch": 0.16475462077756534, - "grad_norm": 34.81675720214844, - "kl": 0.08837890625, - "learning_rate": 8.352453792224347e-07, - "loss": 0.0035, - "reward": 1.697974681854248, - "reward_std": 0.14656002819538116, - "rewards/pad": 0.171875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.526099681854248, - "step": 517 - }, - { - "completion_length": 130.859375, - "epoch": 0.16507329509241556, - "grad_norm": 20.660293579101562, - "kl": 0.0791015625, - "learning_rate": 8.349267049075844e-07, - "loss": 0.0032, - "reward": 1.4722533226013184, - "reward_std": 0.07390671223402023, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34725329279899597, - "rewards/pad": 0.125, - "step": 518 - }, - { - "completion_length": 152.78125, - "epoch": 0.16539196940726578, - "grad_norm": 21.820556640625, - "kl": 0.08203125, - "learning_rate": 8.346080305927342e-07, - "loss": 0.0033, - "reward": 1.314011573791504, - "reward_std": 0.04784318059682846, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3140115737915039, - "step": 519 - }, - { - "completion_length": 45.515625, - "epoch": 0.165710643722116, - "grad_norm": 43.039615631103516, - "kl": 0.1259765625, - "learning_rate": 8.34289356277884e-07, - "loss": 0.0051, - "reward": 1.6405762434005737, - "reward_std": 0.16814163327217102, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5937012434005737, - "rewards/pad": 0.046875, - "step": 520 - }, - { - "completion_length": 122.75, - "epoch": 0.16602931803696622, - "grad_norm": 14.376192092895508, - "kl": 0.1298828125, - "learning_rate": 8.339706819630338e-07, - "loss": 0.0052, - "reward": 1.4471235275268555, - "reward_std": 0.06147985905408859, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44712358713150024, - "step": 521 - }, - { - "completion_length": 98.84375, - "epoch": 0.16634799235181644, - "grad_norm": 16.817852020263672, - "kl": 0.11962890625, - "learning_rate": 8.336520076481835e-07, - "loss": 0.0048, - "reward": 1.396544098854065, - "reward_std": 0.14501231908798218, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.41216909885406494, - "rewards/pad": 0.0, - "step": 522 - }, - { - "completion_length": 94.125, - "epoch": 0.16666666666666666, - "grad_norm": 16.053434371948242, - "kl": 0.1279296875, - "learning_rate": 8.333333333333333e-07, - "loss": 0.0051, - "reward": 1.5878493785858154, - "reward_std": 0.17199000716209412, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.47847428917884827, - "rewards/pad": 0.125, - "step": 523 - }, - { - "completion_length": 73.546875, - "epoch": 0.16698534098151688, - "grad_norm": 63.2349853515625, - "kl": 0.1484375, - "learning_rate": 8.330146590184831e-07, - "loss": 0.0059, - "reward": 1.5571330785751343, - "reward_std": 0.18380963802337646, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43213310837745667, - "rewards/pad": 0.125, - "step": 524 - }, - { - "completion_length": 185.515625, - "epoch": 0.16730401529636713, - "grad_norm": 13.684309959411621, - "kl": 0.06689453125, - "learning_rate": 8.326959847036329e-07, - "loss": 0.0027, - "reward": 1.6064646244049072, - "reward_std": 0.05707468092441559, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.481464684009552, - "step": 525 - }, - { - "completion_length": 100.328125, - "epoch": 0.16762268961121735, - "grad_norm": 20.025402069091797, - "kl": 0.078125, - "learning_rate": 8.323773103887826e-07, - "loss": 0.0031, - "reward": 1.5280306339263916, - "reward_std": 0.07771054655313492, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40303054451942444, - "step": 526 - }, - { - "completion_length": 72.96875, - "epoch": 0.16794136392606757, - "grad_norm": 20.100772857666016, - "kl": 0.169921875, - "learning_rate": 8.320586360739325e-07, - "loss": 0.0068, - "reward": 1.463815450668335, - "reward_std": 0.14635515213012695, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4794403910636902, - "rewards/pad": 0.0, - "step": 527 - }, - { - "completion_length": 96.71875, - "epoch": 0.1682600382409178, - "grad_norm": 60.14537048339844, - "kl": 0.10791015625, - "learning_rate": 8.317399617590823e-07, - "loss": 0.0043, - "reward": 1.6089813709259033, - "reward_std": 0.06876353919506073, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6089814305305481, - "step": 528 - }, - { - "completion_length": 124.0, - "epoch": 0.168578712555768, - "grad_norm": 41.38592529296875, - "kl": 0.12158203125, - "learning_rate": 8.31421287444232e-07, - "loss": 0.0049, - "reward": 1.5476176738739014, - "reward_std": 0.08636157214641571, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42261767387390137, - "rewards/pad": 0.125, - "step": 529 - }, - { - "completion_length": 123.09375, - "epoch": 0.16889738687061823, - "grad_norm": 21.460275650024414, - "kl": 0.11328125, - "learning_rate": 8.311026131293817e-07, - "loss": 0.0045, - "reward": 1.5313866138458252, - "reward_std": 0.07336382567882538, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5313864946365356, - "rewards/pad": 0.0, - "step": 530 - }, - { - "completion_length": 97.375, - "epoch": 0.16921606118546845, - "grad_norm": 61.537071228027344, - "kl": 0.1044921875, - "learning_rate": 8.307839388145315e-07, - "loss": 0.0042, - "reward": 1.7152791023254395, - "reward_std": 0.1346595138311386, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48090416193008423, - "rewards/pad": 0.234375, - "step": 531 - }, - { - "completion_length": 124.046875, - "epoch": 0.16953473550031867, - "grad_norm": 119.3514633178711, - "kl": 0.0791015625, - "learning_rate": 8.304652644996813e-07, - "loss": 0.0032, - "reward": 1.3922349214553833, - "reward_std": 0.0685240775346756, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3922349512577057, - "rewards/pad": 0.0, - "step": 532 - }, - { - "completion_length": 47.28125, - "epoch": 0.1698534098151689, - "grad_norm": 35.74647903442383, - "kl": 0.1572265625, - "learning_rate": 8.30146590184831e-07, - "loss": 0.0063, - "reward": 1.6910107135772705, - "reward_std": 0.17851702868938446, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.36288559436798096, - "rewards/pad": 0.328125, - "step": 533 - }, - { - "completion_length": 97.734375, - "epoch": 0.1701720841300191, - "grad_norm": 10.062421798706055, - "kl": 0.1298828125, - "learning_rate": 8.298279158699808e-07, - "loss": 0.0052, - "reward": 1.5344090461730957, - "reward_std": 0.06182171404361725, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4094090759754181, - "rewards/pad": 0.125, - "step": 534 - }, - { - "completion_length": 98.390625, - "epoch": 0.17049075844486936, - "grad_norm": 22.848539352416992, - "kl": 0.20703125, - "learning_rate": 8.295092415551306e-07, - "loss": 0.0083, - "reward": 1.459669828414917, - "reward_std": 0.08608236908912659, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.459669828414917, - "step": 535 - }, - { - "completion_length": 99.671875, - "epoch": 0.17080943275971958, - "grad_norm": 32.05093765258789, - "kl": 0.09521484375, - "learning_rate": 8.291905672402804e-07, - "loss": 0.0038, - "reward": 1.8032017946243286, - "reward_std": 0.08446711301803589, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42820173501968384, - "step": 536 - }, - { - "completion_length": 21.140625, - "epoch": 0.1711281070745698, - "grad_norm": 44.99974060058594, - "kl": 0.1337890625, - "learning_rate": 8.288718929254301e-07, - "loss": 0.0054, - "reward": 1.8819546699523926, - "reward_std": 0.10786371678113937, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5069546699523926, - "rewards/pad": 0.375, - "step": 537 - }, - { - "completion_length": 73.453125, - "epoch": 0.17144678138942002, - "grad_norm": 41.52457046508789, - "kl": 0.08642578125, - "learning_rate": 8.285532186105799e-07, - "loss": 0.0034, - "reward": 1.7312819957733154, - "reward_std": 0.12865957617759705, - "rewards/pad": 0.359375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3719070553779602, - "step": 538 - }, - { - "completion_length": 124.828125, - "epoch": 0.17176545570427024, - "grad_norm": 34.48262023925781, - "kl": 0.10546875, - "learning_rate": 8.282345442957297e-07, - "loss": 0.0042, - "reward": 1.6211336851119995, - "reward_std": 0.11349187791347504, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4961336851119995, - "rewards/pad": 0.125, - "step": 539 - }, - { - "completion_length": 122.203125, - "epoch": 0.17208413001912046, - "grad_norm": 37.691978454589844, - "kl": 0.0673828125, - "learning_rate": 8.279158699808795e-07, - "loss": 0.0027, - "reward": 1.7140142917633057, - "reward_std": 0.0915536880493164, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4640142321586609, - "rewards/pad": 0.25, - "step": 540 - }, - { - "completion_length": 149.6875, - "epoch": 0.17240280433397068, - "grad_norm": 42.488380432128906, - "kl": 0.08349609375, - "learning_rate": 8.275971956660292e-07, - "loss": 0.0033, - "reward": 1.5138425827026367, - "reward_std": 0.07240451872348785, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5138426423072815, - "step": 541 - }, - { - "completion_length": 95.84375, - "epoch": 0.1727214786488209, - "grad_norm": 669.1302490234375, - "kl": 0.1279296875, - "learning_rate": 8.27278521351179e-07, - "loss": 0.0051, - "reward": 1.321406602859497, - "reward_std": 0.10743975639343262, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3214065134525299, - "rewards/pad": 0.0, - "step": 542 - }, - { - "completion_length": 97.609375, - "epoch": 0.17304015296367112, - "grad_norm": 37.46754455566406, - "kl": 0.08544921875, - "learning_rate": 8.269598470363288e-07, - "loss": 0.0034, - "reward": 1.5845844745635986, - "reward_std": 0.11835047602653503, - "rewards/answer_reward": 0.15625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.42833447456359863, - "step": 543 - }, - { - "completion_length": 71.609375, - "epoch": 0.17335882727852134, - "grad_norm": 83.38258361816406, - "kl": 0.1259765625, - "learning_rate": 8.266411727214786e-07, - "loss": 0.005, - "reward": 1.4551091194152832, - "reward_std": 0.11183137446641922, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3769841194152832, - "rewards/pad": 0.078125, - "step": 544 - }, - { - "completion_length": 97.40625, - "epoch": 0.17367750159337159, - "grad_norm": 19.901351928710938, - "kl": 0.1533203125, - "learning_rate": 8.263224984066283e-07, - "loss": 0.0061, - "reward": 1.5000262260437012, - "reward_std": 0.10315240919589996, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5000262260437012, - "rewards/pad": 0.0, - "step": 545 - }, - { - "completion_length": 45.46875, - "epoch": 0.1739961759082218, - "grad_norm": 21.25617218017578, - "kl": 0.12060546875, - "learning_rate": 8.260038240917782e-07, - "loss": 0.0048, - "reward": 1.5835143327713013, - "reward_std": 0.09237085282802582, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5835143327713013, - "rewards/pad": 0.0, - "step": 546 - }, - { - "completion_length": 98.421875, - "epoch": 0.17431485022307203, - "grad_norm": 25.097929000854492, - "kl": 0.08837890625, - "learning_rate": 8.25685149776928e-07, - "loss": 0.0035, - "reward": 1.6923954486846924, - "reward_std": 0.04992898926138878, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5673954486846924, - "step": 547 - }, - { - "completion_length": 97.890625, - "epoch": 0.17463352453792225, - "grad_norm": 97.4831771850586, - "kl": 0.109375, - "learning_rate": 8.253664754620778e-07, - "loss": 0.0044, - "reward": 1.6336106061935425, - "reward_std": 0.1055816188454628, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5242354869842529, - "step": 548 - }, - { - "completion_length": 122.4375, - "epoch": 0.17495219885277247, - "grad_norm": 27.211618423461914, - "kl": 0.1337890625, - "learning_rate": 8.250478011472275e-07, - "loss": 0.0054, - "reward": 1.476280927658081, - "reward_std": 0.05999341607093811, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47628092765808105, - "step": 549 - }, - { - "completion_length": 47.625, - "epoch": 0.1752708731676227, - "grad_norm": 62.962547302246094, - "kl": 0.126953125, - "learning_rate": 8.247291268323773e-07, - "loss": 0.0051, - "reward": 1.7597732543945312, - "reward_std": 0.23647846281528473, - "rewards/answer_reward": 0.359375, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.4160231053829193, - "step": 550 - }, - { - "completion_length": 123.65625, - "epoch": 0.1755895474824729, - "grad_norm": 40.25691604614258, - "kl": 0.07177734375, - "learning_rate": 8.244104525175271e-07, - "loss": 0.0029, - "reward": 1.5125081539154053, - "reward_std": 0.10185742378234863, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.52813321352005, - "step": 551 - }, - { - "completion_length": 98.796875, - "epoch": 0.17590822179732313, - "grad_norm": 27.152267456054688, - "kl": 0.11572265625, - "learning_rate": 8.240917782026769e-07, - "loss": 0.0046, - "reward": 1.339756965637207, - "reward_std": 0.06058652698993683, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3397570252418518, - "step": 552 - }, - { - "completion_length": 175.9375, - "epoch": 0.17622689611217335, - "grad_norm": 10.05562686920166, - "kl": 0.09521484375, - "learning_rate": 8.237731038878266e-07, - "loss": 0.0038, - "reward": 1.601670265197754, - "reward_std": 0.04825048893690109, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4766702950000763, - "rewards/pad": 0.125, - "step": 553 - }, - { - "completion_length": 69.734375, - "epoch": 0.17654557042702357, - "grad_norm": 71.13457489013672, - "kl": 0.1494140625, - "learning_rate": 8.234544295729764e-07, - "loss": 0.006, - "reward": 1.588582158088684, - "reward_std": 0.12747931480407715, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4792071580886841, - "rewards/pad": 0.109375, - "step": 554 - }, - { - "completion_length": 70.953125, - "epoch": 0.17686424474187382, - "grad_norm": 23.054285049438477, - "kl": 0.1806640625, - "learning_rate": 8.231357552581262e-07, - "loss": 0.0072, - "reward": 1.4141204357147217, - "reward_std": 0.14130906760692596, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4141203463077545, - "rewards/pad": 0.0, - "step": 555 - }, - { - "completion_length": 123.109375, - "epoch": 0.17718291905672404, - "grad_norm": 24.595977783203125, - "kl": 0.08544921875, - "learning_rate": 8.228170809432759e-07, - "loss": 0.0034, - "reward": 1.548499584197998, - "reward_std": 0.052562348544597626, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5484994649887085, - "step": 556 - }, - { - "completion_length": 126.328125, - "epoch": 0.17750159337157426, - "grad_norm": 42.06749725341797, - "kl": 0.0751953125, - "learning_rate": 8.224984066284257e-07, - "loss": 0.003, - "reward": 1.403905987739563, - "reward_std": 0.08751250803470612, - "rewards/answer_reward": 0.015625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3882809579372406, - "step": 557 - }, - { - "completion_length": 71.40625, - "epoch": 0.17782026768642448, - "grad_norm": 27.221906661987305, - "kl": 0.2265625, - "learning_rate": 8.221797323135755e-07, - "loss": 0.0091, - "reward": 1.4237648248672485, - "reward_std": 0.0997081995010376, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4237648844718933, - "rewards/pad": 0.0, - "step": 558 - }, - { - "completion_length": 44.96875, - "epoch": 0.1781389420012747, - "grad_norm": 16.742216110229492, - "kl": 0.1630859375, - "learning_rate": 8.218610579987253e-07, - "loss": 0.0065, - "reward": 1.3757002353668213, - "reward_std": 0.09360139071941376, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3757002055644989, - "rewards/pad": 0.0, - "step": 559 - }, - { - "completion_length": 147.6875, - "epoch": 0.17845761631612492, - "grad_norm": 12.943758010864258, - "kl": 0.07958984375, - "learning_rate": 8.21542383683875e-07, - "loss": 0.0032, - "reward": 1.3315762281417847, - "reward_std": 0.039429426193237305, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.33157628774642944, - "step": 560 - }, - { - "completion_length": 71.5625, - "epoch": 0.17877629063097514, - "grad_norm": 75.57307434082031, - "kl": 0.23828125, - "learning_rate": 8.212237093690248e-07, - "loss": 0.0095, - "reward": 1.5689356327056885, - "reward_std": 0.10561536252498627, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.568935751914978, - "rewards/pad": 0.0, - "step": 561 - }, - { - "completion_length": 71.671875, - "epoch": 0.17909496494582536, - "grad_norm": 26.024625778198242, - "kl": 0.1953125, - "learning_rate": 8.209050350541746e-07, - "loss": 0.0078, - "reward": 1.5372991561889648, - "reward_std": 0.09807107597589493, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5372991561889648, - "step": 562 - }, - { - "completion_length": 71.421875, - "epoch": 0.17941363926067558, - "grad_norm": 28.438175201416016, - "kl": 0.189453125, - "learning_rate": 8.205863607393244e-07, - "loss": 0.0076, - "reward": 1.480418086051941, - "reward_std": 0.07410193979740143, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3554180860519409, - "rewards/pad": 0.125, - "step": 563 - }, - { - "completion_length": 96.1875, - "epoch": 0.17973231357552583, - "grad_norm": 47.855194091796875, - "kl": 0.1435546875, - "learning_rate": 8.202676864244741e-07, - "loss": 0.0058, - "reward": 1.385023593902588, - "reward_std": 0.13159047067165375, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3225235939025879, - "rewards/pad": 0.0625, - "step": 564 - }, - { - "completion_length": 97.8125, - "epoch": 0.18005098789037605, - "grad_norm": 17.224849700927734, - "kl": 0.1611328125, - "learning_rate": 8.19949012109624e-07, - "loss": 0.0064, - "reward": 1.4678709506988525, - "reward_std": 0.061766088008880615, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46787095069885254, - "step": 565 - }, - { - "completion_length": 120.28125, - "epoch": 0.18036966220522627, - "grad_norm": 14.757925033569336, - "kl": 0.10009765625, - "learning_rate": 8.196303377947738e-07, - "loss": 0.004, - "reward": 1.3626590967178345, - "reward_std": 0.05897171050310135, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3626590967178345, - "step": 566 - }, - { - "completion_length": 71.03125, - "epoch": 0.1806883365200765, - "grad_norm": 220.81106567382812, - "kl": 0.1337890625, - "learning_rate": 8.193116634799236e-07, - "loss": 0.0054, - "reward": 1.5400254726409912, - "reward_std": 0.11256685853004456, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4150254726409912, - "rewards/pad": 0.125, - "step": 567 - }, - { - "completion_length": 125.609375, - "epoch": 0.1810070108349267, - "grad_norm": 603.3175659179688, - "kl": 0.1005859375, - "learning_rate": 8.189929891650733e-07, - "loss": 0.004, - "reward": 1.4392961263656616, - "reward_std": 0.10471741855144501, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4236711263656616, - "step": 568 - }, - { - "completion_length": 150.609375, - "epoch": 0.18132568514977693, - "grad_norm": 216.56732177734375, - "kl": 0.08154296875, - "learning_rate": 8.18674314850223e-07, - "loss": 0.0033, - "reward": 1.5750045776367188, - "reward_std": 0.09949097037315369, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49687960743904114, - "rewards/pad": 0.078125, - "step": 569 - }, - { - "completion_length": 99.515625, - "epoch": 0.18164435946462715, - "grad_norm": 43.249183654785156, - "kl": 0.08837890625, - "learning_rate": 8.183556405353728e-07, - "loss": 0.0035, - "reward": 1.7908916473388672, - "reward_std": 0.0621541291475296, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41589170694351196, - "rewards/pad": 0.375, - "step": 570 - }, - { - "completion_length": 148.765625, - "epoch": 0.18196303377947737, - "grad_norm": 37.219024658203125, - "kl": 0.125, - "learning_rate": 8.180369662205226e-07, - "loss": 0.005, - "reward": 1.535273551940918, - "reward_std": 0.03435523062944412, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.535273551940918, - "rewards/pad": 0.0, - "step": 571 - }, - { - "completion_length": 70.875, - "epoch": 0.1822817080943276, - "grad_norm": 28.961162567138672, - "kl": 0.10205078125, - "learning_rate": 8.177182919056723e-07, - "loss": 0.0041, - "reward": 1.67730712890625, - "reward_std": 0.13280938565731049, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.44293221831321716, - "step": 572 - }, - { - "completion_length": 123.71875, - "epoch": 0.1826003824091778, - "grad_norm": 43.969078063964844, - "kl": 0.166015625, - "learning_rate": 8.173996175908221e-07, - "loss": 0.0066, - "reward": 1.3603620529174805, - "reward_std": 0.07749934494495392, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3603619933128357, - "rewards/pad": 0.0, - "step": 573 - }, - { - "completion_length": 123.6875, - "epoch": 0.18291905672402806, - "grad_norm": 56.06167221069336, - "kl": 0.09619140625, - "learning_rate": 8.170809432759719e-07, - "loss": 0.0039, - "reward": 1.6252659559249878, - "reward_std": 0.06589239835739136, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.500265896320343, - "rewards/pad": 0.125, - "step": 574 - }, - { - "completion_length": 98.546875, - "epoch": 0.18323773103887828, - "grad_norm": 28.925315856933594, - "kl": 0.1630859375, - "learning_rate": 8.167622689611217e-07, - "loss": 0.0065, - "reward": 1.5602610111236572, - "reward_std": 0.07640358805656433, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4352610111236572, - "rewards/pad": 0.125, - "step": 575 - }, - { - "completion_length": 45.59375, - "epoch": 0.1835564053537285, - "grad_norm": 62.40200424194336, - "kl": 0.42578125, - "learning_rate": 8.164435946462714e-07, - "loss": 0.017, - "reward": 1.6693676710128784, - "reward_std": 0.08748309314250946, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4193676710128784, - "rewards/pad": 0.25, - "step": 576 - }, - { - "completion_length": 70.6875, - "epoch": 0.18387507966857872, - "grad_norm": 35.13634490966797, - "kl": 0.1953125, - "learning_rate": 8.161249203314212e-07, - "loss": 0.0078, - "reward": 1.6500343084335327, - "reward_std": 0.11489731073379517, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4156593084335327, - "step": 577 - }, - { - "completion_length": 123.15625, - "epoch": 0.18419375398342894, - "grad_norm": 52.97418212890625, - "kl": 0.2890625, - "learning_rate": 8.15806246016571e-07, - "loss": 0.0116, - "reward": 1.5006070137023926, - "reward_std": 0.08267797529697418, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5006070733070374, - "step": 578 - }, - { - "completion_length": 46.859375, - "epoch": 0.18451242829827916, - "grad_norm": 94.87614440917969, - "kl": 0.10302734375, - "learning_rate": 8.154875717017208e-07, - "loss": 0.0041, - "reward": 1.465181827545166, - "reward_std": 0.06751959025859833, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3401819169521332, - "step": 579 - }, - { - "completion_length": 97.046875, - "epoch": 0.18483110261312938, - "grad_norm": 35.05488586425781, - "kl": 0.30078125, - "learning_rate": 8.151688973868705e-07, - "loss": 0.0121, - "reward": 1.5429270267486572, - "reward_std": 0.12125824391841888, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5429270267486572, - "rewards/pad": 0.0, - "step": 580 - }, - { - "completion_length": 48.671875, - "epoch": 0.1851497769279796, - "grad_norm": 88.4456558227539, - "kl": 0.11083984375, - "learning_rate": 8.148502230720203e-07, - "loss": 0.0044, - "reward": 1.610171914100647, - "reward_std": 0.16949138045310974, - "rewards/pad": 0.34375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.26642197370529175, - "step": 581 - }, - { - "completion_length": 172.65625, - "epoch": 0.18546845124282982, - "grad_norm": 19.403085708618164, - "kl": 0.0556640625, - "learning_rate": 8.145315487571701e-07, - "loss": 0.0022, - "reward": 1.3586130142211914, - "reward_std": 0.12332511693239212, - "rewards/answer_reward": 0.015625, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.358612984418869, - "step": 582 - }, - { - "completion_length": 42.953125, - "epoch": 0.18578712555768004, - "grad_norm": 47.306217193603516, - "kl": 0.119140625, - "learning_rate": 8.1421287444232e-07, - "loss": 0.0048, - "reward": 1.6474767923355103, - "reward_std": 0.057992108166217804, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5224767923355103, - "step": 583 - }, - { - "completion_length": 96.65625, - "epoch": 0.1861057998725303, - "grad_norm": 37.99491500854492, - "kl": 0.0849609375, - "learning_rate": 8.138942001274697e-07, - "loss": 0.0034, - "reward": 1.5028228759765625, - "reward_std": 0.23027420043945312, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.96875, - "rewards/iou_glue_reward": 0.4090728163719177, - "step": 584 - }, - { - "completion_length": 176.5625, - "epoch": 0.1864244741873805, - "grad_norm": 17.070524215698242, - "kl": 0.07763671875, - "learning_rate": 8.135755258126195e-07, - "loss": 0.0031, - "reward": 1.446868658065796, - "reward_std": 0.09230832755565643, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3374937176704407, - "step": 585 - }, - { - "completion_length": 172.953125, - "epoch": 0.18674314850223073, - "grad_norm": 15.915264129638672, - "kl": 0.048583984375, - "learning_rate": 8.132568514977693e-07, - "loss": 0.0019, - "reward": 1.546690583229065, - "reward_std": 0.09776504337787628, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.43731555342674255, - "step": 586 - }, - { - "completion_length": 150.265625, - "epoch": 0.18706182281708095, - "grad_norm": 18.803260803222656, - "kl": 0.1044921875, - "learning_rate": 8.12938177182919e-07, - "loss": 0.0042, - "reward": 1.3496114015579224, - "reward_std": 0.04225423187017441, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34961140155792236, - "rewards/pad": 0.0, - "step": 587 - }, - { - "completion_length": 173.234375, - "epoch": 0.18738049713193117, - "grad_norm": 7.521479606628418, - "kl": 0.0771484375, - "learning_rate": 8.126195028680688e-07, - "loss": 0.0031, - "reward": 1.3901759386062622, - "reward_std": 0.13265454769134521, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.40580081939697266, - "rewards/pad": 0.0, - "step": 588 - }, - { - "completion_length": 97.578125, - "epoch": 0.1876991714467814, - "grad_norm": 45.2459716796875, - "kl": 0.6015625, - "learning_rate": 8.123008285532186e-07, - "loss": 0.0242, - "reward": 1.5526204109191895, - "reward_std": 0.08004587888717651, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42762038111686707, - "rewards/pad": 0.125, - "step": 589 - }, - { - "completion_length": 144.875, - "epoch": 0.1880178457616316, - "grad_norm": 34.44723129272461, - "kl": 0.091796875, - "learning_rate": 8.119821542383684e-07, - "loss": 0.0037, - "reward": 1.4470405578613281, - "reward_std": 0.0640207901597023, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44704049825668335, - "step": 590 - }, - { - "completion_length": 121.9375, - "epoch": 0.18833652007648183, - "grad_norm": 21.2751407623291, - "kl": 0.11181640625, - "learning_rate": 8.116634799235181e-07, - "loss": 0.0045, - "reward": 1.552978515625, - "reward_std": 0.05177850276231766, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4279784560203552, - "step": 591 - }, - { - "completion_length": 97.953125, - "epoch": 0.18865519439133205, - "grad_norm": 36.41278839111328, - "kl": 0.10888671875, - "learning_rate": 8.113448056086679e-07, - "loss": 0.0044, - "reward": 1.3514432907104492, - "reward_std": 0.08961690962314606, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35144323110580444, - "rewards/pad": 0.0, - "step": 592 - }, - { - "completion_length": 122.171875, - "epoch": 0.18897386870618227, - "grad_norm": 23.956724166870117, - "kl": 0.1123046875, - "learning_rate": 8.110261312938177e-07, - "loss": 0.0045, - "reward": 1.3557486534118652, - "reward_std": 0.08091498911380768, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3557485342025757, - "step": 593 - }, - { - "completion_length": 123.015625, - "epoch": 0.18929254302103252, - "grad_norm": 73.67423248291016, - "kl": 0.10107421875, - "learning_rate": 8.107074569789675e-07, - "loss": 0.004, - "reward": 1.5595533847808838, - "reward_std": 0.0999167412519455, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5595533847808838, - "step": 594 - }, - { - "completion_length": 45.4375, - "epoch": 0.18961121733588274, - "grad_norm": 18.285587310791016, - "kl": 0.1552734375, - "learning_rate": 8.103887826641172e-07, - "loss": 0.0062, - "reward": 1.662194013595581, - "reward_std": 0.08547796308994293, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.537194013595581, - "rewards/pad": 0.125, - "step": 595 - }, - { - "completion_length": 70.6875, - "epoch": 0.18992989165073296, - "grad_norm": 20.423418045043945, - "kl": 0.1455078125, - "learning_rate": 8.10070108349267e-07, - "loss": 0.0058, - "reward": 1.7276816368103027, - "reward_std": 0.1355665922164917, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6026816368103027, - "rewards/pad": 0.125, - "step": 596 - }, - { - "completion_length": 175.640625, - "epoch": 0.19024856596558318, - "grad_norm": 4.760740280151367, - "kl": 0.09716796875, - "learning_rate": 8.097514340344168e-07, - "loss": 0.0039, - "reward": 1.4160912036895752, - "reward_std": 0.04427863657474518, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4160912334918976, - "step": 597 - }, - { - "completion_length": 45.375, - "epoch": 0.1905672402804334, - "grad_norm": 70.4986343383789, - "kl": 0.1552734375, - "learning_rate": 8.094327597195666e-07, - "loss": 0.0062, - "reward": 1.5294097661972046, - "reward_std": 0.09039345383644104, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5294097661972046, - "step": 598 - }, - { - "completion_length": 71.28125, - "epoch": 0.19088591459528362, - "grad_norm": 47.24021530151367, - "kl": 0.130859375, - "learning_rate": 8.091140854047163e-07, - "loss": 0.0052, - "reward": 1.696016550064087, - "reward_std": 0.19736725091934204, - "rewards/pad": 0.296875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39914146065711975, - "step": 599 - }, - { - "completion_length": 19.96875, - "epoch": 0.19120458891013384, - "grad_norm": 108.0482177734375, - "kl": 0.244140625, - "learning_rate": 8.087954110898661e-07, - "loss": 0.0098, - "reward": 1.8678321838378906, - "reward_std": 0.18381820619106293, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6959572434425354, - "rewards/pad": 0.171875, - "step": 600 - }, - { - "completion_length": 95.078125, - "epoch": 0.19152326322498406, - "grad_norm": 53.70155715942383, - "kl": 0.216796875, - "learning_rate": 8.084767367750159e-07, - "loss": 0.0087, - "reward": 1.633589744567871, - "reward_std": 0.08066779375076294, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6335896849632263, - "step": 601 - }, - { - "completion_length": 119.21875, - "epoch": 0.19184193753983428, - "grad_norm": 12.007109642028809, - "kl": 0.09912109375, - "learning_rate": 8.081580624601658e-07, - "loss": 0.004, - "reward": 1.506300687789917, - "reward_std": 0.06164667010307312, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.506300687789917, - "rewards/pad": 0.0, - "step": 602 - }, - { - "completion_length": 94.890625, - "epoch": 0.1921606118546845, - "grad_norm": 19.688268661499023, - "kl": 0.185546875, - "learning_rate": 8.078393881453155e-07, - "loss": 0.0074, - "reward": 1.5328928232192993, - "reward_std": 0.06634339690208435, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5328927040100098, - "rewards/pad": 0.0, - "step": 603 - }, - { - "completion_length": 45.875, - "epoch": 0.19247928616953475, - "grad_norm": 87.2319107055664, - "kl": 0.150390625, - "learning_rate": 8.075207138304653e-07, - "loss": 0.006, - "reward": 1.7554612159729004, - "reward_std": 0.09936299175024033, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5054613351821899, - "rewards/pad": 0.25, - "step": 604 - }, - { - "completion_length": 69.6875, - "epoch": 0.19279796048438497, - "grad_norm": 35.294227600097656, - "kl": 0.232421875, - "learning_rate": 8.072020395156151e-07, - "loss": 0.0093, - "reward": 1.4599926471710205, - "reward_std": 0.04127166420221329, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.33499255776405334, - "step": 605 - }, - { - "completion_length": 72.140625, - "epoch": 0.1931166347992352, - "grad_norm": 43.590309143066406, - "kl": 0.09814453125, - "learning_rate": 8.068833652007649e-07, - "loss": 0.0039, - "reward": 1.6009583473205566, - "reward_std": 0.07119986414909363, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.47595831751823425, - "step": 606 - }, - { - "completion_length": 149.171875, - "epoch": 0.1934353091140854, - "grad_norm": 41.91403579711914, - "kl": 0.0751953125, - "learning_rate": 8.065646908859146e-07, - "loss": 0.003, - "reward": 1.6249721050262451, - "reward_std": 0.05708273500204086, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49997201561927795, - "step": 607 - }, - { - "completion_length": 94.34375, - "epoch": 0.19375398342893563, - "grad_norm": 22.897380828857422, - "kl": 0.09814453125, - "learning_rate": 8.062460165710643e-07, - "loss": 0.0039, - "reward": 1.4169743061065674, - "reward_std": 0.07337184995412827, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4169743061065674, - "step": 608 - }, - { - "completion_length": 71.609375, - "epoch": 0.19407265774378585, - "grad_norm": 37.60327911376953, - "kl": 0.2255859375, - "learning_rate": 8.059273422562141e-07, - "loss": 0.009, - "reward": 1.5804510116577148, - "reward_std": 0.11089134961366653, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4554510712623596, - "step": 609 - }, - { - "completion_length": 70.875, - "epoch": 0.19439133205863607, - "grad_norm": 27.8912296295166, - "kl": 0.1640625, - "learning_rate": 8.056086679413639e-07, - "loss": 0.0066, - "reward": 1.7579829692840576, - "reward_std": 0.07795955240726471, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6329829692840576, - "rewards/pad": 0.125, - "step": 610 - }, - { - "completion_length": 119.65625, - "epoch": 0.1947100063734863, - "grad_norm": 22.978052139282227, - "kl": 0.1708984375, - "learning_rate": 8.052899936265136e-07, - "loss": 0.0068, - "reward": 1.6393288373947144, - "reward_std": 0.06025080382823944, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5143288969993591, - "rewards/pad": 0.125, - "step": 611 - }, - { - "completion_length": 91.90625, - "epoch": 0.1950286806883365, - "grad_norm": 61.15570068359375, - "kl": 0.1474609375, - "learning_rate": 8.049713193116634e-07, - "loss": 0.0059, - "reward": 1.4246989488601685, - "reward_std": 0.0907503068447113, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42469894886016846, - "rewards/pad": 0.0, - "step": 612 - }, - { - "completion_length": 70.859375, - "epoch": 0.19534735500318676, - "grad_norm": 18.17074203491211, - "kl": 0.130859375, - "learning_rate": 8.046526449968132e-07, - "loss": 0.0052, - "reward": 1.5264906883239746, - "reward_std": 0.09300049394369125, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5264906883239746, - "rewards/pad": 0.0, - "step": 613 - }, - { - "completion_length": 68.375, - "epoch": 0.19566602931803698, - "grad_norm": 44.83185577392578, - "kl": 0.1572265625, - "learning_rate": 8.043339706819629e-07, - "loss": 0.0063, - "reward": 1.7386105060577393, - "reward_std": 0.10457254946231842, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48861050605773926, - "step": 614 - }, - { - "completion_length": 70.609375, - "epoch": 0.1959847036328872, - "grad_norm": 13.184401512145996, - "kl": 0.130859375, - "learning_rate": 8.040152963671127e-07, - "loss": 0.0052, - "reward": 1.5657302141189575, - "reward_std": 0.09259041398763657, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3313552141189575, - "step": 615 - }, - { - "completion_length": 71.203125, - "epoch": 0.19630337794773742, - "grad_norm": 20.05096435546875, - "kl": 0.1611328125, - "learning_rate": 8.036966220522625e-07, - "loss": 0.0064, - "reward": 1.461806058883667, - "reward_std": 0.04461919888854027, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4618060290813446, - "rewards/pad": 0.0, - "step": 616 - }, - { - "completion_length": 94.0, - "epoch": 0.19662205226258764, - "grad_norm": 71.64122772216797, - "kl": 0.1416015625, - "learning_rate": 8.033779477374123e-07, - "loss": 0.0056, - "reward": 1.4444448947906494, - "reward_std": 0.09489044547080994, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4444449543952942, - "step": 617 - }, - { - "completion_length": 93.359375, - "epoch": 0.19694072657743786, - "grad_norm": 222.44659423828125, - "kl": 0.1396484375, - "learning_rate": 8.03059273422562e-07, - "loss": 0.0056, - "reward": 1.4109399318695068, - "reward_std": 0.03379690647125244, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41093993186950684, - "rewards/pad": 0.0, - "step": 618 - }, - { - "completion_length": 70.109375, - "epoch": 0.19725940089228808, - "grad_norm": 64.34793853759766, - "kl": 0.1552734375, - "learning_rate": 8.027405991077118e-07, - "loss": 0.0062, - "reward": 1.6567516326904297, - "reward_std": 0.122499480843544, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5786266326904297, - "step": 619 - }, - { - "completion_length": 93.859375, - "epoch": 0.1975780752071383, - "grad_norm": 22.94757652282715, - "kl": 0.1728515625, - "learning_rate": 8.024219247928616e-07, - "loss": 0.0069, - "reward": 1.5621228218078613, - "reward_std": 0.08263997733592987, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5621228218078613, - "rewards/pad": 0.0, - "step": 620 - }, - { - "completion_length": 119.5625, - "epoch": 0.19789674952198852, - "grad_norm": 28.08785057067871, - "kl": 0.09228515625, - "learning_rate": 8.021032504780114e-07, - "loss": 0.0037, - "reward": 1.6540088653564453, - "reward_std": 0.05631333217024803, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5290088653564453, - "rewards/pad": 0.125, - "step": 621 - }, - { - "completion_length": 96.734375, - "epoch": 0.19821542383683874, - "grad_norm": 10.783617973327637, - "kl": 0.10888671875, - "learning_rate": 8.017845761631612e-07, - "loss": 0.0044, - "reward": 1.660102367401123, - "reward_std": 0.11361619085073471, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.5507273077964783, - "step": 622 - }, - { - "completion_length": 173.765625, - "epoch": 0.198534098151689, - "grad_norm": 33.31416702270508, - "kl": 0.07421875, - "learning_rate": 8.01465901848311e-07, - "loss": 0.003, - "reward": 1.628753900527954, - "reward_std": 0.16182458400726318, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.41000401973724365, - "step": 623 - }, - { - "completion_length": 144.65625, - "epoch": 0.1988527724665392, - "grad_norm": 9.223255157470703, - "kl": 0.287109375, - "learning_rate": 8.011472275334608e-07, - "loss": 0.0115, - "reward": 1.6554591655731201, - "reward_std": 0.060511715710163116, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5304592251777649, - "step": 624 - }, - { - "completion_length": 96.53125, - "epoch": 0.19917144678138943, - "grad_norm": 21.87114906311035, - "kl": 0.11181640625, - "learning_rate": 8.008285532186106e-07, - "loss": 0.0045, - "reward": 1.5894794464111328, - "reward_std": 0.05112447962164879, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5894794464111328, - "step": 625 - }, - { - "completion_length": 71.28125, - "epoch": 0.19949012109623965, - "grad_norm": 19.726043701171875, - "kl": 0.13671875, - "learning_rate": 8.005098789037603e-07, - "loss": 0.0055, - "reward": 1.563927412033081, - "reward_std": 0.06233830377459526, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4389273524284363, - "step": 626 - }, - { - "completion_length": 17.96875, - "epoch": 0.19980879541108987, - "grad_norm": 35.064876556396484, - "kl": 0.41015625, - "learning_rate": 8.001912045889101e-07, - "loss": 0.0165, - "reward": 1.6311964988708496, - "reward_std": 0.12043973058462143, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6311965584754944, - "rewards/pad": 0.0, - "step": 627 - }, - { - "completion_length": 95.609375, - "epoch": 0.2001274697259401, - "grad_norm": 36.62488555908203, - "kl": 0.259765625, - "learning_rate": 7.998725302740599e-07, - "loss": 0.0104, - "reward": 1.4664359092712402, - "reward_std": 0.19014671444892883, - "rewards/answer_reward": 0.03125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.450810968875885, - "step": 628 - }, - { - "completion_length": 45.25, - "epoch": 0.2004461440407903, - "grad_norm": 18.387773513793945, - "kl": 0.1767578125, - "learning_rate": 7.995538559592097e-07, - "loss": 0.0071, - "reward": 1.5831278562545776, - "reward_std": 0.07549235969781876, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45812782645225525, - "rewards/pad": 0.125, - "step": 629 - }, - { - "completion_length": 73.421875, - "epoch": 0.20076481835564053, - "grad_norm": 35.43833541870117, - "kl": 0.16015625, - "learning_rate": 7.992351816443594e-07, - "loss": 0.0064, - "reward": 1.4841811656951904, - "reward_std": 0.12676116824150085, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3591812252998352, - "rewards/pad": 0.125, - "step": 630 - }, - { - "completion_length": 95.703125, - "epoch": 0.20108349267049075, - "grad_norm": 54.55873107910156, - "kl": 0.1337890625, - "learning_rate": 7.989165073295092e-07, - "loss": 0.0053, - "reward": 1.2906908988952637, - "reward_std": 0.05583186075091362, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2906908690929413, - "step": 631 - }, - { - "completion_length": 97.78125, - "epoch": 0.20140216698534097, - "grad_norm": 25.21178436279297, - "kl": 0.15234375, - "learning_rate": 7.98597833014659e-07, - "loss": 0.0061, - "reward": 1.444516897201538, - "reward_std": 0.0909559354186058, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44451695680618286, - "step": 632 - }, - { - "completion_length": 96.546875, - "epoch": 0.20172084130019122, - "grad_norm": 34.26476287841797, - "kl": 0.224609375, - "learning_rate": 7.982791586998088e-07, - "loss": 0.009, - "reward": 1.4543168544769287, - "reward_std": 0.08605051785707474, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4543168544769287, - "rewards/pad": 0.0, - "step": 633 - }, - { - "completion_length": 149.78125, - "epoch": 0.20203951561504144, - "grad_norm": 44.93718338012695, - "kl": 0.10205078125, - "learning_rate": 7.979604843849585e-07, - "loss": 0.0041, - "reward": 1.506622076034546, - "reward_std": 0.04655434936285019, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5066221356391907, - "rewards/pad": 0.0, - "step": 634 - }, - { - "completion_length": 45.21875, - "epoch": 0.20235818992989166, - "grad_norm": 60.32438278198242, - "kl": 0.1396484375, - "learning_rate": 7.976418100701083e-07, - "loss": 0.0056, - "reward": 1.6560125350952148, - "reward_std": 0.11041535437107086, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40601247549057007, - "rewards/pad": 0.25, - "step": 635 - }, - { - "completion_length": 148.0, - "epoch": 0.20267686424474188, - "grad_norm": 13.036028861999512, - "kl": 0.11328125, - "learning_rate": 7.973231357552581e-07, - "loss": 0.0045, - "reward": 1.3512401580810547, - "reward_std": 0.03171146661043167, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3512400686740875, - "rewards/pad": 0.0, - "step": 636 - }, - { - "completion_length": 70.421875, - "epoch": 0.2029955385595921, - "grad_norm": 24.219802856445312, - "kl": 0.255859375, - "learning_rate": 7.970044614404079e-07, - "loss": 0.0102, - "reward": 1.3509703874588013, - "reward_std": 0.1400013566017151, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3509703278541565, - "rewards/pad": 0.0, - "step": 637 - }, - { - "completion_length": 123.578125, - "epoch": 0.20331421287444232, - "grad_norm": 17.177087783813477, - "kl": 0.1171875, - "learning_rate": 7.966857871255576e-07, - "loss": 0.0047, - "reward": 1.5644482374191284, - "reward_std": 0.039421312510967255, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4394482374191284, - "step": 638 - }, - { - "completion_length": 97.078125, - "epoch": 0.20363288718929254, - "grad_norm": 36.70372009277344, - "kl": 0.189453125, - "learning_rate": 7.963671128107074e-07, - "loss": 0.0076, - "reward": 1.605509638786316, - "reward_std": 0.09545377641916275, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6055095195770264, - "rewards/pad": 0.0, - "step": 639 - }, - { - "completion_length": 71.453125, - "epoch": 0.20395156150414276, - "grad_norm": 17.04865837097168, - "kl": 0.26171875, - "learning_rate": 7.960484384958573e-07, - "loss": 0.0105, - "reward": 1.6390584707260132, - "reward_std": 0.14676693081855774, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5140584111213684, - "rewards/pad": 0.125, - "step": 640 - }, - { - "completion_length": 70.640625, - "epoch": 0.20427023581899298, - "grad_norm": 47.15485382080078, - "kl": 0.1767578125, - "learning_rate": 7.957297641810071e-07, - "loss": 0.0071, - "reward": 1.4928598403930664, - "reward_std": 0.08098393678665161, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4928598403930664, - "rewards/pad": 0.0, - "step": 641 - }, - { - "completion_length": 123.046875, - "epoch": 0.2045889101338432, - "grad_norm": 24.090925216674805, - "kl": 0.10693359375, - "learning_rate": 7.954110898661568e-07, - "loss": 0.0043, - "reward": 1.5162612199783325, - "reward_std": 0.027123302221298218, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5162612199783325, - "rewards/pad": 0.0, - "step": 642 - }, - { - "completion_length": 40.46875, - "epoch": 0.20490758444869345, - "grad_norm": 57.921077728271484, - "kl": 0.19921875, - "learning_rate": 7.950924155513066e-07, - "loss": 0.008, - "reward": 1.4682936668395996, - "reward_std": 0.11122076958417892, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34329357743263245, - "rewards/pad": 0.125, - "step": 643 - }, - { - "completion_length": 149.890625, - "epoch": 0.20522625876354367, - "grad_norm": 100.49211883544922, - "kl": 0.1708984375, - "learning_rate": 7.947737412364564e-07, - "loss": 0.0068, - "reward": 1.3794997930526733, - "reward_std": 0.10549529641866684, - "rewards/answer_reward": 0.015625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3638748228549957, - "step": 644 - }, - { - "completion_length": 94.984375, - "epoch": 0.2055449330783939, - "grad_norm": 91.87433624267578, - "kl": 0.2138671875, - "learning_rate": 7.944550669216062e-07, - "loss": 0.0086, - "reward": 1.4812575578689575, - "reward_std": 0.09469930827617645, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3562575578689575, - "rewards/pad": 0.125, - "step": 645 - }, - { - "completion_length": 93.125, - "epoch": 0.2058636073932441, - "grad_norm": 17.375213623046875, - "kl": 0.1708984375, - "learning_rate": 7.941363926067559e-07, - "loss": 0.0068, - "reward": 1.3135367631912231, - "reward_std": 0.04011062905192375, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.31353676319122314, - "rewards/pad": 0.0, - "step": 646 - }, - { - "completion_length": 96.40625, - "epoch": 0.20618228170809433, - "grad_norm": 16.635448455810547, - "kl": 0.162109375, - "learning_rate": 7.938177182919057e-07, - "loss": 0.0065, - "reward": 1.6755173206329346, - "reward_std": 0.14716209471225739, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5505174398422241, - "rewards/pad": 0.125, - "step": 647 - }, - { - "completion_length": 148.796875, - "epoch": 0.20650095602294455, - "grad_norm": 187.33123779296875, - "kl": 0.07861328125, - "learning_rate": 7.934990439770554e-07, - "loss": 0.0031, - "reward": 1.4606094360351562, - "reward_std": 0.09565722942352295, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39810943603515625, - "step": 648 - }, - { - "completion_length": 73.546875, - "epoch": 0.20681963033779477, - "grad_norm": 36.188865661621094, - "kl": 0.126953125, - "learning_rate": 7.931803696622051e-07, - "loss": 0.0051, - "reward": 1.581721305847168, - "reward_std": 0.05162818729877472, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45672133564949036, - "rewards/pad": 0.125, - "step": 649 - }, - { - "completion_length": 96.640625, - "epoch": 0.207138304652645, - "grad_norm": 46.26791000366211, - "kl": 0.162109375, - "learning_rate": 7.928616953473549e-07, - "loss": 0.0065, - "reward": 1.7344567775726318, - "reward_std": 0.10052649676799774, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6094566583633423, - "rewards/pad": 0.125, - "step": 650 - }, - { - "completion_length": 68.734375, - "epoch": 0.2074569789674952, - "grad_norm": 52.092620849609375, - "kl": 0.474609375, - "learning_rate": 7.925430210325047e-07, - "loss": 0.019, - "reward": 1.5348825454711914, - "reward_std": 0.11189387738704681, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40988245606422424, - "rewards/pad": 0.125, - "step": 651 - }, - { - "completion_length": 96.078125, - "epoch": 0.20777565328234543, - "grad_norm": 100.03245544433594, - "kl": 0.1376953125, - "learning_rate": 7.922243467176545e-07, - "loss": 0.0055, - "reward": 1.4961674213409424, - "reward_std": 0.09468173235654831, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5117923617362976, - "rewards/pad": 0.0, - "step": 652 - }, - { - "completion_length": 97.484375, - "epoch": 0.20809432759719568, - "grad_norm": 35.96536636352539, - "kl": 0.1435546875, - "learning_rate": 7.919056724028042e-07, - "loss": 0.0057, - "reward": 1.52105712890625, - "reward_std": 0.045935630798339844, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3960571587085724, - "rewards/pad": 0.125, - "step": 653 - }, - { - "completion_length": 97.75, - "epoch": 0.2084130019120459, - "grad_norm": 30.760404586791992, - "kl": 0.1884765625, - "learning_rate": 7.91586998087954e-07, - "loss": 0.0075, - "reward": 1.5083422660827637, - "reward_std": 0.11229605227708817, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5083421468734741, - "rewards/pad": 0.0, - "step": 654 - }, - { - "completion_length": 146.828125, - "epoch": 0.20873167622689612, - "grad_norm": 251.43666076660156, - "kl": 0.10595703125, - "learning_rate": 7.912683237731038e-07, - "loss": 0.0042, - "reward": 1.4129326343536377, - "reward_std": 0.04007997736334801, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4129326045513153, - "step": 655 - }, - { - "completion_length": 71.875, - "epoch": 0.20905035054174634, - "grad_norm": 22.842525482177734, - "kl": 0.2392578125, - "learning_rate": 7.909496494582536e-07, - "loss": 0.0096, - "reward": 1.6012282371520996, - "reward_std": 0.08485373854637146, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5856032371520996, - "rewards/pad": 0.015625, - "step": 656 - }, - { - "completion_length": 124.265625, - "epoch": 0.20936902485659656, - "grad_norm": 25.405227661132812, - "kl": 0.2001953125, - "learning_rate": 7.906309751434033e-07, - "loss": 0.008, - "reward": 1.6624181270599365, - "reward_std": 0.08725246042013168, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4124181568622589, - "rewards/pad": 0.25, - "step": 657 - }, - { - "completion_length": 71.84375, - "epoch": 0.20968769917144678, - "grad_norm": 47.46111297607422, - "kl": 0.189453125, - "learning_rate": 7.903123008285531e-07, - "loss": 0.0076, - "reward": 1.605202555656433, - "reward_std": 0.10207031667232513, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35520249605178833, - "rewards/pad": 0.25, - "step": 658 - }, - { - "completion_length": 125.53125, - "epoch": 0.210006373486297, - "grad_norm": 24.989748001098633, - "kl": 0.12890625, - "learning_rate": 7.89993626513703e-07, - "loss": 0.0052, - "reward": 1.513319492340088, - "reward_std": 0.053028907626867294, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5133194923400879, - "rewards/pad": 0.0, - "step": 659 - }, - { - "completion_length": 98.125, - "epoch": 0.21032504780114722, - "grad_norm": 18.915529251098633, - "kl": 0.2080078125, - "learning_rate": 7.896749521988528e-07, - "loss": 0.0083, - "reward": 1.4596118927001953, - "reward_std": 0.09722685813903809, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4596119821071625, - "rewards/pad": 0.0, - "step": 660 - }, - { - "completion_length": 97.546875, - "epoch": 0.21064372211599744, - "grad_norm": 114.42607879638672, - "kl": 0.12060546875, - "learning_rate": 7.893562778840025e-07, - "loss": 0.0048, - "reward": 1.5191466808319092, - "reward_std": 0.09039562195539474, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34727177023887634, - "rewards/pad": 0.171875, - "step": 661 - }, - { - "completion_length": 126.953125, - "epoch": 0.2109623964308477, - "grad_norm": 19.043169021606445, - "kl": 0.09619140625, - "learning_rate": 7.890376035691523e-07, - "loss": 0.0038, - "reward": 1.7435672283172607, - "reward_std": 0.046088241040706635, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4935673475265503, - "rewards/pad": 0.25, - "step": 662 - }, - { - "completion_length": 69.75, - "epoch": 0.2112810707456979, - "grad_norm": 25.736791610717773, - "kl": 0.212890625, - "learning_rate": 7.887189292543021e-07, - "loss": 0.0085, - "reward": 1.5893003940582275, - "reward_std": 0.07170729339122772, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5893005132675171, - "rewards/pad": 0.0, - "step": 663 - }, - { - "completion_length": 97.71875, - "epoch": 0.21159974506054813, - "grad_norm": 66.05033111572266, - "kl": 0.1552734375, - "learning_rate": 7.884002549394519e-07, - "loss": 0.0062, - "reward": 1.508967399597168, - "reward_std": 0.0578170046210289, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5089674592018127, - "rewards/pad": 0.0, - "step": 664 - }, - { - "completion_length": 97.90625, - "epoch": 0.21191841937539835, - "grad_norm": 34.869117736816406, - "kl": 0.10205078125, - "learning_rate": 7.880815806246016e-07, - "loss": 0.0041, - "reward": 1.3681917190551758, - "reward_std": 0.057296961545944214, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3681916892528534, - "step": 665 - }, - { - "completion_length": 123.03125, - "epoch": 0.21223709369024857, - "grad_norm": 25.530683517456055, - "kl": 0.1123046875, - "learning_rate": 7.877629063097514e-07, - "loss": 0.0045, - "reward": 1.5382572412490845, - "reward_std": 0.07736440747976303, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4132571816444397, - "step": 666 - }, - { - "completion_length": 96.1875, - "epoch": 0.2125557680050988, - "grad_norm": 27.343393325805664, - "kl": 0.212890625, - "learning_rate": 7.874442319949012e-07, - "loss": 0.0085, - "reward": 1.3758732080459595, - "reward_std": 0.0433371402323246, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3758732080459595, - "rewards/pad": 0.0, - "step": 667 - }, - { - "completion_length": 126.1875, - "epoch": 0.212874442319949, - "grad_norm": 25.15089988708496, - "kl": 0.08154296875, - "learning_rate": 7.87125557680051e-07, - "loss": 0.0033, - "reward": 1.8917670249938965, - "reward_std": 0.04995495826005936, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5167669653892517, - "step": 668 - }, - { - "completion_length": 146.21875, - "epoch": 0.21319311663479923, - "grad_norm": 39.95366287231445, - "kl": 0.12109375, - "learning_rate": 7.868068833652007e-07, - "loss": 0.0048, - "reward": 1.496253252029419, - "reward_std": 0.06662138551473618, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49625325202941895, - "step": 669 - }, - { - "completion_length": 98.21875, - "epoch": 0.21351179094964945, - "grad_norm": 17.594482421875, - "kl": 0.22265625, - "learning_rate": 7.864882090503505e-07, - "loss": 0.0089, - "reward": 1.582715630531311, - "reward_std": 0.06213941425085068, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5827155709266663, - "rewards/pad": 0.0, - "step": 670 - }, - { - "completion_length": 72.796875, - "epoch": 0.21383046526449967, - "grad_norm": 43.37171936035156, - "kl": 0.1728515625, - "learning_rate": 7.861695347355003e-07, - "loss": 0.0069, - "reward": 1.473966121673584, - "reward_std": 0.09620095789432526, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45834115147590637, - "rewards/pad": 0.015625, - "step": 671 - }, - { - "completion_length": 96.96875, - "epoch": 0.21414913957934992, - "grad_norm": 34.0592155456543, - "kl": 0.1015625, - "learning_rate": 7.858508604206501e-07, - "loss": 0.0041, - "reward": 1.7571685314178467, - "reward_std": 0.0643969252705574, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3821684420108795, - "step": 672 - }, - { - "completion_length": 123.1875, - "epoch": 0.21446781389420014, - "grad_norm": 137.472412109375, - "kl": 0.138671875, - "learning_rate": 7.855321861057998e-07, - "loss": 0.0056, - "reward": 1.4088833332061768, - "reward_std": 0.07924804091453552, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.40888330340385437, - "step": 673 - }, - { - "completion_length": 123.8125, - "epoch": 0.21478648820905036, - "grad_norm": 20.52865982055664, - "kl": 0.11572265625, - "learning_rate": 7.852135117909496e-07, - "loss": 0.0046, - "reward": 1.6505613327026367, - "reward_std": 0.053364917635917664, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5255613923072815, - "step": 674 - }, - { - "completion_length": 98.140625, - "epoch": 0.21510516252390058, - "grad_norm": 25.846698760986328, - "kl": 0.197265625, - "learning_rate": 7.848948374760994e-07, - "loss": 0.0079, - "reward": 1.5987257957458496, - "reward_std": 0.13354355096817017, - "rewards/answer_reward": 0.03125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5674759149551392, - "step": 675 - }, - { - "completion_length": 73.765625, - "epoch": 0.2154238368387508, - "grad_norm": 134.68280029296875, - "kl": 0.1416015625, - "learning_rate": 7.845761631612492e-07, - "loss": 0.0056, - "reward": 1.5867506265640259, - "reward_std": 0.10678602755069733, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3367506265640259, - "rewards/pad": 0.25, - "step": 676 - }, - { - "completion_length": 68.921875, - "epoch": 0.21574251115360102, - "grad_norm": 25.11921501159668, - "kl": 0.2275390625, - "learning_rate": 7.842574888463989e-07, - "loss": 0.0091, - "reward": 1.5466156005859375, - "reward_std": 0.05886758118867874, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5466156005859375, - "step": 677 - }, - { - "completion_length": 122.078125, - "epoch": 0.21606118546845124, - "grad_norm": 41.1529426574707, - "kl": 0.1162109375, - "learning_rate": 7.839388145315488e-07, - "loss": 0.0046, - "reward": 1.5090479850769043, - "reward_std": 0.0427854061126709, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5090479850769043, - "step": 678 - }, - { - "completion_length": 46.921875, - "epoch": 0.21637985978330146, - "grad_norm": 32.37312316894531, - "kl": 0.1474609375, - "learning_rate": 7.836201402166986e-07, - "loss": 0.0059, - "reward": 2.004756450653076, - "reward_std": 0.09021234512329102, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6297565698623657, - "rewards/pad": 0.375, - "step": 679 - }, - { - "completion_length": 126.328125, - "epoch": 0.21669853409815168, - "grad_norm": 71.86237335205078, - "kl": 0.07958984375, - "learning_rate": 7.833014659018483e-07, - "loss": 0.0032, - "reward": 1.5416804552078247, - "reward_std": 0.041763074696063995, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4166804552078247, - "rewards/pad": 0.125, - "step": 680 - }, - { - "completion_length": 121.59375, - "epoch": 0.2170172084130019, - "grad_norm": 16.156023025512695, - "kl": 0.0859375, - "learning_rate": 7.829827915869981e-07, - "loss": 0.0034, - "reward": 1.3597514629364014, - "reward_std": 0.06102152541279793, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.35975152254104614, - "step": 681 - }, - { - "completion_length": 125.03125, - "epoch": 0.21733588272785215, - "grad_norm": 40.038936614990234, - "kl": 0.07666015625, - "learning_rate": 7.826641172721479e-07, - "loss": 0.0031, - "reward": 1.729158639907837, - "reward_std": 0.03963400423526764, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47915855050086975, - "rewards/pad": 0.25, - "step": 682 - }, - { - "completion_length": 96.21875, - "epoch": 0.21765455704270237, - "grad_norm": 85.6227798461914, - "kl": 0.15234375, - "learning_rate": 7.823454429572977e-07, - "loss": 0.0061, - "reward": 1.4844729900360107, - "reward_std": 0.07393090426921844, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48447293043136597, - "step": 683 - }, - { - "completion_length": 121.40625, - "epoch": 0.2179732313575526, - "grad_norm": 11.68277359008789, - "kl": 0.0966796875, - "learning_rate": 7.820267686424474e-07, - "loss": 0.0039, - "reward": 1.4158823490142822, - "reward_std": 0.04511553421616554, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4158823788166046, - "rewards/pad": 0.0, - "step": 684 - }, - { - "completion_length": 146.984375, - "epoch": 0.2182919056724028, - "grad_norm": 8.718667984008789, - "kl": 0.046630859375, - "learning_rate": 7.817080943275972e-07, - "loss": 0.0019, - "reward": 1.6158006191253662, - "reward_std": 0.03874984011054039, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3658006191253662, - "step": 685 - }, - { - "completion_length": 98.03125, - "epoch": 0.21861057998725303, - "grad_norm": 12.766626358032227, - "kl": 0.13671875, - "learning_rate": 7.81389420012747e-07, - "loss": 0.0055, - "reward": 1.731637954711914, - "reward_std": 0.05565390735864639, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6066379547119141, - "rewards/pad": 0.125, - "step": 686 - }, - { - "completion_length": 125.109375, - "epoch": 0.21892925430210325, - "grad_norm": 19.363597869873047, - "kl": 0.12353515625, - "learning_rate": 7.810707456978967e-07, - "loss": 0.0049, - "reward": 1.555992841720581, - "reward_std": 0.08267258107662201, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.555992841720581, - "rewards/pad": 0.0, - "step": 687 - }, - { - "completion_length": 98.484375, - "epoch": 0.21924792861695347, - "grad_norm": 29.480148315429688, - "kl": 0.16015625, - "learning_rate": 7.807520713830464e-07, - "loss": 0.0064, - "reward": 1.4770445823669434, - "reward_std": 0.0575791671872139, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35204464197158813, - "rewards/pad": 0.125, - "step": 688 - }, - { - "completion_length": 150.578125, - "epoch": 0.2195666029318037, - "grad_norm": 33.247291564941406, - "kl": 0.1279296875, - "learning_rate": 7.804333970681962e-07, - "loss": 0.0051, - "reward": 1.4387083053588867, - "reward_std": 0.04876134172081947, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3137083351612091, - "step": 689 - }, - { - "completion_length": 99.796875, - "epoch": 0.2198852772466539, - "grad_norm": 19.67684555053711, - "kl": 0.11181640625, - "learning_rate": 7.80114722753346e-07, - "loss": 0.0045, - "reward": 1.576588749885559, - "reward_std": 0.06405520439147949, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4515886902809143, - "rewards/pad": 0.125, - "step": 690 - }, - { - "completion_length": 70.453125, - "epoch": 0.22020395156150413, - "grad_norm": 17.070262908935547, - "kl": 0.1552734375, - "learning_rate": 7.797960484384958e-07, - "loss": 0.0062, - "reward": 1.4112935066223145, - "reward_std": 0.06292285025119781, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4112934470176697, - "step": 691 - }, - { - "completion_length": 92.796875, - "epoch": 0.22052262587635438, - "grad_norm": 8.107675552368164, - "kl": 0.16015625, - "learning_rate": 7.794773741236455e-07, - "loss": 0.0064, - "reward": 1.4180748462677002, - "reward_std": 0.07736538350582123, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4180748164653778, - "rewards/pad": 0.0, - "step": 692 - }, - { - "completion_length": 98.6875, - "epoch": 0.2208413001912046, - "grad_norm": 38.32047653198242, - "kl": 0.19140625, - "learning_rate": 7.791586998087953e-07, - "loss": 0.0077, - "reward": 1.3982741832733154, - "reward_std": 0.06808879971504211, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3982742428779602, - "rewards/pad": 0.0, - "step": 693 - }, - { - "completion_length": 148.578125, - "epoch": 0.22115997450605482, - "grad_norm": 32.58980941772461, - "kl": 0.115234375, - "learning_rate": 7.788400254939451e-07, - "loss": 0.0046, - "reward": 1.4551475048065186, - "reward_std": 0.0770447701215744, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45514750480651855, - "step": 694 - }, - { - "completion_length": 149.90625, - "epoch": 0.22147864882090504, - "grad_norm": 21.80135154724121, - "kl": 0.1064453125, - "learning_rate": 7.785213511790949e-07, - "loss": 0.0043, - "reward": 1.4896881580352783, - "reward_std": 0.05898185819387436, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36468809843063354, - "step": 695 - }, - { - "completion_length": 46.96875, - "epoch": 0.22179732313575526, - "grad_norm": 161.15521240234375, - "kl": 0.1484375, - "learning_rate": 7.782026768642446e-07, - "loss": 0.0059, - "reward": 1.559443712234497, - "reward_std": 0.1841370314359665, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3719435930252075, - "rewards/pad": 0.1875, - "step": 696 - }, - { - "completion_length": 97.328125, - "epoch": 0.22211599745060548, - "grad_norm": 43.470455169677734, - "kl": 0.11376953125, - "learning_rate": 7.778840025493945e-07, - "loss": 0.0045, - "reward": 1.5548875331878662, - "reward_std": 0.13499760627746582, - "rewards/answer_reward": 0.015625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5392624139785767, - "step": 697 - }, - { - "completion_length": 149.1875, - "epoch": 0.2224346717654557, - "grad_norm": 46.18128967285156, - "kl": 0.078125, - "learning_rate": 7.775653282345443e-07, - "loss": 0.0031, - "reward": 1.4198707342147827, - "reward_std": 0.062181681394577026, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4198707640171051, - "step": 698 - }, - { - "completion_length": 97.953125, - "epoch": 0.22275334608030592, - "grad_norm": 26.341793060302734, - "kl": 0.12451171875, - "learning_rate": 7.772466539196941e-07, - "loss": 0.005, - "reward": 1.64949369430542, - "reward_std": 0.1283523142337799, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5401187539100647, - "step": 699 - }, - { - "completion_length": 146.609375, - "epoch": 0.22307202039515614, - "grad_norm": 10.743393898010254, - "kl": 0.05810546875, - "learning_rate": 7.769279796048438e-07, - "loss": 0.0023, - "reward": 1.4474786520004272, - "reward_std": 0.03060361184179783, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44747862219810486, - "rewards/pad": 0.0, - "step": 700 - }, - { - "completion_length": 123.90625, - "epoch": 0.22339069471000636, - "grad_norm": 22.797927856445312, - "kl": 0.357421875, - "learning_rate": 7.766093052899936e-07, - "loss": 0.0143, - "reward": 1.7013278007507324, - "reward_std": 0.08491910994052887, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5763278603553772, - "rewards/pad": 0.125, - "step": 701 - }, - { - "completion_length": 71.453125, - "epoch": 0.2237093690248566, - "grad_norm": 17.504398345947266, - "kl": 0.27734375, - "learning_rate": 7.762906309751434e-07, - "loss": 0.0111, - "reward": 1.6171817779541016, - "reward_std": 0.11721836030483246, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6171817779541016, - "rewards/pad": 0.0, - "step": 702 - }, - { - "completion_length": 46.109375, - "epoch": 0.22402804333970683, - "grad_norm": 40.05861282348633, - "kl": 0.1328125, - "learning_rate": 7.759719566602932e-07, - "loss": 0.0053, - "reward": 1.6811028718948364, - "reward_std": 0.08720278739929199, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4311029613018036, - "step": 703 - }, - { - "completion_length": 151.890625, - "epoch": 0.22434671765455705, - "grad_norm": 27.30905532836914, - "kl": 0.1025390625, - "learning_rate": 7.756532823454429e-07, - "loss": 0.0041, - "reward": 1.602893352508545, - "reward_std": 0.12148568034172058, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4935183823108673, - "rewards/pad": 0.125, - "step": 704 - }, - { - "completion_length": 121.765625, - "epoch": 0.22466539196940727, - "grad_norm": 20.993581771850586, - "kl": 0.21484375, - "learning_rate": 7.753346080305927e-07, - "loss": 0.0086, - "reward": 1.5430442094802856, - "reward_std": 0.09577062726020813, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5430442690849304, - "rewards/pad": 0.0, - "step": 705 - }, - { - "completion_length": 21.5625, - "epoch": 0.2249840662842575, - "grad_norm": 50.83089828491211, - "kl": 0.15234375, - "learning_rate": 7.750159337157425e-07, - "loss": 0.0061, - "reward": 1.6595174074172974, - "reward_std": 0.07145840674638748, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.2845174968242645, - "step": 706 - }, - { - "completion_length": 98.28125, - "epoch": 0.2253027405991077, - "grad_norm": 35.206878662109375, - "kl": 0.115234375, - "learning_rate": 7.746972594008923e-07, - "loss": 0.0046, - "reward": 1.5010448694229126, - "reward_std": 0.11892069876194, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2822949290275574, - "rewards/pad": 0.21875, - "step": 707 - }, - { - "completion_length": 96.734375, - "epoch": 0.22562141491395793, - "grad_norm": 97.78717803955078, - "kl": 0.1435546875, - "learning_rate": 7.74378585086042e-07, - "loss": 0.0057, - "reward": 1.5857737064361572, - "reward_std": 0.13069474697113037, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.460773766040802, - "rewards/pad": 0.125, - "step": 708 - }, - { - "completion_length": 45.015625, - "epoch": 0.22594008922880815, - "grad_norm": 37.87092208862305, - "kl": 0.1396484375, - "learning_rate": 7.740599107711918e-07, - "loss": 0.0056, - "reward": 1.6845557689666748, - "reward_std": 0.0830632746219635, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5595556497573853, - "step": 709 - }, - { - "completion_length": 73.625, - "epoch": 0.22625876354365837, - "grad_norm": 21.10732650756836, - "kl": 0.142578125, - "learning_rate": 7.737412364563416e-07, - "loss": 0.0057, - "reward": 1.508007526397705, - "reward_std": 0.15659797191619873, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5236325263977051, - "step": 710 - }, - { - "completion_length": 46.296875, - "epoch": 0.22657743785850862, - "grad_norm": 91.6818618774414, - "kl": 0.1669921875, - "learning_rate": 7.734225621414913e-07, - "loss": 0.0067, - "reward": 1.6029472351074219, - "reward_std": 0.11285988241434097, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.47794726490974426, - "step": 711 - }, - { - "completion_length": 123.28125, - "epoch": 0.22689611217335884, - "grad_norm": 85.90867614746094, - "kl": 0.263671875, - "learning_rate": 7.731038878266411e-07, - "loss": 0.0106, - "reward": 1.525963544845581, - "reward_std": 0.09025786817073822, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5259636640548706, - "step": 712 - }, - { - "completion_length": 123.890625, - "epoch": 0.22721478648820906, - "grad_norm": 61.534950256347656, - "kl": 0.0791015625, - "learning_rate": 7.727852135117909e-07, - "loss": 0.0032, - "reward": 1.6034601926803589, - "reward_std": 0.07537369430065155, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4784601330757141, - "rewards/pad": 0.125, - "step": 713 - }, - { - "completion_length": 98.46875, - "epoch": 0.22753346080305928, - "grad_norm": 45.14211654663086, - "kl": 0.1328125, - "learning_rate": 7.724665391969407e-07, - "loss": 0.0053, - "reward": 1.4492158889770508, - "reward_std": 0.14678949117660522, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4179658591747284, - "rewards/pad": 0.03125, - "step": 714 - }, - { - "completion_length": 72.203125, - "epoch": 0.2278521351179095, - "grad_norm": 30.922330856323242, - "kl": 0.15234375, - "learning_rate": 7.721478648820904e-07, - "loss": 0.0061, - "reward": 1.5465211868286133, - "reward_std": 0.08825637400150299, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5465211868286133, - "rewards/pad": 0.0, - "step": 715 - }, - { - "completion_length": 175.65625, - "epoch": 0.22817080943275972, - "grad_norm": 17.37681770324707, - "kl": 0.04443359375, - "learning_rate": 7.718291905672403e-07, - "loss": 0.0018, - "reward": 1.5493981838226318, - "reward_std": 0.06132122129201889, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4243982434272766, - "step": 716 - }, - { - "completion_length": 124.8125, - "epoch": 0.22848948374760994, - "grad_norm": 31.47076988220215, - "kl": 0.09521484375, - "learning_rate": 7.715105162523901e-07, - "loss": 0.0038, - "reward": 1.6855530738830566, - "reward_std": 0.05492226034402847, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5605530738830566, - "rewards/pad": 0.125, - "step": 717 - }, - { - "completion_length": 71.03125, - "epoch": 0.22880815806246016, - "grad_norm": 23.651077270507812, - "kl": 0.21484375, - "learning_rate": 7.711918419375399e-07, - "loss": 0.0086, - "reward": 1.5401701927185059, - "reward_std": 0.06382852792739868, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5401701331138611, - "step": 718 - }, - { - "completion_length": 70.734375, - "epoch": 0.22912683237731038, - "grad_norm": 27.53903579711914, - "kl": 0.22265625, - "learning_rate": 7.708731676226896e-07, - "loss": 0.0089, - "reward": 1.4649901390075684, - "reward_std": 0.13506777584552765, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4649902284145355, - "rewards/pad": 0.0, - "step": 719 - }, - { - "completion_length": 72.0625, - "epoch": 0.2294455066921606, - "grad_norm": 31.68392562866211, - "kl": 0.126953125, - "learning_rate": 7.705544933078394e-07, - "loss": 0.0051, - "reward": 1.4837205410003662, - "reward_std": 0.07027556002140045, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.358720600605011, - "step": 720 - }, - { - "completion_length": 94.421875, - "epoch": 0.22976418100701085, - "grad_norm": 36.224647521972656, - "kl": 0.130859375, - "learning_rate": 7.702358189929892e-07, - "loss": 0.0052, - "reward": 1.4542423486709595, - "reward_std": 0.08611422777175903, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4542423486709595, - "rewards/pad": 0.0, - "step": 721 - }, - { - "completion_length": 118.578125, - "epoch": 0.23008285532186107, - "grad_norm": 27.81844139099121, - "kl": 0.11572265625, - "learning_rate": 7.69917144678139e-07, - "loss": 0.0046, - "reward": 1.655103087425232, - "reward_std": 0.0717756524682045, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5301031470298767, - "step": 722 - }, - { - "completion_length": 100.34375, - "epoch": 0.2304015296367113, - "grad_norm": 17.767757415771484, - "kl": 0.11083984375, - "learning_rate": 7.695984703632887e-07, - "loss": 0.0044, - "reward": 1.4792160987854004, - "reward_std": 0.06914187967777252, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35421618819236755, - "rewards/pad": 0.125, - "step": 723 - }, - { - "completion_length": 123.5625, - "epoch": 0.2307202039515615, - "grad_norm": 48.519046783447266, - "kl": 0.1357421875, - "learning_rate": 7.692797960484385e-07, - "loss": 0.0054, - "reward": 1.6084320545196533, - "reward_std": 0.13084352016448975, - "rewards/answer_reward": 0.0625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5459321737289429, - "step": 724 - }, - { - "completion_length": 151.90625, - "epoch": 0.23103887826641173, - "grad_norm": 17.062904357910156, - "kl": 0.0634765625, - "learning_rate": 7.689611217335883e-07, - "loss": 0.0025, - "reward": 1.7455984354019165, - "reward_std": 0.06851904094219208, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4955984652042389, - "step": 725 - }, - { - "completion_length": 122.59375, - "epoch": 0.23135755258126195, - "grad_norm": 86.39067077636719, - "kl": 0.1064453125, - "learning_rate": 7.68642447418738e-07, - "loss": 0.0043, - "reward": 1.4053685665130615, - "reward_std": 0.08708634972572327, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4053686261177063, - "step": 726 - }, - { - "completion_length": 70.765625, - "epoch": 0.23167622689611217, - "grad_norm": 253.50587463378906, - "kl": 0.25, - "learning_rate": 7.683237731038877e-07, - "loss": 0.01, - "reward": 1.5726487636566162, - "reward_std": 0.11331886053085327, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.572648823261261, - "rewards/pad": 0.0, - "step": 727 - }, - { - "completion_length": 70.640625, - "epoch": 0.2319949012109624, - "grad_norm": 20.981658935546875, - "kl": 0.1337890625, - "learning_rate": 7.680050987890375e-07, - "loss": 0.0053, - "reward": 1.7513389587402344, - "reward_std": 0.0925459936261177, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6263389587402344, - "rewards/pad": 0.125, - "step": 728 - }, - { - "completion_length": 149.046875, - "epoch": 0.2323135755258126, - "grad_norm": 19.781028747558594, - "kl": 0.068359375, - "learning_rate": 7.676864244741873e-07, - "loss": 0.0027, - "reward": 1.5592389106750488, - "reward_std": 0.038696493953466415, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4342389404773712, - "step": 729 - }, - { - "completion_length": 69.6875, - "epoch": 0.23263224984066283, - "grad_norm": 32.08163833618164, - "kl": 0.09619140625, - "learning_rate": 7.673677501593371e-07, - "loss": 0.0038, - "reward": 1.599071979522705, - "reward_std": 0.06046079471707344, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5990719795227051, - "rewards/pad": 0.0, - "step": 730 - }, - { - "completion_length": 71.109375, - "epoch": 0.23295092415551308, - "grad_norm": 55.546295166015625, - "kl": 0.16015625, - "learning_rate": 7.670490758444868e-07, - "loss": 0.0064, - "reward": 1.6382114887237549, - "reward_std": 0.12515056133270264, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5132115483283997, - "step": 731 - }, - { - "completion_length": 45.84375, - "epoch": 0.2332695984703633, - "grad_norm": 19.57425880432129, - "kl": 0.205078125, - "learning_rate": 7.667304015296366e-07, - "loss": 0.0082, - "reward": 1.5063813924789429, - "reward_std": 0.07701030373573303, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5063812732696533, - "rewards/pad": 0.0, - "step": 732 - }, - { - "completion_length": 176.40625, - "epoch": 0.23358827278521352, - "grad_norm": 15.526483535766602, - "kl": 0.06591796875, - "learning_rate": 7.664117272147864e-07, - "loss": 0.0026, - "reward": 1.5167168378829956, - "reward_std": 0.06336955726146698, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5167168974876404, - "rewards/pad": 0.0, - "step": 733 - }, - { - "completion_length": 122.890625, - "epoch": 0.23390694710006374, - "grad_norm": 20.98141098022461, - "kl": 0.08447265625, - "learning_rate": 7.660930528999362e-07, - "loss": 0.0034, - "reward": 1.6783380508422852, - "reward_std": 0.07113255560398102, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5533379316329956, - "rewards/pad": 0.125, - "step": 734 - }, - { - "completion_length": 98.4375, - "epoch": 0.23422562141491396, - "grad_norm": 34.61638259887695, - "kl": 0.1943359375, - "learning_rate": 7.65774378585086e-07, - "loss": 0.0077, - "reward": 1.6574945449829102, - "reward_std": 0.08876973390579224, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40749454498291016, - "rewards/pad": 0.25, - "step": 735 - }, - { - "completion_length": 71.28125, - "epoch": 0.23454429572976418, - "grad_norm": 119.02059936523438, - "kl": 0.1787109375, - "learning_rate": 7.654557042702358e-07, - "loss": 0.0072, - "reward": 1.688605785369873, - "reward_std": 0.09231105446815491, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.563605785369873, - "step": 736 - }, - { - "completion_length": 101.03125, - "epoch": 0.2348629700446144, - "grad_norm": 47.49235153198242, - "kl": 0.107421875, - "learning_rate": 7.651370299553856e-07, - "loss": 0.0043, - "reward": 1.6657613515853882, - "reward_std": 0.058830223977565765, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.540761411190033, - "step": 737 - }, - { - "completion_length": 20.640625, - "epoch": 0.23518164435946462, - "grad_norm": 181.3784637451172, - "kl": 0.2109375, - "learning_rate": 7.648183556405353e-07, - "loss": 0.0084, - "reward": 1.7108551263809204, - "reward_std": 0.0872071236371994, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3358551263809204, - "rewards/pad": 0.375, - "step": 738 - }, - { - "completion_length": 45.609375, - "epoch": 0.23550031867431484, - "grad_norm": 47.21954345703125, - "kl": 0.1396484375, - "learning_rate": 7.644996813256851e-07, - "loss": 0.0056, - "reward": 1.4048912525177002, - "reward_std": 0.09731826186180115, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40489131212234497, - "rewards/pad": 0.0, - "step": 739 - }, - { - "completion_length": 149.640625, - "epoch": 0.23581899298916506, - "grad_norm": 21.183887481689453, - "kl": 0.09521484375, - "learning_rate": 7.641810070108349e-07, - "loss": 0.0038, - "reward": 1.4415016174316406, - "reward_std": 0.09589053690433502, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44150158762931824, - "step": 740 - }, - { - "completion_length": 97.921875, - "epoch": 0.2361376673040153, - "grad_norm": 84.9200439453125, - "kl": 0.11669921875, - "learning_rate": 7.638623326959847e-07, - "loss": 0.0047, - "reward": 1.4293819665908813, - "reward_std": 0.07976148277521133, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42938196659088135, - "step": 741 - }, - { - "completion_length": 72.359375, - "epoch": 0.23645634161886553, - "grad_norm": 28.53392791748047, - "kl": 0.1396484375, - "learning_rate": 7.635436583811344e-07, - "loss": 0.0056, - "reward": 1.4849293231964111, - "reward_std": 0.05757514387369156, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3599294126033783, - "rewards/pad": 0.125, - "step": 742 - }, - { - "completion_length": 46.703125, - "epoch": 0.23677501593371575, - "grad_norm": 44.556705474853516, - "kl": 0.1376953125, - "learning_rate": 7.632249840662842e-07, - "loss": 0.0055, - "reward": 1.5273563861846924, - "reward_std": 0.08845867216587067, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40235644578933716, - "rewards/pad": 0.125, - "step": 743 - }, - { - "completion_length": 123.4375, - "epoch": 0.23709369024856597, - "grad_norm": 45.98194122314453, - "kl": 0.10009765625, - "learning_rate": 7.62906309751434e-07, - "loss": 0.004, - "reward": 1.3977434635162354, - "reward_std": 0.043766941875219345, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3977435231208801, - "step": 744 - }, - { - "completion_length": 72.25, - "epoch": 0.2374123645634162, - "grad_norm": 16.99376106262207, - "kl": 0.2060546875, - "learning_rate": 7.625876354365838e-07, - "loss": 0.0082, - "reward": 1.5830817222595215, - "reward_std": 0.03701595962047577, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.45808184146881104, - "step": 745 - }, - { - "completion_length": 121.546875, - "epoch": 0.2377310388782664, - "grad_norm": 47.60268020629883, - "kl": 0.09716796875, - "learning_rate": 7.622689611217335e-07, - "loss": 0.0039, - "reward": 1.6375110149383545, - "reward_std": 0.07031754404306412, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5125110745429993, - "step": 746 - }, - { - "completion_length": 151.84375, - "epoch": 0.23804971319311663, - "grad_norm": 24.645835876464844, - "kl": 0.03662109375, - "learning_rate": 7.619502868068833e-07, - "loss": 0.0015, - "reward": 1.6020203828811646, - "reward_std": 0.03844858705997467, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4770203232765198, - "step": 747 - }, - { - "completion_length": 74.140625, - "epoch": 0.23836838750796685, - "grad_norm": 51.959228515625, - "kl": 0.1005859375, - "learning_rate": 7.616316124920331e-07, - "loss": 0.004, - "reward": 1.730022668838501, - "reward_std": 0.1415863335132599, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43314772844314575, - "rewards/pad": 0.296875, - "step": 748 - }, - { - "completion_length": 71.375, - "epoch": 0.23868706182281707, - "grad_norm": 108.53982543945312, - "kl": 0.267578125, - "learning_rate": 7.613129381771829e-07, - "loss": 0.0107, - "reward": 1.4023091793060303, - "reward_std": 0.09092642366886139, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4023091197013855, - "rewards/pad": 0.0, - "step": 749 - }, - { - "completion_length": 173.0625, - "epoch": 0.2390057361376673, - "grad_norm": 9.18797779083252, - "kl": 0.06640625, - "learning_rate": 7.609942638623326e-07, - "loss": 0.0027, - "reward": 1.4714884757995605, - "reward_std": 0.09247317910194397, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4871135354042053, - "rewards/pad": 0.0, - "step": 750 - }, - { - "completion_length": 123.609375, - "epoch": 0.23932441045251754, - "grad_norm": 15.563135147094727, - "kl": 0.11474609375, - "learning_rate": 7.606755895474824e-07, - "loss": 0.0046, - "reward": 1.6107486486434937, - "reward_std": 0.0644153356552124, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4857487380504608, - "rewards/pad": 0.125, - "step": 751 - }, - { - "completion_length": 122.96875, - "epoch": 0.23964308476736776, - "grad_norm": 40.96657943725586, - "kl": 0.07177734375, - "learning_rate": 7.603569152326322e-07, - "loss": 0.0029, - "reward": 1.460272192955017, - "reward_std": 0.05413992702960968, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4602721631526947, - "rewards/pad": 0.0, - "step": 752 - }, - { - "completion_length": 72.1875, - "epoch": 0.23996175908221798, - "grad_norm": 37.28609085083008, - "kl": 0.14453125, - "learning_rate": 7.60038240917782e-07, - "loss": 0.0058, - "reward": 1.4650516510009766, - "reward_std": 0.05887313932180405, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46505165100097656, - "rewards/pad": 0.0, - "step": 753 - }, - { - "completion_length": 70.875, - "epoch": 0.2402804333970682, - "grad_norm": 28.794374465942383, - "kl": 0.2431640625, - "learning_rate": 7.597195666029318e-07, - "loss": 0.0097, - "reward": 1.8270920515060425, - "reward_std": 0.11150316894054413, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7020920515060425, - "rewards/pad": 0.125, - "step": 754 - }, - { - "completion_length": 96.046875, - "epoch": 0.24059910771191842, - "grad_norm": 22.261991500854492, - "kl": 0.11767578125, - "learning_rate": 7.594008922880816e-07, - "loss": 0.0047, - "reward": 1.6866573095321655, - "reward_std": 0.045180194079875946, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6866573095321655, - "rewards/pad": 0.0, - "step": 755 - }, - { - "completion_length": 71.75, - "epoch": 0.24091778202676864, - "grad_norm": 33.53934097290039, - "kl": 0.150390625, - "learning_rate": 7.590822179732314e-07, - "loss": 0.006, - "reward": 1.5973482131958008, - "reward_std": 0.1039232611656189, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.472348153591156, - "rewards/pad": 0.125, - "step": 756 - }, - { - "completion_length": 98.234375, - "epoch": 0.24123645634161886, - "grad_norm": 24.93689727783203, - "kl": 0.1025390625, - "learning_rate": 7.587635436583812e-07, - "loss": 0.0041, - "reward": 1.6780802011489868, - "reward_std": 0.10466856509447098, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5530802607536316, - "step": 757 - }, - { - "completion_length": 72.859375, - "epoch": 0.24155513065646908, - "grad_norm": 33.80509567260742, - "kl": 0.10546875, - "learning_rate": 7.584448693435309e-07, - "loss": 0.0042, - "reward": 1.6861860752105713, - "reward_std": 0.08993145823478699, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43618613481521606, - "step": 758 - }, - { - "completion_length": 46.296875, - "epoch": 0.2418738049713193, - "grad_norm": 38.505409240722656, - "kl": 0.1767578125, - "learning_rate": 7.581261950286807e-07, - "loss": 0.0071, - "reward": 1.5051474571228027, - "reward_std": 0.11469937860965729, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3801473379135132, - "rewards/pad": 0.125, - "step": 759 - }, - { - "completion_length": 150.734375, - "epoch": 0.24219247928616955, - "grad_norm": 51.90461349487305, - "kl": 0.091796875, - "learning_rate": 7.578075207138305e-07, - "loss": 0.0037, - "reward": 1.5936484336853027, - "reward_std": 0.08334853500127792, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46864843368530273, - "step": 760 - }, - { - "completion_length": 20.125, - "epoch": 0.24251115360101977, - "grad_norm": 49.98828887939453, - "kl": 0.18359375, - "learning_rate": 7.574888463989803e-07, - "loss": 0.0073, - "reward": 1.5463662147521973, - "reward_std": 0.11250153183937073, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4213660955429077, - "rewards/pad": 0.125, - "step": 761 - }, - { - "completion_length": 175.65625, - "epoch": 0.24282982791587, - "grad_norm": 10.89474868774414, - "kl": 0.058349609375, - "learning_rate": 7.5717017208413e-07, - "loss": 0.0023, - "reward": 1.4550466537475586, - "reward_std": 0.05955340713262558, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45504674315452576, - "step": 762 - }, - { - "completion_length": 97.828125, - "epoch": 0.2431485022307202, - "grad_norm": 26.39861488342285, - "kl": 0.47265625, - "learning_rate": 7.568514977692798e-07, - "loss": 0.0189, - "reward": 1.3110451698303223, - "reward_std": 0.08869006484746933, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.29542022943496704, - "step": 763 - }, - { - "completion_length": 71.5625, - "epoch": 0.24346717654557043, - "grad_norm": 74.72175598144531, - "kl": 0.13671875, - "learning_rate": 7.565328234544296e-07, - "loss": 0.0055, - "reward": 1.6570360660552979, - "reward_std": 0.06887185573577881, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4070361256599426, - "step": 764 - }, - { - "completion_length": 122.078125, - "epoch": 0.24378585086042065, - "grad_norm": 26.987380981445312, - "kl": 0.173828125, - "learning_rate": 7.562141491395793e-07, - "loss": 0.0069, - "reward": 1.4231479167938232, - "reward_std": 0.06835635006427765, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.423147976398468, - "step": 765 - }, - { - "completion_length": 123.5, - "epoch": 0.24410452517527087, - "grad_norm": 18.751310348510742, - "kl": 0.091796875, - "learning_rate": 7.55895474824729e-07, - "loss": 0.0037, - "reward": 1.412963628768921, - "reward_std": 0.04932909086346626, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4129636585712433, - "rewards/pad": 0.0, - "step": 766 - }, - { - "completion_length": 71.390625, - "epoch": 0.2444231994901211, - "grad_norm": 201.6680450439453, - "kl": 0.146484375, - "learning_rate": 7.555768005098788e-07, - "loss": 0.0059, - "reward": 1.555466890335083, - "reward_std": 0.06505988538265228, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43046680092811584, - "rewards/pad": 0.125, - "step": 767 - }, - { - "completion_length": 97.40625, - "epoch": 0.2447418738049713, - "grad_norm": 33.423675537109375, - "kl": 0.1796875, - "learning_rate": 7.552581261950286e-07, - "loss": 0.0072, - "reward": 1.3956812620162964, - "reward_std": 0.07769222557544708, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.395681232213974, - "rewards/pad": 0.0, - "step": 768 - }, - { - "completion_length": 77.265625, - "epoch": 0.24506054811982153, - "grad_norm": 28.018936157226562, - "kl": 0.1103515625, - "learning_rate": 7.549394518801783e-07, - "loss": 0.0044, - "reward": 1.811413049697876, - "reward_std": 0.07297977805137634, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43641290068626404, - "step": 769 - }, - { - "completion_length": 122.46875, - "epoch": 0.24537922243467178, - "grad_norm": 21.377849578857422, - "kl": 0.095703125, - "learning_rate": 7.546207775653281e-07, - "loss": 0.0038, - "reward": 1.536658525466919, - "reward_std": 0.14788584411144257, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4741585850715637, - "rewards/pad": 0.078125, - "step": 770 - }, - { - "completion_length": 72.6875, - "epoch": 0.245697896749522, - "grad_norm": 68.76750946044922, - "kl": 0.15234375, - "learning_rate": 7.543021032504779e-07, - "loss": 0.0061, - "reward": 1.7531397342681885, - "reward_std": 0.08184407651424408, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5031397938728333, - "rewards/pad": 0.25, - "step": 771 - }, - { - "completion_length": 43.765625, - "epoch": 0.24601657106437222, - "grad_norm": 76.53553771972656, - "kl": 0.16015625, - "learning_rate": 7.539834289356277e-07, - "loss": 0.0064, - "reward": 1.7371867895126343, - "reward_std": 0.11019481718540192, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6121867895126343, - "rewards/pad": 0.125, - "step": 772 - }, - { - "completion_length": 98.421875, - "epoch": 0.24633524537922244, - "grad_norm": 51.312599182128906, - "kl": 0.2119140625, - "learning_rate": 7.536647546207775e-07, - "loss": 0.0085, - "reward": 1.6053152084350586, - "reward_std": 0.05878560245037079, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48031532764434814, - "rewards/pad": 0.125, - "step": 773 - }, - { - "completion_length": 123.9375, - "epoch": 0.24665391969407266, - "grad_norm": 9.957165718078613, - "kl": 0.16015625, - "learning_rate": 7.533460803059273e-07, - "loss": 0.0064, - "reward": 1.5390030145645142, - "reward_std": 0.054607875645160675, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5390030145645142, - "rewards/pad": 0.0, - "step": 774 - }, - { - "completion_length": 69.703125, - "epoch": 0.24697259400892288, - "grad_norm": 35.7075080871582, - "kl": 0.1162109375, - "learning_rate": 7.530274059910771e-07, - "loss": 0.0046, - "reward": 1.5532881021499634, - "reward_std": 0.06243626028299332, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5532881021499634, - "rewards/pad": 0.0, - "step": 775 - }, - { - "completion_length": 43.390625, - "epoch": 0.2472912683237731, - "grad_norm": 34.16914367675781, - "kl": 0.16796875, - "learning_rate": 7.527087316762269e-07, - "loss": 0.0067, - "reward": 1.642035961151123, - "reward_std": 0.12714436650276184, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5951609015464783, - "rewards/pad": 0.046875, - "step": 776 - }, - { - "completion_length": 96.9375, - "epoch": 0.24760994263862332, - "grad_norm": 64.47351837158203, - "kl": 0.150390625, - "learning_rate": 7.523900573613766e-07, - "loss": 0.006, - "reward": 1.5337800979614258, - "reward_std": 0.11536523699760437, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.533780038356781, - "rewards/pad": 0.0, - "step": 777 - }, - { - "completion_length": 45.4375, - "epoch": 0.24792861695347354, - "grad_norm": 20.62556266784668, - "kl": 0.2265625, - "learning_rate": 7.520713830465264e-07, - "loss": 0.0091, - "reward": 1.589827537536621, - "reward_std": 0.08547364175319672, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5898275375366211, - "rewards/pad": 0.0, - "step": 778 - }, - { - "completion_length": 125.1875, - "epoch": 0.24824729126832376, - "grad_norm": 43.06966018676758, - "kl": 0.25390625, - "learning_rate": 7.517527087316762e-07, - "loss": 0.0101, - "reward": 1.522845983505249, - "reward_std": 0.06987912952899933, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5228460431098938, - "rewards/pad": 0.0, - "step": 779 - }, - { - "completion_length": 126.515625, - "epoch": 0.248565965583174, - "grad_norm": 93.59717559814453, - "kl": 0.10205078125, - "learning_rate": 7.51434034416826e-07, - "loss": 0.0041, - "reward": 1.511330485343933, - "reward_std": 0.07540356367826462, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.2925805449485779, - "step": 780 - }, - { - "completion_length": 96.0625, - "epoch": 0.24888463989802423, - "grad_norm": 24.386804580688477, - "kl": 0.1279296875, - "learning_rate": 7.511153601019757e-07, - "loss": 0.0051, - "reward": 1.5739428997039795, - "reward_std": 0.04993325099349022, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4489428997039795, - "rewards/pad": 0.125, - "step": 781 - }, - { - "completion_length": 97.515625, - "epoch": 0.24920331421287445, - "grad_norm": 21.700822830200195, - "kl": 0.1279296875, - "learning_rate": 7.507966857871255e-07, - "loss": 0.0051, - "reward": 1.456433653831482, - "reward_std": 0.060126226395368576, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45643362402915955, - "rewards/pad": 0.0, - "step": 782 - }, - { - "completion_length": 72.609375, - "epoch": 0.24952198852772467, - "grad_norm": 12.901528358459473, - "kl": 0.12890625, - "learning_rate": 7.504780114722753e-07, - "loss": 0.0052, - "reward": 1.8618481159210205, - "reward_std": 0.06517429649829865, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6118481159210205, - "step": 783 - }, - { - "completion_length": 70.9375, - "epoch": 0.2498406628425749, - "grad_norm": 112.09280395507812, - "kl": 0.1435546875, - "learning_rate": 7.501593371574251e-07, - "loss": 0.0057, - "reward": 1.5489466190338135, - "reward_std": 0.061221715062856674, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4239466190338135, - "rewards/pad": 0.125, - "step": 784 - }, - { - "completion_length": 97.859375, - "epoch": 0.2501593371574251, - "grad_norm": 24.970352172851562, - "kl": 0.146484375, - "learning_rate": 7.498406628425748e-07, - "loss": 0.0058, - "reward": 1.50575590133667, - "reward_std": 0.07637608051300049, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5057559609413147, - "rewards/pad": 0.0, - "step": 785 - }, - { - "completion_length": 68.609375, - "epoch": 0.25047801147227533, - "grad_norm": 78.99993133544922, - "kl": 0.11669921875, - "learning_rate": 7.495219885277246e-07, - "loss": 0.0047, - "reward": 1.6616427898406982, - "reward_std": 0.11297961324453354, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.599142849445343, - "rewards/pad": 0.0625, - "step": 786 - }, - { - "completion_length": 96.703125, - "epoch": 0.25079668578712555, - "grad_norm": 28.041793823242188, - "kl": 0.107421875, - "learning_rate": 7.492033142128744e-07, - "loss": 0.0043, - "reward": 1.4415643215179443, - "reward_std": 0.05869434028863907, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44156432151794434, - "step": 787 - }, - { - "completion_length": 176.875, - "epoch": 0.25111536010197577, - "grad_norm": 38.69453430175781, - "kl": 0.078125, - "learning_rate": 7.488846398980242e-07, - "loss": 0.0031, - "reward": 1.4662871360778809, - "reward_std": 0.04408044368028641, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.466287225484848, - "rewards/pad": 0.0, - "step": 788 - }, - { - "completion_length": 98.40625, - "epoch": 0.251434034416826, - "grad_norm": 77.54596710205078, - "kl": 0.0947265625, - "learning_rate": 7.485659655831739e-07, - "loss": 0.0038, - "reward": 1.611450433731079, - "reward_std": 0.051737017929553986, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4864504635334015, - "rewards/pad": 0.125, - "step": 789 - }, - { - "completion_length": 124.15625, - "epoch": 0.2517527087316762, - "grad_norm": 23.194894790649414, - "kl": 0.07080078125, - "learning_rate": 7.482472912683237e-07, - "loss": 0.0028, - "reward": 1.4636067152023315, - "reward_std": 0.04804643988609314, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.33860671520233154, - "step": 790 - }, - { - "completion_length": 123.953125, - "epoch": 0.25207138304652643, - "grad_norm": 40.468902587890625, - "kl": 0.15234375, - "learning_rate": 7.479286169534736e-07, - "loss": 0.0061, - "reward": 1.4913312196731567, - "reward_std": 0.1270146518945694, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4757062494754791, - "step": 791 - }, - { - "completion_length": 18.625, - "epoch": 0.25239005736137665, - "grad_norm": 27.677337646484375, - "kl": 0.255859375, - "learning_rate": 7.476099426386234e-07, - "loss": 0.0103, - "reward": 1.3575646877288818, - "reward_std": 0.057783663272857666, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35756468772888184, - "rewards/pad": 0.0, - "step": 792 - }, - { - "completion_length": 122.171875, - "epoch": 0.25270873167622687, - "grad_norm": 113.33208465576172, - "kl": 0.11572265625, - "learning_rate": 7.472912683237731e-07, - "loss": 0.0046, - "reward": 1.457079529762268, - "reward_std": 0.05201934278011322, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45707952976226807, - "step": 793 - }, - { - "completion_length": 126.421875, - "epoch": 0.2530274059910771, - "grad_norm": 24.9990177154541, - "kl": 0.10498046875, - "learning_rate": 7.469725940089229e-07, - "loss": 0.0042, - "reward": 1.466892957687378, - "reward_std": 0.11991224437952042, - "rewards/answer_reward": 0.09375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.37314292788505554, - "step": 794 - }, - { - "completion_length": 151.875, - "epoch": 0.25334608030592737, - "grad_norm": 20.538066864013672, - "kl": 0.08056640625, - "learning_rate": 7.466539196940727e-07, - "loss": 0.0032, - "reward": 1.6273270845413208, - "reward_std": 0.061867453157901764, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3773270845413208, - "step": 795 - }, - { - "completion_length": 121.9375, - "epoch": 0.2536647546207776, - "grad_norm": 58.47627639770508, - "kl": 0.11474609375, - "learning_rate": 7.463352453792225e-07, - "loss": 0.0046, - "reward": 1.5742998123168945, - "reward_std": 0.03341483324766159, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5742998719215393, - "rewards/pad": 0.0, - "step": 796 - }, - { - "completion_length": 122.8125, - "epoch": 0.2539834289356278, - "grad_norm": 19.176841735839844, - "kl": 0.130859375, - "learning_rate": 7.460165710643722e-07, - "loss": 0.0052, - "reward": 1.4974907636642456, - "reward_std": 0.04454575479030609, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49749070405960083, - "rewards/pad": 0.0, - "step": 797 - }, - { - "completion_length": 46.171875, - "epoch": 0.25430210325047803, - "grad_norm": 24.082975387573242, - "kl": 0.1884765625, - "learning_rate": 7.45697896749522e-07, - "loss": 0.0075, - "reward": 1.5651308298110962, - "reward_std": 0.09099891781806946, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4401308000087738, - "step": 798 - }, - { - "completion_length": 72.25, - "epoch": 0.25462077756532825, - "grad_norm": 53.692752838134766, - "kl": 0.1220703125, - "learning_rate": 7.453792224346718e-07, - "loss": 0.0049, - "reward": 1.537758231163025, - "reward_std": 0.09440574795007706, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4283832907676697, - "rewards/pad": 0.109375, - "step": 799 - }, - { - "completion_length": 96.328125, - "epoch": 0.25493945188017847, - "grad_norm": 74.38345336914062, - "kl": 0.1123046875, - "learning_rate": 7.450605481198216e-07, - "loss": 0.0045, - "reward": 1.6785751581192017, - "reward_std": 0.11030935496091843, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5535751581192017, - "rewards/pad": 0.125, - "step": 800 - }, - { - "completion_length": 95.84375, - "epoch": 0.2552581261950287, - "grad_norm": 30.01809310913086, - "kl": 0.123046875, - "learning_rate": 7.447418738049713e-07, - "loss": 0.0049, - "reward": 1.506225347518921, - "reward_std": 0.08522427827119827, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5062253475189209, - "rewards/pad": 0.0, - "step": 801 - }, - { - "completion_length": 68.71875, - "epoch": 0.2555768005098789, - "grad_norm": 36.41709899902344, - "kl": 0.30078125, - "learning_rate": 7.444231994901211e-07, - "loss": 0.012, - "reward": 1.4630727767944336, - "reward_std": 0.0720161646604538, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.463072806596756, - "rewards/pad": 0.0, - "step": 802 - }, - { - "completion_length": 71.28125, - "epoch": 0.25589547482472913, - "grad_norm": 70.04369354248047, - "kl": 0.1279296875, - "learning_rate": 7.441045251752709e-07, - "loss": 0.0051, - "reward": 1.5551037788391113, - "reward_std": 0.09400936961174011, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43010368943214417, - "step": 803 - }, - { - "completion_length": 122.703125, - "epoch": 0.25621414913957935, - "grad_norm": 89.90359497070312, - "kl": 0.109375, - "learning_rate": 7.437858508604206e-07, - "loss": 0.0044, - "reward": 1.509391188621521, - "reward_std": 0.08077703416347504, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.509391188621521, - "rewards/pad": 0.0, - "step": 804 - }, - { - "completion_length": 72.671875, - "epoch": 0.25653282345442957, - "grad_norm": 76.97059631347656, - "kl": 0.12353515625, - "learning_rate": 7.434671765455703e-07, - "loss": 0.0049, - "reward": 1.4971883296966553, - "reward_std": 0.14855819940567017, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.38781341910362244, - "rewards/pad": 0.125, - "step": 805 - }, - { - "completion_length": 96.84375, - "epoch": 0.2568514977692798, - "grad_norm": 20.679262161254883, - "kl": 0.1884765625, - "learning_rate": 7.431485022307201e-07, - "loss": 0.0075, - "reward": 1.5170658826828003, - "reward_std": 0.07145309448242188, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5170659422874451, - "step": 806 - }, - { - "completion_length": 152.4375, - "epoch": 0.25717017208413, - "grad_norm": 15.460671424865723, - "kl": 0.062255859375, - "learning_rate": 7.428298279158699e-07, - "loss": 0.0025, - "reward": 1.4074444770812988, - "reward_std": 0.09351341426372528, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.423069566488266, - "step": 807 - }, - { - "completion_length": 227.65625, - "epoch": 0.25748884639898023, - "grad_norm": 1.5667763948440552, - "kl": 0.0279541015625, - "learning_rate": 7.425111536010196e-07, - "loss": 0.0011, - "reward": 1.2311536073684692, - "reward_std": 0.08234579861164093, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.24677860736846924, - "step": 808 - }, - { - "completion_length": 45.359375, - "epoch": 0.25780752071383045, - "grad_norm": 122.13186645507812, - "kl": 0.1298828125, - "learning_rate": 7.421924792861694e-07, - "loss": 0.0052, - "reward": 1.8012471199035645, - "reward_std": 0.09172539412975311, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.42624717950820923, - "step": 809 - }, - { - "completion_length": 96.296875, - "epoch": 0.25812619502868067, - "grad_norm": 26.060197830200195, - "kl": 0.11181640625, - "learning_rate": 7.418738049713192e-07, - "loss": 0.0045, - "reward": 1.5975518226623535, - "reward_std": 0.08361221849918365, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5038020014762878, - "step": 810 - }, - { - "completion_length": 98.15625, - "epoch": 0.2584448693435309, - "grad_norm": 31.350692749023438, - "kl": 0.134765625, - "learning_rate": 7.415551306564691e-07, - "loss": 0.0054, - "reward": 1.492019772529602, - "reward_std": 0.0747697651386261, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49201980233192444, - "rewards/pad": 0.0, - "step": 811 - }, - { - "completion_length": 71.140625, - "epoch": 0.2587635436583811, - "grad_norm": 73.38872528076172, - "kl": 0.125, - "learning_rate": 7.412364563416188e-07, - "loss": 0.005, - "reward": 1.751183271408081, - "reward_std": 0.06803754717111588, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.626183271408081, - "step": 812 - }, - { - "completion_length": 46.9375, - "epoch": 0.25908221797323133, - "grad_norm": 83.22901153564453, - "kl": 0.16796875, - "learning_rate": 7.409177820267686e-07, - "loss": 0.0067, - "reward": 1.6288355588912964, - "reward_std": 0.1803748458623886, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5194604992866516, - "rewards/pad": 0.109375, - "step": 813 - }, - { - "completion_length": 72.671875, - "epoch": 0.2594008922880816, - "grad_norm": 153.54087829589844, - "kl": 0.13671875, - "learning_rate": 7.405991077119184e-07, - "loss": 0.0055, - "reward": 1.5559117794036865, - "reward_std": 0.13691410422325134, - "rewards/pad": 0.171875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3840367794036865, - "step": 814 - }, - { - "completion_length": 98.59375, - "epoch": 0.25971956660293183, - "grad_norm": 33.62237548828125, - "kl": 0.1328125, - "learning_rate": 7.402804333970682e-07, - "loss": 0.0053, - "reward": 1.7688663005828857, - "reward_std": 0.04326247423887253, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.643866240978241, - "rewards/pad": 0.125, - "step": 815 - }, - { - "completion_length": 172.515625, - "epoch": 0.26003824091778205, - "grad_norm": 67.81851959228516, - "kl": 0.0830078125, - "learning_rate": 7.399617590822179e-07, - "loss": 0.0033, - "reward": 1.5516440868377686, - "reward_std": 0.05654784291982651, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.426643967628479, - "step": 816 - }, - { - "completion_length": 149.671875, - "epoch": 0.26035691523263227, - "grad_norm": 25.504825592041016, - "kl": 0.07763671875, - "learning_rate": 7.396430847673677e-07, - "loss": 0.0031, - "reward": 1.3726608753204346, - "reward_std": 0.02660282514989376, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37266087532043457, - "step": 817 - }, - { - "completion_length": 71.375, - "epoch": 0.2606755895474825, - "grad_norm": 20.733476638793945, - "kl": 0.228515625, - "learning_rate": 7.393244104525175e-07, - "loss": 0.0091, - "reward": 1.8035805225372314, - "reward_std": 0.10151496529579163, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5535804629325867, - "rewards/pad": 0.25, - "step": 818 - }, - { - "completion_length": 97.46875, - "epoch": 0.2609942638623327, - "grad_norm": 45.76640319824219, - "kl": 0.16015625, - "learning_rate": 7.390057361376673e-07, - "loss": 0.0064, - "reward": 1.468562364578247, - "reward_std": 0.04567372053861618, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4685623049736023, - "rewards/pad": 0.0, - "step": 819 - }, - { - "completion_length": 101.09375, - "epoch": 0.26131293817718293, - "grad_norm": 44.104248046875, - "kl": 0.142578125, - "learning_rate": 7.38687061822817e-07, - "loss": 0.0057, - "reward": 1.7732608318328857, - "reward_std": 0.03834523260593414, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3982608914375305, - "step": 820 - }, - { - "completion_length": 100.40625, - "epoch": 0.26163161249203315, - "grad_norm": 84.38065338134766, - "kl": 0.09033203125, - "learning_rate": 7.383683875079668e-07, - "loss": 0.0036, - "reward": 1.8088653087615967, - "reward_std": 0.12013958394527435, - "rewards/pad": 0.3125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4963652491569519, - "step": 821 - }, - { - "completion_length": 69.9375, - "epoch": 0.26195028680688337, - "grad_norm": 61.65689468383789, - "kl": 0.25390625, - "learning_rate": 7.380497131931166e-07, - "loss": 0.0102, - "reward": 1.5655962228775024, - "reward_std": 0.07943125814199448, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44059622287750244, - "step": 822 - }, - { - "completion_length": 96.5625, - "epoch": 0.2622689611217336, - "grad_norm": 35.45212936401367, - "kl": 0.3828125, - "learning_rate": 7.377310388782664e-07, - "loss": 0.0153, - "reward": 1.6489589214324951, - "reward_std": 0.035216521471738815, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6489588618278503, - "rewards/pad": 0.0, - "step": 823 - }, - { - "completion_length": 151.34375, - "epoch": 0.2625876354365838, - "grad_norm": 18.12538719177246, - "kl": 0.0634765625, - "learning_rate": 7.374123645634161e-07, - "loss": 0.0025, - "reward": 1.692170262336731, - "reward_std": 0.0424518883228302, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5671701431274414, - "step": 824 - }, - { - "completion_length": 74.03125, - "epoch": 0.26290630975143403, - "grad_norm": 138.32925415039062, - "kl": 0.1396484375, - "learning_rate": 7.370936902485659e-07, - "loss": 0.0056, - "reward": 1.632441759109497, - "reward_std": 0.0912943109869957, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5074419379234314, - "rewards/pad": 0.125, - "step": 825 - }, - { - "completion_length": 126.109375, - "epoch": 0.26322498406628425, - "grad_norm": 28.759326934814453, - "kl": 0.10205078125, - "learning_rate": 7.367750159337157e-07, - "loss": 0.0041, - "reward": 1.5843374729156494, - "reward_std": 0.06639618426561356, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45933741331100464, - "step": 826 - }, - { - "completion_length": 96.734375, - "epoch": 0.26354365838113447, - "grad_norm": 58.95811462402344, - "kl": 0.107421875, - "learning_rate": 7.364563416188655e-07, - "loss": 0.0043, - "reward": 1.32325279712677, - "reward_std": 0.03803478926420212, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3232528269290924, - "rewards/pad": 0.0, - "step": 827 - }, - { - "completion_length": 98.109375, - "epoch": 0.2638623326959847, - "grad_norm": 28.35196304321289, - "kl": 0.1337890625, - "learning_rate": 7.361376673040152e-07, - "loss": 0.0054, - "reward": 1.5631272792816162, - "reward_std": 0.04592440277338028, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5631272792816162, - "rewards/pad": 0.0, - "step": 828 - }, - { - "completion_length": 44.65625, - "epoch": 0.2641810070108349, - "grad_norm": 42.16221237182617, - "kl": 0.1748046875, - "learning_rate": 7.35818992989165e-07, - "loss": 0.007, - "reward": 1.7024767398834229, - "reward_std": 0.05674516409635544, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5774767994880676, - "rewards/pad": 0.125, - "step": 829 - }, - { - "completion_length": 176.4375, - "epoch": 0.26449968132568513, - "grad_norm": 92.30027770996094, - "kl": 0.08935546875, - "learning_rate": 7.355003186743149e-07, - "loss": 0.0036, - "reward": 1.4316385984420776, - "reward_std": 0.10597049444913864, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.32226359844207764, - "rewards/pad": 0.125, - "step": 830 - }, - { - "completion_length": 146.125, - "epoch": 0.26481835564053535, - "grad_norm": 36.19331359863281, - "kl": 0.126953125, - "learning_rate": 7.351816443594647e-07, - "loss": 0.0051, - "reward": 1.4525105953216553, - "reward_std": 0.08761756867170334, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4525105357170105, - "rewards/pad": 0.0, - "step": 831 - }, - { - "completion_length": 99.609375, - "epoch": 0.2651370299553856, - "grad_norm": 43.91147232055664, - "kl": 0.14453125, - "learning_rate": 7.348629700446144e-07, - "loss": 0.0058, - "reward": 1.6105023622512817, - "reward_std": 0.0791240856051445, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5011274218559265, - "step": 832 - }, - { - "completion_length": 150.46875, - "epoch": 0.2654557042702358, - "grad_norm": 13.669109344482422, - "kl": 0.10302734375, - "learning_rate": 7.345442957297642e-07, - "loss": 0.0041, - "reward": 1.4903199672698975, - "reward_std": 0.0454377755522728, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4903199076652527, - "rewards/pad": 0.0, - "step": 833 - }, - { - "completion_length": 45.953125, - "epoch": 0.26577437858508607, - "grad_norm": 64.69865417480469, - "kl": 0.203125, - "learning_rate": 7.34225621414914e-07, - "loss": 0.0081, - "reward": 1.657322883605957, - "reward_std": 0.054739683866500854, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.532322883605957, - "step": 834 - }, - { - "completion_length": 45.140625, - "epoch": 0.2660930528999363, - "grad_norm": 32.49448776245117, - "kl": 0.1376953125, - "learning_rate": 7.339069471000637e-07, - "loss": 0.0055, - "reward": 1.6843942403793335, - "reward_std": 0.05278486758470535, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.559394121170044, - "rewards/pad": 0.125, - "step": 835 - }, - { - "completion_length": 150.9375, - "epoch": 0.2664117272147865, - "grad_norm": 10.046256065368652, - "kl": 0.06298828125, - "learning_rate": 7.335882727852135e-07, - "loss": 0.0025, - "reward": 1.5606993436813354, - "reward_std": 0.057660944759845734, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43569934368133545, - "step": 836 - }, - { - "completion_length": 95.84375, - "epoch": 0.26673040152963673, - "grad_norm": 21.493789672851562, - "kl": 0.16796875, - "learning_rate": 7.332695984703633e-07, - "loss": 0.0067, - "reward": 1.5020877122879028, - "reward_std": 0.053312573581933975, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5020877718925476, - "step": 837 - }, - { - "completion_length": 127.125, - "epoch": 0.26704907584448695, - "grad_norm": 40.390464782714844, - "kl": 0.08154296875, - "learning_rate": 7.329509241555131e-07, - "loss": 0.0033, - "reward": 1.6771814823150635, - "reward_std": 0.05436215549707413, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42718154191970825, - "rewards/pad": 0.25, - "step": 838 - }, - { - "completion_length": 97.796875, - "epoch": 0.26736775015933717, - "grad_norm": 33.54790115356445, - "kl": 0.12890625, - "learning_rate": 7.326322498406628e-07, - "loss": 0.0051, - "reward": 1.6265747547149658, - "reward_std": 0.031377844512462616, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6265748143196106, - "rewards/pad": 0.0, - "step": 839 - }, - { - "completion_length": 125.875, - "epoch": 0.2676864244741874, - "grad_norm": 120.50456237792969, - "kl": 0.0810546875, - "learning_rate": 7.323135755258126e-07, - "loss": 0.0032, - "reward": 1.562516212463379, - "reward_std": 0.11323312669992447, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.42189115285873413, - "step": 840 - }, - { - "completion_length": 97.3125, - "epoch": 0.2680050987890376, - "grad_norm": 15.057744026184082, - "kl": 0.1298828125, - "learning_rate": 7.319949012109624e-07, - "loss": 0.0052, - "reward": 1.3758224248886108, - "reward_std": 0.02350495010614395, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37582242488861084, - "step": 841 - }, - { - "completion_length": 72.015625, - "epoch": 0.26832377310388783, - "grad_norm": 23.781997680664062, - "kl": 0.2236328125, - "learning_rate": 7.316762268961122e-07, - "loss": 0.0089, - "reward": 1.5852105617523193, - "reward_std": 0.09232565760612488, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4758354425430298, - "rewards/pad": 0.109375, - "step": 842 - }, - { - "completion_length": 73.34375, - "epoch": 0.26864244741873805, - "grad_norm": 40.67388916015625, - "kl": 0.1728515625, - "learning_rate": 7.313575525812619e-07, - "loss": 0.0069, - "reward": 1.6226885318756104, - "reward_std": 0.08350202441215515, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3883134126663208, - "step": 843 - }, - { - "completion_length": 72.21875, - "epoch": 0.26896112173358827, - "grad_norm": 62.01053237915039, - "kl": 0.12255859375, - "learning_rate": 7.310388782664116e-07, - "loss": 0.0049, - "reward": 1.8127069473266602, - "reward_std": 0.07875151187181473, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5627070069313049, - "step": 844 - }, - { - "completion_length": 96.5625, - "epoch": 0.2692797960484385, - "grad_norm": 26.037363052368164, - "kl": 0.2177734375, - "learning_rate": 7.307202039515614e-07, - "loss": 0.0087, - "reward": 1.4820842742919922, - "reward_std": 0.0368165522813797, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.482084184885025, - "rewards/pad": 0.0, - "step": 845 - }, - { - "completion_length": 148.109375, - "epoch": 0.2695984703632887, - "grad_norm": 32.60529708862305, - "kl": 0.0791015625, - "learning_rate": 7.304015296367112e-07, - "loss": 0.0032, - "reward": 1.5095293521881104, - "reward_std": 0.07287216186523438, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5095293521881104, - "rewards/pad": 0.0, - "step": 846 - }, - { - "completion_length": 96.15625, - "epoch": 0.26991714467813893, - "grad_norm": 40.325313568115234, - "kl": 0.171875, - "learning_rate": 7.300828553218609e-07, - "loss": 0.0069, - "reward": 1.5568125247955322, - "reward_std": 0.07620823383331299, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5568124651908875, - "step": 847 - }, - { - "completion_length": 123.203125, - "epoch": 0.27023581899298915, - "grad_norm": 72.05610656738281, - "kl": 0.1494140625, - "learning_rate": 7.297641810070108e-07, - "loss": 0.006, - "reward": 1.379019021987915, - "reward_std": 0.04552176594734192, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3790189027786255, - "step": 848 - }, - { - "completion_length": 71.984375, - "epoch": 0.27055449330783937, - "grad_norm": 31.67304039001465, - "kl": 0.1259765625, - "learning_rate": 7.294455066921606e-07, - "loss": 0.005, - "reward": 1.5310370922088623, - "reward_std": 0.044179461896419525, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4060371518135071, - "step": 849 - }, - { - "completion_length": 70.65625, - "epoch": 0.2708731676226896, - "grad_norm": 36.889957427978516, - "kl": 0.1513671875, - "learning_rate": 7.291268323773104e-07, - "loss": 0.0061, - "reward": 1.602161169052124, - "reward_std": 0.11161001771688461, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6021612286567688, - "rewards/pad": 0.0, - "step": 850 - }, - { - "completion_length": 96.84375, - "epoch": 0.2711918419375398, - "grad_norm": 35.18388366699219, - "kl": 0.1357421875, - "learning_rate": 7.288081580624601e-07, - "loss": 0.0054, - "reward": 1.5036709308624268, - "reward_std": 0.06766320019960403, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5036708116531372, - "step": 851 - }, - { - "completion_length": 93.984375, - "epoch": 0.27151051625239003, - "grad_norm": 127.4382553100586, - "kl": 1.5390625, - "learning_rate": 7.284894837476099e-07, - "loss": 0.0615, - "reward": 1.473995566368103, - "reward_std": 0.09904636442661285, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.36462053656578064, - "rewards/pad": 0.125, - "step": 852 - }, - { - "completion_length": 69.3125, - "epoch": 0.27182919056724025, - "grad_norm": 42.15987777709961, - "kl": 0.1513671875, - "learning_rate": 7.281708094327597e-07, - "loss": 0.0061, - "reward": 1.719491958618164, - "reward_std": 0.05202781781554222, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7194918990135193, - "rewards/pad": 0.0, - "step": 853 - }, - { - "completion_length": 44.765625, - "epoch": 0.27214786488209053, - "grad_norm": 56.77442169189453, - "kl": 0.2177734375, - "learning_rate": 7.278521351179095e-07, - "loss": 0.0087, - "reward": 1.592527151107788, - "reward_std": 0.03890862315893173, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5925271511077881, - "rewards/pad": 0.0, - "step": 854 - }, - { - "completion_length": 71.765625, - "epoch": 0.27246653919694075, - "grad_norm": 78.89741516113281, - "kl": 0.2138671875, - "learning_rate": 7.275334608030592e-07, - "loss": 0.0086, - "reward": 1.6108283996582031, - "reward_std": 0.14424830675125122, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5327032804489136, - "step": 855 - }, - { - "completion_length": 71.15625, - "epoch": 0.27278521351179097, - "grad_norm": 23.657489776611328, - "kl": 0.298828125, - "learning_rate": 7.27214786488209e-07, - "loss": 0.012, - "reward": 1.430689811706543, - "reward_std": 0.0537835955619812, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43068990111351013, - "rewards/pad": 0.0, - "step": 856 - }, - { - "completion_length": 44.703125, - "epoch": 0.2731038878266412, - "grad_norm": 31.861421585083008, - "kl": 0.271484375, - "learning_rate": 7.268961121733588e-07, - "loss": 0.0108, - "reward": 1.5638333559036255, - "reward_std": 0.06311056017875671, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5638333559036255, - "step": 857 - }, - { - "completion_length": 124.09375, - "epoch": 0.2734225621414914, - "grad_norm": 16.415027618408203, - "kl": 0.10693359375, - "learning_rate": 7.265774378585086e-07, - "loss": 0.0043, - "reward": 1.59592604637146, - "reward_std": 0.05373538285493851, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.59592604637146, - "rewards/pad": 0.0, - "step": 858 - }, - { - "completion_length": 45.921875, - "epoch": 0.27374123645634163, - "grad_norm": 86.9303207397461, - "kl": 0.2490234375, - "learning_rate": 7.262587635436583e-07, - "loss": 0.0099, - "reward": 1.4179108142852783, - "reward_std": 0.07070551812648773, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2929109036922455, - "rewards/pad": 0.125, - "step": 859 - }, - { - "completion_length": 121.875, - "epoch": 0.27405991077119185, - "grad_norm": 39.70486068725586, - "kl": 0.140625, - "learning_rate": 7.259400892288081e-07, - "loss": 0.0056, - "reward": 1.5685955286026, - "reward_std": 0.04470613971352577, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5685955286026001, - "step": 860 - }, - { - "completion_length": 45.171875, - "epoch": 0.27437858508604207, - "grad_norm": 56.10869598388672, - "kl": 0.150390625, - "learning_rate": 7.256214149139579e-07, - "loss": 0.006, - "reward": 1.7189316749572754, - "reward_std": 0.0561370775103569, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5939316749572754, - "rewards/pad": 0.125, - "step": 861 - }, - { - "completion_length": 96.03125, - "epoch": 0.2746972594008923, - "grad_norm": 57.85542297363281, - "kl": 0.1044921875, - "learning_rate": 7.253027405991076e-07, - "loss": 0.0042, - "reward": 1.52811861038208, - "reward_std": 0.06384322047233582, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5281186103820801, - "rewards/pad": 0.0, - "step": 862 - }, - { - "completion_length": 73.0625, - "epoch": 0.2750159337157425, - "grad_norm": 117.91090393066406, - "kl": 0.1845703125, - "learning_rate": 7.249840662842574e-07, - "loss": 0.0074, - "reward": 1.769296407699585, - "reward_std": 0.07024556398391724, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.519296407699585, - "rewards/pad": 0.25, - "step": 863 - }, - { - "completion_length": 72.734375, - "epoch": 0.27533460803059273, - "grad_norm": 28.344497680664062, - "kl": 0.166015625, - "learning_rate": 7.246653919694072e-07, - "loss": 0.0066, - "reward": 1.793961763381958, - "reward_std": 0.07941886782646179, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.543961763381958, - "rewards/pad": 0.25, - "step": 864 - }, - { - "completion_length": 124.421875, - "epoch": 0.27565328234544295, - "grad_norm": 25.100507736206055, - "kl": 0.0849609375, - "learning_rate": 7.24346717654557e-07, - "loss": 0.0034, - "reward": 1.7098156213760376, - "reward_std": 0.051566511392593384, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3348156809806824, - "step": 865 - }, - { - "completion_length": 72.515625, - "epoch": 0.27597195666029317, - "grad_norm": 54.915462493896484, - "kl": 0.10302734375, - "learning_rate": 7.240280433397067e-07, - "loss": 0.0041, - "reward": 1.5807334184646606, - "reward_std": 0.09976384043693542, - "rewards/answer_reward": 0.171875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.40885844826698303, - "step": 866 - }, - { - "completion_length": 121.40625, - "epoch": 0.2762906309751434, - "grad_norm": 19.998388290405273, - "kl": 0.169921875, - "learning_rate": 7.237093690248566e-07, - "loss": 0.0068, - "reward": 1.4986286163330078, - "reward_std": 0.0763099268078804, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49862855672836304, - "step": 867 - }, - { - "completion_length": 97.390625, - "epoch": 0.2766093052899936, - "grad_norm": 19.720415115356445, - "kl": 0.11572265625, - "learning_rate": 7.233906947100064e-07, - "loss": 0.0046, - "reward": 1.6597509384155273, - "reward_std": 0.04936613887548447, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4097510576248169, - "step": 868 - }, - { - "completion_length": 97.96875, - "epoch": 0.27692797960484383, - "grad_norm": 26.631547927856445, - "kl": 0.134765625, - "learning_rate": 7.230720203951562e-07, - "loss": 0.0054, - "reward": 1.6074879169464111, - "reward_std": 0.05781853571534157, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3574879467487335, - "rewards/pad": 0.25, - "step": 869 - }, - { - "completion_length": 18.40625, - "epoch": 0.27724665391969405, - "grad_norm": 75.75119018554688, - "kl": 0.201171875, - "learning_rate": 7.227533460803059e-07, - "loss": 0.0081, - "reward": 1.5645930767059326, - "reward_std": 0.1367059350013733, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5645930767059326, - "rewards/pad": 0.0, - "step": 870 - }, - { - "completion_length": 45.1875, - "epoch": 0.2775653282345443, - "grad_norm": 40.70870590209961, - "kl": 0.267578125, - "learning_rate": 7.224346717654557e-07, - "loss": 0.0107, - "reward": 1.689696192741394, - "reward_std": 0.05542924627661705, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5646960735321045, - "rewards/pad": 0.125, - "step": 871 - }, - { - "completion_length": 94.90625, - "epoch": 0.2778840025493945, - "grad_norm": 26.831253051757812, - "kl": 0.259765625, - "learning_rate": 7.221159974506055e-07, - "loss": 0.0104, - "reward": 1.406510829925537, - "reward_std": 0.0437244214117527, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4065108895301819, - "rewards/pad": 0.0, - "step": 872 - }, - { - "completion_length": 92.234375, - "epoch": 0.27820267686424477, - "grad_norm": 33.70995330810547, - "kl": 0.171875, - "learning_rate": 7.217973231357553e-07, - "loss": 0.0069, - "reward": 1.5849461555480957, - "reward_std": 0.07326267659664154, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4599460959434509, - "rewards/pad": 0.125, - "step": 873 - }, - { - "completion_length": 98.453125, - "epoch": 0.278521351179095, - "grad_norm": 24.882591247558594, - "kl": 0.1767578125, - "learning_rate": 7.21478648820905e-07, - "loss": 0.0071, - "reward": 1.6644113063812256, - "reward_std": 0.11803185939788818, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5550362467765808, - "step": 874 - }, - { - "completion_length": 69.03125, - "epoch": 0.2788400254939452, - "grad_norm": 20.49930191040039, - "kl": 0.2734375, - "learning_rate": 7.211599745060548e-07, - "loss": 0.0109, - "reward": 1.5971676111221313, - "reward_std": 0.06809830665588379, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5971676111221313, - "rewards/pad": 0.0, - "step": 875 - }, - { - "completion_length": 114.796875, - "epoch": 0.27915869980879543, - "grad_norm": 31.659582138061523, - "kl": 0.1357421875, - "learning_rate": 7.208413001912046e-07, - "loss": 0.0054, - "reward": 1.5567210912704468, - "reward_std": 0.04574970528483391, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5567210912704468, - "rewards/pad": 0.0, - "step": 876 - }, - { - "completion_length": 121.578125, - "epoch": 0.27947737412364565, - "grad_norm": 13.323171615600586, - "kl": 0.20703125, - "learning_rate": 7.205226258763544e-07, - "loss": 0.0083, - "reward": 1.4455690383911133, - "reward_std": 0.02520911768078804, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4455690085887909, - "rewards/pad": 0.0, - "step": 877 - }, - { - "completion_length": 121.84375, - "epoch": 0.27979604843849587, - "grad_norm": 81.34120178222656, - "kl": 0.1240234375, - "learning_rate": 7.202039515615041e-07, - "loss": 0.005, - "reward": 1.5624570846557617, - "reward_std": 0.19658038020133972, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4218320846557617, - "rewards/pad": 0.15625, - "step": 878 - }, - { - "completion_length": 120.375, - "epoch": 0.2801147227533461, - "grad_norm": 20.47405242919922, - "kl": 0.11865234375, - "learning_rate": 7.198852772466539e-07, - "loss": 0.0047, - "reward": 1.713339924812317, - "reward_std": 0.04620824754238129, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5883399248123169, - "step": 879 - }, - { - "completion_length": 123.046875, - "epoch": 0.2804333970681963, - "grad_norm": 10.733762741088867, - "kl": 0.15625, - "learning_rate": 7.195666029318037e-07, - "loss": 0.0063, - "reward": 1.3499104976654053, - "reward_std": 0.020909493789076805, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2249104231595993, - "step": 880 - }, - { - "completion_length": 120.375, - "epoch": 0.28075207138304653, - "grad_norm": 24.233781814575195, - "kl": 0.146484375, - "learning_rate": 7.192479286169535e-07, - "loss": 0.0058, - "reward": 1.4725770950317383, - "reward_std": 0.09297582507133484, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47257712483406067, - "step": 881 - }, - { - "completion_length": 95.921875, - "epoch": 0.28107074569789675, - "grad_norm": 11.09907054901123, - "kl": 0.1025390625, - "learning_rate": 7.189292543021032e-07, - "loss": 0.0041, - "reward": 1.7779861688613892, - "reward_std": 0.06479617953300476, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40298622846603394, - "step": 882 - }, - { - "completion_length": 70.9375, - "epoch": 0.28138942001274697, - "grad_norm": 82.33478546142578, - "kl": 0.1689453125, - "learning_rate": 7.186105799872529e-07, - "loss": 0.0067, - "reward": 1.7681920528411865, - "reward_std": 0.054539378732442856, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5181920528411865, - "rewards/pad": 0.25, - "step": 883 - }, - { - "completion_length": 118.875, - "epoch": 0.2817080943275972, - "grad_norm": 22.16440200805664, - "kl": 0.1201171875, - "learning_rate": 7.182919056724027e-07, - "loss": 0.0048, - "reward": 1.3002312183380127, - "reward_std": 0.03425529599189758, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3002312481403351, - "step": 884 - }, - { - "completion_length": 68.15625, - "epoch": 0.2820267686424474, - "grad_norm": 30.926509857177734, - "kl": 0.34765625, - "learning_rate": 7.179732313575525e-07, - "loss": 0.0139, - "reward": 1.5398776531219482, - "reward_std": 0.03936472162604332, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5398776531219482, - "rewards/pad": 0.0, - "step": 885 - }, - { - "completion_length": 94.421875, - "epoch": 0.28234544295729763, - "grad_norm": 46.35068130493164, - "kl": 0.16796875, - "learning_rate": 7.176545570427023e-07, - "loss": 0.0067, - "reward": 1.3887062072753906, - "reward_std": 0.1097392737865448, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3887063264846802, - "rewards/pad": 0.0, - "step": 886 - }, - { - "completion_length": 69.5, - "epoch": 0.28266411727214785, - "grad_norm": 26.24706268310547, - "kl": 0.1689453125, - "learning_rate": 7.173358827278521e-07, - "loss": 0.0067, - "reward": 1.6714463233947754, - "reward_std": 0.10547646135091782, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5620713233947754, - "step": 887 - }, - { - "completion_length": 147.4375, - "epoch": 0.2829827915869981, - "grad_norm": 10.859973907470703, - "kl": 0.11279296875, - "learning_rate": 7.170172084130019e-07, - "loss": 0.0045, - "reward": 1.453810691833496, - "reward_std": 0.03332022577524185, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45381075143814087, - "rewards/pad": 0.0, - "step": 888 - }, - { - "completion_length": 46.78125, - "epoch": 0.2833014659018483, - "grad_norm": 49.23662185668945, - "kl": 0.2734375, - "learning_rate": 7.166985340981517e-07, - "loss": 0.0109, - "reward": 1.8417705297470093, - "reward_std": 0.1342671513557434, - "rewards/answer_reward": 0.34375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4980205297470093, - "step": 889 - }, - { - "completion_length": 70.390625, - "epoch": 0.2836201402166985, - "grad_norm": 36.180294036865234, - "kl": 0.263671875, - "learning_rate": 7.163798597833014e-07, - "loss": 0.0106, - "reward": 1.3964661359786987, - "reward_std": 0.08337822556495667, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.39646613597869873, - "rewards/pad": 0.0, - "step": 890 - }, - { - "completion_length": 94.796875, - "epoch": 0.28393881453154873, - "grad_norm": 146.4197540283203, - "kl": 0.11474609375, - "learning_rate": 7.160611854684512e-07, - "loss": 0.0046, - "reward": 1.4267725944519043, - "reward_std": 0.11456167697906494, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.4423975348472595, - "step": 891 - }, - { - "completion_length": 97.171875, - "epoch": 0.28425748884639895, - "grad_norm": 39.37761688232422, - "kl": 0.1611328125, - "learning_rate": 7.15742511153601e-07, - "loss": 0.0065, - "reward": 1.6899974346160889, - "reward_std": 0.05695369839668274, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5649974346160889, - "step": 892 - }, - { - "completion_length": 21.546875, - "epoch": 0.28457616316124923, - "grad_norm": 43.16127014160156, - "kl": 0.30078125, - "learning_rate": 7.154238368387507e-07, - "loss": 0.012, - "reward": 1.8982596397399902, - "reward_std": 0.11875072121620178, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5388847589492798, - "rewards/pad": 0.359375, - "step": 893 - }, - { - "completion_length": 145.640625, - "epoch": 0.28489483747609945, - "grad_norm": 22.15288543701172, - "kl": 0.091796875, - "learning_rate": 7.151051625239005e-07, - "loss": 0.0037, - "reward": 1.303932785987854, - "reward_std": 0.03641955554485321, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.303932785987854, - "step": 894 - }, - { - "completion_length": 72.515625, - "epoch": 0.28521351179094967, - "grad_norm": 97.79877471923828, - "kl": 0.126953125, - "learning_rate": 7.147864882090503e-07, - "loss": 0.0051, - "reward": 1.45205557346344, - "reward_std": 0.054157327860593796, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3270556330680847, - "rewards/pad": 0.125, - "step": 895 - }, - { - "completion_length": 124.28125, - "epoch": 0.2855321861057999, - "grad_norm": 70.17988586425781, - "kl": 0.0830078125, - "learning_rate": 7.144678138942001e-07, - "loss": 0.0033, - "reward": 1.5320907831192017, - "reward_std": 0.07828199118375778, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40709084272384644, - "step": 896 - }, - { - "completion_length": 123.953125, - "epoch": 0.2858508604206501, - "grad_norm": 22.958566665649414, - "kl": 0.93359375, - "learning_rate": 7.141491395793498e-07, - "loss": 0.0374, - "reward": 1.4208877086639404, - "reward_std": 0.07943449914455414, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42088767886161804, - "rewards/pad": 0.0, - "step": 897 - }, - { - "completion_length": 72.609375, - "epoch": 0.28616953473550033, - "grad_norm": 18.273794174194336, - "kl": 0.10595703125, - "learning_rate": 7.138304652644996e-07, - "loss": 0.0042, - "reward": 1.5178472995758057, - "reward_std": 0.12105812132358551, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.40847232937812805, - "rewards/pad": 0.125, - "step": 898 - }, - { - "completion_length": 122.671875, - "epoch": 0.28648820905035055, - "grad_norm": 34.814266204833984, - "kl": 0.1103515625, - "learning_rate": 7.135117909496494e-07, - "loss": 0.0044, - "reward": 1.6072380542755127, - "reward_std": 0.0331178680062294, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48223793506622314, - "step": 899 - }, - { - "completion_length": 69.875, - "epoch": 0.28680688336520077, - "grad_norm": 83.57713317871094, - "kl": 0.1455078125, - "learning_rate": 7.131931166347992e-07, - "loss": 0.0058, - "reward": 1.6023356914520264, - "reward_std": 0.12720996141433716, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4617108106613159, - "step": 900 - }, - { - "completion_length": 95.203125, - "epoch": 0.287125557680051, - "grad_norm": 64.39884185791016, - "kl": 0.08154296875, - "learning_rate": 7.128744423199489e-07, - "loss": 0.0033, - "reward": 1.5819507837295532, - "reward_std": 0.10809376835823059, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4569507837295532, - "step": 901 - }, - { - "completion_length": 96.875, - "epoch": 0.2874442319949012, - "grad_norm": 21.195661544799805, - "kl": 0.126953125, - "learning_rate": 7.125557680050987e-07, - "loss": 0.0051, - "reward": 1.4954230785369873, - "reward_std": 0.10859449207782745, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5110480785369873, - "rewards/pad": 0.0, - "step": 902 - }, - { - "completion_length": 173.125, - "epoch": 0.28776290630975143, - "grad_norm": 5.571096420288086, - "kl": 0.06787109375, - "learning_rate": 7.122370936902485e-07, - "loss": 0.0027, - "reward": 1.541063904762268, - "reward_std": 0.12838870286941528, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.43168896436691284, - "step": 903 - }, - { - "completion_length": 45.6875, - "epoch": 0.28808158062460165, - "grad_norm": 67.46749114990234, - "kl": 0.11083984375, - "learning_rate": 7.119184193753984e-07, - "loss": 0.0044, - "reward": 1.7282071113586426, - "reward_std": 0.08071044832468033, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6032070517539978, - "rewards/pad": 0.125, - "step": 904 - }, - { - "completion_length": 147.40625, - "epoch": 0.28840025493945187, - "grad_norm": 16.74541473388672, - "kl": 0.0634765625, - "learning_rate": 7.115997450605481e-07, - "loss": 0.0025, - "reward": 1.4114105701446533, - "reward_std": 0.09488704055547714, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4270356297492981, - "rewards/pad": 0.0, - "step": 905 - }, - { - "completion_length": 19.859375, - "epoch": 0.2887189292543021, - "grad_norm": 121.58633422851562, - "kl": 2.765625, - "learning_rate": 7.112810707456979e-07, - "loss": 0.1109, - "reward": 1.7666852474212646, - "reward_std": 0.18628299236297607, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5323102474212646, - "rewards/pad": 0.25, - "step": 906 - }, - { - "completion_length": 120.71875, - "epoch": 0.2890376035691523, - "grad_norm": 11.139300346374512, - "kl": 0.158203125, - "learning_rate": 7.109623964308477e-07, - "loss": 0.0063, - "reward": 1.453099012374878, - "reward_std": 0.09419392049312592, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4530990421772003, - "step": 907 - }, - { - "completion_length": 121.375, - "epoch": 0.28935627788400253, - "grad_norm": 22.76377296447754, - "kl": 0.1328125, - "learning_rate": 7.106437221159975e-07, - "loss": 0.0053, - "reward": 1.3690372705459595, - "reward_std": 0.08764107525348663, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.36903730034828186, - "rewards/pad": 0.0, - "step": 908 - }, - { - "completion_length": 97.6875, - "epoch": 0.28967495219885275, - "grad_norm": 36.15110397338867, - "kl": 0.1474609375, - "learning_rate": 7.103250478011472e-07, - "loss": 0.0059, - "reward": 1.514521837234497, - "reward_std": 0.09117848426103592, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5145218372344971, - "rewards/pad": 0.0, - "step": 909 - }, - { - "completion_length": 45.015625, - "epoch": 0.289993626513703, - "grad_norm": 43.60408020019531, - "kl": 0.212890625, - "learning_rate": 7.10006373486297e-07, - "loss": 0.0085, - "reward": 1.572979211807251, - "reward_std": 0.15550121665000916, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5729792714118958, - "rewards/pad": 0.0, - "step": 910 - }, - { - "completion_length": 123.15625, - "epoch": 0.2903123008285532, - "grad_norm": 5.848013877868652, - "kl": 0.1396484375, - "learning_rate": 7.096876991714468e-07, - "loss": 0.0056, - "reward": 1.5237162113189697, - "reward_std": 0.10056842863559723, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5237162113189697, - "step": 911 - }, - { - "completion_length": 74.34375, - "epoch": 0.29063097514340347, - "grad_norm": 28.616756439208984, - "kl": 0.1142578125, - "learning_rate": 7.093690248565966e-07, - "loss": 0.0046, - "reward": 1.6580133438110352, - "reward_std": 0.21618688106536865, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43926331400871277, - "step": 912 - }, - { - "completion_length": 71.71875, - "epoch": 0.2909496494582537, - "grad_norm": 10.530069351196289, - "kl": 0.1123046875, - "learning_rate": 7.090503505417463e-07, - "loss": 0.0045, - "reward": 1.6587841510772705, - "reward_std": 0.14223003387451172, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5494092702865601, - "rewards/pad": 0.125, - "step": 913 - }, - { - "completion_length": 128.546875, - "epoch": 0.2912683237731039, - "grad_norm": 24.983131408691406, - "kl": 0.1396484375, - "learning_rate": 7.087316762268961e-07, - "loss": 0.0056, - "reward": 1.6001564264297485, - "reward_std": 0.08532114326953888, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35015642642974854, - "rewards/pad": 0.25, - "step": 914 - }, - { - "completion_length": 154.4375, - "epoch": 0.29158699808795413, - "grad_norm": 12.255154609680176, - "kl": 0.1015625, - "learning_rate": 7.084130019120459e-07, - "loss": 0.0041, - "reward": 1.4581921100616455, - "reward_std": 0.10803060233592987, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45819202065467834, - "step": 915 - }, - { - "completion_length": 96.484375, - "epoch": 0.29190567240280435, - "grad_norm": 16.626344680786133, - "kl": 0.2734375, - "learning_rate": 7.080943275971957e-07, - "loss": 0.011, - "reward": 1.5853779315948486, - "reward_std": 0.09588365256786346, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5853779911994934, - "rewards/pad": 0.0, - "step": 916 - }, - { - "completion_length": 72.53125, - "epoch": 0.29222434671765457, - "grad_norm": 100.90467071533203, - "kl": 0.0986328125, - "learning_rate": 7.077756532823454e-07, - "loss": 0.004, - "reward": 1.6992861032485962, - "reward_std": 0.10677085071802139, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5742862224578857, - "rewards/pad": 0.125, - "step": 917 - }, - { - "completion_length": 72.609375, - "epoch": 0.2925430210325048, - "grad_norm": 55.456092834472656, - "kl": 0.1630859375, - "learning_rate": 7.074569789674952e-07, - "loss": 0.0065, - "reward": 1.8140954971313477, - "reward_std": 0.1802692860364914, - "rewards/answer_reward": 0.265625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5484704971313477, - "step": 918 - }, - { - "completion_length": 126.328125, - "epoch": 0.292861695347355, - "grad_norm": 63.22246170043945, - "kl": 0.09521484375, - "learning_rate": 7.07138304652645e-07, - "loss": 0.0038, - "reward": 1.5442649126052856, - "reward_std": 0.05458119511604309, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4192649722099304, - "step": 919 - }, - { - "completion_length": 82.546875, - "epoch": 0.29318036966220523, - "grad_norm": 33.70513916015625, - "kl": 0.1357421875, - "learning_rate": 7.068196303377948e-07, - "loss": 0.0054, - "reward": 1.5909119844436646, - "reward_std": 0.13808241486549377, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46591198444366455, - "rewards/pad": 0.125, - "step": 920 - }, - { - "completion_length": 97.546875, - "epoch": 0.29349904397705545, - "grad_norm": 22.9251651763916, - "kl": 0.12060546875, - "learning_rate": 7.065009560229445e-07, - "loss": 0.0048, - "reward": 1.4877631664276123, - "reward_std": 0.07348068058490753, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48776310682296753, - "step": 921 - }, - { - "completion_length": 71.5625, - "epoch": 0.29381771829190567, - "grad_norm": 41.56142044067383, - "kl": 0.1416015625, - "learning_rate": 7.061822817080942e-07, - "loss": 0.0057, - "reward": 1.4704723358154297, - "reward_std": 0.18349112570285797, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4392223358154297, - "step": 922 - }, - { - "completion_length": 72.96875, - "epoch": 0.2941363926067559, - "grad_norm": 23.7952823638916, - "kl": 0.1728515625, - "learning_rate": 7.05863607393244e-07, - "loss": 0.0069, - "reward": 1.7845497131347656, - "reward_std": 0.07781291007995605, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4095497131347656, - "step": 923 - }, - { - "completion_length": 20.125, - "epoch": 0.2944550669216061, - "grad_norm": 30.204940795898438, - "kl": 0.15625, - "learning_rate": 7.055449330783938e-07, - "loss": 0.0062, - "reward": 1.6023484468460083, - "reward_std": 0.14261382818222046, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3992233872413635, - "rewards/pad": 0.203125, - "step": 924 - }, - { - "completion_length": 99.046875, - "epoch": 0.29477374123645633, - "grad_norm": 23.71004867553711, - "kl": 0.087890625, - "learning_rate": 7.052262587635436e-07, - "loss": 0.0035, - "reward": 1.5678212642669678, - "reward_std": 0.07591604441404343, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44282129406929016, - "rewards/pad": 0.125, - "step": 925 - }, - { - "completion_length": 173.90625, - "epoch": 0.29509241555130655, - "grad_norm": 24.144794464111328, - "kl": 0.076171875, - "learning_rate": 7.049075844486934e-07, - "loss": 0.0031, - "reward": 1.3976125717163086, - "reward_std": 0.08255143463611603, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3976125717163086, - "rewards/pad": 0.0, - "step": 926 - }, - { - "completion_length": 98.3125, - "epoch": 0.2954110898661568, - "grad_norm": 18.583818435668945, - "kl": 0.10546875, - "learning_rate": 7.045889101338432e-07, - "loss": 0.0042, - "reward": 1.6382040977478027, - "reward_std": 0.12732411921024323, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.544454038143158, - "step": 927 - }, - { - "completion_length": 70.078125, - "epoch": 0.295729764181007, - "grad_norm": 19.732210159301758, - "kl": 0.1806640625, - "learning_rate": 7.042702358189929e-07, - "loss": 0.0072, - "reward": 1.6882609128952026, - "reward_std": 0.10261107981204987, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5632609128952026, - "rewards/pad": 0.125, - "step": 928 - }, - { - "completion_length": 68.578125, - "epoch": 0.2960484384958572, - "grad_norm": 21.04136085510254, - "kl": 0.173828125, - "learning_rate": 7.039515615041427e-07, - "loss": 0.0069, - "reward": 1.5921133756637573, - "reward_std": 0.1815553903579712, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4983634352684021, - "rewards/pad": 0.09375, - "step": 929 - }, - { - "completion_length": 71.6875, - "epoch": 0.29636711281070743, - "grad_norm": 30.88709259033203, - "kl": 0.1865234375, - "learning_rate": 7.036328871892925e-07, - "loss": 0.0075, - "reward": 1.7662714719772339, - "reward_std": 0.10816091299057007, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5162714123725891, - "rewards/pad": 0.25, - "step": 930 - }, - { - "completion_length": 148.5, - "epoch": 0.29668578712555765, - "grad_norm": 8.878599166870117, - "kl": 0.06884765625, - "learning_rate": 7.033142128744423e-07, - "loss": 0.0028, - "reward": 1.5028890371322632, - "reward_std": 0.07813405245542526, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37788909673690796, - "step": 931 - }, - { - "completion_length": 97.828125, - "epoch": 0.29700446144040793, - "grad_norm": 50.868247985839844, - "kl": 0.10888671875, - "learning_rate": 7.02995538559592e-07, - "loss": 0.0044, - "reward": 1.5021045207977295, - "reward_std": 0.10513219237327576, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3771045506000519, - "rewards/pad": 0.125, - "step": 932 - }, - { - "completion_length": 70.15625, - "epoch": 0.29732313575525815, - "grad_norm": 16.90110206604004, - "kl": 0.12451171875, - "learning_rate": 7.026768642447418e-07, - "loss": 0.005, - "reward": 1.599424958229065, - "reward_std": 0.06715093553066254, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5994249582290649, - "step": 933 - }, - { - "completion_length": 100.515625, - "epoch": 0.29764181007010837, - "grad_norm": 18.996936798095703, - "kl": 0.1396484375, - "learning_rate": 7.023581899298916e-07, - "loss": 0.0056, - "reward": 1.7346221208572388, - "reward_std": 0.12144883722066879, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45337218046188354, - "rewards/pad": 0.28125, - "step": 934 - }, - { - "completion_length": 122.171875, - "epoch": 0.2979604843849586, - "grad_norm": 36.10796356201172, - "kl": 0.08056640625, - "learning_rate": 7.020395156150414e-07, - "loss": 0.0032, - "reward": 1.5318636894226074, - "reward_std": 0.05889037996530533, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40686365962028503, - "step": 935 - }, - { - "completion_length": 200.625, - "epoch": 0.2982791586998088, - "grad_norm": 10.24109172821045, - "kl": 0.05859375, - "learning_rate": 7.017208413001911e-07, - "loss": 0.0023, - "reward": 1.5260357856750488, - "reward_std": 0.05816100537776947, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5260357856750488, - "step": 936 - }, - { - "completion_length": 71.875, - "epoch": 0.29859783301465903, - "grad_norm": 20.65817642211914, - "kl": 0.10498046875, - "learning_rate": 7.014021669853409e-07, - "loss": 0.0042, - "reward": 1.8145759105682373, - "reward_std": 0.04877658560872078, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6895760297775269, - "rewards/pad": 0.125, - "step": 937 - }, - { - "completion_length": 96.765625, - "epoch": 0.29891650732950925, - "grad_norm": 63.60162353515625, - "kl": 0.09375, - "learning_rate": 7.010834926704907e-07, - "loss": 0.0038, - "reward": 1.662316083908081, - "reward_std": 0.1494637280702591, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5529410243034363, - "rewards/pad": 0.125, - "step": 938 - }, - { - "completion_length": 73.0, - "epoch": 0.29923518164435947, - "grad_norm": 20.451332092285156, - "kl": 0.1787109375, - "learning_rate": 7.007648183556405e-07, - "loss": 0.0072, - "reward": 1.5908095836639404, - "reward_std": 0.10362397134304047, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46580955386161804, - "step": 939 - }, - { - "completion_length": 98.984375, - "epoch": 0.2995538559592097, - "grad_norm": 76.37430572509766, - "kl": 0.099609375, - "learning_rate": 7.004461440407902e-07, - "loss": 0.004, - "reward": 1.584186315536499, - "reward_std": 0.05407923460006714, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4591863751411438, - "step": 940 - }, - { - "completion_length": 150.171875, - "epoch": 0.2998725302740599, - "grad_norm": 22.983171463012695, - "kl": 0.087890625, - "learning_rate": 7.0012746972594e-07, - "loss": 0.0035, - "reward": 1.4531705379486084, - "reward_std": 0.07982495427131653, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45317062735557556, - "step": 941 - }, - { - "completion_length": 68.921875, - "epoch": 0.30019120458891013, - "grad_norm": 33.193077087402344, - "kl": 0.1435546875, - "learning_rate": 6.998087954110899e-07, - "loss": 0.0057, - "reward": 1.7539644241333008, - "reward_std": 0.18200719356536865, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45708948373794556, - "rewards/pad": 0.296875, - "step": 942 - }, - { - "completion_length": 96.5, - "epoch": 0.30050987890376035, - "grad_norm": 16.90999412536621, - "kl": 0.16015625, - "learning_rate": 6.994901210962397e-07, - "loss": 0.0064, - "reward": 1.6284356117248535, - "reward_std": 0.10825789719820023, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5034356117248535, - "rewards/pad": 0.125, - "step": 943 - }, - { - "completion_length": 44.109375, - "epoch": 0.3008285532186106, - "grad_norm": 42.69044494628906, - "kl": 0.1796875, - "learning_rate": 6.991714467813894e-07, - "loss": 0.0072, - "reward": 1.3881826400756836, - "reward_std": 0.10093092918395996, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2631826102733612, - "rewards/pad": 0.125, - "step": 944 - }, - { - "completion_length": 97.296875, - "epoch": 0.3011472275334608, - "grad_norm": 51.676631927490234, - "kl": 0.267578125, - "learning_rate": 6.988527724665392e-07, - "loss": 0.0107, - "reward": 1.3762681484222412, - "reward_std": 0.11830200254917145, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3762681484222412, - "rewards/pad": 0.0, - "step": 945 - }, - { - "completion_length": 124.5, - "epoch": 0.301465901848311, - "grad_norm": 36.757896423339844, - "kl": 0.06591796875, - "learning_rate": 6.98534098151689e-07, - "loss": 0.0026, - "reward": 1.6907156705856323, - "reward_std": 0.12420570105314255, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5344657301902771, - "rewards/pad": 0.15625, - "step": 946 - }, - { - "completion_length": 71.96875, - "epoch": 0.30178457616316123, - "grad_norm": 60.29111099243164, - "kl": 0.1435546875, - "learning_rate": 6.982154238368388e-07, - "loss": 0.0057, - "reward": 1.5995805263519287, - "reward_std": 0.11470350623130798, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5995805859565735, - "rewards/pad": 0.0, - "step": 947 - }, - { - "completion_length": 147.796875, - "epoch": 0.30210325047801145, - "grad_norm": 10.41659927368164, - "kl": 0.07666015625, - "learning_rate": 6.978967495219885e-07, - "loss": 0.0031, - "reward": 1.614039421081543, - "reward_std": 0.06173337250947952, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48903945088386536, - "step": 948 - }, - { - "completion_length": 72.875, - "epoch": 0.3024219247928617, - "grad_norm": 34.23875045776367, - "kl": 0.140625, - "learning_rate": 6.975780752071383e-07, - "loss": 0.0056, - "reward": 1.671351671218872, - "reward_std": 0.11177132278680801, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6244765520095825, - "rewards/pad": 0.046875, - "step": 949 - }, - { - "completion_length": 95.75, - "epoch": 0.3027405991077119, - "grad_norm": 45.31456756591797, - "kl": 0.1298828125, - "learning_rate": 6.972594008922881e-07, - "loss": 0.0052, - "reward": 1.6682624816894531, - "reward_std": 0.12410464882850647, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5432624816894531, - "rewards/pad": 0.125, - "step": 950 - }, - { - "completion_length": 121.0625, - "epoch": 0.3030592734225621, - "grad_norm": 29.192468643188477, - "kl": 0.10888671875, - "learning_rate": 6.969407265774379e-07, - "loss": 0.0044, - "reward": 1.4336252212524414, - "reward_std": 0.06404151022434235, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43362507224082947, - "step": 951 - }, - { - "completion_length": 174.796875, - "epoch": 0.3033779477374124, - "grad_norm": 7.179665565490723, - "kl": 0.1259765625, - "learning_rate": 6.966220522625876e-07, - "loss": 0.005, - "reward": 1.3703272342681885, - "reward_std": 0.047330766916275024, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37032726407051086, - "step": 952 - }, - { - "completion_length": 70.0625, - "epoch": 0.3036966220522626, - "grad_norm": 17.197315216064453, - "kl": 0.150390625, - "learning_rate": 6.963033779477374e-07, - "loss": 0.006, - "reward": 1.5124963521957397, - "reward_std": 0.10813236236572266, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5124963521957397, - "rewards/pad": 0.0, - "step": 953 - }, - { - "completion_length": 121.34375, - "epoch": 0.30401529636711283, - "grad_norm": 38.938541412353516, - "kl": 0.09033203125, - "learning_rate": 6.959847036328872e-07, - "loss": 0.0036, - "reward": 1.638798713684082, - "reward_std": 0.05554702877998352, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5137988924980164, - "step": 954 - }, - { - "completion_length": 122.359375, - "epoch": 0.30433397068196305, - "grad_norm": 21.31637954711914, - "kl": 0.1083984375, - "learning_rate": 6.95666029318037e-07, - "loss": 0.0043, - "reward": 1.507123351097107, - "reward_std": 0.07658617943525314, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5071233510971069, - "step": 955 - }, - { - "completion_length": 171.71875, - "epoch": 0.30465264499681327, - "grad_norm": 25.84273338317871, - "kl": 0.07666015625, - "learning_rate": 6.953473550031867e-07, - "loss": 0.0031, - "reward": 1.4941716194152832, - "reward_std": 0.06094829738140106, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.494171679019928, - "step": 956 - }, - { - "completion_length": 96.71875, - "epoch": 0.3049713193116635, - "grad_norm": 17.762001037597656, - "kl": 0.09375, - "learning_rate": 6.950286806883365e-07, - "loss": 0.0037, - "reward": 1.5891180038452148, - "reward_std": 0.043381866067647934, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5891180038452148, - "rewards/pad": 0.0, - "step": 957 - }, - { - "completion_length": 71.90625, - "epoch": 0.3052899936265137, - "grad_norm": 16.231908798217773, - "kl": 0.12255859375, - "learning_rate": 6.947100063734863e-07, - "loss": 0.0049, - "reward": 1.5764062404632568, - "reward_std": 0.06593342125415802, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.32640618085861206, - "rewards/pad": 0.25, - "step": 958 - }, - { - "completion_length": 96.015625, - "epoch": 0.30560866794136393, - "grad_norm": 62.19764709472656, - "kl": 0.091796875, - "learning_rate": 6.94391332058636e-07, - "loss": 0.0037, - "reward": 1.4593048095703125, - "reward_std": 0.06122640520334244, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3343048095703125, - "step": 959 - }, - { - "completion_length": 72.53125, - "epoch": 0.30592734225621415, - "grad_norm": 25.66366195678711, - "kl": 0.185546875, - "learning_rate": 6.940726577437858e-07, - "loss": 0.0074, - "reward": 1.6957283020019531, - "reward_std": 0.08966457843780518, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5707283020019531, - "rewards/pad": 0.125, - "step": 960 - }, - { - "completion_length": 95.5, - "epoch": 0.3062460165710644, - "grad_norm": 23.313758850097656, - "kl": 0.212890625, - "learning_rate": 6.937539834289357e-07, - "loss": 0.0085, - "reward": 1.4860730171203613, - "reward_std": 0.097783163189888, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4860730767250061, - "step": 961 - }, - { - "completion_length": 45.953125, - "epoch": 0.3065646908859146, - "grad_norm": 27.550352096557617, - "kl": 0.2138671875, - "learning_rate": 6.934353091140854e-07, - "loss": 0.0086, - "reward": 1.809466004371643, - "reward_std": 0.11983469128608704, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5750910043716431, - "rewards/pad": 0.234375, - "step": 962 - }, - { - "completion_length": 44.515625, - "epoch": 0.3068833652007648, - "grad_norm": 86.11719512939453, - "kl": 0.2216796875, - "learning_rate": 6.931166347992351e-07, - "loss": 0.0089, - "reward": 1.5869412422180176, - "reward_std": 0.1016860380768776, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5869413018226624, - "rewards/pad": 0.0, - "step": 963 - }, - { - "completion_length": 173.953125, - "epoch": 0.30720203951561503, - "grad_norm": 15.720704078674316, - "kl": 0.103515625, - "learning_rate": 6.927979604843849e-07, - "loss": 0.0041, - "reward": 1.4632954597473145, - "reward_std": 0.04079699516296387, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46329545974731445, - "step": 964 - }, - { - "completion_length": 96.53125, - "epoch": 0.30752071383046525, - "grad_norm": 21.17584800720215, - "kl": 0.1357421875, - "learning_rate": 6.924792861695347e-07, - "loss": 0.0054, - "reward": 1.526242733001709, - "reward_std": 0.10939948260784149, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49499279260635376, - "rewards/pad": 0.03125, - "step": 965 - }, - { - "completion_length": 97.109375, - "epoch": 0.3078393881453155, - "grad_norm": 19.60423469543457, - "kl": 0.1337890625, - "learning_rate": 6.921606118546845e-07, - "loss": 0.0053, - "reward": 1.5197116136550903, - "reward_std": 0.05108891427516937, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5197116136550903, - "step": 966 - }, - { - "completion_length": 70.5, - "epoch": 0.3081580624601657, - "grad_norm": 62.79570007324219, - "kl": 0.16796875, - "learning_rate": 6.918419375398342e-07, - "loss": 0.0067, - "reward": 1.4708547592163086, - "reward_std": 0.06340339034795761, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.34585481882095337, - "step": 967 - }, - { - "completion_length": 96.34375, - "epoch": 0.3084767367750159, - "grad_norm": 24.94214630126953, - "kl": 0.162109375, - "learning_rate": 6.91523263224984e-07, - "loss": 0.0065, - "reward": 1.4744987487792969, - "reward_std": 0.06317903846502304, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4744986891746521, - "rewards/pad": 0.0, - "step": 968 - }, - { - "completion_length": 43.8125, - "epoch": 0.30879541108986613, - "grad_norm": 30.116613388061523, - "kl": 0.1904296875, - "learning_rate": 6.912045889101338e-07, - "loss": 0.0076, - "reward": 1.6158849000930786, - "reward_std": 0.07592947781085968, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4908849000930786, - "rewards/pad": 0.125, - "step": 969 - }, - { - "completion_length": 71.796875, - "epoch": 0.30911408540471635, - "grad_norm": 130.30897521972656, - "kl": 0.193359375, - "learning_rate": 6.908859145952836e-07, - "loss": 0.0078, - "reward": 1.4907974004745483, - "reward_std": 0.11981174349784851, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49079734086990356, - "rewards/pad": 0.0, - "step": 970 - }, - { - "completion_length": 71.609375, - "epoch": 0.30943275971956663, - "grad_norm": 73.44354248046875, - "kl": 0.134765625, - "learning_rate": 6.905672402804333e-07, - "loss": 0.0054, - "reward": 1.5406136512756348, - "reward_std": 0.05049721896648407, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5406137704849243, - "rewards/pad": 0.0, - "step": 971 - }, - { - "completion_length": 69.96875, - "epoch": 0.30975143403441685, - "grad_norm": 21.05704116821289, - "kl": 0.166015625, - "learning_rate": 6.902485659655831e-07, - "loss": 0.0066, - "reward": 1.5726382732391357, - "reward_std": 0.08547885715961456, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5726381540298462, - "rewards/pad": 0.0, - "step": 972 - }, - { - "completion_length": 144.015625, - "epoch": 0.31007010834926707, - "grad_norm": 36.80889129638672, - "kl": 0.0791015625, - "learning_rate": 6.899298916507329e-07, - "loss": 0.0031, - "reward": 1.6074941158294678, - "reward_std": 0.09198617935180664, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6074941158294678, - "rewards/pad": 0.0, - "step": 973 - }, - { - "completion_length": 123.59375, - "epoch": 0.3103887826641173, - "grad_norm": 13.655397415161133, - "kl": 0.11767578125, - "learning_rate": 6.896112173358827e-07, - "loss": 0.0047, - "reward": 1.6335394382476807, - "reward_std": 0.06385113298892975, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5085393190383911, - "step": 974 - }, - { - "completion_length": 122.453125, - "epoch": 0.3107074569789675, - "grad_norm": 113.74642944335938, - "kl": 0.1259765625, - "learning_rate": 6.892925430210324e-07, - "loss": 0.005, - "reward": 1.5529446601867676, - "reward_std": 0.112543486058712, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4904446005821228, - "step": 975 - }, - { - "completion_length": 121.265625, - "epoch": 0.31102613129381773, - "grad_norm": 29.511276245117188, - "kl": 0.1279296875, - "learning_rate": 6.889738687061822e-07, - "loss": 0.0051, - "reward": 1.3846508264541626, - "reward_std": 0.03916772082448006, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3846507966518402, - "step": 976 - }, - { - "completion_length": 46.671875, - "epoch": 0.31134480560866795, - "grad_norm": 40.682498931884766, - "kl": 0.14453125, - "learning_rate": 6.88655194391332e-07, - "loss": 0.0058, - "reward": 1.8048152923583984, - "reward_std": 0.0671246275305748, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4298152029514313, - "step": 977 - }, - { - "completion_length": 94.640625, - "epoch": 0.31166347992351817, - "grad_norm": 15.610057830810547, - "kl": 0.12060546875, - "learning_rate": 6.883365200764818e-07, - "loss": 0.0048, - "reward": 1.5543341636657715, - "reward_std": 0.06744150072336197, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5543341636657715, - "rewards/pad": 0.0, - "step": 978 - }, - { - "completion_length": 145.6875, - "epoch": 0.3119821542383684, - "grad_norm": 34.307952880859375, - "kl": 0.11279296875, - "learning_rate": 6.880178457616315e-07, - "loss": 0.0045, - "reward": 1.4717872142791748, - "reward_std": 0.11070170998573303, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4874122738838196, - "step": 979 - }, - { - "completion_length": 98.890625, - "epoch": 0.3123008285532186, - "grad_norm": 74.25923919677734, - "kl": 0.11083984375, - "learning_rate": 6.876991714467814e-07, - "loss": 0.0044, - "reward": 1.5795928239822388, - "reward_std": 0.06632921099662781, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.32959282398223877, - "step": 980 - }, - { - "completion_length": 148.171875, - "epoch": 0.31261950286806883, - "grad_norm": 14.847012519836426, - "kl": 0.1220703125, - "learning_rate": 6.873804971319312e-07, - "loss": 0.0049, - "reward": 1.5648550987243652, - "reward_std": 0.05563882738351822, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43985506892204285, - "step": 981 - }, - { - "completion_length": 149.90625, - "epoch": 0.31293817718291905, - "grad_norm": 21.062965393066406, - "kl": 0.07861328125, - "learning_rate": 6.87061822817081e-07, - "loss": 0.0031, - "reward": 1.6611219644546509, - "reward_std": 0.04433588683605194, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5361219644546509, - "step": 982 - }, - { - "completion_length": 121.296875, - "epoch": 0.3132568514977693, - "grad_norm": 8.826406478881836, - "kl": 0.08544921875, - "learning_rate": 6.867431485022307e-07, - "loss": 0.0034, - "reward": 1.6873703002929688, - "reward_std": 0.04004920646548271, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4373703598976135, - "rewards/pad": 0.25, - "step": 983 - }, - { - "completion_length": 70.59375, - "epoch": 0.3135755258126195, - "grad_norm": 38.75087356567383, - "kl": 0.1826171875, - "learning_rate": 6.864244741873805e-07, - "loss": 0.0073, - "reward": 1.514086127281189, - "reward_std": 0.13811108469963074, - "rewards/answer_reward": 0.15625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3578360974788666, - "step": 984 - }, - { - "completion_length": 121.078125, - "epoch": 0.3138942001274697, - "grad_norm": 23.378746032714844, - "kl": 0.14453125, - "learning_rate": 6.861057998725303e-07, - "loss": 0.0058, - "reward": 1.6368203163146973, - "reward_std": 0.11664395034313202, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5118203163146973, - "step": 985 - }, - { - "completion_length": 97.3125, - "epoch": 0.31421287444231993, - "grad_norm": 65.3720703125, - "kl": 0.1103515625, - "learning_rate": 6.8578712555768e-07, - "loss": 0.0044, - "reward": 1.5769407749176025, - "reward_std": 0.1618584394454956, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4519408047199249, - "step": 986 - }, - { - "completion_length": 121.75, - "epoch": 0.31453154875717015, - "grad_norm": 53.989322662353516, - "kl": 0.1552734375, - "learning_rate": 6.854684512428298e-07, - "loss": 0.0062, - "reward": 1.5589345693588257, - "reward_std": 0.05939555913209915, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5589345693588257, - "rewards/pad": 0.0, - "step": 987 - }, - { - "completion_length": 171.234375, - "epoch": 0.3148502230720204, - "grad_norm": 12.67489242553711, - "kl": 0.06689453125, - "learning_rate": 6.851497769279796e-07, - "loss": 0.0027, - "reward": 1.4538019895553589, - "reward_std": 0.03941020369529724, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4538019597530365, - "rewards/pad": 0.0, - "step": 988 - }, - { - "completion_length": 120.84375, - "epoch": 0.3151688973868706, - "grad_norm": 21.424421310424805, - "kl": 0.12451171875, - "learning_rate": 6.848311026131294e-07, - "loss": 0.005, - "reward": 1.5307704210281372, - "reward_std": 0.1141161248087883, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.546395480632782, - "step": 989 - }, - { - "completion_length": 96.46875, - "epoch": 0.3154875717017208, - "grad_norm": 28.962202072143555, - "kl": 0.1103515625, - "learning_rate": 6.845124282982791e-07, - "loss": 0.0044, - "reward": 1.519028902053833, - "reward_std": 0.04207426309585571, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39402878284454346, - "step": 990 - }, - { - "completion_length": 97.796875, - "epoch": 0.3158062460165711, - "grad_norm": 126.08263397216797, - "kl": 0.11376953125, - "learning_rate": 6.841937539834289e-07, - "loss": 0.0046, - "reward": 1.5184412002563477, - "reward_std": 0.06963367760181427, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2684412896633148, - "rewards/pad": 0.25, - "step": 991 - }, - { - "completion_length": 67.875, - "epoch": 0.3161249203314213, - "grad_norm": 66.68081665039062, - "kl": 0.1748046875, - "learning_rate": 6.838750796685787e-07, - "loss": 0.007, - "reward": 1.6175551414489746, - "reward_std": 0.07942277193069458, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6175551414489746, - "step": 992 - }, - { - "completion_length": 97.359375, - "epoch": 0.31644359464627153, - "grad_norm": 23.92911148071289, - "kl": 0.255859375, - "learning_rate": 6.835564053537285e-07, - "loss": 0.0102, - "reward": 1.238165020942688, - "reward_std": 0.08866700530052185, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.222540020942688, - "step": 993 - }, - { - "completion_length": 122.703125, - "epoch": 0.31676226896112175, - "grad_norm": 97.40007019042969, - "kl": 0.08544921875, - "learning_rate": 6.832377310388782e-07, - "loss": 0.0034, - "reward": 1.5865464210510254, - "reward_std": 0.18776553869247437, - "rewards/pad": 0.265625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.32092148065567017, - "step": 994 - }, - { - "completion_length": 119.75, - "epoch": 0.31708094327597197, - "grad_norm": 50.509674072265625, - "kl": 0.125, - "learning_rate": 6.82919056724028e-07, - "loss": 0.005, - "reward": 1.5579121112823486, - "reward_std": 0.04533017426729202, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43291202187538147, - "rewards/pad": 0.125, - "step": 995 - }, - { - "completion_length": 173.40625, - "epoch": 0.3173996175908222, - "grad_norm": 9.46157169342041, - "kl": 0.07958984375, - "learning_rate": 6.826003824091778e-07, - "loss": 0.0032, - "reward": 1.315908432006836, - "reward_std": 0.02767709270119667, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3159083127975464, - "step": 996 - }, - { - "completion_length": 95.578125, - "epoch": 0.3177182919056724, - "grad_norm": 181.33319091796875, - "kl": 0.1455078125, - "learning_rate": 6.822817080943276e-07, - "loss": 0.0058, - "reward": 1.6179592609405518, - "reward_std": 0.1326274871826172, - "rewards/answer_reward": 0.09375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5242092609405518, - "step": 997 - }, - { - "completion_length": 70.15625, - "epoch": 0.31803696622052263, - "grad_norm": 24.98843765258789, - "kl": 0.26953125, - "learning_rate": 6.819630337794773e-07, - "loss": 0.0108, - "reward": 1.6061959266662598, - "reward_std": 0.07357417047023773, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.606195867061615, - "step": 998 - }, - { - "completion_length": 70.953125, - "epoch": 0.31835564053537285, - "grad_norm": 21.293861389160156, - "kl": 0.2080078125, - "learning_rate": 6.816443594646272e-07, - "loss": 0.0083, - "reward": 1.7381505966186523, - "reward_std": 0.07790133357048035, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6131504774093628, - "rewards/pad": 0.125, - "step": 999 - }, - { - "completion_length": 123.15625, - "epoch": 0.3186743148502231, - "grad_norm": 103.16876220703125, - "kl": 0.107421875, - "learning_rate": 6.81325685149777e-07, - "loss": 0.0043, - "reward": 1.484614372253418, - "reward_std": 0.07241792976856232, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3596143126487732, - "step": 1000 - }, - { - "completion_length": 149.5625, - "epoch": 0.3189929891650733, - "grad_norm": 9.273104667663574, - "kl": 0.08056640625, - "learning_rate": 6.810070108349267e-07, - "loss": 0.0032, - "reward": 1.6335794925689697, - "reward_std": 0.03922747075557709, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3835795819759369, - "step": 1001 - }, - { - "completion_length": 120.96875, - "epoch": 0.3193116634799235, - "grad_norm": 27.187664031982422, - "kl": 0.166015625, - "learning_rate": 6.806883365200764e-07, - "loss": 0.0066, - "reward": 1.4678020477294922, - "reward_std": 0.04097019135951996, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4678019881248474, - "rewards/pad": 0.0, - "step": 1002 - }, - { - "completion_length": 123.53125, - "epoch": 0.31963033779477373, - "grad_norm": 6.377375602722168, - "kl": 0.12060546875, - "learning_rate": 6.803696622052262e-07, - "loss": 0.0048, - "reward": 1.4584826231002808, - "reward_std": 0.03448522090911865, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45848262310028076, - "step": 1003 - }, - { - "completion_length": 43.953125, - "epoch": 0.31994901210962395, - "grad_norm": 53.49687576293945, - "kl": 0.181640625, - "learning_rate": 6.80050987890376e-07, - "loss": 0.0073, - "reward": 1.544199824333191, - "reward_std": 0.04130152612924576, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5441998243331909, - "rewards/pad": 0.0, - "step": 1004 - }, - { - "completion_length": 20.203125, - "epoch": 0.3202676864244742, - "grad_norm": 69.30970001220703, - "kl": 0.1748046875, - "learning_rate": 6.797323135755258e-07, - "loss": 0.007, - "reward": 1.620455265045166, - "reward_std": 0.12522095441818237, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49545520544052124, - "rewards/pad": 0.125, - "step": 1005 - }, - { - "completion_length": 42.375, - "epoch": 0.3205863607393244, - "grad_norm": 104.5580825805664, - "kl": 0.185546875, - "learning_rate": 6.794136392606755e-07, - "loss": 0.0074, - "reward": 1.6227277517318726, - "reward_std": 0.10305334627628326, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5289777517318726, - "rewards/pad": 0.09375, - "step": 1006 - }, - { - "completion_length": 121.515625, - "epoch": 0.3209050350541746, - "grad_norm": 82.89877319335938, - "kl": 0.12890625, - "learning_rate": 6.790949649458253e-07, - "loss": 0.0051, - "reward": 1.4882017374038696, - "reward_std": 0.041941650211811066, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48820173740386963, - "step": 1007 - }, - { - "completion_length": 95.71875, - "epoch": 0.32122370936902483, - "grad_norm": 19.741044998168945, - "kl": 0.10986328125, - "learning_rate": 6.787762906309751e-07, - "loss": 0.0044, - "reward": 1.619329810142517, - "reward_std": 0.04831254482269287, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4943298101425171, - "step": 1008 - }, - { - "completion_length": 118.453125, - "epoch": 0.32154238368387505, - "grad_norm": 42.4965934753418, - "kl": 0.1123046875, - "learning_rate": 6.784576163161249e-07, - "loss": 0.0045, - "reward": 1.4485756158828735, - "reward_std": 0.08220487833023071, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44857558608055115, - "step": 1009 - }, - { - "completion_length": 48.265625, - "epoch": 0.3218610579987253, - "grad_norm": 32.22761154174805, - "kl": 0.1298828125, - "learning_rate": 6.781389420012746e-07, - "loss": 0.0052, - "reward": 1.849495768547058, - "reward_std": 0.08726021647453308, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47449585795402527, - "rewards/pad": 0.375, - "step": 1010 - }, - { - "completion_length": 44.859375, - "epoch": 0.32217973231357555, - "grad_norm": 19.2244815826416, - "kl": 0.12060546875, - "learning_rate": 6.778202676864244e-07, - "loss": 0.0048, - "reward": 1.932586908340454, - "reward_std": 0.05833645537495613, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5575869083404541, - "step": 1011 - }, - { - "completion_length": 144.125, - "epoch": 0.32249840662842577, - "grad_norm": 37.333648681640625, - "kl": 0.10009765625, - "learning_rate": 6.775015933715742e-07, - "loss": 0.004, - "reward": 1.5504242181777954, - "reward_std": 0.03601760044693947, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5504241585731506, - "step": 1012 - }, - { - "completion_length": 97.203125, - "epoch": 0.322817080943276, - "grad_norm": 47.14528274536133, - "kl": 0.1611328125, - "learning_rate": 6.77182919056724e-07, - "loss": 0.0065, - "reward": 1.682951807975769, - "reward_std": 0.08218291401863098, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.557951807975769, - "step": 1013 - }, - { - "completion_length": 119.859375, - "epoch": 0.3231357552581262, - "grad_norm": 36.78887939453125, - "kl": 0.251953125, - "learning_rate": 6.768642447418737e-07, - "loss": 0.0101, - "reward": 1.429707646369934, - "reward_std": 0.08912655711174011, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4297076463699341, - "step": 1014 - }, - { - "completion_length": 99.5, - "epoch": 0.32345442957297643, - "grad_norm": 63.29979705810547, - "kl": 0.138671875, - "learning_rate": 6.765455704270235e-07, - "loss": 0.0055, - "reward": 1.4366942644119263, - "reward_std": 0.08463046699762344, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.32731932401657104, - "step": 1015 - }, - { - "completion_length": 66.671875, - "epoch": 0.32377310388782665, - "grad_norm": 39.79801559448242, - "kl": 0.140625, - "learning_rate": 6.762268961121733e-07, - "loss": 0.0056, - "reward": 1.6116578578948975, - "reward_std": 0.09199018031358719, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48665785789489746, - "step": 1016 - }, - { - "completion_length": 46.09375, - "epoch": 0.3240917782026769, - "grad_norm": 64.3765869140625, - "kl": 0.220703125, - "learning_rate": 6.75908221797323e-07, - "loss": 0.0088, - "reward": 1.686189889907837, - "reward_std": 0.0763259083032608, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5611898899078369, - "rewards/pad": 0.125, - "step": 1017 - }, - { - "completion_length": 146.265625, - "epoch": 0.3244104525175271, - "grad_norm": 46.217708587646484, - "kl": 0.2041015625, - "learning_rate": 6.755895474824729e-07, - "loss": 0.0082, - "reward": 1.3923192024230957, - "reward_std": 0.09687663614749908, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3923192322254181, - "step": 1018 - }, - { - "completion_length": 121.5, - "epoch": 0.3247291268323773, - "grad_norm": 21.247774124145508, - "kl": 0.1572265625, - "learning_rate": 6.752708731676227e-07, - "loss": 0.0063, - "reward": 1.637831211090088, - "reward_std": 0.05769766867160797, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5128312706947327, - "rewards/pad": 0.125, - "step": 1019 - }, - { - "completion_length": 120.734375, - "epoch": 0.32504780114722753, - "grad_norm": 29.704471588134766, - "kl": 0.10302734375, - "learning_rate": 6.749521988527725e-07, - "loss": 0.0041, - "reward": 1.5674422979354858, - "reward_std": 0.07865164428949356, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45806729793548584, - "step": 1020 - }, - { - "completion_length": 93.671875, - "epoch": 0.32536647546207775, - "grad_norm": 36.188941955566406, - "kl": 0.115234375, - "learning_rate": 6.746335245379222e-07, - "loss": 0.0046, - "reward": 1.6116740703582764, - "reward_std": 0.12697413563728333, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.37729912996292114, - "rewards/pad": 0.25, - "step": 1021 - }, - { - "completion_length": 121.6875, - "epoch": 0.325685149776928, - "grad_norm": 22.342357635498047, - "kl": 0.09521484375, - "learning_rate": 6.74314850223072e-07, - "loss": 0.0038, - "reward": 1.5704026222229004, - "reward_std": 0.07594575732946396, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.33602750301361084, - "step": 1022 - }, - { - "completion_length": 121.3125, - "epoch": 0.3260038240917782, - "grad_norm": 15.507749557495117, - "kl": 0.36328125, - "learning_rate": 6.739961759082218e-07, - "loss": 0.0145, - "reward": 1.6103272438049316, - "reward_std": 0.06380373984575272, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48532721400260925, - "step": 1023 - }, - { - "completion_length": 43.75, - "epoch": 0.3263224984066284, - "grad_norm": 26.68657684326172, - "kl": 0.1796875, - "learning_rate": 6.736775015933716e-07, - "loss": 0.0072, - "reward": 1.4567975997924805, - "reward_std": 0.04004968702793121, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4567975401878357, - "rewards/pad": 0.0, - "step": 1024 - }, - { - "completion_length": 71.171875, - "epoch": 0.32664117272147863, - "grad_norm": 95.6634292602539, - "kl": 0.177734375, - "learning_rate": 6.733588272785213e-07, - "loss": 0.0071, - "reward": 1.6111572980880737, - "reward_std": 0.11770939826965332, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5486572980880737, - "rewards/pad": 0.0625, - "step": 1025 - }, - { - "completion_length": 120.140625, - "epoch": 0.32695984703632885, - "grad_norm": 131.3987579345703, - "kl": 0.083984375, - "learning_rate": 6.730401529636711e-07, - "loss": 0.0034, - "reward": 1.5982779264450073, - "reward_std": 0.10901899635791779, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3482779264450073, - "step": 1026 - }, - { - "completion_length": 121.03125, - "epoch": 0.3272785213511791, - "grad_norm": 114.73072814941406, - "kl": 0.11279296875, - "learning_rate": 6.727214786488209e-07, - "loss": 0.0045, - "reward": 1.5256927013397217, - "reward_std": 0.16002598404884338, - "rewards/answer_reward": 0.046875, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.4944427013397217, - "step": 1027 - }, - { - "completion_length": 71.75, - "epoch": 0.3275971956660293, - "grad_norm": 39.72523880004883, - "kl": 0.1572265625, - "learning_rate": 6.724028043339707e-07, - "loss": 0.0063, - "reward": 1.767228603363037, - "reward_std": 0.04902970418334007, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5172286629676819, - "step": 1028 - }, - { - "completion_length": 94.15625, - "epoch": 0.3279158699808795, - "grad_norm": 28.721555709838867, - "kl": 0.1298828125, - "learning_rate": 6.720841300191204e-07, - "loss": 0.0052, - "reward": 1.5470216274261475, - "reward_std": 0.06241071969270706, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5470216870307922, - "step": 1029 - }, - { - "completion_length": 46.609375, - "epoch": 0.3282345442957298, - "grad_norm": 43.542396545410156, - "kl": 0.13671875, - "learning_rate": 6.717654557042702e-07, - "loss": 0.0055, - "reward": 1.7156548500061035, - "reward_std": 0.0609230101108551, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5906547904014587, - "step": 1030 - }, - { - "completion_length": 70.34375, - "epoch": 0.32855321861058, - "grad_norm": 53.10152816772461, - "kl": 0.111328125, - "learning_rate": 6.7144678138942e-07, - "loss": 0.0045, - "reward": 1.6983790397644043, - "reward_std": 0.051853202283382416, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5733789801597595, - "rewards/pad": 0.125, - "step": 1031 - }, - { - "completion_length": 120.609375, - "epoch": 0.32887189292543023, - "grad_norm": 35.33661651611328, - "kl": 0.1494140625, - "learning_rate": 6.711281070745698e-07, - "loss": 0.006, - "reward": 1.6467417478561401, - "reward_std": 0.08918267488479614, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5217417478561401, - "step": 1032 - }, - { - "completion_length": 92.859375, - "epoch": 0.32919056724028045, - "grad_norm": 81.88345336914062, - "kl": 0.154296875, - "learning_rate": 6.708094327597195e-07, - "loss": 0.0062, - "reward": 1.5557001829147339, - "reward_std": 0.06777728348970413, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4307001531124115, - "rewards/pad": 0.125, - "step": 1033 - }, - { - "completion_length": 68.90625, - "epoch": 0.32950924155513067, - "grad_norm": 18.9570369720459, - "kl": 0.26171875, - "learning_rate": 6.704907584448693e-07, - "loss": 0.0104, - "reward": 1.6313979625701904, - "reward_std": 0.06359601020812988, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6313979625701904, - "rewards/pad": 0.0, - "step": 1034 - }, - { - "completion_length": 96.703125, - "epoch": 0.3298279158699809, - "grad_norm": 69.87822723388672, - "kl": 0.12451171875, - "learning_rate": 6.701720841300191e-07, - "loss": 0.005, - "reward": 1.544966220855713, - "reward_std": 0.10156506299972534, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49809128046035767, - "rewards/pad": 0.046875, - "step": 1035 - }, - { - "completion_length": 69.53125, - "epoch": 0.3301465901848311, - "grad_norm": 17.39261817932129, - "kl": 0.1484375, - "learning_rate": 6.69853409815169e-07, - "loss": 0.0059, - "reward": 1.9315378665924072, - "reward_std": 0.10341665893793106, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6815378069877625, - "step": 1036 - }, - { - "completion_length": 46.921875, - "epoch": 0.33046526449968133, - "grad_norm": 32.02791976928711, - "kl": 0.2216796875, - "learning_rate": 6.695347355003187e-07, - "loss": 0.0089, - "reward": 1.7051042318344116, - "reward_std": 0.055431973189115524, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.455104261636734, - "step": 1037 - }, - { - "completion_length": 118.015625, - "epoch": 0.33078393881453155, - "grad_norm": 46.765708923339844, - "kl": 0.0986328125, - "learning_rate": 6.692160611854685e-07, - "loss": 0.0039, - "reward": 1.5392026901245117, - "reward_std": 0.03716112673282623, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5392026901245117, - "step": 1038 - }, - { - "completion_length": 96.625, - "epoch": 0.3311026131293818, - "grad_norm": 33.89827346801758, - "kl": 0.1318359375, - "learning_rate": 6.688973868706183e-07, - "loss": 0.0053, - "reward": 1.6119462251663208, - "reward_std": 0.0470452755689621, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3619462251663208, - "step": 1039 - }, - { - "completion_length": 145.578125, - "epoch": 0.331421287444232, - "grad_norm": 54.471920013427734, - "kl": 0.0830078125, - "learning_rate": 6.68578712555768e-07, - "loss": 0.0033, - "reward": 1.4640541076660156, - "reward_std": 0.04760254919528961, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4640541672706604, - "step": 1040 - }, - { - "completion_length": 46.3125, - "epoch": 0.3317399617590822, - "grad_norm": 53.303070068359375, - "kl": 0.2470703125, - "learning_rate": 6.682600382409177e-07, - "loss": 0.0099, - "reward": 1.6290390491485596, - "reward_std": 0.14533916115760803, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.3946640193462372, - "rewards/pad": 0.25, - "step": 1041 - }, - { - "completion_length": 71.4375, - "epoch": 0.33205863607393243, - "grad_norm": 20.868520736694336, - "kl": 0.11572265625, - "learning_rate": 6.679413639260675e-07, - "loss": 0.0046, - "reward": 1.7570959329605103, - "reward_std": 0.06300647556781769, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6320960521697998, - "rewards/pad": 0.125, - "step": 1042 - }, - { - "completion_length": 18.71875, - "epoch": 0.33237731038878265, - "grad_norm": 29.75905418395996, - "kl": 0.349609375, - "learning_rate": 6.676226896112173e-07, - "loss": 0.014, - "reward": 1.7678983211517334, - "reward_std": 0.08387524634599686, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6428983211517334, - "rewards/pad": 0.125, - "step": 1043 - }, - { - "completion_length": 95.203125, - "epoch": 0.3326959847036329, - "grad_norm": 34.0615119934082, - "kl": 0.2578125, - "learning_rate": 6.67304015296367e-07, - "loss": 0.0103, - "reward": 1.4088149070739746, - "reward_std": 0.04754612594842911, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4088148772716522, - "step": 1044 - }, - { - "completion_length": 68.96875, - "epoch": 0.3330146590184831, - "grad_norm": 219.807861328125, - "kl": 0.259765625, - "learning_rate": 6.669853409815168e-07, - "loss": 0.0103, - "reward": 1.607092261314392, - "reward_std": 0.09234102070331573, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4820922017097473, - "rewards/pad": 0.125, - "step": 1045 - }, - { - "completion_length": 144.09375, - "epoch": 0.3333333333333333, - "grad_norm": 23.311023712158203, - "kl": 0.1142578125, - "learning_rate": 6.666666666666666e-07, - "loss": 0.0046, - "reward": 1.5305759906768799, - "reward_std": 0.06265944242477417, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5305759906768799, - "rewards/pad": 0.0, - "step": 1046 - }, - { - "completion_length": 43.453125, - "epoch": 0.33365200764818354, - "grad_norm": 39.3546028137207, - "kl": 0.306640625, - "learning_rate": 6.663479923518164e-07, - "loss": 0.0123, - "reward": 1.440364122390747, - "reward_std": 0.15070118010044098, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.45598912239074707, - "rewards/pad": 0.0, - "step": 1047 - }, - { - "completion_length": 19.671875, - "epoch": 0.33397068196303376, - "grad_norm": 70.71843719482422, - "kl": 0.515625, - "learning_rate": 6.660293180369661e-07, - "loss": 0.0206, - "reward": 1.607330083847046, - "reward_std": 0.15620729327201843, - "rewards/answer_reward": 0.015625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5917050242424011, - "step": 1048 - }, - { - "completion_length": 122.453125, - "epoch": 0.334289356277884, - "grad_norm": 27.474184036254883, - "kl": 0.08056640625, - "learning_rate": 6.657106437221159e-07, - "loss": 0.0032, - "reward": 1.4041144847869873, - "reward_std": 0.10201508551836014, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.27911460399627686, - "step": 1049 - }, - { - "completion_length": 120.40625, - "epoch": 0.33460803059273425, - "grad_norm": 28.913145065307617, - "kl": 0.1025390625, - "learning_rate": 6.653919694072657e-07, - "loss": 0.0041, - "reward": 1.5066144466400146, - "reward_std": 0.08884887397289276, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5066144466400146, - "step": 1050 - }, - { - "completion_length": 94.984375, - "epoch": 0.33492670490758447, - "grad_norm": 82.88301086425781, - "kl": 0.1591796875, - "learning_rate": 6.650732950924155e-07, - "loss": 0.0064, - "reward": 1.571221947669983, - "reward_std": 0.15910333395004272, - "rewards/pad": 0.15625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4149720072746277, - "step": 1051 - }, - { - "completion_length": 120.046875, - "epoch": 0.3352453792224347, - "grad_norm": 11.266196250915527, - "kl": 0.1484375, - "learning_rate": 6.647546207775652e-07, - "loss": 0.0059, - "reward": 1.581168532371521, - "reward_std": 0.059135135263204575, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5811686515808105, - "step": 1052 - }, - { - "completion_length": 70.71875, - "epoch": 0.3355640535372849, - "grad_norm": 22.130834579467773, - "kl": 0.134765625, - "learning_rate": 6.64435946462715e-07, - "loss": 0.0054, - "reward": 1.530851125717163, - "reward_std": 0.07286497205495834, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5308512449264526, - "step": 1053 - }, - { - "completion_length": 174.125, - "epoch": 0.33588272785213513, - "grad_norm": 8.050396919250488, - "kl": 0.07958984375, - "learning_rate": 6.641172721478648e-07, - "loss": 0.0032, - "reward": 1.4346730709075928, - "reward_std": 0.0913335531949997, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.32529813051223755, - "rewards/pad": 0.125, - "step": 1054 - }, - { - "completion_length": 95.921875, - "epoch": 0.33620140216698535, - "grad_norm": 135.49716186523438, - "kl": 0.1591796875, - "learning_rate": 6.637985978330147e-07, - "loss": 0.0063, - "reward": 1.4473363161087036, - "reward_std": 0.09837909787893295, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4473363161087036, - "step": 1055 - }, - { - "completion_length": 71.390625, - "epoch": 0.3365200764818356, - "grad_norm": 34.348148345947266, - "kl": 0.265625, - "learning_rate": 6.634799235181644e-07, - "loss": 0.0106, - "reward": 1.6273062229156494, - "reward_std": 0.0879427045583725, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3773062527179718, - "rewards/pad": 0.25, - "step": 1056 - }, - { - "completion_length": 44.765625, - "epoch": 0.3368387507966858, - "grad_norm": 88.11212158203125, - "kl": 0.2353515625, - "learning_rate": 6.631612492033142e-07, - "loss": 0.0094, - "reward": 1.7960689067840576, - "reward_std": 0.08121553808450699, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6710689663887024, - "rewards/pad": 0.125, - "step": 1057 - }, - { - "completion_length": 146.6875, - "epoch": 0.337157425111536, - "grad_norm": 42.112579345703125, - "kl": 0.07568359375, - "learning_rate": 6.62842574888464e-07, - "loss": 0.003, - "reward": 1.4606010913848877, - "reward_std": 0.09805289655923843, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.35122597217559814, - "step": 1058 - }, - { - "completion_length": 146.9375, - "epoch": 0.33747609942638623, - "grad_norm": 18.255258560180664, - "kl": 0.1435546875, - "learning_rate": 6.625239005736138e-07, - "loss": 0.0058, - "reward": 1.36257004737854, - "reward_std": 0.05446300283074379, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36257001757621765, - "step": 1059 - }, - { - "completion_length": 47.015625, - "epoch": 0.33779477374123645, - "grad_norm": 32.94956588745117, - "kl": 0.16796875, - "learning_rate": 6.622052262587635e-07, - "loss": 0.0067, - "reward": 1.5549402236938477, - "reward_std": 0.10364280641078949, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5393152832984924, - "rewards/pad": 0.015625, - "step": 1060 - }, - { - "completion_length": 96.0, - "epoch": 0.3381134480560867, - "grad_norm": 19.255802154541016, - "kl": 0.16015625, - "learning_rate": 6.618865519439133e-07, - "loss": 0.0064, - "reward": 1.4055922031402588, - "reward_std": 0.0773409828543663, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4055922329425812, - "step": 1061 - }, - { - "completion_length": 123.28125, - "epoch": 0.3384321223709369, - "grad_norm": 18.961702346801758, - "kl": 0.109375, - "learning_rate": 6.615678776290631e-07, - "loss": 0.0044, - "reward": 1.7836185693740845, - "reward_std": 0.04367794841527939, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4086185693740845, - "step": 1062 - }, - { - "completion_length": 119.75, - "epoch": 0.3387507966857871, - "grad_norm": 20.319904327392578, - "kl": 0.0947265625, - "learning_rate": 6.612492033142129e-07, - "loss": 0.0038, - "reward": 1.5855618715286255, - "reward_std": 0.050916701555252075, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4605619013309479, - "rewards/pad": 0.125, - "step": 1063 - }, - { - "completion_length": 74.046875, - "epoch": 0.33906947100063733, - "grad_norm": 97.88804626464844, - "kl": 0.181640625, - "learning_rate": 6.609305289993626e-07, - "loss": 0.0073, - "reward": 1.7505017518997192, - "reward_std": 0.14779749512672424, - "rewards/answer_reward": 0.203125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5473767518997192, - "step": 1064 - }, - { - "completion_length": 69.875, - "epoch": 0.33938814531548755, - "grad_norm": 155.23716735839844, - "kl": 0.158203125, - "learning_rate": 6.606118546845124e-07, - "loss": 0.0063, - "reward": 1.4346399307250977, - "reward_std": 0.08623382449150085, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43463993072509766, - "step": 1065 - }, - { - "completion_length": 94.375, - "epoch": 0.3397068196303378, - "grad_norm": 27.05531883239746, - "kl": 0.0869140625, - "learning_rate": 6.602931803696622e-07, - "loss": 0.0035, - "reward": 1.4803881645202637, - "reward_std": 0.03700976446270943, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3553881347179413, - "step": 1066 - }, - { - "completion_length": 120.9375, - "epoch": 0.340025493945188, - "grad_norm": 12.607763290405273, - "kl": 0.177734375, - "learning_rate": 6.59974506054812e-07, - "loss": 0.0071, - "reward": 1.3900195360183716, - "reward_std": 0.03963884338736534, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3900195062160492, - "step": 1067 - }, - { - "completion_length": 70.046875, - "epoch": 0.3403441682600382, - "grad_norm": 23.681140899658203, - "kl": 0.14453125, - "learning_rate": 6.596558317399617e-07, - "loss": 0.0058, - "reward": 1.5820633172988892, - "reward_std": 0.05296536535024643, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45706331729888916, - "rewards/pad": 0.125, - "step": 1068 - }, - { - "completion_length": 69.46875, - "epoch": 0.3406628425748885, - "grad_norm": 25.606416702270508, - "kl": 0.1953125, - "learning_rate": 6.593371574251115e-07, - "loss": 0.0078, - "reward": 1.4689515829086304, - "reward_std": 0.13900893926620483, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46895158290863037, - "rewards/pad": 0.0, - "step": 1069 - }, - { - "completion_length": 68.84375, - "epoch": 0.3409815168897387, - "grad_norm": 51.751583099365234, - "kl": 0.12109375, - "learning_rate": 6.590184831102613e-07, - "loss": 0.0048, - "reward": 1.7762489318847656, - "reward_std": 0.10998112708330154, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5418740510940552, - "rewards/pad": 0.234375, - "step": 1070 - }, - { - "completion_length": 68.953125, - "epoch": 0.34130019120458893, - "grad_norm": 95.57342529296875, - "kl": 0.2001953125, - "learning_rate": 6.586998087954111e-07, - "loss": 0.008, - "reward": 1.5211365222930908, - "reward_std": 0.12379960715770721, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5211365222930908, - "step": 1071 - }, - { - "completion_length": 148.578125, - "epoch": 0.34161886551943915, - "grad_norm": 24.331661224365234, - "kl": 0.0830078125, - "learning_rate": 6.583811344805608e-07, - "loss": 0.0033, - "reward": 1.4323045015335083, - "reward_std": 0.16860775649547577, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4323045015335083, - "rewards/pad": 0.015625, - "step": 1072 - }, - { - "completion_length": 97.75, - "epoch": 0.3419375398342894, - "grad_norm": 39.816951751708984, - "kl": 0.11865234375, - "learning_rate": 6.580624601657106e-07, - "loss": 0.0047, - "reward": 1.5016841888427734, - "reward_std": 0.054465897381305695, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.25168418884277344, - "step": 1073 - }, - { - "completion_length": 70.71875, - "epoch": 0.3422562141491396, - "grad_norm": 25.538747787475586, - "kl": 0.138671875, - "learning_rate": 6.577437858508605e-07, - "loss": 0.0055, - "reward": 1.491959571838379, - "reward_std": 0.0691341906785965, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49195966124534607, - "step": 1074 - }, - { - "completion_length": 145.1875, - "epoch": 0.3425748884639898, - "grad_norm": 18.296266555786133, - "kl": 0.1259765625, - "learning_rate": 6.574251115360103e-07, - "loss": 0.005, - "reward": 1.4316325187683105, - "reward_std": 0.08940847218036652, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43163245916366577, - "step": 1075 - }, - { - "completion_length": 70.546875, - "epoch": 0.34289356277884003, - "grad_norm": 14.101547241210938, - "kl": 0.255859375, - "learning_rate": 6.5710643722116e-07, - "loss": 0.0102, - "reward": 1.4184021949768066, - "reward_std": 0.054325029253959656, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4184022545814514, - "rewards/pad": 0.0, - "step": 1076 - }, - { - "completion_length": 43.65625, - "epoch": 0.34321223709369025, - "grad_norm": 57.003196716308594, - "kl": 0.1572265625, - "learning_rate": 6.567877629063098e-07, - "loss": 0.0063, - "reward": 1.4279489517211914, - "reward_std": 0.1659427285194397, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3810740113258362, - "step": 1077 - }, - { - "completion_length": 95.5625, - "epoch": 0.3435309114085405, - "grad_norm": 51.163570404052734, - "kl": 0.1552734375, - "learning_rate": 6.564690885914596e-07, - "loss": 0.0062, - "reward": 1.669952392578125, - "reward_std": 0.07310619205236435, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5449522733688354, - "step": 1078 - }, - { - "completion_length": 44.40625, - "epoch": 0.3438495857233907, - "grad_norm": 64.22246551513672, - "kl": 0.201171875, - "learning_rate": 6.561504142766092e-07, - "loss": 0.0081, - "reward": 1.6366376876831055, - "reward_std": 0.0761965811252594, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6366376280784607, - "rewards/pad": 0.0, - "step": 1079 - }, - { - "completion_length": 44.421875, - "epoch": 0.3441682600382409, - "grad_norm": 45.51784133911133, - "kl": 0.146484375, - "learning_rate": 6.55831739961759e-07, - "loss": 0.0059, - "reward": 1.596229910850525, - "reward_std": 0.073794886469841, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4712299108505249, - "rewards/pad": 0.125, - "step": 1080 - }, - { - "completion_length": 119.171875, - "epoch": 0.34448693435309113, - "grad_norm": 26.590566635131836, - "kl": 0.111328125, - "learning_rate": 6.555130656469088e-07, - "loss": 0.0044, - "reward": 1.7527601718902588, - "reward_std": 0.11826224625110626, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5183851718902588, - "step": 1081 - }, - { - "completion_length": 95.296875, - "epoch": 0.34480560866794135, - "grad_norm": 78.18777465820312, - "kl": 0.27734375, - "learning_rate": 6.551943913320586e-07, - "loss": 0.0111, - "reward": 1.6009079217910767, - "reward_std": 0.153416246175766, - "rewards/answer_reward": 0.046875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5540329217910767, - "step": 1082 - }, - { - "completion_length": 70.09375, - "epoch": 0.3451242829827916, - "grad_norm": 113.71192932128906, - "kl": 0.162109375, - "learning_rate": 6.548757170172083e-07, - "loss": 0.0065, - "reward": 1.6254713535308838, - "reward_std": 0.07463856041431427, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5004714131355286, - "rewards/pad": 0.125, - "step": 1083 - }, - { - "completion_length": 46.140625, - "epoch": 0.3454429572976418, - "grad_norm": 100.79532623291016, - "kl": 0.1787109375, - "learning_rate": 6.545570427023581e-07, - "loss": 0.0072, - "reward": 1.6921272277832031, - "reward_std": 0.08855599910020828, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5671272873878479, - "rewards/pad": 0.125, - "step": 1084 - }, - { - "completion_length": 119.671875, - "epoch": 0.345761631612492, - "grad_norm": 36.70648193359375, - "kl": 0.1640625, - "learning_rate": 6.542383683875079e-07, - "loss": 0.0066, - "reward": 1.4764964580535889, - "reward_std": 0.044807374477386475, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47649645805358887, - "step": 1085 - }, - { - "completion_length": 69.40625, - "epoch": 0.34608030592734224, - "grad_norm": 38.902225494384766, - "kl": 0.146484375, - "learning_rate": 6.539196940726577e-07, - "loss": 0.0059, - "reward": 1.5409049987792969, - "reward_std": 0.0742073804140091, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5409051179885864, - "step": 1086 - }, - { - "completion_length": 71.296875, - "epoch": 0.34639898024219246, - "grad_norm": 13.453853607177734, - "kl": 0.1572265625, - "learning_rate": 6.536010197578074e-07, - "loss": 0.0063, - "reward": 1.836655855178833, - "reward_std": 0.05411819368600845, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5866559147834778, - "rewards/pad": 0.25, - "step": 1087 - }, - { - "completion_length": 95.953125, - "epoch": 0.3467176545570427, - "grad_norm": 45.534881591796875, - "kl": 0.1064453125, - "learning_rate": 6.532823454429572e-07, - "loss": 0.0043, - "reward": 1.4753446578979492, - "reward_std": 0.052867937833070755, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47534459829330444, - "rewards/pad": 0.0, - "step": 1088 - }, - { - "completion_length": 96.765625, - "epoch": 0.34703632887189295, - "grad_norm": 45.09857940673828, - "kl": 0.16796875, - "learning_rate": 6.52963671128107e-07, - "loss": 0.0067, - "reward": 1.6063323020935059, - "reward_std": 0.036642368882894516, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4813322424888611, - "step": 1089 - }, - { - "completion_length": 144.046875, - "epoch": 0.34735500318674317, - "grad_norm": 12.49614143371582, - "kl": 0.11572265625, - "learning_rate": 6.526449968132568e-07, - "loss": 0.0046, - "reward": 1.6987546682357788, - "reward_std": 0.07186584174633026, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4487547278404236, - "step": 1090 - }, - { - "completion_length": 145.625, - "epoch": 0.3476736775015934, - "grad_norm": 25.01397132873535, - "kl": 0.09228515625, - "learning_rate": 6.523263224984065e-07, - "loss": 0.0037, - "reward": 1.3501834869384766, - "reward_std": 0.0349920280277729, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.350183367729187, - "rewards/pad": 0.0, - "step": 1091 - }, - { - "completion_length": 120.453125, - "epoch": 0.3479923518164436, - "grad_norm": 16.605253219604492, - "kl": 0.140625, - "learning_rate": 6.520076481835563e-07, - "loss": 0.0056, - "reward": 1.478297233581543, - "reward_std": 0.13914169371128082, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.49392229318618774, - "rewards/pad": 0.0, - "step": 1092 - }, - { - "completion_length": 148.265625, - "epoch": 0.34831102613129383, - "grad_norm": 63.30807113647461, - "kl": 0.1728515625, - "learning_rate": 6.516889738687062e-07, - "loss": 0.0069, - "reward": 1.5344979763031006, - "reward_std": 0.05048166215419769, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4094979763031006, - "step": 1093 - }, - { - "completion_length": 93.75, - "epoch": 0.34862970044614405, - "grad_norm": 56.43621826171875, - "kl": 0.134765625, - "learning_rate": 6.51370299553856e-07, - "loss": 0.0054, - "reward": 1.700314998626709, - "reward_std": 0.10079171508550644, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45031502842903137, - "step": 1094 - }, - { - "completion_length": 97.140625, - "epoch": 0.3489483747609943, - "grad_norm": 6.179287910461426, - "kl": 0.193359375, - "learning_rate": 6.510516252390057e-07, - "loss": 0.0077, - "reward": 1.6602232456207275, - "reward_std": 0.08768853545188904, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5352232456207275, - "step": 1095 - }, - { - "completion_length": 96.21875, - "epoch": 0.3492670490758445, - "grad_norm": 41.30459976196289, - "kl": 0.12158203125, - "learning_rate": 6.507329509241555e-07, - "loss": 0.0049, - "reward": 1.5766048431396484, - "reward_std": 0.07573454082012177, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4516048729419708, - "step": 1096 - }, - { - "completion_length": 95.8125, - "epoch": 0.3495857233906947, - "grad_norm": 32.63888168334961, - "kl": 0.130859375, - "learning_rate": 6.504142766093053e-07, - "loss": 0.0052, - "reward": 1.536113977432251, - "reward_std": 0.11577165871858597, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5517390966415405, - "rewards/pad": 0.0, - "step": 1097 - }, - { - "completion_length": 143.796875, - "epoch": 0.34990439770554493, - "grad_norm": 29.69789695739746, - "kl": 0.1005859375, - "learning_rate": 6.500956022944551e-07, - "loss": 0.004, - "reward": 1.5152339935302734, - "reward_std": 0.04404383897781372, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5152339339256287, - "step": 1098 - }, - { - "completion_length": 68.65625, - "epoch": 0.35022307202039515, - "grad_norm": 23.979541778564453, - "kl": 0.1845703125, - "learning_rate": 6.497769279796048e-07, - "loss": 0.0074, - "reward": 1.4446322917938232, - "reward_std": 0.07955366373062134, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44463226199150085, - "step": 1099 - }, - { - "completion_length": 143.3125, - "epoch": 0.3505417463352454, - "grad_norm": 23.917510986328125, - "kl": 0.1376953125, - "learning_rate": 6.494582536647546e-07, - "loss": 0.0055, - "reward": 1.5646904706954956, - "reward_std": 0.047669485211372375, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5646904110908508, - "step": 1100 - }, - { - "completion_length": 172.625, - "epoch": 0.3508604206500956, - "grad_norm": 31.693416595458984, - "kl": 0.06201171875, - "learning_rate": 6.491395793499044e-07, - "loss": 0.0025, - "reward": 1.409838080406189, - "reward_std": 0.047148752957582474, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40983811020851135, - "step": 1101 - }, - { - "completion_length": 43.65625, - "epoch": 0.3511790949649458, - "grad_norm": 37.47905349731445, - "kl": 0.5859375, - "learning_rate": 6.488209050350542e-07, - "loss": 0.0235, - "reward": 1.8206762075424194, - "reward_std": 0.07596644014120102, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5706762671470642, - "step": 1102 - }, - { - "completion_length": 121.375, - "epoch": 0.35149776927979604, - "grad_norm": 155.16152954101562, - "kl": 0.11767578125, - "learning_rate": 6.485022307202039e-07, - "loss": 0.0047, - "reward": 1.570386528968811, - "reward_std": 0.07902577519416809, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5703864693641663, - "rewards/pad": 0.0, - "step": 1103 - }, - { - "completion_length": 146.96875, - "epoch": 0.35181644359464626, - "grad_norm": 14.959189414978027, - "kl": 0.0908203125, - "learning_rate": 6.481835564053537e-07, - "loss": 0.0036, - "reward": 1.6194053888320923, - "reward_std": 0.11047296226024628, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3850303590297699, - "step": 1104 - }, - { - "completion_length": 121.734375, - "epoch": 0.3521351179094965, - "grad_norm": 11.895666122436523, - "kl": 0.11376953125, - "learning_rate": 6.478648820905035e-07, - "loss": 0.0045, - "reward": 1.5688235759735107, - "reward_std": 0.04408888518810272, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5688235759735107, - "step": 1105 - }, - { - "completion_length": 169.09375, - "epoch": 0.3524537922243467, - "grad_norm": 13.154065132141113, - "kl": 0.0830078125, - "learning_rate": 6.475462077756533e-07, - "loss": 0.0033, - "reward": 1.401365041732788, - "reward_std": 0.08900520950555801, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4013649821281433, - "step": 1106 - }, - { - "completion_length": 46.984375, - "epoch": 0.3527724665391969, - "grad_norm": 49.315128326416016, - "kl": 0.2451171875, - "learning_rate": 6.47227533460803e-07, - "loss": 0.0098, - "reward": 1.7354907989501953, - "reward_std": 0.20398296415805817, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5323659181594849, - "rewards/pad": 0.203125, - "step": 1107 - }, - { - "completion_length": 98.703125, - "epoch": 0.35309114085404714, - "grad_norm": 41.61372375488281, - "kl": 0.11279296875, - "learning_rate": 6.469088591459528e-07, - "loss": 0.0045, - "reward": 1.8001890182495117, - "reward_std": 0.09147683531045914, - "rewards/pad": 0.46875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.33143895864486694, - "step": 1108 - }, - { - "completion_length": 69.34375, - "epoch": 0.3534098151688974, - "grad_norm": 20.674589157104492, - "kl": 0.20703125, - "learning_rate": 6.465901848311026e-07, - "loss": 0.0083, - "reward": 1.4114694595336914, - "reward_std": 0.09746737778186798, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4114694595336914, - "rewards/pad": 0.0, - "step": 1109 - }, - { - "completion_length": 70.203125, - "epoch": 0.35372848948374763, - "grad_norm": 61.68789291381836, - "kl": 0.12890625, - "learning_rate": 6.462715105162523e-07, - "loss": 0.0052, - "reward": 1.512228012084961, - "reward_std": 0.07808954268693924, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5122280716896057, - "step": 1110 - }, - { - "completion_length": 96.953125, - "epoch": 0.35404716379859785, - "grad_norm": 20.580629348754883, - "kl": 0.1220703125, - "learning_rate": 6.459528362014021e-07, - "loss": 0.0049, - "reward": 1.508711576461792, - "reward_std": 0.0672249048948288, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.508711576461792, - "rewards/pad": 0.0, - "step": 1111 - }, - { - "completion_length": 123.765625, - "epoch": 0.3543658381134481, - "grad_norm": 35.10501480102539, - "kl": 0.0654296875, - "learning_rate": 6.45634161886552e-07, - "loss": 0.0026, - "reward": 1.6096165180206299, - "reward_std": 0.04549971967935562, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3596165180206299, - "step": 1112 - }, - { - "completion_length": 70.3125, - "epoch": 0.3546845124282983, - "grad_norm": 94.42025756835938, - "kl": 0.169921875, - "learning_rate": 6.453154875717018e-07, - "loss": 0.0068, - "reward": 1.668575406074524, - "reward_std": 0.14063525199890137, - "rewards/answer_reward": 0.1875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.48107534646987915, - "step": 1113 - }, - { - "completion_length": 93.328125, - "epoch": 0.3550031867431485, - "grad_norm": 13.738290786743164, - "kl": 0.1279296875, - "learning_rate": 6.449968132568515e-07, - "loss": 0.0051, - "reward": 1.3417246341705322, - "reward_std": 0.059449244290590286, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3417246341705322, - "rewards/pad": 0.0, - "step": 1114 - }, - { - "completion_length": 70.234375, - "epoch": 0.35532186105799873, - "grad_norm": 26.223695755004883, - "kl": 0.1279296875, - "learning_rate": 6.446781389420013e-07, - "loss": 0.0051, - "reward": 1.6727383136749268, - "reward_std": 0.05653863400220871, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.547738254070282, - "rewards/pad": 0.125, - "step": 1115 - }, - { - "completion_length": 46.046875, - "epoch": 0.35564053537284895, - "grad_norm": 252.97560119628906, - "kl": 0.1953125, - "learning_rate": 6.443594646271511e-07, - "loss": 0.0078, - "reward": 1.8712449073791504, - "reward_std": 0.10797697305679321, - "rewards/answer_reward": 0.5, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3712449073791504, - "step": 1116 - }, - { - "completion_length": 68.796875, - "epoch": 0.3559592096876992, - "grad_norm": 14.689714431762695, - "kl": 0.12890625, - "learning_rate": 6.440407903123009e-07, - "loss": 0.0051, - "reward": 1.6983696222305298, - "reward_std": 0.05995948985219002, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44836968183517456, - "step": 1117 - }, - { - "completion_length": 69.078125, - "epoch": 0.3562778840025494, - "grad_norm": 27.807676315307617, - "kl": 0.197265625, - "learning_rate": 6.437221159974505e-07, - "loss": 0.0079, - "reward": 1.6848386526107788, - "reward_std": 0.07083934545516968, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5598386526107788, - "step": 1118 - }, - { - "completion_length": 150.578125, - "epoch": 0.3565965583173996, - "grad_norm": 17.879209518432617, - "kl": 0.09765625, - "learning_rate": 6.434034416826003e-07, - "loss": 0.0039, - "reward": 1.50717031955719, - "reward_std": 0.07021446526050568, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38217031955718994, - "step": 1119 - }, - { - "completion_length": 95.578125, - "epoch": 0.35691523263224983, - "grad_norm": 39.17152404785156, - "kl": 0.130859375, - "learning_rate": 6.430847673677501e-07, - "loss": 0.0052, - "reward": 1.4707567691802979, - "reward_std": 0.02951684780418873, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47075676918029785, - "rewards/pad": 0.0, - "step": 1120 - }, - { - "completion_length": 71.09375, - "epoch": 0.35723390694710005, - "grad_norm": 20.048585891723633, - "kl": 0.2021484375, - "learning_rate": 6.427660930528999e-07, - "loss": 0.0081, - "reward": 1.9080220460891724, - "reward_std": 0.13876330852508545, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5486471056938171, - "rewards/pad": 0.359375, - "step": 1121 - }, - { - "completion_length": 71.546875, - "epoch": 0.3575525812619503, - "grad_norm": 36.162559509277344, - "kl": 0.1484375, - "learning_rate": 6.424474187380496e-07, - "loss": 0.0059, - "reward": 1.79642915725708, - "reward_std": 0.07941967248916626, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5464290976524353, - "rewards/pad": 0.25, - "step": 1122 - }, - { - "completion_length": 94.0625, - "epoch": 0.3578712555768005, - "grad_norm": 42.912445068359375, - "kl": 0.1181640625, - "learning_rate": 6.421287444231994e-07, - "loss": 0.0047, - "reward": 1.6768763065338135, - "reward_std": 0.06940555572509766, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6768762469291687, - "rewards/pad": 0.0, - "step": 1123 - }, - { - "completion_length": 94.75, - "epoch": 0.3581899298916507, - "grad_norm": 39.69688415527344, - "kl": 0.1240234375, - "learning_rate": 6.418100701083492e-07, - "loss": 0.005, - "reward": 1.3529894351959229, - "reward_std": 0.049606405198574066, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3529894948005676, - "rewards/pad": 0.0, - "step": 1124 - }, - { - "completion_length": 147.765625, - "epoch": 0.35850860420650094, - "grad_norm": 25.249536514282227, - "kl": 0.08837890625, - "learning_rate": 6.41491395793499e-07, - "loss": 0.0035, - "reward": 1.538456678390503, - "reward_std": 0.0675881952047348, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4134565591812134, - "rewards/pad": 0.125, - "step": 1125 - }, - { - "completion_length": 72.0625, - "epoch": 0.35882727852135116, - "grad_norm": 115.1136703491211, - "kl": 0.12451171875, - "learning_rate": 6.411727214786487e-07, - "loss": 0.005, - "reward": 1.5841870307922363, - "reward_std": 0.10195166617631912, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45918700098991394, - "rewards/pad": 0.125, - "step": 1126 - }, - { - "completion_length": 72.90625, - "epoch": 0.3591459528362014, - "grad_norm": 18.511503219604492, - "kl": 0.185546875, - "learning_rate": 6.408540471637985e-07, - "loss": 0.0074, - "reward": 1.6491913795471191, - "reward_std": 0.09406599402427673, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.39919140934944153, - "step": 1127 - }, - { - "completion_length": 173.046875, - "epoch": 0.35946462715105165, - "grad_norm": 20.802494049072266, - "kl": 0.107421875, - "learning_rate": 6.405353728489483e-07, - "loss": 0.0043, - "reward": 1.3294429779052734, - "reward_std": 0.048001088201999664, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.32944291830062866, - "step": 1128 - }, - { - "completion_length": 71.421875, - "epoch": 0.3597833014659019, - "grad_norm": 50.930999755859375, - "kl": 0.423828125, - "learning_rate": 6.402166985340981e-07, - "loss": 0.0169, - "reward": 1.7195351123809814, - "reward_std": 0.10376234352588654, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5945351123809814, - "step": 1129 - }, - { - "completion_length": 96.796875, - "epoch": 0.3601019757807521, - "grad_norm": 17.261219024658203, - "kl": 0.216796875, - "learning_rate": 6.398980242192478e-07, - "loss": 0.0086, - "reward": 1.6543177366256714, - "reward_std": 0.14827221632003784, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5293177366256714, - "rewards/pad": 0.125, - "step": 1130 - }, - { - "completion_length": 122.40625, - "epoch": 0.3604206500956023, - "grad_norm": 7.444701671600342, - "kl": 0.08544921875, - "learning_rate": 6.395793499043977e-07, - "loss": 0.0034, - "reward": 1.3980233669281006, - "reward_std": 0.04345095157623291, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.273023396730423, - "rewards/pad": 0.125, - "step": 1131 - }, - { - "completion_length": 95.921875, - "epoch": 0.36073932441045253, - "grad_norm": 17.02141571044922, - "kl": 0.109375, - "learning_rate": 6.392606755895475e-07, - "loss": 0.0044, - "reward": 1.536940574645996, - "reward_std": 0.03979106992483139, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5369405150413513, - "rewards/pad": 0.0, - "step": 1132 - }, - { - "completion_length": 100.96875, - "epoch": 0.36105799872530275, - "grad_norm": 26.141685485839844, - "kl": 0.1875, - "learning_rate": 6.389420012746973e-07, - "loss": 0.0075, - "reward": 1.7413113117218018, - "reward_std": 0.07656162232160568, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.36631134152412415, - "step": 1133 - }, - { - "completion_length": 46.203125, - "epoch": 0.361376673040153, - "grad_norm": 37.33512878417969, - "kl": 0.20703125, - "learning_rate": 6.38623326959847e-07, - "loss": 0.0083, - "reward": 1.7170262336730957, - "reward_std": 0.11353346705436707, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4982762336730957, - "step": 1134 - }, - { - "completion_length": 121.359375, - "epoch": 0.3616953473550032, - "grad_norm": 16.106428146362305, - "kl": 0.103515625, - "learning_rate": 6.383046526449968e-07, - "loss": 0.0041, - "reward": 1.5457892417907715, - "reward_std": 0.1631377637386322, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.5770392417907715, - "rewards/pad": 0.0, - "step": 1135 - }, - { - "completion_length": 19.734375, - "epoch": 0.3620140216698534, - "grad_norm": 43.21052169799805, - "kl": 0.138671875, - "learning_rate": 6.379859783301466e-07, - "loss": 0.0056, - "reward": 1.8942912817001343, - "reward_std": 0.07131467759609222, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6442912220954895, - "rewards/pad": 0.25, - "step": 1136 - }, - { - "completion_length": 122.40625, - "epoch": 0.36233269598470363, - "grad_norm": 27.354520797729492, - "kl": 0.1123046875, - "learning_rate": 6.376673040152964e-07, - "loss": 0.0045, - "reward": 1.372056245803833, - "reward_std": 0.0385003387928009, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.372056245803833, - "rewards/pad": 0.0, - "step": 1137 - }, - { - "completion_length": 46.96875, - "epoch": 0.36265137029955385, - "grad_norm": 91.3863296508789, - "kl": 0.1689453125, - "learning_rate": 6.373486297004461e-07, - "loss": 0.0067, - "reward": 1.6147258281707764, - "reward_std": 0.1302720457315445, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47410082817077637, - "rewards/pad": 0.140625, - "step": 1138 - }, - { - "completion_length": 144.875, - "epoch": 0.3629700446144041, - "grad_norm": 15.614913940429688, - "kl": 0.099609375, - "learning_rate": 6.370299553855959e-07, - "loss": 0.004, - "reward": 1.4381680488586426, - "reward_std": 0.051587171852588654, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4381680488586426, - "step": 1139 - }, - { - "completion_length": 74.890625, - "epoch": 0.3632887189292543, - "grad_norm": 26.624767303466797, - "kl": 0.14453125, - "learning_rate": 6.367112810707457e-07, - "loss": 0.0058, - "reward": 1.7303261756896973, - "reward_std": 0.05172932520508766, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48032620549201965, - "rewards/pad": 0.25, - "step": 1140 - }, - { - "completion_length": 123.015625, - "epoch": 0.3636073932441045, - "grad_norm": 52.83194351196289, - "kl": 0.12158203125, - "learning_rate": 6.363926067558954e-07, - "loss": 0.0049, - "reward": 1.5532702207565308, - "reward_std": 0.07574497163295746, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.553270161151886, - "step": 1141 - }, - { - "completion_length": 43.84375, - "epoch": 0.36392606755895474, - "grad_norm": 18.857934951782227, - "kl": 0.23828125, - "learning_rate": 6.360739324410452e-07, - "loss": 0.0095, - "reward": 1.6999969482421875, - "reward_std": 0.1190653145313263, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5749968886375427, - "rewards/pad": 0.125, - "step": 1142 - }, - { - "completion_length": 72.453125, - "epoch": 0.36424474187380496, - "grad_norm": 49.3443603515625, - "kl": 0.2236328125, - "learning_rate": 6.35755258126195e-07, - "loss": 0.0089, - "reward": 1.822791576385498, - "reward_std": 0.11447308957576752, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5727915167808533, - "rewards/pad": 0.25, - "step": 1143 - }, - { - "completion_length": 69.9375, - "epoch": 0.3645634161886552, - "grad_norm": 23.304380416870117, - "kl": 0.1923828125, - "learning_rate": 6.354365838113448e-07, - "loss": 0.0077, - "reward": 1.5509254932403564, - "reward_std": 0.11732175946235657, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.550925612449646, - "rewards/pad": 0.0, - "step": 1144 - }, - { - "completion_length": 45.34375, - "epoch": 0.3648820905035054, - "grad_norm": 21.73938751220703, - "kl": 0.2109375, - "learning_rate": 6.351179094964945e-07, - "loss": 0.0084, - "reward": 1.6023046970367432, - "reward_std": 0.08304779976606369, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6023047566413879, - "rewards/pad": 0.0, - "step": 1145 - }, - { - "completion_length": 71.578125, - "epoch": 0.3652007648183556, - "grad_norm": 45.14289474487305, - "kl": 0.1376953125, - "learning_rate": 6.347992351816443e-07, - "loss": 0.0055, - "reward": 1.6271437406539917, - "reward_std": 0.11034458130598068, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5177686810493469, - "rewards/pad": 0.109375, - "step": 1146 - }, - { - "completion_length": 122.375, - "epoch": 0.36551943913320584, - "grad_norm": 45.17229461669922, - "kl": 0.0947265625, - "learning_rate": 6.344805608667941e-07, - "loss": 0.0038, - "reward": 1.6036796569824219, - "reward_std": 0.06949443370103836, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47867974638938904, - "step": 1147 - }, - { - "completion_length": 95.96875, - "epoch": 0.3658381134480561, - "grad_norm": 23.348485946655273, - "kl": 0.1748046875, - "learning_rate": 6.34161886551944e-07, - "loss": 0.007, - "reward": 1.8184211254119873, - "reward_std": 0.07187941670417786, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6934211254119873, - "step": 1148 - }, - { - "completion_length": 98.015625, - "epoch": 0.36615678776290633, - "grad_norm": 51.460044860839844, - "kl": 0.1103515625, - "learning_rate": 6.338432122370936e-07, - "loss": 0.0044, - "reward": 1.6390737295150757, - "reward_std": 0.11423532664775848, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49844881892204285, - "step": 1149 - }, - { - "completion_length": 123.265625, - "epoch": 0.36647546207775655, - "grad_norm": 35.537864685058594, - "kl": 0.1181640625, - "learning_rate": 6.335245379222435e-07, - "loss": 0.0047, - "reward": 1.4418638944625854, - "reward_std": 0.1120540201663971, - "rewards/answer_reward": 0.078125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.36373889446258545, - "step": 1150 - }, - { - "completion_length": 45.078125, - "epoch": 0.3667941363926068, - "grad_norm": 85.31786346435547, - "kl": 0.140625, - "learning_rate": 6.332058636073933e-07, - "loss": 0.0056, - "reward": 1.5280697345733643, - "reward_std": 0.06672141700983047, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5280698537826538, - "rewards/pad": 0.0, - "step": 1151 - }, - { - "completion_length": 124.5, - "epoch": 0.367112810707457, - "grad_norm": 18.085052490234375, - "kl": 0.08642578125, - "learning_rate": 6.328871892925431e-07, - "loss": 0.0035, - "reward": 1.5425379276275635, - "reward_std": 0.047018010169267654, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4175378978252411, - "step": 1152 - }, - { - "completion_length": 122.03125, - "epoch": 0.3674314850223072, - "grad_norm": 41.6903076171875, - "kl": 0.2177734375, - "learning_rate": 6.325685149776928e-07, - "loss": 0.0087, - "reward": 1.4459846019744873, - "reward_std": 0.09456399083137512, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4459846615791321, - "rewards/pad": 0.0, - "step": 1153 - }, - { - "completion_length": 71.5, - "epoch": 0.36775015933715743, - "grad_norm": 16.231029510498047, - "kl": 0.1591796875, - "learning_rate": 6.322498406628426e-07, - "loss": 0.0064, - "reward": 1.67655611038208, - "reward_std": 0.11449247598648071, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5515559911727905, - "step": 1154 - }, - { - "completion_length": 97.703125, - "epoch": 0.36806883365200765, - "grad_norm": 14.319849967956543, - "kl": 0.095703125, - "learning_rate": 6.319311663479924e-07, - "loss": 0.0038, - "reward": 1.6368542909622192, - "reward_std": 0.08659229427576065, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6368542909622192, - "rewards/pad": 0.0, - "step": 1155 - }, - { - "completion_length": 98.09375, - "epoch": 0.3683875079668579, - "grad_norm": 27.879743576049805, - "kl": 0.08203125, - "learning_rate": 6.316124920331422e-07, - "loss": 0.0033, - "reward": 1.854182481765747, - "reward_std": 0.13125072419643402, - "rewards/pad": 0.5, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3698073625564575, - "step": 1156 - }, - { - "completion_length": 98.9375, - "epoch": 0.3687061822817081, - "grad_norm": 71.60199737548828, - "kl": 0.1923828125, - "learning_rate": 6.312938177182919e-07, - "loss": 0.0077, - "reward": 1.548559546470642, - "reward_std": 0.05056636035442352, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5485595464706421, - "step": 1157 - }, - { - "completion_length": 52.703125, - "epoch": 0.3690248565965583, - "grad_norm": 134.50344848632812, - "kl": 0.142578125, - "learning_rate": 6.309751434034416e-07, - "loss": 0.0057, - "reward": 1.6984164714813232, - "reward_std": 0.13021734356880188, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5890415906906128, - "rewards/pad": 0.125, - "step": 1158 - }, - { - "completion_length": 147.734375, - "epoch": 0.36934353091140854, - "grad_norm": 50.460662841796875, - "kl": 0.10205078125, - "learning_rate": 6.306564690885914e-07, - "loss": 0.0041, - "reward": 1.4703478813171387, - "reward_std": 0.04957844689488411, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47034788131713867, - "step": 1159 - }, - { - "completion_length": 71.421875, - "epoch": 0.36966220522625876, - "grad_norm": 28.34197425842285, - "kl": 0.1484375, - "learning_rate": 6.303377947737412e-07, - "loss": 0.0059, - "reward": 1.50392746925354, - "reward_std": 0.05638600140810013, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3789275586605072, - "step": 1160 - }, - { - "completion_length": 70.875, - "epoch": 0.369980879541109, - "grad_norm": 104.68994903564453, - "kl": 0.212890625, - "learning_rate": 6.300191204588909e-07, - "loss": 0.0085, - "reward": 1.6136631965637207, - "reward_std": 0.08198197185993195, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4886631369590759, - "rewards/pad": 0.125, - "step": 1161 - }, - { - "completion_length": 121.9375, - "epoch": 0.3702995538559592, - "grad_norm": 58.8842658996582, - "kl": 0.1083984375, - "learning_rate": 6.297004461440407e-07, - "loss": 0.0043, - "reward": 1.454060673713684, - "reward_std": 0.08632092922925949, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4540606439113617, - "rewards/pad": 0.0, - "step": 1162 - }, - { - "completion_length": 94.828125, - "epoch": 0.3706182281708094, - "grad_norm": 27.434877395629883, - "kl": 0.37109375, - "learning_rate": 6.293817718291905e-07, - "loss": 0.0149, - "reward": 1.4339649677276611, - "reward_std": 0.10790198296308517, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4339650273323059, - "rewards/pad": 0.0, - "step": 1163 - }, - { - "completion_length": 149.75, - "epoch": 0.37093690248565964, - "grad_norm": 19.707164764404297, - "kl": 0.08740234375, - "learning_rate": 6.290630975143403e-07, - "loss": 0.0035, - "reward": 1.531341552734375, - "reward_std": 0.04004561901092529, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.531341552734375, - "step": 1164 - }, - { - "completion_length": 72.828125, - "epoch": 0.37125557680050986, - "grad_norm": 88.5399169921875, - "kl": 0.10400390625, - "learning_rate": 6.2874442319949e-07, - "loss": 0.0042, - "reward": 1.9113043546676636, - "reward_std": 0.23165732622146606, - "rewards/answer_reward": 0.359375, - "rewards/format_reward_gqa": 0.96875, - "rewards/iou_glue_reward": 0.5831793546676636, - "step": 1165 - }, - { - "completion_length": 149.5, - "epoch": 0.3715742511153601, - "grad_norm": 16.100812911987305, - "kl": 0.181640625, - "learning_rate": 6.284257488846398e-07, - "loss": 0.0073, - "reward": 1.3362720012664795, - "reward_std": 0.04164520651102066, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.33627191185951233, - "rewards/pad": 0.0, - "step": 1166 - }, - { - "completion_length": 95.109375, - "epoch": 0.37189292543021035, - "grad_norm": 20.495891571044922, - "kl": 0.1298828125, - "learning_rate": 6.281070745697896e-07, - "loss": 0.0052, - "reward": 1.5656877756118774, - "reward_std": 0.047522690147161484, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5656878352165222, - "rewards/pad": 0.0, - "step": 1167 - }, - { - "completion_length": 69.09375, - "epoch": 0.3722115997450606, - "grad_norm": 33.39354705810547, - "kl": 0.1572265625, - "learning_rate": 6.277884002549393e-07, - "loss": 0.0063, - "reward": 1.602851390838623, - "reward_std": 0.13379782438278198, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.618476390838623, - "step": 1168 - }, - { - "completion_length": 70.90625, - "epoch": 0.3725302740599108, - "grad_norm": 88.5570068359375, - "kl": 0.1943359375, - "learning_rate": 6.274697259400892e-07, - "loss": 0.0078, - "reward": 1.6135051250457764, - "reward_std": 0.1037006676197052, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.48850518465042114, - "step": 1169 - }, - { - "completion_length": 98.890625, - "epoch": 0.372848948374761, - "grad_norm": 27.904115676879883, - "kl": 0.166015625, - "learning_rate": 6.27151051625239e-07, - "loss": 0.0066, - "reward": 1.5022716522216797, - "reward_std": 0.06810413300991058, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.37727171182632446, - "rewards/pad": 0.125, - "step": 1170 - }, - { - "completion_length": 45.5625, - "epoch": 0.37316762268961123, - "grad_norm": 35.16514587402344, - "kl": 0.2216796875, - "learning_rate": 6.268323773103888e-07, - "loss": 0.0089, - "reward": 1.522203803062439, - "reward_std": 0.1130586639046669, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5065787434577942, - "rewards/pad": 0.015625, - "step": 1171 - }, - { - "completion_length": 122.375, - "epoch": 0.37348629700446145, - "grad_norm": 152.366455078125, - "kl": 0.10693359375, - "learning_rate": 6.265137029955385e-07, - "loss": 0.0043, - "reward": 1.3803231716156006, - "reward_std": 0.08589351922273636, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.2709481418132782, - "step": 1172 - }, - { - "completion_length": 72.640625, - "epoch": 0.3738049713193117, - "grad_norm": 19.461612701416016, - "kl": 0.205078125, - "learning_rate": 6.261950286806883e-07, - "loss": 0.0082, - "reward": 1.5194697380065918, - "reward_std": 0.09573933482170105, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.519469678401947, - "rewards/pad": 0.0, - "step": 1173 - }, - { - "completion_length": 121.671875, - "epoch": 0.3741236456341619, - "grad_norm": 21.8033390045166, - "kl": 0.12109375, - "learning_rate": 6.258763543658381e-07, - "loss": 0.0048, - "reward": 1.5664465427398682, - "reward_std": 0.10758214443922043, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5508216619491577, - "step": 1174 - }, - { - "completion_length": 149.265625, - "epoch": 0.3744423199490121, - "grad_norm": 76.57758331298828, - "kl": 0.11572265625, - "learning_rate": 6.255576800509879e-07, - "loss": 0.0046, - "reward": 1.3753361701965332, - "reward_std": 0.0831504836678505, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3753361701965332, - "step": 1175 - }, - { - "completion_length": 70.6875, - "epoch": 0.37476099426386233, - "grad_norm": 37.332637786865234, - "kl": 0.203125, - "learning_rate": 6.252390057361376e-07, - "loss": 0.0081, - "reward": 1.3779017925262451, - "reward_std": 0.11319969594478607, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3779017925262451, - "step": 1176 - }, - { - "completion_length": 71.0625, - "epoch": 0.37507966857871256, - "grad_norm": 33.94636535644531, - "kl": 0.1513671875, - "learning_rate": 6.249203314212874e-07, - "loss": 0.0061, - "reward": 1.6451020240783691, - "reward_std": 0.14212313294410706, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5982270240783691, - "step": 1177 - }, - { - "completion_length": 44.984375, - "epoch": 0.3753983428935628, - "grad_norm": 32.17403030395508, - "kl": 0.1904296875, - "learning_rate": 6.246016571064372e-07, - "loss": 0.0076, - "reward": 1.541552186012268, - "reward_std": 0.0988098680973053, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41655224561691284, - "rewards/pad": 0.125, - "step": 1178 - }, - { - "completion_length": 121.046875, - "epoch": 0.375717017208413, - "grad_norm": 14.725617408752441, - "kl": 0.1328125, - "learning_rate": 6.24282982791587e-07, - "loss": 0.0053, - "reward": 1.4356043338775635, - "reward_std": 0.0906897485256195, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.451229453086853, - "step": 1179 - }, - { - "completion_length": 151.296875, - "epoch": 0.3760356915232632, - "grad_norm": 13.176466941833496, - "kl": 0.09765625, - "learning_rate": 6.239643084767367e-07, - "loss": 0.0039, - "reward": 1.492295265197754, - "reward_std": 0.09461408853530884, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5079202651977539, - "step": 1180 - }, - { - "completion_length": 97.21875, - "epoch": 0.37635436583811344, - "grad_norm": 38.49825668334961, - "kl": 0.1787109375, - "learning_rate": 6.236456341618865e-07, - "loss": 0.0072, - "reward": 1.4638856649398804, - "reward_std": 0.04449250549077988, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46388566493988037, - "step": 1181 - }, - { - "completion_length": 148.8125, - "epoch": 0.37667304015296366, - "grad_norm": 21.729101181030273, - "kl": 0.07763671875, - "learning_rate": 6.233269598470363e-07, - "loss": 0.0031, - "reward": 1.580154299736023, - "reward_std": 0.029881590977311134, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4551542401313782, - "step": 1182 - }, - { - "completion_length": 150.09375, - "epoch": 0.3769917144678139, - "grad_norm": 25.908597946166992, - "kl": 0.07421875, - "learning_rate": 6.230082855321861e-07, - "loss": 0.003, - "reward": 1.4306869506835938, - "reward_std": 0.032189082354307175, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4306868314743042, - "step": 1183 - }, - { - "completion_length": 99.0, - "epoch": 0.3773103887826641, - "grad_norm": 25.995725631713867, - "kl": 0.09716796875, - "learning_rate": 6.226896112173358e-07, - "loss": 0.0039, - "reward": 1.934666395187378, - "reward_std": 0.05638587474822998, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6846664547920227, - "step": 1184 - }, - { - "completion_length": 96.15625, - "epoch": 0.3776290630975143, - "grad_norm": 36.28327560424805, - "kl": 0.173828125, - "learning_rate": 6.223709369024856e-07, - "loss": 0.0069, - "reward": 1.549536943435669, - "reward_std": 0.14491955935955048, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5495370626449585, - "rewards/pad": 0.0, - "step": 1185 - }, - { - "completion_length": 125.859375, - "epoch": 0.37794773741236454, - "grad_norm": 57.64690399169922, - "kl": 0.13671875, - "learning_rate": 6.220522625876354e-07, - "loss": 0.0055, - "reward": 1.4673360586166382, - "reward_std": 0.062283311039209366, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3423360586166382, - "step": 1186 - }, - { - "completion_length": 45.984375, - "epoch": 0.3782664117272148, - "grad_norm": 64.22225189208984, - "kl": 0.1513671875, - "learning_rate": 6.217335882727853e-07, - "loss": 0.006, - "reward": 1.6241674423217773, - "reward_std": 0.09040925651788712, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6241674423217773, - "rewards/pad": 0.0, - "step": 1187 - }, - { - "completion_length": 71.9375, - "epoch": 0.37858508604206503, - "grad_norm": 114.3721923828125, - "kl": 0.1259765625, - "learning_rate": 6.21414913957935e-07, - "loss": 0.005, - "reward": 1.732898473739624, - "reward_std": 0.12032415717840195, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.514148473739624, - "step": 1188 - }, - { - "completion_length": 97.3125, - "epoch": 0.37890376035691525, - "grad_norm": 35.77212142944336, - "kl": 0.1298828125, - "learning_rate": 6.210962396430848e-07, - "loss": 0.0052, - "reward": 1.8220728635787964, - "reward_std": 0.1216290220618248, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5876979231834412, - "rewards/pad": 0.234375, - "step": 1189 - }, - { - "completion_length": 120.6875, - "epoch": 0.3792224346717655, - "grad_norm": 23.409149169921875, - "kl": 0.11474609375, - "learning_rate": 6.207775653282346e-07, - "loss": 0.0046, - "reward": 1.434619426727295, - "reward_std": 0.04008358716964722, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43461936712265015, - "step": 1190 - }, - { - "completion_length": 124.96875, - "epoch": 0.3795411089866157, - "grad_norm": 38.4090576171875, - "kl": 0.1337890625, - "learning_rate": 6.204588910133844e-07, - "loss": 0.0053, - "reward": 1.5721570253372192, - "reward_std": 0.045577432960271835, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44715702533721924, - "rewards/pad": 0.125, - "step": 1191 - }, - { - "completion_length": 17.8125, - "epoch": 0.3798597833014659, - "grad_norm": 47.2709846496582, - "kl": 0.1533203125, - "learning_rate": 6.201402166985341e-07, - "loss": 0.0061, - "reward": 1.6977386474609375, - "reward_std": 0.048093028366565704, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.697738766670227, - "rewards/pad": 0.0, - "step": 1192 - }, - { - "completion_length": 150.890625, - "epoch": 0.38017845761631613, - "grad_norm": 17.609628677368164, - "kl": 0.08056640625, - "learning_rate": 6.198215423836839e-07, - "loss": 0.0032, - "reward": 1.5538386106491089, - "reward_std": 0.10176517069339752, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5694636106491089, - "step": 1193 - }, - { - "completion_length": 98.546875, - "epoch": 0.38049713193116635, - "grad_norm": 36.28522872924805, - "kl": 0.466796875, - "learning_rate": 6.195028680688337e-07, - "loss": 0.0186, - "reward": 1.6114730834960938, - "reward_std": 0.07779565453529358, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48647308349609375, - "step": 1194 - }, - { - "completion_length": 72.40625, - "epoch": 0.3808158062460166, - "grad_norm": 50.89261245727539, - "kl": 0.2353515625, - "learning_rate": 6.191841937539835e-07, - "loss": 0.0094, - "reward": 1.6488547325134277, - "reward_std": 0.14085832238197327, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5082297921180725, - "rewards/pad": 0.140625, - "step": 1195 - }, - { - "completion_length": 71.03125, - "epoch": 0.3811344805608668, - "grad_norm": 41.0035514831543, - "kl": 0.150390625, - "learning_rate": 6.188655194391332e-07, - "loss": 0.006, - "reward": 1.6350042819976807, - "reward_std": 0.05474071204662323, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6350042819976807, - "rewards/pad": 0.0, - "step": 1196 - }, - { - "completion_length": 45.1875, - "epoch": 0.381453154875717, - "grad_norm": 38.76614761352539, - "kl": 0.298828125, - "learning_rate": 6.185468451242829e-07, - "loss": 0.012, - "reward": 1.6334543228149414, - "reward_std": 0.07611322402954102, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6334543228149414, - "rewards/pad": 0.0, - "step": 1197 - }, - { - "completion_length": 98.0625, - "epoch": 0.38177182919056724, - "grad_norm": 15.111804962158203, - "kl": 0.1328125, - "learning_rate": 6.182281708094327e-07, - "loss": 0.0053, - "reward": 1.5155563354492188, - "reward_std": 0.040137313306331635, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5155563354492188, - "step": 1198 - }, - { - "completion_length": 152.421875, - "epoch": 0.38209050350541746, - "grad_norm": 27.031673431396484, - "kl": 0.169921875, - "learning_rate": 6.179094964945824e-07, - "loss": 0.0068, - "reward": 1.5351272821426392, - "reward_std": 0.10124079883098602, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3945022225379944, - "step": 1199 - }, - { - "completion_length": 98.328125, - "epoch": 0.3824091778202677, - "grad_norm": 64.51046752929688, - "kl": 0.1982421875, - "learning_rate": 6.175908221797322e-07, - "loss": 0.0079, - "reward": 1.6580084562301636, - "reward_std": 0.11241358518600464, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4861334562301636, - "rewards/pad": 0.171875, - "step": 1200 - }, - { - "completion_length": 150.921875, - "epoch": 0.3827278521351179, - "grad_norm": 30.153474807739258, - "kl": 0.09033203125, - "learning_rate": 6.17272147864882e-07, - "loss": 0.0036, - "reward": 1.6002533435821533, - "reward_std": 0.04925467073917389, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47525322437286377, - "step": 1201 - }, - { - "completion_length": 123.421875, - "epoch": 0.3830465264499681, - "grad_norm": 12.115293502807617, - "kl": 0.115234375, - "learning_rate": 6.169534735500318e-07, - "loss": 0.0046, - "reward": 1.4110045433044434, - "reward_std": 0.035006508231163025, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.28600451350212097, - "step": 1202 - }, - { - "completion_length": 147.078125, - "epoch": 0.38336520076481834, - "grad_norm": 10.549017906188965, - "kl": 0.11474609375, - "learning_rate": 6.166347992351815e-07, - "loss": 0.0046, - "reward": 1.5266410112380981, - "reward_std": 0.07672546058893204, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5266410112380981, - "step": 1203 - }, - { - "completion_length": 70.015625, - "epoch": 0.38368387507966856, - "grad_norm": 71.49323272705078, - "kl": 0.1259765625, - "learning_rate": 6.163161249203313e-07, - "loss": 0.0051, - "reward": 1.581700086593628, - "reward_std": 0.11010384559631348, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5192000865936279, - "rewards/pad": 0.0625, - "step": 1204 - }, - { - "completion_length": 150.140625, - "epoch": 0.3840025493945188, - "grad_norm": 19.70912742614746, - "kl": 0.08984375, - "learning_rate": 6.159974506054811e-07, - "loss": 0.0036, - "reward": 1.5954060554504395, - "reward_std": 0.06902284920215607, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4704059660434723, - "step": 1205 - }, - { - "completion_length": 122.296875, - "epoch": 0.384321223709369, - "grad_norm": 16.765697479248047, - "kl": 0.11962890625, - "learning_rate": 6.15678776290631e-07, - "loss": 0.0048, - "reward": 1.4820270538330078, - "reward_std": 0.07253751158714294, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.372652143239975, - "step": 1206 - }, - { - "completion_length": 20.453125, - "epoch": 0.3846398980242193, - "grad_norm": 39.035213470458984, - "kl": 0.201171875, - "learning_rate": 6.153601019757807e-07, - "loss": 0.008, - "reward": 1.7460757493972778, - "reward_std": 0.05062546581029892, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.49607568979263306, - "step": 1207 - }, - { - "completion_length": 72.0, - "epoch": 0.3849585723390695, - "grad_norm": 78.69184112548828, - "kl": 0.146484375, - "learning_rate": 6.150414276609305e-07, - "loss": 0.0058, - "reward": 1.5769894123077393, - "reward_std": 0.18609334528446198, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4988645017147064, - "rewards/pad": 0.078125, - "step": 1208 - }, - { - "completion_length": 123.75, - "epoch": 0.3852772466539197, - "grad_norm": 49.43526840209961, - "kl": 0.1669921875, - "learning_rate": 6.147227533460803e-07, - "loss": 0.0067, - "reward": 1.592034101486206, - "reward_std": 0.051229365170001984, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46703410148620605, - "rewards/pad": 0.125, - "step": 1209 - }, - { - "completion_length": 122.46875, - "epoch": 0.38559592096876993, - "grad_norm": 13.823474884033203, - "kl": 0.1748046875, - "learning_rate": 6.144040790312301e-07, - "loss": 0.007, - "reward": 1.4730191230773926, - "reward_std": 0.11505133658647537, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4886440932750702, - "step": 1210 - }, - { - "completion_length": 70.1875, - "epoch": 0.38591459528362015, - "grad_norm": 30.62668228149414, - "kl": 0.1640625, - "learning_rate": 6.140854047163798e-07, - "loss": 0.0066, - "reward": 1.5743595361709595, - "reward_std": 0.06021110713481903, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5743595361709595, - "step": 1211 - }, - { - "completion_length": 123.390625, - "epoch": 0.3862332695984704, - "grad_norm": 15.12385082244873, - "kl": 0.12451171875, - "learning_rate": 6.137667304015296e-07, - "loss": 0.005, - "reward": 1.4223942756652832, - "reward_std": 0.04839033633470535, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4223942756652832, - "rewards/pad": 0.0, - "step": 1212 - }, - { - "completion_length": 66.296875, - "epoch": 0.3865519439133206, - "grad_norm": 37.55725860595703, - "kl": 0.357421875, - "learning_rate": 6.134480560866794e-07, - "loss": 0.0142, - "reward": 1.6173839569091797, - "reward_std": 0.12178170680999756, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5548838973045349, - "rewards/pad": 0.0625, - "step": 1213 - }, - { - "completion_length": 69.59375, - "epoch": 0.3868706182281708, - "grad_norm": 67.85687255859375, - "kl": 0.228515625, - "learning_rate": 6.131293817718292e-07, - "loss": 0.0091, - "reward": 1.5872609615325928, - "reward_std": 0.11316142231225967, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.47788599133491516, - "rewards/pad": 0.125, - "step": 1214 - }, - { - "completion_length": 123.65625, - "epoch": 0.38718929254302104, - "grad_norm": 22.573434829711914, - "kl": 0.171875, - "learning_rate": 6.128107074569789e-07, - "loss": 0.0069, - "reward": 1.4954043626785278, - "reward_std": 0.04548972100019455, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49540436267852783, - "step": 1215 - }, - { - "completion_length": 173.71875, - "epoch": 0.38750796685787126, - "grad_norm": 8.874141693115234, - "kl": 0.059814453125, - "learning_rate": 6.124920331421287e-07, - "loss": 0.0024, - "reward": 1.5322620868682861, - "reward_std": 0.039836518466472626, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5322621464729309, - "rewards/pad": 0.0, - "step": 1216 - }, - { - "completion_length": 123.03125, - "epoch": 0.3878266411727215, - "grad_norm": 14.58663272857666, - "kl": 0.134765625, - "learning_rate": 6.121733588272785e-07, - "loss": 0.0054, - "reward": 1.5366692543029785, - "reward_std": 0.07987186312675476, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.42729437351226807, - "step": 1217 - }, - { - "completion_length": 98.171875, - "epoch": 0.3881453154875717, - "grad_norm": 76.23131561279297, - "kl": 0.166015625, - "learning_rate": 6.118546845124283e-07, - "loss": 0.0066, - "reward": 1.5804510116577148, - "reward_std": 0.15513239800930023, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.47107604146003723, - "rewards/pad": 0.125, - "step": 1218 - }, - { - "completion_length": 125.625, - "epoch": 0.3884639898024219, - "grad_norm": 9.716293334960938, - "kl": 0.166015625, - "learning_rate": 6.11536010197578e-07, - "loss": 0.0066, - "reward": 1.4332759380340576, - "reward_std": 0.043724529445171356, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4332759380340576, - "step": 1219 - }, - { - "completion_length": 174.078125, - "epoch": 0.38878266411727214, - "grad_norm": 5.577003479003906, - "kl": 0.1025390625, - "learning_rate": 6.112173358827278e-07, - "loss": 0.0041, - "reward": 1.3279743194580078, - "reward_std": 0.059924960136413574, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3279743492603302, - "step": 1220 - }, - { - "completion_length": 147.234375, - "epoch": 0.38910133843212236, - "grad_norm": 22.81951904296875, - "kl": 0.10205078125, - "learning_rate": 6.108986615678776e-07, - "loss": 0.0041, - "reward": 1.5294336080551147, - "reward_std": 0.06180447340011597, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.52943354845047, - "step": 1221 - }, - { - "completion_length": 123.5, - "epoch": 0.3894200127469726, - "grad_norm": 17.01264190673828, - "kl": 0.21484375, - "learning_rate": 6.105799872530274e-07, - "loss": 0.0086, - "reward": 1.5035171508789062, - "reward_std": 0.09979550540447235, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5035170912742615, - "rewards/pad": 0.0, - "step": 1222 - }, - { - "completion_length": 73.984375, - "epoch": 0.3897386870618228, - "grad_norm": 13.66805362701416, - "kl": 0.1025390625, - "learning_rate": 6.102613129381771e-07, - "loss": 0.0041, - "reward": 1.6973612308502197, - "reward_std": 0.10417403280735016, - "rewards/answer_reward": 0.359375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3379861116409302, - "step": 1223 - }, - { - "completion_length": 97.625, - "epoch": 0.390057361376673, - "grad_norm": 45.69769287109375, - "kl": 0.119140625, - "learning_rate": 6.09942638623327e-07, - "loss": 0.0048, - "reward": 1.6858172416687012, - "reward_std": 0.11278460919857025, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.45144230127334595, - "step": 1224 - }, - { - "completion_length": 96.609375, - "epoch": 0.39037603569152324, - "grad_norm": 67.78718566894531, - "kl": 0.1259765625, - "learning_rate": 6.096239643084768e-07, - "loss": 0.005, - "reward": 1.3379242420196533, - "reward_std": 0.05427855625748634, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3379242718219757, - "rewards/pad": 0.0, - "step": 1225 - }, - { - "completion_length": 97.15625, - "epoch": 0.3906947100063735, - "grad_norm": 29.57921600341797, - "kl": 0.1123046875, - "learning_rate": 6.093052899936266e-07, - "loss": 0.0045, - "reward": 1.5308830738067627, - "reward_std": 0.06973686814308167, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5308831334114075, - "rewards/pad": 0.0, - "step": 1226 - }, - { - "completion_length": 97.53125, - "epoch": 0.39101338432122373, - "grad_norm": 38.909488677978516, - "kl": 0.13671875, - "learning_rate": 6.089866156787763e-07, - "loss": 0.0055, - "reward": 1.3697888851165771, - "reward_std": 0.058314502239227295, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3697889447212219, - "step": 1227 - }, - { - "completion_length": 47.421875, - "epoch": 0.39133205863607395, - "grad_norm": 23.21940803527832, - "kl": 0.173828125, - "learning_rate": 6.086679413639261e-07, - "loss": 0.007, - "reward": 1.8355824947357178, - "reward_std": 0.10905968397855759, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46058255434036255, - "step": 1228 - }, - { - "completion_length": 145.453125, - "epoch": 0.3916507329509242, - "grad_norm": 33.876712799072266, - "kl": 0.1953125, - "learning_rate": 6.083492670490759e-07, - "loss": 0.0078, - "reward": 1.4986937046051025, - "reward_std": 0.045384809374809265, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37369370460510254, - "step": 1229 - }, - { - "completion_length": 45.65625, - "epoch": 0.3919694072657744, - "grad_norm": 30.74664878845215, - "kl": 0.1494140625, - "learning_rate": 6.080305927342257e-07, - "loss": 0.006, - "reward": 1.5915486812591553, - "reward_std": 0.09160721302032471, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4665486812591553, - "rewards/pad": 0.125, - "step": 1230 - }, - { - "completion_length": 98.203125, - "epoch": 0.3922880815806246, - "grad_norm": 296.3717346191406, - "kl": 0.259765625, - "learning_rate": 6.077119184193754e-07, - "loss": 0.0104, - "reward": 1.7653553485870361, - "reward_std": 0.21209806203842163, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6247303485870361, - "step": 1231 - }, - { - "completion_length": 121.4375, - "epoch": 0.39260675589547483, - "grad_norm": 25.76085090637207, - "kl": 0.1689453125, - "learning_rate": 6.073932441045252e-07, - "loss": 0.0068, - "reward": 1.4098402261734009, - "reward_std": 0.061084769666194916, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4098402261734009, - "rewards/pad": 0.0, - "step": 1232 - }, - { - "completion_length": 71.28125, - "epoch": 0.39292543021032506, - "grad_norm": 26.772653579711914, - "kl": 0.189453125, - "learning_rate": 6.07074569789675e-07, - "loss": 0.0076, - "reward": 1.5265135765075684, - "reward_std": 0.057546839118003845, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5265136957168579, - "rewards/pad": 0.0, - "step": 1233 - }, - { - "completion_length": 97.0625, - "epoch": 0.3932441045251753, - "grad_norm": 8.714112281799316, - "kl": 0.1962890625, - "learning_rate": 6.067558954748247e-07, - "loss": 0.0079, - "reward": 1.6967378854751587, - "reward_std": 0.06316147744655609, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5717379450798035, - "step": 1234 - }, - { - "completion_length": 123.671875, - "epoch": 0.3935627788400255, - "grad_norm": 14.036128997802734, - "kl": 0.423828125, - "learning_rate": 6.064372211599745e-07, - "loss": 0.017, - "reward": 1.5465986728668213, - "reward_std": 0.10370682924985886, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5465986132621765, - "step": 1235 - }, - { - "completion_length": 148.5, - "epoch": 0.3938814531548757, - "grad_norm": 7.814291000366211, - "kl": 0.1025390625, - "learning_rate": 6.061185468451242e-07, - "loss": 0.0041, - "reward": 1.4367327690124512, - "reward_std": 0.06396310031414032, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.31173282861709595, - "step": 1236 - }, - { - "completion_length": 95.34375, - "epoch": 0.39420012746972594, - "grad_norm": 16.77633285522461, - "kl": 0.1474609375, - "learning_rate": 6.05799872530274e-07, - "loss": 0.0059, - "reward": 1.5735642910003662, - "reward_std": 0.05050143599510193, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4485642611980438, - "step": 1237 - }, - { - "completion_length": 70.765625, - "epoch": 0.39451880178457616, - "grad_norm": 22.550230026245117, - "kl": 0.15625, - "learning_rate": 6.054811982154237e-07, - "loss": 0.0062, - "reward": 1.5160186290740967, - "reward_std": 0.061520084738731384, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39101865887641907, - "step": 1238 - }, - { - "completion_length": 95.8125, - "epoch": 0.3948374760994264, - "grad_norm": 58.803009033203125, - "kl": 0.1376953125, - "learning_rate": 6.051625239005735e-07, - "loss": 0.0055, - "reward": 1.5313327312469482, - "reward_std": 0.10405335575342178, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42195767164230347, - "rewards/pad": 0.109375, - "step": 1239 - }, - { - "completion_length": 122.25, - "epoch": 0.3951561504142766, - "grad_norm": 49.10667037963867, - "kl": 0.455078125, - "learning_rate": 6.048438495857233e-07, - "loss": 0.0182, - "reward": 1.455152988433838, - "reward_std": 0.04565514251589775, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4551529586315155, - "rewards/pad": 0.0, - "step": 1240 - }, - { - "completion_length": 74.921875, - "epoch": 0.3954748247291268, - "grad_norm": 109.96369171142578, - "kl": 0.1875, - "learning_rate": 6.045251752708731e-07, - "loss": 0.0075, - "reward": 1.6491847038269043, - "reward_std": 0.1602613925933838, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5241846442222595, - "rewards/pad": 0.125, - "step": 1241 - }, - { - "completion_length": 72.859375, - "epoch": 0.39579349904397704, - "grad_norm": 14.490010261535645, - "kl": 0.220703125, - "learning_rate": 6.042065009560228e-07, - "loss": 0.0089, - "reward": 1.7809354066848755, - "reward_std": 0.07316883653402328, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5309354066848755, - "step": 1242 - }, - { - "completion_length": 151.890625, - "epoch": 0.39611217335882726, - "grad_norm": 13.692108154296875, - "kl": 0.09326171875, - "learning_rate": 6.038878266411726e-07, - "loss": 0.0037, - "reward": 1.3232471942901611, - "reward_std": 0.04393507167696953, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.32324713468551636, - "rewards/pad": 0.0, - "step": 1243 - }, - { - "completion_length": 146.484375, - "epoch": 0.3964308476736775, - "grad_norm": 12.829249382019043, - "kl": 0.1123046875, - "learning_rate": 6.035691523263225e-07, - "loss": 0.0045, - "reward": 1.5288894176483154, - "reward_std": 0.0912233367562294, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.41951435804367065, - "step": 1244 - }, - { - "completion_length": 123.625, - "epoch": 0.3967495219885277, - "grad_norm": 26.55255126953125, - "kl": 0.10888671875, - "learning_rate": 6.032504780114723e-07, - "loss": 0.0044, - "reward": 1.429291844367981, - "reward_std": 0.05211643874645233, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4292917847633362, - "step": 1245 - }, - { - "completion_length": 71.390625, - "epoch": 0.397068196303378, - "grad_norm": 36.959205627441406, - "kl": 0.22265625, - "learning_rate": 6.02931803696622e-07, - "loss": 0.0089, - "reward": 1.6715713739395142, - "reward_std": 0.05109931528568268, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5465713739395142, - "rewards/pad": 0.125, - "step": 1246 - }, - { - "completion_length": 94.53125, - "epoch": 0.3973868706182282, - "grad_norm": 51.179443359375, - "kl": 0.11083984375, - "learning_rate": 6.026131293817718e-07, - "loss": 0.0044, - "reward": 1.4588894844055176, - "reward_std": 0.10032698512077332, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4588894844055176, - "step": 1247 - }, - { - "completion_length": 69.46875, - "epoch": 0.3977055449330784, - "grad_norm": 101.04873657226562, - "kl": 0.322265625, - "learning_rate": 6.022944550669216e-07, - "loss": 0.0129, - "reward": 1.4367910623550415, - "reward_std": 0.06670445203781128, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3117910623550415, - "rewards/pad": 0.125, - "step": 1248 - }, - { - "completion_length": 48.53125, - "epoch": 0.39802421924792863, - "grad_norm": 58.24769592285156, - "kl": 0.158203125, - "learning_rate": 6.019757807520714e-07, - "loss": 0.0063, - "reward": 1.781355619430542, - "reward_std": 0.20801198482513428, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.5782306790351868, - "step": 1249 - }, - { - "completion_length": 170.09375, - "epoch": 0.39834289356277885, - "grad_norm": 42.81908416748047, - "kl": 0.0927734375, - "learning_rate": 6.016571064372211e-07, - "loss": 0.0037, - "reward": 1.4066224098205566, - "reward_std": 0.09949782490730286, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.42224735021591187, - "step": 1250 - }, - { - "completion_length": 120.28125, - "epoch": 0.3986615678776291, - "grad_norm": 17.177026748657227, - "kl": 0.11279296875, - "learning_rate": 6.013384321223709e-07, - "loss": 0.0045, - "reward": 1.5096888542175293, - "reward_std": 0.04415253549814224, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5096888542175293, - "rewards/pad": 0.0, - "step": 1251 - }, - { - "completion_length": 119.890625, - "epoch": 0.3989802421924793, - "grad_norm": 31.22040557861328, - "kl": 0.130859375, - "learning_rate": 6.010197578075207e-07, - "loss": 0.0052, - "reward": 1.4680066108703613, - "reward_std": 0.08115118741989136, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4680064916610718, - "step": 1252 - }, - { - "completion_length": 71.28125, - "epoch": 0.3992989165073295, - "grad_norm": 209.50657653808594, - "kl": 0.1376953125, - "learning_rate": 6.007010834926705e-07, - "loss": 0.0055, - "reward": 1.8249537944793701, - "reward_std": 0.1311221420764923, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6062036752700806, - "step": 1253 - }, - { - "completion_length": 171.140625, - "epoch": 0.39961759082217974, - "grad_norm": 31.1502685546875, - "kl": 0.0927734375, - "learning_rate": 6.003824091778202e-07, - "loss": 0.0037, - "reward": 1.5441827774047852, - "reward_std": 0.05834697559475899, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5441827774047852, - "step": 1254 - }, - { - "completion_length": 172.640625, - "epoch": 0.39993626513702996, - "grad_norm": 13.332192420959473, - "kl": 0.10595703125, - "learning_rate": 6.0006373486297e-07, - "loss": 0.0042, - "reward": 1.4143109321594238, - "reward_std": 0.08136802166700363, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.414311021566391, - "step": 1255 - }, - { - "completion_length": 95.46875, - "epoch": 0.4002549394518802, - "grad_norm": 11.769364356994629, - "kl": 0.154296875, - "learning_rate": 5.997450605481198e-07, - "loss": 0.0062, - "reward": 1.5249242782592773, - "reward_std": 0.10501822829246521, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5249243974685669, - "step": 1256 - }, - { - "completion_length": 146.171875, - "epoch": 0.4005736137667304, - "grad_norm": 40.824153900146484, - "kl": 0.1357421875, - "learning_rate": 5.994263862332696e-07, - "loss": 0.0054, - "reward": 1.3846766948699951, - "reward_std": 0.071880042552948, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38467681407928467, - "step": 1257 - }, - { - "completion_length": 97.21875, - "epoch": 0.4008922880815806, - "grad_norm": 1223.993408203125, - "kl": 0.103515625, - "learning_rate": 5.991077119184193e-07, - "loss": 0.0041, - "reward": 1.598623514175415, - "reward_std": 0.06979230791330338, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47362351417541504, - "step": 1258 - }, - { - "completion_length": 94.0625, - "epoch": 0.40121096239643084, - "grad_norm": 30.500272750854492, - "kl": 0.166015625, - "learning_rate": 5.987890376035691e-07, - "loss": 0.0066, - "reward": 1.559830665588379, - "reward_std": 0.07134728133678436, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5598306059837341, - "rewards/pad": 0.0, - "step": 1259 - }, - { - "completion_length": 95.609375, - "epoch": 0.40152963671128106, - "grad_norm": 43.06867218017578, - "kl": 0.140625, - "learning_rate": 5.984703632887189e-07, - "loss": 0.0056, - "reward": 1.3730461597442627, - "reward_std": 0.10210265219211578, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.3886711001396179, - "rewards/pad": 0.0, - "step": 1260 - }, - { - "completion_length": 98.140625, - "epoch": 0.4018483110261313, - "grad_norm": 33.724788665771484, - "kl": 0.16015625, - "learning_rate": 5.981516889738687e-07, - "loss": 0.0064, - "reward": 1.7647151947021484, - "reward_std": 0.06048569828271866, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5147151350975037, - "step": 1261 - }, - { - "completion_length": 73.9375, - "epoch": 0.4021669853409815, - "grad_norm": 37.967552185058594, - "kl": 0.09765625, - "learning_rate": 5.978330146590184e-07, - "loss": 0.0039, - "reward": 1.8287855386734009, - "reward_std": 0.0900324210524559, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4537854492664337, - "step": 1262 - }, - { - "completion_length": 147.96875, - "epoch": 0.4024856596558317, - "grad_norm": 34.182621002197266, - "kl": 0.0947265625, - "learning_rate": 5.975143403441683e-07, - "loss": 0.0038, - "reward": 1.5652196407318115, - "reward_std": 0.05024642124772072, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.565219521522522, - "step": 1263 - }, - { - "completion_length": 68.875, - "epoch": 0.40280433397068194, - "grad_norm": 82.22328186035156, - "kl": 0.255859375, - "learning_rate": 5.971956660293181e-07, - "loss": 0.0102, - "reward": 1.5273429155349731, - "reward_std": 0.09247814118862152, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40234294533729553, - "rewards/pad": 0.125, - "step": 1264 - }, - { - "completion_length": 121.5625, - "epoch": 0.40312300828553216, - "grad_norm": 46.81608200073242, - "kl": 0.1201171875, - "learning_rate": 5.968769917144678e-07, - "loss": 0.0048, - "reward": 1.4116365909576416, - "reward_std": 0.12033785134553909, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.427261620759964, - "step": 1265 - }, - { - "completion_length": 92.359375, - "epoch": 0.40344168260038243, - "grad_norm": 35.47514343261719, - "kl": 0.1318359375, - "learning_rate": 5.965583173996176e-07, - "loss": 0.0053, - "reward": 1.4215552806854248, - "reward_std": 0.036197151988744736, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42155537009239197, - "step": 1266 - }, - { - "completion_length": 95.3125, - "epoch": 0.40376035691523265, - "grad_norm": 51.23936080932617, - "kl": 0.1787109375, - "learning_rate": 5.962396430847674e-07, - "loss": 0.0072, - "reward": 1.5264898538589478, - "reward_std": 0.1344299465417862, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43273985385894775, - "rewards/pad": 0.09375, - "step": 1267 - }, - { - "completion_length": 70.84375, - "epoch": 0.4040790312300829, - "grad_norm": 33.75147247314453, - "kl": 0.158203125, - "learning_rate": 5.959209687699172e-07, - "loss": 0.0063, - "reward": 1.6157795190811157, - "reward_std": 0.054706256836652756, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6157793998718262, - "step": 1268 - }, - { - "completion_length": 71.5, - "epoch": 0.4043977055449331, - "grad_norm": 75.2858657836914, - "kl": 0.1533203125, - "learning_rate": 5.956022944550669e-07, - "loss": 0.0061, - "reward": 1.7125145196914673, - "reward_std": 0.11750952899456024, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4625145196914673, - "rewards/pad": 0.25, - "step": 1269 - }, - { - "completion_length": 146.171875, - "epoch": 0.4047163798597833, - "grad_norm": 46.43769454956055, - "kl": 0.0888671875, - "learning_rate": 5.952836201402167e-07, - "loss": 0.0035, - "reward": 1.4164249897003174, - "reward_std": 0.1052272617816925, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41642504930496216, - "rewards/pad": 0.0, - "step": 1270 - }, - { - "completion_length": 118.015625, - "epoch": 0.40503505417463354, - "grad_norm": 20.825685501098633, - "kl": 0.2333984375, - "learning_rate": 5.949649458253665e-07, - "loss": 0.0093, - "reward": 1.519181251525879, - "reward_std": 0.062050338834524155, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5191812515258789, - "step": 1271 - }, - { - "completion_length": 45.78125, - "epoch": 0.40535372848948376, - "grad_norm": 35.329437255859375, - "kl": 0.21875, - "learning_rate": 5.946462715105163e-07, - "loss": 0.0088, - "reward": 1.4968239068984985, - "reward_std": 0.129884272813797, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49682390689849854, - "rewards/pad": 0.0, - "step": 1272 - }, - { - "completion_length": 19.328125, - "epoch": 0.405672402804334, - "grad_norm": 82.19258880615234, - "kl": 0.203125, - "learning_rate": 5.94327597195666e-07, - "loss": 0.0082, - "reward": 1.5742777585983276, - "reward_std": 0.10323308408260345, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5742777585983276, - "rewards/pad": 0.0, - "step": 1273 - }, - { - "completion_length": 94.3125, - "epoch": 0.4059910771191842, - "grad_norm": 27.67733383178711, - "kl": 0.1455078125, - "learning_rate": 5.940089228808158e-07, - "loss": 0.0058, - "reward": 1.681000828742981, - "reward_std": 0.06062120571732521, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.556000828742981, - "rewards/pad": 0.125, - "step": 1274 - }, - { - "completion_length": 96.765625, - "epoch": 0.4063097514340344, - "grad_norm": 36.68671417236328, - "kl": 0.134765625, - "learning_rate": 5.936902485659655e-07, - "loss": 0.0054, - "reward": 1.4792072772979736, - "reward_std": 0.053634293377399445, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47920721769332886, - "step": 1275 - }, - { - "completion_length": 147.859375, - "epoch": 0.40662842574888464, - "grad_norm": 7.783206939697266, - "kl": 0.111328125, - "learning_rate": 5.933715742511153e-07, - "loss": 0.0045, - "reward": 1.4867475032806396, - "reward_std": 0.04772502928972244, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4867474436759949, - "step": 1276 - }, - { - "completion_length": 43.28125, - "epoch": 0.40694710006373486, - "grad_norm": 40.26476287841797, - "kl": 0.1953125, - "learning_rate": 5.93052899936265e-07, - "loss": 0.0078, - "reward": 1.5758988857269287, - "reward_std": 0.0841723382472992, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5758988857269287, - "rewards/pad": 0.0, - "step": 1277 - }, - { - "completion_length": 93.84375, - "epoch": 0.4072657743785851, - "grad_norm": 20.492387771606445, - "kl": 0.1298828125, - "learning_rate": 5.927342256214148e-07, - "loss": 0.0052, - "reward": 1.5350046157836914, - "reward_std": 0.027536258101463318, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.410004585981369, - "rewards/pad": 0.125, - "step": 1278 - }, - { - "completion_length": 94.640625, - "epoch": 0.4075844486934353, - "grad_norm": 15.450780868530273, - "kl": 0.146484375, - "learning_rate": 5.924155513065646e-07, - "loss": 0.0059, - "reward": 1.612504482269287, - "reward_std": 0.09879954159259796, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6125044822692871, - "rewards/pad": 0.0, - "step": 1279 - }, - { - "completion_length": 69.890625, - "epoch": 0.4079031230082855, - "grad_norm": 16.371076583862305, - "kl": 0.2373046875, - "learning_rate": 5.920968769917144e-07, - "loss": 0.0095, - "reward": 1.6929583549499512, - "reward_std": 0.037510331720113754, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5679583549499512, - "rewards/pad": 0.125, - "step": 1280 - }, - { - "completion_length": 95.890625, - "epoch": 0.40822179732313574, - "grad_norm": 33.37834548950195, - "kl": 0.11865234375, - "learning_rate": 5.917782026768641e-07, - "loss": 0.0048, - "reward": 1.5515503883361816, - "reward_std": 0.085669606924057, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5515503883361816, - "rewards/pad": 0.0, - "step": 1281 - }, - { - "completion_length": 45.03125, - "epoch": 0.40854047163798596, - "grad_norm": 67.09070587158203, - "kl": 0.1513671875, - "learning_rate": 5.91459528362014e-07, - "loss": 0.0061, - "reward": 1.7493417263031006, - "reward_std": 0.11784421652555466, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.624341607093811, - "rewards/pad": 0.125, - "step": 1282 - }, - { - "completion_length": 120.34375, - "epoch": 0.4088591459528362, - "grad_norm": 125.51654815673828, - "kl": 0.1533203125, - "learning_rate": 5.911408540471638e-07, - "loss": 0.0061, - "reward": 1.40920090675354, - "reward_std": 0.09400975704193115, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3779507875442505, - "step": 1283 - }, - { - "completion_length": 69.625, - "epoch": 0.4091778202676864, - "grad_norm": 33.30241394042969, - "kl": 0.1298828125, - "learning_rate": 5.908221797323136e-07, - "loss": 0.0052, - "reward": 1.46676766872406, - "reward_std": 0.05991966277360916, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34176772832870483, - "rewards/pad": 0.125, - "step": 1284 - }, - { - "completion_length": 146.0625, - "epoch": 0.4094964945825367, - "grad_norm": 25.540910720825195, - "kl": 0.08740234375, - "learning_rate": 5.905035054174633e-07, - "loss": 0.0035, - "reward": 1.4206722974777222, - "reward_std": 0.1471319943666458, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.38942232728004456, - "rewards/pad": 0.046875, - "step": 1285 - }, - { - "completion_length": 155.65625, - "epoch": 0.4098151688973869, - "grad_norm": 117.83950805664062, - "kl": 0.08544921875, - "learning_rate": 5.901848311026131e-07, - "loss": 0.0034, - "reward": 1.3239874839782715, - "reward_std": 0.13704180717468262, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.33961254358291626, - "step": 1286 - }, - { - "completion_length": 95.28125, - "epoch": 0.4101338432122371, - "grad_norm": 34.79705047607422, - "kl": 0.1376953125, - "learning_rate": 5.898661567877629e-07, - "loss": 0.0055, - "reward": 1.5317813158035278, - "reward_std": 0.11902156472206116, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4692813456058502, - "rewards/pad": 0.0625, - "step": 1287 - }, - { - "completion_length": 147.984375, - "epoch": 0.41045251752708733, - "grad_norm": 44.45170211791992, - "kl": 0.07470703125, - "learning_rate": 5.895474824729127e-07, - "loss": 0.003, - "reward": 1.4936556816101074, - "reward_std": 0.05926290154457092, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49365559220314026, - "step": 1288 - }, - { - "completion_length": 120.65625, - "epoch": 0.41077119184193756, - "grad_norm": 21.58782196044922, - "kl": 0.10205078125, - "learning_rate": 5.892288081580624e-07, - "loss": 0.0041, - "reward": 1.623151183128357, - "reward_std": 0.07410305738449097, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49815118312835693, - "step": 1289 - }, - { - "completion_length": 123.046875, - "epoch": 0.4110898661567878, - "grad_norm": 23.982749938964844, - "kl": 0.09716796875, - "learning_rate": 5.889101338432122e-07, - "loss": 0.0039, - "reward": 1.4851818084716797, - "reward_std": 0.11753320693969727, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45393168926239014, - "step": 1290 - }, - { - "completion_length": 120.84375, - "epoch": 0.411408540471638, - "grad_norm": 12.221750259399414, - "kl": 0.1318359375, - "learning_rate": 5.88591459528362e-07, - "loss": 0.0053, - "reward": 1.3529877662658691, - "reward_std": 0.028966424986720085, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35298776626586914, - "rewards/pad": 0.0, - "step": 1291 - }, - { - "completion_length": 95.75, - "epoch": 0.4117272147864882, - "grad_norm": 87.19110870361328, - "kl": 0.15625, - "learning_rate": 5.882727852135117e-07, - "loss": 0.0062, - "reward": 1.5021991729736328, - "reward_std": 0.04907491058111191, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3771991431713104, - "rewards/pad": 0.125, - "step": 1292 - }, - { - "completion_length": 47.515625, - "epoch": 0.41204588910133844, - "grad_norm": 43.21669387817383, - "kl": 0.150390625, - "learning_rate": 5.879541108986615e-07, - "loss": 0.006, - "reward": 1.508324146270752, - "reward_std": 0.05069003999233246, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38332417607307434, - "step": 1293 - }, - { - "completion_length": 93.15625, - "epoch": 0.41236456341618866, - "grad_norm": 60.25608444213867, - "kl": 0.12255859375, - "learning_rate": 5.876354365838113e-07, - "loss": 0.0049, - "reward": 1.5370088815689087, - "reward_std": 0.0394783578813076, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5370088219642639, - "step": 1294 - }, - { - "completion_length": 94.59375, - "epoch": 0.4126832377310389, - "grad_norm": 23.95777130126953, - "kl": 0.11376953125, - "learning_rate": 5.873167622689611e-07, - "loss": 0.0046, - "reward": 1.524670124053955, - "reward_std": 0.08892560005187988, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3996701240539551, - "rewards/pad": 0.125, - "step": 1295 - }, - { - "completion_length": 119.328125, - "epoch": 0.4130019120458891, - "grad_norm": 22.234315872192383, - "kl": 0.0810546875, - "learning_rate": 5.869980879541108e-07, - "loss": 0.0033, - "reward": 1.7183890342712402, - "reward_std": 0.06814469397068024, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48401397466659546, - "rewards/pad": 0.234375, - "step": 1296 - }, - { - "completion_length": 45.359375, - "epoch": 0.4133205863607393, - "grad_norm": 27.19536018371582, - "kl": 0.330078125, - "learning_rate": 5.866794136392606e-07, - "loss": 0.0132, - "reward": 1.650060772895813, - "reward_std": 0.1549336016178131, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47818565368652344, - "rewards/pad": 0.171875, - "step": 1297 - }, - { - "completion_length": 120.390625, - "epoch": 0.41363926067558954, - "grad_norm": 662.822998046875, - "kl": 0.14453125, - "learning_rate": 5.863607393244104e-07, - "loss": 0.0058, - "reward": 1.4083480834960938, - "reward_std": 0.07196545600891113, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.408348023891449, - "rewards/pad": 0.0, - "step": 1298 - }, - { - "completion_length": 94.171875, - "epoch": 0.41395793499043976, - "grad_norm": 29.26519775390625, - "kl": 0.1376953125, - "learning_rate": 5.860420650095602e-07, - "loss": 0.0055, - "reward": 1.5837488174438477, - "reward_std": 0.07130052894353867, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45874881744384766, - "step": 1299 - }, - { - "completion_length": 70.515625, - "epoch": 0.41427660930529, - "grad_norm": 48.05635070800781, - "kl": 0.1982421875, - "learning_rate": 5.8572339069471e-07, - "loss": 0.0079, - "reward": 1.6558793783187866, - "reward_std": 0.27641844749450684, - "rewards/pad": 0.171875, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.49962949752807617, - "step": 1300 - }, - { - "completion_length": 68.953125, - "epoch": 0.4145952836201402, - "grad_norm": 28.768461227416992, - "kl": 0.1279296875, - "learning_rate": 5.854047163798598e-07, - "loss": 0.0051, - "reward": 1.7155733108520508, - "reward_std": 0.05253777652978897, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5905733704566956, - "rewards/pad": 0.125, - "step": 1301 - }, - { - "completion_length": 69.15625, - "epoch": 0.4149139579349904, - "grad_norm": 130.46864318847656, - "kl": 0.11962890625, - "learning_rate": 5.850860420650096e-07, - "loss": 0.0048, - "reward": 1.496382474899292, - "reward_std": 0.11643992364406586, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.371382474899292, - "rewards/pad": 0.125, - "step": 1302 - }, - { - "completion_length": 94.84375, - "epoch": 0.41523263224984064, - "grad_norm": 20.442569732666016, - "kl": 0.12060546875, - "learning_rate": 5.847673677501594e-07, - "loss": 0.0048, - "reward": 1.4754797220230103, - "reward_std": 0.07423287630081177, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47547969222068787, - "rewards/pad": 0.0, - "step": 1303 - }, - { - "completion_length": 146.1875, - "epoch": 0.41555130656469086, - "grad_norm": 45.905033111572266, - "kl": 0.142578125, - "learning_rate": 5.844486934353091e-07, - "loss": 0.0057, - "reward": 1.283578872680664, - "reward_std": 0.0793718472123146, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.26795390248298645, - "step": 1304 - }, - { - "completion_length": 92.78125, - "epoch": 0.41586998087954113, - "grad_norm": 50.72040939331055, - "kl": 0.1884765625, - "learning_rate": 5.841300191204589e-07, - "loss": 0.0075, - "reward": 1.4547278881072998, - "reward_std": 0.069220632314682, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4547279179096222, - "step": 1305 - }, - { - "completion_length": 69.046875, - "epoch": 0.41618865519439135, - "grad_norm": 38.16029357910156, - "kl": 0.142578125, - "learning_rate": 5.838113448056087e-07, - "loss": 0.0057, - "reward": 1.5830930471420288, - "reward_std": 0.05439729988574982, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5830931663513184, - "step": 1306 - }, - { - "completion_length": 170.390625, - "epoch": 0.4165073295092416, - "grad_norm": 27.359622955322266, - "kl": 0.07861328125, - "learning_rate": 5.834926704907585e-07, - "loss": 0.0032, - "reward": 1.576507329940796, - "reward_std": 0.051485076546669006, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5765073299407959, - "step": 1307 - }, - { - "completion_length": 144.578125, - "epoch": 0.4168260038240918, - "grad_norm": 18.980205535888672, - "kl": 0.1669921875, - "learning_rate": 5.831739961759082e-07, - "loss": 0.0067, - "reward": 1.5791575908660889, - "reward_std": 0.04680941626429558, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45415759086608887, - "rewards/pad": 0.125, - "step": 1308 - }, - { - "completion_length": 96.25, - "epoch": 0.417144678138942, - "grad_norm": 38.10404968261719, - "kl": 0.224609375, - "learning_rate": 5.82855321861058e-07, - "loss": 0.009, - "reward": 1.595242977142334, - "reward_std": 0.07397254556417465, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.34524303674697876, - "step": 1309 - }, - { - "completion_length": 70.515625, - "epoch": 0.41746335245379224, - "grad_norm": 18.30024528503418, - "kl": 0.1884765625, - "learning_rate": 5.825366475462078e-07, - "loss": 0.0075, - "reward": 1.6927845478057861, - "reward_std": 0.06641843169927597, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44278454780578613, - "step": 1310 - }, - { - "completion_length": 147.203125, - "epoch": 0.41778202676864246, - "grad_norm": 18.82292366027832, - "kl": 0.08935546875, - "learning_rate": 5.822179732313576e-07, - "loss": 0.0036, - "reward": 1.5743370056152344, - "reward_std": 0.05167382210493088, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4493370056152344, - "step": 1311 - }, - { - "completion_length": 71.265625, - "epoch": 0.4181007010834927, - "grad_norm": 36.481136322021484, - "kl": 0.1396484375, - "learning_rate": 5.818992989165073e-07, - "loss": 0.0056, - "reward": 1.5864616632461548, - "reward_std": 0.06692761927843094, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4614616930484772, - "rewards/pad": 0.125, - "step": 1312 - }, - { - "completion_length": 44.734375, - "epoch": 0.4184193753983429, - "grad_norm": 26.760005950927734, - "kl": 0.1552734375, - "learning_rate": 5.815806246016571e-07, - "loss": 0.0062, - "reward": 1.8370531797409058, - "reward_std": 0.05633014068007469, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5870531797409058, - "rewards/pad": 0.25, - "step": 1313 - }, - { - "completion_length": 174.125, - "epoch": 0.4187380497131931, - "grad_norm": 17.700157165527344, - "kl": 0.064453125, - "learning_rate": 5.812619502868069e-07, - "loss": 0.0026, - "reward": 1.5748400688171387, - "reward_std": 0.11043757945299149, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49671506881713867, - "step": 1314 - }, - { - "completion_length": 122.75, - "epoch": 0.41905672402804334, - "grad_norm": 112.10523986816406, - "kl": 0.1201171875, - "learning_rate": 5.809432759719566e-07, - "loss": 0.0048, - "reward": 1.4939663410186768, - "reward_std": 0.07092640548944473, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.493966281414032, - "step": 1315 - }, - { - "completion_length": 93.890625, - "epoch": 0.41937539834289356, - "grad_norm": 68.99323272705078, - "kl": 0.1396484375, - "learning_rate": 5.806246016571063e-07, - "loss": 0.0056, - "reward": 1.399767518043518, - "reward_std": 0.05090294033288956, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.39976757764816284, - "rewards/pad": 0.0, - "step": 1316 - }, - { - "completion_length": 97.109375, - "epoch": 0.4196940726577438, - "grad_norm": 36.900672912597656, - "kl": 0.1259765625, - "learning_rate": 5.803059273422561e-07, - "loss": 0.005, - "reward": 1.5363261699676514, - "reward_std": 0.061481181532144547, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.28632619976997375, - "rewards/pad": 0.25, - "step": 1317 - }, - { - "completion_length": 95.265625, - "epoch": 0.420012746972594, - "grad_norm": 69.90725708007812, - "kl": 0.17578125, - "learning_rate": 5.799872530274059e-07, - "loss": 0.007, - "reward": 1.4442648887634277, - "reward_std": 0.09392991662025452, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44426485896110535, - "rewards/pad": 0.0, - "step": 1318 - }, - { - "completion_length": 97.703125, - "epoch": 0.4203314212874442, - "grad_norm": 18.30790138244629, - "kl": 0.2216796875, - "learning_rate": 5.796685787125558e-07, - "loss": 0.0089, - "reward": 1.4277915954589844, - "reward_std": 0.13965944945812225, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.396541565656662, - "rewards/pad": 0.03125, - "step": 1319 - }, - { - "completion_length": 121.375, - "epoch": 0.42065009560229444, - "grad_norm": 14.09908676147461, - "kl": 0.13671875, - "learning_rate": 5.793499043977055e-07, - "loss": 0.0055, - "reward": 1.3608624935150146, - "reward_std": 0.054107192903757095, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3608624339103699, - "step": 1320 - }, - { - "completion_length": 44.21875, - "epoch": 0.42096876991714466, - "grad_norm": 18.591712951660156, - "kl": 0.23046875, - "learning_rate": 5.790312300828553e-07, - "loss": 0.0092, - "reward": 1.8102295398712158, - "reward_std": 0.15906178951263428, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.7008545994758606, - "rewards/pad": 0.125, - "step": 1321 - }, - { - "completion_length": 45.78125, - "epoch": 0.4212874442319949, - "grad_norm": 68.0612564086914, - "kl": 0.1455078125, - "learning_rate": 5.787125557680051e-07, - "loss": 0.0058, - "reward": 1.786529541015625, - "reward_std": 0.1605585217475891, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5521544218063354, - "step": 1322 - }, - { - "completion_length": 98.8125, - "epoch": 0.4216061185468451, - "grad_norm": 37.890174865722656, - "kl": 0.09326171875, - "learning_rate": 5.783938814531548e-07, - "loss": 0.0037, - "reward": 1.6999619007110596, - "reward_std": 0.08607161045074463, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.46558693051338196, - "rewards/pad": 0.25, - "step": 1323 - }, - { - "completion_length": 145.75, - "epoch": 0.4219247928616954, - "grad_norm": 14.982475280761719, - "kl": 0.11962890625, - "learning_rate": 5.780752071383046e-07, - "loss": 0.0048, - "reward": 1.6025621891021729, - "reward_std": 0.12264655530452728, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.6181871294975281, - "rewards/pad": 0.0, - "step": 1324 - }, - { - "completion_length": 69.53125, - "epoch": 0.4222434671765456, - "grad_norm": 10.594503402709961, - "kl": 0.146484375, - "learning_rate": 5.777565328234544e-07, - "loss": 0.0058, - "reward": 1.4348278045654297, - "reward_std": 0.03396230563521385, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4348278343677521, - "rewards/pad": 0.0, - "step": 1325 - }, - { - "completion_length": 45.4375, - "epoch": 0.4225621414913958, - "grad_norm": 28.847732543945312, - "kl": 0.162109375, - "learning_rate": 5.774378585086042e-07, - "loss": 0.0065, - "reward": 1.710758924484253, - "reward_std": 0.166127011179924, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.49200892448425293, - "rewards/pad": 0.25, - "step": 1326 - }, - { - "completion_length": 46.65625, - "epoch": 0.42288081580624604, - "grad_norm": 62.12076950073242, - "kl": 0.1669921875, - "learning_rate": 5.771191841937539e-07, - "loss": 0.0067, - "reward": 1.4910268783569336, - "reward_std": 0.08188996464014053, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49102693796157837, - "rewards/pad": 0.0, - "step": 1327 - }, - { - "completion_length": 121.109375, - "epoch": 0.42319949012109626, - "grad_norm": 27.380247116088867, - "kl": 0.13671875, - "learning_rate": 5.768005098789037e-07, - "loss": 0.0055, - "reward": 1.342002511024475, - "reward_std": 0.07291238009929657, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3420025706291199, - "rewards/pad": 0.0, - "step": 1328 - }, - { - "completion_length": 151.125, - "epoch": 0.4235181644359465, - "grad_norm": 22.531246185302734, - "kl": 0.10888671875, - "learning_rate": 5.764818355640535e-07, - "loss": 0.0044, - "reward": 1.7052443027496338, - "reward_std": 0.08555316925048828, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5958691835403442, - "step": 1329 - }, - { - "completion_length": 70.671875, - "epoch": 0.4238368387507967, - "grad_norm": 45.3145751953125, - "kl": 0.23828125, - "learning_rate": 5.761631612492033e-07, - "loss": 0.0095, - "reward": 1.6560629606246948, - "reward_std": 0.1088961511850357, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5310629606246948, - "rewards/pad": 0.125, - "step": 1330 - }, - { - "completion_length": 120.609375, - "epoch": 0.4241555130656469, - "grad_norm": 27.0241641998291, - "kl": 0.1533203125, - "learning_rate": 5.75844486934353e-07, - "loss": 0.0061, - "reward": 1.6354317665100098, - "reward_std": 0.0771084874868393, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6354317665100098, - "rewards/pad": 0.0, - "step": 1331 - }, - { - "completion_length": 95.5, - "epoch": 0.42447418738049714, - "grad_norm": 106.8154525756836, - "kl": 0.16015625, - "learning_rate": 5.755258126195028e-07, - "loss": 0.0064, - "reward": 1.7053183317184448, - "reward_std": 0.10672757029533386, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5803183317184448, - "rewards/pad": 0.125, - "step": 1332 - }, - { - "completion_length": 71.421875, - "epoch": 0.42479286169534736, - "grad_norm": 30.220779418945312, - "kl": 0.1484375, - "learning_rate": 5.752071383046526e-07, - "loss": 0.0059, - "reward": 1.4662553071975708, - "reward_std": 0.12006659805774689, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4662552773952484, - "rewards/pad": 0.0, - "step": 1333 - }, - { - "completion_length": 96.25, - "epoch": 0.4251115360101976, - "grad_norm": 52.073760986328125, - "kl": 0.119140625, - "learning_rate": 5.748884639898024e-07, - "loss": 0.0048, - "reward": 1.6597801446914673, - "reward_std": 0.04236849769949913, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6597801446914673, - "step": 1334 - }, - { - "completion_length": 126.703125, - "epoch": 0.4254302103250478, - "grad_norm": 28.545778274536133, - "kl": 0.10888671875, - "learning_rate": 5.745697896749521e-07, - "loss": 0.0043, - "reward": 1.6625990867614746, - "reward_std": 0.1483616977930069, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5688490271568298, - "step": 1335 - }, - { - "completion_length": 71.21875, - "epoch": 0.425748884639898, - "grad_norm": 12.07433795928955, - "kl": 0.1455078125, - "learning_rate": 5.742511153601019e-07, - "loss": 0.0058, - "reward": 1.582622766494751, - "reward_std": 0.05007660388946533, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5826227068901062, - "rewards/pad": 0.0, - "step": 1336 - }, - { - "completion_length": 94.796875, - "epoch": 0.42606755895474824, - "grad_norm": 36.460994720458984, - "kl": 0.12353515625, - "learning_rate": 5.739324410452517e-07, - "loss": 0.0049, - "reward": 1.3425620794296265, - "reward_std": 0.049371980130672455, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.34256210923194885, - "step": 1337 - }, - { - "completion_length": 68.984375, - "epoch": 0.42638623326959846, - "grad_norm": 31.095317840576172, - "kl": 0.2001953125, - "learning_rate": 5.736137667304016e-07, - "loss": 0.008, - "reward": 1.4534603357315063, - "reward_std": 0.055513009428977966, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45346033573150635, - "rewards/pad": 0.0, - "step": 1338 - }, - { - "completion_length": 130.03125, - "epoch": 0.4267049075844487, - "grad_norm": 15.608606338500977, - "kl": 0.11083984375, - "learning_rate": 5.732950924155513e-07, - "loss": 0.0044, - "reward": 1.6918649673461914, - "reward_std": 0.08464628458023071, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44186487793922424, - "step": 1339 - }, - { - "completion_length": 22.265625, - "epoch": 0.4270235818992989, - "grad_norm": 46.421653747558594, - "kl": 0.1572265625, - "learning_rate": 5.729764181007011e-07, - "loss": 0.0063, - "reward": 1.7511345148086548, - "reward_std": 0.1293444037437439, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5167595148086548, - "rewards/pad": 0.25, - "step": 1340 - }, - { - "completion_length": 70.453125, - "epoch": 0.4273422562141491, - "grad_norm": 212.90789794921875, - "kl": 0.1650390625, - "learning_rate": 5.726577437858509e-07, - "loss": 0.0066, - "reward": 1.647883653640747, - "reward_std": 0.08742669224739075, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5228837728500366, - "rewards/pad": 0.125, - "step": 1341 - }, - { - "completion_length": 72.703125, - "epoch": 0.42766093052899934, - "grad_norm": 18.851896286010742, - "kl": 0.22265625, - "learning_rate": 5.723390694710007e-07, - "loss": 0.0089, - "reward": 1.5048867464065552, - "reward_std": 0.09711531549692154, - "rewards/answer_reward": 0.03125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4736367166042328, - "step": 1342 - }, - { - "completion_length": 70.75, - "epoch": 0.42797960484384956, - "grad_norm": 170.10140991210938, - "kl": 0.1767578125, - "learning_rate": 5.720203951561504e-07, - "loss": 0.0071, - "reward": 1.5883522033691406, - "reward_std": 0.1104811504483223, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5727271437644958, - "step": 1343 - }, - { - "completion_length": 70.359375, - "epoch": 0.42829827915869984, - "grad_norm": 24.550609588623047, - "kl": 0.2109375, - "learning_rate": 5.717017208413002e-07, - "loss": 0.0084, - "reward": 1.7100211381912231, - "reward_std": 0.14169202744960785, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5850211381912231, - "step": 1344 - }, - { - "completion_length": 95.859375, - "epoch": 0.42861695347355006, - "grad_norm": 28.526103973388672, - "kl": 0.251953125, - "learning_rate": 5.7138304652645e-07, - "loss": 0.0101, - "reward": 1.5001425743103027, - "reward_std": 0.08676139265298843, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.500142514705658, - "step": 1345 - }, - { - "completion_length": 70.109375, - "epoch": 0.4289356277884003, - "grad_norm": 47.61008834838867, - "kl": 0.162109375, - "learning_rate": 5.710643722115998e-07, - "loss": 0.0065, - "reward": 1.60111665725708, - "reward_std": 0.06631819903850555, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6011165380477905, - "rewards/pad": 0.0, - "step": 1346 - }, - { - "completion_length": 98.625, - "epoch": 0.4292543021032505, - "grad_norm": 184.50650024414062, - "kl": 0.11572265625, - "learning_rate": 5.707456978967495e-07, - "loss": 0.0046, - "reward": 1.3712372779846191, - "reward_std": 0.09562046080827713, - "rewards/answer_reward": 0.078125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.2931123971939087, - "step": 1347 - }, - { - "completion_length": 123.265625, - "epoch": 0.4295729764181007, - "grad_norm": 40.26362228393555, - "kl": 0.1533203125, - "learning_rate": 5.704270235818993e-07, - "loss": 0.0061, - "reward": 1.5752763748168945, - "reward_std": 0.11223172396421432, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4815264046192169, - "step": 1348 - }, - { - "completion_length": 149.0, - "epoch": 0.42989165073295094, - "grad_norm": 45.832801818847656, - "kl": 0.109375, - "learning_rate": 5.701083492670491e-07, - "loss": 0.0044, - "reward": 1.524371862411499, - "reward_std": 0.06820206344127655, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5243719220161438, - "step": 1349 - }, - { - "completion_length": 96.921875, - "epoch": 0.43021032504780116, - "grad_norm": 24.57037353515625, - "kl": 0.1318359375, - "learning_rate": 5.697896749521989e-07, - "loss": 0.0053, - "reward": 1.579499363899231, - "reward_std": 0.05558115616440773, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45449933409690857, - "step": 1350 - }, - { - "completion_length": 123.171875, - "epoch": 0.4305289993626514, - "grad_norm": 26.439207077026367, - "kl": 0.1162109375, - "learning_rate": 5.694710006373486e-07, - "loss": 0.0047, - "reward": 1.520458459854126, - "reward_std": 0.04056993126869202, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.520458459854126, - "rewards/pad": 0.0, - "step": 1351 - }, - { - "completion_length": 95.734375, - "epoch": 0.4308476736775016, - "grad_norm": 150.6898651123047, - "kl": 0.1298828125, - "learning_rate": 5.691523263224984e-07, - "loss": 0.0052, - "reward": 1.5843548774719238, - "reward_std": 0.11656299233436584, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47497978806495667, - "rewards/pad": 0.109375, - "step": 1352 - }, - { - "completion_length": 44.890625, - "epoch": 0.4311663479923518, - "grad_norm": 124.51334381103516, - "kl": 0.1337890625, - "learning_rate": 5.688336520076482e-07, - "loss": 0.0054, - "reward": 1.6478725671768188, - "reward_std": 0.13374654948711395, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6478725671768188, - "rewards/pad": 0.0, - "step": 1353 - }, - { - "completion_length": 71.90625, - "epoch": 0.43148502230720204, - "grad_norm": 27.88372230529785, - "kl": 0.1025390625, - "learning_rate": 5.685149776927978e-07, - "loss": 0.0041, - "reward": 1.6494276523590088, - "reward_std": 0.11932900547981262, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6181777715682983, - "rewards/pad": 0.03125, - "step": 1354 - }, - { - "completion_length": 127.421875, - "epoch": 0.43180369662205226, - "grad_norm": 13.997013092041016, - "kl": 0.12451171875, - "learning_rate": 5.681963033779476e-07, - "loss": 0.005, - "reward": 1.6400054693222046, - "reward_std": 0.05574731528759003, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5150054693222046, - "step": 1355 - }, - { - "completion_length": 69.90625, - "epoch": 0.4321223709369025, - "grad_norm": 59.1829719543457, - "kl": 0.1337890625, - "learning_rate": 5.678776290630974e-07, - "loss": 0.0053, - "reward": 1.4801830053329468, - "reward_std": 0.08168666064739227, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48018306493759155, - "rewards/pad": 0.0, - "step": 1356 - }, - { - "completion_length": 96.53125, - "epoch": 0.4324410452517527, - "grad_norm": 17.930870056152344, - "kl": 0.1474609375, - "learning_rate": 5.675589547482473e-07, - "loss": 0.0059, - "reward": 1.7791249752044678, - "reward_std": 0.09777411073446274, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5291250348091125, - "rewards/pad": 0.25, - "step": 1357 - }, - { - "completion_length": 72.84375, - "epoch": 0.4327597195666029, - "grad_norm": 15.228046417236328, - "kl": 0.1845703125, - "learning_rate": 5.67240280433397e-07, - "loss": 0.0074, - "reward": 1.8047544956207275, - "reward_std": 0.0883319303393364, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5547544956207275, - "step": 1358 - }, - { - "completion_length": 96.8125, - "epoch": 0.43307839388145314, - "grad_norm": 47.425968170166016, - "kl": 0.09423828125, - "learning_rate": 5.669216061185468e-07, - "loss": 0.0038, - "reward": 1.6819376945495605, - "reward_std": 0.07194778323173523, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5569379329681396, - "step": 1359 - }, - { - "completion_length": 155.671875, - "epoch": 0.43339706819630336, - "grad_norm": 27.573013305664062, - "kl": 0.11376953125, - "learning_rate": 5.666029318036966e-07, - "loss": 0.0045, - "reward": 1.5436911582946777, - "reward_std": 0.06996867060661316, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4186912178993225, - "step": 1360 - }, - { - "completion_length": 69.28125, - "epoch": 0.4337157425111536, - "grad_norm": 34.57221984863281, - "kl": 0.142578125, - "learning_rate": 5.662842574888464e-07, - "loss": 0.0057, - "reward": 1.4558401107788086, - "reward_std": 0.08932884782552719, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4558401107788086, - "rewards/pad": 0.0, - "step": 1361 - }, - { - "completion_length": 97.265625, - "epoch": 0.4340344168260038, - "grad_norm": 34.02722930908203, - "kl": 0.1416015625, - "learning_rate": 5.659655831739961e-07, - "loss": 0.0056, - "reward": 1.4915645122528076, - "reward_std": 0.041257619857788086, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4915645718574524, - "rewards/pad": 0.0, - "step": 1362 - }, - { - "completion_length": 95.59375, - "epoch": 0.434353091140854, - "grad_norm": 75.93492126464844, - "kl": 0.1474609375, - "learning_rate": 5.656469088591459e-07, - "loss": 0.0059, - "reward": 1.7674086093902588, - "reward_std": 0.10432854294776917, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6424086093902588, - "rewards/pad": 0.125, - "step": 1363 - }, - { - "completion_length": 71.265625, - "epoch": 0.4346717654557043, - "grad_norm": 31.960256576538086, - "kl": 0.166015625, - "learning_rate": 5.653282345442957e-07, - "loss": 0.0066, - "reward": 1.3737208843231201, - "reward_std": 0.054402053356170654, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3737209439277649, - "rewards/pad": 0.0, - "step": 1364 - }, - { - "completion_length": 45.265625, - "epoch": 0.4349904397705545, - "grad_norm": 22.74249267578125, - "kl": 0.205078125, - "learning_rate": 5.650095602294455e-07, - "loss": 0.0082, - "reward": 1.6794965267181396, - "reward_std": 0.0682016983628273, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5544965267181396, - "rewards/pad": 0.125, - "step": 1365 - }, - { - "completion_length": 97.828125, - "epoch": 0.43530911408540474, - "grad_norm": 39.36299514770508, - "kl": 0.1474609375, - "learning_rate": 5.646908859145952e-07, - "loss": 0.0059, - "reward": 1.3101704120635986, - "reward_std": 0.04966102913022041, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.31017038226127625, - "rewards/pad": 0.0, - "step": 1366 - }, - { - "completion_length": 74.1875, - "epoch": 0.43562778840025496, - "grad_norm": 39.882850646972656, - "kl": 0.12890625, - "learning_rate": 5.64372211599745e-07, - "loss": 0.0052, - "reward": 1.8632087707519531, - "reward_std": 0.07874581217765808, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6132087111473083, - "rewards/pad": 0.25, - "step": 1367 - }, - { - "completion_length": 97.734375, - "epoch": 0.4359464627151052, - "grad_norm": 41.4481086730957, - "kl": 0.10986328125, - "learning_rate": 5.640535372848948e-07, - "loss": 0.0044, - "reward": 1.6479605436325073, - "reward_std": 0.108167365193367, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6479606032371521, - "rewards/pad": 0.0, - "step": 1368 - }, - { - "completion_length": 124.09375, - "epoch": 0.4362651370299554, - "grad_norm": 43.37957763671875, - "kl": 0.2275390625, - "learning_rate": 5.637348629700446e-07, - "loss": 0.0091, - "reward": 1.5912022590637207, - "reward_std": 0.07037024199962616, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4662023186683655, - "step": 1369 - }, - { - "completion_length": 97.671875, - "epoch": 0.4365838113448056, - "grad_norm": 23.112123489379883, - "kl": 0.224609375, - "learning_rate": 5.634161886551943e-07, - "loss": 0.009, - "reward": 1.5291333198547363, - "reward_std": 0.0721755251288414, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5291333794593811, - "rewards/pad": 0.0, - "step": 1370 - }, - { - "completion_length": 72.109375, - "epoch": 0.43690248565965584, - "grad_norm": 44.03460693359375, - "kl": 0.10595703125, - "learning_rate": 5.630975143403441e-07, - "loss": 0.0042, - "reward": 1.6287412643432617, - "reward_std": 0.04039338603615761, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3787413537502289, - "step": 1371 - }, - { - "completion_length": 149.09375, - "epoch": 0.43722115997450606, - "grad_norm": 11.578285217285156, - "kl": 0.10302734375, - "learning_rate": 5.627788400254939e-07, - "loss": 0.0041, - "reward": 1.7141399383544922, - "reward_std": 0.05874771997332573, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.589139997959137, - "step": 1372 - }, - { - "completion_length": 20.15625, - "epoch": 0.4375398342893563, - "grad_norm": 55.78146743774414, - "kl": 0.330078125, - "learning_rate": 5.624601657106437e-07, - "loss": 0.0132, - "reward": 1.6458699703216553, - "reward_std": 0.1880033016204834, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.41149500012397766, - "rewards/pad": 0.25, - "step": 1373 - }, - { - "completion_length": 44.75, - "epoch": 0.4378585086042065, - "grad_norm": 41.31639099121094, - "kl": 0.2255859375, - "learning_rate": 5.621414913957934e-07, - "loss": 0.009, - "reward": 1.5588467121124268, - "reward_std": 0.08944406360387802, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.558846652507782, - "rewards/pad": 0.0, - "step": 1374 - }, - { - "completion_length": 121.328125, - "epoch": 0.4381771829190567, - "grad_norm": 23.76357650756836, - "kl": 0.1474609375, - "learning_rate": 5.618228170809432e-07, - "loss": 0.0059, - "reward": 1.5019047260284424, - "reward_std": 0.06296569854021072, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5019047260284424, - "step": 1375 - }, - { - "completion_length": 74.5625, - "epoch": 0.43849585723390694, - "grad_norm": 27.09462547302246, - "kl": 0.0947265625, - "learning_rate": 5.615041427660931e-07, - "loss": 0.0038, - "reward": 1.8667649030685425, - "reward_std": 0.054511696100234985, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4917648136615753, - "step": 1376 - }, - { - "completion_length": 98.59375, - "epoch": 0.43881453154875716, - "grad_norm": 49.495826721191406, - "kl": 0.2470703125, - "learning_rate": 5.611854684512429e-07, - "loss": 0.0099, - "reward": 1.688317060470581, - "reward_std": 0.06171800568699837, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4383169114589691, - "step": 1377 - }, - { - "completion_length": 22.96875, - "epoch": 0.4391332058636074, - "grad_norm": 264.8526306152344, - "kl": 0.158203125, - "learning_rate": 5.608667941363926e-07, - "loss": 0.0063, - "reward": 1.7475816011428833, - "reward_std": 0.17028731107711792, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4819566011428833, - "rewards/pad": 0.28125, - "step": 1378 - }, - { - "completion_length": 176.09375, - "epoch": 0.4394518801784576, - "grad_norm": 17.579050064086914, - "kl": 0.056640625, - "learning_rate": 5.605481198215424e-07, - "loss": 0.0023, - "reward": 1.4881428480148315, - "reward_std": 0.0604608952999115, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36314287781715393, - "step": 1379 - }, - { - "completion_length": 69.96875, - "epoch": 0.4397705544933078, - "grad_norm": 35.04433822631836, - "kl": 0.0947265625, - "learning_rate": 5.602294455066922e-07, - "loss": 0.0038, - "reward": 1.7537932395935059, - "reward_std": 0.10242816805839539, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6287932991981506, - "rewards/pad": 0.125, - "step": 1380 - }, - { - "completion_length": 70.015625, - "epoch": 0.44008922880815804, - "grad_norm": 39.497703552246094, - "kl": 0.12109375, - "learning_rate": 5.59910771191842e-07, - "loss": 0.0048, - "reward": 1.5401318073272705, - "reward_std": 0.0786762535572052, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41513165831565857, - "step": 1381 - }, - { - "completion_length": 120.921875, - "epoch": 0.44040790312300826, - "grad_norm": 26.43705940246582, - "kl": 0.1396484375, - "learning_rate": 5.595920968769917e-07, - "loss": 0.0056, - "reward": 1.4561233520507812, - "reward_std": 0.054390233010053635, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45612335205078125, - "rewards/pad": 0.0, - "step": 1382 - }, - { - "completion_length": 124.78125, - "epoch": 0.44072657743785854, - "grad_norm": 18.053184509277344, - "kl": 0.18359375, - "learning_rate": 5.592734225621415e-07, - "loss": 0.0073, - "reward": 1.66341233253479, - "reward_std": 0.0625927671790123, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5384123921394348, - "step": 1383 - }, - { - "completion_length": 70.015625, - "epoch": 0.44104525175270876, - "grad_norm": 1104.785888671875, - "kl": 0.138671875, - "learning_rate": 5.589547482472913e-07, - "loss": 0.0056, - "reward": 1.4395288228988647, - "reward_std": 0.048976339399814606, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43952885270118713, - "step": 1384 - }, - { - "completion_length": 146.375, - "epoch": 0.441363926067559, - "grad_norm": 79.37278747558594, - "kl": 0.0869140625, - "learning_rate": 5.586360739324411e-07, - "loss": 0.0035, - "reward": 1.4432659149169922, - "reward_std": 0.04890685901045799, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44326597452163696, - "step": 1385 - }, - { - "completion_length": 43.015625, - "epoch": 0.4416826003824092, - "grad_norm": 27.281627655029297, - "kl": 0.1533203125, - "learning_rate": 5.583173996175908e-07, - "loss": 0.0061, - "reward": 1.5859811305999756, - "reward_std": 0.07883980870246887, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5859811305999756, - "step": 1386 - }, - { - "completion_length": 44.65625, - "epoch": 0.4420012746972594, - "grad_norm": 37.530025482177734, - "kl": 0.1708984375, - "learning_rate": 5.579987253027406e-07, - "loss": 0.0068, - "reward": 1.510833740234375, - "reward_std": 0.09606379270553589, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.38583362102508545, - "rewards/pad": 0.125, - "step": 1387 - }, - { - "completion_length": 145.34375, - "epoch": 0.44231994901210964, - "grad_norm": 12.873120307922363, - "kl": 0.07275390625, - "learning_rate": 5.576800509878904e-07, - "loss": 0.0029, - "reward": 1.631828784942627, - "reward_std": 0.04249340295791626, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.631828784942627, - "rewards/pad": 0.0, - "step": 1388 - }, - { - "completion_length": 46.34375, - "epoch": 0.44263862332695986, - "grad_norm": 40.11998748779297, - "kl": 0.3203125, - "learning_rate": 5.573613766730401e-07, - "loss": 0.0129, - "reward": 1.6404682397842407, - "reward_std": 0.13934734463691711, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4373432695865631, - "rewards/pad": 0.203125, - "step": 1389 - }, - { - "completion_length": 97.78125, - "epoch": 0.4429572976418101, - "grad_norm": 41.30707550048828, - "kl": 0.099609375, - "learning_rate": 5.570427023581899e-07, - "loss": 0.004, - "reward": 1.713468313217163, - "reward_std": 0.13982240855693817, - "rewards/answer_reward": 0.328125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3853432238101959, - "step": 1390 - }, - { - "completion_length": 20.421875, - "epoch": 0.4432759719566603, - "grad_norm": 64.35762023925781, - "kl": 0.30859375, - "learning_rate": 5.567240280433397e-07, - "loss": 0.0124, - "reward": 1.66224205493927, - "reward_std": 0.15928740799427032, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.61536705493927, - "rewards/pad": 0.046875, - "step": 1391 - }, - { - "completion_length": 67.71875, - "epoch": 0.4435946462715105, - "grad_norm": 18.37198257446289, - "kl": 0.1513671875, - "learning_rate": 5.564053537284895e-07, - "loss": 0.006, - "reward": 1.6161123514175415, - "reward_std": 0.1390165537595749, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5379872918128967, - "rewards/pad": 0.078125, - "step": 1392 - }, - { - "completion_length": 69.65625, - "epoch": 0.44391332058636074, - "grad_norm": 97.37084197998047, - "kl": 0.1484375, - "learning_rate": 5.560866794136391e-07, - "loss": 0.0059, - "reward": 1.722306728363037, - "reward_std": 0.13341601192951202, - "rewards/answer_reward": 0.171875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5504317283630371, - "step": 1393 - }, - { - "completion_length": 122.078125, - "epoch": 0.44423199490121096, - "grad_norm": 12.7796630859375, - "kl": 0.08642578125, - "learning_rate": 5.557680050987889e-07, - "loss": 0.0035, - "reward": 1.5103886127471924, - "reward_std": 0.0922664999961853, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5260135531425476, - "step": 1394 - }, - { - "completion_length": 121.90625, - "epoch": 0.4445506692160612, - "grad_norm": 13.822649002075195, - "kl": 0.11279296875, - "learning_rate": 5.554493307839388e-07, - "loss": 0.0045, - "reward": 1.471165657043457, - "reward_std": 0.0489761158823967, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47116556763648987, - "step": 1395 - }, - { - "completion_length": 122.265625, - "epoch": 0.4448693435309114, - "grad_norm": 95.01806640625, - "kl": 0.1142578125, - "learning_rate": 5.551306564690886e-07, - "loss": 0.0046, - "reward": 1.5615136623382568, - "reward_std": 0.04147946089506149, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5615136027336121, - "rewards/pad": 0.0, - "step": 1396 - }, - { - "completion_length": 70.921875, - "epoch": 0.4451880178457616, - "grad_norm": 40.63417053222656, - "kl": 0.142578125, - "learning_rate": 5.548119821542383e-07, - "loss": 0.0057, - "reward": 1.5273734331130981, - "reward_std": 0.040808022022247314, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5273733735084534, - "rewards/pad": 0.0, - "step": 1397 - }, - { - "completion_length": 70.640625, - "epoch": 0.44550669216061184, - "grad_norm": 47.04145812988281, - "kl": 0.169921875, - "learning_rate": 5.544933078393881e-07, - "loss": 0.0068, - "reward": 1.4421722888946533, - "reward_std": 0.056907620280981064, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44217219948768616, - "step": 1398 - }, - { - "completion_length": 72.125, - "epoch": 0.44582536647546206, - "grad_norm": 104.2425308227539, - "kl": 0.15234375, - "learning_rate": 5.541746335245379e-07, - "loss": 0.0061, - "reward": 1.5128662586212158, - "reward_std": 0.09221340715885162, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4972413182258606, - "rewards/pad": 0.015625, - "step": 1399 - }, - { - "completion_length": 46.1875, - "epoch": 0.4461440407903123, - "grad_norm": 37.326927185058594, - "kl": 0.2197265625, - "learning_rate": 5.538559592096877e-07, - "loss": 0.0088, - "reward": 1.8115777969360352, - "reward_std": 0.052388474345207214, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5615776777267456, - "step": 1400 - }, - { - "completion_length": 70.453125, - "epoch": 0.4464627151051625, - "grad_norm": 129.155517578125, - "kl": 0.1787109375, - "learning_rate": 5.535372848948374e-07, - "loss": 0.0072, - "reward": 1.4227665662765503, - "reward_std": 0.0634697824716568, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4227665066719055, - "step": 1401 - }, - { - "completion_length": 122.609375, - "epoch": 0.4467813894200127, - "grad_norm": 6.261690139770508, - "kl": 0.14453125, - "learning_rate": 5.532186105799872e-07, - "loss": 0.0058, - "reward": 1.3813254833221436, - "reward_std": 0.06035871058702469, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3813254237174988, - "rewards/pad": 0.0, - "step": 1402 - }, - { - "completion_length": 98.59375, - "epoch": 0.447100063734863, - "grad_norm": 69.14564514160156, - "kl": 0.1015625, - "learning_rate": 5.52899936265137e-07, - "loss": 0.0041, - "reward": 1.9547457695007324, - "reward_std": 0.12339489161968231, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5797458291053772, - "rewards/pad": 0.375, - "step": 1403 - }, - { - "completion_length": 48.609375, - "epoch": 0.4474187380497132, - "grad_norm": 79.41541290283203, - "kl": 0.16015625, - "learning_rate": 5.525812619502868e-07, - "loss": 0.0064, - "reward": 1.612067461013794, - "reward_std": 0.11976909637451172, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37769243121147156, - "step": 1404 - }, - { - "completion_length": 70.75, - "epoch": 0.44773741236456344, - "grad_norm": 24.919973373413086, - "kl": 0.1552734375, - "learning_rate": 5.522625876354365e-07, - "loss": 0.0062, - "reward": 1.6123287677764893, - "reward_std": 0.07202111184597015, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48732879757881165, - "step": 1405 - }, - { - "completion_length": 71.109375, - "epoch": 0.44805608667941366, - "grad_norm": 53.67876052856445, - "kl": 0.1396484375, - "learning_rate": 5.519439133205863e-07, - "loss": 0.0056, - "reward": 1.5918638706207275, - "reward_std": 0.07275165617465973, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46686387062072754, - "step": 1406 - }, - { - "completion_length": 95.375, - "epoch": 0.4483747609942639, - "grad_norm": 20.173824310302734, - "kl": 0.369140625, - "learning_rate": 5.516252390057361e-07, - "loss": 0.0148, - "reward": 1.5206084251403809, - "reward_std": 0.13873018324375153, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5362333655357361, - "rewards/pad": 0.0, - "step": 1407 - }, - { - "completion_length": 120.9375, - "epoch": 0.4486934353091141, - "grad_norm": 25.047237396240234, - "kl": 0.10107421875, - "learning_rate": 5.513065646908859e-07, - "loss": 0.004, - "reward": 1.4869604110717773, - "reward_std": 0.061503179371356964, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3619605302810669, - "step": 1408 - }, - { - "completion_length": 46.96875, - "epoch": 0.4490121096239643, - "grad_norm": 51.29338455200195, - "kl": 0.146484375, - "learning_rate": 5.509878903760356e-07, - "loss": 0.0059, - "reward": 1.7304821014404297, - "reward_std": 0.12294122576713562, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5586071014404297, - "rewards/pad": 0.171875, - "step": 1409 - }, - { - "completion_length": 46.390625, - "epoch": 0.44933078393881454, - "grad_norm": 25.94217300415039, - "kl": 0.1142578125, - "learning_rate": 5.506692160611854e-07, - "loss": 0.0046, - "reward": 1.7969729900360107, - "reward_std": 0.12597975134849548, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43759799003601074, - "rewards/pad": 0.359375, - "step": 1410 - }, - { - "completion_length": 95.90625, - "epoch": 0.44964945825366476, - "grad_norm": 17.947458267211914, - "kl": 0.2578125, - "learning_rate": 5.503505417463352e-07, - "loss": 0.0103, - "reward": 1.6846892833709717, - "reward_std": 0.07526648789644241, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5596892237663269, - "step": 1411 - }, - { - "completion_length": 146.140625, - "epoch": 0.449968132568515, - "grad_norm": 42.996315002441406, - "kl": 0.130859375, - "learning_rate": 5.50031867431485e-07, - "loss": 0.0053, - "reward": 1.3987228870391846, - "reward_std": 0.10691266506910324, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38309788703918457, - "step": 1412 - }, - { - "completion_length": 71.703125, - "epoch": 0.4502868068833652, - "grad_norm": 42.76258850097656, - "kl": 0.2138671875, - "learning_rate": 5.497131931166347e-07, - "loss": 0.0085, - "reward": 1.5707018375396729, - "reward_std": 0.07505904138088226, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.44570183753967285, - "step": 1413 - }, - { - "completion_length": 72.859375, - "epoch": 0.4506054811982154, - "grad_norm": 47.79791259765625, - "kl": 0.1513671875, - "learning_rate": 5.493945188017846e-07, - "loss": 0.0061, - "reward": 1.6459414958953857, - "reward_std": 0.08020886778831482, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5209414958953857, - "step": 1414 - }, - { - "completion_length": 172.125, - "epoch": 0.45092415551306564, - "grad_norm": 71.31310272216797, - "kl": 0.0751953125, - "learning_rate": 5.490758444869344e-07, - "loss": 0.003, - "reward": 1.658172845840454, - "reward_std": 0.06371012330055237, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5331727862358093, - "rewards/pad": 0.125, - "step": 1415 - }, - { - "completion_length": 150.515625, - "epoch": 0.45124282982791586, - "grad_norm": 8.224554061889648, - "kl": 0.1787109375, - "learning_rate": 5.487571701720841e-07, - "loss": 0.0071, - "reward": 1.5972743034362793, - "reward_std": 0.1921130120754242, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 0.96875, - "rewards/tracking_iou_reward": 0.5191493630409241, - "step": 1416 - }, - { - "completion_length": 149.9375, - "epoch": 0.4515615041427661, - "grad_norm": 12.27308464050293, - "kl": 0.0810546875, - "learning_rate": 5.484384958572339e-07, - "loss": 0.0032, - "reward": 1.4329180717468262, - "reward_std": 0.047323573380708694, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4329180121421814, - "step": 1417 - }, - { - "completion_length": 120.609375, - "epoch": 0.4518801784576163, - "grad_norm": 14.535323143005371, - "kl": 0.177734375, - "learning_rate": 5.481198215423837e-07, - "loss": 0.0071, - "reward": 1.3263839483261108, - "reward_std": 0.029305964708328247, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3263840079307556, - "rewards/pad": 0.0, - "step": 1418 - }, - { - "completion_length": 123.9375, - "epoch": 0.4521988527724665, - "grad_norm": 74.81829833984375, - "kl": 0.10595703125, - "learning_rate": 5.478011472275335e-07, - "loss": 0.0042, - "reward": 1.645141363143921, - "reward_std": 0.09291210770606995, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5201413035392761, - "rewards/pad": 0.125, - "step": 1419 - }, - { - "completion_length": 94.421875, - "epoch": 0.45251752708731674, - "grad_norm": 66.59827423095703, - "kl": 0.099609375, - "learning_rate": 5.474824729126832e-07, - "loss": 0.004, - "reward": 1.3246803283691406, - "reward_std": 0.11868653446435928, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.3403053283691406, - "rewards/pad": 0.0, - "step": 1420 - }, - { - "completion_length": 121.921875, - "epoch": 0.45283620140216696, - "grad_norm": 23.449520111083984, - "kl": 0.1650390625, - "learning_rate": 5.47163798597833e-07, - "loss": 0.0066, - "reward": 1.550499439239502, - "reward_std": 0.1245659664273262, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44112440943717957, - "step": 1421 - }, - { - "completion_length": 44.390625, - "epoch": 0.45315487571701724, - "grad_norm": 54.22697830200195, - "kl": 0.44921875, - "learning_rate": 5.468451242829828e-07, - "loss": 0.018, - "reward": 1.5884101390838623, - "reward_std": 0.1501428335905075, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5884101986885071, - "rewards/pad": 0.0, - "step": 1422 - }, - { - "completion_length": 97.984375, - "epoch": 0.45347355003186746, - "grad_norm": 19.764827728271484, - "kl": 0.1181640625, - "learning_rate": 5.465264499681326e-07, - "loss": 0.0047, - "reward": 1.6852483749389648, - "reward_std": 0.07374419271945953, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5602483749389648, - "step": 1423 - }, - { - "completion_length": 72.671875, - "epoch": 0.4537922243467177, - "grad_norm": 97.86742401123047, - "kl": 0.150390625, - "learning_rate": 5.462077756532823e-07, - "loss": 0.006, - "reward": 1.7004663944244385, - "reward_std": 0.17683261632919312, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5598414540290833, - "rewards/pad": 0.140625, - "step": 1424 - }, - { - "completion_length": 172.28125, - "epoch": 0.4541108986615679, - "grad_norm": 6.176180839538574, - "kl": 0.12109375, - "learning_rate": 5.458891013384321e-07, - "loss": 0.0048, - "reward": 1.4284148216247559, - "reward_std": 0.10202738642692566, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.444039911031723, - "step": 1425 - }, - { - "completion_length": 97.546875, - "epoch": 0.4544295729764181, - "grad_norm": 84.13567352294922, - "kl": 0.1826171875, - "learning_rate": 5.455704270235819e-07, - "loss": 0.0073, - "reward": 1.5227749347686768, - "reward_std": 0.07707509398460388, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39777496457099915, - "step": 1426 - }, - { - "completion_length": 71.953125, - "epoch": 0.45474824729126834, - "grad_norm": 48.81588363647461, - "kl": 0.2041015625, - "learning_rate": 5.452517527087317e-07, - "loss": 0.0082, - "reward": 1.6935824155807495, - "reward_std": 0.09216948598623276, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5685823559761047, - "step": 1427 - }, - { - "completion_length": 95.390625, - "epoch": 0.45506692160611856, - "grad_norm": 26.522541046142578, - "kl": 0.1357421875, - "learning_rate": 5.449330783938814e-07, - "loss": 0.0054, - "reward": 1.5668354034423828, - "reward_std": 0.07392378896474838, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5668354034423828, - "rewards/pad": 0.0, - "step": 1428 - }, - { - "completion_length": 148.890625, - "epoch": 0.4553855959209688, - "grad_norm": 11.638447761535645, - "kl": 0.09228515625, - "learning_rate": 5.446144040790312e-07, - "loss": 0.0037, - "reward": 1.5445245504379272, - "reward_std": 0.037910446524620056, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.419524610042572, - "rewards/pad": 0.125, - "step": 1429 - }, - { - "completion_length": 71.625, - "epoch": 0.455704270235819, - "grad_norm": 57.1502685546875, - "kl": 0.1708984375, - "learning_rate": 5.44295729764181e-07, - "loss": 0.0068, - "reward": 1.686220407485962, - "reward_std": 0.07011242210865021, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5612203478813171, - "rewards/pad": 0.125, - "step": 1430 - }, - { - "completion_length": 123.578125, - "epoch": 0.4560229445506692, - "grad_norm": 33.73246383666992, - "kl": 0.123046875, - "learning_rate": 5.439770554493309e-07, - "loss": 0.0049, - "reward": 1.6060597896575928, - "reward_std": 0.058518167585134506, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4810597896575928, - "step": 1431 - }, - { - "completion_length": 128.859375, - "epoch": 0.45634161886551944, - "grad_norm": 49.34904479980469, - "kl": 0.076171875, - "learning_rate": 5.436583811344804e-07, - "loss": 0.003, - "reward": 1.4976006746292114, - "reward_std": 0.06735117733478546, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.37260061502456665, - "rewards/pad": 0.125, - "step": 1432 - }, - { - "completion_length": 99.328125, - "epoch": 0.45666029318036966, - "grad_norm": 19.230791091918945, - "kl": 0.09375, - "learning_rate": 5.433397068196303e-07, - "loss": 0.0037, - "reward": 1.8719754219055176, - "reward_std": 0.08034700155258179, - "rewards/answer_reward": 0.484375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3876003623008728, - "step": 1433 - }, - { - "completion_length": 121.53125, - "epoch": 0.4569789674952199, - "grad_norm": 14.015466690063477, - "kl": 0.1376953125, - "learning_rate": 5.430210325047801e-07, - "loss": 0.0055, - "reward": 1.534825086593628, - "reward_std": 0.04365585371851921, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5348250865936279, - "step": 1434 - }, - { - "completion_length": 70.84375, - "epoch": 0.4572976418100701, - "grad_norm": 18.4355525970459, - "kl": 0.1376953125, - "learning_rate": 5.427023581899299e-07, - "loss": 0.0055, - "reward": 1.6791949272155762, - "reward_std": 0.04616924747824669, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5541948676109314, - "step": 1435 - }, - { - "completion_length": 93.65625, - "epoch": 0.4576163161249203, - "grad_norm": 16.46039581298828, - "kl": 0.26953125, - "learning_rate": 5.423836838750796e-07, - "loss": 0.0108, - "reward": 1.4510504007339478, - "reward_std": 0.0750553086400032, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45105046033859253, - "step": 1436 - }, - { - "completion_length": 97.390625, - "epoch": 0.45793499043977054, - "grad_norm": 23.34442138671875, - "kl": 0.1474609375, - "learning_rate": 5.420650095602294e-07, - "loss": 0.0059, - "reward": 1.5739753246307373, - "reward_std": 0.09569709002971649, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5739753246307373, - "rewards/pad": 0.0, - "step": 1437 - }, - { - "completion_length": 147.15625, - "epoch": 0.45825366475462076, - "grad_norm": 27.796993255615234, - "kl": 0.091796875, - "learning_rate": 5.417463352453792e-07, - "loss": 0.0037, - "reward": 1.5219330787658691, - "reward_std": 0.024136081337928772, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5219330787658691, - "rewards/pad": 0.0, - "step": 1438 - }, - { - "completion_length": 71.125, - "epoch": 0.458572339069471, - "grad_norm": 45.78583526611328, - "kl": 0.119140625, - "learning_rate": 5.41427660930529e-07, - "loss": 0.0048, - "reward": 1.8081820011138916, - "reward_std": 0.08273088932037354, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5581819415092468, - "rewards/pad": 0.25, - "step": 1439 - }, - { - "completion_length": 125.765625, - "epoch": 0.4588910133843212, - "grad_norm": 23.17354965209961, - "kl": 0.1083984375, - "learning_rate": 5.411089866156787e-07, - "loss": 0.0043, - "reward": 1.6552670001983643, - "reward_std": 0.20718078315258026, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42089203000068665, - "step": 1440 - }, - { - "completion_length": 123.9375, - "epoch": 0.4592096876991714, - "grad_norm": 23.839529037475586, - "kl": 0.25, - "learning_rate": 5.407903123008285e-07, - "loss": 0.01, - "reward": 1.5547746419906616, - "reward_std": 0.07701779901981354, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4297747313976288, - "rewards/pad": 0.125, - "step": 1441 - }, - { - "completion_length": 45.78125, - "epoch": 0.4595283620140217, - "grad_norm": 40.79808807373047, - "kl": 0.2197265625, - "learning_rate": 5.404716379859783e-07, - "loss": 0.0088, - "reward": 1.6783041954040527, - "reward_std": 0.12530480325222015, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5533040761947632, - "step": 1442 - }, - { - "completion_length": 74.5625, - "epoch": 0.4598470363288719, - "grad_norm": 43.00371170043945, - "kl": 0.1533203125, - "learning_rate": 5.401529636711281e-07, - "loss": 0.0061, - "reward": 1.6545014381408691, - "reward_std": 0.11320410668849945, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.38887643814086914, - "rewards/pad": 0.265625, - "step": 1443 - }, - { - "completion_length": 99.296875, - "epoch": 0.46016571064372214, - "grad_norm": 16.58584976196289, - "kl": 0.1318359375, - "learning_rate": 5.398342893562778e-07, - "loss": 0.0053, - "reward": 1.5221130847930908, - "reward_std": 0.08949887752532959, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5221132636070251, - "rewards/pad": 0.0, - "step": 1444 - }, - { - "completion_length": 46.34375, - "epoch": 0.46048438495857236, - "grad_norm": 67.59211730957031, - "kl": 0.1787109375, - "learning_rate": 5.395156150414276e-07, - "loss": 0.0072, - "reward": 1.4442793130874634, - "reward_std": 0.1727423369884491, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3661543130874634, - "rewards/pad": 0.078125, - "step": 1445 - }, - { - "completion_length": 124.34375, - "epoch": 0.4608030592734226, - "grad_norm": 34.86455535888672, - "kl": 0.1201171875, - "learning_rate": 5.391969407265774e-07, - "loss": 0.0048, - "reward": 1.561646819114685, - "reward_std": 0.07751183211803436, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5460219383239746, - "rewards/pad": 0.015625, - "step": 1446 - }, - { - "completion_length": 125.765625, - "epoch": 0.4611217335882728, - "grad_norm": 31.93128776550293, - "kl": 0.1328125, - "learning_rate": 5.388782664117271e-07, - "loss": 0.0053, - "reward": 1.6424591541290283, - "reward_std": 0.1986120492219925, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.43933412432670593, - "step": 1447 - }, - { - "completion_length": 148.828125, - "epoch": 0.461440407903123, - "grad_norm": 21.80022430419922, - "kl": 0.0849609375, - "learning_rate": 5.385595920968769e-07, - "loss": 0.0034, - "reward": 1.6194400787353516, - "reward_std": 0.09938361495733261, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5256900787353516, - "rewards/pad": 0.09375, - "step": 1448 - }, - { - "completion_length": 96.953125, - "epoch": 0.46175908221797324, - "grad_norm": 40.039730072021484, - "kl": 0.15625, - "learning_rate": 5.382409177820267e-07, - "loss": 0.0063, - "reward": 1.564121961593628, - "reward_std": 0.11281084269285202, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5484969615936279, - "rewards/pad": 0.015625, - "step": 1449 - }, - { - "completion_length": 127.578125, - "epoch": 0.46207775653282346, - "grad_norm": 19.242937088012695, - "kl": 0.10302734375, - "learning_rate": 5.379222434671765e-07, - "loss": 0.0041, - "reward": 1.6729660034179688, - "reward_std": 0.06923995912075043, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5479658246040344, - "rewards/pad": 0.125, - "step": 1450 - }, - { - "completion_length": 70.4375, - "epoch": 0.4623964308476737, - "grad_norm": 26.17292022705078, - "kl": 0.1796875, - "learning_rate": 5.376035691523262e-07, - "loss": 0.0072, - "reward": 1.6035188436508179, - "reward_std": 0.124393530189991, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49414384365081787, - "rewards/pad": 0.109375, - "step": 1451 - }, - { - "completion_length": 44.4375, - "epoch": 0.4627151051625239, - "grad_norm": 101.223388671875, - "kl": 0.2099609375, - "learning_rate": 5.372848948374761e-07, - "loss": 0.0084, - "reward": 1.464913010597229, - "reward_std": 0.07390671968460083, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46491295099258423, - "rewards/pad": 0.0, - "step": 1452 - }, - { - "completion_length": 123.234375, - "epoch": 0.4630337794773741, - "grad_norm": 25.080516815185547, - "kl": 0.1220703125, - "learning_rate": 5.369662205226259e-07, - "loss": 0.0049, - "reward": 1.477994680404663, - "reward_std": 0.0637764185667038, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3529945909976959, - "step": 1453 - }, - { - "completion_length": 122.359375, - "epoch": 0.46335245379222434, - "grad_norm": 20.0180606842041, - "kl": 0.1083984375, - "learning_rate": 5.366475462077757e-07, - "loss": 0.0043, - "reward": 1.2684621810913086, - "reward_std": 0.048859499394893646, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.26846227049827576, - "rewards/pad": 0.0, - "step": 1454 - }, - { - "completion_length": 71.84375, - "epoch": 0.46367112810707456, - "grad_norm": 15.79022216796875, - "kl": 0.2197265625, - "learning_rate": 5.363288718929254e-07, - "loss": 0.0088, - "reward": 1.782228708267212, - "reward_std": 0.1331326961517334, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6572288274765015, - "rewards/pad": 0.125, - "step": 1455 - }, - { - "completion_length": 70.78125, - "epoch": 0.4639898024219248, - "grad_norm": 242.91552734375, - "kl": 0.1376953125, - "learning_rate": 5.360101975780752e-07, - "loss": 0.0055, - "reward": 1.8918707370758057, - "reward_std": 0.061253033578395844, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6418708562850952, - "step": 1456 - }, - { - "completion_length": 73.171875, - "epoch": 0.464308476736775, - "grad_norm": 13.989324569702148, - "kl": 0.1298828125, - "learning_rate": 5.35691523263225e-07, - "loss": 0.0052, - "reward": 1.6837668418884277, - "reward_std": 0.061112187802791595, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43376684188842773, - "step": 1457 - }, - { - "completion_length": 126.46875, - "epoch": 0.4646271510516252, - "grad_norm": 31.25027847290039, - "kl": 0.099609375, - "learning_rate": 5.353728489483748e-07, - "loss": 0.004, - "reward": 1.5163158178329468, - "reward_std": 0.06596622616052628, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2663158178329468, - "step": 1458 - }, - { - "completion_length": 70.953125, - "epoch": 0.46494582536647544, - "grad_norm": 24.366724014282227, - "kl": 0.2314453125, - "learning_rate": 5.350541746335245e-07, - "loss": 0.0092, - "reward": 1.5501924753189087, - "reward_std": 0.08063836395740509, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5501924753189087, - "rewards/pad": 0.0, - "step": 1459 - }, - { - "completion_length": 126.8125, - "epoch": 0.46526449968132566, - "grad_norm": 18.02052116394043, - "kl": 0.1611328125, - "learning_rate": 5.347355003186743e-07, - "loss": 0.0065, - "reward": 1.519972801208496, - "reward_std": 0.06053301319479942, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5199728012084961, - "step": 1460 - }, - { - "completion_length": 124.203125, - "epoch": 0.4655831739961759, - "grad_norm": 11.933880805969238, - "kl": 0.10205078125, - "learning_rate": 5.344168260038241e-07, - "loss": 0.0041, - "reward": 1.4709376096725464, - "reward_std": 0.10856231302022934, - "rewards/answer_reward": 0.046875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.42406266927719116, - "step": 1461 - }, - { - "completion_length": 124.484375, - "epoch": 0.46590184831102616, - "grad_norm": 9.68100643157959, - "kl": 0.1279296875, - "learning_rate": 5.340981516889739e-07, - "loss": 0.0051, - "reward": 1.5821932554244995, - "reward_std": 0.08199529349803925, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4571932554244995, - "rewards/pad": 0.125, - "step": 1462 - }, - { - "completion_length": 48.171875, - "epoch": 0.4662205226258764, - "grad_norm": 30.872400283813477, - "kl": 0.2138671875, - "learning_rate": 5.337794773741236e-07, - "loss": 0.0085, - "reward": 1.7756941318511963, - "reward_std": 0.06104860454797745, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4006942808628082, - "step": 1463 - }, - { - "completion_length": 46.140625, - "epoch": 0.4665391969407266, - "grad_norm": 32.10598373413086, - "kl": 0.1533203125, - "learning_rate": 5.334608030592734e-07, - "loss": 0.0061, - "reward": 1.8779191970825195, - "reward_std": 0.09105006605386734, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6279191374778748, - "rewards/pad": 0.25, - "step": 1464 - }, - { - "completion_length": 95.3125, - "epoch": 0.4668578712555768, - "grad_norm": 31.138216018676758, - "kl": 0.2177734375, - "learning_rate": 5.331421287444232e-07, - "loss": 0.0087, - "reward": 1.5052273273468018, - "reward_std": 0.05461052432656288, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5052273869514465, - "rewards/pad": 0.0, - "step": 1465 - }, - { - "completion_length": 69.796875, - "epoch": 0.46717654557042704, - "grad_norm": 29.701608657836914, - "kl": 0.15625, - "learning_rate": 5.32823454429573e-07, - "loss": 0.0062, - "reward": 1.4903333187103271, - "reward_std": 0.07421676814556122, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49033334851264954, - "step": 1466 - }, - { - "completion_length": 71.09375, - "epoch": 0.46749521988527726, - "grad_norm": 21.280033111572266, - "kl": 0.1953125, - "learning_rate": 5.325047801147227e-07, - "loss": 0.0078, - "reward": 1.7816627025604248, - "reward_std": 0.06757558137178421, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6566627621650696, - "step": 1467 - }, - { - "completion_length": 150.796875, - "epoch": 0.4678138942001275, - "grad_norm": 10.08774185180664, - "kl": 0.12255859375, - "learning_rate": 5.321861057998725e-07, - "loss": 0.0049, - "reward": 1.463547945022583, - "reward_std": 0.03903178498148918, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.33854788541793823, - "rewards/pad": 0.125, - "step": 1468 - }, - { - "completion_length": 71.421875, - "epoch": 0.4681325685149777, - "grad_norm": 31.640708923339844, - "kl": 0.1484375, - "learning_rate": 5.318674314850224e-07, - "loss": 0.0059, - "reward": 1.387979507446289, - "reward_std": 0.0799902081489563, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.37235450744628906, - "rewards/pad": 0.015625, - "step": 1469 - }, - { - "completion_length": 126.125, - "epoch": 0.4684512428298279, - "grad_norm": 63.39480972290039, - "kl": 0.1064453125, - "learning_rate": 5.315487571701722e-07, - "loss": 0.0043, - "reward": 1.4995639324188232, - "reward_std": 0.14248573780059814, - "rewards/answer_reward": 0.0625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43706390261650085, - "step": 1470 - }, - { - "completion_length": 96.390625, - "epoch": 0.46876991714467814, - "grad_norm": 33.108638763427734, - "kl": 0.30859375, - "learning_rate": 5.312300828553218e-07, - "loss": 0.0124, - "reward": 1.610578179359436, - "reward_std": 0.11231406033039093, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.610578179359436, - "rewards/pad": 0.0, - "step": 1471 - }, - { - "completion_length": 70.453125, - "epoch": 0.46908859145952836, - "grad_norm": 17.15878677368164, - "kl": 0.201171875, - "learning_rate": 5.309114085404716e-07, - "loss": 0.008, - "reward": 1.852674126625061, - "reward_std": 0.06432235985994339, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.727674126625061, - "rewards/pad": 0.125, - "step": 1472 - }, - { - "completion_length": 71.703125, - "epoch": 0.4694072657743786, - "grad_norm": 29.73100471496582, - "kl": 0.158203125, - "learning_rate": 5.305927342256214e-07, - "loss": 0.0063, - "reward": 1.6606179475784302, - "reward_std": 0.08843652904033661, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5356178283691406, - "step": 1473 - }, - { - "completion_length": 149.21875, - "epoch": 0.4697259400892288, - "grad_norm": 8.382768630981445, - "kl": 0.11083984375, - "learning_rate": 5.302740599107712e-07, - "loss": 0.0044, - "reward": 1.3328176736831665, - "reward_std": 0.08705488592386246, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.3484426736831665, - "rewards/pad": 0.0, - "step": 1474 - }, - { - "completion_length": 72.109375, - "epoch": 0.470044614404079, - "grad_norm": 22.395475387573242, - "kl": 0.16015625, - "learning_rate": 5.299553855959209e-07, - "loss": 0.0064, - "reward": 1.524498701095581, - "reward_std": 0.044502221047878265, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.39949876070022583, - "rewards/pad": 0.125, - "step": 1475 - }, - { - "completion_length": 72.765625, - "epoch": 0.47036328871892924, - "grad_norm": 54.53713607788086, - "kl": 0.1708984375, - "learning_rate": 5.296367112810707e-07, - "loss": 0.0068, - "reward": 1.5576719045639038, - "reward_std": 0.23511239886283875, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.3545469045639038, - "step": 1476 - }, - { - "completion_length": 97.15625, - "epoch": 0.47068196303377946, - "grad_norm": 13.809905052185059, - "kl": 0.2216796875, - "learning_rate": 5.293180369662205e-07, - "loss": 0.0089, - "reward": 1.5029902458190918, - "reward_std": 0.048063814640045166, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5029902458190918, - "step": 1477 - }, - { - "completion_length": 147.171875, - "epoch": 0.4710006373486297, - "grad_norm": 30.669885635375977, - "kl": 0.08984375, - "learning_rate": 5.289993626513702e-07, - "loss": 0.0036, - "reward": 1.568947434425354, - "reward_std": 0.11746594309806824, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.459572434425354, - "step": 1478 - }, - { - "completion_length": 122.546875, - "epoch": 0.4713193116634799, - "grad_norm": 54.18355941772461, - "kl": 0.14453125, - "learning_rate": 5.2868068833652e-07, - "loss": 0.0058, - "reward": 1.4720231294631958, - "reward_std": 0.13342934846878052, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4876481592655182, - "step": 1479 - }, - { - "completion_length": 72.015625, - "epoch": 0.4716379859783301, - "grad_norm": 16.47974395751953, - "kl": 0.181640625, - "learning_rate": 5.283620140216698e-07, - "loss": 0.0073, - "reward": 1.4542089700698853, - "reward_std": 0.08869212865829468, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42295897006988525, - "step": 1480 - }, - { - "completion_length": 69.203125, - "epoch": 0.4719566602931804, - "grad_norm": 104.44915771484375, - "kl": 0.1279296875, - "learning_rate": 5.280433397068196e-07, - "loss": 0.0051, - "reward": 1.4982777833938599, - "reward_std": 0.06480859965085983, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.49827784299850464, - "step": 1481 - }, - { - "completion_length": 74.0, - "epoch": 0.4722753346080306, - "grad_norm": 23.072843551635742, - "kl": 0.1982421875, - "learning_rate": 5.277246653919693e-07, - "loss": 0.0079, - "reward": 1.7503235340118408, - "reward_std": 0.06959592550992966, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6253235936164856, - "rewards/pad": 0.125, - "step": 1482 - }, - { - "completion_length": 123.09375, - "epoch": 0.47259400892288084, - "grad_norm": 13.988505363464355, - "kl": 0.111328125, - "learning_rate": 5.274059910771191e-07, - "loss": 0.0045, - "reward": 1.515568494796753, - "reward_std": 0.04018034413456917, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5155684351921082, - "rewards/pad": 0.0, - "step": 1483 - }, - { - "completion_length": 77.828125, - "epoch": 0.47291268323773106, - "grad_norm": 22.228260040283203, - "kl": 0.1337890625, - "learning_rate": 5.270873167622689e-07, - "loss": 0.0054, - "reward": 1.716814637184143, - "reward_std": 0.10153313726186752, - "rewards/answer_reward": 0.421875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.29493963718414307, - "step": 1484 - }, - { - "completion_length": 72.21875, - "epoch": 0.4732313575525813, - "grad_norm": 66.78959655761719, - "kl": 0.134765625, - "learning_rate": 5.267686424474187e-07, - "loss": 0.0054, - "reward": 1.7643070220947266, - "reward_std": 0.1584181785583496, - "rewards/pad": 0.328125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.436181902885437, - "step": 1485 - }, - { - "completion_length": 99.515625, - "epoch": 0.4735500318674315, - "grad_norm": 15.219985008239746, - "kl": 0.205078125, - "learning_rate": 5.264499681325684e-07, - "loss": 0.0082, - "reward": 1.3323643207550049, - "reward_std": 0.07871396839618683, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3167392909526825, - "step": 1486 - }, - { - "completion_length": 123.1875, - "epoch": 0.4738687061822817, - "grad_norm": 45.12118148803711, - "kl": 0.10400390625, - "learning_rate": 5.261312938177182e-07, - "loss": 0.0042, - "reward": 1.6009794473648071, - "reward_std": 0.13043075799942017, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5853543877601624, - "rewards/pad": 0.03125, - "step": 1487 - }, - { - "completion_length": 122.671875, - "epoch": 0.47418738049713194, - "grad_norm": 18.35110092163086, - "kl": 0.103515625, - "learning_rate": 5.25812619502868e-07, - "loss": 0.0041, - "reward": 1.6021339893341064, - "reward_std": 0.0574474036693573, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4771340787410736, - "step": 1488 - }, - { - "completion_length": 70.421875, - "epoch": 0.47450605481198216, - "grad_norm": 30.877819061279297, - "kl": 0.15234375, - "learning_rate": 5.254939451880179e-07, - "loss": 0.0061, - "reward": 1.5548548698425293, - "reward_std": 0.08415327221155167, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4611048996448517, - "rewards/pad": 0.09375, - "step": 1489 - }, - { - "completion_length": 96.53125, - "epoch": 0.4748247291268324, - "grad_norm": 12.576179504394531, - "kl": 0.1845703125, - "learning_rate": 5.251752708731676e-07, - "loss": 0.0074, - "reward": 1.458923578262329, - "reward_std": 0.04576413705945015, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4589235782623291, - "rewards/pad": 0.0, - "step": 1490 - }, - { - "completion_length": 96.203125, - "epoch": 0.4751434034416826, - "grad_norm": 42.55897521972656, - "kl": 0.10693359375, - "learning_rate": 5.248565965583174e-07, - "loss": 0.0043, - "reward": 1.6549313068389893, - "reward_std": 0.03176398202776909, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5299313068389893, - "rewards/pad": 0.125, - "step": 1491 - }, - { - "completion_length": 70.375, - "epoch": 0.4754620777565328, - "grad_norm": 30.84855842590332, - "kl": 0.1787109375, - "learning_rate": 5.245379222434672e-07, - "loss": 0.0072, - "reward": 1.458336591720581, - "reward_std": 0.058646999299526215, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4583367109298706, - "rewards/pad": 0.0, - "step": 1492 - }, - { - "completion_length": 123.515625, - "epoch": 0.47578075207138304, - "grad_norm": 25.724525451660156, - "kl": 0.12890625, - "learning_rate": 5.24219247928617e-07, - "loss": 0.0052, - "reward": 1.5070421695709229, - "reward_std": 0.042530253529548645, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3820420503616333, - "rewards/pad": 0.125, - "step": 1493 - }, - { - "completion_length": 124.78125, - "epoch": 0.47609942638623326, - "grad_norm": 8.826605796813965, - "kl": 0.166015625, - "learning_rate": 5.239005736137667e-07, - "loss": 0.0066, - "reward": 1.6858264207839966, - "reward_std": 0.08236081898212433, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5608264207839966, - "step": 1494 - }, - { - "completion_length": 74.09375, - "epoch": 0.4764181007010835, - "grad_norm": 23.001249313354492, - "kl": 0.201171875, - "learning_rate": 5.235818992989165e-07, - "loss": 0.008, - "reward": 1.9178105592727661, - "reward_std": 0.060972314327955246, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6678106188774109, - "step": 1495 - }, - { - "completion_length": 45.21875, - "epoch": 0.4767367750159337, - "grad_norm": 66.9561538696289, - "kl": 0.2275390625, - "learning_rate": 5.232632249840663e-07, - "loss": 0.0091, - "reward": 1.6306493282318115, - "reward_std": 0.15838010609149933, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5212743282318115, - "rewards/pad": 0.109375, - "step": 1496 - }, - { - "completion_length": 122.734375, - "epoch": 0.4770554493307839, - "grad_norm": 137.2368927001953, - "kl": 0.2158203125, - "learning_rate": 5.229445506692161e-07, - "loss": 0.0086, - "reward": 1.4414557218551636, - "reward_std": 0.11607854813337326, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4414557218551636, - "rewards/pad": 0.0, - "step": 1497 - }, - { - "completion_length": 44.546875, - "epoch": 0.47737412364563414, - "grad_norm": 26.18069839477539, - "kl": 0.169921875, - "learning_rate": 5.226258763543658e-07, - "loss": 0.0068, - "reward": 1.690140724182129, - "reward_std": 0.06070908159017563, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6901407241821289, - "rewards/pad": 0.0, - "step": 1498 - }, - { - "completion_length": 123.140625, - "epoch": 0.47769279796048436, - "grad_norm": 51.89798355102539, - "kl": 0.154296875, - "learning_rate": 5.223072020395156e-07, - "loss": 0.0062, - "reward": 1.5114344358444214, - "reward_std": 0.10912653058767319, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5114343166351318, - "rewards/pad": 0.0, - "step": 1499 - }, - { - "completion_length": 71.234375, - "epoch": 0.4780114722753346, - "grad_norm": 20.26959991455078, - "kl": 0.1259765625, - "learning_rate": 5.219885277246654e-07, - "loss": 0.005, - "reward": 1.6750046014785767, - "reward_std": 0.06571859866380692, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6750045418739319, - "rewards/pad": 0.0, - "step": 1500 - }, - { - "completion_length": 123.328125, - "epoch": 0.47833014659018486, - "grad_norm": 34.22078323364258, - "kl": 0.294921875, - "learning_rate": 5.216698534098152e-07, - "loss": 0.0118, - "reward": 1.624065637588501, - "reward_std": 0.07414637506008148, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49906566739082336, - "rewards/pad": 0.125, - "step": 1501 - }, - { - "completion_length": 99.171875, - "epoch": 0.4786488209050351, - "grad_norm": 230.13027954101562, - "kl": 0.12451171875, - "learning_rate": 5.213511790949649e-07, - "loss": 0.005, - "reward": 1.460237979888916, - "reward_std": 0.08080984652042389, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.33523792028427124, - "step": 1502 - }, - { - "completion_length": 96.265625, - "epoch": 0.4789674952198853, - "grad_norm": 45.79084777832031, - "kl": 0.13671875, - "learning_rate": 5.210325047801147e-07, - "loss": 0.0055, - "reward": 1.4750096797943115, - "reward_std": 0.05699557065963745, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4750097692012787, - "rewards/pad": 0.0, - "step": 1503 - }, - { - "completion_length": 96.359375, - "epoch": 0.4792861695347355, - "grad_norm": 37.625885009765625, - "kl": 0.1806640625, - "learning_rate": 5.207138304652645e-07, - "loss": 0.0072, - "reward": 1.4821243286132812, - "reward_std": 0.060712795704603195, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4821242094039917, - "rewards/pad": 0.0, - "step": 1504 - }, - { - "completion_length": 95.40625, - "epoch": 0.47960484384958574, - "grad_norm": 45.98420333862305, - "kl": 0.193359375, - "learning_rate": 5.203951561504143e-07, - "loss": 0.0078, - "reward": 1.55684494972229, - "reward_std": 0.061466366052627563, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5568448901176453, - "step": 1505 - }, - { - "completion_length": 120.171875, - "epoch": 0.47992351816443596, - "grad_norm": 33.786048889160156, - "kl": 0.1748046875, - "learning_rate": 5.20076481835564e-07, - "loss": 0.007, - "reward": 1.4618111848831177, - "reward_std": 0.08361318707466125, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4618111550807953, - "step": 1506 - }, - { - "completion_length": 123.875, - "epoch": 0.4802421924792862, - "grad_norm": 30.181888580322266, - "kl": 0.126953125, - "learning_rate": 5.197578075207139e-07, - "loss": 0.0051, - "reward": 1.3651647567749023, - "reward_std": 0.060573190450668335, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36516469717025757, - "step": 1507 - }, - { - "completion_length": 73.5625, - "epoch": 0.4805608667941364, - "grad_norm": 18.62019157409668, - "kl": 0.1552734375, - "learning_rate": 5.194391332058637e-07, - "loss": 0.0062, - "reward": 1.5799176692962646, - "reward_std": 0.06475313007831573, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.32991766929626465, - "step": 1508 - }, - { - "completion_length": 97.6875, - "epoch": 0.4808795411089866, - "grad_norm": 28.941173553466797, - "kl": 0.15234375, - "learning_rate": 5.191204588910135e-07, - "loss": 0.0061, - "reward": 1.4807729721069336, - "reward_std": 0.09656580537557602, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4807729125022888, - "step": 1509 - }, - { - "completion_length": 124.5625, - "epoch": 0.48119821542383684, - "grad_norm": 53.21528625488281, - "kl": 0.154296875, - "learning_rate": 5.188017845761632e-07, - "loss": 0.0062, - "reward": 1.8000701665878296, - "reward_std": 0.2111843079328537, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.4563201069831848, - "rewards/pad": 0.375, - "step": 1510 - }, - { - "completion_length": 71.765625, - "epoch": 0.48151688973868706, - "grad_norm": 11.812533378601074, - "kl": 0.21875, - "learning_rate": 5.184831102613129e-07, - "loss": 0.0088, - "reward": 1.581697702407837, - "reward_std": 0.09734717756509781, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5816976428031921, - "step": 1511 - }, - { - "completion_length": 98.25, - "epoch": 0.4818355640535373, - "grad_norm": 41.84068298339844, - "kl": 0.11376953125, - "learning_rate": 5.181644359464627e-07, - "loss": 0.0046, - "reward": 1.617072582244873, - "reward_std": 0.05539502203464508, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4920726418495178, - "step": 1512 - }, - { - "completion_length": 99.046875, - "epoch": 0.4821542383683875, - "grad_norm": 13.822397232055664, - "kl": 0.14453125, - "learning_rate": 5.178457616316124e-07, - "loss": 0.0058, - "reward": 1.7891383171081543, - "reward_std": 0.0863952562212944, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5391384363174438, - "step": 1513 - }, - { - "completion_length": 98.546875, - "epoch": 0.4824729126832377, - "grad_norm": 74.7471923828125, - "kl": 0.1455078125, - "learning_rate": 5.175270873167622e-07, - "loss": 0.0058, - "reward": 1.8373247385025024, - "reward_std": 0.10276351124048233, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6029497385025024, - "step": 1514 - }, - { - "completion_length": 96.125, - "epoch": 0.48279158699808794, - "grad_norm": 23.02245330810547, - "kl": 0.138671875, - "learning_rate": 5.17208413001912e-07, - "loss": 0.0056, - "reward": 1.6412444114685059, - "reward_std": 0.0655876025557518, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5162444114685059, - "step": 1515 - }, - { - "completion_length": 150.0, - "epoch": 0.48311026131293816, - "grad_norm": 9.277081489562988, - "kl": 0.0908203125, - "learning_rate": 5.168897386870618e-07, - "loss": 0.0036, - "reward": 1.4091484546661377, - "reward_std": 0.04507271200418472, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4091483950614929, - "rewards/pad": 0.0, - "step": 1516 - }, - { - "completion_length": 71.6875, - "epoch": 0.4834289356277884, - "grad_norm": 32.54093551635742, - "kl": 0.220703125, - "learning_rate": 5.165710643722115e-07, - "loss": 0.0088, - "reward": 1.6252309083938599, - "reward_std": 0.1263033002614975, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5158559083938599, - "step": 1517 - }, - { - "completion_length": 72.375, - "epoch": 0.4837476099426386, - "grad_norm": 57.833587646484375, - "kl": 0.1591796875, - "learning_rate": 5.162523900573613e-07, - "loss": 0.0064, - "reward": 1.628098964691162, - "reward_std": 0.10794929414987564, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5030990242958069, - "rewards/pad": 0.125, - "step": 1518 - }, - { - "completion_length": 49.78125, - "epoch": 0.4840662842574888, - "grad_norm": 35.0426139831543, - "kl": 0.259765625, - "learning_rate": 5.159337157425111e-07, - "loss": 0.0104, - "reward": 1.478843331336975, - "reward_std": 0.1224755346775055, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4319683313369751, - "rewards/pad": 0.046875, - "step": 1519 - }, - { - "completion_length": 96.265625, - "epoch": 0.4843849585723391, - "grad_norm": 33.974422454833984, - "kl": 0.10400390625, - "learning_rate": 5.156150414276609e-07, - "loss": 0.0042, - "reward": 1.6537295579910278, - "reward_std": 0.15965929627418518, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5443546772003174, - "rewards/pad": 0.125, - "step": 1520 - }, - { - "completion_length": 99.453125, - "epoch": 0.4847036328871893, - "grad_norm": 24.93669891357422, - "kl": 0.2421875, - "learning_rate": 5.152963671128106e-07, - "loss": 0.0097, - "reward": 1.5643985271453857, - "reward_std": 0.13921087980270386, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48627355694770813, - "step": 1521 - }, - { - "completion_length": 97.8125, - "epoch": 0.48502230720203954, - "grad_norm": 36.32746124267578, - "kl": 0.267578125, - "learning_rate": 5.149776927979604e-07, - "loss": 0.0107, - "reward": 1.6686410903930664, - "reward_std": 0.11134155094623566, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4342661499977112, - "rewards/pad": 0.234375, - "step": 1522 - }, - { - "completion_length": 123.609375, - "epoch": 0.48534098151688976, - "grad_norm": 33.87176513671875, - "kl": 0.14453125, - "learning_rate": 5.146590184831102e-07, - "loss": 0.0058, - "reward": 1.415372371673584, - "reward_std": 0.1284211277961731, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3841223120689392, - "step": 1523 - }, - { - "completion_length": 121.09375, - "epoch": 0.48565965583174, - "grad_norm": 46.95094680786133, - "kl": 0.1806640625, - "learning_rate": 5.1434034416826e-07, - "loss": 0.0072, - "reward": 1.332892894744873, - "reward_std": 0.05756673216819763, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.33289292454719543, - "step": 1524 - }, - { - "completion_length": 175.265625, - "epoch": 0.4859783301465902, - "grad_norm": 11.277741432189941, - "kl": 0.1376953125, - "learning_rate": 5.140216698534097e-07, - "loss": 0.0055, - "reward": 1.375731110572815, - "reward_std": 0.0918242484331131, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3913561999797821, - "step": 1525 - }, - { - "completion_length": 72.609375, - "epoch": 0.4862970044614404, - "grad_norm": 29.35453224182129, - "kl": 0.3828125, - "learning_rate": 5.137029955385595e-07, - "loss": 0.0153, - "reward": 1.5956140756607056, - "reward_std": 0.07637912034988403, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3456140160560608, - "step": 1526 - }, - { - "completion_length": 70.859375, - "epoch": 0.48661567877629064, - "grad_norm": 27.634729385375977, - "kl": 0.6484375, - "learning_rate": 5.133843212237094e-07, - "loss": 0.0259, - "reward": 1.6848344802856445, - "reward_std": 0.07616157829761505, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.559834361076355, - "rewards/pad": 0.125, - "step": 1527 - }, - { - "completion_length": 19.9375, - "epoch": 0.48693435309114086, - "grad_norm": 43.502281188964844, - "kl": 0.1943359375, - "learning_rate": 5.130656469088592e-07, - "loss": 0.0078, - "reward": 1.6018435955047607, - "reward_std": 0.09998296201229095, - "rewards/answer_reward": 0.15625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.44559359550476074, - "step": 1528 - }, - { - "completion_length": 44.546875, - "epoch": 0.4872530274059911, - "grad_norm": 28.72425651550293, - "kl": 0.169921875, - "learning_rate": 5.127469725940089e-07, - "loss": 0.0068, - "reward": 1.4866104125976562, - "reward_std": 0.05266030877828598, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.486610472202301, - "rewards/pad": 0.0, - "step": 1529 - }, - { - "completion_length": 47.125, - "epoch": 0.4875717017208413, - "grad_norm": 71.66815185546875, - "kl": 0.2275390625, - "learning_rate": 5.124282982791587e-07, - "loss": 0.0091, - "reward": 1.5927340984344482, - "reward_std": 0.16904056072235107, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.37398403882980347, - "rewards/pad": 0.234375, - "step": 1530 - }, - { - "completion_length": 72.515625, - "epoch": 0.4878903760356915, - "grad_norm": 198.94308471679688, - "kl": 0.1845703125, - "learning_rate": 5.121096239643085e-07, - "loss": 0.0074, - "reward": 1.823049545288086, - "reward_std": 0.13386155664920807, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6355496644973755, - "rewards/pad": 0.1875, - "step": 1531 - }, - { - "completion_length": 72.171875, - "epoch": 0.48820905035054174, - "grad_norm": 25.96636962890625, - "kl": 0.158203125, - "learning_rate": 5.117909496494583e-07, - "loss": 0.0063, - "reward": 1.523979902267456, - "reward_std": 0.059931959956884384, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.523979902267456, - "rewards/pad": 0.0, - "step": 1532 - }, - { - "completion_length": 71.3125, - "epoch": 0.48852772466539196, - "grad_norm": 38.72029495239258, - "kl": 0.18359375, - "learning_rate": 5.11472275334608e-07, - "loss": 0.0073, - "reward": 1.4667209386825562, - "reward_std": 0.09433542937040329, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4510958790779114, - "rewards/pad": 0.015625, - "step": 1533 - }, - { - "completion_length": 72.09375, - "epoch": 0.4888463989802422, - "grad_norm": 86.83837890625, - "kl": 0.1904296875, - "learning_rate": 5.111536010197578e-07, - "loss": 0.0076, - "reward": 1.7294871807098389, - "reward_std": 0.16661599278450012, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.49511218070983887, - "rewards/pad": 0.25, - "step": 1534 - }, - { - "completion_length": 128.984375, - "epoch": 0.4891650732950924, - "grad_norm": 215.36953735351562, - "kl": 0.1220703125, - "learning_rate": 5.108349267049076e-07, - "loss": 0.0049, - "reward": 1.614091396331787, - "reward_std": 0.08909813314676285, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3640914559364319, - "step": 1535 - }, - { - "completion_length": 149.21875, - "epoch": 0.4894837476099426, - "grad_norm": 5.865546703338623, - "kl": 0.11572265625, - "learning_rate": 5.105162523900574e-07, - "loss": 0.0046, - "reward": 1.529909372329712, - "reward_std": 0.040073707699775696, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4049093723297119, - "rewards/pad": 0.125, - "step": 1536 - }, - { - "completion_length": 122.84375, - "epoch": 0.48980242192479284, - "grad_norm": 22.46348762512207, - "kl": 0.1162109375, - "learning_rate": 5.101975780752071e-07, - "loss": 0.0046, - "reward": 1.4626240730285645, - "reward_std": 0.095807746052742, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4782490134239197, - "step": 1537 - }, - { - "completion_length": 77.125, - "epoch": 0.49012109623964306, - "grad_norm": 45.38951110839844, - "kl": 0.158203125, - "learning_rate": 5.098789037603569e-07, - "loss": 0.0063, - "reward": 1.5587177276611328, - "reward_std": 0.08830150961875916, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4337177276611328, - "step": 1538 - }, - { - "completion_length": 98.390625, - "epoch": 0.4904397705544933, - "grad_norm": 29.60246467590332, - "kl": 0.1953125, - "learning_rate": 5.095602294455067e-07, - "loss": 0.0078, - "reward": 1.63431978225708, - "reward_std": 0.06482817977666855, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6343198418617249, - "step": 1539 - }, - { - "completion_length": 46.0, - "epoch": 0.49075844486934356, - "grad_norm": 24.609296798706055, - "kl": 0.1826171875, - "learning_rate": 5.092415551306564e-07, - "loss": 0.0073, - "reward": 1.4860643148422241, - "reward_std": 0.09358102083206177, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47043925523757935, - "rewards/pad": 0.015625, - "step": 1540 - }, - { - "completion_length": 99.0625, - "epoch": 0.4910771191841938, - "grad_norm": 23.453184127807617, - "kl": 0.11181640625, - "learning_rate": 5.089228808158062e-07, - "loss": 0.0045, - "reward": 1.7013800144195557, - "reward_std": 0.09791174530982971, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5920050144195557, - "step": 1541 - }, - { - "completion_length": 123.6875, - "epoch": 0.491395793499044, - "grad_norm": 21.338483810424805, - "kl": 0.1767578125, - "learning_rate": 5.08604206500956e-07, - "loss": 0.0071, - "reward": 1.5143307447433472, - "reward_std": 0.0737447589635849, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38933080434799194, - "step": 1542 - }, - { - "completion_length": 78.4375, - "epoch": 0.4917144678138942, - "grad_norm": 34.1049919128418, - "kl": 0.224609375, - "learning_rate": 5.082855321861058e-07, - "loss": 0.009, - "reward": 1.594736933708191, - "reward_std": 0.08642074465751648, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5947369337081909, - "rewards/pad": 0.0, - "step": 1543 - }, - { - "completion_length": 75.484375, - "epoch": 0.49203314212874444, - "grad_norm": 35.719322204589844, - "kl": 0.26953125, - "learning_rate": 5.079668578712555e-07, - "loss": 0.0108, - "reward": 1.5955408811569214, - "reward_std": 0.16018131375312805, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4861658215522766, - "rewards/pad": 0.109375, - "step": 1544 - }, - { - "completion_length": 44.703125, - "epoch": 0.49235181644359466, - "grad_norm": 30.363176345825195, - "kl": 0.298828125, - "learning_rate": 5.076481835564054e-07, - "loss": 0.012, - "reward": 1.5528295040130615, - "reward_std": 0.08602527529001236, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5528294444084167, - "step": 1545 - }, - { - "completion_length": 71.640625, - "epoch": 0.4926704907584449, - "grad_norm": 34.98065185546875, - "kl": 0.1181640625, - "learning_rate": 5.073295092415552e-07, - "loss": 0.0047, - "reward": 1.6527106761932373, - "reward_std": 0.04381757974624634, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5277107357978821, - "step": 1546 - }, - { - "completion_length": 93.40625, - "epoch": 0.4929891650732951, - "grad_norm": 13.550950050354004, - "kl": 0.1826171875, - "learning_rate": 5.07010834926705e-07, - "loss": 0.0073, - "reward": 1.5585880279541016, - "reward_std": 0.08185498416423798, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5585879683494568, - "step": 1547 - }, - { - "completion_length": 96.34375, - "epoch": 0.4933078393881453, - "grad_norm": 25.857145309448242, - "kl": 0.1142578125, - "learning_rate": 5.066921606118547e-07, - "loss": 0.0046, - "reward": 1.4285244941711426, - "reward_std": 0.1266767680644989, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.44414952397346497, - "rewards/pad": 0.0, - "step": 1548 - }, - { - "completion_length": 72.859375, - "epoch": 0.49362651370299554, - "grad_norm": 42.86756896972656, - "kl": 0.18359375, - "learning_rate": 5.063734862970045e-07, - "loss": 0.0074, - "reward": 1.8299238681793213, - "reward_std": 0.16717174649238586, - "rewards/answer_reward": 0.390625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43929874897003174, - "step": 1549 - }, - { - "completion_length": 149.015625, - "epoch": 0.49394518801784576, - "grad_norm": 29.9815731048584, - "kl": 0.1357421875, - "learning_rate": 5.060548119821542e-07, - "loss": 0.0054, - "reward": 1.3113267421722412, - "reward_std": 0.0599776916205883, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3113267123699188, - "step": 1550 - }, - { - "completion_length": 70.734375, - "epoch": 0.494263862332696, - "grad_norm": 38.25667953491211, - "kl": 0.173828125, - "learning_rate": 5.05736137667304e-07, - "loss": 0.0069, - "reward": 1.47730553150177, - "reward_std": 0.0800052136182785, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.47730547189712524, - "step": 1551 - }, - { - "completion_length": 123.984375, - "epoch": 0.4945825366475462, - "grad_norm": 52.59759521484375, - "kl": 0.1318359375, - "learning_rate": 5.054174633524537e-07, - "loss": 0.0052, - "reward": 1.4347107410430908, - "reward_std": 0.12221598625183105, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.45033565163612366, - "step": 1552 - }, - { - "completion_length": 97.390625, - "epoch": 0.4949012109623964, - "grad_norm": 55.0565185546875, - "kl": 0.1298828125, - "learning_rate": 5.050987890376035e-07, - "loss": 0.0052, - "reward": 1.5172079801559448, - "reward_std": 0.09507996588945389, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5015830397605896, - "step": 1553 - }, - { - "completion_length": 73.265625, - "epoch": 0.49521988527724664, - "grad_norm": 55.56892776489258, - "kl": 0.1923828125, - "learning_rate": 5.047801147227533e-07, - "loss": 0.0077, - "reward": 1.5761849880218506, - "reward_std": 0.09632700681686401, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45118507742881775, - "step": 1554 - }, - { - "completion_length": 93.921875, - "epoch": 0.49553855959209686, - "grad_norm": 18.484773635864258, - "kl": 0.1494140625, - "learning_rate": 5.044614404079031e-07, - "loss": 0.006, - "reward": 1.4626483917236328, - "reward_std": 0.13790366053581238, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4782733619213104, - "rewards/pad": 0.0, - "step": 1555 - }, - { - "completion_length": 69.1875, - "epoch": 0.4958572339069471, - "grad_norm": 22.0714111328125, - "kl": 0.236328125, - "learning_rate": 5.041427660930528e-07, - "loss": 0.0094, - "reward": 1.6114702224731445, - "reward_std": 0.058457039296627045, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6114701628684998, - "rewards/pad": 0.0, - "step": 1556 - }, - { - "completion_length": 146.265625, - "epoch": 0.4961759082217973, - "grad_norm": 16.967422485351562, - "kl": 0.08154296875, - "learning_rate": 5.038240917782026e-07, - "loss": 0.0033, - "reward": 1.2791426181793213, - "reward_std": 0.02925313636660576, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2791425287723541, - "rewards/pad": 0.0, - "step": 1557 - }, - { - "completion_length": 96.34375, - "epoch": 0.4964945825366475, - "grad_norm": 135.3175506591797, - "kl": 0.1572265625, - "learning_rate": 5.035054174633524e-07, - "loss": 0.0063, - "reward": 1.4358652830123901, - "reward_std": 0.03743431344628334, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43586528301239014, - "rewards/pad": 0.0, - "step": 1558 - }, - { - "completion_length": 97.671875, - "epoch": 0.49681325685149774, - "grad_norm": 105.35767364501953, - "kl": 0.138671875, - "learning_rate": 5.031867431485022e-07, - "loss": 0.0055, - "reward": 1.5372705459594727, - "reward_std": 0.04880628362298012, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4122704565525055, - "step": 1559 - }, - { - "completion_length": 93.53125, - "epoch": 0.497131931166348, - "grad_norm": 21.049678802490234, - "kl": 0.15234375, - "learning_rate": 5.028680688336519e-07, - "loss": 0.0061, - "reward": 1.59987211227417, - "reward_std": 0.03113095834851265, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5998720526695251, - "rewards/pad": 0.0, - "step": 1560 - }, - { - "completion_length": 46.875, - "epoch": 0.49745060548119824, - "grad_norm": 48.804405212402344, - "kl": 0.25390625, - "learning_rate": 5.025493945188017e-07, - "loss": 0.0102, - "reward": 1.4484260082244873, - "reward_std": 0.10934975743293762, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4328010678291321, - "rewards/pad": 0.015625, - "step": 1561 - }, - { - "completion_length": 48.59375, - "epoch": 0.49776927979604846, - "grad_norm": 56.09344482421875, - "kl": 0.18359375, - "learning_rate": 5.022307202039515e-07, - "loss": 0.0073, - "reward": 1.8689539432525635, - "reward_std": 0.12545578181743622, - "rewards/answer_reward": 0.421875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4470789134502411, - "step": 1562 - }, - { - "completion_length": 70.5, - "epoch": 0.4980879541108987, - "grad_norm": 47.180362701416016, - "kl": 0.1904296875, - "learning_rate": 5.019120458891013e-07, - "loss": 0.0076, - "reward": 1.495100736618042, - "reward_std": 0.0501336008310318, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4951007068157196, - "step": 1563 - }, - { - "completion_length": 170.34375, - "epoch": 0.4984066284257489, - "grad_norm": 19.715389251708984, - "kl": 0.06494140625, - "learning_rate": 5.01593371574251e-07, - "loss": 0.0026, - "reward": 1.3467738628387451, - "reward_std": 0.028132686391472816, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3467738628387451, - "step": 1564 - }, - { - "completion_length": 72.390625, - "epoch": 0.4987253027405991, - "grad_norm": 39.21231460571289, - "kl": 0.1298828125, - "learning_rate": 5.012746972594009e-07, - "loss": 0.0052, - "reward": 1.3724713325500488, - "reward_std": 0.07831380516290665, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3568463623523712, - "rewards/pad": 0.015625, - "step": 1565 - }, - { - "completion_length": 97.78125, - "epoch": 0.49904397705544934, - "grad_norm": 70.9393310546875, - "kl": 0.181640625, - "learning_rate": 5.009560229445507e-07, - "loss": 0.0073, - "reward": 1.560790777206421, - "reward_std": 0.04791503772139549, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4357907772064209, - "rewards/pad": 0.125, - "step": 1566 - }, - { - "completion_length": 73.90625, - "epoch": 0.49936265137029956, - "grad_norm": 56.073753356933594, - "kl": 0.13671875, - "learning_rate": 5.006373486297005e-07, - "loss": 0.0055, - "reward": 1.8846735954284668, - "reward_std": 0.08357321470975876, - "rewards/answer_reward": 0.359375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5252986550331116, - "step": 1567 - }, - { - "completion_length": 96.46875, - "epoch": 0.4996813256851498, - "grad_norm": 22.90010643005371, - "kl": 0.130859375, - "learning_rate": 5.003186743148502e-07, - "loss": 0.0052, - "reward": 1.486340045928955, - "reward_std": 0.05758453905582428, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4863400161266327, - "step": 1568 - }, - { - "completion_length": 96.0625, - "epoch": 0.5, - "grad_norm": 109.51544952392578, - "kl": 0.1513671875, - "learning_rate": 5e-07, - "loss": 0.0061, - "reward": 1.646225094795227, - "reward_std": 0.10367502272129059, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5212252140045166, - "step": 1569 - }, - { - "completion_length": 95.65625, - "epoch": 0.5003186743148502, - "grad_norm": 18.478254318237305, - "kl": 0.1455078125, - "learning_rate": 4.996813256851498e-07, - "loss": 0.0058, - "reward": 1.5283114910125732, - "reward_std": 0.0567050576210022, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.528311550617218, - "rewards/pad": 0.0, - "step": 1570 - }, - { - "completion_length": 72.8125, - "epoch": 0.5006373486297004, - "grad_norm": 40.840431213378906, - "kl": 0.263671875, - "learning_rate": 4.993626513702995e-07, - "loss": 0.0105, - "reward": 1.5773298740386963, - "reward_std": 0.1823175549507141, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4835800230503082, - "rewards/pad": 0.09375, - "step": 1571 - }, - { - "completion_length": 120.953125, - "epoch": 0.5009560229445507, - "grad_norm": 69.60863494873047, - "kl": 0.0986328125, - "learning_rate": 4.990439770554493e-07, - "loss": 0.0039, - "reward": 1.693233609199524, - "reward_std": 0.07272499799728394, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5682336091995239, - "rewards/pad": 0.125, - "step": 1572 - }, - { - "completion_length": 96.765625, - "epoch": 0.5012746972594009, - "grad_norm": 42.9666748046875, - "kl": 0.158203125, - "learning_rate": 4.987253027405991e-07, - "loss": 0.0063, - "reward": 1.7116810083389282, - "reward_std": 0.09147972613573074, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.586681067943573, - "step": 1573 - }, - { - "completion_length": 70.09375, - "epoch": 0.5015933715742511, - "grad_norm": 39.15825271606445, - "kl": 0.23046875, - "learning_rate": 4.984066284257489e-07, - "loss": 0.0092, - "reward": 1.6254063844680786, - "reward_std": 0.10068808495998383, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5004063844680786, - "rewards/pad": 0.125, - "step": 1574 - }, - { - "completion_length": 95.953125, - "epoch": 0.5019120458891013, - "grad_norm": 2402.987060546875, - "kl": 5.6875, - "learning_rate": 4.980879541108986e-07, - "loss": 0.227, - "reward": 1.4909906387329102, - "reward_std": 0.1678842306137085, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5066156387329102, - "step": 1575 - }, - { - "completion_length": 94.6875, - "epoch": 0.5022307202039515, - "grad_norm": 21.808799743652344, - "kl": 0.11962890625, - "learning_rate": 4.977692797960484e-07, - "loss": 0.0048, - "reward": 1.6425373554229736, - "reward_std": 0.14815226197242737, - "rewards/pad": 0.171875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4706624746322632, - "step": 1576 - }, - { - "completion_length": 70.484375, - "epoch": 0.5025493945188018, - "grad_norm": 24.197406768798828, - "kl": 0.1455078125, - "learning_rate": 4.974506054811982e-07, - "loss": 0.0058, - "reward": 1.5822218656539917, - "reward_std": 0.059086866676807404, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5822218656539917, - "step": 1577 - }, - { - "completion_length": 124.546875, - "epoch": 0.502868068833652, - "grad_norm": 59.101585388183594, - "kl": 0.193359375, - "learning_rate": 4.97131931166348e-07, - "loss": 0.0077, - "reward": 1.5973970890045166, - "reward_std": 0.08404646068811417, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5973970890045166, - "step": 1578 - }, - { - "completion_length": 96.203125, - "epoch": 0.5031867431485022, - "grad_norm": 16.71595573425293, - "kl": 0.126953125, - "learning_rate": 4.968132568514977e-07, - "loss": 0.0051, - "reward": 1.449321985244751, - "reward_std": 0.05522039160132408, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44932207465171814, - "step": 1579 - }, - { - "completion_length": 98.015625, - "epoch": 0.5035054174633524, - "grad_norm": 24.666296005249023, - "kl": 0.08984375, - "learning_rate": 4.964945825366475e-07, - "loss": 0.0036, - "reward": 1.659955620765686, - "reward_std": 0.07834160327911377, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4099557101726532, - "step": 1580 - }, - { - "completion_length": 145.28125, - "epoch": 0.5038240917782026, - "grad_norm": 16.429119110107422, - "kl": 0.12060546875, - "learning_rate": 4.961759082217972e-07, - "loss": 0.0048, - "reward": 1.436887264251709, - "reward_std": 0.0728699266910553, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4368871748447418, - "step": 1581 - }, - { - "completion_length": 72.90625, - "epoch": 0.5041427660930529, - "grad_norm": 35.17189407348633, - "kl": 0.15625, - "learning_rate": 4.95857233906947e-07, - "loss": 0.0062, - "reward": 1.5288763046264648, - "reward_std": 0.1308014839887619, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5445013046264648, - "step": 1582 - }, - { - "completion_length": 71.140625, - "epoch": 0.5044614404079031, - "grad_norm": 54.288818359375, - "kl": 0.1494140625, - "learning_rate": 4.955385595920969e-07, - "loss": 0.006, - "reward": 1.511135458946228, - "reward_std": 0.06047463044524193, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5111355185508728, - "step": 1583 - }, - { - "completion_length": 149.578125, - "epoch": 0.5047801147227533, - "grad_norm": 10.70415210723877, - "kl": 0.09033203125, - "learning_rate": 4.952198852772467e-07, - "loss": 0.0036, - "reward": 1.6119954586029053, - "reward_std": 0.04025973752140999, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3619953691959381, - "step": 1584 - }, - { - "completion_length": 73.015625, - "epoch": 0.5050987890376035, - "grad_norm": 39.62226104736328, - "kl": 0.1279296875, - "learning_rate": 4.949012109623964e-07, - "loss": 0.0051, - "reward": 1.833071231842041, - "reward_std": 0.11723049730062485, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5361962914466858, - "rewards/pad": 0.296875, - "step": 1585 - }, - { - "completion_length": 96.328125, - "epoch": 0.5054174633524537, - "grad_norm": 9.883891105651855, - "kl": 0.2353515625, - "learning_rate": 4.945825366475462e-07, - "loss": 0.0094, - "reward": 1.5089480876922607, - "reward_std": 0.0629185289144516, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5089481472969055, - "rewards/pad": 0.0, - "step": 1586 - }, - { - "completion_length": 68.953125, - "epoch": 0.505736137667304, - "grad_norm": 41.82442855834961, - "kl": 0.162109375, - "learning_rate": 4.94263862332696e-07, - "loss": 0.0065, - "reward": 1.6769931316375732, - "reward_std": 0.08155511319637299, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5519931316375732, - "step": 1587 - }, - { - "completion_length": 147.109375, - "epoch": 0.5060548119821542, - "grad_norm": 64.26282501220703, - "kl": 0.09619140625, - "learning_rate": 4.939451880178458e-07, - "loss": 0.0038, - "reward": 1.5831506252288818, - "reward_std": 0.11183653771877289, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47377562522888184, - "step": 1588 - }, - { - "completion_length": 147.46875, - "epoch": 0.5063734862970045, - "grad_norm": 9.992218971252441, - "kl": 0.1142578125, - "learning_rate": 4.936265137029955e-07, - "loss": 0.0046, - "reward": 1.7075880765914917, - "reward_std": 0.10298748314380646, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5825880169868469, - "step": 1589 - }, - { - "completion_length": 126.546875, - "epoch": 0.5066921606118547, - "grad_norm": 98.55374145507812, - "kl": 0.2177734375, - "learning_rate": 4.933078393881453e-07, - "loss": 0.0087, - "reward": 1.611853837966919, - "reward_std": 0.03698485344648361, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4868537187576294, - "step": 1590 - }, - { - "completion_length": 43.859375, - "epoch": 0.507010834926705, - "grad_norm": 29.617713928222656, - "kl": 0.1337890625, - "learning_rate": 4.929891650732951e-07, - "loss": 0.0053, - "reward": 1.757079005241394, - "reward_std": 0.06662129610776901, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6320791244506836, - "step": 1591 - }, - { - "completion_length": 69.359375, - "epoch": 0.5073295092415552, - "grad_norm": 51.22451400756836, - "kl": 0.2041015625, - "learning_rate": 4.926704907584449e-07, - "loss": 0.0082, - "reward": 1.728276252746582, - "reward_std": 0.06753535568714142, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6032761931419373, - "step": 1592 - }, - { - "completion_length": 121.484375, - "epoch": 0.5076481835564054, - "grad_norm": 18.68769073486328, - "kl": 0.103515625, - "learning_rate": 4.923518164435946e-07, - "loss": 0.0041, - "reward": 1.4825172424316406, - "reward_std": 0.07809288054704666, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.35751718282699585, - "step": 1593 - }, - { - "completion_length": 123.265625, - "epoch": 0.5079668578712556, - "grad_norm": 35.22314453125, - "kl": 0.10400390625, - "learning_rate": 4.920331421287444e-07, - "loss": 0.0042, - "reward": 1.5061140060424805, - "reward_std": 0.04520394280552864, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.38111400604248047, - "rewards/pad": 0.125, - "step": 1594 - }, - { - "completion_length": 99.296875, - "epoch": 0.5082855321861058, - "grad_norm": 18.381580352783203, - "kl": 0.10546875, - "learning_rate": 4.917144678138942e-07, - "loss": 0.0042, - "reward": 1.8121449947357178, - "reward_std": 0.045704472810029984, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43714502453804016, - "rewards/pad": 0.375, - "step": 1595 - }, - { - "completion_length": 94.046875, - "epoch": 0.5086042065009561, - "grad_norm": 26.083742141723633, - "kl": 0.1494140625, - "learning_rate": 4.91395793499044e-07, - "loss": 0.006, - "reward": 1.4526923894882202, - "reward_std": 0.06620436906814575, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4526923596858978, - "rewards/pad": 0.0, - "step": 1596 - }, - { - "completion_length": 148.984375, - "epoch": 0.5089228808158063, - "grad_norm": 55.55546951293945, - "kl": 0.1162109375, - "learning_rate": 4.910771191841937e-07, - "loss": 0.0047, - "reward": 1.3445849418640137, - "reward_std": 0.036867473274469376, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3445849120616913, - "step": 1597 - }, - { - "completion_length": 144.03125, - "epoch": 0.5092415551306565, - "grad_norm": 17.205900192260742, - "kl": 0.123046875, - "learning_rate": 4.907584448693435e-07, - "loss": 0.0049, - "reward": 1.506199836730957, - "reward_std": 0.05332973971962929, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5061998963356018, - "rewards/pad": 0.0, - "step": 1598 - }, - { - "completion_length": 95.921875, - "epoch": 0.5095602294455067, - "grad_norm": 21.30576515197754, - "kl": 0.20703125, - "learning_rate": 4.904397705544932e-07, - "loss": 0.0083, - "reward": 1.6577305793762207, - "reward_std": 0.11153335869312286, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6264805793762207, - "step": 1599 - }, - { - "completion_length": 94.34375, - "epoch": 0.5098789037603569, - "grad_norm": 47.93109130859375, - "kl": 0.162109375, - "learning_rate": 4.90121096239643e-07, - "loss": 0.0065, - "reward": 1.5295488834381104, - "reward_std": 0.15334632992744446, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49829891324043274, - "step": 1600 - }, - { - "completion_length": 71.984375, - "epoch": 0.5101975780752072, - "grad_norm": 46.93277359008789, - "kl": 0.244140625, - "learning_rate": 4.898024219247928e-07, - "loss": 0.0098, - "reward": 1.6092555522918701, - "reward_std": 0.07779088616371155, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48425549268722534, - "rewards/pad": 0.125, - "step": 1601 - }, - { - "completion_length": 43.625, - "epoch": 0.5105162523900574, - "grad_norm": 47.682518005371094, - "kl": 0.2001953125, - "learning_rate": 4.894837476099425e-07, - "loss": 0.008, - "reward": 1.5027817487716675, - "reward_std": 0.06517623364925385, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5027817487716675, - "step": 1602 - }, - { - "completion_length": 123.015625, - "epoch": 0.5108349267049076, - "grad_norm": 58.71294403076172, - "kl": 0.0859375, - "learning_rate": 4.891650732950924e-07, - "loss": 0.0034, - "reward": 1.413274884223938, - "reward_std": 0.12114940583705902, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.319524884223938, - "rewards/pad": 0.09375, - "step": 1603 - }, - { - "completion_length": 120.390625, - "epoch": 0.5111536010197578, - "grad_norm": 10.46804428100586, - "kl": 0.142578125, - "learning_rate": 4.888463989802422e-07, - "loss": 0.0057, - "reward": 1.3477072715759277, - "reward_std": 0.06476961821317673, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3477073311805725, - "step": 1604 - }, - { - "completion_length": 96.28125, - "epoch": 0.511472275334608, - "grad_norm": 27.42978286743164, - "kl": 0.130859375, - "learning_rate": 4.88527724665392e-07, - "loss": 0.0052, - "reward": 1.3898663520812988, - "reward_std": 0.07068990170955658, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3898663818836212, - "rewards/pad": 0.0, - "step": 1605 - }, - { - "completion_length": 96.046875, - "epoch": 0.5117909496494583, - "grad_norm": 13.107078552246094, - "kl": 0.1533203125, - "learning_rate": 4.882090503505417e-07, - "loss": 0.0061, - "reward": 1.6023199558258057, - "reward_std": 0.052031271159648895, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4773198366165161, - "step": 1606 - }, - { - "completion_length": 123.078125, - "epoch": 0.5121096239643085, - "grad_norm": 27.67726707458496, - "kl": 0.1650390625, - "learning_rate": 4.878903760356915e-07, - "loss": 0.0066, - "reward": 1.6470005512237549, - "reward_std": 0.09419877827167511, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5063754320144653, - "step": 1607 - }, - { - "completion_length": 147.03125, - "epoch": 0.5124282982791587, - "grad_norm": 21.777843475341797, - "kl": 0.10107421875, - "learning_rate": 4.875717017208413e-07, - "loss": 0.004, - "reward": 1.5854337215423584, - "reward_std": 0.042907871305942535, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.585433840751648, - "step": 1608 - }, - { - "completion_length": 72.125, - "epoch": 0.5127469725940089, - "grad_norm": 37.6806526184082, - "kl": 0.173828125, - "learning_rate": 4.872530274059911e-07, - "loss": 0.0069, - "reward": 1.4564390182495117, - "reward_std": 0.10523393005132675, - "rewards/answer_reward": 0.015625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4408140778541565, - "step": 1609 - }, - { - "completion_length": 95.5, - "epoch": 0.5130656469088591, - "grad_norm": 36.77358627319336, - "kl": 0.1611328125, - "learning_rate": 4.869343530911408e-07, - "loss": 0.0065, - "reward": 1.5982133150100708, - "reward_std": 0.09956274181604385, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5982133150100708, - "rewards/pad": 0.0, - "step": 1610 - }, - { - "completion_length": 45.125, - "epoch": 0.5133843212237094, - "grad_norm": 34.843727111816406, - "kl": 0.1767578125, - "learning_rate": 4.866156787762906e-07, - "loss": 0.0071, - "reward": 1.751644253730774, - "reward_std": 0.09237408638000488, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6266443133354187, - "rewards/pad": 0.125, - "step": 1611 - }, - { - "completion_length": 47.890625, - "epoch": 0.5137029955385596, - "grad_norm": 27.2742977142334, - "kl": 0.1474609375, - "learning_rate": 4.862970044614404e-07, - "loss": 0.0059, - "reward": 1.4951512813568115, - "reward_std": 0.0747571736574173, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.49515125155448914, - "step": 1612 - }, - { - "completion_length": 79.53125, - "epoch": 0.5140216698534098, - "grad_norm": 72.64749908447266, - "kl": 0.123046875, - "learning_rate": 4.859783301465902e-07, - "loss": 0.0049, - "reward": 1.7619708776474, - "reward_std": 0.14130009710788727, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4025958776473999, - "rewards/pad": 0.375, - "step": 1613 - }, - { - "completion_length": 145.65625, - "epoch": 0.51434034416826, - "grad_norm": 78.582763671875, - "kl": 0.10302734375, - "learning_rate": 4.856596558317399e-07, - "loss": 0.0041, - "reward": 1.4470350742340088, - "reward_std": 0.04571251943707466, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4470349848270416, - "rewards/pad": 0.0, - "step": 1614 - }, - { - "completion_length": 121.46875, - "epoch": 0.5146590184831102, - "grad_norm": 26.473241806030273, - "kl": 0.138671875, - "learning_rate": 4.853409815168897e-07, - "loss": 0.0055, - "reward": 1.5637502670288086, - "reward_std": 0.04434807598590851, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43875014781951904, - "rewards/pad": 0.125, - "step": 1615 - }, - { - "completion_length": 175.34375, - "epoch": 0.5149776927979605, - "grad_norm": 26.083354949951172, - "kl": 0.06884765625, - "learning_rate": 4.850223072020395e-07, - "loss": 0.0028, - "reward": 1.5136511325836182, - "reward_std": 0.04185505956411362, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5136511325836182, - "step": 1616 - }, - { - "completion_length": 44.953125, - "epoch": 0.5152963671128107, - "grad_norm": 35.294132232666016, - "kl": 0.26171875, - "learning_rate": 4.847036328871893e-07, - "loss": 0.0104, - "reward": 1.489182710647583, - "reward_std": 0.12016036361455917, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48918282985687256, - "rewards/pad": 0.0, - "step": 1617 - }, - { - "completion_length": 173.328125, - "epoch": 0.5156150414276609, - "grad_norm": 12.702577590942383, - "kl": 0.06884765625, - "learning_rate": 4.84384958572339e-07, - "loss": 0.0027, - "reward": 1.5943570137023926, - "reward_std": 0.05077844485640526, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4693570137023926, - "step": 1618 - }, - { - "completion_length": 148.140625, - "epoch": 0.5159337157425111, - "grad_norm": 28.387948989868164, - "kl": 0.11474609375, - "learning_rate": 4.840662842574888e-07, - "loss": 0.0046, - "reward": 1.4957976341247559, - "reward_std": 0.06421782076358795, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4957975745201111, - "rewards/pad": 0.0, - "step": 1619 - }, - { - "completion_length": 149.6875, - "epoch": 0.5162523900573613, - "grad_norm": 63.94855880737305, - "kl": 0.11181640625, - "learning_rate": 4.837476099426385e-07, - "loss": 0.0045, - "reward": 1.4858911037445068, - "reward_std": 0.05516339838504791, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48589110374450684, - "step": 1620 - }, - { - "completion_length": 119.703125, - "epoch": 0.5165710643722116, - "grad_norm": 37.80052185058594, - "kl": 0.0751953125, - "learning_rate": 4.834289356277884e-07, - "loss": 0.003, - "reward": 1.5186381340026855, - "reward_std": 0.04086693376302719, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5186381340026855, - "step": 1621 - }, - { - "completion_length": 121.25, - "epoch": 0.5168897386870618, - "grad_norm": 9.314208030700684, - "kl": 0.1259765625, - "learning_rate": 4.831102613129382e-07, - "loss": 0.005, - "reward": 1.4766916036605835, - "reward_std": 0.03902985155582428, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4766915440559387, - "step": 1622 - }, - { - "completion_length": 98.828125, - "epoch": 0.517208413001912, - "grad_norm": 48.123897552490234, - "kl": 0.1162109375, - "learning_rate": 4.82791586998088e-07, - "loss": 0.0047, - "reward": 1.5715768337249756, - "reward_std": 0.06291652470827103, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44657695293426514, - "rewards/pad": 0.125, - "step": 1623 - }, - { - "completion_length": 20.765625, - "epoch": 0.5175270873167622, - "grad_norm": 43.335289001464844, - "kl": 0.142578125, - "learning_rate": 4.824729126832377e-07, - "loss": 0.0057, - "reward": 1.7628185749053955, - "reward_std": 0.20459821820259094, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 0.96875, - "rewards/iou_glue_reward": 0.5753185749053955, - "step": 1624 - }, - { - "completion_length": 19.21875, - "epoch": 0.5178457616316124, - "grad_norm": 38.60752868652344, - "kl": 0.1787109375, - "learning_rate": 4.821542383683875e-07, - "loss": 0.0071, - "reward": 1.5983529090881348, - "reward_std": 0.05511268973350525, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5983529090881348, - "rewards/pad": 0.0, - "step": 1625 - }, - { - "completion_length": 121.5, - "epoch": 0.5181644359464627, - "grad_norm": 13.00921630859375, - "kl": 0.1025390625, - "learning_rate": 4.818355640535373e-07, - "loss": 0.0041, - "reward": 1.6340184211730957, - "reward_std": 0.10035441815853119, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5246434807777405, - "step": 1626 - }, - { - "completion_length": 123.125, - "epoch": 0.5184831102613129, - "grad_norm": 20.170867919921875, - "kl": 0.1259765625, - "learning_rate": 4.815168897386871e-07, - "loss": 0.005, - "reward": 1.5755319595336914, - "reward_std": 0.11915256828069687, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48178181052207947, - "step": 1627 - }, - { - "completion_length": 97.046875, - "epoch": 0.5188017845761632, - "grad_norm": 26.876707077026367, - "kl": 0.140625, - "learning_rate": 4.811982154238368e-07, - "loss": 0.0056, - "reward": 1.606523036956787, - "reward_std": 0.12204252183437347, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4971481263637543, - "rewards/pad": 0.125, - "step": 1628 - }, - { - "completion_length": 71.625, - "epoch": 0.5191204588910134, - "grad_norm": 27.261844635009766, - "kl": 0.203125, - "learning_rate": 4.808795411089866e-07, - "loss": 0.0081, - "reward": 1.6306061744689941, - "reward_std": 0.07760140299797058, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5056062340736389, - "rewards/pad": 0.125, - "step": 1629 - }, - { - "completion_length": 153.984375, - "epoch": 0.5194391332058637, - "grad_norm": 57.45818328857422, - "kl": 0.115234375, - "learning_rate": 4.805608667941364e-07, - "loss": 0.0046, - "reward": 1.380428433418274, - "reward_std": 0.1852729320526123, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.41167837381362915, - "rewards/pad": 0.0, - "step": 1630 - }, - { - "completion_length": 98.328125, - "epoch": 0.5197578075207139, - "grad_norm": 18.166400909423828, - "kl": 0.1455078125, - "learning_rate": 4.802421924792862e-07, - "loss": 0.0058, - "reward": 1.5990135669708252, - "reward_std": 0.10991062223911285, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4896385669708252, - "step": 1631 - }, - { - "completion_length": 75.15625, - "epoch": 0.5200764818355641, - "grad_norm": 51.96231460571289, - "kl": 0.171875, - "learning_rate": 4.799235181644359e-07, - "loss": 0.0069, - "reward": 1.7394810914993286, - "reward_std": 0.06469561904668808, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6144810914993286, - "rewards/pad": 0.125, - "step": 1632 - }, - { - "completion_length": 120.40625, - "epoch": 0.5203951561504143, - "grad_norm": 32.042728424072266, - "kl": 0.11962890625, - "learning_rate": 4.796048438495857e-07, - "loss": 0.0048, - "reward": 1.4044157266616821, - "reward_std": 0.04880164563655853, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4044157564640045, - "step": 1633 - }, - { - "completion_length": 73.484375, - "epoch": 0.5207138304652645, - "grad_norm": 23.388429641723633, - "kl": 0.169921875, - "learning_rate": 4.792861695347355e-07, - "loss": 0.0068, - "reward": 1.5240478515625, - "reward_std": 0.09146015346050262, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5240478515625, - "step": 1634 - }, - { - "completion_length": 72.84375, - "epoch": 0.5210325047801148, - "grad_norm": 160.82843017578125, - "kl": 0.296875, - "learning_rate": 4.789674952198852e-07, - "loss": 0.0118, - "reward": 1.577231764793396, - "reward_std": 0.15222786366939545, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48348167538642883, - "rewards/pad": 0.09375, - "step": 1635 - }, - { - "completion_length": 150.15625, - "epoch": 0.521351179094965, - "grad_norm": 13.156582832336426, - "kl": 0.07177734375, - "learning_rate": 4.78648820905035e-07, - "loss": 0.0029, - "reward": 1.542654037475586, - "reward_std": 0.03593272343277931, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4176541268825531, - "step": 1636 - }, - { - "completion_length": 200.171875, - "epoch": 0.5216698534098152, - "grad_norm": 14.329692840576172, - "kl": 0.0751953125, - "learning_rate": 4.783301465901848e-07, - "loss": 0.003, - "reward": 1.4973219633102417, - "reward_std": 0.04188467934727669, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4973219335079193, - "step": 1637 - }, - { - "completion_length": 74.78125, - "epoch": 0.5219885277246654, - "grad_norm": 33.7816047668457, - "kl": 0.2255859375, - "learning_rate": 4.780114722753345e-07, - "loss": 0.009, - "reward": 1.7144832611083984, - "reward_std": 0.06846581399440765, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5894832015037537, - "step": 1638 - }, - { - "completion_length": 120.890625, - "epoch": 0.5223072020395156, - "grad_norm": 25.760684967041016, - "kl": 0.1962890625, - "learning_rate": 4.776927979604843e-07, - "loss": 0.0078, - "reward": 1.4427735805511475, - "reward_std": 0.05109262466430664, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4427736699581146, - "rewards/pad": 0.0, - "step": 1639 - }, - { - "completion_length": 97.875, - "epoch": 0.5226258763543659, - "grad_norm": 30.939523696899414, - "kl": 0.24609375, - "learning_rate": 4.773741236456342e-07, - "loss": 0.0099, - "reward": 1.850117802619934, - "reward_std": 0.07939118146896362, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6001178622245789, - "rewards/pad": 0.25, - "step": 1640 - }, - { - "completion_length": 151.46875, - "epoch": 0.5229445506692161, - "grad_norm": 350.8528137207031, - "kl": 0.2119140625, - "learning_rate": 4.770554493307839e-07, - "loss": 0.0085, - "reward": 1.578183650970459, - "reward_std": 0.10141550749540329, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46880874037742615, - "rewards/pad": 0.109375, - "step": 1641 - }, - { - "completion_length": 95.984375, - "epoch": 0.5232632249840663, - "grad_norm": 40.61670684814453, - "kl": 0.130859375, - "learning_rate": 4.7673677501593366e-07, - "loss": 0.0052, - "reward": 1.642643690109253, - "reward_std": 0.07146844267845154, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5176436305046082, - "rewards/pad": 0.125, - "step": 1642 - }, - { - "completion_length": 97.140625, - "epoch": 0.5235818992989165, - "grad_norm": 56.291839599609375, - "kl": 0.11767578125, - "learning_rate": 4.7641810070108347e-07, - "loss": 0.0047, - "reward": 1.4887385368347168, - "reward_std": 0.048854757100343704, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4887385070323944, - "rewards/pad": 0.0, - "step": 1643 - }, - { - "completion_length": 181.625, - "epoch": 0.5239005736137667, - "grad_norm": 6.92765998840332, - "kl": 0.11962890625, - "learning_rate": 4.760994263862332e-07, - "loss": 0.0048, - "reward": 1.4948139190673828, - "reward_std": 0.09755484759807587, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3854389190673828, - "step": 1644 - }, - { - "completion_length": 69.96875, - "epoch": 0.524219247928617, - "grad_norm": 98.2040023803711, - "kl": 0.203125, - "learning_rate": 4.7578075207138303e-07, - "loss": 0.0081, - "reward": 1.4809551239013672, - "reward_std": 0.09209290891885757, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4809551537036896, - "rewards/pad": 0.0, - "step": 1645 - }, - { - "completion_length": 74.09375, - "epoch": 0.5245379222434672, - "grad_norm": 27.13360595703125, - "kl": 0.2333984375, - "learning_rate": 4.754620777565328e-07, - "loss": 0.0093, - "reward": 1.660394549369812, - "reward_std": 0.10714735835790634, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.535394549369812, - "step": 1646 - }, - { - "completion_length": 123.921875, - "epoch": 0.5248565965583174, - "grad_norm": 51.77692413330078, - "kl": 0.14453125, - "learning_rate": 4.751434034416826e-07, - "loss": 0.0058, - "reward": 1.386512279510498, - "reward_std": 0.07405687123537064, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3865121901035309, - "step": 1647 - }, - { - "completion_length": 122.96875, - "epoch": 0.5251752708731676, - "grad_norm": 57.70017623901367, - "kl": 0.10205078125, - "learning_rate": 4.7482472912683235e-07, - "loss": 0.0041, - "reward": 1.5424884557724, - "reward_std": 0.0725923627614975, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4174885153770447, - "rewards/pad": 0.125, - "step": 1648 - }, - { - "completion_length": 46.3125, - "epoch": 0.5254939451880178, - "grad_norm": 32.44889831542969, - "kl": 0.150390625, - "learning_rate": 4.7450605481198215e-07, - "loss": 0.006, - "reward": 1.6891388893127441, - "reward_std": 0.08782389760017395, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4391387701034546, - "step": 1649 - }, - { - "completion_length": 97.53125, - "epoch": 0.5258126195028681, - "grad_norm": 34.80462646484375, - "kl": 0.111328125, - "learning_rate": 4.741873804971319e-07, - "loss": 0.0044, - "reward": 1.542208194732666, - "reward_std": 0.06380252540111542, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41720810532569885, - "step": 1650 - }, - { - "completion_length": 123.875, - "epoch": 0.5261312938177183, - "grad_norm": 41.51993942260742, - "kl": 0.1162109375, - "learning_rate": 4.738687061822817e-07, - "loss": 0.0047, - "reward": 1.470268964767456, - "reward_std": 0.050267286598682404, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4702690541744232, - "step": 1651 - }, - { - "completion_length": 123.453125, - "epoch": 0.5264499681325685, - "grad_norm": 55.49713897705078, - "kl": 0.12109375, - "learning_rate": 4.7355003186743147e-07, - "loss": 0.0048, - "reward": 1.4749317169189453, - "reward_std": 0.07620559632778168, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47493165731430054, - "step": 1652 - }, - { - "completion_length": 72.265625, - "epoch": 0.5267686424474187, - "grad_norm": 83.41410827636719, - "kl": 0.1640625, - "learning_rate": 4.732313575525813e-07, - "loss": 0.0066, - "reward": 1.6315970420837402, - "reward_std": 0.1632934808731079, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6003470420837402, - "rewards/pad": 0.03125, - "step": 1653 - }, - { - "completion_length": 100.109375, - "epoch": 0.5270873167622689, - "grad_norm": 29.058053970336914, - "kl": 0.15625, - "learning_rate": 4.7291268323773103e-07, - "loss": 0.0062, - "reward": 1.5368297100067139, - "reward_std": 0.1058320552110672, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.41182976961135864, - "step": 1654 - }, - { - "completion_length": 45.09375, - "epoch": 0.5274059910771192, - "grad_norm": 31.651737213134766, - "kl": 0.1591796875, - "learning_rate": 4.7259400892288084e-07, - "loss": 0.0064, - "reward": 1.491950511932373, - "reward_std": 0.06471245735883713, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4919503927230835, - "rewards/pad": 0.0, - "step": 1655 - }, - { - "completion_length": 70.6875, - "epoch": 0.5277246653919694, - "grad_norm": 48.495574951171875, - "kl": 0.1533203125, - "learning_rate": 4.722753346080306e-07, - "loss": 0.0061, - "reward": 1.7000138759613037, - "reward_std": 0.11548508703708649, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5750138163566589, - "rewards/pad": 0.125, - "step": 1656 - }, - { - "completion_length": 96.0625, - "epoch": 0.5280433397068196, - "grad_norm": 42.84994888305664, - "kl": 0.310546875, - "learning_rate": 4.719566602931804e-07, - "loss": 0.0124, - "reward": 1.5187783241271973, - "reward_std": 0.04149685800075531, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5187783241271973, - "step": 1657 - }, - { - "completion_length": 97.265625, - "epoch": 0.5283620140216698, - "grad_norm": 28.610042572021484, - "kl": 0.1630859375, - "learning_rate": 4.716379859783301e-07, - "loss": 0.0065, - "reward": 1.5452983379364014, - "reward_std": 0.07879804074764252, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4202982485294342, - "step": 1658 - }, - { - "completion_length": 122.921875, - "epoch": 0.52868068833652, - "grad_norm": 51.03239440917969, - "kl": 0.1171875, - "learning_rate": 4.713193116634799e-07, - "loss": 0.0047, - "reward": 1.5429987907409668, - "reward_std": 0.1555194854736328, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.44924867153167725, - "rewards/pad": 0.125, - "step": 1659 - }, - { - "completion_length": 98.25, - "epoch": 0.5289993626513703, - "grad_norm": 53.16851043701172, - "kl": 0.126953125, - "learning_rate": 4.7100063734862966e-07, - "loss": 0.0051, - "reward": 1.633514642715454, - "reward_std": 0.07731299102306366, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5085147619247437, - "step": 1660 - }, - { - "completion_length": 72.828125, - "epoch": 0.5293180369662205, - "grad_norm": 19.036083221435547, - "kl": 0.12109375, - "learning_rate": 4.706819630337794e-07, - "loss": 0.0048, - "reward": 1.5364749431610107, - "reward_std": 0.0853765681385994, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3958500325679779, - "rewards/pad": 0.140625, - "step": 1661 - }, - { - "completion_length": 94.09375, - "epoch": 0.5296367112810707, - "grad_norm": 31.02826499938965, - "kl": 0.125, - "learning_rate": 4.703632887189292e-07, - "loss": 0.005, - "reward": 1.4983229637145996, - "reward_std": 0.06386838108301163, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4983229339122772, - "rewards/pad": 0.0, - "step": 1662 - }, - { - "completion_length": 118.890625, - "epoch": 0.5299553855959209, - "grad_norm": 30.89619255065918, - "kl": 0.10693359375, - "learning_rate": 4.70044614404079e-07, - "loss": 0.0043, - "reward": 1.4212117195129395, - "reward_std": 0.04871714487671852, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42121171951293945, - "step": 1663 - }, - { - "completion_length": 96.296875, - "epoch": 0.5302740599107711, - "grad_norm": 25.728801727294922, - "kl": 0.1259765625, - "learning_rate": 4.697259400892288e-07, - "loss": 0.005, - "reward": 1.4896304607391357, - "reward_std": 0.08095531165599823, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48963046073913574, - "rewards/pad": 0.0, - "step": 1664 - }, - { - "completion_length": 125.40625, - "epoch": 0.5305927342256214, - "grad_norm": 18.65762710571289, - "kl": 0.07080078125, - "learning_rate": 4.6940726577437853e-07, - "loss": 0.0028, - "reward": 1.63633394241333, - "reward_std": 0.07638265937566757, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5113338828086853, - "step": 1665 - }, - { - "completion_length": 122.71875, - "epoch": 0.5309114085404716, - "grad_norm": 13.211691856384277, - "kl": 0.091796875, - "learning_rate": 4.6908859145952834e-07, - "loss": 0.0037, - "reward": 1.618811845779419, - "reward_std": 0.05741771683096886, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4938117265701294, - "step": 1666 - }, - { - "completion_length": 95.46875, - "epoch": 0.5312300828553218, - "grad_norm": 32.33008575439453, - "kl": 0.2041015625, - "learning_rate": 4.687699171446781e-07, - "loss": 0.0082, - "reward": 1.3568158149719238, - "reward_std": 0.03793134540319443, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35681572556495667, - "rewards/pad": 0.0, - "step": 1667 - }, - { - "completion_length": 96.359375, - "epoch": 0.5315487571701721, - "grad_norm": 26.959739685058594, - "kl": 0.11083984375, - "learning_rate": 4.684512428298279e-07, - "loss": 0.0044, - "reward": 1.5582318305969238, - "reward_std": 0.08168759942054749, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5582318902015686, - "rewards/pad": 0.0, - "step": 1668 - }, - { - "completion_length": 70.75, - "epoch": 0.5318674314850224, - "grad_norm": 106.21484375, - "kl": 0.1513671875, - "learning_rate": 4.6813256851497766e-07, - "loss": 0.006, - "reward": 1.5603129863739014, - "reward_std": 0.08481752872467041, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45093798637390137, - "rewards/pad": 0.109375, - "step": 1669 - }, - { - "completion_length": 97.265625, - "epoch": 0.5321861057998726, - "grad_norm": 31.24065589904785, - "kl": 0.12060546875, - "learning_rate": 4.6781389420012746e-07, - "loss": 0.0048, - "reward": 1.829498529434204, - "reward_std": 0.07612404227256775, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5794985890388489, - "step": 1670 - }, - { - "completion_length": 146.171875, - "epoch": 0.5325047801147228, - "grad_norm": 22.557510375976562, - "kl": 0.08203125, - "learning_rate": 4.674952198852772e-07, - "loss": 0.0033, - "reward": 1.5544493198394775, - "reward_std": 0.0435752347111702, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5544492602348328, - "step": 1671 - }, - { - "completion_length": 96.65625, - "epoch": 0.532823454429573, - "grad_norm": 16.112016677856445, - "kl": 0.111328125, - "learning_rate": 4.67176545570427e-07, - "loss": 0.0044, - "reward": 1.5003379583358765, - "reward_std": 0.0549815371632576, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5003380179405212, - "step": 1672 - }, - { - "completion_length": 70.046875, - "epoch": 0.5331421287444232, - "grad_norm": 38.697509765625, - "kl": 0.2333984375, - "learning_rate": 4.668578712555768e-07, - "loss": 0.0093, - "reward": 1.4883949756622314, - "reward_std": 0.12367160618305206, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48839494585990906, - "rewards/pad": 0.0, - "step": 1673 - }, - { - "completion_length": 73.140625, - "epoch": 0.5334608030592735, - "grad_norm": 45.91947555541992, - "kl": 0.1513671875, - "learning_rate": 4.665391969407266e-07, - "loss": 0.0061, - "reward": 1.7846755981445312, - "reward_std": 0.1634289026260376, - "rewards/answer_reward": 0.40625, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.394050657749176, - "step": 1674 - }, - { - "completion_length": 97.578125, - "epoch": 0.5337794773741237, - "grad_norm": 70.14591979980469, - "kl": 0.15234375, - "learning_rate": 4.6622052262587634e-07, - "loss": 0.0061, - "reward": 1.4238718748092651, - "reward_std": 0.0894370749592781, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42387187480926514, - "rewards/pad": 0.0, - "step": 1675 - }, - { - "completion_length": 121.34375, - "epoch": 0.5340981516889739, - "grad_norm": 9.284736633300781, - "kl": 0.1103515625, - "learning_rate": 4.6590184831102615e-07, - "loss": 0.0044, - "reward": 1.562800407409668, - "reward_std": 0.05218476057052612, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5628004670143127, - "rewards/pad": 0.0, - "step": 1676 - }, - { - "completion_length": 124.09375, - "epoch": 0.5344168260038241, - "grad_norm": 13.18323040008545, - "kl": 0.154296875, - "learning_rate": 4.6558317399617585e-07, - "loss": 0.0062, - "reward": 1.4828343391418457, - "reward_std": 0.1092568188905716, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46720945835113525, - "rewards/pad": 0.015625, - "step": 1677 - }, - { - "completion_length": 122.71875, - "epoch": 0.5347355003186743, - "grad_norm": 56.83836364746094, - "kl": 0.11572265625, - "learning_rate": 4.6526449968132566e-07, - "loss": 0.0046, - "reward": 1.644931674003601, - "reward_std": 0.11709925532341003, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.5355565547943115, - "step": 1678 - }, - { - "completion_length": 173.828125, - "epoch": 0.5350541746335246, - "grad_norm": 6.490396976470947, - "kl": 0.0771484375, - "learning_rate": 4.649458253664754e-07, - "loss": 0.0031, - "reward": 1.646160364151001, - "reward_std": 0.03755709156394005, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6461603045463562, - "step": 1679 - }, - { - "completion_length": 96.375, - "epoch": 0.5353728489483748, - "grad_norm": 31.79176902770996, - "kl": 0.154296875, - "learning_rate": 4.646271510516252e-07, - "loss": 0.0062, - "reward": 1.514885425567627, - "reward_std": 0.05413583666086197, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.514885425567627, - "rewards/pad": 0.0, - "step": 1680 - }, - { - "completion_length": 122.1875, - "epoch": 0.535691523263225, - "grad_norm": 16.86212158203125, - "kl": 0.15625, - "learning_rate": 4.6430847673677497e-07, - "loss": 0.0063, - "reward": 1.5246326923370361, - "reward_std": 0.09441748261451721, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5246326923370361, - "rewards/pad": 0.0, - "step": 1681 - }, - { - "completion_length": 68.9375, - "epoch": 0.5360101975780752, - "grad_norm": 40.876766204833984, - "kl": 0.1806640625, - "learning_rate": 4.639898024219248e-07, - "loss": 0.0072, - "reward": 1.5343561172485352, - "reward_std": 0.06397495418787003, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5343560576438904, - "rewards/pad": 0.0, - "step": 1682 - }, - { - "completion_length": 97.75, - "epoch": 0.5363288718929254, - "grad_norm": 28.436660766601562, - "kl": 0.11279296875, - "learning_rate": 4.6367112810707453e-07, - "loss": 0.0045, - "reward": 1.4423682689666748, - "reward_std": 0.05234856903553009, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44236820936203003, - "step": 1683 - }, - { - "completion_length": 20.625, - "epoch": 0.5366475462077757, - "grad_norm": 33.24636459350586, - "kl": 0.2041015625, - "learning_rate": 4.6335245379222434e-07, - "loss": 0.0082, - "reward": 1.596472978591919, - "reward_std": 0.09440158307552338, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4714728891849518, - "rewards/pad": 0.125, - "step": 1684 - }, - { - "completion_length": 148.65625, - "epoch": 0.5369662205226259, - "grad_norm": 37.23117446899414, - "kl": 0.2099609375, - "learning_rate": 4.630337794773741e-07, - "loss": 0.0084, - "reward": 1.5583281517028809, - "reward_std": 0.07235017418861389, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43332818150520325, - "step": 1685 - }, - { - "completion_length": 71.5, - "epoch": 0.5372848948374761, - "grad_norm": 33.29726791381836, - "kl": 0.1376953125, - "learning_rate": 4.627151051625239e-07, - "loss": 0.0055, - "reward": 1.618628740310669, - "reward_std": 0.09640872478485107, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4936287999153137, - "rewards/pad": 0.125, - "step": 1686 - }, - { - "completion_length": 121.78125, - "epoch": 0.5376035691523263, - "grad_norm": 9.005082130432129, - "kl": 0.1181640625, - "learning_rate": 4.6239643084767365e-07, - "loss": 0.0047, - "reward": 1.4646806716918945, - "reward_std": 0.052319884300231934, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.464680552482605, - "step": 1687 - }, - { - "completion_length": 71.21875, - "epoch": 0.5379222434671765, - "grad_norm": 26.622392654418945, - "kl": 0.1728515625, - "learning_rate": 4.6207775653282346e-07, - "loss": 0.0069, - "reward": 1.6820809841156006, - "reward_std": 0.09217497706413269, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6820809841156006, - "step": 1688 - }, - { - "completion_length": 123.609375, - "epoch": 0.5382409177820268, - "grad_norm": 97.50489044189453, - "kl": 0.185546875, - "learning_rate": 4.617590822179732e-07, - "loss": 0.0074, - "reward": 1.5759509801864624, - "reward_std": 0.12089285254478455, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4509509205818176, - "step": 1689 - }, - { - "completion_length": 94.3125, - "epoch": 0.538559592096877, - "grad_norm": 39.29410934448242, - "kl": 0.5078125, - "learning_rate": 4.61440407903123e-07, - "loss": 0.0204, - "reward": 1.592628002166748, - "reward_std": 0.08428885787725449, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5926278829574585, - "rewards/pad": 0.0, - "step": 1690 - }, - { - "completion_length": 94.984375, - "epoch": 0.5388782664117272, - "grad_norm": 27.28589630126953, - "kl": 0.19140625, - "learning_rate": 4.611217335882728e-07, - "loss": 0.0077, - "reward": 1.4897959232330322, - "reward_std": 0.061947956681251526, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48979589343070984, - "rewards/pad": 0.0, - "step": 1691 - }, - { - "completion_length": 69.59375, - "epoch": 0.5391969407265774, - "grad_norm": 15.837658882141113, - "kl": 0.314453125, - "learning_rate": 4.608030592734226e-07, - "loss": 0.0126, - "reward": 1.4092113971710205, - "reward_std": 0.045040152966976166, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.28421148657798767, - "step": 1692 - }, - { - "completion_length": 69.234375, - "epoch": 0.5395156150414276, - "grad_norm": 26.907012939453125, - "kl": 0.162109375, - "learning_rate": 4.6048438495857234e-07, - "loss": 0.0065, - "reward": 1.5673654079437256, - "reward_std": 0.09178591519594193, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5673654079437256, - "rewards/pad": 0.0, - "step": 1693 - }, - { - "completion_length": 97.015625, - "epoch": 0.5398342893562779, - "grad_norm": 33.33298873901367, - "kl": 0.142578125, - "learning_rate": 4.601657106437221e-07, - "loss": 0.0057, - "reward": 1.7240219116210938, - "reward_std": 0.10178350657224655, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5990219712257385, - "rewards/pad": 0.125, - "step": 1694 - }, - { - "completion_length": 147.5625, - "epoch": 0.5401529636711281, - "grad_norm": 15.036523818969727, - "kl": 0.1083984375, - "learning_rate": 4.598470363288719e-07, - "loss": 0.0043, - "reward": 1.528503179550171, - "reward_std": 0.05876230448484421, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5285031795501709, - "rewards/pad": 0.0, - "step": 1695 - }, - { - "completion_length": 96.390625, - "epoch": 0.5404716379859783, - "grad_norm": 20.0390682220459, - "kl": 0.173828125, - "learning_rate": 4.5952836201402165e-07, - "loss": 0.007, - "reward": 1.6498191356658936, - "reward_std": 0.058616429567337036, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3998190462589264, - "rewards/pad": 0.25, - "step": 1696 - }, - { - "completion_length": 97.6875, - "epoch": 0.5407903123008285, - "grad_norm": 100.7648696899414, - "kl": 0.158203125, - "learning_rate": 4.592096876991714e-07, - "loss": 0.0063, - "reward": 1.6984670162200928, - "reward_std": 0.10114402323961258, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5265920162200928, - "rewards/pad": 0.171875, - "step": 1697 - }, - { - "completion_length": 93.28125, - "epoch": 0.5411089866156787, - "grad_norm": 18.635839462280273, - "kl": 0.1865234375, - "learning_rate": 4.5889101338432116e-07, - "loss": 0.0074, - "reward": 1.5091419219970703, - "reward_std": 0.06434226781129837, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5091418623924255, - "step": 1698 - }, - { - "completion_length": 95.71875, - "epoch": 0.541427660930529, - "grad_norm": 148.67578125, - "kl": 0.1328125, - "learning_rate": 4.5857233906947097e-07, - "loss": 0.0053, - "reward": 1.6802880764007568, - "reward_std": 0.06698621809482574, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6802880764007568, - "step": 1699 - }, - { - "completion_length": 124.125, - "epoch": 0.5417463352453792, - "grad_norm": 8.406394958496094, - "kl": 0.091796875, - "learning_rate": 4.582536647546207e-07, - "loss": 0.0037, - "reward": 1.604690670967102, - "reward_std": 0.06378532201051712, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47969070076942444, - "step": 1700 - }, - { - "completion_length": 46.109375, - "epoch": 0.5420650095602294, - "grad_norm": 23.811153411865234, - "kl": 0.138671875, - "learning_rate": 4.5793499043977053e-07, - "loss": 0.0056, - "reward": 1.6016175746917725, - "reward_std": 0.05782673507928848, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4766176640987396, - "rewards/pad": 0.125, - "step": 1701 - }, - { - "completion_length": 47.765625, - "epoch": 0.5423836838750796, - "grad_norm": 20.4882755279541, - "kl": 0.1748046875, - "learning_rate": 4.576163161249203e-07, - "loss": 0.007, - "reward": 2.0038671493530273, - "reward_std": 0.07229779660701752, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6288673281669617, - "step": 1702 - }, - { - "completion_length": 98.109375, - "epoch": 0.5427023581899298, - "grad_norm": 8.164681434631348, - "kl": 0.1396484375, - "learning_rate": 4.572976418100701e-07, - "loss": 0.0056, - "reward": 1.90226411819458, - "reward_std": 0.06600087881088257, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6522639989852905, - "step": 1703 - }, - { - "completion_length": 122.890625, - "epoch": 0.5430210325047801, - "grad_norm": 10.439600944519043, - "kl": 0.1513671875, - "learning_rate": 4.5697896749521984e-07, - "loss": 0.006, - "reward": 1.6205843687057495, - "reward_std": 0.07790669053792953, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6205843687057495, - "rewards/pad": 0.0, - "step": 1704 - }, - { - "completion_length": 97.984375, - "epoch": 0.5433397068196303, - "grad_norm": 16.746431350708008, - "kl": 0.12890625, - "learning_rate": 4.5666029318036965e-07, - "loss": 0.0052, - "reward": 1.438126802444458, - "reward_std": 0.037393443286418915, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4381268322467804, - "rewards/pad": 0.0, - "step": 1705 - }, - { - "completion_length": 174.625, - "epoch": 0.5436583811344805, - "grad_norm": 119.52044677734375, - "kl": 0.1025390625, - "learning_rate": 4.563416188655194e-07, - "loss": 0.0041, - "reward": 1.4924670457839966, - "reward_std": 0.04576924815773964, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4924669861793518, - "rewards/pad": 0.0, - "step": 1706 - }, - { - "completion_length": 120.671875, - "epoch": 0.5439770554493308, - "grad_norm": 27.163732528686523, - "kl": 0.11474609375, - "learning_rate": 4.560229445506692e-07, - "loss": 0.0046, - "reward": 1.5915979146957397, - "reward_std": 0.0663021057844162, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48222291469573975, - "rewards/pad": 0.109375, - "step": 1707 - }, - { - "completion_length": 121.15625, - "epoch": 0.5442957297641811, - "grad_norm": 19.470294952392578, - "kl": 0.2158203125, - "learning_rate": 4.5570427023581896e-07, - "loss": 0.0086, - "reward": 1.5954852104187012, - "reward_std": 0.1103629320859909, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5954852104187012, - "rewards/pad": 0.0, - "step": 1708 - }, - { - "completion_length": 45.359375, - "epoch": 0.5446144040790313, - "grad_norm": 35.75319290161133, - "kl": 0.1689453125, - "learning_rate": 4.5538559592096877e-07, - "loss": 0.0068, - "reward": 1.6164058446884155, - "reward_std": 0.1098790392279625, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5226558446884155, - "rewards/pad": 0.09375, - "step": 1709 - }, - { - "completion_length": 122.734375, - "epoch": 0.5449330783938815, - "grad_norm": 77.64950561523438, - "kl": 0.1826171875, - "learning_rate": 4.550669216061185e-07, - "loss": 0.0073, - "reward": 1.546919584274292, - "reward_std": 0.0800800621509552, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.421919584274292, - "rewards/pad": 0.125, - "step": 1710 - }, - { - "completion_length": 98.78125, - "epoch": 0.5452517527087317, - "grad_norm": 16.44868278503418, - "kl": 0.154296875, - "learning_rate": 4.5474824729126833e-07, - "loss": 0.0062, - "reward": 1.552331566810608, - "reward_std": 0.040778547525405884, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4273315668106079, - "step": 1711 - }, - { - "completion_length": 71.4375, - "epoch": 0.5455704270235819, - "grad_norm": 97.36717987060547, - "kl": 0.173828125, - "learning_rate": 4.544295729764181e-07, - "loss": 0.007, - "reward": 1.6137738227844238, - "reward_std": 0.06848740577697754, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48877376317977905, - "rewards/pad": 0.125, - "step": 1712 - }, - { - "completion_length": 70.84375, - "epoch": 0.5458891013384322, - "grad_norm": 26.707841873168945, - "kl": 0.1943359375, - "learning_rate": 4.541108986615679e-07, - "loss": 0.0078, - "reward": 1.5837278366088867, - "reward_std": 0.06415103375911713, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4587279260158539, - "rewards/pad": 0.125, - "step": 1713 - }, - { - "completion_length": 95.40625, - "epoch": 0.5462077756532824, - "grad_norm": 81.21223449707031, - "kl": 0.2265625, - "learning_rate": 4.5379222434671765e-07, - "loss": 0.009, - "reward": 1.4255642890930176, - "reward_std": 0.05765555426478386, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.425564169883728, - "rewards/pad": 0.0, - "step": 1714 - }, - { - "completion_length": 96.625, - "epoch": 0.5465264499681326, - "grad_norm": 17.838123321533203, - "kl": 0.12158203125, - "learning_rate": 4.5347355003186745e-07, - "loss": 0.0049, - "reward": 1.4614139795303345, - "reward_std": 0.041417405009269714, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4614139795303345, - "step": 1715 - }, - { - "completion_length": 98.140625, - "epoch": 0.5468451242829828, - "grad_norm": 16.121427536010742, - "kl": 0.25390625, - "learning_rate": 4.531548757170172e-07, - "loss": 0.0101, - "reward": 1.5973544120788574, - "reward_std": 0.08118489384651184, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45672938227653503, - "rewards/pad": 0.140625, - "step": 1716 - }, - { - "completion_length": 69.5625, - "epoch": 0.547163798597833, - "grad_norm": 114.39732360839844, - "kl": 0.14453125, - "learning_rate": 4.5283620140216696e-07, - "loss": 0.0058, - "reward": 1.7027480602264404, - "reward_std": 0.07084536552429199, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.7027480006217957, - "step": 1717 - }, - { - "completion_length": 71.390625, - "epoch": 0.5474824729126833, - "grad_norm": 18.393924713134766, - "kl": 0.2265625, - "learning_rate": 4.525175270873167e-07, - "loss": 0.0091, - "reward": 1.7171638011932373, - "reward_std": 0.09167201817035675, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4671638309955597, - "rewards/pad": 0.25, - "step": 1718 - }, - { - "completion_length": 122.40625, - "epoch": 0.5478011472275335, - "grad_norm": 39.043643951416016, - "kl": 0.1201171875, - "learning_rate": 4.521988527724665e-07, - "loss": 0.0048, - "reward": 1.5978953838348389, - "reward_std": 0.069697305560112, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.47289541363716125, - "step": 1719 - }, - { - "completion_length": 147.8125, - "epoch": 0.5481198215423837, - "grad_norm": 26.979312896728516, - "kl": 0.138671875, - "learning_rate": 4.518801784576163e-07, - "loss": 0.0055, - "reward": 1.5596652030944824, - "reward_std": 0.0610220730304718, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5596652626991272, - "rewards/pad": 0.0, - "step": 1720 - }, - { - "completion_length": 97.0625, - "epoch": 0.5484384958572339, - "grad_norm": 35.1412353515625, - "kl": 0.197265625, - "learning_rate": 4.515615041427661e-07, - "loss": 0.0079, - "reward": 1.400559902191162, - "reward_std": 0.0762283131480217, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4005598723888397, - "step": 1721 - }, - { - "completion_length": 97.09375, - "epoch": 0.5487571701720841, - "grad_norm": 40.99324417114258, - "kl": 0.125, - "learning_rate": 4.5124282982791584e-07, - "loss": 0.005, - "reward": 1.4977219104766846, - "reward_std": 0.1251518428325653, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4195968806743622, - "rewards/pad": 0.078125, - "step": 1722 - }, - { - "completion_length": 95.0, - "epoch": 0.5490758444869344, - "grad_norm": 27.61468505859375, - "kl": 0.134765625, - "learning_rate": 4.509241555130656e-07, - "loss": 0.0054, - "reward": 1.4169597625732422, - "reward_std": 0.16255837678909302, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.35445982217788696, - "step": 1723 - }, - { - "completion_length": 95.25, - "epoch": 0.5493945188017846, - "grad_norm": 51.87438201904297, - "kl": 0.16015625, - "learning_rate": 4.506054811982154e-07, - "loss": 0.0064, - "reward": 1.3904025554656982, - "reward_std": 0.06188996881246567, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3904026448726654, - "step": 1724 - }, - { - "completion_length": 70.859375, - "epoch": 0.5497131931166348, - "grad_norm": 47.93151092529297, - "kl": 0.1396484375, - "learning_rate": 4.5028680688336515e-07, - "loss": 0.0056, - "reward": 1.4469374418258667, - "reward_std": 0.05983661487698555, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4469375014305115, - "rewards/pad": 0.0, - "step": 1725 - }, - { - "completion_length": 70.828125, - "epoch": 0.550031867431485, - "grad_norm": 55.93996047973633, - "kl": 0.2119140625, - "learning_rate": 4.4996813256851496e-07, - "loss": 0.0085, - "reward": 1.595890998840332, - "reward_std": 0.11610599607229233, - "rewards/answer_reward": 0.0625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5333911180496216, - "step": 1726 - }, - { - "completion_length": 122.890625, - "epoch": 0.5503505417463352, - "grad_norm": 29.33034896850586, - "kl": 0.13671875, - "learning_rate": 4.496494582536647e-07, - "loss": 0.0055, - "reward": 1.574709177017212, - "reward_std": 0.0751732587814331, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5747092366218567, - "step": 1727 - }, - { - "completion_length": 72.0, - "epoch": 0.5506692160611855, - "grad_norm": 45.549583435058594, - "kl": 0.1357421875, - "learning_rate": 4.493307839388145e-07, - "loss": 0.0054, - "reward": 1.8912920951843262, - "reward_std": 0.10724160820245743, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5475420355796814, - "rewards/pad": 0.34375, - "step": 1728 - }, - { - "completion_length": 122.859375, - "epoch": 0.5509878903760357, - "grad_norm": 122.186767578125, - "kl": 0.0888671875, - "learning_rate": 4.490121096239643e-07, - "loss": 0.0036, - "reward": 1.504943609237671, - "reward_std": 0.09938017278909683, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5049434900283813, - "rewards/pad": 0.0, - "step": 1729 - }, - { - "completion_length": 44.203125, - "epoch": 0.5513065646908859, - "grad_norm": 33.6113166809082, - "kl": 0.17578125, - "learning_rate": 4.486934353091141e-07, - "loss": 0.007, - "reward": 1.710925579071045, - "reward_std": 0.08198478817939758, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7109256982803345, - "rewards/pad": 0.0, - "step": 1730 - }, - { - "completion_length": 43.546875, - "epoch": 0.5516252390057361, - "grad_norm": 20.308115005493164, - "kl": 0.484375, - "learning_rate": 4.4837476099426384e-07, - "loss": 0.0194, - "reward": 1.6070808172225952, - "reward_std": 0.09324438869953156, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6070806980133057, - "rewards/pad": 0.0, - "step": 1731 - }, - { - "completion_length": 119.421875, - "epoch": 0.5519439133205863, - "grad_norm": 461.97021484375, - "kl": 0.134765625, - "learning_rate": 4.4805608667941364e-07, - "loss": 0.0054, - "reward": 1.552412509918213, - "reward_std": 0.05798398330807686, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5524123907089233, - "rewards/pad": 0.0, - "step": 1732 - }, - { - "completion_length": 96.515625, - "epoch": 0.5522625876354366, - "grad_norm": 27.874906539916992, - "kl": 0.16015625, - "learning_rate": 4.477374123645634e-07, - "loss": 0.0064, - "reward": 1.422250747680664, - "reward_std": 0.050203837454319, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42225080728530884, - "rewards/pad": 0.0, - "step": 1733 - }, - { - "completion_length": 118.59375, - "epoch": 0.5525812619502868, - "grad_norm": 25.63271713256836, - "kl": 0.12890625, - "learning_rate": 4.474187380497132e-07, - "loss": 0.0051, - "reward": 1.4211066961288452, - "reward_std": 0.052193887531757355, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42110663652420044, - "rewards/pad": 0.0, - "step": 1734 - }, - { - "completion_length": 44.1875, - "epoch": 0.552899936265137, - "grad_norm": 34.65806579589844, - "kl": 0.166015625, - "learning_rate": 4.4710006373486296e-07, - "loss": 0.0067, - "reward": 1.8596563339233398, - "reward_std": 0.08217694610357285, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6096563339233398, - "step": 1735 - }, - { - "completion_length": 94.078125, - "epoch": 0.5532186105799872, - "grad_norm": 33.38858413696289, - "kl": 0.1083984375, - "learning_rate": 4.467813894200127e-07, - "loss": 0.0043, - "reward": 1.6893126964569092, - "reward_std": 0.04501698166131973, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.439312607049942, - "step": 1736 - }, - { - "completion_length": 72.140625, - "epoch": 0.5535372848948374, - "grad_norm": 38.28264236450195, - "kl": 0.1455078125, - "learning_rate": 4.4646271510516247e-07, - "loss": 0.0058, - "reward": 1.7901391983032227, - "reward_std": 0.048542171716690063, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5401391386985779, - "rewards/pad": 0.25, - "step": 1737 - }, - { - "completion_length": 70.671875, - "epoch": 0.5538559592096877, - "grad_norm": 49.95315170288086, - "kl": 0.17578125, - "learning_rate": 4.4614404079031227e-07, - "loss": 0.007, - "reward": 1.5110564231872559, - "reward_std": 0.08883722126483917, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5110564231872559, - "step": 1738 - }, - { - "completion_length": 95.1875, - "epoch": 0.5541746335245379, - "grad_norm": 79.57206726074219, - "kl": 0.11572265625, - "learning_rate": 4.4582536647546203e-07, - "loss": 0.0046, - "reward": 1.468375325202942, - "reward_std": 0.03867108374834061, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4683753252029419, - "step": 1739 - }, - { - "completion_length": 124.90625, - "epoch": 0.5544933078393881, - "grad_norm": 50.3587532043457, - "kl": 0.1083984375, - "learning_rate": 4.4550669216061183e-07, - "loss": 0.0043, - "reward": 1.4035348892211914, - "reward_std": 0.03411925584077835, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.403534859418869, - "rewards/pad": 0.0, - "step": 1740 - }, - { - "completion_length": 43.03125, - "epoch": 0.5548119821542383, - "grad_norm": 42.346439361572266, - "kl": 0.240234375, - "learning_rate": 4.451880178457616e-07, - "loss": 0.0096, - "reward": 1.7896267175674438, - "reward_std": 0.057393234223127365, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6646266579627991, - "step": 1741 - }, - { - "completion_length": 121.8125, - "epoch": 0.5551306564690885, - "grad_norm": 18.29889678955078, - "kl": 0.197265625, - "learning_rate": 4.448693435309114e-07, - "loss": 0.0079, - "reward": 1.6041014194488525, - "reward_std": 0.15853551030158997, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.49472635984420776, - "step": 1742 - }, - { - "completion_length": 20.484375, - "epoch": 0.5554493307839388, - "grad_norm": 34.001243591308594, - "kl": 0.25, - "learning_rate": 4.4455066921606115e-07, - "loss": 0.01, - "reward": 1.8880293369293213, - "reward_std": 0.08389134705066681, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6380293965339661, - "rewards/pad": 0.25, - "step": 1743 - }, - { - "completion_length": 70.875, - "epoch": 0.555768005098789, - "grad_norm": 25.211380004882812, - "kl": 0.13671875, - "learning_rate": 4.4423199490121096e-07, - "loss": 0.0055, - "reward": 1.6173521280288696, - "reward_std": 0.11482757329940796, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5079771876335144, - "step": 1744 - }, - { - "completion_length": 43.640625, - "epoch": 0.5560866794136392, - "grad_norm": 33.18324661254883, - "kl": 0.1474609375, - "learning_rate": 4.439133205863607e-07, - "loss": 0.0059, - "reward": 1.6932084560394287, - "reward_std": 0.06315641105175018, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6932085156440735, - "rewards/pad": 0.0, - "step": 1745 - }, - { - "completion_length": 70.59375, - "epoch": 0.5564053537284895, - "grad_norm": 23.28170394897461, - "kl": 0.1064453125, - "learning_rate": 4.435946462715105e-07, - "loss": 0.0043, - "reward": 1.636433720588684, - "reward_std": 0.06571288406848907, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6364337205886841, - "step": 1746 - }, - { - "completion_length": 119.765625, - "epoch": 0.5567240280433398, - "grad_norm": 30.740108489990234, - "kl": 0.240234375, - "learning_rate": 4.4327597195666027e-07, - "loss": 0.0096, - "reward": 1.7088111639022827, - "reward_std": 0.10138913244009018, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5838110446929932, - "step": 1747 - }, - { - "completion_length": 73.3125, - "epoch": 0.55704270235819, - "grad_norm": 92.40509796142578, - "kl": 0.189453125, - "learning_rate": 4.429572976418101e-07, - "loss": 0.0076, - "reward": 1.7267744541168213, - "reward_std": 0.2050127536058426, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5080245137214661, - "rewards/pad": 0.234375, - "step": 1748 - }, - { - "completion_length": 70.28125, - "epoch": 0.5573613766730402, - "grad_norm": 48.15990447998047, - "kl": 0.1494140625, - "learning_rate": 4.4263862332695983e-07, - "loss": 0.006, - "reward": 1.694014072418213, - "reward_std": 0.09263955056667328, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5846391320228577, - "step": 1749 - }, - { - "completion_length": 45.21875, - "epoch": 0.5576800509878904, - "grad_norm": 46.90249252319336, - "kl": 0.35546875, - "learning_rate": 4.4231994901210964e-07, - "loss": 0.0142, - "reward": 1.600165605545044, - "reward_std": 0.15194952487945557, - "rewards/answer_reward": 0.171875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4282906651496887, - "step": 1750 - }, - { - "completion_length": 68.71875, - "epoch": 0.5579987253027406, - "grad_norm": 16.717771530151367, - "kl": 0.1533203125, - "learning_rate": 4.420012746972594e-07, - "loss": 0.0061, - "reward": 1.71664297580719, - "reward_std": 0.07848171889781952, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5916429758071899, - "rewards/pad": 0.125, - "step": 1751 - }, - { - "completion_length": 122.171875, - "epoch": 0.5583173996175909, - "grad_norm": 96.13909149169922, - "kl": 0.1865234375, - "learning_rate": 4.416826003824092e-07, - "loss": 0.0075, - "reward": 1.4779011011123657, - "reward_std": 0.159339040517807, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3685261309146881, - "step": 1752 - }, - { - "completion_length": 173.46875, - "epoch": 0.5586360739324411, - "grad_norm": 43.84035110473633, - "kl": 0.09228515625, - "learning_rate": 4.4136392606755895e-07, - "loss": 0.0037, - "reward": 1.5944308042526245, - "reward_std": 0.05909878760576248, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4694308340549469, - "rewards/pad": 0.125, - "step": 1753 - }, - { - "completion_length": 69.9375, - "epoch": 0.5589547482472913, - "grad_norm": 19.323455810546875, - "kl": 0.201171875, - "learning_rate": 4.4104525175270876e-07, - "loss": 0.008, - "reward": 1.7000752687454224, - "reward_std": 0.06558647751808167, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7000752091407776, - "rewards/pad": 0.0, - "step": 1754 - }, - { - "completion_length": 120.6875, - "epoch": 0.5592734225621415, - "grad_norm": 17.894180297851562, - "kl": 0.11962890625, - "learning_rate": 4.407265774378585e-07, - "loss": 0.0048, - "reward": 1.6068882942199707, - "reward_std": 0.05032335966825485, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6068882942199707, - "rewards/pad": 0.0, - "step": 1755 - }, - { - "completion_length": 96.234375, - "epoch": 0.5595920968769917, - "grad_norm": 105.96263885498047, - "kl": 0.11181640625, - "learning_rate": 4.404079031230082e-07, - "loss": 0.0045, - "reward": 1.6398990154266357, - "reward_std": 0.05206108093261719, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5148990750312805, - "rewards/pad": 0.125, - "step": 1756 - }, - { - "completion_length": 96.828125, - "epoch": 0.559910771191842, - "grad_norm": 17.890987396240234, - "kl": 0.1640625, - "learning_rate": 4.40089228808158e-07, - "loss": 0.0066, - "reward": 1.7085633277893066, - "reward_std": 0.11856211721897125, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5991884469985962, - "rewards/pad": 0.109375, - "step": 1757 - }, - { - "completion_length": 119.90625, - "epoch": 0.5602294455066922, - "grad_norm": 37.540985107421875, - "kl": 0.0986328125, - "learning_rate": 4.397705544933078e-07, - "loss": 0.0039, - "reward": 1.4744625091552734, - "reward_std": 0.11637206375598907, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.365087628364563, - "step": 1758 - }, - { - "completion_length": 96.984375, - "epoch": 0.5605481198215424, - "grad_norm": 44.76865768432617, - "kl": 0.1982421875, - "learning_rate": 4.394518801784576e-07, - "loss": 0.0079, - "reward": 1.5458180904388428, - "reward_std": 0.139680877327919, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5145681500434875, - "rewards/pad": 0.03125, - "step": 1759 - }, - { - "completion_length": 19.484375, - "epoch": 0.5608667941363926, - "grad_norm": 46.35175704956055, - "kl": 0.2060546875, - "learning_rate": 4.3913320586360734e-07, - "loss": 0.0082, - "reward": 1.5277413129806519, - "reward_std": 0.054538920521736145, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5277412533760071, - "rewards/pad": 0.0, - "step": 1760 - }, - { - "completion_length": 69.8125, - "epoch": 0.5611854684512428, - "grad_norm": 51.44297409057617, - "kl": 0.251953125, - "learning_rate": 4.3881453154875715e-07, - "loss": 0.0101, - "reward": 1.6726279258728027, - "reward_std": 0.08104786276817322, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.547627866268158, - "rewards/pad": 0.125, - "step": 1761 - }, - { - "completion_length": 71.1875, - "epoch": 0.5615041427660931, - "grad_norm": 26.13613510131836, - "kl": 0.15234375, - "learning_rate": 4.384958572339069e-07, - "loss": 0.0061, - "reward": 1.5365188121795654, - "reward_std": 0.05022150278091431, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2865188717842102, - "rewards/pad": 0.25, - "step": 1762 - }, - { - "completion_length": 95.15625, - "epoch": 0.5618228170809433, - "grad_norm": 15.526792526245117, - "kl": 0.2197265625, - "learning_rate": 4.381771829190567e-07, - "loss": 0.0088, - "reward": 1.558420181274414, - "reward_std": 0.06372374296188354, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4334202706813812, - "rewards/pad": 0.125, - "step": 1763 - }, - { - "completion_length": 46.1875, - "epoch": 0.5621414913957935, - "grad_norm": 38.44749450683594, - "kl": 0.703125, - "learning_rate": 4.3785850860420646e-07, - "loss": 0.0281, - "reward": 1.4561386108398438, - "reward_std": 0.04978647083044052, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.456138551235199, - "step": 1764 - }, - { - "completion_length": 97.734375, - "epoch": 0.5624601657106437, - "grad_norm": 26.058134078979492, - "kl": 0.15234375, - "learning_rate": 4.3753983428935627e-07, - "loss": 0.0061, - "reward": 1.730491042137146, - "reward_std": 0.06588118523359299, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6054909825325012, - "step": 1765 - }, - { - "completion_length": 123.25, - "epoch": 0.5627788400254939, - "grad_norm": 138.15040588378906, - "kl": 0.1005859375, - "learning_rate": 4.37221159974506e-07, - "loss": 0.004, - "reward": 1.4126694202423096, - "reward_std": 0.06151300668716431, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41266947984695435, - "rewards/pad": 0.0, - "step": 1766 - }, - { - "completion_length": 96.375, - "epoch": 0.5630975143403442, - "grad_norm": 21.328184127807617, - "kl": 0.15625, - "learning_rate": 4.3690248565965583e-07, - "loss": 0.0063, - "reward": 1.5501000881195068, - "reward_std": 0.10059985518455505, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48760002851486206, - "rewards/pad": 0.0625, - "step": 1767 - }, - { - "completion_length": 98.484375, - "epoch": 0.5634161886551944, - "grad_norm": 46.83586120605469, - "kl": 0.1416015625, - "learning_rate": 4.365838113448056e-07, - "loss": 0.0057, - "reward": 1.8298544883728027, - "reward_std": 0.0899556502699852, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5798544883728027, - "rewards/pad": 0.25, - "step": 1768 - }, - { - "completion_length": 101.953125, - "epoch": 0.5637348629700446, - "grad_norm": 17.387331008911133, - "kl": 0.1298828125, - "learning_rate": 4.362651370299554e-07, - "loss": 0.0052, - "reward": 1.625913381576538, - "reward_std": 0.0826636552810669, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5009133219718933, - "step": 1769 - }, - { - "completion_length": 120.59375, - "epoch": 0.5640535372848948, - "grad_norm": 61.88886260986328, - "kl": 0.291015625, - "learning_rate": 4.3594646271510514e-07, - "loss": 0.0116, - "reward": 1.4453045129776, - "reward_std": 0.11664284765720367, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4453044831752777, - "step": 1770 - }, - { - "completion_length": 97.515625, - "epoch": 0.564372211599745, - "grad_norm": 26.429187774658203, - "kl": 0.10302734375, - "learning_rate": 4.3562778840025495e-07, - "loss": 0.0041, - "reward": 1.506064772605896, - "reward_std": 0.09924240410327911, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4123147130012512, - "rewards/pad": 0.09375, - "step": 1771 - }, - { - "completion_length": 150.515625, - "epoch": 0.5646908859145953, - "grad_norm": 21.579301834106445, - "kl": 0.0927734375, - "learning_rate": 4.353091140854047e-07, - "loss": 0.0037, - "reward": 1.5770467519760132, - "reward_std": 0.09655211120843887, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5301716923713684, - "step": 1772 - }, - { - "completion_length": 150.203125, - "epoch": 0.5650095602294455, - "grad_norm": 15.933002471923828, - "kl": 0.08349609375, - "learning_rate": 4.349904397705545e-07, - "loss": 0.0033, - "reward": 1.4591649770736694, - "reward_std": 0.03933807089924812, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45916497707366943, - "rewards/pad": 0.0, - "step": 1773 - }, - { - "completion_length": 96.71875, - "epoch": 0.5653282345442957, - "grad_norm": 24.305068969726562, - "kl": 0.1220703125, - "learning_rate": 4.3467176545570427e-07, - "loss": 0.0049, - "reward": 1.490128755569458, - "reward_std": 0.04995978623628616, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.490128755569458, - "rewards/pad": 0.0, - "step": 1774 - }, - { - "completion_length": 96.5625, - "epoch": 0.5656469088591459, - "grad_norm": 47.11887741088867, - "kl": 0.10791015625, - "learning_rate": 4.34353091140854e-07, - "loss": 0.0043, - "reward": 1.543684720993042, - "reward_std": 0.18430109322071075, - "rewards/answer_reward": 0.03125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.528059720993042, - "step": 1775 - }, - { - "completion_length": 67.921875, - "epoch": 0.5659655831739961, - "grad_norm": 24.030990600585938, - "kl": 0.19921875, - "learning_rate": 4.340344168260038e-07, - "loss": 0.008, - "reward": 1.4360082149505615, - "reward_std": 0.04120776802301407, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4360082149505615, - "rewards/pad": 0.0, - "step": 1776 - }, - { - "completion_length": 43.765625, - "epoch": 0.5662842574888464, - "grad_norm": 27.24772834777832, - "kl": 0.2412109375, - "learning_rate": 4.337157425111536e-07, - "loss": 0.0096, - "reward": 1.5563490390777588, - "reward_std": 0.04915473610162735, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.556348979473114, - "rewards/pad": 0.0, - "step": 1777 - }, - { - "completion_length": 97.21875, - "epoch": 0.5666029318036966, - "grad_norm": 66.9402084350586, - "kl": 0.1357421875, - "learning_rate": 4.3339706819630333e-07, - "loss": 0.0054, - "reward": 1.52845299243927, - "reward_std": 0.08243995904922485, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4034530520439148, - "rewards/pad": 0.125, - "step": 1778 - }, - { - "completion_length": 70.203125, - "epoch": 0.5669216061185468, - "grad_norm": 42.54038619995117, - "kl": 0.2255859375, - "learning_rate": 4.3307839388145314e-07, - "loss": 0.009, - "reward": 1.5368831157684326, - "reward_std": 0.16717883944511414, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4275081157684326, - "rewards/pad": 0.109375, - "step": 1779 - }, - { - "completion_length": 122.578125, - "epoch": 0.567240280433397, - "grad_norm": 32.484798431396484, - "kl": 0.2001953125, - "learning_rate": 4.327597195666029e-07, - "loss": 0.008, - "reward": 1.7498657703399658, - "reward_std": 0.09915802627801895, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6404908299446106, - "rewards/pad": 0.109375, - "step": 1780 - }, - { - "completion_length": 47.03125, - "epoch": 0.5675589547482472, - "grad_norm": 67.59293365478516, - "kl": 0.1787109375, - "learning_rate": 4.324410452517527e-07, - "loss": 0.0072, - "reward": 1.6660947799682617, - "reward_std": 0.10472682118415833, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5410947203636169, - "step": 1781 - }, - { - "completion_length": 98.328125, - "epoch": 0.5678776290630975, - "grad_norm": 46.848995208740234, - "kl": 0.134765625, - "learning_rate": 4.3212237093690246e-07, - "loss": 0.0054, - "reward": 1.39346182346344, - "reward_std": 0.11564335227012634, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.36221185326576233, - "rewards/pad": 0.03125, - "step": 1782 - }, - { - "completion_length": 174.9375, - "epoch": 0.5681963033779477, - "grad_norm": 6.686620712280273, - "kl": 0.0830078125, - "learning_rate": 4.3180369662205226e-07, - "loss": 0.0033, - "reward": 1.7112741470336914, - "reward_std": 0.1290093958377838, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.47689908742904663, - "step": 1783 - }, - { - "completion_length": 68.859375, - "epoch": 0.5685149776927979, - "grad_norm": 33.932472229003906, - "kl": 0.2041015625, - "learning_rate": 4.31485022307202e-07, - "loss": 0.0082, - "reward": 1.681599736213684, - "reward_std": 0.05573929101228714, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5565997362136841, - "step": 1784 - }, - { - "completion_length": 71.453125, - "epoch": 0.5688336520076482, - "grad_norm": 26.65557861328125, - "kl": 0.1572265625, - "learning_rate": 4.3116634799235177e-07, - "loss": 0.0063, - "reward": 1.5919816493988037, - "reward_std": 0.07187262177467346, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5763567686080933, - "rewards/pad": 0.015625, - "step": 1785 - }, - { - "completion_length": 122.90625, - "epoch": 0.5691523263224985, - "grad_norm": 18.70304298400879, - "kl": 0.10986328125, - "learning_rate": 4.308476736775016e-07, - "loss": 0.0044, - "reward": 1.5359832048416138, - "reward_std": 0.07597589492797852, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.535983145236969, - "step": 1786 - }, - { - "completion_length": 121.109375, - "epoch": 0.5694710006373487, - "grad_norm": 27.156719207763672, - "kl": 0.103515625, - "learning_rate": 4.3052899936265133e-07, - "loss": 0.0041, - "reward": 1.549729585647583, - "reward_std": 0.03134915232658386, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4247296452522278, - "step": 1787 - }, - { - "completion_length": 69.9375, - "epoch": 0.5697896749521989, - "grad_norm": 24.777515411376953, - "kl": 0.205078125, - "learning_rate": 4.3021032504780114e-07, - "loss": 0.0082, - "reward": 1.6144177913665771, - "reward_std": 0.08694911748170853, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6144178509712219, - "rewards/pad": 0.0, - "step": 1788 - }, - { - "completion_length": 174.078125, - "epoch": 0.5701083492670491, - "grad_norm": 16.213119506835938, - "kl": 0.11572265625, - "learning_rate": 4.298916507329509e-07, - "loss": 0.0046, - "reward": 1.38360595703125, - "reward_std": 0.09917178004980087, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3992310166358948, - "step": 1789 - }, - { - "completion_length": 70.078125, - "epoch": 0.5704270235818993, - "grad_norm": 120.56027221679688, - "kl": 0.2353515625, - "learning_rate": 4.295729764181007e-07, - "loss": 0.0094, - "reward": 1.6199557781219482, - "reward_std": 0.11785067617893219, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49495577812194824, - "step": 1790 - }, - { - "completion_length": 43.453125, - "epoch": 0.5707456978967496, - "grad_norm": 36.520206451416016, - "kl": 0.24609375, - "learning_rate": 4.2925430210325045e-07, - "loss": 0.0099, - "reward": 1.5655598640441895, - "reward_std": 0.06695032119750977, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5655598640441895, - "rewards/pad": 0.0, - "step": 1791 - }, - { - "completion_length": 94.5625, - "epoch": 0.5710643722115998, - "grad_norm": 48.086097717285156, - "kl": 0.1552734375, - "learning_rate": 4.2893562778840026e-07, - "loss": 0.0062, - "reward": 1.5157904624938965, - "reward_std": 0.08498187363147736, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5157905220985413, - "rewards/pad": 0.0, - "step": 1792 - }, - { - "completion_length": 95.015625, - "epoch": 0.57138304652645, - "grad_norm": 16.781322479248047, - "kl": 0.1865234375, - "learning_rate": 4.2861695347355e-07, - "loss": 0.0075, - "reward": 1.5322211980819702, - "reward_std": 0.07854540646076202, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.532221257686615, - "rewards/pad": 0.0, - "step": 1793 - }, - { - "completion_length": 71.96875, - "epoch": 0.5717017208413002, - "grad_norm": 58.54457473754883, - "kl": 0.1435546875, - "learning_rate": 4.282982791586998e-07, - "loss": 0.0057, - "reward": 1.6249440908432007, - "reward_std": 0.1079111397266388, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5780690908432007, - "rewards/pad": 0.046875, - "step": 1794 - }, - { - "completion_length": 71.265625, - "epoch": 0.5720203951561504, - "grad_norm": 17.139041900634766, - "kl": 0.2314453125, - "learning_rate": 4.279796048438495e-07, - "loss": 0.0093, - "reward": 1.4836182594299316, - "reward_std": 0.11333303153514862, - "rewards/answer_reward": 0.0625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.42111825942993164, - "step": 1795 - }, - { - "completion_length": 42.53125, - "epoch": 0.5723390694710007, - "grad_norm": 37.97492599487305, - "kl": 0.2109375, - "learning_rate": 4.2766093052899933e-07, - "loss": 0.0084, - "reward": 1.445401906967163, - "reward_std": 0.12103722989559174, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4454018473625183, - "rewards/pad": 0.0, - "step": 1796 - }, - { - "completion_length": 124.15625, - "epoch": 0.5726577437858509, - "grad_norm": 35.62852478027344, - "kl": 0.2890625, - "learning_rate": 4.273422562141491e-07, - "loss": 0.0115, - "reward": 1.4103597402572632, - "reward_std": 0.09135589748620987, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.30098479986190796, - "rewards/pad": 0.125, - "step": 1797 - }, - { - "completion_length": 123.625, - "epoch": 0.5729764181007011, - "grad_norm": 31.406539916992188, - "kl": 0.146484375, - "learning_rate": 4.270235818992989e-07, - "loss": 0.0058, - "reward": 1.5586957931518555, - "reward_std": 0.0993969589471817, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5586957931518555, - "rewards/pad": 0.0, - "step": 1798 - }, - { - "completion_length": 97.75, - "epoch": 0.5732950924155513, - "grad_norm": 28.93682098388672, - "kl": 0.15234375, - "learning_rate": 4.2670490758444865e-07, - "loss": 0.0061, - "reward": 1.4915449619293213, - "reward_std": 0.08422195911407471, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4759199023246765, - "rewards/pad": 0.015625, - "step": 1799 - }, - { - "completion_length": 68.421875, - "epoch": 0.5736137667304015, - "grad_norm": 265.1728210449219, - "kl": 0.2216796875, - "learning_rate": 4.2638623326959845e-07, - "loss": 0.0089, - "reward": 1.5189340114593506, - "reward_std": 0.10375507175922394, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5189339518547058, - "rewards/pad": 0.0, - "step": 1800 - }, - { - "completion_length": 172.65625, - "epoch": 0.5739324410452518, - "grad_norm": 8.524855613708496, - "kl": 0.08984375, - "learning_rate": 4.260675589547482e-07, - "loss": 0.0036, - "reward": 1.3667278289794922, - "reward_std": 0.03605463728308678, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3667277693748474, - "step": 1801 - }, - { - "completion_length": 97.78125, - "epoch": 0.574251115360102, - "grad_norm": 28.816625595092773, - "kl": 0.1572265625, - "learning_rate": 4.25748884639898e-07, - "loss": 0.0063, - "reward": 1.5991346836090088, - "reward_std": 0.05232204869389534, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.34913477301597595, - "step": 1802 - }, - { - "completion_length": 18.375, - "epoch": 0.5745697896749522, - "grad_norm": 62.05613327026367, - "kl": 0.189453125, - "learning_rate": 4.2543021032504777e-07, - "loss": 0.0076, - "reward": 1.6871989965438843, - "reward_std": 0.08016446232795715, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6871989965438843, - "rewards/pad": 0.0, - "step": 1803 - }, - { - "completion_length": 71.40625, - "epoch": 0.5748884639898024, - "grad_norm": 20.37428855895996, - "kl": 0.283203125, - "learning_rate": 4.251115360101976e-07, - "loss": 0.0114, - "reward": 1.819273829460144, - "reward_std": 0.08338651806116104, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.569273829460144, - "rewards/pad": 0.25, - "step": 1804 - }, - { - "completion_length": 95.609375, - "epoch": 0.5752071383046526, - "grad_norm": 34.7626953125, - "kl": 0.14453125, - "learning_rate": 4.2479286169534733e-07, - "loss": 0.0058, - "reward": 1.725312352180481, - "reward_std": 0.04575943201780319, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.600312352180481, - "rewards/pad": 0.125, - "step": 1805 - }, - { - "completion_length": 99.5625, - "epoch": 0.5755258126195029, - "grad_norm": 44.738346099853516, - "kl": 0.18359375, - "learning_rate": 4.2447418738049714e-07, - "loss": 0.0074, - "reward": 1.5814894437789917, - "reward_std": 0.10402372479438782, - "rewards/answer_reward": 0.15625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4252393841743469, - "step": 1806 - }, - { - "completion_length": 122.421875, - "epoch": 0.5758444869343531, - "grad_norm": 38.684932708740234, - "kl": 0.08837890625, - "learning_rate": 4.241555130656469e-07, - "loss": 0.0035, - "reward": 1.5361063480377197, - "reward_std": 0.0688786506652832, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5361063480377197, - "rewards/pad": 0.0, - "step": 1807 - }, - { - "completion_length": 97.640625, - "epoch": 0.5761631612492033, - "grad_norm": 117.1318130493164, - "kl": 0.28125, - "learning_rate": 4.238368387507967e-07, - "loss": 0.0112, - "reward": 1.5856540203094482, - "reward_std": 0.10018368065357208, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.570029079914093, - "rewards/pad": 0.015625, - "step": 1808 - }, - { - "completion_length": 172.546875, - "epoch": 0.5764818355640535, - "grad_norm": 19.1522216796875, - "kl": 0.09814453125, - "learning_rate": 4.2351816443594645e-07, - "loss": 0.0039, - "reward": 1.5147862434387207, - "reward_std": 0.03550882637500763, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5147862434387207, - "rewards/pad": 0.0, - "step": 1809 - }, - { - "completion_length": 95.53125, - "epoch": 0.5768005098789037, - "grad_norm": 53.894081115722656, - "kl": 0.1435546875, - "learning_rate": 4.2319949012109626e-07, - "loss": 0.0057, - "reward": 1.4878883361816406, - "reward_std": 0.05823700875043869, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4878883361816406, - "rewards/pad": 0.0, - "step": 1810 - }, - { - "completion_length": 122.390625, - "epoch": 0.577119184193754, - "grad_norm": 48.407161712646484, - "kl": 0.15625, - "learning_rate": 4.22880815806246e-07, - "loss": 0.0062, - "reward": 1.5016330480575562, - "reward_std": 0.08562350273132324, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5016330480575562, - "rewards/pad": 0.0, - "step": 1811 - }, - { - "completion_length": 70.59375, - "epoch": 0.5774378585086042, - "grad_norm": 52.78874206542969, - "kl": 0.1962890625, - "learning_rate": 4.225621414913958e-07, - "loss": 0.0079, - "reward": 1.5229122638702393, - "reward_std": 0.09713631123304367, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5229122042655945, - "rewards/pad": 0.0, - "step": 1812 - }, - { - "completion_length": 44.734375, - "epoch": 0.5777565328234544, - "grad_norm": 31.73822021484375, - "kl": 0.2275390625, - "learning_rate": 4.2224346717654557e-07, - "loss": 0.0091, - "reward": 1.7680362462997437, - "reward_std": 0.08835802972316742, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6430363655090332, - "rewards/pad": 0.125, - "step": 1813 - }, - { - "completion_length": 96.34375, - "epoch": 0.5780752071383046, - "grad_norm": 42.44234848022461, - "kl": 0.1318359375, - "learning_rate": 4.219247928616954e-07, - "loss": 0.0053, - "reward": 1.6047329902648926, - "reward_std": 0.1471775770187378, - "rewards/answer_reward": 0.09375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5109829902648926, - "step": 1814 - }, - { - "completion_length": 121.953125, - "epoch": 0.5783938814531548, - "grad_norm": 57.97667694091797, - "kl": 0.11279296875, - "learning_rate": 4.216061185468451e-07, - "loss": 0.0045, - "reward": 1.438378930091858, - "reward_std": 0.1234152764081955, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4540039300918579, - "rewards/pad": 0.0, - "step": 1815 - }, - { - "completion_length": 97.984375, - "epoch": 0.5787125557680051, - "grad_norm": 25.46637535095215, - "kl": 0.142578125, - "learning_rate": 4.2128744423199483e-07, - "loss": 0.0057, - "reward": 1.6345410346984863, - "reward_std": 0.05224389582872391, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5095410346984863, - "step": 1816 - }, - { - "completion_length": 122.8125, - "epoch": 0.5790312300828553, - "grad_norm": 27.115081787109375, - "kl": 0.162109375, - "learning_rate": 4.2096876991714464e-07, - "loss": 0.0065, - "reward": 1.7935987710952759, - "reward_std": 0.05079444497823715, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5435987114906311, - "rewards/pad": 0.25, - "step": 1817 - }, - { - "completion_length": 96.640625, - "epoch": 0.5793499043977055, - "grad_norm": 28.72382926940918, - "kl": 0.1494140625, - "learning_rate": 4.206500956022944e-07, - "loss": 0.006, - "reward": 1.5233402252197266, - "reward_std": 0.10524062067270279, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3983403444290161, - "rewards/pad": 0.125, - "step": 1818 - }, - { - "completion_length": 97.015625, - "epoch": 0.5796685787125557, - "grad_norm": 13.208295822143555, - "kl": 0.1806640625, - "learning_rate": 4.203314212874442e-07, - "loss": 0.0072, - "reward": 1.5535154342651367, - "reward_std": 0.09736257791519165, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42851537466049194, - "rewards/pad": 0.125, - "step": 1819 - }, - { - "completion_length": 69.984375, - "epoch": 0.579987253027406, - "grad_norm": 65.50917053222656, - "kl": 0.1953125, - "learning_rate": 4.2001274697259396e-07, - "loss": 0.0078, - "reward": 1.6009665727615356, - "reward_std": 0.048545848578214645, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6009665727615356, - "rewards/pad": 0.0, - "step": 1820 - }, - { - "completion_length": 97.625, - "epoch": 0.5803059273422562, - "grad_norm": 75.36929321289062, - "kl": 0.12158203125, - "learning_rate": 4.1969407265774376e-07, - "loss": 0.0049, - "reward": 1.6634126901626587, - "reward_std": 0.06822207570075989, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4290376901626587, - "step": 1821 - }, - { - "completion_length": 198.265625, - "epoch": 0.5806246016571064, - "grad_norm": 6.578020095825195, - "kl": 0.06640625, - "learning_rate": 4.193753983428935e-07, - "loss": 0.0026, - "reward": 1.4357388019561768, - "reward_std": 0.09460680186748505, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.45136383175849915, - "step": 1822 - }, - { - "completion_length": 95.703125, - "epoch": 0.5809432759719566, - "grad_norm": 50.63115310668945, - "kl": 0.142578125, - "learning_rate": 4.190567240280433e-07, - "loss": 0.0057, - "reward": 1.6899892091751099, - "reward_std": 0.05263391509652138, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5649892091751099, - "step": 1823 - }, - { - "completion_length": 94.921875, - "epoch": 0.5812619502868069, - "grad_norm": 44.11355209350586, - "kl": 0.26953125, - "learning_rate": 4.187380497131931e-07, - "loss": 0.0108, - "reward": 1.6089653968811035, - "reward_std": 0.09958036243915558, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.48396536707878113, - "step": 1824 - }, - { - "completion_length": 99.0, - "epoch": 0.5815806246016572, - "grad_norm": 17.71644401550293, - "kl": 0.11181640625, - "learning_rate": 4.184193753983429e-07, - "loss": 0.0045, - "reward": 1.6234378814697266, - "reward_std": 0.03895140066742897, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3734378516674042, - "rewards/pad": 0.25, - "step": 1825 - }, - { - "completion_length": 70.234375, - "epoch": 0.5818992989165074, - "grad_norm": 22.119213104248047, - "kl": 0.154296875, - "learning_rate": 4.1810070108349264e-07, - "loss": 0.0062, - "reward": 1.7264617681503296, - "reward_std": 0.08063302934169769, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6014617681503296, - "rewards/pad": 0.125, - "step": 1826 - }, - { - "completion_length": 96.515625, - "epoch": 0.5822179732313576, - "grad_norm": 19.158109664916992, - "kl": 0.2216796875, - "learning_rate": 4.1778202676864245e-07, - "loss": 0.0089, - "reward": 1.5653225183486938, - "reward_std": 0.11509357392787933, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5496974587440491, - "rewards/pad": 0.015625, - "step": 1827 - }, - { - "completion_length": 71.265625, - "epoch": 0.5825366475462078, - "grad_norm": 45.197513580322266, - "kl": 0.1435546875, - "learning_rate": 4.174633524537922e-07, - "loss": 0.0057, - "reward": 1.6790035963058472, - "reward_std": 0.13340117037296295, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5696284770965576, - "rewards/pad": 0.109375, - "step": 1828 - }, - { - "completion_length": 119.9375, - "epoch": 0.582855321861058, - "grad_norm": 15.074068069458008, - "kl": 0.2197265625, - "learning_rate": 4.17144678138942e-07, - "loss": 0.0088, - "reward": 1.643786907196045, - "reward_std": 0.06817853450775146, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6437869071960449, - "rewards/pad": 0.0, - "step": 1829 - }, - { - "completion_length": 67.296875, - "epoch": 0.5831739961759083, - "grad_norm": 94.0682601928711, - "kl": 0.2138671875, - "learning_rate": 4.1682600382409176e-07, - "loss": 0.0086, - "reward": 1.5750195980072021, - "reward_std": 0.1432129293680191, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5750196576118469, - "step": 1830 - }, - { - "completion_length": 68.46875, - "epoch": 0.5834926704907585, - "grad_norm": 38.54087829589844, - "kl": 0.1435546875, - "learning_rate": 4.1650732950924157e-07, - "loss": 0.0058, - "reward": 1.5484356880187988, - "reward_std": 0.0928826630115509, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5484358072280884, - "rewards/pad": 0.0, - "step": 1831 - }, - { - "completion_length": 97.8125, - "epoch": 0.5838113448056087, - "grad_norm": 19.759061813354492, - "kl": 0.10302734375, - "learning_rate": 4.161886551943913e-07, - "loss": 0.0041, - "reward": 1.604832410812378, - "reward_std": 0.09509650617837906, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5892074108123779, - "step": 1832 - }, - { - "completion_length": 149.71875, - "epoch": 0.5841300191204589, - "grad_norm": 22.152210235595703, - "kl": 0.0859375, - "learning_rate": 4.1586998087954113e-07, - "loss": 0.0034, - "reward": 1.5515859127044678, - "reward_std": 0.11550014466047287, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.44221100211143494, - "step": 1833 - }, - { - "completion_length": 119.109375, - "epoch": 0.5844486934353091, - "grad_norm": 56.19287109375, - "kl": 0.1962890625, - "learning_rate": 4.1555130656469083e-07, - "loss": 0.0078, - "reward": 1.4814128875732422, - "reward_std": 0.0679532140493393, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48141294717788696, - "rewards/pad": 0.0, - "step": 1834 - }, - { - "completion_length": 169.265625, - "epoch": 0.5847673677501594, - "grad_norm": 386.7119445800781, - "kl": 0.1533203125, - "learning_rate": 4.1523263224984064e-07, - "loss": 0.0061, - "reward": 1.359937071800232, - "reward_std": 0.09846100956201553, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.35993701219558716, - "step": 1835 - }, - { - "completion_length": 143.734375, - "epoch": 0.5850860420650096, - "grad_norm": 30.239673614501953, - "kl": 0.1572265625, - "learning_rate": 4.149139579349904e-07, - "loss": 0.0063, - "reward": 1.6132328510284424, - "reward_std": 0.06773644685745239, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4882327914237976, - "step": 1836 - }, - { - "completion_length": 93.5625, - "epoch": 0.5854047163798598, - "grad_norm": 36.47754669189453, - "kl": 0.173828125, - "learning_rate": 4.145952836201402e-07, - "loss": 0.0069, - "reward": 1.5574928522109985, - "reward_std": 0.06488293409347534, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5574928522109985, - "rewards/pad": 0.0, - "step": 1837 - }, - { - "completion_length": 121.03125, - "epoch": 0.58572339069471, - "grad_norm": 13.884906768798828, - "kl": 0.076171875, - "learning_rate": 4.1427660930528995e-07, - "loss": 0.003, - "reward": 1.6176261901855469, - "reward_std": 0.04010601341724396, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49262624979019165, - "step": 1838 - }, - { - "completion_length": 67.0, - "epoch": 0.5860420650095602, - "grad_norm": 24.440515518188477, - "kl": 0.2158203125, - "learning_rate": 4.1395793499043976e-07, - "loss": 0.0086, - "reward": 1.5812326669692993, - "reward_std": 0.08772775530815125, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5812326669692993, - "step": 1839 - }, - { - "completion_length": 95.125, - "epoch": 0.5863607393244105, - "grad_norm": 39.3325309753418, - "kl": 0.11669921875, - "learning_rate": 4.136392606755895e-07, - "loss": 0.0047, - "reward": 1.7650656700134277, - "reward_std": 0.06125748157501221, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6400656700134277, - "rewards/pad": 0.125, - "step": 1840 - }, - { - "completion_length": 118.09375, - "epoch": 0.5866794136392607, - "grad_norm": 9.982199668884277, - "kl": 0.1513671875, - "learning_rate": 4.133205863607393e-07, - "loss": 0.006, - "reward": 1.61601984500885, - "reward_std": 0.04292646422982216, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6160199046134949, - "rewards/pad": 0.0, - "step": 1841 - }, - { - "completion_length": 98.453125, - "epoch": 0.5869980879541109, - "grad_norm": 26.047142028808594, - "kl": 0.2177734375, - "learning_rate": 4.130019120458891e-07, - "loss": 0.0087, - "reward": 1.7720967531204224, - "reward_std": 0.0968465581536293, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5220966339111328, - "step": 1842 - }, - { - "completion_length": 119.5625, - "epoch": 0.5873167622689611, - "grad_norm": 33.48945617675781, - "kl": 0.09912109375, - "learning_rate": 4.126832377310389e-07, - "loss": 0.004, - "reward": 1.5689330101013184, - "reward_std": 0.04028594493865967, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5689329504966736, - "rewards/pad": 0.0, - "step": 1843 - }, - { - "completion_length": 97.828125, - "epoch": 0.5876354365838113, - "grad_norm": 79.5909194946289, - "kl": 0.134765625, - "learning_rate": 4.1236456341618864e-07, - "loss": 0.0054, - "reward": 1.5185623168945312, - "reward_std": 0.1128365620970726, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.37793731689453125, - "rewards/pad": 0.140625, - "step": 1844 - }, - { - "completion_length": 70.90625, - "epoch": 0.5879541108986616, - "grad_norm": 18.25775146484375, - "kl": 0.2197265625, - "learning_rate": 4.1204588910133844e-07, - "loss": 0.0088, - "reward": 1.4915564060211182, - "reward_std": 0.140813410282135, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5071814656257629, - "step": 1845 - }, - { - "completion_length": 146.21875, - "epoch": 0.5882727852135118, - "grad_norm": 6.8526716232299805, - "kl": 0.1005859375, - "learning_rate": 4.117272147864882e-07, - "loss": 0.004, - "reward": 1.633963942527771, - "reward_std": 0.04471247270703316, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.508963942527771, - "step": 1846 - }, - { - "completion_length": 96.828125, - "epoch": 0.588591459528362, - "grad_norm": 11.51022720336914, - "kl": 0.1572265625, - "learning_rate": 4.1140854047163795e-07, - "loss": 0.0063, - "reward": 1.5092898607254028, - "reward_std": 0.05117335915565491, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5092898607254028, - "step": 1847 - }, - { - "completion_length": 70.953125, - "epoch": 0.5889101338432122, - "grad_norm": 28.110187530517578, - "kl": 0.1591796875, - "learning_rate": 4.1108986615678776e-07, - "loss": 0.0064, - "reward": 1.651709794998169, - "reward_std": 0.06477080285549164, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.651709794998169, - "step": 1848 - }, - { - "completion_length": 145.328125, - "epoch": 0.5892288081580624, - "grad_norm": 20.159751892089844, - "kl": 0.119140625, - "learning_rate": 4.107711918419375e-07, - "loss": 0.0048, - "reward": 1.5828752517700195, - "reward_std": 0.07095219939947128, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4578753411769867, - "step": 1849 - }, - { - "completion_length": 121.828125, - "epoch": 0.5895474824729127, - "grad_norm": 421.111572265625, - "kl": 0.125, - "learning_rate": 4.104525175270873e-07, - "loss": 0.005, - "reward": 1.7188516855239868, - "reward_std": 0.06404486298561096, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4688516855239868, - "step": 1850 - }, - { - "completion_length": 100.515625, - "epoch": 0.5898661567877629, - "grad_norm": 14.096718788146973, - "kl": 0.1611328125, - "learning_rate": 4.1013384321223707e-07, - "loss": 0.0064, - "reward": 1.4676356315612793, - "reward_std": 0.07982802391052246, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3426356315612793, - "rewards/pad": 0.125, - "step": 1851 - }, - { - "completion_length": 97.59375, - "epoch": 0.5901848311026131, - "grad_norm": 31.308773040771484, - "kl": 0.1376953125, - "learning_rate": 4.098151688973869e-07, - "loss": 0.0055, - "reward": 1.7079538106918335, - "reward_std": 0.1335076093673706, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5673288106918335, - "step": 1852 - }, - { - "completion_length": 118.609375, - "epoch": 0.5905035054174633, - "grad_norm": 28.14283561706543, - "kl": 0.1337890625, - "learning_rate": 4.0949649458253663e-07, - "loss": 0.0053, - "reward": 1.4441075325012207, - "reward_std": 0.06789509207010269, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4441075325012207, - "rewards/pad": 0.0, - "step": 1853 - }, - { - "completion_length": 95.78125, - "epoch": 0.5908221797323135, - "grad_norm": 20.888446807861328, - "kl": 0.1435546875, - "learning_rate": 4.091778202676864e-07, - "loss": 0.0057, - "reward": 1.7055909633636475, - "reward_std": 0.11922048777341843, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5962159037590027, - "rewards/pad": 0.125, - "step": 1854 - }, - { - "completion_length": 45.734375, - "epoch": 0.5911408540471638, - "grad_norm": 49.40755844116211, - "kl": 0.197265625, - "learning_rate": 4.0885914595283614e-07, - "loss": 0.0079, - "reward": 1.4927027225494385, - "reward_std": 0.22029918432235718, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3520776927471161, - "rewards/pad": 0.140625, - "step": 1855 - }, - { - "completion_length": 93.4375, - "epoch": 0.591459528362014, - "grad_norm": 51.1677131652832, - "kl": 0.1650390625, - "learning_rate": 4.0854047163798595e-07, - "loss": 0.0066, - "reward": 1.5748016834259033, - "reward_std": 0.07330948114395142, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5748017430305481, - "step": 1856 - }, - { - "completion_length": 70.703125, - "epoch": 0.5917782026768642, - "grad_norm": 38.13994598388672, - "kl": 0.1826171875, - "learning_rate": 4.082217973231357e-07, - "loss": 0.0073, - "reward": 1.5587074756622314, - "reward_std": 0.07806047797203064, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5587073564529419, - "step": 1857 - }, - { - "completion_length": 71.9375, - "epoch": 0.5920968769917144, - "grad_norm": 18.738344192504883, - "kl": 0.1533203125, - "learning_rate": 4.079031230082855e-07, - "loss": 0.0062, - "reward": 1.5912575721740723, - "reward_std": 0.1442650705575943, - "rewards/answer_reward": 0.0625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5287575721740723, - "step": 1858 - }, - { - "completion_length": 70.90625, - "epoch": 0.5924155513065646, - "grad_norm": 56.776798248291016, - "kl": 0.201171875, - "learning_rate": 4.0758444869343526e-07, - "loss": 0.0081, - "reward": 1.5838427543640137, - "reward_std": 0.06699131429195404, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5838428139686584, - "step": 1859 - }, - { - "completion_length": 96.625, - "epoch": 0.5927342256214149, - "grad_norm": 32.76490783691406, - "kl": 0.1318359375, - "learning_rate": 4.0726577437858507e-07, - "loss": 0.0053, - "reward": 1.5736356973648071, - "reward_std": 0.09358209371566772, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4486357271671295, - "rewards/pad": 0.125, - "step": 1860 - }, - { - "completion_length": 44.96875, - "epoch": 0.5930528999362651, - "grad_norm": 28.009550094604492, - "kl": 0.359375, - "learning_rate": 4.069471000637348e-07, - "loss": 0.0144, - "reward": 1.483938455581665, - "reward_std": 0.12422734498977661, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4526885151863098, - "rewards/pad": 0.03125, - "step": 1861 - }, - { - "completion_length": 122.171875, - "epoch": 0.5933715742511153, - "grad_norm": 17.696401596069336, - "kl": 0.1337890625, - "learning_rate": 4.0662842574888463e-07, - "loss": 0.0054, - "reward": 1.4019179344177246, - "reward_std": 0.05144944787025452, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40191787481307983, - "rewards/pad": 0.0, - "step": 1862 - }, - { - "completion_length": 120.59375, - "epoch": 0.5936902485659655, - "grad_norm": 28.02484703063965, - "kl": 0.1103515625, - "learning_rate": 4.063097514340344e-07, - "loss": 0.0044, - "reward": 1.4743096828460693, - "reward_std": 0.06880053877830505, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4743097126483917, - "step": 1863 - }, - { - "completion_length": 19.40625, - "epoch": 0.5940089228808159, - "grad_norm": 44.00604248046875, - "kl": 0.359375, - "learning_rate": 4.059910771191842e-07, - "loss": 0.0144, - "reward": 1.7003247737884521, - "reward_std": 0.10178287327289581, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5753247737884521, - "rewards/pad": 0.125, - "step": 1864 - }, - { - "completion_length": 98.078125, - "epoch": 0.5943275971956661, - "grad_norm": 22.15548324584961, - "kl": 0.1533203125, - "learning_rate": 4.0567240280433395e-07, - "loss": 0.0061, - "reward": 1.7335445880889893, - "reward_std": 0.10923653095960617, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5147947072982788, - "step": 1865 - }, - { - "completion_length": 121.0, - "epoch": 0.5946462715105163, - "grad_norm": 11.890228271484375, - "kl": 0.126953125, - "learning_rate": 4.0535372848948375e-07, - "loss": 0.0051, - "reward": 1.6787242889404297, - "reward_std": 0.07239396870136261, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6787242889404297, - "step": 1866 - }, - { - "completion_length": 71.21875, - "epoch": 0.5949649458253665, - "grad_norm": 42.6705322265625, - "kl": 0.1357421875, - "learning_rate": 4.050350541746335e-07, - "loss": 0.0054, - "reward": 1.7034825086593628, - "reward_std": 0.11921627819538116, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.7191076278686523, - "step": 1867 - }, - { - "completion_length": 122.421875, - "epoch": 0.5952836201402167, - "grad_norm": 11.307313919067383, - "kl": 0.1923828125, - "learning_rate": 4.047163798597833e-07, - "loss": 0.0077, - "reward": 1.6598554849624634, - "reward_std": 0.055303771048784256, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40985557436943054, - "rewards/pad": 0.25, - "step": 1868 - }, - { - "completion_length": 45.5, - "epoch": 0.595602294455067, - "grad_norm": 36.23927307128906, - "kl": 0.1865234375, - "learning_rate": 4.0439770554493307e-07, - "loss": 0.0074, - "reward": 1.6517199277877808, - "reward_std": 0.14537964761257172, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5423449277877808, - "step": 1869 - }, - { - "completion_length": 122.71875, - "epoch": 0.5959209687699172, - "grad_norm": 68.48987579345703, - "kl": 0.1298828125, - "learning_rate": 4.040790312300829e-07, - "loss": 0.0052, - "reward": 1.3829237222671509, - "reward_std": 0.1671084463596344, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3360486924648285, - "step": 1870 - }, - { - "completion_length": 93.859375, - "epoch": 0.5962396430847674, - "grad_norm": 73.4350357055664, - "kl": 0.2216796875, - "learning_rate": 4.0376035691523263e-07, - "loss": 0.0089, - "reward": 1.7129826545715332, - "reward_std": 0.11085135489702225, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5879825949668884, - "step": 1871 - }, - { - "completion_length": 121.109375, - "epoch": 0.5965583173996176, - "grad_norm": 10.7921724319458, - "kl": 0.203125, - "learning_rate": 4.0344168260038244e-07, - "loss": 0.0082, - "reward": 1.5458829402923584, - "reward_std": 0.05583937466144562, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4208829998970032, - "rewards/pad": 0.125, - "step": 1872 - }, - { - "completion_length": 146.875, - "epoch": 0.5968769917144678, - "grad_norm": 14.976286888122559, - "kl": 0.0986328125, - "learning_rate": 4.0312300828553214e-07, - "loss": 0.0039, - "reward": 1.5083661079406738, - "reward_std": 0.05651112645864487, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5083661675453186, - "step": 1873 - }, - { - "completion_length": 46.0625, - "epoch": 0.5971956660293181, - "grad_norm": 99.23194122314453, - "kl": 0.177734375, - "learning_rate": 4.0280433397068195e-07, - "loss": 0.0071, - "reward": 1.5729570388793945, - "reward_std": 0.16992275416851044, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44795697927474976, - "rewards/pad": 0.125, - "step": 1874 - }, - { - "completion_length": 69.484375, - "epoch": 0.5975143403441683, - "grad_norm": 12.070338249206543, - "kl": 0.1484375, - "learning_rate": 4.024856596558317e-07, - "loss": 0.0059, - "reward": 1.510481357574463, - "reward_std": 0.04901345819234848, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5104814171791077, - "step": 1875 - }, - { - "completion_length": 123.3125, - "epoch": 0.5978330146590185, - "grad_norm": 76.7409439086914, - "kl": 0.09814453125, - "learning_rate": 4.0216698534098145e-07, - "loss": 0.0039, - "reward": 1.4769536256790161, - "reward_std": 0.08186715096235275, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3675785958766937, - "rewards/pad": 0.109375, - "step": 1876 - }, - { - "completion_length": 46.25, - "epoch": 0.5981516889738687, - "grad_norm": 47.11528396606445, - "kl": 0.19140625, - "learning_rate": 4.0184831102613126e-07, - "loss": 0.0077, - "reward": 1.580622673034668, - "reward_std": 0.13584300875663757, - "rewards/answer_reward": 0.15625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4243726134300232, - "step": 1877 - }, - { - "completion_length": 70.234375, - "epoch": 0.5984703632887189, - "grad_norm": 39.12364959716797, - "kl": 0.2890625, - "learning_rate": 4.01529636711281e-07, - "loss": 0.0115, - "reward": 1.7271225452423096, - "reward_std": 0.11779257655143738, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6177475452423096, - "rewards/pad": 0.109375, - "step": 1878 - }, - { - "completion_length": 43.578125, - "epoch": 0.5987890376035692, - "grad_norm": 39.20248031616211, - "kl": 0.27734375, - "learning_rate": 4.012109623964308e-07, - "loss": 0.0111, - "reward": 1.6702015399932861, - "reward_std": 0.08535203337669373, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6702016592025757, - "rewards/pad": 0.0, - "step": 1879 - }, - { - "completion_length": 122.34375, - "epoch": 0.5991077119184194, - "grad_norm": 34.952552795410156, - "kl": 0.11328125, - "learning_rate": 4.008922880815806e-07, - "loss": 0.0045, - "reward": 1.4387636184692383, - "reward_std": 0.04700171947479248, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43876367807388306, - "rewards/pad": 0.0, - "step": 1880 - }, - { - "completion_length": 71.3125, - "epoch": 0.5994263862332696, - "grad_norm": 30.318069458007812, - "kl": 0.1259765625, - "learning_rate": 4.005736137667304e-07, - "loss": 0.005, - "reward": 1.5658936500549316, - "reward_std": 0.05282466858625412, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44089367985725403, - "step": 1881 - }, - { - "completion_length": 94.4375, - "epoch": 0.5997450605481198, - "grad_norm": 12.464398384094238, - "kl": 0.181640625, - "learning_rate": 4.0025493945188014e-07, - "loss": 0.0073, - "reward": 1.4137916564941406, - "reward_std": 0.07801970094442368, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4137916564941406, - "rewards/pad": 0.0, - "step": 1882 - }, - { - "completion_length": 71.21875, - "epoch": 0.60006373486297, - "grad_norm": 67.10906982421875, - "kl": 0.1650390625, - "learning_rate": 3.9993626513702994e-07, - "loss": 0.0066, - "reward": 1.7139067649841309, - "reward_std": 0.11026249825954437, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4951567053794861, - "rewards/pad": 0.21875, - "step": 1883 - }, - { - "completion_length": 98.40625, - "epoch": 0.6003824091778203, - "grad_norm": 29.317354202270508, - "kl": 0.14453125, - "learning_rate": 3.996175908221797e-07, - "loss": 0.0058, - "reward": 1.4955264329910278, - "reward_std": 0.06369823217391968, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.49552643299102783, - "step": 1884 - }, - { - "completion_length": 144.078125, - "epoch": 0.6007010834926705, - "grad_norm": 20.339582443237305, - "kl": 0.1142578125, - "learning_rate": 3.992989165073295e-07, - "loss": 0.0046, - "reward": 1.5659692287445068, - "reward_std": 0.11166795343160629, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48784422874450684, - "rewards/pad": 0.078125, - "step": 1885 - }, - { - "completion_length": 67.4375, - "epoch": 0.6010197578075207, - "grad_norm": 66.48014831542969, - "kl": 0.1630859375, - "learning_rate": 3.9898024219247926e-07, - "loss": 0.0065, - "reward": 1.6489007472991943, - "reward_std": 0.11646482348442078, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6489008665084839, - "rewards/pad": 0.0, - "step": 1886 - }, - { - "completion_length": 122.890625, - "epoch": 0.6013384321223709, - "grad_norm": 35.8299560546875, - "kl": 0.09619140625, - "learning_rate": 3.9866156787762907e-07, - "loss": 0.0039, - "reward": 1.6577228307724, - "reward_std": 0.06755958497524261, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5327227711677551, - "step": 1887 - }, - { - "completion_length": 43.4375, - "epoch": 0.6016571064372211, - "grad_norm": 18.268341064453125, - "kl": 0.1474609375, - "learning_rate": 3.983428935627788e-07, - "loss": 0.0059, - "reward": 1.5191141366958618, - "reward_std": 0.050555162131786346, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5191141366958618, - "step": 1888 - }, - { - "completion_length": 97.578125, - "epoch": 0.6019757807520714, - "grad_norm": 21.400203704833984, - "kl": 0.1669921875, - "learning_rate": 3.980242192479286e-07, - "loss": 0.0067, - "reward": 1.8046271800994873, - "reward_std": 0.16014282405376434, - "rewards/answer_reward": 0.296875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5077521800994873, - "step": 1889 - }, - { - "completion_length": 44.578125, - "epoch": 0.6022944550669216, - "grad_norm": 42.205665588378906, - "kl": 0.609375, - "learning_rate": 3.977055449330784e-07, - "loss": 0.0244, - "reward": 1.6509807109832764, - "reward_std": 0.11738397926092148, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6509807705879211, - "rewards/pad": 0.0, - "step": 1890 - }, - { - "completion_length": 122.078125, - "epoch": 0.6026131293817718, - "grad_norm": 23.141324996948242, - "kl": 0.10986328125, - "learning_rate": 3.973868706182282e-07, - "loss": 0.0044, - "reward": 1.5159111022949219, - "reward_std": 0.09590984135866165, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3909110724925995, - "step": 1891 - }, - { - "completion_length": 121.40625, - "epoch": 0.602931803696622, - "grad_norm": 20.430015563964844, - "kl": 0.087890625, - "learning_rate": 3.9706819630337794e-07, - "loss": 0.0035, - "reward": 1.6643826961517334, - "reward_std": 0.09546613693237305, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.5550077557563782, - "step": 1892 - }, - { - "completion_length": 44.4375, - "epoch": 0.6032504780114722, - "grad_norm": 54.14018249511719, - "kl": 0.16015625, - "learning_rate": 3.967495219885277e-07, - "loss": 0.0064, - "reward": 1.2324488162994385, - "reward_std": 0.07691571861505508, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2324487566947937, - "step": 1893 - }, - { - "completion_length": 45.84375, - "epoch": 0.6035691523263225, - "grad_norm": 68.19710540771484, - "kl": 0.23046875, - "learning_rate": 3.9643084767367745e-07, - "loss": 0.0092, - "reward": 1.6674814224243164, - "reward_std": 0.10494668036699295, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.526856541633606, - "rewards/pad": 0.140625, - "step": 1894 - }, - { - "completion_length": 171.859375, - "epoch": 0.6038878266411727, - "grad_norm": 10.1597318649292, - "kl": 0.080078125, - "learning_rate": 3.9611217335882726e-07, - "loss": 0.0032, - "reward": 1.592544436454773, - "reward_std": 0.053140126168727875, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4675444960594177, - "step": 1895 - }, - { - "completion_length": 92.71875, - "epoch": 0.6042065009560229, - "grad_norm": 21.789133071899414, - "kl": 0.2353515625, - "learning_rate": 3.95793499043977e-07, - "loss": 0.0094, - "reward": 1.6179006099700928, - "reward_std": 0.0610184520483017, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6179004907608032, - "rewards/pad": 0.0, - "step": 1896 - }, - { - "completion_length": 45.96875, - "epoch": 0.6045251752708731, - "grad_norm": 208.56423950195312, - "kl": 0.1650390625, - "learning_rate": 3.954748247291268e-07, - "loss": 0.0066, - "reward": 1.7036913633346558, - "reward_std": 0.13579218089580536, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46931636333465576, - "step": 1897 - }, - { - "completion_length": 97.1875, - "epoch": 0.6048438495857233, - "grad_norm": 21.571367263793945, - "kl": 0.1484375, - "learning_rate": 3.9515615041427657e-07, - "loss": 0.0059, - "reward": 1.5167163610458374, - "reward_std": 0.10557783395051956, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4073413908481598, - "rewards/pad": 0.109375, - "step": 1898 - }, - { - "completion_length": 45.796875, - "epoch": 0.6051625239005736, - "grad_norm": 17.291685104370117, - "kl": 0.1533203125, - "learning_rate": 3.948374760994264e-07, - "loss": 0.0061, - "reward": 1.7278497219085693, - "reward_std": 0.10903044790029526, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43097472190856934, - "rewards/pad": 0.296875, - "step": 1899 - }, - { - "completion_length": 93.859375, - "epoch": 0.6054811982154238, - "grad_norm": 23.579120635986328, - "kl": 0.11376953125, - "learning_rate": 3.9451880178457613e-07, - "loss": 0.0046, - "reward": 1.507385015487671, - "reward_std": 0.03943081945180893, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5073848962783813, - "rewards/pad": 0.0, - "step": 1900 - }, - { - "completion_length": 120.734375, - "epoch": 0.605799872530274, - "grad_norm": 100.17448425292969, - "kl": 0.09130859375, - "learning_rate": 3.9420012746972594e-07, - "loss": 0.0037, - "reward": 1.5422768592834473, - "reward_std": 0.13402345776557922, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.43290185928344727, - "rewards/pad": 0.125, - "step": 1901 - }, - { - "completion_length": 45.484375, - "epoch": 0.6061185468451242, - "grad_norm": 29.960926055908203, - "kl": 0.2216796875, - "learning_rate": 3.938814531548757e-07, - "loss": 0.0089, - "reward": 1.6506834030151367, - "reward_std": 0.08238563686609268, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5256834030151367, - "rewards/pad": 0.125, - "step": 1902 - }, - { - "completion_length": 46.140625, - "epoch": 0.6064372211599746, - "grad_norm": 43.66762924194336, - "kl": 0.20703125, - "learning_rate": 3.935627788400255e-07, - "loss": 0.0083, - "reward": 1.7490934133529663, - "reward_std": 0.1362352967262268, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5459684729576111, - "rewards/pad": 0.203125, - "step": 1903 - }, - { - "completion_length": 146.453125, - "epoch": 0.6067558954748248, - "grad_norm": 7.266249656677246, - "kl": 0.10205078125, - "learning_rate": 3.9324410452517525e-07, - "loss": 0.0041, - "reward": 1.4332079887390137, - "reward_std": 0.03245477378368378, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43320807814598083, - "rewards/pad": 0.0, - "step": 1904 - }, - { - "completion_length": 99.109375, - "epoch": 0.607074569789675, - "grad_norm": 14.11192512512207, - "kl": 0.1630859375, - "learning_rate": 3.9292543021032506e-07, - "loss": 0.0065, - "reward": 1.5477190017700195, - "reward_std": 0.11120744049549103, - "rewards/pad": 0.359375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.18834398686885834, - "step": 1905 - }, - { - "completion_length": 73.515625, - "epoch": 0.6073932441045252, - "grad_norm": 15.46536922454834, - "kl": 0.10791015625, - "learning_rate": 3.926067558954748e-07, - "loss": 0.0043, - "reward": 1.7275207042694092, - "reward_std": 0.0868614986538887, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3525207042694092, - "rewards/pad": 0.375, - "step": 1906 - }, - { - "completion_length": 47.078125, - "epoch": 0.6077119184193754, - "grad_norm": 63.19918441772461, - "kl": 0.171875, - "learning_rate": 3.922880815806246e-07, - "loss": 0.0069, - "reward": 1.7599387168884277, - "reward_std": 0.17778722941875458, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5568135976791382, - "rewards/pad": 0.203125, - "step": 1907 - }, - { - "completion_length": 96.875, - "epoch": 0.6080305927342257, - "grad_norm": 21.277603149414062, - "kl": 0.1337890625, - "learning_rate": 3.919694072657744e-07, - "loss": 0.0054, - "reward": 1.4872418642044067, - "reward_std": 0.0728248655796051, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.48724186420440674, - "step": 1908 - }, - { - "completion_length": 98.390625, - "epoch": 0.6083492670490759, - "grad_norm": 30.763784408569336, - "kl": 0.12890625, - "learning_rate": 3.9165073295092413e-07, - "loss": 0.0051, - "reward": 1.6191275119781494, - "reward_std": 0.154721200466156, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.4160025119781494, - "step": 1909 - }, - { - "completion_length": 70.1875, - "epoch": 0.6086679413639261, - "grad_norm": 26.556743621826172, - "kl": 0.173828125, - "learning_rate": 3.9133205863607394e-07, - "loss": 0.007, - "reward": 1.53456711769104, - "reward_std": 0.06957153975963593, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.53456711769104, - "rewards/pad": 0.0, - "step": 1910 - }, - { - "completion_length": 96.671875, - "epoch": 0.6089866156787763, - "grad_norm": 117.11419677734375, - "kl": 0.1240234375, - "learning_rate": 3.910133843212237e-07, - "loss": 0.005, - "reward": 1.5257726907730103, - "reward_std": 0.04365560784935951, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5257725715637207, - "step": 1911 - }, - { - "completion_length": 121.6875, - "epoch": 0.6093052899936265, - "grad_norm": 31.417213439941406, - "kl": 0.11767578125, - "learning_rate": 3.906947100063735e-07, - "loss": 0.0047, - "reward": 1.5414679050445557, - "reward_std": 0.13280373811721802, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4320930540561676, - "step": 1912 - }, - { - "completion_length": 97.671875, - "epoch": 0.6096239643084768, - "grad_norm": 61.08513641357422, - "kl": 0.12255859375, - "learning_rate": 3.903760356915232e-07, - "loss": 0.0049, - "reward": 1.5116188526153564, - "reward_std": 0.11132874339818954, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.527243971824646, - "rewards/pad": 0.0, - "step": 1913 - }, - { - "completion_length": 71.25, - "epoch": 0.609942638623327, - "grad_norm": 20.120527267456055, - "kl": 0.2451171875, - "learning_rate": 3.90057361376673e-07, - "loss": 0.0098, - "reward": 1.551047921180725, - "reward_std": 0.07995973527431488, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5510479807853699, - "rewards/pad": 0.0, - "step": 1914 - }, - { - "completion_length": 95.0625, - "epoch": 0.6102613129381772, - "grad_norm": 20.293169021606445, - "kl": 0.404296875, - "learning_rate": 3.8973868706182276e-07, - "loss": 0.0161, - "reward": 1.6185961961746216, - "reward_std": 0.10013433545827866, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6185961365699768, - "step": 1915 - }, - { - "completion_length": 95.453125, - "epoch": 0.6105799872530274, - "grad_norm": 23.486623764038086, - "kl": 0.10400390625, - "learning_rate": 3.8942001274697257e-07, - "loss": 0.0042, - "reward": 1.4344849586486816, - "reward_std": 0.10762172937393188, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43448498845100403, - "rewards/pad": 0.0, - "step": 1916 - }, - { - "completion_length": 69.96875, - "epoch": 0.6108986615678776, - "grad_norm": 17.362545013427734, - "kl": 0.244140625, - "learning_rate": 3.891013384321223e-07, - "loss": 0.0098, - "reward": 1.6059834957122803, - "reward_std": 0.11794085055589676, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4966084659099579, - "rewards/pad": 0.109375, - "step": 1917 - }, - { - "completion_length": 123.125, - "epoch": 0.6112173358827279, - "grad_norm": 22.448604583740234, - "kl": 0.12109375, - "learning_rate": 3.8878266411727213e-07, - "loss": 0.0048, - "reward": 1.3911631107330322, - "reward_std": 0.04575665667653084, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3911631107330322, - "step": 1918 - }, - { - "completion_length": 146.09375, - "epoch": 0.6115360101975781, - "grad_norm": 18.462223052978516, - "kl": 0.08447265625, - "learning_rate": 3.884639898024219e-07, - "loss": 0.0034, - "reward": 1.53928542137146, - "reward_std": 0.04528950899839401, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.53928542137146, - "step": 1919 - }, - { - "completion_length": 72.078125, - "epoch": 0.6118546845124283, - "grad_norm": 22.600019454956055, - "kl": 0.1318359375, - "learning_rate": 3.881453154875717e-07, - "loss": 0.0053, - "reward": 1.585951805114746, - "reward_std": 0.12895137071609497, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.6015768647193909, - "step": 1920 - }, - { - "completion_length": 123.5, - "epoch": 0.6121733588272785, - "grad_norm": 17.322956085205078, - "kl": 0.1640625, - "learning_rate": 3.8782664117272144e-07, - "loss": 0.0066, - "reward": 1.679431438446045, - "reward_std": 0.10875079780817032, - "rewards/answer_reward": 0.15625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5231814384460449, - "step": 1921 - }, - { - "completion_length": 96.921875, - "epoch": 0.6124920331421287, - "grad_norm": 13.455272674560547, - "kl": 0.138671875, - "learning_rate": 3.8750796685787125e-07, - "loss": 0.0055, - "reward": 1.5285980701446533, - "reward_std": 0.06912402808666229, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40359804034233093, - "rewards/pad": 0.125, - "step": 1922 - }, - { - "completion_length": 70.796875, - "epoch": 0.612810707456979, - "grad_norm": 16.77520179748535, - "kl": 0.263671875, - "learning_rate": 3.87189292543021e-07, - "loss": 0.0106, - "reward": 1.5207405090332031, - "reward_std": 0.07428693771362305, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5207405686378479, - "step": 1923 - }, - { - "completion_length": 45.15625, - "epoch": 0.6131293817718292, - "grad_norm": 72.12005615234375, - "kl": 0.1337890625, - "learning_rate": 3.868706182281708e-07, - "loss": 0.0053, - "reward": 1.4954227209091187, - "reward_std": 0.08774071931838989, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49542272090911865, - "rewards/pad": 0.0, - "step": 1924 - }, - { - "completion_length": 119.359375, - "epoch": 0.6134480560866794, - "grad_norm": 45.66981506347656, - "kl": 0.0908203125, - "learning_rate": 3.8655194391332057e-07, - "loss": 0.0036, - "reward": 1.488682508468628, - "reward_std": 0.05444840341806412, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4886825680732727, - "rewards/pad": 0.0, - "step": 1925 - }, - { - "completion_length": 70.65625, - "epoch": 0.6137667304015296, - "grad_norm": 43.318458557128906, - "kl": 0.142578125, - "learning_rate": 3.8623326959847037e-07, - "loss": 0.0057, - "reward": 1.457085371017456, - "reward_std": 0.07869704812765121, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45708543062210083, - "rewards/pad": 0.0, - "step": 1926 - }, - { - "completion_length": 101.078125, - "epoch": 0.6140854047163798, - "grad_norm": 50.392730712890625, - "kl": 0.1103515625, - "learning_rate": 3.8591459528362013e-07, - "loss": 0.0044, - "reward": 1.7335388660430908, - "reward_std": 0.1704206019639969, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4366638660430908, - "rewards/pad": 0.296875, - "step": 1927 - }, - { - "completion_length": 73.296875, - "epoch": 0.6144040790312301, - "grad_norm": 44.92176818847656, - "kl": 0.12109375, - "learning_rate": 3.8559592096876993e-07, - "loss": 0.0048, - "reward": 1.5867115259170532, - "reward_std": 0.18084655702114105, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.446086585521698, - "rewards/pad": 0.140625, - "step": 1928 - }, - { - "completion_length": 123.96875, - "epoch": 0.6147227533460803, - "grad_norm": 11.913430213928223, - "kl": 0.154296875, - "learning_rate": 3.852772466539197e-07, - "loss": 0.0062, - "reward": 1.5406157970428467, - "reward_std": 0.10338909924030304, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5406157970428467, - "step": 1929 - }, - { - "completion_length": 72.4375, - "epoch": 0.6150414276609305, - "grad_norm": 16.6257381439209, - "kl": 0.1630859375, - "learning_rate": 3.849585723390695e-07, - "loss": 0.0065, - "reward": 1.632989525794983, - "reward_std": 0.10083861649036407, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6329895257949829, - "rewards/pad": 0.0, - "step": 1930 - }, - { - "completion_length": 95.296875, - "epoch": 0.6153601019757807, - "grad_norm": 109.17361450195312, - "kl": 0.1376953125, - "learning_rate": 3.8463989802421925e-07, - "loss": 0.0055, - "reward": 1.5289793014526367, - "reward_std": 0.09404543787240982, - "rewards/answer_reward": 0.03125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4977293312549591, - "step": 1931 - }, - { - "completion_length": 123.578125, - "epoch": 0.615678776290631, - "grad_norm": 16.52717399597168, - "kl": 0.2021484375, - "learning_rate": 3.84321223709369e-07, - "loss": 0.0081, - "reward": 1.5912632942199707, - "reward_std": 0.07738955318927765, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5912632346153259, - "step": 1932 - }, - { - "completion_length": 122.984375, - "epoch": 0.6159974506054812, - "grad_norm": 22.1501522064209, - "kl": 0.1318359375, - "learning_rate": 3.8400254939451876e-07, - "loss": 0.0053, - "reward": 1.4621679782867432, - "reward_std": 0.07366758584976196, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46216797828674316, - "step": 1933 - }, - { - "completion_length": 97.1875, - "epoch": 0.6163161249203314, - "grad_norm": 21.725025177001953, - "kl": 0.1064453125, - "learning_rate": 3.8368387507966856e-07, - "loss": 0.0043, - "reward": 1.3546940088272095, - "reward_std": 0.06960678845643997, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3546939492225647, - "step": 1934 - }, - { - "completion_length": 71.34375, - "epoch": 0.6166347992351816, - "grad_norm": 24.456161499023438, - "kl": 0.1630859375, - "learning_rate": 3.833652007648183e-07, - "loss": 0.0065, - "reward": 1.5506436824798584, - "reward_std": 0.09592757374048233, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4256436228752136, - "step": 1935 - }, - { - "completion_length": 72.5, - "epoch": 0.6169534735500318, - "grad_norm": 20.02235221862793, - "kl": 0.146484375, - "learning_rate": 3.830465264499681e-07, - "loss": 0.0059, - "reward": 1.6763842105865479, - "reward_std": 0.05954057723283768, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5513842105865479, - "rewards/pad": 0.125, - "step": 1936 - }, - { - "completion_length": 97.84375, - "epoch": 0.617272147864882, - "grad_norm": 116.22586059570312, - "kl": 0.125, - "learning_rate": 3.827278521351179e-07, - "loss": 0.005, - "reward": 1.5901174545288086, - "reward_std": 0.050210874527692795, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5901174545288086, - "step": 1937 - }, - { - "completion_length": 99.4375, - "epoch": 0.6175908221797323, - "grad_norm": 33.92824935913086, - "kl": 0.1708984375, - "learning_rate": 3.8240917782026763e-07, - "loss": 0.0068, - "reward": 1.612595796585083, - "reward_std": 0.053627826273441315, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48759573698043823, - "rewards/pad": 0.125, - "step": 1938 - }, - { - "completion_length": 123.0625, - "epoch": 0.6179094964945825, - "grad_norm": 21.3590087890625, - "kl": 0.0830078125, - "learning_rate": 3.8209050350541744e-07, - "loss": 0.0033, - "reward": 1.4808800220489502, - "reward_std": 0.07551354914903641, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4652549922466278, - "step": 1939 - }, - { - "completion_length": 121.28125, - "epoch": 0.6182281708094327, - "grad_norm": 31.41773223876953, - "kl": 0.166015625, - "learning_rate": 3.817718291905672e-07, - "loss": 0.0066, - "reward": 1.5100470781326294, - "reward_std": 0.08721407502889633, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5100470781326294, - "step": 1940 - }, - { - "completion_length": 45.234375, - "epoch": 0.6185468451242829, - "grad_norm": 104.99120330810547, - "kl": 0.140625, - "learning_rate": 3.81453154875717e-07, - "loss": 0.0056, - "reward": 1.445270299911499, - "reward_std": 0.1404525339603424, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4452703297138214, - "step": 1941 - }, - { - "completion_length": 100.296875, - "epoch": 0.6188655194391333, - "grad_norm": 37.74537658691406, - "kl": 0.2314453125, - "learning_rate": 3.8113448056086675e-07, - "loss": 0.0092, - "reward": 1.700887680053711, - "reward_std": 0.08685436844825745, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4508877396583557, - "rewards/pad": 0.25, - "step": 1942 - }, - { - "completion_length": 18.375, - "epoch": 0.6191841937539835, - "grad_norm": 22.813129425048828, - "kl": 0.2216796875, - "learning_rate": 3.8081580624601656e-07, - "loss": 0.0088, - "reward": 1.7529795169830322, - "reward_std": 0.08033765107393265, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7529796361923218, - "rewards/pad": 0.0, - "step": 1943 - }, - { - "completion_length": 148.234375, - "epoch": 0.6195028680688337, - "grad_norm": 93.70587921142578, - "kl": 0.0859375, - "learning_rate": 3.804971319311663e-07, - "loss": 0.0034, - "reward": 1.4083514213562012, - "reward_std": 0.042352207005023956, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.408351331949234, - "rewards/pad": 0.0, - "step": 1944 - }, - { - "completion_length": 73.3125, - "epoch": 0.6198215423836839, - "grad_norm": 57.26400375366211, - "kl": 0.1611328125, - "learning_rate": 3.801784576163161e-07, - "loss": 0.0065, - "reward": 1.7585382461547852, - "reward_std": 0.06438243389129639, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5085383057594299, - "step": 1945 - }, - { - "completion_length": 146.8125, - "epoch": 0.6201402166985341, - "grad_norm": 10.125571250915527, - "kl": 0.11083984375, - "learning_rate": 3.798597833014659e-07, - "loss": 0.0044, - "reward": 1.5111711025238037, - "reward_std": 0.03506201505661011, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5111711025238037, - "step": 1946 - }, - { - "completion_length": 73.359375, - "epoch": 0.6204588910133844, - "grad_norm": 14.61278247833252, - "kl": 0.3203125, - "learning_rate": 3.795411089866157e-07, - "loss": 0.0128, - "reward": 1.724852442741394, - "reward_std": 0.19486337900161743, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.521727442741394, - "step": 1947 - }, - { - "completion_length": 123.0, - "epoch": 0.6207775653282346, - "grad_norm": 19.849451065063477, - "kl": 0.10986328125, - "learning_rate": 3.7922243467176544e-07, - "loss": 0.0044, - "reward": 1.784373164176941, - "reward_std": 0.062317900359630585, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5343732237815857, - "rewards/pad": 0.25, - "step": 1948 - }, - { - "completion_length": 97.890625, - "epoch": 0.6210962396430848, - "grad_norm": 18.029869079589844, - "kl": 0.1279296875, - "learning_rate": 3.7890376035691524e-07, - "loss": 0.0051, - "reward": 1.6952381134033203, - "reward_std": 0.06585781276226044, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5702381730079651, - "step": 1949 - }, - { - "completion_length": 128.25, - "epoch": 0.621414913957935, - "grad_norm": 80.95796966552734, - "kl": 0.09912109375, - "learning_rate": 3.78585086042065e-07, - "loss": 0.004, - "reward": 1.4363656044006348, - "reward_std": 0.04811306297779083, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43636554479599, - "rewards/pad": 0.0, - "step": 1950 - }, - { - "completion_length": 98.75, - "epoch": 0.6217335882727852, - "grad_norm": 52.44783020019531, - "kl": 0.1572265625, - "learning_rate": 3.782664117272148e-07, - "loss": 0.0063, - "reward": 1.5792521238327026, - "reward_std": 0.10398217290639877, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45425212383270264, - "rewards/pad": 0.125, - "step": 1951 - }, - { - "completion_length": 71.953125, - "epoch": 0.6220522625876355, - "grad_norm": 17.435596466064453, - "kl": 0.11572265625, - "learning_rate": 3.779477374123645e-07, - "loss": 0.0046, - "reward": 1.7964942455291748, - "reward_std": 0.05092811957001686, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6714942455291748, - "rewards/pad": 0.125, - "step": 1952 - }, - { - "completion_length": 72.078125, - "epoch": 0.6223709369024857, - "grad_norm": 27.534250259399414, - "kl": 0.119140625, - "learning_rate": 3.776290630975143e-07, - "loss": 0.0048, - "reward": 1.6906150579452515, - "reward_std": 0.03630218282341957, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.565615177154541, - "step": 1953 - }, - { - "completion_length": 99.640625, - "epoch": 0.6226896112173359, - "grad_norm": 42.14917755126953, - "kl": 0.10546875, - "learning_rate": 3.7731038878266407e-07, - "loss": 0.0042, - "reward": 1.6329374313354492, - "reward_std": 0.1385832130908966, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.632937490940094, - "step": 1954 - }, - { - "completion_length": 71.421875, - "epoch": 0.6230082855321861, - "grad_norm": 76.2291030883789, - "kl": 0.1748046875, - "learning_rate": 3.769917144678139e-07, - "loss": 0.007, - "reward": 1.5288151502609253, - "reward_std": 0.12570810317993164, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3881901800632477, - "step": 1955 - }, - { - "completion_length": 69.546875, - "epoch": 0.6233269598470363, - "grad_norm": 40.875633239746094, - "kl": 0.1767578125, - "learning_rate": 3.7667304015296363e-07, - "loss": 0.0071, - "reward": 1.4328603744506836, - "reward_std": 0.10027036815881729, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.432860404253006, - "step": 1956 - }, - { - "completion_length": 101.28125, - "epoch": 0.6236456341618866, - "grad_norm": 27.414457321166992, - "kl": 0.27734375, - "learning_rate": 3.7635436583811344e-07, - "loss": 0.0111, - "reward": 1.7710248231887817, - "reward_std": 0.14013846218585968, - "rewards/pad": 0.359375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4116498827934265, - "step": 1957 - }, - { - "completion_length": 44.296875, - "epoch": 0.6239643084767368, - "grad_norm": 30.38433074951172, - "kl": 0.2255859375, - "learning_rate": 3.760356915232632e-07, - "loss": 0.009, - "reward": 1.5368155241012573, - "reward_std": 0.09000550210475922, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5368155241012573, - "rewards/pad": 0.0, - "step": 1958 - }, - { - "completion_length": 71.265625, - "epoch": 0.624282982791587, - "grad_norm": 73.53339385986328, - "kl": 0.169921875, - "learning_rate": 3.75717017208413e-07, - "loss": 0.0068, - "reward": 1.6425340175628662, - "reward_std": 0.14571252465248108, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.564409077167511, - "rewards/pad": 0.109375, - "step": 1959 - }, - { - "completion_length": 97.953125, - "epoch": 0.6246016571064372, - "grad_norm": 63.19132995605469, - "kl": 1.2578125, - "learning_rate": 3.7539834289356275e-07, - "loss": 0.05, - "reward": 1.7158057689666748, - "reward_std": 0.10019126534461975, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48143070936203003, - "rewards/pad": 0.234375, - "step": 1960 - }, - { - "completion_length": 150.265625, - "epoch": 0.6249203314212874, - "grad_norm": 54.07538986206055, - "kl": 0.09716796875, - "learning_rate": 3.7507966857871256e-07, - "loss": 0.0039, - "reward": 1.510149598121643, - "reward_std": 0.08005047589540482, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3851495385169983, - "rewards/pad": 0.125, - "step": 1961 - }, - { - "completion_length": 122.15625, - "epoch": 0.6252390057361377, - "grad_norm": 21.607746124267578, - "kl": 0.109375, - "learning_rate": 3.747609942638623e-07, - "loss": 0.0044, - "reward": 1.5534250736236572, - "reward_std": 0.04033489525318146, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5534250736236572, - "step": 1962 - }, - { - "completion_length": 70.15625, - "epoch": 0.6255576800509879, - "grad_norm": 15.613412857055664, - "kl": 0.1416015625, - "learning_rate": 3.744423199490121e-07, - "loss": 0.0056, - "reward": 1.618166208267212, - "reward_std": 0.16496075689792633, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5400412082672119, - "step": 1963 - }, - { - "completion_length": 98.703125, - "epoch": 0.6258763543658381, - "grad_norm": 37.2735481262207, - "kl": 0.1796875, - "learning_rate": 3.7412364563416187e-07, - "loss": 0.0072, - "reward": 1.6815367937088013, - "reward_std": 0.09005028754472733, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5409117937088013, - "step": 1964 - }, - { - "completion_length": 70.953125, - "epoch": 0.6261950286806883, - "grad_norm": 23.696840286254883, - "kl": 0.1337890625, - "learning_rate": 3.738049713193117e-07, - "loss": 0.0053, - "reward": 1.6503503322601318, - "reward_std": 0.05913899093866348, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5253504514694214, - "step": 1965 - }, - { - "completion_length": 123.890625, - "epoch": 0.6265137029955385, - "grad_norm": 31.705644607543945, - "kl": 0.12890625, - "learning_rate": 3.7348629700446143e-07, - "loss": 0.0051, - "reward": 1.4367663860321045, - "reward_std": 0.07267533242702484, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4367663264274597, - "step": 1966 - }, - { - "completion_length": 73.40625, - "epoch": 0.6268323773103888, - "grad_norm": 140.63800048828125, - "kl": 0.185546875, - "learning_rate": 3.7316762268961124e-07, - "loss": 0.0074, - "reward": 1.666994333267212, - "reward_std": 0.08498790860176086, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6669944524765015, - "rewards/pad": 0.0, - "step": 1967 - }, - { - "completion_length": 72.296875, - "epoch": 0.627151051625239, - "grad_norm": 35.06744384765625, - "kl": 0.1396484375, - "learning_rate": 3.72848948374761e-07, - "loss": 0.0056, - "reward": 1.6456339359283447, - "reward_std": 0.0863327831029892, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5206338167190552, - "rewards/pad": 0.125, - "step": 1968 - }, - { - "completion_length": 124.296875, - "epoch": 0.6274697259400892, - "grad_norm": 46.038536071777344, - "kl": 0.099609375, - "learning_rate": 3.725302740599108e-07, - "loss": 0.004, - "reward": 1.6347662210464478, - "reward_std": 0.08927043527364731, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5097662806510925, - "rewards/pad": 0.125, - "step": 1969 - }, - { - "completion_length": 96.609375, - "epoch": 0.6277884002549394, - "grad_norm": 43.614681243896484, - "kl": 0.1337890625, - "learning_rate": 3.7221159974506056e-07, - "loss": 0.0053, - "reward": 1.7559230327606201, - "reward_std": 0.15201693773269653, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5840479731559753, - "rewards/pad": 0.171875, - "step": 1970 - }, - { - "completion_length": 43.640625, - "epoch": 0.6281070745697896, - "grad_norm": 175.77879333496094, - "kl": 0.34375, - "learning_rate": 3.718929254302103e-07, - "loss": 0.0137, - "reward": 1.989320993423462, - "reward_std": 0.11485286056995392, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7393209934234619, - "rewards/pad": 0.25, - "step": 1971 - }, - { - "completion_length": 148.3125, - "epoch": 0.6284257488846399, - "grad_norm": 30.099884033203125, - "kl": 0.0986328125, - "learning_rate": 3.7157425111536006e-07, - "loss": 0.0039, - "reward": 1.3178768157958984, - "reward_std": 0.09088070690631866, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.33350178599357605, - "step": 1972 - }, - { - "completion_length": 73.5625, - "epoch": 0.6287444231994901, - "grad_norm": 13.46129035949707, - "kl": 0.130859375, - "learning_rate": 3.712555768005098e-07, - "loss": 0.0052, - "reward": 1.5966365337371826, - "reward_std": 0.1090199202299118, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2841365933418274, - "rewards/pad": 0.3125, - "step": 1973 - }, - { - "completion_length": 70.40625, - "epoch": 0.6290630975143403, - "grad_norm": 20.175935745239258, - "kl": 0.20703125, - "learning_rate": 3.709369024856596e-07, - "loss": 0.0083, - "reward": 1.5837442874908447, - "reward_std": 0.07582993805408478, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5837444067001343, - "rewards/pad": 0.0, - "step": 1974 - }, - { - "completion_length": 44.59375, - "epoch": 0.6293817718291905, - "grad_norm": 49.37710952758789, - "kl": 0.18359375, - "learning_rate": 3.706182281708094e-07, - "loss": 0.0073, - "reward": 1.6172418594360352, - "reward_std": 0.10630054026842117, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6172418594360352, - "rewards/pad": 0.0, - "step": 1975 - }, - { - "completion_length": 72.390625, - "epoch": 0.6297004461440407, - "grad_norm": 18.700769424438477, - "kl": 0.2021484375, - "learning_rate": 3.702995538559592e-07, - "loss": 0.0081, - "reward": 1.7520803213119507, - "reward_std": 0.05088932067155838, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5020803213119507, - "step": 1976 - }, - { - "completion_length": 124.703125, - "epoch": 0.630019120458891, - "grad_norm": 70.51777648925781, - "kl": 0.08837890625, - "learning_rate": 3.6998087954110894e-07, - "loss": 0.0035, - "reward": 1.7651245594024658, - "reward_std": 0.0476035512983799, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.39012449979782104, - "step": 1977 - }, - { - "completion_length": 43.921875, - "epoch": 0.6303377947737412, - "grad_norm": 59.889183044433594, - "kl": 0.1748046875, - "learning_rate": 3.6966220522625875e-07, - "loss": 0.007, - "reward": 1.6480891704559326, - "reward_std": 0.12159959971904755, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6480890512466431, - "step": 1978 - }, - { - "completion_length": 74.578125, - "epoch": 0.6306564690885914, - "grad_norm": 23.10746955871582, - "kl": 0.13671875, - "learning_rate": 3.693435309114085e-07, - "loss": 0.0055, - "reward": 1.703021764755249, - "reward_std": 0.1205596774816513, - "rewards/pad": 0.171875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5311467051506042, - "step": 1979 - }, - { - "completion_length": 121.375, - "epoch": 0.6309751434034416, - "grad_norm": 18.383968353271484, - "kl": 0.1982421875, - "learning_rate": 3.690248565965583e-07, - "loss": 0.0079, - "reward": 1.4522733688354492, - "reward_std": 0.11010268330574036, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4522734582424164, - "rewards/pad": 0.0, - "step": 1980 - }, - { - "completion_length": 49.40625, - "epoch": 0.631293817718292, - "grad_norm": 39.16129684448242, - "kl": 0.447265625, - "learning_rate": 3.6870618228170806e-07, - "loss": 0.0179, - "reward": 1.5960018634796143, - "reward_std": 0.22656530141830444, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3460018038749695, - "step": 1981 - }, - { - "completion_length": 97.59375, - "epoch": 0.6316124920331422, - "grad_norm": 56.11304473876953, - "kl": 0.14453125, - "learning_rate": 3.6838750796685787e-07, - "loss": 0.0058, - "reward": 1.5802655220031738, - "reward_std": 0.07598498463630676, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45526552200317383, - "rewards/pad": 0.125, - "step": 1982 - }, - { - "completion_length": 69.96875, - "epoch": 0.6319311663479924, - "grad_norm": 14.71876335144043, - "kl": 0.2490234375, - "learning_rate": 3.680688336520076e-07, - "loss": 0.01, - "reward": 1.604870319366455, - "reward_std": 0.09864290058612823, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6048702001571655, - "rewards/pad": 0.0, - "step": 1983 - }, - { - "completion_length": 76.15625, - "epoch": 0.6322498406628426, - "grad_norm": 45.49012756347656, - "kl": 0.271484375, - "learning_rate": 3.6775015933715743e-07, - "loss": 0.0109, - "reward": 2.0544075965881348, - "reward_std": 0.10176599770784378, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5544075965881348, - "rewards/pad": 0.5, - "step": 1984 - }, - { - "completion_length": 96.609375, - "epoch": 0.6325685149776928, - "grad_norm": 26.258607864379883, - "kl": 0.1591796875, - "learning_rate": 3.674314850223072e-07, - "loss": 0.0064, - "reward": 1.666446566581726, - "reward_std": 0.12379075586795807, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5570715665817261, - "step": 1985 - }, - { - "completion_length": 122.3125, - "epoch": 0.6328871892925431, - "grad_norm": 35.372032165527344, - "kl": 0.10302734375, - "learning_rate": 3.67112810707457e-07, - "loss": 0.0041, - "reward": 1.5771713256835938, - "reward_std": 0.03920065239071846, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5771713256835938, - "step": 1986 - }, - { - "completion_length": 45.859375, - "epoch": 0.6332058636073933, - "grad_norm": 57.8635139465332, - "kl": 0.2236328125, - "learning_rate": 3.6679413639260674e-07, - "loss": 0.0089, - "reward": 1.5414435863494873, - "reward_std": 0.15431562066078186, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5414435863494873, - "rewards/pad": 0.015625, - "step": 1987 - }, - { - "completion_length": 100.015625, - "epoch": 0.6335245379222435, - "grad_norm": 19.29568099975586, - "kl": 0.1025390625, - "learning_rate": 3.6647546207775655e-07, - "loss": 0.0041, - "reward": 1.4702959060668945, - "reward_std": 0.07151374965906143, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.34529587626457214, - "step": 1988 - }, - { - "completion_length": 97.109375, - "epoch": 0.6338432122370937, - "grad_norm": 19.67873764038086, - "kl": 0.123046875, - "learning_rate": 3.661567877629063e-07, - "loss": 0.0049, - "reward": 1.690169095993042, - "reward_std": 0.043867893517017365, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5651690363883972, - "step": 1989 - }, - { - "completion_length": 18.921875, - "epoch": 0.6341618865519439, - "grad_norm": 32.41449737548828, - "kl": 0.173828125, - "learning_rate": 3.658381134480561e-07, - "loss": 0.0069, - "reward": 1.4949960708618164, - "reward_std": 0.11608640849590302, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4949961006641388, - "rewards/pad": 0.0, - "step": 1990 - }, - { - "completion_length": 46.328125, - "epoch": 0.6344805608667942, - "grad_norm": 22.01445198059082, - "kl": 0.26953125, - "learning_rate": 3.655194391332058e-07, - "loss": 0.0108, - "reward": 1.8105196952819824, - "reward_std": 0.08823952078819275, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5605197548866272, - "rewards/pad": 0.25, - "step": 1991 - }, - { - "completion_length": 95.953125, - "epoch": 0.6347992351816444, - "grad_norm": 26.866201400756836, - "kl": 0.169921875, - "learning_rate": 3.652007648183556e-07, - "loss": 0.0068, - "reward": 1.4317470788955688, - "reward_std": 0.0999232605099678, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4317470192909241, - "step": 1992 - }, - { - "completion_length": 68.734375, - "epoch": 0.6351179094964946, - "grad_norm": 33.86798858642578, - "kl": 0.1513671875, - "learning_rate": 3.648820905035054e-07, - "loss": 0.006, - "reward": 1.461557388305664, - "reward_std": 0.043246183544397354, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4615574777126312, - "rewards/pad": 0.0, - "step": 1993 - }, - { - "completion_length": 97.0625, - "epoch": 0.6354365838113448, - "grad_norm": 22.541500091552734, - "kl": 0.142578125, - "learning_rate": 3.645634161886552e-07, - "loss": 0.0057, - "reward": 1.4119422435760498, - "reward_std": 0.10992357134819031, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4275672137737274, - "step": 1994 - }, - { - "completion_length": 71.453125, - "epoch": 0.635755258126195, - "grad_norm": 54.545433044433594, - "kl": 1.0703125, - "learning_rate": 3.6424474187380494e-07, - "loss": 0.0428, - "reward": 1.6790932416915894, - "reward_std": 0.1349036693572998, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6009682416915894, - "rewards/pad": 0.078125, - "step": 1995 - }, - { - "completion_length": 71.328125, - "epoch": 0.6360739324410453, - "grad_norm": 31.94515609741211, - "kl": 0.2314453125, - "learning_rate": 3.6392606755895474e-07, - "loss": 0.0093, - "reward": 1.527273178100586, - "reward_std": 0.1572173535823822, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5272730588912964, - "rewards/pad": 0.0, - "step": 1996 - }, - { - "completion_length": 123.921875, - "epoch": 0.6363926067558955, - "grad_norm": 44.97288513183594, - "kl": 0.166015625, - "learning_rate": 3.636073932441045e-07, - "loss": 0.0066, - "reward": 1.5373291969299316, - "reward_std": 0.10828669369220734, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5529541969299316, - "step": 1997 - }, - { - "completion_length": 124.609375, - "epoch": 0.6367112810707457, - "grad_norm": 41.73975372314453, - "kl": 0.09423828125, - "learning_rate": 3.632887189292543e-07, - "loss": 0.0038, - "reward": 1.7525591850280762, - "reward_std": 0.09953247010707855, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5338091850280762, - "step": 1998 - }, - { - "completion_length": 70.5625, - "epoch": 0.6370299553855959, - "grad_norm": 63.7142333984375, - "kl": 0.2314453125, - "learning_rate": 3.6297004461440406e-07, - "loss": 0.0092, - "reward": 1.5226670503616333, - "reward_std": 0.08986333757638931, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5226670503616333, - "rewards/pad": 0.0, - "step": 1999 - }, - { - "completion_length": 98.171875, - "epoch": 0.6373486297004461, - "grad_norm": 73.62125396728516, - "kl": 0.10693359375, - "learning_rate": 3.626513702995538e-07, - "loss": 0.0043, - "reward": 1.3816452026367188, - "reward_std": 0.06278573721647263, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3816451132297516, - "rewards/pad": 0.0, - "step": 2000 - }, - { - "completion_length": 123.859375, - "epoch": 0.6376673040152964, - "grad_norm": 34.87504959106445, - "kl": 0.10546875, - "learning_rate": 3.623326959847036e-07, - "loss": 0.0042, - "reward": 1.5530742406845093, - "reward_std": 0.04430997744202614, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.553074300289154, - "step": 2001 - }, - { - "completion_length": 119.75, - "epoch": 0.6379859783301466, - "grad_norm": 59.58365249633789, - "kl": 0.171875, - "learning_rate": 3.6201402166985337e-07, - "loss": 0.0069, - "reward": 1.5331008434295654, - "reward_std": 0.09816928207874298, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5331008434295654, - "rewards/pad": 0.0, - "step": 2002 - }, - { - "completion_length": 46.203125, - "epoch": 0.6383046526449968, - "grad_norm": 57.24969482421875, - "kl": 0.146484375, - "learning_rate": 3.616953473550032e-07, - "loss": 0.0059, - "reward": 1.7841196060180664, - "reward_std": 0.076505646109581, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5341196060180664, - "rewards/pad": 0.25, - "step": 2003 - }, - { - "completion_length": 121.875, - "epoch": 0.638623326959847, - "grad_norm": 19.603242874145508, - "kl": 0.11572265625, - "learning_rate": 3.6137667304015293e-07, - "loss": 0.0046, - "reward": 1.5577675104141235, - "reward_std": 0.07358790934085846, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4327675998210907, - "step": 2004 - }, - { - "completion_length": 121.703125, - "epoch": 0.6389420012746972, - "grad_norm": 12.69798755645752, - "kl": 0.150390625, - "learning_rate": 3.6105799872530274e-07, - "loss": 0.006, - "reward": 1.4820199012756348, - "reward_std": 0.057745158672332764, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48201999068260193, - "step": 2005 - }, - { - "completion_length": 150.078125, - "epoch": 0.6392606755895475, - "grad_norm": 10.070806503295898, - "kl": 0.12353515625, - "learning_rate": 3.607393244104525e-07, - "loss": 0.0049, - "reward": 1.6496670246124268, - "reward_std": 0.05774607136845589, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6496670246124268, - "step": 2006 - }, - { - "completion_length": 97.890625, - "epoch": 0.6395793499043977, - "grad_norm": 56.1859130859375, - "kl": 0.1796875, - "learning_rate": 3.604206500956023e-07, - "loss": 0.0072, - "reward": 1.4355159997940063, - "reward_std": 0.07499827444553375, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43551596999168396, - "step": 2007 - }, - { - "completion_length": 96.8125, - "epoch": 0.6398980242192479, - "grad_norm": 15.85274600982666, - "kl": 0.1787109375, - "learning_rate": 3.6010197578075206e-07, - "loss": 0.0071, - "reward": 1.4491064548492432, - "reward_std": 0.03527633100748062, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3241065442562103, - "rewards/pad": 0.125, - "step": 2008 - }, - { - "completion_length": 122.453125, - "epoch": 0.6402166985340981, - "grad_norm": 64.17939758300781, - "kl": 0.09765625, - "learning_rate": 3.5978330146590186e-07, - "loss": 0.0039, - "reward": 1.704590916633606, - "reward_std": 0.09914045035839081, - "rewards/pad": 0.265625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43896591663360596, - "step": 2009 - }, - { - "completion_length": 122.21875, - "epoch": 0.6405353728489483, - "grad_norm": 24.01514434814453, - "kl": 0.12353515625, - "learning_rate": 3.594646271510516e-07, - "loss": 0.005, - "reward": 1.3596742153167725, - "reward_std": 0.04573728144168854, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3596740961074829, - "rewards/pad": 0.0, - "step": 2010 - }, - { - "completion_length": 46.890625, - "epoch": 0.6408540471637986, - "grad_norm": 16.87997817993164, - "kl": 0.1875, - "learning_rate": 3.5914595283620137e-07, - "loss": 0.0075, - "reward": 1.8270705938339233, - "reward_std": 0.23601290583610535, - "rewards/pad": 0.1875, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.6551956534385681, - "step": 2011 - }, - { - "completion_length": 71.765625, - "epoch": 0.6411727214786488, - "grad_norm": 19.43378448486328, - "kl": 0.1572265625, - "learning_rate": 3.588272785213511e-07, - "loss": 0.0063, - "reward": 1.605637788772583, - "reward_std": 0.17479608952999115, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49626290798187256, - "step": 2012 - }, - { - "completion_length": 96.515625, - "epoch": 0.641491395793499, - "grad_norm": 35.44859313964844, - "kl": 0.1162109375, - "learning_rate": 3.5850860420650093e-07, - "loss": 0.0047, - "reward": 1.541455626487732, - "reward_std": 0.06357315182685852, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5414556264877319, - "rewards/pad": 0.0, - "step": 2013 - }, - { - "completion_length": 96.78125, - "epoch": 0.6418100701083492, - "grad_norm": 7.675474643707275, - "kl": 0.1689453125, - "learning_rate": 3.581899298916507e-07, - "loss": 0.0068, - "reward": 1.7133961915969849, - "reward_std": 0.0641525387763977, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4633961319923401, - "step": 2014 - }, - { - "completion_length": 123.6875, - "epoch": 0.6421287444231994, - "grad_norm": 115.30647277832031, - "kl": 0.1591796875, - "learning_rate": 3.578712555768005e-07, - "loss": 0.0064, - "reward": 1.4052350521087646, - "reward_std": 0.06139522045850754, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4052349627017975, - "rewards/pad": 0.0, - "step": 2015 - }, - { - "completion_length": 98.171875, - "epoch": 0.6424474187380497, - "grad_norm": 82.6279067993164, - "kl": 0.185546875, - "learning_rate": 3.5755258126195025e-07, - "loss": 0.0075, - "reward": 1.400416612625122, - "reward_std": 0.045290593057870865, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4004167318344116, - "rewards/pad": 0.0, - "step": 2016 - }, - { - "completion_length": 97.59375, - "epoch": 0.6427660930528999, - "grad_norm": 30.792123794555664, - "kl": 0.15625, - "learning_rate": 3.5723390694710005e-07, - "loss": 0.0063, - "reward": 1.4318363666534424, - "reward_std": 0.07799049466848373, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43183648586273193, - "rewards/pad": 0.0, - "step": 2017 - }, - { - "completion_length": 99.234375, - "epoch": 0.6430847673677501, - "grad_norm": 12.136988639831543, - "kl": 0.2412109375, - "learning_rate": 3.569152326322498e-07, - "loss": 0.0096, - "reward": 1.8425437211990356, - "reward_std": 0.09182443469762802, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5925437211990356, - "rewards/pad": 0.25, - "step": 2018 - }, - { - "completion_length": 72.875, - "epoch": 0.6434034416826003, - "grad_norm": 19.05178451538086, - "kl": 0.287109375, - "learning_rate": 3.565965583173996e-07, - "loss": 0.0115, - "reward": 1.5046403408050537, - "reward_std": 0.12479674071073532, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.3952653408050537, - "rewards/pad": 0.125, - "step": 2019 - }, - { - "completion_length": 70.453125, - "epoch": 0.6437221159974506, - "grad_norm": 24.057708740234375, - "kl": 0.16015625, - "learning_rate": 3.5627788400254937e-07, - "loss": 0.0064, - "reward": 1.7552566528320312, - "reward_std": 0.04989270865917206, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6302566528320312, - "step": 2020 - }, - { - "completion_length": 96.953125, - "epoch": 0.6440407903123009, - "grad_norm": 41.243648529052734, - "kl": 0.123046875, - "learning_rate": 3.559592096876992e-07, - "loss": 0.0049, - "reward": 1.6406363248825073, - "reward_std": 0.1412319540977478, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5937612652778625, - "rewards/pad": 0.046875, - "step": 2021 - }, - { - "completion_length": 96.078125, - "epoch": 0.6443594646271511, - "grad_norm": 59.88046646118164, - "kl": 0.1142578125, - "learning_rate": 3.5564053537284893e-07, - "loss": 0.0046, - "reward": 1.5989556312561035, - "reward_std": 0.05263106897473335, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5989556312561035, - "rewards/pad": 0.0, - "step": 2022 - }, - { - "completion_length": 99.5, - "epoch": 0.6446781389420013, - "grad_norm": 60.32493209838867, - "kl": 0.185546875, - "learning_rate": 3.5532186105799874e-07, - "loss": 0.0074, - "reward": 1.5468171834945679, - "reward_std": 0.1444055736064911, - "rewards/answer_reward": 0.171875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.37494218349456787, - "step": 2023 - }, - { - "completion_length": 97.796875, - "epoch": 0.6449968132568515, - "grad_norm": 21.40589141845703, - "kl": 0.314453125, - "learning_rate": 3.550031867431485e-07, - "loss": 0.0126, - "reward": 1.479875922203064, - "reward_std": 0.09796611964702606, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47987598180770874, - "step": 2024 - }, - { - "completion_length": 96.625, - "epoch": 0.6453154875717018, - "grad_norm": 25.341371536254883, - "kl": 0.1708984375, - "learning_rate": 3.546845124282983e-07, - "loss": 0.0068, - "reward": 1.5377442836761475, - "reward_std": 0.05767418071627617, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4127442240715027, - "step": 2025 - }, - { - "completion_length": 76.34375, - "epoch": 0.645634161886552, - "grad_norm": 47.222721099853516, - "kl": 0.1328125, - "learning_rate": 3.5436583811344805e-07, - "loss": 0.0053, - "reward": 1.683227777481079, - "reward_std": 0.05976993963122368, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4332278370857239, - "step": 2026 - }, - { - "completion_length": 48.15625, - "epoch": 0.6459528362014022, - "grad_norm": 39.01726150512695, - "kl": 0.345703125, - "learning_rate": 3.5404716379859786e-07, - "loss": 0.0138, - "reward": 1.8334259986877441, - "reward_std": 0.15636026859283447, - "rewards/pad": 0.359375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4896758794784546, - "step": 2027 - }, - { - "completion_length": 173.0625, - "epoch": 0.6462715105162524, - "grad_norm": 10.122318267822266, - "kl": 0.08203125, - "learning_rate": 3.537284894837476e-07, - "loss": 0.0033, - "reward": 1.5355513095855713, - "reward_std": 0.03889822959899902, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5355513095855713, - "step": 2028 - }, - { - "completion_length": 97.578125, - "epoch": 0.6465901848311026, - "grad_norm": 67.95327758789062, - "kl": 0.1767578125, - "learning_rate": 3.534098151688974e-07, - "loss": 0.0071, - "reward": 1.5799638032913208, - "reward_std": 0.06284810602664948, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5799638628959656, - "rewards/pad": 0.0, - "step": 2029 - }, - { - "completion_length": 123.515625, - "epoch": 0.6469088591459529, - "grad_norm": 13.074254035949707, - "kl": 0.14453125, - "learning_rate": 3.530911408540471e-07, - "loss": 0.0058, - "reward": 1.5531742572784424, - "reward_std": 0.08037546277046204, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5531742572784424, - "rewards/pad": 0.0, - "step": 2030 - }, - { - "completion_length": 98.0625, - "epoch": 0.6472275334608031, - "grad_norm": 65.16255950927734, - "kl": 0.1220703125, - "learning_rate": 3.527724665391969e-07, - "loss": 0.0049, - "reward": 1.7504075765609741, - "reward_std": 0.03542018309235573, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6254075169563293, - "rewards/pad": 0.125, - "step": 2031 - }, - { - "completion_length": 73.796875, - "epoch": 0.6475462077756533, - "grad_norm": 25.36817169189453, - "kl": 0.2890625, - "learning_rate": 3.524537922243467e-07, - "loss": 0.0116, - "reward": 1.774897575378418, - "reward_std": 0.1450740098953247, - "rewards/pad": 0.328125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44677257537841797, - "step": 2032 - }, - { - "completion_length": 98.53125, - "epoch": 0.6478648820905035, - "grad_norm": 19.02562141418457, - "kl": 0.1953125, - "learning_rate": 3.5213511790949644e-07, - "loss": 0.0078, - "reward": 1.5571558475494385, - "reward_std": 0.05887052044272423, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4321557879447937, - "step": 2033 - }, - { - "completion_length": 71.859375, - "epoch": 0.6481835564053537, - "grad_norm": 19.338512420654297, - "kl": 0.181640625, - "learning_rate": 3.5181644359464624e-07, - "loss": 0.0072, - "reward": 1.6721837520599365, - "reward_std": 0.1088566929101944, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5628088116645813, - "rewards/pad": 0.109375, - "step": 2034 - }, - { - "completion_length": 126.53125, - "epoch": 0.648502230720204, - "grad_norm": 12.018340110778809, - "kl": 0.109375, - "learning_rate": 3.51497769279796e-07, - "loss": 0.0044, - "reward": 1.7323064804077148, - "reward_std": 0.08383309096097946, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5135563611984253, - "rewards/pad": 0.21875, - "step": 2035 - }, - { - "completion_length": 71.890625, - "epoch": 0.6488209050350542, - "grad_norm": 30.69729995727539, - "kl": 0.205078125, - "learning_rate": 3.511790949649458e-07, - "loss": 0.0082, - "reward": 1.5320508480072021, - "reward_std": 0.08980223536491394, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5476758480072021, - "rewards/pad": 0.0, - "step": 2036 - }, - { - "completion_length": 47.125, - "epoch": 0.6491395793499044, - "grad_norm": 23.445585250854492, - "kl": 0.1640625, - "learning_rate": 3.5086042065009556e-07, - "loss": 0.0066, - "reward": 1.8220884799957275, - "reward_std": 0.1505730152130127, - "rewards/answer_reward": 0.484375, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.3533385992050171, - "step": 2037 - }, - { - "completion_length": 127.0, - "epoch": 0.6494582536647546, - "grad_norm": 37.44228744506836, - "kl": 0.12060546875, - "learning_rate": 3.5054174633524537e-07, - "loss": 0.0048, - "reward": 1.7701386213302612, - "reward_std": 0.05509011074900627, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5201385617256165, - "step": 2038 - }, - { - "completion_length": 149.25, - "epoch": 0.6497769279796048, - "grad_norm": 66.34549713134766, - "kl": 0.0927734375, - "learning_rate": 3.502230720203951e-07, - "loss": 0.0037, - "reward": 1.5600908994674683, - "reward_std": 0.11401061713695526, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5757158994674683, - "step": 2039 - }, - { - "completion_length": 176.21875, - "epoch": 0.6500956022944551, - "grad_norm": 71.6153793334961, - "kl": 0.06103515625, - "learning_rate": 3.499043977055449e-07, - "loss": 0.0024, - "reward": 1.515439510345459, - "reward_std": 0.14185801148414612, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.531064510345459, - "rewards/pad": 0.0, - "step": 2040 - }, - { - "completion_length": 74.921875, - "epoch": 0.6504142766093053, - "grad_norm": 61.78131866455078, - "kl": 0.15625, - "learning_rate": 3.495857233906947e-07, - "loss": 0.0063, - "reward": 1.6786961555480957, - "reward_std": 0.061907485127449036, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3036961853504181, - "rewards/pad": 0.375, - "step": 2041 - }, - { - "completion_length": 103.671875, - "epoch": 0.6507329509241555, - "grad_norm": 206.1800079345703, - "kl": 0.15625, - "learning_rate": 3.492670490758445e-07, - "loss": 0.0062, - "reward": 1.9095579385757446, - "reward_std": 0.15298190712928772, - "rewards/pad": 0.421875, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5033079981803894, - "step": 2042 - }, - { - "completion_length": 124.15625, - "epoch": 0.6510516252390057, - "grad_norm": 28.421953201293945, - "kl": 0.11474609375, - "learning_rate": 3.4894837476099424e-07, - "loss": 0.0046, - "reward": 1.813352108001709, - "reward_std": 0.04790331423282623, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43835219740867615, - "rewards/pad": 0.375, - "step": 2043 - }, - { - "completion_length": 121.671875, - "epoch": 0.651370299553856, - "grad_norm": 69.96031951904297, - "kl": 0.134765625, - "learning_rate": 3.4862970044614405e-07, - "loss": 0.0054, - "reward": 1.6841192245483398, - "reward_std": 0.0788702517747879, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5591192245483398, - "rewards/pad": 0.125, - "step": 2044 - }, - { - "completion_length": 72.0, - "epoch": 0.6516889738687062, - "grad_norm": 83.14112091064453, - "kl": 0.140625, - "learning_rate": 3.483110261312938e-07, - "loss": 0.0056, - "reward": 1.635480284690857, - "reward_std": 0.05237569287419319, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6354802846908569, - "step": 2045 - }, - { - "completion_length": 72.125, - "epoch": 0.6520076481835564, - "grad_norm": 65.5025863647461, - "kl": 0.19921875, - "learning_rate": 3.479923518164436e-07, - "loss": 0.008, - "reward": 1.5832891464233398, - "reward_std": 0.05970742926001549, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45828914642333984, - "rewards/pad": 0.125, - "step": 2046 - }, - { - "completion_length": 96.953125, - "epoch": 0.6523263224984066, - "grad_norm": 36.18320846557617, - "kl": 0.19921875, - "learning_rate": 3.4767367750159336e-07, - "loss": 0.008, - "reward": 1.6746628284454346, - "reward_std": 0.04143855720758438, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5496628880500793, - "step": 2047 - }, - { - "completion_length": 125.1875, - "epoch": 0.6526449968132568, - "grad_norm": 14.649348258972168, - "kl": 0.2265625, - "learning_rate": 3.4735500318674317e-07, - "loss": 0.0091, - "reward": 1.6547412872314453, - "reward_std": 0.04746638238430023, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5297412276268005, - "rewards/pad": 0.125, - "step": 2048 - }, - { - "completion_length": 126.671875, - "epoch": 0.652963671128107, - "grad_norm": 47.00898742675781, - "kl": 0.1748046875, - "learning_rate": 3.470363288718929e-07, - "loss": 0.007, - "reward": 1.49715256690979, - "reward_std": 0.12011165916919708, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4502776563167572, - "step": 2049 - }, - { - "completion_length": 72.265625, - "epoch": 0.6532823454429573, - "grad_norm": 58.02875900268555, - "kl": 0.1689453125, - "learning_rate": 3.467176545570427e-07, - "loss": 0.0068, - "reward": 1.4719352722167969, - "reward_std": 0.05091769993305206, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34693530201911926, - "rewards/pad": 0.125, - "step": 2050 - }, - { - "completion_length": 123.90625, - "epoch": 0.6536010197578075, - "grad_norm": 10.941967964172363, - "kl": 0.1259765625, - "learning_rate": 3.4639898024219243e-07, - "loss": 0.005, - "reward": 1.6538059711456299, - "reward_std": 0.047944795340299606, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4038059115409851, - "step": 2051 - }, - { - "completion_length": 44.640625, - "epoch": 0.6539196940726577, - "grad_norm": 23.65229034423828, - "kl": 0.27734375, - "learning_rate": 3.4608030592734224e-07, - "loss": 0.0111, - "reward": 1.5575801134109497, - "reward_std": 0.09973680227994919, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5575801134109497, - "rewards/pad": 0.0, - "step": 2052 - }, - { - "completion_length": 97.28125, - "epoch": 0.6542383683875079, - "grad_norm": 21.43920135498047, - "kl": 0.1533203125, - "learning_rate": 3.45761631612492e-07, - "loss": 0.0061, - "reward": 1.7789738178253174, - "reward_std": 0.10994003713130951, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6695988178253174, - "step": 2053 - }, - { - "completion_length": 97.703125, - "epoch": 0.6545570427023581, - "grad_norm": 13.602856636047363, - "kl": 0.1591796875, - "learning_rate": 3.454429572976418e-07, - "loss": 0.0064, - "reward": 1.45784330368042, - "reward_std": 0.06162258982658386, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4578433334827423, - "step": 2054 - }, - { - "completion_length": 99.671875, - "epoch": 0.6548757170172084, - "grad_norm": 47.3553466796875, - "kl": 0.1796875, - "learning_rate": 3.4512428298279155e-07, - "loss": 0.0072, - "reward": 1.640990972518921, - "reward_std": 0.21965278685092926, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.6097409725189209, - "step": 2055 - }, - { - "completion_length": 70.625, - "epoch": 0.6551943913320586, - "grad_norm": 42.78764724731445, - "kl": 0.1533203125, - "learning_rate": 3.4480560866794136e-07, - "loss": 0.0061, - "reward": 1.6163208484649658, - "reward_std": 0.08929796516895294, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49132072925567627, - "step": 2056 - }, - { - "completion_length": 70.390625, - "epoch": 0.6555130656469088, - "grad_norm": 35.00171661376953, - "kl": 0.14453125, - "learning_rate": 3.444869343530911e-07, - "loss": 0.0058, - "reward": 1.6293587684631348, - "reward_std": 0.17790740728378296, - "rewards/answer_reward": 0.203125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.44185870885849, - "step": 2057 - }, - { - "completion_length": 101.578125, - "epoch": 0.655831739961759, - "grad_norm": 88.68413543701172, - "kl": 0.11376953125, - "learning_rate": 3.441682600382409e-07, - "loss": 0.0045, - "reward": 1.516322374343872, - "reward_std": 0.07132712751626968, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.39132243394851685, - "rewards/pad": 0.125, - "step": 2058 - }, - { - "completion_length": 98.625, - "epoch": 0.6561504142766093, - "grad_norm": 78.99787902832031, - "kl": 0.166015625, - "learning_rate": 3.438495857233907e-07, - "loss": 0.0066, - "reward": 1.563938856124878, - "reward_std": 0.054905910044908524, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5639387369155884, - "step": 2059 - }, - { - "completion_length": 45.765625, - "epoch": 0.6564690885914596, - "grad_norm": 24.56032371520996, - "kl": 0.1806640625, - "learning_rate": 3.435309114085405e-07, - "loss": 0.0072, - "reward": 1.6660699844360352, - "reward_std": 0.07278779149055481, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.41606995463371277, - "step": 2060 - }, - { - "completion_length": 147.34375, - "epoch": 0.6567877629063098, - "grad_norm": 15.71139907836914, - "kl": 0.0859375, - "learning_rate": 3.4321223709369024e-07, - "loss": 0.0034, - "reward": 1.4695935249328613, - "reward_std": 0.02930522710084915, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46959346532821655, - "step": 2061 - }, - { - "completion_length": 149.84375, - "epoch": 0.65710643722116, - "grad_norm": 62.19662857055664, - "kl": 0.09423828125, - "learning_rate": 3.4289356277884e-07, - "loss": 0.0038, - "reward": 1.710579514503479, - "reward_std": 0.10253609716892242, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.601204514503479, - "step": 2062 - }, - { - "completion_length": 124.0625, - "epoch": 0.6574251115360102, - "grad_norm": 45.72370147705078, - "kl": 0.1201171875, - "learning_rate": 3.425748884639898e-07, - "loss": 0.0048, - "reward": 1.4898618459701538, - "reward_std": 0.08097711205482483, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48986178636550903, - "rewards/pad": 0.0, - "step": 2063 - }, - { - "completion_length": 102.46875, - "epoch": 0.6577437858508605, - "grad_norm": 83.81393432617188, - "kl": 0.1484375, - "learning_rate": 3.4225621414913955e-07, - "loss": 0.0059, - "reward": 1.7617790699005127, - "reward_std": 0.12920929491519928, - "rewards/pad": 0.265625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49615412950515747, - "step": 2064 - }, - { - "completion_length": 152.171875, - "epoch": 0.6580624601657107, - "grad_norm": 13.452592849731445, - "kl": 0.09814453125, - "learning_rate": 3.4193753983428936e-07, - "loss": 0.0039, - "reward": 1.6739437580108643, - "reward_std": 0.04467832297086716, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.29894375801086426, - "step": 2065 - }, - { - "completion_length": 99.296875, - "epoch": 0.6583811344805609, - "grad_norm": 81.42863464355469, - "kl": 0.1259765625, - "learning_rate": 3.416188655194391e-07, - "loss": 0.0051, - "reward": 1.5747714042663574, - "reward_std": 0.06781714409589767, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44977134466171265, - "rewards/pad": 0.125, - "step": 2066 - }, - { - "completion_length": 67.953125, - "epoch": 0.6586998087954111, - "grad_norm": 243.19931030273438, - "kl": 0.171875, - "learning_rate": 3.413001912045889e-07, - "loss": 0.0069, - "reward": 1.6069536209106445, - "reward_std": 0.09787873178720474, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6069536805152893, - "rewards/pad": 0.0, - "step": 2067 - }, - { - "completion_length": 98.40625, - "epoch": 0.6590184831102613, - "grad_norm": 13.955587387084961, - "kl": 0.25390625, - "learning_rate": 3.409815168897387e-07, - "loss": 0.0102, - "reward": 1.6424219608306885, - "reward_std": 0.12129343301057816, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5174219608306885, - "step": 2068 - }, - { - "completion_length": 125.796875, - "epoch": 0.6593371574251116, - "grad_norm": 62.8036003112793, - "kl": 0.1171875, - "learning_rate": 3.406628425748885e-07, - "loss": 0.0047, - "reward": 1.576296091079712, - "reward_std": 0.086067795753479, - "rewards/answer_reward": 0.171875, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.4200460910797119, - "step": 2069 - }, - { - "completion_length": 47.796875, - "epoch": 0.6596558317399618, - "grad_norm": 17.966266632080078, - "kl": 0.2138671875, - "learning_rate": 3.403441682600382e-07, - "loss": 0.0086, - "reward": 1.8503811359405518, - "reward_std": 0.11749374866485596, - "rewards/pad": 0.359375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4910059869289398, - "step": 2070 - }, - { - "completion_length": 151.078125, - "epoch": 0.659974506054812, - "grad_norm": 12.83804702758789, - "kl": 0.09716796875, - "learning_rate": 3.40025493945188e-07, - "loss": 0.0039, - "reward": 1.547113060951233, - "reward_std": 0.03906556963920593, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4221130311489105, - "step": 2071 - }, - { - "completion_length": 150.03125, - "epoch": 0.6602931803696622, - "grad_norm": 28.711774826049805, - "kl": 0.11572265625, - "learning_rate": 3.3970681963033774e-07, - "loss": 0.0046, - "reward": 1.457901954650879, - "reward_std": 0.03792613372206688, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45790189504623413, - "rewards/pad": 0.0, - "step": 2072 - }, - { - "completion_length": 150.203125, - "epoch": 0.6606118546845124, - "grad_norm": 13.170918464660645, - "kl": 0.1220703125, - "learning_rate": 3.3938814531548755e-07, - "loss": 0.0049, - "reward": 1.4187397956848145, - "reward_std": 0.06365112215280533, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41873979568481445, - "step": 2073 - }, - { - "completion_length": 125.015625, - "epoch": 0.6609305289993627, - "grad_norm": 9.296521186828613, - "kl": 0.19140625, - "learning_rate": 3.390694710006373e-07, - "loss": 0.0076, - "reward": 1.4206171035766602, - "reward_std": 0.05997206270694733, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42061710357666016, - "rewards/pad": 0.0, - "step": 2074 - }, - { - "completion_length": 176.71875, - "epoch": 0.6612492033142129, - "grad_norm": 21.54621124267578, - "kl": 0.10009765625, - "learning_rate": 3.387507966857871e-07, - "loss": 0.004, - "reward": 1.5298080444335938, - "reward_std": 0.050099994987249374, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5298079252243042, - "rewards/pad": 0.0, - "step": 2075 - }, - { - "completion_length": 70.34375, - "epoch": 0.6615678776290631, - "grad_norm": 34.70930862426758, - "kl": 0.197265625, - "learning_rate": 3.3843212237093687e-07, - "loss": 0.0079, - "reward": 1.666641116142273, - "reward_std": 0.060540519654750824, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6666411757469177, - "rewards/pad": 0.0, - "step": 2076 - }, - { - "completion_length": 122.34375, - "epoch": 0.6618865519439133, - "grad_norm": 32.11553192138672, - "kl": 0.2314453125, - "learning_rate": 3.3811344805608667e-07, - "loss": 0.0092, - "reward": 1.54254150390625, - "reward_std": 0.060430414974689484, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5425414443016052, - "step": 2077 - }, - { - "completion_length": 73.453125, - "epoch": 0.6622052262587635, - "grad_norm": 26.948400497436523, - "kl": 0.1533203125, - "learning_rate": 3.3779477374123643e-07, - "loss": 0.0062, - "reward": 1.657631278038025, - "reward_std": 0.04715336114168167, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4076312482357025, - "step": 2078 - }, - { - "completion_length": 45.15625, - "epoch": 0.6625239005736138, - "grad_norm": 35.30155563354492, - "kl": 0.287109375, - "learning_rate": 3.3747609942638623e-07, - "loss": 0.0115, - "reward": 1.7125016450881958, - "reward_std": 0.14292427897453308, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6031266450881958, - "step": 2079 - }, - { - "completion_length": 73.671875, - "epoch": 0.662842574888464, - "grad_norm": 42.187400817871094, - "kl": 0.1416015625, - "learning_rate": 3.37157425111536e-07, - "loss": 0.0056, - "reward": 1.8499133586883545, - "reward_std": 0.06751154363155365, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4749133586883545, - "rewards/pad": 0.375, - "step": 2080 - }, - { - "completion_length": 121.28125, - "epoch": 0.6631612492033142, - "grad_norm": 44.404170989990234, - "kl": 0.2412109375, - "learning_rate": 3.368387507966858e-07, - "loss": 0.0096, - "reward": 1.5905735492706299, - "reward_std": 0.05482352152466774, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5905736088752747, - "step": 2081 - }, - { - "completion_length": 98.5, - "epoch": 0.6634799235181644, - "grad_norm": 24.07758903503418, - "kl": 0.1435546875, - "learning_rate": 3.3652007648183555e-07, - "loss": 0.0058, - "reward": 1.565967082977295, - "reward_std": 0.04835457354784012, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4409670829772949, - "step": 2082 - }, - { - "completion_length": 122.265625, - "epoch": 0.6637985978330146, - "grad_norm": 15.329931259155273, - "kl": 0.123046875, - "learning_rate": 3.3620140216698536e-07, - "loss": 0.0049, - "reward": 1.5620784759521484, - "reward_std": 0.05277356132864952, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43707847595214844, - "step": 2083 - }, - { - "completion_length": 71.390625, - "epoch": 0.6641172721478649, - "grad_norm": 72.33389282226562, - "kl": 0.201171875, - "learning_rate": 3.358827278521351e-07, - "loss": 0.0081, - "reward": 1.5701160430908203, - "reward_std": 0.0632457584142685, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5701161623001099, - "rewards/pad": 0.0, - "step": 2084 - }, - { - "completion_length": 97.3125, - "epoch": 0.6644359464627151, - "grad_norm": 16.85909080505371, - "kl": 0.1650390625, - "learning_rate": 3.355640535372849e-07, - "loss": 0.0066, - "reward": 1.6967799663543701, - "reward_std": 0.09800881892442703, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5717800259590149, - "rewards/pad": 0.125, - "step": 2085 - }, - { - "completion_length": 79.171875, - "epoch": 0.6647546207775653, - "grad_norm": 30.236915588378906, - "kl": 0.41015625, - "learning_rate": 3.3524537922243467e-07, - "loss": 0.0164, - "reward": 1.5436737537384033, - "reward_std": 0.15798990428447723, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.4499237835407257, - "step": 2086 - }, - { - "completion_length": 46.5, - "epoch": 0.6650732950924155, - "grad_norm": 37.92073059082031, - "kl": 0.19140625, - "learning_rate": 3.349267049075845e-07, - "loss": 0.0076, - "reward": 1.7169394493103027, - "reward_std": 0.1119086891412735, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.591939389705658, - "step": 2087 - }, - { - "completion_length": 124.578125, - "epoch": 0.6653919694072657, - "grad_norm": 41.49848937988281, - "kl": 0.12158203125, - "learning_rate": 3.3460803059273423e-07, - "loss": 0.0049, - "reward": 1.580154538154602, - "reward_std": 0.0795680582523346, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47077950835227966, - "rewards/pad": 0.109375, - "step": 2088 - }, - { - "completion_length": 150.84375, - "epoch": 0.665710643722116, - "grad_norm": 11.288766860961914, - "kl": 0.09228515625, - "learning_rate": 3.34289356277884e-07, - "loss": 0.0037, - "reward": 1.4345180988311768, - "reward_std": 0.04635346680879593, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43451812863349915, - "step": 2089 - }, - { - "completion_length": 123.359375, - "epoch": 0.6660293180369662, - "grad_norm": 27.34168815612793, - "kl": 0.11767578125, - "learning_rate": 3.3397068196303374e-07, - "loss": 0.0047, - "reward": 1.7564454078674316, - "reward_std": 0.04770022630691528, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6314453482627869, - "rewards/pad": 0.125, - "step": 2090 - }, - { - "completion_length": 203.0, - "epoch": 0.6663479923518164, - "grad_norm": 6.87338924407959, - "kl": 0.08203125, - "learning_rate": 3.336520076481835e-07, - "loss": 0.0033, - "reward": 1.5325886011123657, - "reward_std": 0.0911032110452652, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4232136011123657, - "step": 2091 - }, - { - "completion_length": 121.09375, - "epoch": 0.6666666666666666, - "grad_norm": 22.599985122680664, - "kl": 0.1201171875, - "learning_rate": 3.333333333333333e-07, - "loss": 0.0048, - "reward": 1.4243464469909668, - "reward_std": 0.050836432725191116, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42434632778167725, - "rewards/pad": 0.0, - "step": 2092 - }, - { - "completion_length": 177.671875, - "epoch": 0.6669853409815168, - "grad_norm": 17.878097534179688, - "kl": 0.2255859375, - "learning_rate": 3.3301465901848305e-07, - "loss": 0.009, - "reward": 1.4137976169586182, - "reward_std": 0.06235119700431824, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.413797527551651, - "step": 2093 - }, - { - "completion_length": 46.15625, - "epoch": 0.6673040152963671, - "grad_norm": 24.08766746520996, - "kl": 0.2080078125, - "learning_rate": 3.3269598470363286e-07, - "loss": 0.0083, - "reward": 1.5770785808563232, - "reward_std": 0.08838474750518799, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4520787000656128, - "rewards/pad": 0.125, - "step": 2094 - }, - { - "completion_length": 124.09375, - "epoch": 0.6676226896112173, - "grad_norm": 31.90017318725586, - "kl": 0.10546875, - "learning_rate": 3.323773103887826e-07, - "loss": 0.0042, - "reward": 1.6145018339157104, - "reward_std": 0.1105368435382843, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5363768339157104, - "step": 2095 - }, - { - "completion_length": 148.34375, - "epoch": 0.6679413639260675, - "grad_norm": 33.11336135864258, - "kl": 0.1328125, - "learning_rate": 3.320586360739324e-07, - "loss": 0.0053, - "reward": 1.6489125490188599, - "reward_std": 0.056679897010326385, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5239125490188599, - "step": 2096 - }, - { - "completion_length": 97.296875, - "epoch": 0.6682600382409177, - "grad_norm": 93.13584899902344, - "kl": 0.1513671875, - "learning_rate": 3.317399617590822e-07, - "loss": 0.0061, - "reward": 1.645526647567749, - "reward_std": 0.17788124084472656, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5049015879631042, - "step": 2097 - }, - { - "completion_length": 96.53125, - "epoch": 0.668578712555768, - "grad_norm": 42.8403434753418, - "kl": 0.1533203125, - "learning_rate": 3.31421287444232e-07, - "loss": 0.0061, - "reward": 1.5992820262908936, - "reward_std": 0.053974829614162445, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5992820262908936, - "rewards/pad": 0.0, - "step": 2098 - }, - { - "completion_length": 122.390625, - "epoch": 0.6688973868706183, - "grad_norm": 16.354061126708984, - "kl": 0.09716796875, - "learning_rate": 3.3110261312938174e-07, - "loss": 0.0039, - "reward": 1.5621455907821655, - "reward_std": 0.10872915387153625, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4527706205844879, - "rewards/pad": 0.125, - "step": 2099 - }, - { - "completion_length": 95.90625, - "epoch": 0.6692160611854685, - "grad_norm": 25.892925262451172, - "kl": 0.1689453125, - "learning_rate": 3.3078393881453154e-07, - "loss": 0.0067, - "reward": 1.5702850818634033, - "reward_std": 0.08770357072353363, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5702850818634033, - "step": 2100 - }, - { - "completion_length": 95.1875, - "epoch": 0.6695347355003187, - "grad_norm": 76.42548370361328, - "kl": 0.25390625, - "learning_rate": 3.304652644996813e-07, - "loss": 0.0102, - "reward": 1.6273237466812134, - "reward_std": 0.12101085484027863, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6273237466812134, - "rewards/pad": 0.0, - "step": 2101 - }, - { - "completion_length": 123.578125, - "epoch": 0.6698534098151689, - "grad_norm": 37.725257873535156, - "kl": 0.10546875, - "learning_rate": 3.301465901848311e-07, - "loss": 0.0042, - "reward": 1.449527621269226, - "reward_std": 0.030608469620347023, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4495276212692261, - "rewards/pad": 0.0, - "step": 2102 - }, - { - "completion_length": 72.234375, - "epoch": 0.6701720841300192, - "grad_norm": 57.91838073730469, - "kl": 0.134765625, - "learning_rate": 3.2982791586998086e-07, - "loss": 0.0054, - "reward": 1.7783544063568115, - "reward_std": 0.0827033668756485, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6533544659614563, - "step": 2103 - }, - { - "completion_length": 73.046875, - "epoch": 0.6704907584448694, - "grad_norm": 55.47222900390625, - "kl": 0.1806640625, - "learning_rate": 3.2950924155513067e-07, - "loss": 0.0072, - "reward": 1.428412914276123, - "reward_std": 0.1598389595746994, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.44403794407844543, - "rewards/pad": 0.0, - "step": 2104 - }, - { - "completion_length": 97.203125, - "epoch": 0.6708094327597196, - "grad_norm": 29.002933502197266, - "kl": 0.2021484375, - "learning_rate": 3.291905672402804e-07, - "loss": 0.0081, - "reward": 1.4885926246643066, - "reward_std": 0.06171192601323128, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47296765446662903, - "rewards/pad": 0.015625, - "step": 2105 - }, - { - "completion_length": 149.90625, - "epoch": 0.6711281070745698, - "grad_norm": 20.084819793701172, - "kl": 0.1123046875, - "learning_rate": 3.2887189292543023e-07, - "loss": 0.0045, - "reward": 1.6652863025665283, - "reward_std": 0.06025255471467972, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5402863025665283, - "step": 2106 - }, - { - "completion_length": 21.0, - "epoch": 0.67144678138942, - "grad_norm": 62.11418914794922, - "kl": 0.19140625, - "learning_rate": 3.2855321861058e-07, - "loss": 0.0077, - "reward": 1.6471470594406128, - "reward_std": 0.156574547290802, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3658970594406128, - "rewards/pad": 0.28125, - "step": 2107 - }, - { - "completion_length": 72.640625, - "epoch": 0.6717654557042703, - "grad_norm": 34.148719787597656, - "kl": 0.82421875, - "learning_rate": 3.282345442957298e-07, - "loss": 0.033, - "reward": 1.6660782098770142, - "reward_std": 0.11351638287305832, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5410780906677246, - "rewards/pad": 0.125, - "step": 2108 - }, - { - "completion_length": 97.421875, - "epoch": 0.6720841300191205, - "grad_norm": 18.513395309448242, - "kl": 0.125, - "learning_rate": 3.279158699808795e-07, - "loss": 0.005, - "reward": 1.4584836959838867, - "reward_std": 0.10004407167434692, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45848363637924194, - "step": 2109 - }, - { - "completion_length": 124.90625, - "epoch": 0.6724028043339707, - "grad_norm": 21.997827529907227, - "kl": 0.134765625, - "learning_rate": 3.275971956660293e-07, - "loss": 0.0054, - "reward": 1.6060831546783447, - "reward_std": 0.06853923201560974, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4810832142829895, - "rewards/pad": 0.125, - "step": 2110 - }, - { - "completion_length": 151.21875, - "epoch": 0.6727214786488209, - "grad_norm": 32.82044219970703, - "kl": 0.11474609375, - "learning_rate": 3.2727852135117905e-07, - "loss": 0.0046, - "reward": 1.449620008468628, - "reward_std": 0.12432484328746796, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3714950680732727, - "step": 2111 - }, - { - "completion_length": 100.296875, - "epoch": 0.6730401529636711, - "grad_norm": 24.006595611572266, - "kl": 0.185546875, - "learning_rate": 3.2695984703632886e-07, - "loss": 0.0074, - "reward": 1.717818260192871, - "reward_std": 0.09362143278121948, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46781814098358154, - "step": 2112 - }, - { - "completion_length": 72.8125, - "epoch": 0.6733588272785214, - "grad_norm": 30.597938537597656, - "kl": 0.1533203125, - "learning_rate": 3.266411727214786e-07, - "loss": 0.0061, - "reward": 1.7185537815093994, - "reward_std": 0.16108709573745728, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.6404287815093994, - "step": 2113 - }, - { - "completion_length": 121.421875, - "epoch": 0.6736775015933716, - "grad_norm": 77.68666076660156, - "kl": 0.1669921875, - "learning_rate": 3.263224984066284e-07, - "loss": 0.0067, - "reward": 1.43209969997406, - "reward_std": 0.10472838580608368, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43209969997406006, - "rewards/pad": 0.0, - "step": 2114 - }, - { - "completion_length": 74.015625, - "epoch": 0.6739961759082218, - "grad_norm": 26.1304931640625, - "kl": 0.2578125, - "learning_rate": 3.2600382409177817e-07, - "loss": 0.0103, - "reward": 1.494581937789917, - "reward_std": 0.08470875024795532, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36958199739456177, - "step": 2115 - }, - { - "completion_length": 122.984375, - "epoch": 0.674314850223072, - "grad_norm": 49.73745346069336, - "kl": 0.1640625, - "learning_rate": 3.25685149776928e-07, - "loss": 0.0066, - "reward": 1.5139338970184326, - "reward_std": 0.1726274937391281, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4670588970184326, - "step": 2116 - }, - { - "completion_length": 121.171875, - "epoch": 0.6746335245379222, - "grad_norm": 192.2226104736328, - "kl": 0.1513671875, - "learning_rate": 3.2536647546207773e-07, - "loss": 0.006, - "reward": 1.5089627504348755, - "reward_std": 0.08061007410287857, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5089626908302307, - "rewards/pad": 0.0, - "step": 2117 - }, - { - "completion_length": 95.171875, - "epoch": 0.6749521988527725, - "grad_norm": 62.013916015625, - "kl": 0.173828125, - "learning_rate": 3.2504780114722754e-07, - "loss": 0.0069, - "reward": 1.700049877166748, - "reward_std": 0.07004581391811371, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.700049877166748, - "step": 2118 - }, - { - "completion_length": 124.34375, - "epoch": 0.6752708731676227, - "grad_norm": 26.129501342773438, - "kl": 0.1328125, - "learning_rate": 3.247291268323773e-07, - "loss": 0.0053, - "reward": 1.3937647342681885, - "reward_std": 0.09628598392009735, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.37813982367515564, - "rewards/pad": 0.015625, - "step": 2119 - }, - { - "completion_length": 98.859375, - "epoch": 0.6755895474824729, - "grad_norm": 25.754552841186523, - "kl": 0.1162109375, - "learning_rate": 3.244104525175271e-07, - "loss": 0.0047, - "reward": 1.6929116249084473, - "reward_std": 0.06136511638760567, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5679115653038025, - "rewards/pad": 0.125, - "step": 2120 - }, - { - "completion_length": 68.734375, - "epoch": 0.6759082217973231, - "grad_norm": 39.082984924316406, - "kl": 0.134765625, - "learning_rate": 3.2409177820267686e-07, - "loss": 0.0054, - "reward": 1.6138142347335815, - "reward_std": 0.038715608417987823, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6138142347335815, - "rewards/pad": 0.0, - "step": 2121 - }, - { - "completion_length": 69.90625, - "epoch": 0.6762268961121733, - "grad_norm": 35.8658561706543, - "kl": 0.1328125, - "learning_rate": 3.2377310388782666e-07, - "loss": 0.0053, - "reward": 1.545872688293457, - "reward_std": 0.1844017207622528, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40524768829345703, - "step": 2122 - }, - { - "completion_length": 73.578125, - "epoch": 0.6765455704270236, - "grad_norm": 35.3978157043457, - "kl": 0.130859375, - "learning_rate": 3.234544295729764e-07, - "loss": 0.0052, - "reward": 1.9951844215393066, - "reward_std": 0.11782717704772949, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6201843619346619, - "step": 2123 - }, - { - "completion_length": 97.5, - "epoch": 0.6768642447418738, - "grad_norm": 31.411121368408203, - "kl": 0.15234375, - "learning_rate": 3.2313575525812617e-07, - "loss": 0.0061, - "reward": 1.5026302337646484, - "reward_std": 0.09135762602090836, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5026302933692932, - "rewards/pad": 0.0, - "step": 2124 - }, - { - "completion_length": 98.546875, - "epoch": 0.677182919056724, - "grad_norm": 82.23831939697266, - "kl": 0.11669921875, - "learning_rate": 3.22817080943276e-07, - "loss": 0.0047, - "reward": 1.466087818145752, - "reward_std": 0.0600445456802845, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46608781814575195, - "step": 2125 - }, - { - "completion_length": 97.1875, - "epoch": 0.6775015933715742, - "grad_norm": 20.297983169555664, - "kl": 0.1953125, - "learning_rate": 3.2249840662842573e-07, - "loss": 0.0078, - "reward": 1.5842697620391846, - "reward_std": 0.0901041328907013, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5842697620391846, - "step": 2126 - }, - { - "completion_length": 123.6875, - "epoch": 0.6778202676864244, - "grad_norm": 24.40261459350586, - "kl": 0.10693359375, - "learning_rate": 3.2217973231357554e-07, - "loss": 0.0043, - "reward": 1.6924340724945068, - "reward_std": 0.04547817260026932, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6924340724945068, - "rewards/pad": 0.0, - "step": 2127 - }, - { - "completion_length": 123.84375, - "epoch": 0.6781389420012747, - "grad_norm": 30.418781280517578, - "kl": 0.08349609375, - "learning_rate": 3.2186105799872524e-07, - "loss": 0.0033, - "reward": 1.8715764284133911, - "reward_std": 0.07162997871637344, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4965764582157135, - "step": 2128 - }, - { - "completion_length": 145.203125, - "epoch": 0.6784576163161249, - "grad_norm": 7.301049709320068, - "kl": 0.0869140625, - "learning_rate": 3.2154238368387505e-07, - "loss": 0.0035, - "reward": 1.5839428901672363, - "reward_std": 0.03347952663898468, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5839429497718811, - "rewards/pad": 0.0, - "step": 2129 - }, - { - "completion_length": 71.828125, - "epoch": 0.6787762906309751, - "grad_norm": 68.99181365966797, - "kl": 0.275390625, - "learning_rate": 3.212237093690248e-07, - "loss": 0.011, - "reward": 1.4546984434127808, - "reward_std": 0.05381061136722565, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45469844341278076, - "rewards/pad": 0.0, - "step": 2130 - }, - { - "completion_length": 96.015625, - "epoch": 0.6790949649458253, - "grad_norm": 22.433712005615234, - "kl": 0.1259765625, - "learning_rate": 3.209050350541746e-07, - "loss": 0.0051, - "reward": 1.629838466644287, - "reward_std": 0.0413581021130085, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6298385858535767, - "rewards/pad": 0.0, - "step": 2131 - }, - { - "completion_length": 123.984375, - "epoch": 0.6794136392606756, - "grad_norm": 33.9914665222168, - "kl": 0.126953125, - "learning_rate": 3.2058636073932436e-07, - "loss": 0.0051, - "reward": 1.6053473949432373, - "reward_std": 0.10383665561676025, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4647223949432373, - "step": 2132 - }, - { - "completion_length": 71.265625, - "epoch": 0.6797323135755258, - "grad_norm": 26.708370208740234, - "kl": 0.26171875, - "learning_rate": 3.2026768642447417e-07, - "loss": 0.0104, - "reward": 1.5721994638442993, - "reward_std": 0.10347621887922287, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5721994638442993, - "step": 2133 - }, - { - "completion_length": 71.03125, - "epoch": 0.680050987890376, - "grad_norm": 61.37842559814453, - "kl": 0.2080078125, - "learning_rate": 3.199490121096239e-07, - "loss": 0.0083, - "reward": 1.6109099388122559, - "reward_std": 0.13730362057685852, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5171599388122559, - "rewards/pad": 0.09375, - "step": 2134 - }, - { - "completion_length": 153.796875, - "epoch": 0.6803696622052262, - "grad_norm": 35.16779327392578, - "kl": 0.11669921875, - "learning_rate": 3.1963033779477373e-07, - "loss": 0.0047, - "reward": 1.4929907321929932, - "reward_std": 0.055013976991176605, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4929906725883484, - "step": 2135 - }, - { - "completion_length": 123.671875, - "epoch": 0.6806883365200764, - "grad_norm": 44.921199798583984, - "kl": 0.146484375, - "learning_rate": 3.193116634799235e-07, - "loss": 0.0058, - "reward": 1.6867095232009888, - "reward_std": 0.0794215276837349, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43670958280563354, - "step": 2136 - }, - { - "completion_length": 73.765625, - "epoch": 0.6810070108349267, - "grad_norm": 40.000160217285156, - "kl": 0.1376953125, - "learning_rate": 3.189929891650733e-07, - "loss": 0.0055, - "reward": 1.6728726625442505, - "reward_std": 0.13227654993534088, - "rewards/answer_reward": 0.09375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5791226625442505, - "step": 2137 - }, - { - "completion_length": 96.71875, - "epoch": 0.681325685149777, - "grad_norm": 19.429340362548828, - "kl": 0.1474609375, - "learning_rate": 3.1867431485022304e-07, - "loss": 0.0059, - "reward": 1.6199922561645508, - "reward_std": 0.0660354495048523, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.619992196559906, - "rewards/pad": 0.0, - "step": 2138 - }, - { - "completion_length": 44.859375, - "epoch": 0.6816443594646272, - "grad_norm": 30.003305435180664, - "kl": 0.1865234375, - "learning_rate": 3.1835564053537285e-07, - "loss": 0.0074, - "reward": 1.820550799369812, - "reward_std": 0.04275095835328102, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6955506801605225, - "rewards/pad": 0.125, - "step": 2139 - }, - { - "completion_length": 72.421875, - "epoch": 0.6819630337794774, - "grad_norm": 46.98480224609375, - "kl": 0.240234375, - "learning_rate": 3.180369662205226e-07, - "loss": 0.0096, - "reward": 1.5586421489715576, - "reward_std": 0.09271854162216187, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43364202976226807, - "step": 2140 - }, - { - "completion_length": 123.59375, - "epoch": 0.6822817080943276, - "grad_norm": 19.73856544494629, - "kl": 0.158203125, - "learning_rate": 3.177182919056724e-07, - "loss": 0.0063, - "reward": 1.6235109567642212, - "reward_std": 0.10110151022672653, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4985108971595764, - "rewards/pad": 0.125, - "step": 2141 - }, - { - "completion_length": 73.046875, - "epoch": 0.6826003824091779, - "grad_norm": 48.96926498413086, - "kl": 0.201171875, - "learning_rate": 3.1739961759082217e-07, - "loss": 0.008, - "reward": 1.6435010433197021, - "reward_std": 0.14474226534366608, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.40912607312202454, - "rewards/pad": 0.25, - "step": 2142 - }, - { - "completion_length": 97.671875, - "epoch": 0.6829190567240281, - "grad_norm": 85.29547882080078, - "kl": 0.2490234375, - "learning_rate": 3.17080943275972e-07, - "loss": 0.01, - "reward": 1.5109562873840332, - "reward_std": 0.22265812754631042, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 0.96875, - "rewards/tracking_iou_reward": 0.4797063171863556, - "step": 2143 - }, - { - "completion_length": 70.109375, - "epoch": 0.6832377310388783, - "grad_norm": 47.72813415527344, - "kl": 0.154296875, - "learning_rate": 3.1676226896112173e-07, - "loss": 0.0062, - "reward": 1.8669711351394653, - "reward_std": 0.062427740544080734, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6169711947441101, - "step": 2144 - }, - { - "completion_length": 148.578125, - "epoch": 0.6835564053537285, - "grad_norm": 15.509635925292969, - "kl": 0.11865234375, - "learning_rate": 3.1644359464627153e-07, - "loss": 0.0047, - "reward": 1.3581414222717285, - "reward_std": 0.022797567769885063, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3581415116786957, - "step": 2145 - }, - { - "completion_length": 149.234375, - "epoch": 0.6838750796685787, - "grad_norm": 108.00698852539062, - "kl": 0.10791015625, - "learning_rate": 3.161249203314213e-07, - "loss": 0.0043, - "reward": 1.6501152515411377, - "reward_std": 0.053191155195236206, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5251152515411377, - "step": 2146 - }, - { - "completion_length": 97.5, - "epoch": 0.684193753983429, - "grad_norm": 141.09266662597656, - "kl": 0.236328125, - "learning_rate": 3.158062460165711e-07, - "loss": 0.0094, - "reward": 1.5822439193725586, - "reward_std": 0.1238677054643631, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5822440385818481, - "rewards/pad": 0.0, - "step": 2147 - }, - { - "completion_length": 70.828125, - "epoch": 0.6845124282982792, - "grad_norm": 50.54616928100586, - "kl": 0.1669921875, - "learning_rate": 3.154875717017208e-07, - "loss": 0.0067, - "reward": 1.46274733543396, - "reward_std": 0.07585113495588303, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4627472162246704, - "rewards/pad": 0.0, - "step": 2148 - }, - { - "completion_length": 46.609375, - "epoch": 0.6848311026131294, - "grad_norm": 37.283790588378906, - "kl": 0.296875, - "learning_rate": 3.151688973868706e-07, - "loss": 0.0119, - "reward": 1.5692379474639893, - "reward_std": 0.08897761255502701, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.569238007068634, - "rewards/pad": 0.0, - "step": 2149 - }, - { - "completion_length": 97.75, - "epoch": 0.6851497769279796, - "grad_norm": 34.73873519897461, - "kl": 0.11083984375, - "learning_rate": 3.1485022307202036e-07, - "loss": 0.0044, - "reward": 1.595937728881836, - "reward_std": 0.06123333424329758, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4709377884864807, - "step": 2150 - }, - { - "completion_length": 119.0625, - "epoch": 0.6854684512428298, - "grad_norm": 19.22545051574707, - "kl": 0.220703125, - "learning_rate": 3.1453154875717016e-07, - "loss": 0.0088, - "reward": 1.591064691543579, - "reward_std": 0.13165190815925598, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4816897511482239, - "rewards/pad": 0.109375, - "step": 2151 - }, - { - "completion_length": 71.265625, - "epoch": 0.6857871255576801, - "grad_norm": 34.94192123413086, - "kl": 0.1572265625, - "learning_rate": 3.142128744423199e-07, - "loss": 0.0063, - "reward": 1.983314871788025, - "reward_std": 0.05449950322508812, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7333148717880249, - "rewards/pad": 0.25, - "step": 2152 - }, - { - "completion_length": 96.34375, - "epoch": 0.6861057998725303, - "grad_norm": 71.74163818359375, - "kl": 0.1533203125, - "learning_rate": 3.1389420012746967e-07, - "loss": 0.0061, - "reward": 1.654395341873169, - "reward_std": 0.10707074403762817, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.529395341873169, - "step": 2153 - }, - { - "completion_length": 124.5, - "epoch": 0.6864244741873805, - "grad_norm": 23.12221336364746, - "kl": 0.12890625, - "learning_rate": 3.135755258126195e-07, - "loss": 0.0051, - "reward": 1.7158911228179932, - "reward_std": 0.04626501724123955, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5908911824226379, - "rewards/pad": 0.125, - "step": 2154 - }, - { - "completion_length": 97.859375, - "epoch": 0.6867431485022307, - "grad_norm": 10.936779022216797, - "kl": 0.1953125, - "learning_rate": 3.1325685149776923e-07, - "loss": 0.0078, - "reward": 1.6908012628555298, - "reward_std": 0.10128079354763031, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5658012628555298, - "rewards/pad": 0.125, - "step": 2155 - }, - { - "completion_length": 71.265625, - "epoch": 0.687061822817081, - "grad_norm": 110.206787109375, - "kl": 0.240234375, - "learning_rate": 3.1293817718291904e-07, - "loss": 0.0096, - "reward": 1.594151258468628, - "reward_std": 0.12307152897119522, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46915125846862793, - "step": 2156 - }, - { - "completion_length": 45.453125, - "epoch": 0.6873804971319312, - "grad_norm": 68.23284149169922, - "kl": 0.27734375, - "learning_rate": 3.126195028680688e-07, - "loss": 0.0111, - "reward": 1.5062565803527832, - "reward_std": 0.04553546756505966, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3812566101551056, - "rewards/pad": 0.125, - "step": 2157 - }, - { - "completion_length": 72.828125, - "epoch": 0.6876991714467814, - "grad_norm": 45.148319244384766, - "kl": 0.23828125, - "learning_rate": 3.123008285532186e-07, - "loss": 0.0095, - "reward": 1.7396340370178223, - "reward_std": 0.07617457211017609, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48963409662246704, - "step": 2158 - }, - { - "completion_length": 148.390625, - "epoch": 0.6880178457616316, - "grad_norm": 25.591800689697266, - "kl": 0.11279296875, - "learning_rate": 3.1198215423836836e-07, - "loss": 0.0045, - "reward": 1.4891088008880615, - "reward_std": 0.04170014709234238, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.489108681678772, - "rewards/pad": 0.0, - "step": 2159 - }, - { - "completion_length": 99.796875, - "epoch": 0.6883365200764818, - "grad_norm": 17.934783935546875, - "kl": 0.1796875, - "learning_rate": 3.1166347992351816e-07, - "loss": 0.0072, - "reward": 1.5252175331115723, - "reward_std": 0.12577782571315765, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.96875, - "rewards/iou_glue_reward": 0.4314674735069275, - "step": 2160 - }, - { - "completion_length": 72.671875, - "epoch": 0.688655194391332, - "grad_norm": 51.22257995605469, - "kl": 0.16796875, - "learning_rate": 3.113448056086679e-07, - "loss": 0.0067, - "reward": 1.6587944030761719, - "reward_std": 0.20532920956611633, - "rewards/pad": 0.15625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5025444626808167, - "step": 2161 - }, - { - "completion_length": 98.578125, - "epoch": 0.6889738687061823, - "grad_norm": 55.02689743041992, - "kl": 0.1533203125, - "learning_rate": 3.110261312938177e-07, - "loss": 0.0061, - "reward": 1.7418324947357178, - "reward_std": 0.13208447396755219, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5074574947357178, - "step": 2162 - }, - { - "completion_length": 71.15625, - "epoch": 0.6892925430210325, - "grad_norm": 18.410661697387695, - "kl": 0.1904296875, - "learning_rate": 3.107074569789675e-07, - "loss": 0.0076, - "reward": 1.748297095298767, - "reward_std": 0.11876393109560013, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6389220952987671, - "step": 2163 - }, - { - "completion_length": 45.203125, - "epoch": 0.6896112173358827, - "grad_norm": 34.00616455078125, - "kl": 0.2216796875, - "learning_rate": 3.103887826641173e-07, - "loss": 0.0089, - "reward": 1.6725029945373535, - "reward_std": 0.0798911452293396, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5475030541419983, - "rewards/pad": 0.125, - "step": 2164 - }, - { - "completion_length": 72.4375, - "epoch": 0.6899298916507329, - "grad_norm": 74.08090209960938, - "kl": 0.1484375, - "learning_rate": 3.1007010834926704e-07, - "loss": 0.0059, - "reward": 1.4844574928283691, - "reward_std": 0.1599559187889099, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.37508249282836914, - "rewards/pad": 0.125, - "step": 2165 - }, - { - "completion_length": 98.1875, - "epoch": 0.6902485659655831, - "grad_norm": 75.02046966552734, - "kl": 0.1337890625, - "learning_rate": 3.0975143403441685e-07, - "loss": 0.0053, - "reward": 1.6016144752502441, - "reward_std": 0.09428554773330688, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.36723947525024414, - "step": 2166 - }, - { - "completion_length": 98.578125, - "epoch": 0.6905672402804334, - "grad_norm": 56.74724578857422, - "kl": 0.1708984375, - "learning_rate": 3.094327597195666e-07, - "loss": 0.0068, - "reward": 1.6080045700073242, - "reward_std": 0.0633222758769989, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48300448060035706, - "step": 2167 - }, - { - "completion_length": 120.046875, - "epoch": 0.6908859145952836, - "grad_norm": 48.74546432495117, - "kl": 0.1953125, - "learning_rate": 3.0911408540471635e-07, - "loss": 0.0078, - "reward": 1.5229651927947998, - "reward_std": 0.1694946438074112, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4448402225971222, - "step": 2168 - }, - { - "completion_length": 70.25, - "epoch": 0.6912045889101338, - "grad_norm": 22.903034210205078, - "kl": 0.166015625, - "learning_rate": 3.087954110898661e-07, - "loss": 0.0066, - "reward": 1.5259134769439697, - "reward_std": 0.09347060322761536, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.525913417339325, - "rewards/pad": 0.0, - "step": 2169 - }, - { - "completion_length": 72.53125, - "epoch": 0.691523263224984, - "grad_norm": 28.263362884521484, - "kl": 0.26171875, - "learning_rate": 3.084767367750159e-07, - "loss": 0.0105, - "reward": 1.703984260559082, - "reward_std": 0.08166246116161346, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5789843797683716, - "step": 2170 - }, - { - "completion_length": 124.125, - "epoch": 0.6918419375398343, - "grad_norm": 47.77621841430664, - "kl": 0.1328125, - "learning_rate": 3.0815806246016567e-07, - "loss": 0.0053, - "reward": 1.582489252090454, - "reward_std": 0.07792859524488449, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5824892520904541, - "step": 2171 - }, - { - "completion_length": 124.734375, - "epoch": 0.6921606118546845, - "grad_norm": 59.42560577392578, - "kl": 0.1015625, - "learning_rate": 3.078393881453155e-07, - "loss": 0.0041, - "reward": 1.4870238304138184, - "reward_std": 0.05382505804300308, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.36202380061149597, - "rewards/pad": 0.125, - "step": 2172 - }, - { - "completion_length": 69.71875, - "epoch": 0.6924792861695347, - "grad_norm": 58.80769729614258, - "kl": 0.2041015625, - "learning_rate": 3.0752071383046523e-07, - "loss": 0.0082, - "reward": 1.573935866355896, - "reward_std": 0.10074318945407867, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5739358067512512, - "rewards/pad": 0.0, - "step": 2173 - }, - { - "completion_length": 71.125, - "epoch": 0.6927979604843849, - "grad_norm": 85.91229248046875, - "kl": 0.154296875, - "learning_rate": 3.0720203951561504e-07, - "loss": 0.0062, - "reward": 1.6171116828918457, - "reward_std": 0.07128950953483582, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4921117424964905, - "rewards/pad": 0.125, - "step": 2174 - }, - { - "completion_length": 44.8125, - "epoch": 0.6931166347992351, - "grad_norm": 82.84741973876953, - "kl": 0.3125, - "learning_rate": 3.068833652007648e-07, - "loss": 0.0125, - "reward": 1.5185716152191162, - "reward_std": 0.09817488491535187, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3779466450214386, - "rewards/pad": 0.140625, - "step": 2175 - }, - { - "completion_length": 97.875, - "epoch": 0.6934353091140854, - "grad_norm": 12.625312805175781, - "kl": 0.091796875, - "learning_rate": 3.065646908859146e-07, - "loss": 0.0037, - "reward": 1.7972065210342407, - "reward_std": 0.04230862110853195, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5472065806388855, - "rewards/pad": 0.25, - "step": 2176 - }, - { - "completion_length": 96.515625, - "epoch": 0.6937539834289357, - "grad_norm": 15.882893562316895, - "kl": 0.1572265625, - "learning_rate": 3.0624601657106435e-07, - "loss": 0.0063, - "reward": 1.6735115051269531, - "reward_std": 0.05752333253622055, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6735113859176636, - "rewards/pad": 0.0, - "step": 2177 - }, - { - "completion_length": 97.015625, - "epoch": 0.6940726577437859, - "grad_norm": 26.218341827392578, - "kl": 0.2099609375, - "learning_rate": 3.0592734225621416e-07, - "loss": 0.0084, - "reward": 1.4729814529418945, - "reward_std": 0.06405826658010483, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4729815721511841, - "step": 2178 - }, - { - "completion_length": 123.421875, - "epoch": 0.6943913320586361, - "grad_norm": 54.89400863647461, - "kl": 0.1708984375, - "learning_rate": 3.056086679413639e-07, - "loss": 0.0068, - "reward": 1.4379938840866089, - "reward_std": 0.1479625701904297, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.45361894369125366, - "rewards/pad": 0.0, - "step": 2179 - }, - { - "completion_length": 123.84375, - "epoch": 0.6947100063734863, - "grad_norm": 24.94068717956543, - "kl": 0.1435546875, - "learning_rate": 3.052899936265137e-07, - "loss": 0.0058, - "reward": 1.5022281408309937, - "reward_std": 0.09763312339782715, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5178531408309937, - "step": 2180 - }, - { - "completion_length": 45.390625, - "epoch": 0.6950286806883366, - "grad_norm": 47.86198806762695, - "kl": 0.1484375, - "learning_rate": 3.049713193116635e-07, - "loss": 0.0059, - "reward": 1.6937261819839478, - "reward_std": 0.10928689688444138, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5531011819839478, - "rewards/pad": 0.140625, - "step": 2181 - }, - { - "completion_length": 151.015625, - "epoch": 0.6953473550031868, - "grad_norm": 31.436471939086914, - "kl": 0.07861328125, - "learning_rate": 3.046526449968133e-07, - "loss": 0.0031, - "reward": 1.3685061931610107, - "reward_std": 0.14001545310020447, - "rewards/pad": 0.03125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.35288113355636597, - "step": 2182 - }, - { - "completion_length": 122.15625, - "epoch": 0.695666029318037, - "grad_norm": 33.87594223022461, - "kl": 0.1201171875, - "learning_rate": 3.0433397068196304e-07, - "loss": 0.0048, - "reward": 1.4364391565322876, - "reward_std": 0.13284854590892792, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4051891565322876, - "rewards/pad": 0.03125, - "step": 2183 - }, - { - "completion_length": 97.921875, - "epoch": 0.6959847036328872, - "grad_norm": 56.98023223876953, - "kl": 0.1279296875, - "learning_rate": 3.0401529636711284e-07, - "loss": 0.0051, - "reward": 1.6480770111083984, - "reward_std": 0.07613584399223328, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3980771005153656, - "step": 2184 - }, - { - "completion_length": 17.8125, - "epoch": 0.6963033779477374, - "grad_norm": 44.34600830078125, - "kl": 0.302734375, - "learning_rate": 3.036966220522626e-07, - "loss": 0.0121, - "reward": 1.6638925075531006, - "reward_std": 0.07695624232292175, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6638926267623901, - "rewards/pad": 0.0, - "step": 2185 - }, - { - "completion_length": 69.046875, - "epoch": 0.6966220522625877, - "grad_norm": 23.324520111083984, - "kl": 0.310546875, - "learning_rate": 3.0337794773741235e-07, - "loss": 0.0124, - "reward": 1.2351598739624023, - "reward_std": 0.0588654950261116, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.23515987396240234, - "rewards/pad": 0.0, - "step": 2186 - }, - { - "completion_length": 123.234375, - "epoch": 0.6969407265774379, - "grad_norm": 28.746795654296875, - "kl": 0.5625, - "learning_rate": 3.030592734225621e-07, - "loss": 0.0226, - "reward": 1.597653865814209, - "reward_std": 0.05662325769662857, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.597653865814209, - "step": 2187 - }, - { - "completion_length": 68.765625, - "epoch": 0.6972594008922881, - "grad_norm": 55.966339111328125, - "kl": 0.1455078125, - "learning_rate": 3.0274059910771186e-07, - "loss": 0.0058, - "reward": 1.6799708604812622, - "reward_std": 0.055459875613451004, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6799708604812622, - "rewards/pad": 0.0, - "step": 2188 - }, - { - "completion_length": 96.765625, - "epoch": 0.6975780752071383, - "grad_norm": 84.16297149658203, - "kl": 0.1748046875, - "learning_rate": 3.0242192479286167e-07, - "loss": 0.007, - "reward": 1.4844096899032593, - "reward_std": 0.080804742872715, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4844096601009369, - "rewards/pad": 0.0, - "step": 2189 - }, - { - "completion_length": 122.671875, - "epoch": 0.6978967495219885, - "grad_norm": 160.37826538085938, - "kl": 0.13671875, - "learning_rate": 3.021032504780114e-07, - "loss": 0.0055, - "reward": 1.465484619140625, - "reward_std": 0.08741322159767151, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3404846787452698, - "rewards/pad": 0.125, - "step": 2190 - }, - { - "completion_length": 75.34375, - "epoch": 0.6982154238368388, - "grad_norm": 86.90062713623047, - "kl": 0.142578125, - "learning_rate": 3.017845761631612e-07, - "loss": 0.0057, - "reward": 1.9296181201934814, - "reward_std": 0.07115404307842255, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5546181201934814, - "step": 2191 - }, - { - "completion_length": 71.609375, - "epoch": 0.698534098151689, - "grad_norm": 48.61540603637695, - "kl": 0.1455078125, - "learning_rate": 3.01465901848311e-07, - "loss": 0.0058, - "reward": 1.573476791381836, - "reward_std": 0.042799632996320724, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44847679138183594, - "rewards/pad": 0.125, - "step": 2192 - }, - { - "completion_length": 174.796875, - "epoch": 0.6988527724665392, - "grad_norm": 10.433985710144043, - "kl": 0.0986328125, - "learning_rate": 3.011472275334608e-07, - "loss": 0.0039, - "reward": 1.439502239227295, - "reward_std": 0.03480418771505356, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4395022988319397, - "step": 2193 - }, - { - "completion_length": 96.96875, - "epoch": 0.6991714467813894, - "grad_norm": 22.337554931640625, - "kl": 0.2265625, - "learning_rate": 3.0082855321861054e-07, - "loss": 0.0091, - "reward": 1.6683993339538574, - "reward_std": 0.13437384366989136, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6058992743492126, - "rewards/pad": 0.0625, - "step": 2194 - }, - { - "completion_length": 124.328125, - "epoch": 0.6994901210962396, - "grad_norm": 87.454345703125, - "kl": 0.14453125, - "learning_rate": 3.0050987890376035e-07, - "loss": 0.0058, - "reward": 1.5993907451629639, - "reward_std": 0.10592551529407501, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4743907153606415, - "step": 2195 - }, - { - "completion_length": 44.34375, - "epoch": 0.6998087954110899, - "grad_norm": 45.15059280395508, - "kl": 0.2197265625, - "learning_rate": 3.001912045889101e-07, - "loss": 0.0088, - "reward": 1.6603868007659912, - "reward_std": 0.06433721631765366, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5353868007659912, - "rewards/pad": 0.125, - "step": 2196 - }, - { - "completion_length": 70.53125, - "epoch": 0.7001274697259401, - "grad_norm": 47.8062858581543, - "kl": 0.1337890625, - "learning_rate": 2.998725302740599e-07, - "loss": 0.0054, - "reward": 1.5866845846176147, - "reward_std": 0.04083329066634178, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.58668452501297, - "rewards/pad": 0.0, - "step": 2197 - }, - { - "completion_length": 94.171875, - "epoch": 0.7004461440407903, - "grad_norm": 24.064556121826172, - "kl": 0.125, - "learning_rate": 2.9955385595920966e-07, - "loss": 0.005, - "reward": 1.5073517560958862, - "reward_std": 0.05560840666294098, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5073516964912415, - "rewards/pad": 0.0, - "step": 2198 - }, - { - "completion_length": 20.09375, - "epoch": 0.7007648183556405, - "grad_norm": 75.1050796508789, - "kl": 0.2578125, - "learning_rate": 2.9923518164435947e-07, - "loss": 0.0103, - "reward": 1.6298410892486572, - "reward_std": 0.13905611634254456, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5048412084579468, - "step": 2199 - }, - { - "completion_length": 70.046875, - "epoch": 0.7010834926704907, - "grad_norm": 72.1885757446289, - "kl": 0.201171875, - "learning_rate": 2.989165073295092e-07, - "loss": 0.008, - "reward": 1.5954376459121704, - "reward_std": 0.18055960536003113, - "rewards/answer_reward": 0.171875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4235627055168152, - "step": 2200 - }, - { - "completion_length": 97.28125, - "epoch": 0.701402166985341, - "grad_norm": 106.49429321289062, - "kl": 0.201171875, - "learning_rate": 2.9859783301465903e-07, - "loss": 0.0081, - "reward": 1.68535315990448, - "reward_std": 0.08157364279031754, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5603532195091248, - "rewards/pad": 0.125, - "step": 2201 - }, - { - "completion_length": 98.109375, - "epoch": 0.7017208413001912, - "grad_norm": 28.185997009277344, - "kl": 0.138671875, - "learning_rate": 2.982791586998088e-07, - "loss": 0.0055, - "reward": 1.6847625970840454, - "reward_std": 0.11430058628320694, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5910125970840454, - "step": 2202 - }, - { - "completion_length": 176.484375, - "epoch": 0.7020395156150414, - "grad_norm": 37.22420120239258, - "kl": 0.068359375, - "learning_rate": 2.979604843849586e-07, - "loss": 0.0027, - "reward": 1.5570272207260132, - "reward_std": 0.04657837003469467, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4320272207260132, - "step": 2203 - }, - { - "completion_length": 123.421875, - "epoch": 0.7023581899298916, - "grad_norm": 40.63876724243164, - "kl": 0.12158203125, - "learning_rate": 2.9764181007010835e-07, - "loss": 0.0049, - "reward": 1.5214793682098389, - "reward_std": 0.05515363812446594, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5214793682098389, - "rewards/pad": 0.0, - "step": 2204 - }, - { - "completion_length": 71.234375, - "epoch": 0.7026768642447419, - "grad_norm": 100.44686889648438, - "kl": 0.205078125, - "learning_rate": 2.9732313575525815e-07, - "loss": 0.0082, - "reward": 1.6354838609695435, - "reward_std": 0.15714408457279205, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.635483980178833, - "step": 2205 - }, - { - "completion_length": 99.71875, - "epoch": 0.7029955385595921, - "grad_norm": 209.54771423339844, - "kl": 0.1181640625, - "learning_rate": 2.970044614404079e-07, - "loss": 0.0047, - "reward": 1.630167007446289, - "reward_std": 0.07337269186973572, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5051670074462891, - "step": 2206 - }, - { - "completion_length": 122.953125, - "epoch": 0.7033142128744423, - "grad_norm": 9.853918075561523, - "kl": 0.1357421875, - "learning_rate": 2.9668578712555766e-07, - "loss": 0.0054, - "reward": 1.50269615650177, - "reward_std": 0.06395269185304642, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.50269615650177, - "step": 2207 - }, - { - "completion_length": 47.359375, - "epoch": 0.7036328871892925, - "grad_norm": 55.91080093383789, - "kl": 0.158203125, - "learning_rate": 2.963671128107074e-07, - "loss": 0.0064, - "reward": 1.7377474308013916, - "reward_std": 0.05993042141199112, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4877474904060364, - "rewards/pad": 0.25, - "step": 2208 - }, - { - "completion_length": 148.03125, - "epoch": 0.7039515615041427, - "grad_norm": 18.883148193359375, - "kl": 0.09619140625, - "learning_rate": 2.960484384958572e-07, - "loss": 0.0038, - "reward": 1.3356266021728516, - "reward_std": 0.029426738619804382, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.33562666177749634, - "rewards/pad": 0.0, - "step": 2209 - }, - { - "completion_length": 96.0, - "epoch": 0.704270235818993, - "grad_norm": 17.142446517944336, - "kl": 0.1708984375, - "learning_rate": 2.95729764181007e-07, - "loss": 0.0068, - "reward": 1.6269714832305908, - "reward_std": 0.059701986610889435, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6269715428352356, - "step": 2210 - }, - { - "completion_length": 96.640625, - "epoch": 0.7045889101338432, - "grad_norm": 23.42155647277832, - "kl": 0.2421875, - "learning_rate": 2.954110898661568e-07, - "loss": 0.0097, - "reward": 1.6402925252914429, - "reward_std": 0.0891985148191452, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5152925252914429, - "step": 2211 - }, - { - "completion_length": 71.421875, - "epoch": 0.7049075844486934, - "grad_norm": 135.06629943847656, - "kl": 0.14453125, - "learning_rate": 2.9509241555130654e-07, - "loss": 0.0058, - "reward": 1.638238787651062, - "reward_std": 0.06692403554916382, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.638238787651062, - "step": 2212 - }, - { - "completion_length": 146.78125, - "epoch": 0.7052262587635436, - "grad_norm": 43.68022155761719, - "kl": 0.1044921875, - "learning_rate": 2.9477374123645634e-07, - "loss": 0.0042, - "reward": 1.5327023267745972, - "reward_std": 0.100974440574646, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42332732677459717, - "rewards/pad": 0.109375, - "step": 2213 - }, - { - "completion_length": 118.171875, - "epoch": 0.7055449330783938, - "grad_norm": 39.241512298583984, - "kl": 0.1474609375, - "learning_rate": 2.944550669216061e-07, - "loss": 0.0059, - "reward": 1.5181466341018677, - "reward_std": 0.05647048354148865, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5181467533111572, - "step": 2214 - }, - { - "completion_length": 97.328125, - "epoch": 0.705863607393244, - "grad_norm": 55.47908020019531, - "kl": 0.1767578125, - "learning_rate": 2.9413639260675585e-07, - "loss": 0.0071, - "reward": 1.6087799072265625, - "reward_std": 0.063574880361557, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4837798476219177, - "rewards/pad": 0.125, - "step": 2215 - }, - { - "completion_length": 45.9375, - "epoch": 0.7061822817080943, - "grad_norm": 37.14297103881836, - "kl": 0.2138671875, - "learning_rate": 2.9381771829190566e-07, - "loss": 0.0085, - "reward": 1.868995189666748, - "reward_std": 0.07784055173397064, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.618995189666748, - "rewards/pad": 0.25, - "step": 2216 - }, - { - "completion_length": 119.46875, - "epoch": 0.7065009560229446, - "grad_norm": 20.412870407104492, - "kl": 0.150390625, - "learning_rate": 2.934990439770554e-07, - "loss": 0.006, - "reward": 1.7465416193008423, - "reward_std": 0.04842020571231842, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6215416193008423, - "step": 2217 - }, - { - "completion_length": 71.53125, - "epoch": 0.7068196303377948, - "grad_norm": 26.621227264404297, - "kl": 0.09423828125, - "learning_rate": 2.931803696622052e-07, - "loss": 0.0038, - "reward": 1.6498267650604248, - "reward_std": 0.08941024541854858, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.54045170545578, - "step": 2218 - }, - { - "completion_length": 124.578125, - "epoch": 0.707138304652645, - "grad_norm": 37.394710540771484, - "kl": 0.16015625, - "learning_rate": 2.92861695347355e-07, - "loss": 0.0064, - "reward": 1.5619118213653564, - "reward_std": 0.19820812344551086, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.5931618213653564, - "rewards/pad": 0.0, - "step": 2219 - }, - { - "completion_length": 154.5625, - "epoch": 0.7074569789674953, - "grad_norm": 12.474699974060059, - "kl": 0.11865234375, - "learning_rate": 2.925430210325048e-07, - "loss": 0.0048, - "reward": 1.529463768005371, - "reward_std": 0.045225899666547775, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4044637084007263, - "step": 2220 - }, - { - "completion_length": 99.234375, - "epoch": 0.7077756532823455, - "grad_norm": 27.70870018005371, - "kl": 0.11962890625, - "learning_rate": 2.9222434671765454e-07, - "loss": 0.0048, - "reward": 1.8728269338607788, - "reward_std": 0.11273490637540817, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6384519934654236, - "step": 2221 - }, - { - "completion_length": 96.296875, - "epoch": 0.7080943275971957, - "grad_norm": 42.31635284423828, - "kl": 0.177734375, - "learning_rate": 2.9190567240280434e-07, - "loss": 0.0071, - "reward": 1.4518624544143677, - "reward_std": 0.0700874999165535, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45186251401901245, - "rewards/pad": 0.0, - "step": 2222 - }, - { - "completion_length": 44.375, - "epoch": 0.7084130019120459, - "grad_norm": 128.1374053955078, - "kl": 0.171875, - "learning_rate": 2.915869980879541e-07, - "loss": 0.0069, - "reward": 1.6179513931274414, - "reward_std": 0.06480707228183746, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6179514527320862, - "rewards/pad": 0.0, - "step": 2223 - }, - { - "completion_length": 123.28125, - "epoch": 0.7087316762268961, - "grad_norm": 41.999210357666016, - "kl": 0.123046875, - "learning_rate": 2.912683237731039e-07, - "loss": 0.0049, - "reward": 1.6539459228515625, - "reward_std": 0.046917326748371124, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5289459228515625, - "rewards/pad": 0.125, - "step": 2224 - }, - { - "completion_length": 119.515625, - "epoch": 0.7090503505417464, - "grad_norm": 21.24114990234375, - "kl": 0.15625, - "learning_rate": 2.9094964945825366e-07, - "loss": 0.0062, - "reward": 1.4963219165802002, - "reward_std": 0.05676766857504845, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49632197618484497, - "rewards/pad": 0.0, - "step": 2225 - }, - { - "completion_length": 73.203125, - "epoch": 0.7093690248565966, - "grad_norm": 578.6454467773438, - "kl": 0.17578125, - "learning_rate": 2.9063097514340346e-07, - "loss": 0.007, - "reward": 1.5945147275924683, - "reward_std": 0.0449923574924469, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.34451472759246826, - "step": 2226 - }, - { - "completion_length": 96.984375, - "epoch": 0.7096876991714468, - "grad_norm": 38.516117095947266, - "kl": 0.4296875, - "learning_rate": 2.9031230082855317e-07, - "loss": 0.0172, - "reward": 1.5225210189819336, - "reward_std": 0.11743885278701782, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5381460189819336, - "step": 2227 - }, - { - "completion_length": 70.359375, - "epoch": 0.710006373486297, - "grad_norm": 19.35215187072754, - "kl": 0.1787109375, - "learning_rate": 2.8999362651370297e-07, - "loss": 0.0071, - "reward": 1.6459407806396484, - "reward_std": 0.07379327714443207, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6459407806396484, - "rewards/pad": 0.0, - "step": 2228 - }, - { - "completion_length": 75.796875, - "epoch": 0.7103250478011472, - "grad_norm": 43.0965461730957, - "kl": 0.1552734375, - "learning_rate": 2.896749521988527e-07, - "loss": 0.0062, - "reward": 1.7117459774017334, - "reward_std": 0.1261281669139862, - "rewards/pad": 0.34375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3679960370063782, - "step": 2229 - }, - { - "completion_length": 96.125, - "epoch": 0.7106437221159975, - "grad_norm": 22.729938507080078, - "kl": 0.13671875, - "learning_rate": 2.8935627788400253e-07, - "loss": 0.0055, - "reward": 1.5968352556228638, - "reward_std": 0.0328817218542099, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5968351364135742, - "rewards/pad": 0.0, - "step": 2230 - }, - { - "completion_length": 123.0, - "epoch": 0.7109623964308477, - "grad_norm": 32.33634567260742, - "kl": 0.1904296875, - "learning_rate": 2.890376035691523e-07, - "loss": 0.0076, - "reward": 1.6645631790161133, - "reward_std": 0.06763999909162521, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5395632386207581, - "step": 2231 - }, - { - "completion_length": 72.1875, - "epoch": 0.7112810707456979, - "grad_norm": 79.91558837890625, - "kl": 0.1904296875, - "learning_rate": 2.887189292543021e-07, - "loss": 0.0076, - "reward": 1.6356229782104492, - "reward_std": 0.06550084054470062, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6356229782104492, - "step": 2232 - }, - { - "completion_length": 120.96875, - "epoch": 0.7115997450605481, - "grad_norm": 62.61525344848633, - "kl": 0.1943359375, - "learning_rate": 2.8840025493945185e-07, - "loss": 0.0078, - "reward": 1.470062255859375, - "reward_std": 0.05818459764122963, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.470062255859375, - "rewards/pad": 0.0, - "step": 2233 - }, - { - "completion_length": 95.859375, - "epoch": 0.7119184193753983, - "grad_norm": 21.574522018432617, - "kl": 0.140625, - "learning_rate": 2.8808158062460166e-07, - "loss": 0.0056, - "reward": 1.4399889707565308, - "reward_std": 0.04723798483610153, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4399890899658203, - "rewards/pad": 0.0, - "step": 2234 - }, - { - "completion_length": 97.140625, - "epoch": 0.7122370936902486, - "grad_norm": 39.37369155883789, - "kl": 0.16015625, - "learning_rate": 2.877629063097514e-07, - "loss": 0.0064, - "reward": 1.5832393169403076, - "reward_std": 0.06394324451684952, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45823925733566284, - "step": 2235 - }, - { - "completion_length": 43.078125, - "epoch": 0.7125557680050988, - "grad_norm": 48.86249923706055, - "kl": 0.1875, - "learning_rate": 2.874442319949012e-07, - "loss": 0.0075, - "reward": 1.5240147113800049, - "reward_std": 0.052651070058345795, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5240147113800049, - "rewards/pad": 0.0, - "step": 2236 - }, - { - "completion_length": 149.25, - "epoch": 0.712874442319949, - "grad_norm": 37.8237190246582, - "kl": 0.171875, - "learning_rate": 2.8712555768005097e-07, - "loss": 0.0069, - "reward": 1.610236406326294, - "reward_std": 0.06434710323810577, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.610236406326294, - "step": 2237 - }, - { - "completion_length": 73.15625, - "epoch": 0.7131931166347992, - "grad_norm": 28.396270751953125, - "kl": 0.181640625, - "learning_rate": 2.868068833652008e-07, - "loss": 0.0072, - "reward": 1.8386030197143555, - "reward_std": 0.086951844394207, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5886030197143555, - "rewards/pad": 0.25, - "step": 2238 - }, - { - "completion_length": 73.203125, - "epoch": 0.7135117909496494, - "grad_norm": 22.13792610168457, - "kl": 0.328125, - "learning_rate": 2.8648820905035053e-07, - "loss": 0.0132, - "reward": 1.4031696319580078, - "reward_std": 0.0826619416475296, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4031696617603302, - "step": 2239 - }, - { - "completion_length": 98.640625, - "epoch": 0.7138304652644997, - "grad_norm": 71.24427795410156, - "kl": 0.1962890625, - "learning_rate": 2.8616953473550034e-07, - "loss": 0.0079, - "reward": 1.6134414672851562, - "reward_std": 0.1222405731678009, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5040664672851562, - "rewards/pad": 0.109375, - "step": 2240 - }, - { - "completion_length": 122.734375, - "epoch": 0.7141491395793499, - "grad_norm": 24.058292388916016, - "kl": 0.173828125, - "learning_rate": 2.858508604206501e-07, - "loss": 0.007, - "reward": 1.4026800394058228, - "reward_std": 0.05747363343834877, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40268003940582275, - "rewards/pad": 0.0, - "step": 2241 - }, - { - "completion_length": 121.96875, - "epoch": 0.7144678138942001, - "grad_norm": 62.72412109375, - "kl": 0.21875, - "learning_rate": 2.855321861057999e-07, - "loss": 0.0088, - "reward": 1.5252528190612793, - "reward_std": 0.07871957123279572, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5252527594566345, - "step": 2242 - }, - { - "completion_length": 124.328125, - "epoch": 0.7147864882090503, - "grad_norm": 21.873132705688477, - "kl": 0.12255859375, - "learning_rate": 2.8521351179094965e-07, - "loss": 0.0049, - "reward": 1.7477576732635498, - "reward_std": 0.08578725159168243, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5290076732635498, - "step": 2243 - }, - { - "completion_length": 148.34375, - "epoch": 0.7151051625239006, - "grad_norm": 23.316675186157227, - "kl": 0.140625, - "learning_rate": 2.8489483747609946e-07, - "loss": 0.0056, - "reward": 1.4658639430999756, - "reward_std": 0.11161128431558609, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4814888834953308, - "step": 2244 - }, - { - "completion_length": 71.859375, - "epoch": 0.7154238368387508, - "grad_norm": 12.032831192016602, - "kl": 0.18359375, - "learning_rate": 2.845761631612492e-07, - "loss": 0.0073, - "reward": 1.7768384218215942, - "reward_std": 0.09560340642929077, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.417463481426239, - "rewards/pad": 0.375, - "step": 2245 - }, - { - "completion_length": 125.0, - "epoch": 0.715742511153601, - "grad_norm": 84.09508514404297, - "kl": 0.119140625, - "learning_rate": 2.842574888463989e-07, - "loss": 0.0048, - "reward": 1.5787897109985352, - "reward_std": 0.053379226475954056, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.32878974080085754, - "step": 2246 - }, - { - "completion_length": 97.265625, - "epoch": 0.7160611854684512, - "grad_norm": 267.7286682128906, - "kl": 0.193359375, - "learning_rate": 2.839388145315487e-07, - "loss": 0.0077, - "reward": 1.6631855964660645, - "reward_std": 0.06450549513101578, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.538185715675354, - "rewards/pad": 0.125, - "step": 2247 - }, - { - "completion_length": 173.734375, - "epoch": 0.7163798597833014, - "grad_norm": 11.09773063659668, - "kl": 0.08837890625, - "learning_rate": 2.836201402166985e-07, - "loss": 0.0035, - "reward": 1.4079930782318115, - "reward_std": 0.023111682385206223, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4079931080341339, - "step": 2248 - }, - { - "completion_length": 97.5625, - "epoch": 0.7166985340981517, - "grad_norm": 28.511962890625, - "kl": 0.134765625, - "learning_rate": 2.833014659018483e-07, - "loss": 0.0054, - "reward": 1.74592924118042, - "reward_std": 0.16153305768966675, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.6521791815757751, - "step": 2249 - }, - { - "completion_length": 71.140625, - "epoch": 0.7170172084130019, - "grad_norm": 115.95256042480469, - "kl": 0.25, - "learning_rate": 2.8298279158699804e-07, - "loss": 0.01, - "reward": 1.6066967248916626, - "reward_std": 0.07053203880786896, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6066967844963074, - "step": 2250 - }, - { - "completion_length": 72.0625, - "epoch": 0.7173358827278521, - "grad_norm": 66.62077331542969, - "kl": 0.71875, - "learning_rate": 2.8266411727214784e-07, - "loss": 0.0287, - "reward": 1.4966835975646973, - "reward_std": 0.1418730467557907, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4029335379600525, - "rewards/pad": 0.09375, - "step": 2251 - }, - { - "completion_length": 122.515625, - "epoch": 0.7176545570427023, - "grad_norm": 19.46177101135254, - "kl": 0.193359375, - "learning_rate": 2.823454429572976e-07, - "loss": 0.0077, - "reward": 1.482533574104309, - "reward_std": 0.05976587161421776, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48253360390663147, - "step": 2252 - }, - { - "completion_length": 71.921875, - "epoch": 0.7179732313575525, - "grad_norm": 10.516894340515137, - "kl": 0.1748046875, - "learning_rate": 2.820267686424474e-07, - "loss": 0.007, - "reward": 1.75198233127594, - "reward_std": 0.06279326975345612, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6269823312759399, - "step": 2253 - }, - { - "completion_length": 150.40625, - "epoch": 0.7182919056724028, - "grad_norm": 14.134764671325684, - "kl": 0.10009765625, - "learning_rate": 2.8170809432759716e-07, - "loss": 0.004, - "reward": 1.4842798709869385, - "reward_std": 0.03848659247159958, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.35927993059158325, - "step": 2254 - }, - { - "completion_length": 98.84375, - "epoch": 0.718610579987253, - "grad_norm": 33.74140167236328, - "kl": 0.1552734375, - "learning_rate": 2.8138942001274697e-07, - "loss": 0.0062, - "reward": 1.5922157764434814, - "reward_std": 0.06147976592183113, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3422156572341919, - "rewards/pad": 0.25, - "step": 2255 - }, - { - "completion_length": 70.421875, - "epoch": 0.7189292543021033, - "grad_norm": 24.451801300048828, - "kl": 0.224609375, - "learning_rate": 2.810707456978967e-07, - "loss": 0.009, - "reward": 1.452077031135559, - "reward_std": 0.07750258594751358, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.45207715034484863, - "step": 2256 - }, - { - "completion_length": 45.578125, - "epoch": 0.7192479286169535, - "grad_norm": 60.85447311401367, - "kl": 0.140625, - "learning_rate": 2.8075207138304653e-07, - "loss": 0.0056, - "reward": 1.687796711921692, - "reward_std": 0.07646714895963669, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5627967715263367, - "rewards/pad": 0.125, - "step": 2257 - }, - { - "completion_length": 44.59375, - "epoch": 0.7195666029318037, - "grad_norm": 37.42232131958008, - "kl": 0.419921875, - "learning_rate": 2.804333970681963e-07, - "loss": 0.0168, - "reward": 1.5680201053619385, - "reward_std": 0.07154925167560577, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4430200755596161, - "rewards/pad": 0.125, - "step": 2258 - }, - { - "completion_length": 174.46875, - "epoch": 0.719885277246654, - "grad_norm": 35.96725845336914, - "kl": 0.068359375, - "learning_rate": 2.801147227533461e-07, - "loss": 0.0027, - "reward": 1.5415878295898438, - "reward_std": 0.06188948452472687, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.432212769985199, - "step": 2259 - }, - { - "completion_length": 45.703125, - "epoch": 0.7202039515615042, - "grad_norm": 84.68194580078125, - "kl": 0.2373046875, - "learning_rate": 2.7979604843849584e-07, - "loss": 0.0095, - "reward": 1.7907674312591553, - "reward_std": 0.10699145495891571, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5876424312591553, - "rewards/pad": 0.203125, - "step": 2260 - }, - { - "completion_length": 68.21875, - "epoch": 0.7205226258763544, - "grad_norm": 33.74509048461914, - "kl": 0.16015625, - "learning_rate": 2.7947737412364565e-07, - "loss": 0.0064, - "reward": 1.5191991329193115, - "reward_std": 0.08163504302501678, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.519199013710022, - "rewards/pad": 0.0, - "step": 2261 - }, - { - "completion_length": 70.453125, - "epoch": 0.7208413001912046, - "grad_norm": 24.795495986938477, - "kl": 0.2080078125, - "learning_rate": 2.791586998087954e-07, - "loss": 0.0083, - "reward": 1.7269294261932373, - "reward_std": 0.09076951444149017, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6019294261932373, - "step": 2262 - }, - { - "completion_length": 71.59375, - "epoch": 0.7211599745060548, - "grad_norm": 115.77197265625, - "kl": 0.212890625, - "learning_rate": 2.788400254939452e-07, - "loss": 0.0085, - "reward": 1.5558435916900635, - "reward_std": 0.045107245445251465, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4308435916900635, - "step": 2263 - }, - { - "completion_length": 46.984375, - "epoch": 0.7214786488209051, - "grad_norm": 82.89366149902344, - "kl": 0.267578125, - "learning_rate": 2.7852135117909496e-07, - "loss": 0.0107, - "reward": 1.823499083518982, - "reward_std": 0.1625116467475891, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.46412408351898193, - "rewards/pad": 0.375, - "step": 2264 - }, - { - "completion_length": 47.453125, - "epoch": 0.7217973231357553, - "grad_norm": 57.559303283691406, - "kl": 0.390625, - "learning_rate": 2.7820267686424477e-07, - "loss": 0.0156, - "reward": 1.7546806335449219, - "reward_std": 0.10245010256767273, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5046806335449219, - "step": 2265 - }, - { - "completion_length": 71.75, - "epoch": 0.7221159974506055, - "grad_norm": 105.75975036621094, - "kl": 0.234375, - "learning_rate": 2.7788400254939447e-07, - "loss": 0.0094, - "reward": 1.530150055885315, - "reward_std": 0.1313575655221939, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.42077499628067017, - "step": 2266 - }, - { - "completion_length": 98.5, - "epoch": 0.7224346717654557, - "grad_norm": 77.48545837402344, - "kl": 0.1728515625, - "learning_rate": 2.775653282345443e-07, - "loss": 0.0069, - "reward": 1.885993242263794, - "reward_std": 0.08213841915130615, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.635993242263794, - "step": 2267 - }, - { - "completion_length": 70.640625, - "epoch": 0.722753346080306, - "grad_norm": 43.32448196411133, - "kl": 0.1611328125, - "learning_rate": 2.7724665391969403e-07, - "loss": 0.0065, - "reward": 1.8109092712402344, - "reward_std": 0.14658910036087036, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5921593308448792, - "step": 2268 - }, - { - "completion_length": 126.640625, - "epoch": 0.7230720203951562, - "grad_norm": 28.20330810546875, - "kl": 0.126953125, - "learning_rate": 2.7692797960484384e-07, - "loss": 0.0051, - "reward": 1.5282201766967773, - "reward_std": 0.045800693333148956, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40322020649909973, - "rewards/pad": 0.125, - "step": 2269 - }, - { - "completion_length": 71.015625, - "epoch": 0.7233906947100064, - "grad_norm": 127.50252532958984, - "kl": 0.14453125, - "learning_rate": 2.766093052899936e-07, - "loss": 0.0058, - "reward": 1.8048551082611084, - "reward_std": 0.06221465393900871, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5548551082611084, - "step": 2270 - }, - { - "completion_length": 123.578125, - "epoch": 0.7237093690248566, - "grad_norm": 42.8072624206543, - "kl": 0.130859375, - "learning_rate": 2.762906309751434e-07, - "loss": 0.0052, - "reward": 1.6393482685089111, - "reward_std": 0.05822452902793884, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5143482685089111, - "step": 2271 - }, - { - "completion_length": 69.96875, - "epoch": 0.7240280433397068, - "grad_norm": 28.740028381347656, - "kl": 0.2197265625, - "learning_rate": 2.7597195666029316e-07, - "loss": 0.0088, - "reward": 1.4445308446884155, - "reward_std": 0.06185196340084076, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4445308744907379, - "step": 2272 - }, - { - "completion_length": 71.640625, - "epoch": 0.724346717654557, - "grad_norm": 65.78955841064453, - "kl": 0.2138671875, - "learning_rate": 2.7565328234544296e-07, - "loss": 0.0085, - "reward": 1.6825776100158691, - "reward_std": 0.07312022149562836, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5575776100158691, - "rewards/pad": 0.125, - "step": 2273 - }, - { - "completion_length": 45.40625, - "epoch": 0.7246653919694073, - "grad_norm": 127.24124145507812, - "kl": 0.1630859375, - "learning_rate": 2.753346080305927e-07, - "loss": 0.0065, - "reward": 1.766405463218689, - "reward_std": 0.07459418475627899, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.641405463218689, - "rewards/pad": 0.125, - "step": 2274 - }, - { - "completion_length": 150.296875, - "epoch": 0.7249840662842575, - "grad_norm": 17.000076293945312, - "kl": 0.099609375, - "learning_rate": 2.750159337157425e-07, - "loss": 0.004, - "reward": 1.5623540878295898, - "reward_std": 0.060097016394138336, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43735411763191223, - "rewards/pad": 0.125, - "step": 2275 - }, - { - "completion_length": 20.234375, - "epoch": 0.7253027405991077, - "grad_norm": 141.0899200439453, - "kl": 0.431640625, - "learning_rate": 2.746972594008923e-07, - "loss": 0.0173, - "reward": 1.9232772588729858, - "reward_std": 0.11568228900432587, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6732773780822754, - "rewards/pad": 0.25, - "step": 2276 - }, - { - "completion_length": 121.875, - "epoch": 0.7256214149139579, - "grad_norm": 6.202388763427734, - "kl": 0.3203125, - "learning_rate": 2.7437858508604203e-07, - "loss": 0.0128, - "reward": 1.6419615745544434, - "reward_std": 0.10208149254322052, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5169616341590881, - "step": 2277 - }, - { - "completion_length": 98.234375, - "epoch": 0.7259400892288081, - "grad_norm": 45.12540817260742, - "kl": 0.78125, - "learning_rate": 2.7405991077119184e-07, - "loss": 0.0311, - "reward": 1.7293262481689453, - "reward_std": 0.1494627594947815, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5105762481689453, - "rewards/pad": 0.21875, - "step": 2278 - }, - { - "completion_length": 46.8125, - "epoch": 0.7262587635436584, - "grad_norm": 57.610782623291016, - "kl": 0.2197265625, - "learning_rate": 2.737412364563416e-07, - "loss": 0.0088, - "reward": 1.719219446182251, - "reward_std": 0.11520032584667206, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46921950578689575, - "step": 2279 - }, - { - "completion_length": 70.421875, - "epoch": 0.7265774378585086, - "grad_norm": 82.78499603271484, - "kl": 0.126953125, - "learning_rate": 2.734225621414914e-07, - "loss": 0.0051, - "reward": 1.6706788539886475, - "reward_std": 0.06128586083650589, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.670678973197937, - "rewards/pad": 0.0, - "step": 2280 - }, - { - "completion_length": 123.96875, - "epoch": 0.7268961121733588, - "grad_norm": 12.616573333740234, - "kl": 0.10888671875, - "learning_rate": 2.7310388782664115e-07, - "loss": 0.0044, - "reward": 1.5960545539855957, - "reward_std": 0.09989671409130096, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.48667964339256287, - "step": 2281 - }, - { - "completion_length": 123.8125, - "epoch": 0.727214786488209, - "grad_norm": 25.62480354309082, - "kl": 0.2265625, - "learning_rate": 2.7278521351179096e-07, - "loss": 0.0091, - "reward": 1.6528668403625488, - "reward_std": 0.07082921266555786, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5278667211532593, - "step": 2282 - }, - { - "completion_length": 46.0625, - "epoch": 0.7275334608030593, - "grad_norm": 65.58390808105469, - "kl": 0.142578125, - "learning_rate": 2.724665391969407e-07, - "loss": 0.0057, - "reward": 1.8364577293395996, - "reward_std": 0.06432148814201355, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5864576101303101, - "step": 2283 - }, - { - "completion_length": 95.3125, - "epoch": 0.7278521351179095, - "grad_norm": 24.709632873535156, - "kl": 0.1318359375, - "learning_rate": 2.721478648820905e-07, - "loss": 0.0053, - "reward": 1.5833723545074463, - "reward_std": 0.07328394055366516, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5833722949028015, - "rewards/pad": 0.0, - "step": 2284 - }, - { - "completion_length": 19.25, - "epoch": 0.7281708094327597, - "grad_norm": 34.625465393066406, - "kl": 0.1611328125, - "learning_rate": 2.718291905672402e-07, - "loss": 0.0064, - "reward": 1.8348424434661865, - "reward_std": 0.08397142589092255, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7098423838615417, - "rewards/pad": 0.125, - "step": 2285 - }, - { - "completion_length": 150.578125, - "epoch": 0.7284894837476099, - "grad_norm": 80.94461822509766, - "kl": 0.09423828125, - "learning_rate": 2.7151051625239003e-07, - "loss": 0.0038, - "reward": 1.4016118049621582, - "reward_std": 0.0279681459069252, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.401611864566803, - "step": 2286 - }, - { - "completion_length": 121.453125, - "epoch": 0.7288081580624601, - "grad_norm": 22.56496238708496, - "kl": 0.1669921875, - "learning_rate": 2.711918419375398e-07, - "loss": 0.0067, - "reward": 1.5645333528518677, - "reward_std": 0.07016131281852722, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5645333528518677, - "rewards/pad": 0.0, - "step": 2287 - }, - { - "completion_length": 96.375, - "epoch": 0.7291268323773104, - "grad_norm": 82.41597747802734, - "kl": 0.11572265625, - "learning_rate": 2.708731676226896e-07, - "loss": 0.0046, - "reward": 1.628800392150879, - "reward_std": 0.0435745008289814, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6288003921508789, - "step": 2288 - }, - { - "completion_length": 71.71875, - "epoch": 0.7294455066921606, - "grad_norm": 88.3929443359375, - "kl": 0.16015625, - "learning_rate": 2.7055449330783934e-07, - "loss": 0.0064, - "reward": 1.445934534072876, - "reward_std": 0.056166261434555054, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44593459367752075, - "rewards/pad": 0.0, - "step": 2289 - }, - { - "completion_length": 174.640625, - "epoch": 0.7297641810070108, - "grad_norm": 8.242386817932129, - "kl": 0.1005859375, - "learning_rate": 2.7023581899298915e-07, - "loss": 0.004, - "reward": 1.5474566221237183, - "reward_std": 0.052069712430238724, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5474566221237183, - "rewards/pad": 0.0, - "step": 2290 - }, - { - "completion_length": 122.546875, - "epoch": 0.730082855321861, - "grad_norm": 31.46306800842285, - "kl": 0.13671875, - "learning_rate": 2.699171446781389e-07, - "loss": 0.0055, - "reward": 1.48299241065979, - "reward_std": 0.057699453085660934, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48299241065979004, - "step": 2291 - }, - { - "completion_length": 147.9375, - "epoch": 0.7304015296367112, - "grad_norm": 12.528481483459473, - "kl": 0.11865234375, - "learning_rate": 2.695984703632887e-07, - "loss": 0.0047, - "reward": 1.6210896968841553, - "reward_std": 0.10064315795898438, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5117148160934448, - "rewards/pad": 0.109375, - "step": 2292 - }, - { - "completion_length": 69.78125, - "epoch": 0.7307202039515615, - "grad_norm": 52.95103073120117, - "kl": 0.263671875, - "learning_rate": 2.6927979604843847e-07, - "loss": 0.0105, - "reward": 1.677060604095459, - "reward_std": 0.09070073068141937, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4270605444908142, - "step": 2293 - }, - { - "completion_length": 20.296875, - "epoch": 0.7310388782664117, - "grad_norm": 64.19196319580078, - "kl": 0.2041015625, - "learning_rate": 2.689611217335883e-07, - "loss": 0.0082, - "reward": 1.7720385789871216, - "reward_std": 0.0941629707813263, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.537663459777832, - "step": 2294 - }, - { - "completion_length": 96.875, - "epoch": 0.731357552581262, - "grad_norm": 45.8812255859375, - "kl": 0.158203125, - "learning_rate": 2.6864244741873803e-07, - "loss": 0.0063, - "reward": 1.5812724828720093, - "reward_std": 0.0480443611741066, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45627251267433167, - "rewards/pad": 0.125, - "step": 2295 - }, - { - "completion_length": 124.53125, - "epoch": 0.7316762268961122, - "grad_norm": 52.578399658203125, - "kl": 0.10791015625, - "learning_rate": 2.6832377310388783e-07, - "loss": 0.0043, - "reward": 1.749119520187378, - "reward_std": 0.08590525388717651, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5147445201873779, - "step": 2296 - }, - { - "completion_length": 149.203125, - "epoch": 0.7319949012109624, - "grad_norm": 44.240333557128906, - "kl": 0.138671875, - "learning_rate": 2.680050987890376e-07, - "loss": 0.0055, - "reward": 1.5186524391174316, - "reward_std": 0.047837622463703156, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39365243911743164, - "step": 2297 - }, - { - "completion_length": 95.984375, - "epoch": 0.7323135755258127, - "grad_norm": 16.60160255432129, - "kl": 0.1611328125, - "learning_rate": 2.676864244741874e-07, - "loss": 0.0065, - "reward": 1.4992430210113525, - "reward_std": 0.03506264090538025, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.499242901802063, - "rewards/pad": 0.0, - "step": 2298 - }, - { - "completion_length": 98.296875, - "epoch": 0.7326322498406629, - "grad_norm": 41.40070724487305, - "kl": 0.111328125, - "learning_rate": 2.6736775015933715e-07, - "loss": 0.0045, - "reward": 1.8091952800750732, - "reward_std": 0.04441278055310249, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5591952204704285, - "rewards/pad": 0.25, - "step": 2299 - }, - { - "completion_length": 97.671875, - "epoch": 0.7329509241555131, - "grad_norm": 44.4100341796875, - "kl": 0.1396484375, - "learning_rate": 2.6704907584448696e-07, - "loss": 0.0056, - "reward": 1.826994776725769, - "reward_std": 0.050389401614665985, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4519948661327362, - "step": 2300 - }, - { - "completion_length": 45.015625, - "epoch": 0.7332695984703633, - "grad_norm": 23.837890625, - "kl": 0.3515625, - "learning_rate": 2.667304015296367e-07, - "loss": 0.014, - "reward": 1.5495420694351196, - "reward_std": 0.14073896408081055, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4401671290397644, - "rewards/pad": 0.125, - "step": 2301 - }, - { - "completion_length": 72.6875, - "epoch": 0.7335882727852135, - "grad_norm": 30.09236717224121, - "kl": 0.1669921875, - "learning_rate": 2.664117272147865e-07, - "loss": 0.0067, - "reward": 1.7357730865478516, - "reward_std": 0.1373557597398758, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.6263981461524963, - "rewards/pad": 0.125, - "step": 2302 - }, - { - "completion_length": 19.21875, - "epoch": 0.7339069471000638, - "grad_norm": 68.52455139160156, - "kl": 0.22265625, - "learning_rate": 2.6609305289993627e-07, - "loss": 0.0089, - "reward": 1.8176792860031128, - "reward_std": 0.06696180254220963, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6926792860031128, - "rewards/pad": 0.125, - "step": 2303 - }, - { - "completion_length": 98.0, - "epoch": 0.734225621414914, - "grad_norm": 12.61028003692627, - "kl": 0.25, - "learning_rate": 2.657743785850861e-07, - "loss": 0.01, - "reward": 1.7191720008850098, - "reward_std": 0.10746273398399353, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5941721200942993, - "step": 2304 - }, - { - "completion_length": 95.296875, - "epoch": 0.7345442957297642, - "grad_norm": 38.44736862182617, - "kl": 0.21875, - "learning_rate": 2.654557042702358e-07, - "loss": 0.0088, - "reward": 1.7357244491577148, - "reward_std": 0.0899784043431282, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6107244491577148, - "rewards/pad": 0.125, - "step": 2305 - }, - { - "completion_length": 122.125, - "epoch": 0.7348629700446144, - "grad_norm": 24.969764709472656, - "kl": 0.10107421875, - "learning_rate": 2.651370299553856e-07, - "loss": 0.0041, - "reward": 1.6699448823928833, - "reward_std": 0.08816447854042053, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4511949419975281, - "rewards/pad": 0.21875, - "step": 2306 - }, - { - "completion_length": 72.6875, - "epoch": 0.7351816443594646, - "grad_norm": 20.42173957824707, - "kl": 0.1923828125, - "learning_rate": 2.6481835564053534e-07, - "loss": 0.0077, - "reward": 1.8201383352279663, - "reward_std": 0.06316646933555603, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5701382756233215, - "step": 2307 - }, - { - "completion_length": 70.921875, - "epoch": 0.7355003186743149, - "grad_norm": 31.99380111694336, - "kl": 0.1708984375, - "learning_rate": 2.644996813256851e-07, - "loss": 0.0068, - "reward": 1.7687439918518066, - "reward_std": 0.04723686724901199, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6437438726425171, - "step": 2308 - }, - { - "completion_length": 71.46875, - "epoch": 0.7358189929891651, - "grad_norm": 32.928592681884766, - "kl": 0.1748046875, - "learning_rate": 2.641810070108349e-07, - "loss": 0.007, - "reward": 1.5270164012908936, - "reward_std": 0.06720034778118134, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40201643109321594, - "rewards/pad": 0.125, - "step": 2309 - }, - { - "completion_length": 124.203125, - "epoch": 0.7361376673040153, - "grad_norm": 61.79475021362305, - "kl": 0.09716796875, - "learning_rate": 2.6386233269598466e-07, - "loss": 0.0039, - "reward": 1.7632999420166016, - "reward_std": 0.09831054508686066, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5289248824119568, - "step": 2310 - }, - { - "completion_length": 71.359375, - "epoch": 0.7364563416188655, - "grad_norm": 40.55093765258789, - "kl": 0.2041015625, - "learning_rate": 2.6354365838113446e-07, - "loss": 0.0082, - "reward": 1.803855061531067, - "reward_std": 0.08483041077852249, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5538550615310669, - "rewards/pad": 0.25, - "step": 2311 - }, - { - "completion_length": 95.78125, - "epoch": 0.7367750159337157, - "grad_norm": 57.996498107910156, - "kl": 0.11181640625, - "learning_rate": 2.632249840662842e-07, - "loss": 0.0045, - "reward": 1.4923651218414307, - "reward_std": 0.05749805271625519, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49236518144607544, - "rewards/pad": 0.0, - "step": 2312 - }, - { - "completion_length": 94.125, - "epoch": 0.737093690248566, - "grad_norm": 20.021610260009766, - "kl": 0.1201171875, - "learning_rate": 2.62906309751434e-07, - "loss": 0.0048, - "reward": 1.426276683807373, - "reward_std": 0.056567881256341934, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42627671360969543, - "step": 2313 - }, - { - "completion_length": 71.84375, - "epoch": 0.7374123645634162, - "grad_norm": 40.765987396240234, - "kl": 0.1845703125, - "learning_rate": 2.625876354365838e-07, - "loss": 0.0074, - "reward": 1.6615710258483887, - "reward_std": 0.09196199476718903, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5365709066390991, - "step": 2314 - }, - { - "completion_length": 98.375, - "epoch": 0.7377310388782664, - "grad_norm": 60.426727294921875, - "kl": 0.173828125, - "learning_rate": 2.622689611217336e-07, - "loss": 0.0069, - "reward": 1.4886939525604248, - "reward_std": 0.12488123774528503, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45744404196739197, - "rewards/pad": 0.03125, - "step": 2315 - }, - { - "completion_length": 97.90625, - "epoch": 0.7380497131931166, - "grad_norm": 21.20757484436035, - "kl": 0.26953125, - "learning_rate": 2.6195028680688334e-07, - "loss": 0.0108, - "reward": 1.8146427869796753, - "reward_std": 0.15877693891525269, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5958927869796753, - "step": 2316 - }, - { - "completion_length": 45.828125, - "epoch": 0.7383683875079669, - "grad_norm": 42.13408279418945, - "kl": 0.154296875, - "learning_rate": 2.6163161249203315e-07, - "loss": 0.0062, - "reward": 1.6832377910614014, - "reward_std": 0.11729064583778381, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5738628506660461, - "step": 2317 - }, - { - "completion_length": 97.15625, - "epoch": 0.7386870618228171, - "grad_norm": 22.208709716796875, - "kl": 0.16796875, - "learning_rate": 2.613129381771829e-07, - "loss": 0.0067, - "reward": 1.6230249404907227, - "reward_std": 0.06276427209377289, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4980248808860779, - "step": 2318 - }, - { - "completion_length": 20.8125, - "epoch": 0.7390057361376673, - "grad_norm": 27.831518173217773, - "kl": 0.328125, - "learning_rate": 2.609942638623327e-07, - "loss": 0.0131, - "reward": 1.4572490453720093, - "reward_std": 0.06960473954677582, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3322490453720093, - "rewards/pad": 0.125, - "step": 2319 - }, - { - "completion_length": 98.40625, - "epoch": 0.7393244104525175, - "grad_norm": 29.015913009643555, - "kl": 0.1259765625, - "learning_rate": 2.6067558954748246e-07, - "loss": 0.005, - "reward": 1.5660903453826904, - "reward_std": 0.12316054850816727, - "rewards/pad": 0.203125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3629652261734009, - "step": 2320 - }, - { - "completion_length": 148.4375, - "epoch": 0.7396430847673677, - "grad_norm": 10.599801063537598, - "kl": 0.10791015625, - "learning_rate": 2.6035691523263227e-07, - "loss": 0.0043, - "reward": 1.4441365003585815, - "reward_std": 0.026503991335630417, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44413644075393677, - "step": 2321 - }, - { - "completion_length": 124.5625, - "epoch": 0.739961759082218, - "grad_norm": 8.700201034545898, - "kl": 0.1513671875, - "learning_rate": 2.60038240917782e-07, - "loss": 0.006, - "reward": 1.644855260848999, - "reward_std": 0.06340721249580383, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5198552012443542, - "rewards/pad": 0.125, - "step": 2322 - }, - { - "completion_length": 123.875, - "epoch": 0.7402804333970682, - "grad_norm": 15.96863842010498, - "kl": 0.255859375, - "learning_rate": 2.5971956660293183e-07, - "loss": 0.0103, - "reward": 1.6885113716125488, - "reward_std": 0.04282812401652336, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43851131200790405, - "step": 2323 - }, - { - "completion_length": 44.5, - "epoch": 0.7405991077119184, - "grad_norm": 18.425813674926758, - "kl": 0.341796875, - "learning_rate": 2.594008922880816e-07, - "loss": 0.0137, - "reward": 1.5486342906951904, - "reward_std": 0.09400318562984467, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.54863440990448, - "rewards/pad": 0.0, - "step": 2324 - }, - { - "completion_length": 97.578125, - "epoch": 0.7409177820267686, - "grad_norm": 2675.150146484375, - "kl": 0.1318359375, - "learning_rate": 2.5908221797323134e-07, - "loss": 0.0053, - "reward": 1.8657528162002563, - "reward_std": 0.09240710735321045, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6157528758049011, - "step": 2325 - }, - { - "completion_length": 122.515625, - "epoch": 0.7412364563416188, - "grad_norm": 20.519306182861328, - "kl": 0.130859375, - "learning_rate": 2.587635436583811e-07, - "loss": 0.0052, - "reward": 1.6174451112747192, - "reward_std": 0.07409971952438354, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5080699920654297, - "rewards/pad": 0.109375, - "step": 2326 - }, - { - "completion_length": 70.953125, - "epoch": 0.741555130656469, - "grad_norm": 23.402151107788086, - "kl": 0.373046875, - "learning_rate": 2.584448693435309e-07, - "loss": 0.0149, - "reward": 1.6871752738952637, - "reward_std": 0.12008976936340332, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5934253931045532, - "rewards/pad": 0.09375, - "step": 2327 - }, - { - "completion_length": 20.25, - "epoch": 0.7418738049713193, - "grad_norm": 41.974708557128906, - "kl": 0.203125, - "learning_rate": 2.5812619502868065e-07, - "loss": 0.0081, - "reward": 2.0941109657287598, - "reward_std": 0.12088044732809067, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.7191112637519836, - "step": 2328 - }, - { - "completion_length": 68.25, - "epoch": 0.7421924792861695, - "grad_norm": 37.48021697998047, - "kl": 0.1787109375, - "learning_rate": 2.5780752071383046e-07, - "loss": 0.0072, - "reward": 1.568420171737671, - "reward_std": 0.07940567284822464, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5684201121330261, - "rewards/pad": 0.0, - "step": 2329 - }, - { - "completion_length": 148.796875, - "epoch": 0.7425111536010197, - "grad_norm": 15.084420204162598, - "kl": 0.10205078125, - "learning_rate": 2.574888463989802e-07, - "loss": 0.0041, - "reward": 1.5860767364501953, - "reward_std": 0.027519021183252335, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5860767364501953, - "step": 2330 - }, - { - "completion_length": 70.21875, - "epoch": 0.7428298279158699, - "grad_norm": 31.250144958496094, - "kl": 0.181640625, - "learning_rate": 2.5717017208413e-07, - "loss": 0.0073, - "reward": 1.6655259132385254, - "reward_std": 0.07507406175136566, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5405259728431702, - "step": 2331 - }, - { - "completion_length": 144.390625, - "epoch": 0.7431485022307202, - "grad_norm": 139.04603576660156, - "kl": 0.0947265625, - "learning_rate": 2.568514977692798e-07, - "loss": 0.0038, - "reward": 1.5995535850524902, - "reward_std": 0.07657554000616074, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5995535850524902, - "step": 2332 - }, - { - "completion_length": 68.640625, - "epoch": 0.7434671765455704, - "grad_norm": 129.79537963867188, - "kl": 0.17578125, - "learning_rate": 2.565328234544296e-07, - "loss": 0.007, - "reward": 1.5914616584777832, - "reward_std": 0.03448464721441269, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4664616584777832, - "step": 2333 - }, - { - "completion_length": 93.484375, - "epoch": 0.7437858508604207, - "grad_norm": 37.845760345458984, - "kl": 0.17578125, - "learning_rate": 2.5621414913957934e-07, - "loss": 0.007, - "reward": 1.6175404787063599, - "reward_std": 0.06029842048883438, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6175404787063599, - "rewards/pad": 0.0, - "step": 2334 - }, - { - "completion_length": 44.875, - "epoch": 0.7441045251752709, - "grad_norm": 49.47697448730469, - "kl": 0.37890625, - "learning_rate": 2.5589547482472914e-07, - "loss": 0.0152, - "reward": 1.5388339757919312, - "reward_std": 0.08021228015422821, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41383397579193115, - "rewards/pad": 0.125, - "step": 2335 - }, - { - "completion_length": 119.90625, - "epoch": 0.7444231994901211, - "grad_norm": 17.561262130737305, - "kl": 0.2109375, - "learning_rate": 2.555768005098789e-07, - "loss": 0.0084, - "reward": 1.7621181011199951, - "reward_std": 0.054667938500642776, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6371180415153503, - "step": 2336 - }, - { - "completion_length": 97.015625, - "epoch": 0.7447418738049714, - "grad_norm": 24.10749053955078, - "kl": 0.1396484375, - "learning_rate": 2.552581261950287e-07, - "loss": 0.0056, - "reward": 1.53496253490448, - "reward_std": 0.1598353087902069, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47246253490448, - "step": 2337 - }, - { - "completion_length": 70.625, - "epoch": 0.7450605481198216, - "grad_norm": 54.45188903808594, - "kl": 0.1962890625, - "learning_rate": 2.5493945188017846e-07, - "loss": 0.0079, - "reward": 1.7008757591247559, - "reward_std": 0.09980419278144836, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5758758783340454, - "step": 2338 - }, - { - "completion_length": 96.296875, - "epoch": 0.7453792224346718, - "grad_norm": 23.287134170532227, - "kl": 0.26171875, - "learning_rate": 2.546207775653282e-07, - "loss": 0.0105, - "reward": 1.472541093826294, - "reward_std": 0.049887463450431824, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4725410044193268, - "rewards/pad": 0.0, - "step": 2339 - }, - { - "completion_length": 46.375, - "epoch": 0.745697896749522, - "grad_norm": 189.74688720703125, - "kl": 0.361328125, - "learning_rate": 2.54302103250478e-07, - "loss": 0.0144, - "reward": 1.6629632711410522, - "reward_std": 0.14199450612068176, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44421327114105225, - "rewards/pad": 0.21875, - "step": 2340 - }, - { - "completion_length": 70.140625, - "epoch": 0.7460165710643722, - "grad_norm": 43.23068618774414, - "kl": 0.224609375, - "learning_rate": 2.5398342893562777e-07, - "loss": 0.009, - "reward": 1.6819307804107666, - "reward_std": 0.10673655569553375, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5881807804107666, - "step": 2341 - }, - { - "completion_length": 98.375, - "epoch": 0.7463352453792225, - "grad_norm": 19.788820266723633, - "kl": 0.1982421875, - "learning_rate": 2.536647546207776e-07, - "loss": 0.008, - "reward": 1.52030348777771, - "reward_std": 0.06852295994758606, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.39530351758003235, - "step": 2342 - }, - { - "completion_length": 44.921875, - "epoch": 0.7466539196940727, - "grad_norm": 25.615558624267578, - "kl": 0.28515625, - "learning_rate": 2.5334608030592733e-07, - "loss": 0.0114, - "reward": 1.6790242195129395, - "reward_std": 0.09928073734045029, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5540242195129395, - "rewards/pad": 0.125, - "step": 2343 - }, - { - "completion_length": 175.25, - "epoch": 0.7469725940089229, - "grad_norm": 11.273604393005371, - "kl": 0.0654296875, - "learning_rate": 2.530274059910771e-07, - "loss": 0.0026, - "reward": 1.5310853719711304, - "reward_std": 0.051154520362615585, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5310853719711304, - "rewards/pad": 0.0, - "step": 2344 - }, - { - "completion_length": 73.0625, - "epoch": 0.7472912683237731, - "grad_norm": 28.395612716674805, - "kl": 0.1455078125, - "learning_rate": 2.5270873167622684e-07, - "loss": 0.0058, - "reward": 1.5528991222381592, - "reward_std": 0.12336267530918121, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4591490626335144, - "step": 2345 - }, - { - "completion_length": 120.53125, - "epoch": 0.7476099426386233, - "grad_norm": 9.888472557067871, - "kl": 0.1279296875, - "learning_rate": 2.5239005736137665e-07, - "loss": 0.0051, - "reward": 1.5787100791931152, - "reward_std": 0.04524374008178711, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.45371013879776, - "step": 2346 - }, - { - "completion_length": 95.96875, - "epoch": 0.7479286169534736, - "grad_norm": 49.379024505615234, - "kl": 0.1494140625, - "learning_rate": 2.520713830465264e-07, - "loss": 0.006, - "reward": 1.5315344333648682, - "reward_std": 0.10110367834568024, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40653443336486816, - "rewards/pad": 0.125, - "step": 2347 - }, - { - "completion_length": 97.34375, - "epoch": 0.7482472912683238, - "grad_norm": 15.307049751281738, - "kl": 0.20703125, - "learning_rate": 2.517527087316762e-07, - "loss": 0.0083, - "reward": 1.5940765142440796, - "reward_std": 0.16235579550266266, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4847015142440796, - "rewards/pad": 0.125, - "step": 2348 - }, - { - "completion_length": 96.734375, - "epoch": 0.748565965583174, - "grad_norm": 25.55277442932129, - "kl": 0.1376953125, - "learning_rate": 2.5143403441682596e-07, - "loss": 0.0055, - "reward": 1.6655036211013794, - "reward_std": 0.06567064672708511, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5405036807060242, - "step": 2349 - }, - { - "completion_length": 72.140625, - "epoch": 0.7488846398980242, - "grad_norm": 41.61199188232422, - "kl": 0.10888671875, - "learning_rate": 2.5111536010197577e-07, - "loss": 0.0044, - "reward": 1.576913595199585, - "reward_std": 0.10146424174308777, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5769136548042297, - "step": 2350 - }, - { - "completion_length": 97.171875, - "epoch": 0.7492033142128744, - "grad_norm": 88.93692779541016, - "kl": 0.12451171875, - "learning_rate": 2.507966857871255e-07, - "loss": 0.005, - "reward": 1.824180245399475, - "reward_std": 0.042419739067554474, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5741802453994751, - "rewards/pad": 0.25, - "step": 2351 - }, - { - "completion_length": 72.359375, - "epoch": 0.7495219885277247, - "grad_norm": 126.26813507080078, - "kl": 0.154296875, - "learning_rate": 2.5047801147227533e-07, - "loss": 0.0062, - "reward": 1.7405102252960205, - "reward_std": 0.07302525639533997, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4905102252960205, - "rewards/pad": 0.25, - "step": 2352 - }, - { - "completion_length": 122.703125, - "epoch": 0.7498406628425749, - "grad_norm": 17.439945220947266, - "kl": 0.11669921875, - "learning_rate": 2.501593371574251e-07, - "loss": 0.0047, - "reward": 1.6083955764770508, - "reward_std": 0.027068469673395157, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.483395516872406, - "step": 2353 - }, - { - "completion_length": 96.5625, - "epoch": 0.7501593371574251, - "grad_norm": 63.4716796875, - "kl": 0.1337890625, - "learning_rate": 2.498406628425749e-07, - "loss": 0.0054, - "reward": 1.6589890718460083, - "reward_std": 0.05223289132118225, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4089891314506531, - "rewards/pad": 0.25, - "step": 2354 - }, - { - "completion_length": 146.6875, - "epoch": 0.7504780114722753, - "grad_norm": 15.716063499450684, - "kl": 0.1328125, - "learning_rate": 2.4952198852772465e-07, - "loss": 0.0053, - "reward": 1.3606678247451782, - "reward_std": 0.0546543225646019, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3606678545475006, - "step": 2355 - }, - { - "completion_length": 96.78125, - "epoch": 0.7507966857871256, - "grad_norm": 45.387916564941406, - "kl": 0.11669921875, - "learning_rate": 2.4920331421287445e-07, - "loss": 0.0047, - "reward": 1.5202877521514893, - "reward_std": 0.07994785904884338, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41091275215148926, - "step": 2356 - }, - { - "completion_length": 122.078125, - "epoch": 0.7511153601019758, - "grad_norm": 28.241512298583984, - "kl": 0.130859375, - "learning_rate": 2.488846398980242e-07, - "loss": 0.0053, - "reward": 1.547778606414795, - "reward_std": 0.05721326917409897, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5477786660194397, - "rewards/pad": 0.0, - "step": 2357 - }, - { - "completion_length": 71.25, - "epoch": 0.751434034416826, - "grad_norm": 47.084571838378906, - "kl": 0.294921875, - "learning_rate": 2.48565965583174e-07, - "loss": 0.0118, - "reward": 1.5903526544570923, - "reward_std": 0.11178088188171387, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5903526544570923, - "rewards/pad": 0.0, - "step": 2358 - }, - { - "completion_length": 71.265625, - "epoch": 0.7517527087316762, - "grad_norm": 79.732666015625, - "kl": 0.171875, - "learning_rate": 2.4824729126832377e-07, - "loss": 0.0069, - "reward": 1.8239046335220337, - "reward_std": 0.05157014727592468, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5739046931266785, - "step": 2359 - }, - { - "completion_length": 46.109375, - "epoch": 0.7520713830465264, - "grad_norm": 31.065034866333008, - "kl": 0.1982421875, - "learning_rate": 2.479286169534735e-07, - "loss": 0.0079, - "reward": 1.4934003353118896, - "reward_std": 0.16332748532295227, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4621503949165344, - "rewards/pad": 0.03125, - "step": 2360 - }, - { - "completion_length": 122.46875, - "epoch": 0.7523900573613767, - "grad_norm": 10.515298843383789, - "kl": 0.12158203125, - "learning_rate": 2.4760994263862333e-07, - "loss": 0.0048, - "reward": 1.6254892349243164, - "reward_std": 0.04377575218677521, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.500489354133606, - "step": 2361 - }, - { - "completion_length": 71.921875, - "epoch": 0.7527087316762269, - "grad_norm": 74.92431640625, - "kl": 0.142578125, - "learning_rate": 2.472912683237731e-07, - "loss": 0.0057, - "reward": 1.7222537994384766, - "reward_std": 0.07536216080188751, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5972539186477661, - "step": 2362 - }, - { - "completion_length": 47.265625, - "epoch": 0.7530274059910771, - "grad_norm": 39.071067810058594, - "kl": 0.126953125, - "learning_rate": 2.469725940089229e-07, - "loss": 0.0051, - "reward": 1.7800179719924927, - "reward_std": 0.10000339150428772, - "rewards/answer_reward": 0.34375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4362678527832031, - "step": 2363 - }, - { - "completion_length": 68.890625, - "epoch": 0.7533460803059273, - "grad_norm": 66.52899169921875, - "kl": 0.1826171875, - "learning_rate": 2.4665391969407264e-07, - "loss": 0.0073, - "reward": 1.5558980703353882, - "reward_std": 0.09053242206573486, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5558980703353882, - "step": 2364 - }, - { - "completion_length": 46.453125, - "epoch": 0.7536647546207775, - "grad_norm": 18.502832412719727, - "kl": 0.2392578125, - "learning_rate": 2.4633524537922245e-07, - "loss": 0.0096, - "reward": 1.6923441886901855, - "reward_std": 0.11259526014328003, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.45796921849250793, - "rewards/pad": 0.25, - "step": 2365 - }, - { - "completion_length": 95.765625, - "epoch": 0.7539834289356278, - "grad_norm": 21.200660705566406, - "kl": 0.31640625, - "learning_rate": 2.460165710643722e-07, - "loss": 0.0127, - "reward": 1.5287537574768066, - "reward_std": 0.06867505609989166, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.40375378727912903, - "step": 2366 - }, - { - "completion_length": 71.1875, - "epoch": 0.754302103250478, - "grad_norm": 28.54607582092285, - "kl": 0.29296875, - "learning_rate": 2.45697896749522e-07, - "loss": 0.0117, - "reward": 1.588828682899475, - "reward_std": 0.10961493849754333, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5888286828994751, - "rewards/pad": 0.0, - "step": 2367 - }, - { - "completion_length": 18.859375, - "epoch": 0.7546207775653282, - "grad_norm": 2452.90283203125, - "kl": 0.353515625, - "learning_rate": 2.4537922243467177e-07, - "loss": 0.0141, - "reward": 1.714134693145752, - "reward_std": 0.18132619559764862, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.589134693145752, - "rewards/pad": 0.125, - "step": 2368 - }, - { - "completion_length": 97.390625, - "epoch": 0.7549394518801784, - "grad_norm": 34.554298400878906, - "kl": 0.1298828125, - "learning_rate": 2.450605481198215e-07, - "loss": 0.0052, - "reward": 1.5022608041763306, - "reward_std": 0.07751639187335968, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37726086378097534, - "step": 2369 - }, - { - "completion_length": 97.0625, - "epoch": 0.7552581261950286, - "grad_norm": 29.822830200195312, - "kl": 0.1962890625, - "learning_rate": 2.447418738049713e-07, - "loss": 0.0079, - "reward": 1.742760181427002, - "reward_std": 0.05533137172460556, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4927600920200348, - "rewards/pad": 0.25, - "step": 2370 - }, - { - "completion_length": 95.71875, - "epoch": 0.7555768005098789, - "grad_norm": 26.240768432617188, - "kl": 0.12890625, - "learning_rate": 2.444231994901211e-07, - "loss": 0.0052, - "reward": 1.585935115814209, - "reward_std": 0.046207722276449203, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5859351754188538, - "step": 2371 - }, - { - "completion_length": 74.515625, - "epoch": 0.7558954748247291, - "grad_norm": 52.59564971923828, - "kl": 0.189453125, - "learning_rate": 2.4410452517527084e-07, - "loss": 0.0076, - "reward": 1.7158679962158203, - "reward_std": 0.05426628887653351, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5908680558204651, - "rewards/pad": 0.125, - "step": 2372 - }, - { - "completion_length": 71.34375, - "epoch": 0.7562141491395793, - "grad_norm": 30.036500930786133, - "kl": 0.330078125, - "learning_rate": 2.4378585086042064e-07, - "loss": 0.0132, - "reward": 1.827400803565979, - "reward_std": 0.09894035756587982, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.7024007439613342, - "step": 2373 - }, - { - "completion_length": 123.515625, - "epoch": 0.7565328234544296, - "grad_norm": 28.441667556762695, - "kl": 0.09423828125, - "learning_rate": 2.434671765455704e-07, - "loss": 0.0038, - "reward": 1.42775559425354, - "reward_std": 0.04605486989021301, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42775559425354004, - "step": 2374 - }, - { - "completion_length": 97.09375, - "epoch": 0.7568514977692798, - "grad_norm": 121.87911987304688, - "kl": 0.1396484375, - "learning_rate": 2.431485022307202e-07, - "loss": 0.0056, - "reward": 1.460996150970459, - "reward_std": 0.07576128840446472, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46099621057510376, - "step": 2375 - }, - { - "completion_length": 123.328125, - "epoch": 0.7571701720841301, - "grad_norm": 22.768299102783203, - "kl": 0.173828125, - "learning_rate": 2.4282982791586996e-07, - "loss": 0.007, - "reward": 1.684352159500122, - "reward_std": 0.05991646274924278, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5593521595001221, - "step": 2376 - }, - { - "completion_length": 124.46875, - "epoch": 0.7574888463989803, - "grad_norm": 20.72931480407715, - "kl": 0.15625, - "learning_rate": 2.4251115360101976e-07, - "loss": 0.0063, - "reward": 1.5033481121063232, - "reward_std": 0.09932418167591095, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5033482909202576, - "rewards/pad": 0.0, - "step": 2377 - }, - { - "completion_length": 124.265625, - "epoch": 0.7578075207138305, - "grad_norm": 83.03893280029297, - "kl": 0.111328125, - "learning_rate": 2.421924792861695e-07, - "loss": 0.0045, - "reward": 1.5489113330841064, - "reward_std": 0.19425338506698608, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4082863926887512, - "rewards/pad": 0.15625, - "step": 2378 - }, - { - "completion_length": 71.109375, - "epoch": 0.7581261950286807, - "grad_norm": 36.17487716674805, - "kl": 0.150390625, - "learning_rate": 2.4187380497131927e-07, - "loss": 0.006, - "reward": 1.5251681804656982, - "reward_std": 0.06316641718149185, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40016818046569824, - "rewards/pad": 0.125, - "step": 2379 - }, - { - "completion_length": 96.828125, - "epoch": 0.758444869343531, - "grad_norm": 14.41674518585205, - "kl": 0.1953125, - "learning_rate": 2.415551306564691e-07, - "loss": 0.0078, - "reward": 1.8518142700195312, - "reward_std": 0.13031429052352905, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5861891508102417, - "rewards/pad": 0.265625, - "step": 2380 - }, - { - "completion_length": 70.5, - "epoch": 0.7587635436583812, - "grad_norm": 62.165035247802734, - "kl": 0.17578125, - "learning_rate": 2.4123645634161883e-07, - "loss": 0.007, - "reward": 1.574102759361267, - "reward_std": 0.06988977640867233, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4491027593612671, - "rewards/pad": 0.125, - "step": 2381 - }, - { - "completion_length": 70.234375, - "epoch": 0.7590822179732314, - "grad_norm": 37.101009368896484, - "kl": 0.30078125, - "learning_rate": 2.4091778202676864e-07, - "loss": 0.0121, - "reward": 1.5496270656585693, - "reward_std": 0.07697926461696625, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4246269762516022, - "rewards/pad": 0.125, - "step": 2382 - }, - { - "completion_length": 170.796875, - "epoch": 0.7594008922880816, - "grad_norm": 118.73182678222656, - "kl": 0.10400390625, - "learning_rate": 2.405991077119184e-07, - "loss": 0.0042, - "reward": 1.4457409381866455, - "reward_std": 0.04694356769323349, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44574084877967834, - "step": 2383 - }, - { - "completion_length": 96.90625, - "epoch": 0.7597195666029318, - "grad_norm": 44.60519027709961, - "kl": 0.1494140625, - "learning_rate": 2.402804333970682e-07, - "loss": 0.006, - "reward": 1.6157231330871582, - "reward_std": 0.06788098067045212, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4907231330871582, - "step": 2384 - }, - { - "completion_length": 123.296875, - "epoch": 0.760038240917782, - "grad_norm": 22.74419593811035, - "kl": 0.1357421875, - "learning_rate": 2.3996175908221796e-07, - "loss": 0.0054, - "reward": 1.516005277633667, - "reward_std": 0.06894777715206146, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39100539684295654, - "step": 2385 - }, - { - "completion_length": 71.5625, - "epoch": 0.7603569152326323, - "grad_norm": 49.663761138916016, - "kl": 0.19140625, - "learning_rate": 2.3964308476736776e-07, - "loss": 0.0076, - "reward": 1.659088134765625, - "reward_std": 0.08590816706418991, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5340881943702698, - "rewards/pad": 0.125, - "step": 2386 - }, - { - "completion_length": 94.953125, - "epoch": 0.7606755895474825, - "grad_norm": 50.491600036621094, - "kl": 0.11962890625, - "learning_rate": 2.393244104525175e-07, - "loss": 0.0048, - "reward": 1.5911861658096313, - "reward_std": 0.05975578725337982, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5911861658096313, - "rewards/pad": 0.0, - "step": 2387 - }, - { - "completion_length": 146.359375, - "epoch": 0.7609942638623327, - "grad_norm": 8.381948471069336, - "kl": 0.08935546875, - "learning_rate": 2.3900573613766727e-07, - "loss": 0.0036, - "reward": 1.4332441091537476, - "reward_std": 0.07146738469600677, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4332440495491028, - "step": 2388 - }, - { - "completion_length": 123.796875, - "epoch": 0.7613129381771829, - "grad_norm": 16.34398651123047, - "kl": 0.12255859375, - "learning_rate": 2.386870618228171e-07, - "loss": 0.0049, - "reward": 1.702054500579834, - "reward_std": 0.040900953114032745, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.452054500579834, - "step": 2389 - }, - { - "completion_length": 45.875, - "epoch": 0.7616316124920331, - "grad_norm": 242.82199096679688, - "kl": 0.197265625, - "learning_rate": 2.3836838750796683e-07, - "loss": 0.0079, - "reward": 1.4251450300216675, - "reward_std": 0.05859028920531273, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42514508962631226, - "rewards/pad": 0.0, - "step": 2390 - }, - { - "completion_length": 70.890625, - "epoch": 0.7619502868068834, - "grad_norm": 112.52105712890625, - "kl": 0.24609375, - "learning_rate": 2.380497131931166e-07, - "loss": 0.0098, - "reward": 1.5279008150100708, - "reward_std": 0.06554174423217773, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5279008746147156, - "rewards/pad": 0.0, - "step": 2391 - }, - { - "completion_length": 71.71875, - "epoch": 0.7622689611217336, - "grad_norm": 27.79994773864746, - "kl": 0.138671875, - "learning_rate": 2.377310388782664e-07, - "loss": 0.0055, - "reward": 1.476495385169983, - "reward_std": 0.07443070411682129, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3514954149723053, - "rewards/pad": 0.125, - "step": 2392 - }, - { - "completion_length": 18.109375, - "epoch": 0.7625876354365838, - "grad_norm": 44.67707824707031, - "kl": 0.158203125, - "learning_rate": 2.3741236456341617e-07, - "loss": 0.0063, - "reward": 1.6668736934661865, - "reward_std": 0.0606708899140358, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6668736934661865, - "rewards/pad": 0.0, - "step": 2393 - }, - { - "completion_length": 123.578125, - "epoch": 0.762906309751434, - "grad_norm": 20.952356338500977, - "kl": 0.1474609375, - "learning_rate": 2.3709369024856595e-07, - "loss": 0.0059, - "reward": 1.6714262962341309, - "reward_std": 0.05074295401573181, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5464261770248413, - "step": 2394 - }, - { - "completion_length": 145.953125, - "epoch": 0.7632249840662843, - "grad_norm": 11.986995697021484, - "kl": 0.1513671875, - "learning_rate": 2.3677501593371573e-07, - "loss": 0.0061, - "reward": 1.581899881362915, - "reward_std": 0.06308305263519287, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.581899881362915, - "rewards/pad": 0.0, - "step": 2395 - }, - { - "completion_length": 72.21875, - "epoch": 0.7635436583811345, - "grad_norm": 40.06754684448242, - "kl": 0.240234375, - "learning_rate": 2.3645634161886551e-07, - "loss": 0.0096, - "reward": 1.7467175722122192, - "reward_std": 0.08731202036142349, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.37171757221221924, - "step": 2396 - }, - { - "completion_length": 175.53125, - "epoch": 0.7638623326959847, - "grad_norm": 11.114773750305176, - "kl": 0.12353515625, - "learning_rate": 2.361376673040153e-07, - "loss": 0.0049, - "reward": 1.3718491792678833, - "reward_std": 0.10146905481815338, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3874741792678833, - "step": 2397 - }, - { - "completion_length": 99.53125, - "epoch": 0.7641810070108349, - "grad_norm": 23.668777465820312, - "kl": 0.1943359375, - "learning_rate": 2.3581899298916505e-07, - "loss": 0.0078, - "reward": 1.7718803882598877, - "reward_std": 0.08299198746681213, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5218803882598877, - "rewards/pad": 0.25, - "step": 2398 - }, - { - "completion_length": 146.421875, - "epoch": 0.7644996813256851, - "grad_norm": 17.925914764404297, - "kl": 0.1142578125, - "learning_rate": 2.3550031867431483e-07, - "loss": 0.0046, - "reward": 1.443023443222046, - "reward_std": 0.06263911724090576, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44302332401275635, - "step": 2399 - }, - { - "completion_length": 96.40625, - "epoch": 0.7648183556405354, - "grad_norm": 22.06613540649414, - "kl": 0.1669921875, - "learning_rate": 2.351816443594646e-07, - "loss": 0.0067, - "reward": 1.4820747375488281, - "reward_std": 0.04780896008014679, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48207470774650574, - "rewards/pad": 0.0, - "step": 2400 - }, - { - "completion_length": 71.421875, - "epoch": 0.7651370299553856, - "grad_norm": 64.82288360595703, - "kl": 0.2080078125, - "learning_rate": 2.348629700446144e-07, - "loss": 0.0083, - "reward": 1.682159662246704, - "reward_std": 0.16720019280910492, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.5884097218513489, - "step": 2401 - }, - { - "completion_length": 72.625, - "epoch": 0.7654557042702358, - "grad_norm": 73.9577407836914, - "kl": 0.1650390625, - "learning_rate": 2.3454429572976417e-07, - "loss": 0.0066, - "reward": 1.510580062866211, - "reward_std": 0.09074722230434418, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.38558000326156616, - "step": 2402 - }, - { - "completion_length": 45.140625, - "epoch": 0.765774378585086, - "grad_norm": 42.64339065551758, - "kl": 0.330078125, - "learning_rate": 2.3422562141491395e-07, - "loss": 0.0132, - "reward": 1.5771032571792603, - "reward_std": 0.12901654839515686, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4989781975746155, - "rewards/pad": 0.078125, - "step": 2403 - }, - { - "completion_length": 97.09375, - "epoch": 0.7660930528999362, - "grad_norm": 25.328649520874023, - "kl": 0.1923828125, - "learning_rate": 2.3390694710006373e-07, - "loss": 0.0077, - "reward": 1.6229166984558105, - "reward_std": 0.0707821398973465, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6229166984558105, - "rewards/pad": 0.0, - "step": 2404 - }, - { - "completion_length": 122.859375, - "epoch": 0.7664117272147865, - "grad_norm": 7.7764997482299805, - "kl": 0.15625, - "learning_rate": 2.335882727852135e-07, - "loss": 0.0063, - "reward": 1.4564611911773682, - "reward_std": 0.07263123244047165, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45646119117736816, - "step": 2405 - }, - { - "completion_length": 174.59375, - "epoch": 0.7667304015296367, - "grad_norm": 85.34829711914062, - "kl": 0.08251953125, - "learning_rate": 2.332695984703633e-07, - "loss": 0.0033, - "reward": 1.509110689163208, - "reward_std": 0.03914462402462959, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.509110689163208, - "step": 2406 - }, - { - "completion_length": 99.71875, - "epoch": 0.7670490758444869, - "grad_norm": 41.52641677856445, - "kl": 0.10400390625, - "learning_rate": 2.3295092415551307e-07, - "loss": 0.0042, - "reward": 1.5907998085021973, - "reward_std": 0.039848215878009796, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3407997190952301, - "step": 2407 - }, - { - "completion_length": 100.03125, - "epoch": 0.7673677501593371, - "grad_norm": 99.26285552978516, - "kl": 0.138671875, - "learning_rate": 2.3263224984066283e-07, - "loss": 0.0055, - "reward": 1.9004437923431396, - "reward_std": 0.04943914711475372, - "rewards/answer_reward": 0.5, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.40044382214546204, - "step": 2408 - }, - { - "completion_length": 72.71875, - "epoch": 0.7676864244741873, - "grad_norm": 39.84721374511719, - "kl": 0.177734375, - "learning_rate": 2.323135755258126e-07, - "loss": 0.0071, - "reward": 1.661937952041626, - "reward_std": 0.08707262575626373, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5369380712509155, - "step": 2409 - }, - { - "completion_length": 123.5, - "epoch": 0.7680050987890376, - "grad_norm": 13.269757270812988, - "kl": 0.099609375, - "learning_rate": 2.319949012109624e-07, - "loss": 0.004, - "reward": 1.4472179412841797, - "reward_std": 0.038454461842775345, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4472179710865021, - "rewards/pad": 0.0, - "step": 2410 - }, - { - "completion_length": 122.90625, - "epoch": 0.7683237731038878, - "grad_norm": 43.56960678100586, - "kl": 0.125, - "learning_rate": 2.3167622689611217e-07, - "loss": 0.005, - "reward": 1.5403697490692139, - "reward_std": 0.11602583527565002, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4934948682785034, - "rewards/pad": 0.046875, - "step": 2411 - }, - { - "completion_length": 119.5, - "epoch": 0.768642447418738, - "grad_norm": 89.07806396484375, - "kl": 0.1328125, - "learning_rate": 2.3135755258126195e-07, - "loss": 0.0053, - "reward": 1.6336169242858887, - "reward_std": 0.0688549280166626, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6336168646812439, - "step": 2412 - }, - { - "completion_length": 121.78125, - "epoch": 0.7689611217335883, - "grad_norm": 72.90563201904297, - "kl": 0.123046875, - "learning_rate": 2.3103887826641173e-07, - "loss": 0.0049, - "reward": 1.510859727859497, - "reward_std": 0.04406733065843582, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5108597278594971, - "rewards/pad": 0.0, - "step": 2413 - }, - { - "completion_length": 121.171875, - "epoch": 0.7692797960484385, - "grad_norm": 26.723989486694336, - "kl": 0.11962890625, - "learning_rate": 2.307202039515615e-07, - "loss": 0.0048, - "reward": 1.6717638969421387, - "reward_std": 0.059133999049663544, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5467638969421387, - "step": 2414 - }, - { - "completion_length": 95.3125, - "epoch": 0.7695984703632888, - "grad_norm": 67.16515350341797, - "kl": 0.1611328125, - "learning_rate": 2.304015296367113e-07, - "loss": 0.0065, - "reward": 1.7321889400482178, - "reward_std": 0.10932879894971848, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5603138208389282, - "rewards/pad": 0.171875, - "step": 2415 - }, - { - "completion_length": 122.75, - "epoch": 0.769917144678139, - "grad_norm": 41.2907829284668, - "kl": 0.11767578125, - "learning_rate": 2.3008285532186105e-07, - "loss": 0.0047, - "reward": 1.5792759656906128, - "reward_std": 0.03771261125802994, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45427587628364563, - "step": 2416 - }, - { - "completion_length": 96.84375, - "epoch": 0.7702358189929892, - "grad_norm": 37.21782684326172, - "kl": 0.1943359375, - "learning_rate": 2.2976418100701083e-07, - "loss": 0.0078, - "reward": 1.5600199699401855, - "reward_std": 0.04090619832277298, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5600200295448303, - "step": 2417 - }, - { - "completion_length": 95.171875, - "epoch": 0.7705544933078394, - "grad_norm": 39.630950927734375, - "kl": 0.115234375, - "learning_rate": 2.2944550669216058e-07, - "loss": 0.0046, - "reward": 1.6066815853118896, - "reward_std": 0.04438459128141403, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6066815853118896, - "step": 2418 - }, - { - "completion_length": 68.90625, - "epoch": 0.7708731676226896, - "grad_norm": 124.37810516357422, - "kl": 0.1494140625, - "learning_rate": 2.2912683237731036e-07, - "loss": 0.006, - "reward": 1.5511808395385742, - "reward_std": 0.048661187291145325, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.42618075013160706, - "step": 2419 - }, - { - "completion_length": 120.609375, - "epoch": 0.7711918419375399, - "grad_norm": 189.0177001953125, - "kl": 0.2001953125, - "learning_rate": 2.2880815806246014e-07, - "loss": 0.008, - "reward": 1.488572120666504, - "reward_std": 0.04395397752523422, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48857206106185913, - "rewards/pad": 0.0, - "step": 2420 - }, - { - "completion_length": 121.171875, - "epoch": 0.7715105162523901, - "grad_norm": 13.16550064086914, - "kl": 0.1162109375, - "learning_rate": 2.2848948374760992e-07, - "loss": 0.0047, - "reward": 1.6465511322021484, - "reward_std": 0.05637906491756439, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6465510725975037, - "step": 2421 - }, - { - "completion_length": 150.09375, - "epoch": 0.7718291905672403, - "grad_norm": 16.02005958557129, - "kl": 0.1357421875, - "learning_rate": 2.281708094327597e-07, - "loss": 0.0054, - "reward": 1.535804033279419, - "reward_std": 0.0664563775062561, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4108040928840637, - "rewards/pad": 0.125, - "step": 2422 - }, - { - "completion_length": 70.671875, - "epoch": 0.7721478648820905, - "grad_norm": 27.423751831054688, - "kl": 0.216796875, - "learning_rate": 2.2785213511790948e-07, - "loss": 0.0087, - "reward": 1.7402122020721436, - "reward_std": 0.17992594838142395, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5058372616767883, - "step": 2423 - }, - { - "completion_length": 98.546875, - "epoch": 0.7724665391969407, - "grad_norm": 15.953474998474121, - "kl": 0.1484375, - "learning_rate": 2.2753346080305926e-07, - "loss": 0.0059, - "reward": 1.907003402709961, - "reward_std": 0.08464668691158295, - "rewards/pad": 0.359375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5476284027099609, - "step": 2424 - }, - { - "completion_length": 73.234375, - "epoch": 0.772785213511791, - "grad_norm": 27.358421325683594, - "kl": 0.228515625, - "learning_rate": 2.2721478648820904e-07, - "loss": 0.0091, - "reward": 1.627385139465332, - "reward_std": 0.11387316137552261, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5180102586746216, - "rewards/pad": 0.109375, - "step": 2425 - }, - { - "completion_length": 71.5, - "epoch": 0.7731038878266412, - "grad_norm": 36.27244186401367, - "kl": 0.31640625, - "learning_rate": 2.2689611217335882e-07, - "loss": 0.0126, - "reward": 1.5924443006515503, - "reward_std": 0.08176687359809875, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5924442410469055, - "step": 2426 - }, - { - "completion_length": 68.953125, - "epoch": 0.7734225621414914, - "grad_norm": 26.173524856567383, - "kl": 0.1552734375, - "learning_rate": 2.265774378585086e-07, - "loss": 0.0062, - "reward": 1.7913737297058105, - "reward_std": 0.10003790259361267, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.556998610496521, - "step": 2427 - }, - { - "completion_length": 97.625, - "epoch": 0.7737412364563416, - "grad_norm": 34.93221664428711, - "kl": 0.1728515625, - "learning_rate": 2.2625876354365836e-07, - "loss": 0.0069, - "reward": 1.7253172397613525, - "reward_std": 0.09326071292161942, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6003172397613525, - "step": 2428 - }, - { - "completion_length": 67.09375, - "epoch": 0.7740599107711919, - "grad_norm": 26.156747817993164, - "kl": 0.2001953125, - "learning_rate": 2.2594008922880814e-07, - "loss": 0.008, - "reward": 1.7022361755371094, - "reward_std": 0.08020318299531937, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5928612947463989, - "step": 2429 - }, - { - "completion_length": 96.109375, - "epoch": 0.7743785850860421, - "grad_norm": 18.430601119995117, - "kl": 0.224609375, - "learning_rate": 2.2562141491395792e-07, - "loss": 0.009, - "reward": 1.7515828609466553, - "reward_std": 0.07464728504419327, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6265829205513, - "step": 2430 - }, - { - "completion_length": 120.28125, - "epoch": 0.7746972594008923, - "grad_norm": 159.50904846191406, - "kl": 0.09375, - "learning_rate": 2.253027405991077e-07, - "loss": 0.0037, - "reward": 1.4814085960388184, - "reward_std": 0.052604131400585175, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3564087748527527, - "step": 2431 - }, - { - "completion_length": 97.671875, - "epoch": 0.7750159337157425, - "grad_norm": 6.790005207061768, - "kl": 0.16015625, - "learning_rate": 2.2498406628425748e-07, - "loss": 0.0064, - "reward": 1.54752779006958, - "reward_std": 0.08390877395868301, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3131527304649353, - "step": 2432 - }, - { - "completion_length": 172.171875, - "epoch": 0.7753346080305927, - "grad_norm": 7.875866413116455, - "kl": 0.11572265625, - "learning_rate": 2.2466539196940726e-07, - "loss": 0.0046, - "reward": 1.4796223640441895, - "reward_std": 0.04749543219804764, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4796224534511566, - "step": 2433 - }, - { - "completion_length": 96.59375, - "epoch": 0.775653282345443, - "grad_norm": 19.865304946899414, - "kl": 0.24609375, - "learning_rate": 2.2434671765455704e-07, - "loss": 0.0098, - "reward": 1.6537237167358398, - "reward_std": 0.04369035363197327, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5287237167358398, - "step": 2434 - }, - { - "completion_length": 122.34375, - "epoch": 0.7759719566602932, - "grad_norm": 16.42331886291504, - "kl": 0.1982421875, - "learning_rate": 2.2402804333970682e-07, - "loss": 0.0079, - "reward": 1.4489281177520752, - "reward_std": 0.06528669595718384, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44892802834510803, - "step": 2435 - }, - { - "completion_length": 122.078125, - "epoch": 0.7762906309751434, - "grad_norm": 91.30301666259766, - "kl": 0.10302734375, - "learning_rate": 2.237093690248566e-07, - "loss": 0.0041, - "reward": 1.6978908777236938, - "reward_std": 0.042654849588871, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5728908777236938, - "rewards/pad": 0.125, - "step": 2436 - }, - { - "completion_length": 120.1875, - "epoch": 0.7766093052899936, - "grad_norm": 27.884532928466797, - "kl": 0.1357421875, - "learning_rate": 2.2339069471000636e-07, - "loss": 0.0054, - "reward": 1.4870136976242065, - "reward_std": 0.04622916132211685, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3620137572288513, - "step": 2437 - }, - { - "completion_length": 70.03125, - "epoch": 0.7769279796048438, - "grad_norm": 42.33784484863281, - "kl": 0.1396484375, - "learning_rate": 2.2307202039515614e-07, - "loss": 0.0056, - "reward": 1.5410778522491455, - "reward_std": 0.051338449120521545, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5410779714584351, - "rewards/pad": 0.0, - "step": 2438 - }, - { - "completion_length": 97.8125, - "epoch": 0.777246653919694, - "grad_norm": 37.22806167602539, - "kl": 0.1376953125, - "learning_rate": 2.2275334608030592e-07, - "loss": 0.0055, - "reward": 1.5154039859771729, - "reward_std": 0.1367894560098648, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4685288369655609, - "rewards/pad": 0.0625, - "step": 2439 - }, - { - "completion_length": 122.890625, - "epoch": 0.7775653282345443, - "grad_norm": 32.99778747558594, - "kl": 0.201171875, - "learning_rate": 2.224346717654557e-07, - "loss": 0.008, - "reward": 1.6719995737075806, - "reward_std": 0.08523323386907578, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42199957370758057, - "step": 2440 - }, - { - "completion_length": 94.890625, - "epoch": 0.7778840025493945, - "grad_norm": 41.571407318115234, - "kl": 0.201171875, - "learning_rate": 2.2211599745060548e-07, - "loss": 0.008, - "reward": 1.5549380779266357, - "reward_std": 0.08996760845184326, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5549381375312805, - "step": 2441 - }, - { - "completion_length": 69.546875, - "epoch": 0.7782026768642447, - "grad_norm": 58.19086837768555, - "kl": 0.69921875, - "learning_rate": 2.2179732313575526e-07, - "loss": 0.0279, - "reward": 1.4920077323913574, - "reward_std": 0.08193758130073547, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5076327323913574, - "rewards/pad": 0.0, - "step": 2442 - }, - { - "completion_length": 94.171875, - "epoch": 0.7785213511790949, - "grad_norm": 30.5841064453125, - "kl": 0.1455078125, - "learning_rate": 2.2147864882090504e-07, - "loss": 0.0058, - "reward": 1.55403470993042, - "reward_std": 0.08569388091564178, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42903465032577515, - "step": 2443 - }, - { - "completion_length": 73.875, - "epoch": 0.7788400254939452, - "grad_norm": 69.72589111328125, - "kl": 0.1552734375, - "learning_rate": 2.2115997450605482e-07, - "loss": 0.0062, - "reward": 2.079951763153076, - "reward_std": 0.06619341671466827, - "rewards/pad": 0.625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4549517035484314, - "step": 2444 - }, - { - "completion_length": 98.125, - "epoch": 0.7791586998087954, - "grad_norm": 39.69470977783203, - "kl": 0.1357421875, - "learning_rate": 2.208413001912046e-07, - "loss": 0.0054, - "reward": 1.742027759552002, - "reward_std": 0.05924932286143303, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49202778935432434, - "step": 2445 - }, - { - "completion_length": 148.796875, - "epoch": 0.7794773741236456, - "grad_norm": 9.07218074798584, - "kl": 0.1875, - "learning_rate": 2.2052262587635438e-07, - "loss": 0.0075, - "reward": 1.384962797164917, - "reward_std": 0.04016319662332535, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3849627375602722, - "step": 2446 - }, - { - "completion_length": 73.359375, - "epoch": 0.7797960484384958, - "grad_norm": 22.801563262939453, - "kl": 0.1083984375, - "learning_rate": 2.202039515615041e-07, - "loss": 0.0043, - "reward": 1.6630971431732178, - "reward_std": 0.12162607908248901, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5224721431732178, - "rewards/pad": 0.140625, - "step": 2447 - }, - { - "completion_length": 145.84375, - "epoch": 0.780114722753346, - "grad_norm": 19.592363357543945, - "kl": 0.2021484375, - "learning_rate": 2.198852772466539e-07, - "loss": 0.0081, - "reward": 1.5129486322402954, - "reward_std": 0.06351804733276367, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5129486918449402, - "rewards/pad": 0.0, - "step": 2448 - }, - { - "completion_length": 71.34375, - "epoch": 0.7804333970681963, - "grad_norm": 72.86392974853516, - "kl": 0.20703125, - "learning_rate": 2.1956660293180367e-07, - "loss": 0.0083, - "reward": 1.62663996219635, - "reward_std": 0.14645737409591675, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5172649621963501, - "rewards/pad": 0.125, - "step": 2449 - }, - { - "completion_length": 71.140625, - "epoch": 0.7807520713830465, - "grad_norm": 30.3797550201416, - "kl": 0.169921875, - "learning_rate": 2.1924792861695345e-07, - "loss": 0.0068, - "reward": 1.7255603075027466, - "reward_std": 0.08941961079835892, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4911852777004242, - "rewards/pad": 0.234375, - "step": 2450 - }, - { - "completion_length": 43.921875, - "epoch": 0.7810707456978967, - "grad_norm": 48.79005813598633, - "kl": 0.181640625, - "learning_rate": 2.1892925430210323e-07, - "loss": 0.0073, - "reward": 1.679802656173706, - "reward_std": 0.07583755254745483, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6798027753829956, - "rewards/pad": 0.0, - "step": 2451 - }, - { - "completion_length": 145.875, - "epoch": 0.781389420012747, - "grad_norm": 18.23691177368164, - "kl": 0.09765625, - "learning_rate": 2.18610579987253e-07, - "loss": 0.0039, - "reward": 1.3844993114471436, - "reward_std": 0.07726074010133743, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.38449931144714355, - "step": 2452 - }, - { - "completion_length": 70.515625, - "epoch": 0.7817080943275972, - "grad_norm": 25.58438491821289, - "kl": 0.232421875, - "learning_rate": 2.182919056724028e-07, - "loss": 0.0093, - "reward": 1.67991304397583, - "reward_std": 0.11509126424789429, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5705380439758301, - "rewards/pad": 0.109375, - "step": 2453 - }, - { - "completion_length": 46.40625, - "epoch": 0.7820267686424475, - "grad_norm": 25.998775482177734, - "kl": 0.232421875, - "learning_rate": 2.1797323135755257e-07, - "loss": 0.0093, - "reward": 1.8417690992355347, - "reward_std": 0.08827841281890869, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5917690992355347, - "step": 2454 - }, - { - "completion_length": 95.328125, - "epoch": 0.7823454429572977, - "grad_norm": 31.927288055419922, - "kl": 0.203125, - "learning_rate": 2.1765455704270235e-07, - "loss": 0.0081, - "reward": 1.6434804201126099, - "reward_std": 0.2039296180009842, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5653554201126099, - "rewards/pad": 0.09375, - "step": 2455 - }, - { - "completion_length": 44.828125, - "epoch": 0.7826641172721479, - "grad_norm": 25.86640739440918, - "kl": 0.2578125, - "learning_rate": 2.1733588272785213e-07, - "loss": 0.0103, - "reward": 1.8428138494491577, - "reward_std": 0.08614397048950195, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7178137302398682, - "rewards/pad": 0.125, - "step": 2456 - }, - { - "completion_length": 94.796875, - "epoch": 0.7829827915869981, - "grad_norm": 272.5141296386719, - "kl": 0.16796875, - "learning_rate": 2.170172084130019e-07, - "loss": 0.0067, - "reward": 1.67355215549469, - "reward_std": 0.0782032459974289, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6735521554946899, - "rewards/pad": 0.0, - "step": 2457 - }, - { - "completion_length": 69.28125, - "epoch": 0.7833014659018483, - "grad_norm": 36.2894401550293, - "kl": 0.1416015625, - "learning_rate": 2.1669853409815167e-07, - "loss": 0.0057, - "reward": 1.472875714302063, - "reward_std": 0.07404038310050964, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47287559509277344, - "rewards/pad": 0.0, - "step": 2458 - }, - { - "completion_length": 43.921875, - "epoch": 0.7836201402166986, - "grad_norm": 32.434974670410156, - "kl": 0.169921875, - "learning_rate": 2.1637985978330145e-07, - "loss": 0.0068, - "reward": 1.7181332111358643, - "reward_std": 0.145050510764122, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.6087582111358643, - "step": 2459 - }, - { - "completion_length": 120.484375, - "epoch": 0.7839388145315488, - "grad_norm": 55.203033447265625, - "kl": 0.154296875, - "learning_rate": 2.1606118546845123e-07, - "loss": 0.0062, - "reward": 1.6572990417480469, - "reward_std": 0.08091993629932404, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5322990417480469, - "step": 2460 - }, - { - "completion_length": 96.65625, - "epoch": 0.784257488846399, - "grad_norm": 31.529285430908203, - "kl": 0.2373046875, - "learning_rate": 2.15742511153601e-07, - "loss": 0.0095, - "reward": 1.8024909496307373, - "reward_std": 0.14928418397903442, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5681160688400269, - "step": 2461 - }, - { - "completion_length": 122.0625, - "epoch": 0.7845761631612492, - "grad_norm": 33.83241653442383, - "kl": 0.2275390625, - "learning_rate": 2.154238368387508e-07, - "loss": 0.0091, - "reward": 1.4690876007080078, - "reward_std": 0.09109814465045929, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4690876603126526, - "rewards/pad": 0.0, - "step": 2462 - }, - { - "completion_length": 146.15625, - "epoch": 0.7848948374760994, - "grad_norm": 20.81785774230957, - "kl": 0.2431640625, - "learning_rate": 2.1510516252390057e-07, - "loss": 0.0097, - "reward": 1.5801405906677246, - "reward_std": 0.046842776238918304, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5801405310630798, - "rewards/pad": 0.0, - "step": 2463 - }, - { - "completion_length": 145.0, - "epoch": 0.7852135117909497, - "grad_norm": 14.387179374694824, - "kl": 0.0830078125, - "learning_rate": 2.1478648820905035e-07, - "loss": 0.0033, - "reward": 1.6135268211364746, - "reward_std": 0.04455283656716347, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4885267913341522, - "step": 2464 - }, - { - "completion_length": 93.46875, - "epoch": 0.7855321861057999, - "grad_norm": 50.458255767822266, - "kl": 0.2119140625, - "learning_rate": 2.1446781389420013e-07, - "loss": 0.0085, - "reward": 1.3679983615875244, - "reward_std": 0.056924257427453995, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3679984509944916, - "rewards/pad": 0.0, - "step": 2465 - }, - { - "completion_length": 69.6875, - "epoch": 0.7858508604206501, - "grad_norm": 19.51398468017578, - "kl": 0.18359375, - "learning_rate": 2.141491395793499e-07, - "loss": 0.0073, - "reward": 1.6340663433074951, - "reward_std": 0.09008748829364777, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5090662240982056, - "rewards/pad": 0.125, - "step": 2466 - }, - { - "completion_length": 20.859375, - "epoch": 0.7861695347355003, - "grad_norm": 91.67015838623047, - "kl": 0.1982421875, - "learning_rate": 2.1383046526449967e-07, - "loss": 0.008, - "reward": 1.6718909740447998, - "reward_std": 0.08964984118938446, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5468908548355103, - "step": 2467 - }, - { - "completion_length": 94.015625, - "epoch": 0.7864882090503506, - "grad_norm": 33.59825134277344, - "kl": 0.11328125, - "learning_rate": 2.1351179094964945e-07, - "loss": 0.0045, - "reward": 1.6019232273101807, - "reward_std": 0.04051993787288666, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6019231677055359, - "step": 2468 - }, - { - "completion_length": 118.484375, - "epoch": 0.7868068833652008, - "grad_norm": 17.77065086364746, - "kl": 0.20703125, - "learning_rate": 2.1319311663479923e-07, - "loss": 0.0083, - "reward": 1.4984447956085205, - "reward_std": 0.04714757204055786, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4984448552131653, - "step": 2469 - }, - { - "completion_length": 120.609375, - "epoch": 0.787125557680051, - "grad_norm": 89.51698303222656, - "kl": 0.138671875, - "learning_rate": 2.12874442319949e-07, - "loss": 0.0056, - "reward": 1.6271116733551025, - "reward_std": 0.047239039093256, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6271117329597473, - "rewards/pad": 0.0, - "step": 2470 - }, - { - "completion_length": 44.203125, - "epoch": 0.7874442319949012, - "grad_norm": 16.942968368530273, - "kl": 0.244140625, - "learning_rate": 2.125557680050988e-07, - "loss": 0.0098, - "reward": 1.7499277591705322, - "reward_std": 0.09639608860015869, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6249277591705322, - "rewards/pad": 0.125, - "step": 2471 - }, - { - "completion_length": 96.765625, - "epoch": 0.7877629063097514, - "grad_norm": 31.528419494628906, - "kl": 0.1455078125, - "learning_rate": 2.1223709369024857e-07, - "loss": 0.0058, - "reward": 1.540238857269287, - "reward_std": 0.04830256104469299, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5402387976646423, - "step": 2472 - }, - { - "completion_length": 120.859375, - "epoch": 0.7880815806246017, - "grad_norm": 11.028849601745605, - "kl": 0.14453125, - "learning_rate": 2.1191841937539835e-07, - "loss": 0.0058, - "reward": 1.5415122509002686, - "reward_std": 0.04378604516386986, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.416512131690979, - "step": 2473 - }, - { - "completion_length": 119.84375, - "epoch": 0.7884002549394519, - "grad_norm": 15.173741340637207, - "kl": 0.17578125, - "learning_rate": 2.1159974506054813e-07, - "loss": 0.007, - "reward": 1.8214788436889648, - "reward_std": 0.0862974226474762, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6964788436889648, - "step": 2474 - }, - { - "completion_length": 120.46875, - "epoch": 0.7887189292543021, - "grad_norm": 36.748870849609375, - "kl": 0.1103515625, - "learning_rate": 2.112810707456979e-07, - "loss": 0.0044, - "reward": 1.4876748323440552, - "reward_std": 0.026824306696653366, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3626747727394104, - "step": 2475 - }, - { - "completion_length": 122.140625, - "epoch": 0.7890376035691523, - "grad_norm": 47.65331268310547, - "kl": 0.115234375, - "learning_rate": 2.109623964308477e-07, - "loss": 0.0046, - "reward": 1.5400230884552002, - "reward_std": 0.11705848574638367, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4306480884552002, - "step": 2476 - }, - { - "completion_length": 119.796875, - "epoch": 0.7893562778840025, - "grad_norm": 34.42538070678711, - "kl": 0.1611328125, - "learning_rate": 2.1064372211599742e-07, - "loss": 0.0064, - "reward": 1.5244556665420532, - "reward_std": 0.061718154698610306, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5244556665420532, - "rewards/pad": 0.0, - "step": 2477 - }, - { - "completion_length": 146.21875, - "epoch": 0.7896749521988528, - "grad_norm": 39.508583068847656, - "kl": 0.189453125, - "learning_rate": 2.103250478011472e-07, - "loss": 0.0076, - "reward": 1.4695749282836914, - "reward_std": 0.07347087562084198, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.469574898481369, - "rewards/pad": 0.0, - "step": 2478 - }, - { - "completion_length": 70.515625, - "epoch": 0.789993626513703, - "grad_norm": 37.19403839111328, - "kl": 0.2177734375, - "learning_rate": 2.1000637348629698e-07, - "loss": 0.0087, - "reward": 1.6448030471801758, - "reward_std": 0.05787891894578934, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5198030471801758, - "rewards/pad": 0.125, - "step": 2479 - }, - { - "completion_length": 120.625, - "epoch": 0.7903123008285532, - "grad_norm": 62.248783111572266, - "kl": 0.1337890625, - "learning_rate": 2.0968769917144676e-07, - "loss": 0.0054, - "reward": 1.6159905195236206, - "reward_std": 0.04167880117893219, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6159905195236206, - "step": 2480 - }, - { - "completion_length": 143.671875, - "epoch": 0.7906309751434034, - "grad_norm": 19.507673263549805, - "kl": 0.2080078125, - "learning_rate": 2.0936902485659654e-07, - "loss": 0.0083, - "reward": 1.468658447265625, - "reward_std": 0.0568566769361496, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46865853667259216, - "step": 2481 - }, - { - "completion_length": 147.46875, - "epoch": 0.7909496494582536, - "grad_norm": 10.353738784790039, - "kl": 0.10009765625, - "learning_rate": 2.0905035054174632e-07, - "loss": 0.004, - "reward": 1.5171469449996948, - "reward_std": 0.05589066445827484, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3921470046043396, - "step": 2482 - }, - { - "completion_length": 45.390625, - "epoch": 0.7912683237731039, - "grad_norm": 74.0340347290039, - "kl": 0.25390625, - "learning_rate": 2.087316762268961e-07, - "loss": 0.0101, - "reward": 1.7721078395843506, - "reward_std": 0.12054979801177979, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6471078395843506, - "rewards/pad": 0.125, - "step": 2483 - }, - { - "completion_length": 98.421875, - "epoch": 0.7915869980879541, - "grad_norm": 37.16038131713867, - "kl": 0.1875, - "learning_rate": 2.0841300191204588e-07, - "loss": 0.0075, - "reward": 1.6870630979537964, - "reward_std": 0.06434883177280426, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4370630383491516, - "step": 2484 - }, - { - "completion_length": 147.796875, - "epoch": 0.7919056724028043, - "grad_norm": 39.15401840209961, - "kl": 0.1318359375, - "learning_rate": 2.0809432759719566e-07, - "loss": 0.0053, - "reward": 1.449187994003296, - "reward_std": 0.046437084674835205, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.32418808341026306, - "step": 2485 - }, - { - "completion_length": 98.734375, - "epoch": 0.7922243467176545, - "grad_norm": 62.96199035644531, - "kl": 0.23046875, - "learning_rate": 2.0777565328234542e-07, - "loss": 0.0092, - "reward": 1.6974351406097412, - "reward_std": 0.08225294947624207, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5880601406097412, - "step": 2486 - }, - { - "completion_length": 71.0, - "epoch": 0.7925430210325047, - "grad_norm": 246.57899475097656, - "kl": 0.1943359375, - "learning_rate": 2.074569789674952e-07, - "loss": 0.0078, - "reward": 1.6646615266799927, - "reward_std": 0.08801724016666412, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6646615266799927, - "rewards/pad": 0.0, - "step": 2487 - }, - { - "completion_length": 69.5, - "epoch": 0.792861695347355, - "grad_norm": 39.39512252807617, - "kl": 0.21875, - "learning_rate": 2.0713830465264498e-07, - "loss": 0.0088, - "reward": 1.6667184829711914, - "reward_std": 0.06767316162586212, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5417184233665466, - "step": 2488 - }, - { - "completion_length": 94.0625, - "epoch": 0.7931803696622052, - "grad_norm": 30.99285125732422, - "kl": 0.1669921875, - "learning_rate": 2.0681963033779476e-07, - "loss": 0.0067, - "reward": 1.5758203268051147, - "reward_std": 0.06976833194494247, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5758203268051147, - "rewards/pad": 0.0, - "step": 2489 - }, - { - "completion_length": 44.359375, - "epoch": 0.7934990439770554, - "grad_norm": 91.51752471923828, - "kl": 0.2001953125, - "learning_rate": 2.0650095602294454e-07, - "loss": 0.008, - "reward": 1.4082024097442627, - "reward_std": 0.05723255127668381, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4082023501396179, - "rewards/pad": 0.0, - "step": 2490 - }, - { - "completion_length": 97.40625, - "epoch": 0.7938177182919057, - "grad_norm": 78.94409942626953, - "kl": 0.177734375, - "learning_rate": 2.0618228170809432e-07, - "loss": 0.0071, - "reward": 1.519840955734253, - "reward_std": 0.06558603793382645, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39484092593193054, - "step": 2491 - }, - { - "completion_length": 172.453125, - "epoch": 0.794136392606756, - "grad_norm": 9.942802429199219, - "kl": 0.09716796875, - "learning_rate": 2.058636073932441e-07, - "loss": 0.0039, - "reward": 1.6672635078430176, - "reward_std": 0.03277071192860603, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5422636270523071, - "step": 2492 - }, - { - "completion_length": 70.515625, - "epoch": 0.7944550669216062, - "grad_norm": 39.335575103759766, - "kl": 0.228515625, - "learning_rate": 2.0554493307839388e-07, - "loss": 0.0092, - "reward": 1.717850923538208, - "reward_std": 0.12057183682918549, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5928510427474976, - "step": 2493 - }, - { - "completion_length": 172.1875, - "epoch": 0.7947737412364564, - "grad_norm": 13.286402702331543, - "kl": 0.09033203125, - "learning_rate": 2.0522625876354366e-07, - "loss": 0.0036, - "reward": 1.4089343547821045, - "reward_std": 0.02762546017765999, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4089343547821045, - "step": 2494 - }, - { - "completion_length": 70.421875, - "epoch": 0.7950924155513066, - "grad_norm": 24.12227439880371, - "kl": 0.19921875, - "learning_rate": 2.0490758444869344e-07, - "loss": 0.008, - "reward": 1.798035979270935, - "reward_std": 0.08788996189832687, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5636609792709351, - "rewards/pad": 0.234375, - "step": 2495 - }, - { - "completion_length": 19.984375, - "epoch": 0.7954110898661568, - "grad_norm": 134.33108520507812, - "kl": 0.2451171875, - "learning_rate": 2.045889101338432e-07, - "loss": 0.0098, - "reward": 1.7323206663131714, - "reward_std": 0.07797897607088089, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49794575572013855, - "rewards/pad": 0.234375, - "step": 2496 - }, - { - "completion_length": 44.703125, - "epoch": 0.795729764181007, - "grad_norm": 112.21025085449219, - "kl": 0.193359375, - "learning_rate": 2.0427023581899297e-07, - "loss": 0.0077, - "reward": 1.5991157293319702, - "reward_std": 0.04800572246313095, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5991157293319702, - "rewards/pad": 0.0, - "step": 2497 - }, - { - "completion_length": 92.8125, - "epoch": 0.7960484384958573, - "grad_norm": 48.543636322021484, - "kl": 0.2021484375, - "learning_rate": 2.0395156150414276e-07, - "loss": 0.0081, - "reward": 1.562514066696167, - "reward_std": 0.03672177344560623, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5625141263008118, - "rewards/pad": 0.0, - "step": 2498 - }, - { - "completion_length": 68.734375, - "epoch": 0.7963671128107075, - "grad_norm": 48.04470443725586, - "kl": 0.1474609375, - "learning_rate": 2.0363288718929254e-07, - "loss": 0.0059, - "reward": 1.6913621425628662, - "reward_std": 0.052751973271369934, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5663621425628662, - "step": 2499 - }, - { - "completion_length": 146.3125, - "epoch": 0.7966857871255577, - "grad_norm": 21.584346771240234, - "kl": 0.107421875, - "learning_rate": 2.0331421287444232e-07, - "loss": 0.0043, - "reward": 1.4488475322723389, - "reward_std": 0.07207922637462616, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43322253227233887, - "step": 2500 - }, - { - "completion_length": 171.71875, - "epoch": 0.7970044614404079, - "grad_norm": 11.806103706359863, - "kl": 0.1298828125, - "learning_rate": 2.029955385595921e-07, - "loss": 0.0052, - "reward": 1.574622631072998, - "reward_std": 0.062131188809871674, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5746225714683533, - "rewards/pad": 0.0, - "step": 2501 - }, - { - "completion_length": 69.65625, - "epoch": 0.7973231357552581, - "grad_norm": 45.881038665771484, - "kl": 0.30078125, - "learning_rate": 2.0267686424474188e-07, - "loss": 0.0121, - "reward": 1.6294794082641602, - "reward_std": 0.06624305248260498, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5044794678688049, - "rewards/pad": 0.125, - "step": 2502 - }, - { - "completion_length": 122.328125, - "epoch": 0.7976418100701084, - "grad_norm": 19.17009925842285, - "kl": 0.1455078125, - "learning_rate": 2.0235818992989166e-07, - "loss": 0.0058, - "reward": 1.4887192249298096, - "reward_std": 0.11026011407375336, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.48871925473213196, - "step": 2503 - }, - { - "completion_length": 121.40625, - "epoch": 0.7979604843849586, - "grad_norm": 28.53675079345703, - "kl": 0.09033203125, - "learning_rate": 2.0203951561504144e-07, - "loss": 0.0036, - "reward": 1.7072861194610596, - "reward_std": 0.05641203373670578, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45728611946105957, - "rewards/pad": 0.25, - "step": 2504 - }, - { - "completion_length": 94.0625, - "epoch": 0.7982791586998088, - "grad_norm": 30.394912719726562, - "kl": 0.162109375, - "learning_rate": 2.0172084130019122e-07, - "loss": 0.0065, - "reward": 1.6276651620864868, - "reward_std": 0.05921010673046112, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5026651620864868, - "step": 2505 - }, - { - "completion_length": 94.921875, - "epoch": 0.798597833014659, - "grad_norm": 19.432514190673828, - "kl": 0.2021484375, - "learning_rate": 2.0140216698534097e-07, - "loss": 0.0081, - "reward": 1.486906886100769, - "reward_std": 0.10192431509494781, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42440685629844666, - "step": 2506 - }, - { - "completion_length": 123.59375, - "epoch": 0.7989165073295093, - "grad_norm": 32.435279846191406, - "kl": 0.12060546875, - "learning_rate": 2.0108349267049073e-07, - "loss": 0.0048, - "reward": 1.606692910194397, - "reward_std": 0.04950396716594696, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4816928803920746, - "step": 2507 - }, - { - "completion_length": 95.6875, - "epoch": 0.7992351816443595, - "grad_norm": 100.50098419189453, - "kl": 0.140625, - "learning_rate": 2.007648183556405e-07, - "loss": 0.0056, - "reward": 1.4444937705993652, - "reward_std": 0.05337250977754593, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44449371099472046, - "rewards/pad": 0.0, - "step": 2508 - }, - { - "completion_length": 44.953125, - "epoch": 0.7995538559592097, - "grad_norm": 35.06789779663086, - "kl": 0.1923828125, - "learning_rate": 2.004461440407903e-07, - "loss": 0.0077, - "reward": 1.7531867027282715, - "reward_std": 0.06342080235481262, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6281867623329163, - "rewards/pad": 0.125, - "step": 2509 - }, - { - "completion_length": 96.953125, - "epoch": 0.7998725302740599, - "grad_norm": 94.3211898803711, - "kl": 0.1865234375, - "learning_rate": 2.0012746972594007e-07, - "loss": 0.0074, - "reward": 1.6970831155776978, - "reward_std": 0.07682255655527115, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.44708314538002014, - "step": 2510 - }, - { - "completion_length": 46.375, - "epoch": 0.8001912045889101, - "grad_norm": 22.679075241088867, - "kl": 0.365234375, - "learning_rate": 1.9980879541108985e-07, - "loss": 0.0146, - "reward": 1.5437774658203125, - "reward_std": 0.12744572758674622, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.543777585029602, - "step": 2511 - }, - { - "completion_length": 146.234375, - "epoch": 0.8005098789037604, - "grad_norm": 7.708920955657959, - "kl": 0.1201171875, - "learning_rate": 1.9949012109623963e-07, - "loss": 0.0048, - "reward": 1.5232867002487183, - "reward_std": 0.036368198692798615, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5232867002487183, - "step": 2512 - }, - { - "completion_length": 99.40625, - "epoch": 0.8008285532186106, - "grad_norm": 77.54698944091797, - "kl": 0.11181640625, - "learning_rate": 1.991714467813894e-07, - "loss": 0.0045, - "reward": 1.5867327451705933, - "reward_std": 0.10627540946006775, - "rewards/answer_reward": 0.15625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43048280477523804, - "step": 2513 - }, - { - "completion_length": 121.03125, - "epoch": 0.8011472275334608, - "grad_norm": 19.157119750976562, - "kl": 0.14453125, - "learning_rate": 1.988527724665392e-07, - "loss": 0.0058, - "reward": 1.5624780654907227, - "reward_std": 0.02887941338121891, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.31247803568840027, - "step": 2514 - }, - { - "completion_length": 145.65625, - "epoch": 0.801465901848311, - "grad_norm": 23.609500885009766, - "kl": 0.1376953125, - "learning_rate": 1.9853409815168897e-07, - "loss": 0.0055, - "reward": 1.5416293144226074, - "reward_std": 0.06482907384634018, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.541629433631897, - "rewards/pad": 0.0, - "step": 2515 - }, - { - "completion_length": 123.03125, - "epoch": 0.8017845761631612, - "grad_norm": 48.15242385864258, - "kl": 0.126953125, - "learning_rate": 1.9821542383683872e-07, - "loss": 0.0051, - "reward": 1.500812292098999, - "reward_std": 0.11029379069805145, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4226873517036438, - "rewards/pad": 0.078125, - "step": 2516 - }, - { - "completion_length": 118.4375, - "epoch": 0.8021032504780115, - "grad_norm": 44.145843505859375, - "kl": 0.10888671875, - "learning_rate": 1.978967495219885e-07, - "loss": 0.0044, - "reward": 1.3923659324645996, - "reward_std": 0.04641153663396835, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3923659324645996, - "step": 2517 - }, - { - "completion_length": 72.65625, - "epoch": 0.8024219247928617, - "grad_norm": 63.79623794555664, - "kl": 0.1337890625, - "learning_rate": 1.9757807520713829e-07, - "loss": 0.0053, - "reward": 1.8156921863555908, - "reward_std": 0.12485399842262268, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.581317126750946, - "rewards/pad": 0.25, - "step": 2518 - }, - { - "completion_length": 95.265625, - "epoch": 0.8027405991077119, - "grad_norm": 148.3966827392578, - "kl": 0.12890625, - "learning_rate": 1.9725940089228807e-07, - "loss": 0.0052, - "reward": 1.866058111190796, - "reward_std": 0.06109733134508133, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4910580813884735, - "rewards/pad": 0.375, - "step": 2519 - }, - { - "completion_length": 71.609375, - "epoch": 0.8030592734225621, - "grad_norm": 16.46411895751953, - "kl": 0.1455078125, - "learning_rate": 1.9694072657743785e-07, - "loss": 0.0058, - "reward": 1.8894639015197754, - "reward_std": 0.05118672549724579, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6394637823104858, - "step": 2520 - }, - { - "completion_length": 126.0625, - "epoch": 0.8033779477374123, - "grad_norm": 27.57711410522461, - "kl": 0.13671875, - "learning_rate": 1.9662205226258763e-07, - "loss": 0.0055, - "reward": 1.6503820419311523, - "reward_std": 0.07189613580703735, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5253819227218628, - "step": 2521 - }, - { - "completion_length": 97.796875, - "epoch": 0.8036966220522626, - "grad_norm": 75.9854736328125, - "kl": 0.2060546875, - "learning_rate": 1.963033779477374e-07, - "loss": 0.0083, - "reward": 1.6430482864379883, - "reward_std": 0.044374167919158936, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5180484056472778, - "step": 2522 - }, - { - "completion_length": 173.03125, - "epoch": 0.8040152963671128, - "grad_norm": 22.706045150756836, - "kl": 0.10400390625, - "learning_rate": 1.959847036328872e-07, - "loss": 0.0042, - "reward": 1.529914379119873, - "reward_std": 0.05517827719449997, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40491437911987305, - "step": 2523 - }, - { - "completion_length": 98.8125, - "epoch": 0.804333970681963, - "grad_norm": 68.7165298461914, - "kl": 0.142578125, - "learning_rate": 1.9566602931803697e-07, - "loss": 0.0057, - "reward": 1.7077398300170898, - "reward_std": 0.12510909140110016, - "rewards/pad": 0.328125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3796148896217346, - "step": 2524 - }, - { - "completion_length": 121.125, - "epoch": 0.8046526449968132, - "grad_norm": 23.41330337524414, - "kl": 0.1259765625, - "learning_rate": 1.9534735500318675e-07, - "loss": 0.005, - "reward": 1.5351699590682983, - "reward_std": 0.12469430267810822, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4257949888706207, - "rewards/pad": 0.125, - "step": 2525 - }, - { - "completion_length": 119.515625, - "epoch": 0.8049713193116634, - "grad_norm": 41.60466384887695, - "kl": 0.1220703125, - "learning_rate": 1.950286806883365e-07, - "loss": 0.0049, - "reward": 1.618861436843872, - "reward_std": 0.08256522566080093, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4938614070415497, - "rewards/pad": 0.125, - "step": 2526 - }, - { - "completion_length": 96.515625, - "epoch": 0.8052899936265137, - "grad_norm": 64.93916320800781, - "kl": 0.248046875, - "learning_rate": 1.9471000637348628e-07, - "loss": 0.0099, - "reward": 1.637161374092102, - "reward_std": 0.09472230076789856, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5121614336967468, - "rewards/pad": 0.125, - "step": 2527 - }, - { - "completion_length": 121.828125, - "epoch": 0.8056086679413639, - "grad_norm": 13.994966506958008, - "kl": 0.1796875, - "learning_rate": 1.9439133205863606e-07, - "loss": 0.0072, - "reward": 1.6026124954223633, - "reward_std": 0.07690699398517609, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4932374954223633, - "step": 2528 - }, - { - "completion_length": 21.015625, - "epoch": 0.8059273422562141, - "grad_norm": 60.27497100830078, - "kl": 0.25390625, - "learning_rate": 1.9407265774378584e-07, - "loss": 0.0102, - "reward": 1.5055058002471924, - "reward_std": 0.14143159985542297, - "rewards/answer_reward": 0.0625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4430057406425476, - "step": 2529 - }, - { - "completion_length": 93.578125, - "epoch": 0.8062460165710643, - "grad_norm": 28.571090698242188, - "kl": 0.2216796875, - "learning_rate": 1.9375398342893563e-07, - "loss": 0.0089, - "reward": 1.4926002025604248, - "reward_std": 0.04713389277458191, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49260014295578003, - "step": 2530 - }, - { - "completion_length": 96.21875, - "epoch": 0.8065646908859146, - "grad_norm": 23.014400482177734, - "kl": 0.11181640625, - "learning_rate": 1.934353091140854e-07, - "loss": 0.0045, - "reward": 1.564863681793213, - "reward_std": 0.08941254764795303, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4554886221885681, - "step": 2531 - }, - { - "completion_length": 97.84375, - "epoch": 0.8068833652007649, - "grad_norm": 32.33191680908203, - "kl": 0.212890625, - "learning_rate": 1.9311663479923519e-07, - "loss": 0.0085, - "reward": 1.5394681692123413, - "reward_std": 0.07038704305887222, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5394681692123413, - "rewards/pad": 0.0, - "step": 2532 - }, - { - "completion_length": 146.609375, - "epoch": 0.8072020395156151, - "grad_norm": 50.76522445678711, - "kl": 0.1796875, - "learning_rate": 1.9279796048438497e-07, - "loss": 0.0072, - "reward": 1.4944519996643066, - "reward_std": 0.04005897417664528, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.49445199966430664, - "step": 2533 - }, - { - "completion_length": 122.078125, - "epoch": 0.8075207138304653, - "grad_norm": 35.766685485839844, - "kl": 0.09619140625, - "learning_rate": 1.9247928616953475e-07, - "loss": 0.0038, - "reward": 1.5580180883407593, - "reward_std": 0.09663405269384384, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4642680287361145, - "rewards/pad": 0.09375, - "step": 2534 - }, - { - "completion_length": 121.1875, - "epoch": 0.8078393881453155, - "grad_norm": 20.163616180419922, - "kl": 0.11474609375, - "learning_rate": 1.921606118546845e-07, - "loss": 0.0046, - "reward": 1.4902430772781372, - "reward_std": 0.09373285621404648, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3964930772781372, - "step": 2535 - }, - { - "completion_length": 71.3125, - "epoch": 0.8081580624601657, - "grad_norm": 22.997821807861328, - "kl": 0.1123046875, - "learning_rate": 1.9184193753983428e-07, - "loss": 0.0045, - "reward": 1.7351340055465698, - "reward_std": 0.04858613759279251, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48513394594192505, - "step": 2536 - }, - { - "completion_length": 95.25, - "epoch": 0.808476736775016, - "grad_norm": 91.627685546875, - "kl": 0.1640625, - "learning_rate": 1.9152326322498406e-07, - "loss": 0.0065, - "reward": 1.4741920232772827, - "reward_std": 0.04207082465291023, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4741920232772827, - "rewards/pad": 0.0, - "step": 2537 - }, - { - "completion_length": 120.578125, - "epoch": 0.8087954110898662, - "grad_norm": 22.113229751586914, - "kl": 0.1357421875, - "learning_rate": 1.9120458891013382e-07, - "loss": 0.0054, - "reward": 1.5726546049118042, - "reward_std": 0.036812715232372284, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.572654664516449, - "step": 2538 - }, - { - "completion_length": 147.421875, - "epoch": 0.8091140854047164, - "grad_norm": 18.252666473388672, - "kl": 0.1669921875, - "learning_rate": 1.908859145952836e-07, - "loss": 0.0067, - "reward": 1.5629411935806274, - "reward_std": 0.10346086323261261, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5629411935806274, - "step": 2539 - }, - { - "completion_length": 70.546875, - "epoch": 0.8094327597195666, - "grad_norm": 37.548370361328125, - "kl": 0.232421875, - "learning_rate": 1.9056724028043338e-07, - "loss": 0.0093, - "reward": 1.7286677360534668, - "reward_std": 0.08055858314037323, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6036676168441772, - "step": 2540 - }, - { - "completion_length": 18.984375, - "epoch": 0.8097514340344169, - "grad_norm": 46.97723388671875, - "kl": 0.16796875, - "learning_rate": 1.9024856596558316e-07, - "loss": 0.0067, - "reward": 1.5712471008300781, - "reward_std": 0.03860289603471756, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5712471008300781, - "rewards/pad": 0.0, - "step": 2541 - }, - { - "completion_length": 70.796875, - "epoch": 0.8100701083492671, - "grad_norm": 78.65106201171875, - "kl": 0.1806640625, - "learning_rate": 1.8992989165073294e-07, - "loss": 0.0072, - "reward": 1.5877249240875244, - "reward_std": 0.06156248226761818, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46272486448287964, - "rewards/pad": 0.125, - "step": 2542 - }, - { - "completion_length": 45.265625, - "epoch": 0.8103887826641173, - "grad_norm": 38.531715393066406, - "kl": 0.2197265625, - "learning_rate": 1.8961121733588272e-07, - "loss": 0.0088, - "reward": 1.666687250137329, - "reward_std": 0.04203493520617485, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41668736934661865, - "step": 2543 - }, - { - "completion_length": 173.375, - "epoch": 0.8107074569789675, - "grad_norm": 9.728880882263184, - "kl": 0.09130859375, - "learning_rate": 1.892925430210325e-07, - "loss": 0.0037, - "reward": 1.4494705200195312, - "reward_std": 0.0460132360458374, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44947052001953125, - "step": 2544 - }, - { - "completion_length": 20.390625, - "epoch": 0.8110261312938177, - "grad_norm": 79.57874298095703, - "kl": 0.205078125, - "learning_rate": 1.8897386870618225e-07, - "loss": 0.0082, - "reward": 1.6763660907745361, - "reward_std": 0.09058883041143417, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5513660311698914, - "rewards/pad": 0.125, - "step": 2545 - }, - { - "completion_length": 94.75, - "epoch": 0.811344805608668, - "grad_norm": 187.05030822753906, - "kl": 0.232421875, - "learning_rate": 1.8865519439133203e-07, - "loss": 0.0093, - "reward": 1.487398386001587, - "reward_std": 0.050307560712099075, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48739832639694214, - "rewards/pad": 0.0, - "step": 2546 - }, - { - "completion_length": 43.75, - "epoch": 0.8116634799235182, - "grad_norm": 40.909996032714844, - "kl": 0.181640625, - "learning_rate": 1.8833652007648181e-07, - "loss": 0.0073, - "reward": 1.591270923614502, - "reward_std": 0.03539936989545822, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5912708640098572, - "rewards/pad": 0.0, - "step": 2547 - }, - { - "completion_length": 172.171875, - "epoch": 0.8119821542383684, - "grad_norm": 5.128893852233887, - "kl": 0.1435546875, - "learning_rate": 1.880178457616316e-07, - "loss": 0.0058, - "reward": 1.5630221366882324, - "reward_std": 0.028799906373023987, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4380221664905548, - "step": 2548 - }, - { - "completion_length": 95.890625, - "epoch": 0.8123008285532186, - "grad_norm": 22.643131256103516, - "kl": 0.126953125, - "learning_rate": 1.8769917144678138e-07, - "loss": 0.0051, - "reward": 1.5871820449829102, - "reward_std": 0.0587935745716095, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4621821641921997, - "rewards/pad": 0.125, - "step": 2549 - }, - { - "completion_length": 71.125, - "epoch": 0.8126195028680688, - "grad_norm": 10.908669471740723, - "kl": 0.181640625, - "learning_rate": 1.8738049713193116e-07, - "loss": 0.0073, - "reward": 1.713303565979004, - "reward_std": 0.04703020676970482, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5883035659790039, - "step": 2550 - }, - { - "completion_length": 148.453125, - "epoch": 0.812938177182919, - "grad_norm": 11.391550064086914, - "kl": 0.1025390625, - "learning_rate": 1.8706182281708094e-07, - "loss": 0.0041, - "reward": 1.48088538646698, - "reward_std": 0.1062798872590065, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.4965103268623352, - "step": 2551 - }, - { - "completion_length": 124.984375, - "epoch": 0.8132568514977693, - "grad_norm": 21.370189666748047, - "kl": 0.09130859375, - "learning_rate": 1.8674314850223072e-07, - "loss": 0.0037, - "reward": 1.8383700847625732, - "reward_std": 0.08607466518878937, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.6039950847625732, - "rewards/pad": 0.25, - "step": 2552 - }, - { - "completion_length": 95.734375, - "epoch": 0.8135755258126195, - "grad_norm": 38.19483184814453, - "kl": 0.140625, - "learning_rate": 1.864244741873805e-07, - "loss": 0.0056, - "reward": 1.7248027324676514, - "reward_std": 0.07156145572662354, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5998027324676514, - "step": 2553 - }, - { - "completion_length": 121.578125, - "epoch": 0.8138942001274697, - "grad_norm": 30.011445999145508, - "kl": 0.25390625, - "learning_rate": 1.8610579987253028e-07, - "loss": 0.0102, - "reward": 1.801880121231079, - "reward_std": 0.06565716862678528, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6768800616264343, - "step": 2554 - }, - { - "completion_length": 121.9375, - "epoch": 0.8142128744423199, - "grad_norm": 32.855445861816406, - "kl": 0.1103515625, - "learning_rate": 1.8578712555768003e-07, - "loss": 0.0044, - "reward": 1.6574033498764038, - "reward_std": 0.03758838772773743, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5324033498764038, - "rewards/pad": 0.125, - "step": 2555 - }, - { - "completion_length": 123.28125, - "epoch": 0.8145315487571702, - "grad_norm": 39.836605072021484, - "kl": 0.466796875, - "learning_rate": 1.854684512428298e-07, - "loss": 0.0187, - "reward": 1.734020709991455, - "reward_std": 0.08828628063201904, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49964579939842224, - "rewards/pad": 0.234375, - "step": 2556 - }, - { - "completion_length": 122.734375, - "epoch": 0.8148502230720204, - "grad_norm": 33.45938491821289, - "kl": 0.1259765625, - "learning_rate": 1.851497769279796e-07, - "loss": 0.005, - "reward": 1.7804425954818726, - "reward_std": 0.179967999458313, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.5304425358772278, - "rewards/pad": 0.28125, - "step": 2557 - }, - { - "completion_length": 119.234375, - "epoch": 0.8151688973868706, - "grad_norm": 53.81477355957031, - "kl": 0.12890625, - "learning_rate": 1.8483110261312937e-07, - "loss": 0.0051, - "reward": 1.5291807651519775, - "reward_std": 0.04661605507135391, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5291807055473328, - "step": 2558 - }, - { - "completion_length": 73.015625, - "epoch": 0.8154875717017208, - "grad_norm": 61.18295669555664, - "kl": 0.419921875, - "learning_rate": 1.8451242829827915e-07, - "loss": 0.0168, - "reward": 1.5933921337127686, - "reward_std": 0.1468154639005661, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4215170741081238, - "rewards/pad": 0.171875, - "step": 2559 - }, - { - "completion_length": 147.03125, - "epoch": 0.815806246016571, - "grad_norm": 18.40555191040039, - "kl": 0.1015625, - "learning_rate": 1.8419375398342893e-07, - "loss": 0.0041, - "reward": 1.5896304845809937, - "reward_std": 0.03527717664837837, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5896305441856384, - "step": 2560 - }, - { - "completion_length": 170.578125, - "epoch": 0.8161249203314213, - "grad_norm": 9.662720680236816, - "kl": 0.07080078125, - "learning_rate": 1.8387507966857871e-07, - "loss": 0.0028, - "reward": 1.4293944835662842, - "reward_std": 0.030367599800229073, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42939436435699463, - "rewards/pad": 0.0, - "step": 2561 - }, - { - "completion_length": 73.125, - "epoch": 0.8164435946462715, - "grad_norm": 41.47747039794922, - "kl": 0.146484375, - "learning_rate": 1.835564053537285e-07, - "loss": 0.0058, - "reward": 1.9574675559997559, - "reward_std": 0.06329420208930969, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5824676156044006, - "step": 2562 - }, - { - "completion_length": 70.71875, - "epoch": 0.8167622689611217, - "grad_norm": 99.56320190429688, - "kl": 0.25390625, - "learning_rate": 1.8323773103887828e-07, - "loss": 0.0102, - "reward": 1.462177038192749, - "reward_std": 0.08060286939144135, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.462177038192749, - "rewards/pad": 0.0, - "step": 2563 - }, - { - "completion_length": 147.15625, - "epoch": 0.8170809432759719, - "grad_norm": 40.50618362426758, - "kl": 0.1005859375, - "learning_rate": 1.8291905672402806e-07, - "loss": 0.004, - "reward": 1.5000665187835693, - "reward_std": 0.04850205034017563, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5000665187835693, - "step": 2564 - }, - { - "completion_length": 94.375, - "epoch": 0.8173996175908221, - "grad_norm": 28.008052825927734, - "kl": 0.12060546875, - "learning_rate": 1.826003824091778e-07, - "loss": 0.0048, - "reward": 1.5912353992462158, - "reward_std": 0.04305996745824814, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5912354588508606, - "rewards/pad": 0.0, - "step": 2565 - }, - { - "completion_length": 69.984375, - "epoch": 0.8177182919056724, - "grad_norm": 94.50910949707031, - "kl": 0.146484375, - "learning_rate": 1.822817080943276e-07, - "loss": 0.0059, - "reward": 1.651397466659546, - "reward_std": 0.05276678502559662, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6513974070549011, - "step": 2566 - }, - { - "completion_length": 97.453125, - "epoch": 0.8180369662205226, - "grad_norm": 26.70928192138672, - "kl": 0.1044921875, - "learning_rate": 1.8196303377947737e-07, - "loss": 0.0042, - "reward": 1.7432599067687988, - "reward_std": 0.03985518962144852, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6182599067687988, - "step": 2567 - }, - { - "completion_length": 120.25, - "epoch": 0.8183556405353728, - "grad_norm": 29.575092315673828, - "kl": 0.1328125, - "learning_rate": 1.8164435946462715e-07, - "loss": 0.0053, - "reward": 1.607896327972412, - "reward_std": 0.03988271206617355, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6078962683677673, - "step": 2568 - }, - { - "completion_length": 96.734375, - "epoch": 0.818674314850223, - "grad_norm": 48.102760314941406, - "kl": 0.205078125, - "learning_rate": 1.813256851497769e-07, - "loss": 0.0082, - "reward": 1.57046377658844, - "reward_std": 0.1351807415485382, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3985888361930847, - "rewards/pad": 0.171875, - "step": 2569 - }, - { - "completion_length": 70.046875, - "epoch": 0.8189929891650733, - "grad_norm": 52.74428939819336, - "kl": 0.1298828125, - "learning_rate": 1.8100701083492669e-07, - "loss": 0.0052, - "reward": 1.6519252061843872, - "reward_std": 0.047878287732601166, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5269252061843872, - "step": 2570 - }, - { - "completion_length": 72.359375, - "epoch": 0.8193116634799236, - "grad_norm": 23.516767501831055, - "kl": 0.12890625, - "learning_rate": 1.8068833652007647e-07, - "loss": 0.0051, - "reward": 1.7319979667663574, - "reward_std": 0.04590877145528793, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4819980561733246, - "rewards/pad": 0.25, - "step": 2571 - }, - { - "completion_length": 98.109375, - "epoch": 0.8196303377947738, - "grad_norm": 51.41060256958008, - "kl": 0.1513671875, - "learning_rate": 1.8036966220522625e-07, - "loss": 0.006, - "reward": 1.6041666269302368, - "reward_std": 0.12016516178846359, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49479150772094727, - "rewards/pad": 0.109375, - "step": 2572 - }, - { - "completion_length": 44.484375, - "epoch": 0.819949012109624, - "grad_norm": 73.11418151855469, - "kl": 0.2578125, - "learning_rate": 1.8005098789037603e-07, - "loss": 0.0103, - "reward": 1.486095905303955, - "reward_std": 0.1536726951599121, - "rewards/answer_reward": 0.03125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.45484596490859985, - "step": 2573 - }, - { - "completion_length": 92.28125, - "epoch": 0.8202676864244742, - "grad_norm": 28.09284782409668, - "kl": 0.35546875, - "learning_rate": 1.797323135755258e-07, - "loss": 0.0142, - "reward": 1.3996057510375977, - "reward_std": 0.0659431666135788, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39960581064224243, - "step": 2574 - }, - { - "completion_length": 43.078125, - "epoch": 0.8205863607393244, - "grad_norm": 128.5286407470703, - "kl": 0.1875, - "learning_rate": 1.7941363926067556e-07, - "loss": 0.0075, - "reward": 1.719598650932312, - "reward_std": 0.13763976097106934, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5945987105369568, - "step": 2575 - }, - { - "completion_length": 44.65625, - "epoch": 0.8209050350541747, - "grad_norm": 49.774539947509766, - "kl": 0.1591796875, - "learning_rate": 1.7909496494582534e-07, - "loss": 0.0064, - "reward": 1.6712117195129395, - "reward_std": 0.054868489503860474, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.421211838722229, - "rewards/pad": 0.25, - "step": 2576 - }, - { - "completion_length": 72.828125, - "epoch": 0.8212237093690249, - "grad_norm": 103.25884246826172, - "kl": 0.1708984375, - "learning_rate": 1.7877629063097512e-07, - "loss": 0.0068, - "reward": 1.5469772815704346, - "reward_std": 0.06502607464790344, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42197737097740173, - "step": 2577 - }, - { - "completion_length": 71.625, - "epoch": 0.8215423836838751, - "grad_norm": 108.45342254638672, - "kl": 0.09814453125, - "learning_rate": 1.784576163161249e-07, - "loss": 0.0039, - "reward": 1.9143872261047363, - "reward_std": 0.07708927243947983, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5393871665000916, - "step": 2578 - }, - { - "completion_length": 173.84375, - "epoch": 0.8218610579987253, - "grad_norm": 11.318681716918945, - "kl": 0.0615234375, - "learning_rate": 1.7813894200127468e-07, - "loss": 0.0025, - "reward": 1.603217601776123, - "reward_std": 0.02870849147439003, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4782176613807678, - "step": 2579 - }, - { - "completion_length": 146.953125, - "epoch": 0.8221797323135756, - "grad_norm": 67.63125610351562, - "kl": 0.1123046875, - "learning_rate": 1.7782026768642447e-07, - "loss": 0.0045, - "reward": 1.5123826265335083, - "reward_std": 0.04073712229728699, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5123825073242188, - "step": 2580 - }, - { - "completion_length": 42.875, - "epoch": 0.8224984066284258, - "grad_norm": 49.53752517700195, - "kl": 0.2412109375, - "learning_rate": 1.7750159337157425e-07, - "loss": 0.0097, - "reward": 1.7396655082702637, - "reward_std": 0.12579971551895142, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5052905678749084, - "step": 2581 - }, - { - "completion_length": 95.9375, - "epoch": 0.822817080943276, - "grad_norm": 11.10189151763916, - "kl": 0.154296875, - "learning_rate": 1.7718291905672403e-07, - "loss": 0.0062, - "reward": 1.5884000062942505, - "reward_std": 0.04006306082010269, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5884000658988953, - "step": 2582 - }, - { - "completion_length": 95.1875, - "epoch": 0.8231357552581262, - "grad_norm": 23.773466110229492, - "kl": 0.123046875, - "learning_rate": 1.768642447418738e-07, - "loss": 0.0049, - "reward": 1.6813939809799194, - "reward_std": 0.07705575227737427, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4313940405845642, - "rewards/pad": 0.25, - "step": 2583 - }, - { - "completion_length": 70.25, - "epoch": 0.8234544295729764, - "grad_norm": 97.11119842529297, - "kl": 0.1640625, - "learning_rate": 1.7654557042702356e-07, - "loss": 0.0066, - "reward": 1.461164116859436, - "reward_std": 0.13551735877990723, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3986641466617584, - "rewards/pad": 0.0625, - "step": 2584 - }, - { - "completion_length": 148.96875, - "epoch": 0.8237731038878267, - "grad_norm": 13.473774909973145, - "kl": 0.201171875, - "learning_rate": 1.7622689611217334e-07, - "loss": 0.008, - "reward": 1.4465606212615967, - "reward_std": 0.038685984909534454, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44656071066856384, - "rewards/pad": 0.0, - "step": 2585 - }, - { - "completion_length": 69.4375, - "epoch": 0.8240917782026769, - "grad_norm": 59.2571907043457, - "kl": 0.10986328125, - "learning_rate": 1.7590822179732312e-07, - "loss": 0.0044, - "reward": 1.6228172779083252, - "reward_std": 0.08944885432720184, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5134422779083252, - "rewards/pad": 0.109375, - "step": 2586 - }, - { - "completion_length": 69.953125, - "epoch": 0.8244104525175271, - "grad_norm": 48.022098541259766, - "kl": 0.15625, - "learning_rate": 1.755895474824729e-07, - "loss": 0.0062, - "reward": 1.5253853797912598, - "reward_std": 0.050665490329265594, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5253853797912598, - "rewards/pad": 0.0, - "step": 2587 - }, - { - "completion_length": 67.453125, - "epoch": 0.8247291268323773, - "grad_norm": 350.6123962402344, - "kl": 0.212890625, - "learning_rate": 1.7527087316762268e-07, - "loss": 0.0085, - "reward": 1.5839134454727173, - "reward_std": 0.12105952203273773, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5995384454727173, - "step": 2588 - }, - { - "completion_length": 70.484375, - "epoch": 0.8250478011472275, - "grad_norm": 25.583349227905273, - "kl": 0.376953125, - "learning_rate": 1.7495219885277246e-07, - "loss": 0.0151, - "reward": 1.696850299835205, - "reward_std": 0.1311756670475006, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5718502998352051, - "rewards/pad": 0.125, - "step": 2589 - }, - { - "completion_length": 120.203125, - "epoch": 0.8253664754620778, - "grad_norm": 25.217897415161133, - "kl": 0.1552734375, - "learning_rate": 1.7463352453792224e-07, - "loss": 0.0062, - "reward": 1.5294244289398193, - "reward_std": 0.0945592150092125, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5137994289398193, - "step": 2590 - }, - { - "completion_length": 96.1875, - "epoch": 0.825685149776928, - "grad_norm": 61.383060455322266, - "kl": 0.1162109375, - "learning_rate": 1.7431485022307202e-07, - "loss": 0.0046, - "reward": 1.4659759998321533, - "reward_std": 0.09193190932273865, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3409760296344757, - "step": 2591 - }, - { - "completion_length": 68.421875, - "epoch": 0.8260038240917782, - "grad_norm": 19.33612060546875, - "kl": 0.193359375, - "learning_rate": 1.739961759082218e-07, - "loss": 0.0077, - "reward": 1.5404525995254517, - "reward_std": 0.09838010370731354, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5248275995254517, - "rewards/pad": 0.03125, - "step": 2592 - }, - { - "completion_length": 172.09375, - "epoch": 0.8263224984066284, - "grad_norm": 9.181909561157227, - "kl": 0.1474609375, - "learning_rate": 1.7367750159337159e-07, - "loss": 0.0059, - "reward": 1.5430819988250732, - "reward_std": 0.036296404898166656, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5430819988250732, - "step": 2593 - }, - { - "completion_length": 96.671875, - "epoch": 0.8266411727214786, - "grad_norm": 26.713693618774414, - "kl": 0.15625, - "learning_rate": 1.7335882727852134e-07, - "loss": 0.0063, - "reward": 1.7793209552764893, - "reward_std": 0.05804768204689026, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5293208956718445, - "step": 2594 - }, - { - "completion_length": 120.890625, - "epoch": 0.8269598470363289, - "grad_norm": 127.09708404541016, - "kl": 0.1533203125, - "learning_rate": 1.7304015296367112e-07, - "loss": 0.0061, - "reward": 1.4177944660186768, - "reward_std": 0.033887144178152084, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41779449582099915, - "step": 2595 - }, - { - "completion_length": 70.734375, - "epoch": 0.8272785213511791, - "grad_norm": 32.772132873535156, - "kl": 0.28125, - "learning_rate": 1.727214786488209e-07, - "loss": 0.0113, - "reward": 1.6892006397247314, - "reward_std": 0.17646805942058563, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4548256993293762, - "step": 2596 - }, - { - "completion_length": 122.96875, - "epoch": 0.8275971956660293, - "grad_norm": 100.04345703125, - "kl": 0.1279296875, - "learning_rate": 1.7240280433397068e-07, - "loss": 0.0051, - "reward": 1.647017478942871, - "reward_std": 0.1251906454563141, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42826738953590393, - "rewards/pad": 0.21875, - "step": 2597 - }, - { - "completion_length": 121.71875, - "epoch": 0.8279158699808795, - "grad_norm": 20.383155822753906, - "kl": 0.1337890625, - "learning_rate": 1.7208413001912046e-07, - "loss": 0.0054, - "reward": 1.6237883567810059, - "reward_std": 0.04436013475060463, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49878835678100586, - "rewards/pad": 0.125, - "step": 2598 - }, - { - "completion_length": 121.375, - "epoch": 0.8282345442957297, - "grad_norm": 48.96121597290039, - "kl": 0.1533203125, - "learning_rate": 1.7176545570427024e-07, - "loss": 0.0061, - "reward": 1.5679914951324463, - "reward_std": 0.056499335914850235, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5679914951324463, - "step": 2599 - }, - { - "completion_length": 120.890625, - "epoch": 0.82855321861058, - "grad_norm": 13.06563663482666, - "kl": 0.126953125, - "learning_rate": 1.7144678138942e-07, - "loss": 0.0051, - "reward": 1.629065990447998, - "reward_std": 0.056700609624385834, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5040660500526428, - "rewards/pad": 0.125, - "step": 2600 - }, - { - "completion_length": 96.28125, - "epoch": 0.8288718929254302, - "grad_norm": 55.01938247680664, - "kl": 0.18359375, - "learning_rate": 1.7112810707456978e-07, - "loss": 0.0073, - "reward": 1.4708433151245117, - "reward_std": 0.07404984533786774, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47084322571754456, - "rewards/pad": 0.0, - "step": 2601 - }, - { - "completion_length": 148.21875, - "epoch": 0.8291905672402804, - "grad_norm": 21.739028930664062, - "kl": 0.1328125, - "learning_rate": 1.7080943275971956e-07, - "loss": 0.0053, - "reward": 1.4217995405197144, - "reward_std": 0.053845398128032684, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2967994809150696, - "step": 2602 - }, - { - "completion_length": 148.625, - "epoch": 0.8295092415551306, - "grad_norm": 24.647369384765625, - "kl": 0.11669921875, - "learning_rate": 1.7049075844486934e-07, - "loss": 0.0047, - "reward": 1.5416154861450195, - "reward_std": 0.07124875485897064, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4166156053543091, - "rewards/pad": 0.125, - "step": 2603 - }, - { - "completion_length": 45.75, - "epoch": 0.8298279158699808, - "grad_norm": 23.10195541381836, - "kl": 0.2001953125, - "learning_rate": 1.701720841300191e-07, - "loss": 0.008, - "reward": 1.5835518836975098, - "reward_std": 0.13451796770095825, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.536676824092865, - "step": 2604 - }, - { - "completion_length": 43.5625, - "epoch": 0.8301465901848311, - "grad_norm": 62.544151306152344, - "kl": 0.232421875, - "learning_rate": 1.6985340981516887e-07, - "loss": 0.0093, - "reward": 1.6618508100509644, - "reward_std": 0.14059233665466309, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6618507504463196, - "rewards/pad": 0.0, - "step": 2605 - }, - { - "completion_length": 122.0625, - "epoch": 0.8304652644996813, - "grad_norm": 14.006827354431152, - "kl": 0.1806640625, - "learning_rate": 1.6953473550031865e-07, - "loss": 0.0072, - "reward": 1.612912893295288, - "reward_std": 0.07815717160701752, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4879128932952881, - "step": 2606 - }, - { - "completion_length": 97.6875, - "epoch": 0.8307839388145315, - "grad_norm": 34.14162826538086, - "kl": 0.1953125, - "learning_rate": 1.6921606118546843e-07, - "loss": 0.0078, - "reward": 1.6610808372497559, - "reward_std": 0.10042650997638702, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5360807776451111, - "step": 2607 - }, - { - "completion_length": 72.125, - "epoch": 0.8311026131293817, - "grad_norm": 22.228124618530273, - "kl": 0.177734375, - "learning_rate": 1.6889738687061821e-07, - "loss": 0.0071, - "reward": 1.5810412168502808, - "reward_std": 0.06514905393123627, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.456041157245636, - "step": 2608 - }, - { - "completion_length": 97.4375, - "epoch": 0.831421287444232, - "grad_norm": 18.93842887878418, - "kl": 0.2578125, - "learning_rate": 1.68578712555768e-07, - "loss": 0.0103, - "reward": 1.6564600467681885, - "reward_std": 0.06771206110715866, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5314601063728333, - "rewards/pad": 0.125, - "step": 2609 - }, - { - "completion_length": 97.578125, - "epoch": 0.8317399617590823, - "grad_norm": 65.8399887084961, - "kl": 0.11083984375, - "learning_rate": 1.6826003824091777e-07, - "loss": 0.0044, - "reward": 1.4755096435546875, - "reward_std": 0.10804099589586258, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41300976276397705, - "step": 2610 - }, - { - "completion_length": 147.71875, - "epoch": 0.8320586360739325, - "grad_norm": 17.270606994628906, - "kl": 0.1142578125, - "learning_rate": 1.6794136392606755e-07, - "loss": 0.0046, - "reward": 1.5525813102722168, - "reward_std": 0.09812011569738388, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.44320622086524963, - "step": 2611 - }, - { - "completion_length": 172.453125, - "epoch": 0.8323773103887827, - "grad_norm": 33.714534759521484, - "kl": 0.32421875, - "learning_rate": 1.6762268961121734e-07, - "loss": 0.0129, - "reward": 1.5914037227630615, - "reward_std": 0.05641365051269531, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.591403603553772, - "rewards/pad": 0.0, - "step": 2612 - }, - { - "completion_length": 18.46875, - "epoch": 0.8326959847036329, - "grad_norm": 114.2525634765625, - "kl": 0.1689453125, - "learning_rate": 1.6730401529636712e-07, - "loss": 0.0068, - "reward": 1.8317246437072754, - "reward_std": 0.05801478773355484, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.7067245841026306, - "step": 2613 - }, - { - "completion_length": 97.09375, - "epoch": 0.8330146590184832, - "grad_norm": 38.123756408691406, - "kl": 0.21484375, - "learning_rate": 1.6698534098151687e-07, - "loss": 0.0086, - "reward": 1.662263035774231, - "reward_std": 0.1684742569923401, - "rewards/answer_reward": 0.078125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.584138035774231, - "step": 2614 - }, - { - "completion_length": 122.40625, - "epoch": 0.8333333333333334, - "grad_norm": 252.03700256347656, - "kl": 0.1318359375, - "learning_rate": 1.6666666666666665e-07, - "loss": 0.0053, - "reward": 1.4378101825714111, - "reward_std": 0.07450634986162186, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4378102719783783, - "step": 2615 - }, - { - "completion_length": 97.0625, - "epoch": 0.8336520076481836, - "grad_norm": 26.747224807739258, - "kl": 0.1328125, - "learning_rate": 1.6634799235181643e-07, - "loss": 0.0053, - "reward": 1.5330252647399902, - "reward_std": 0.055772747844457626, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.40802526473999023, - "step": 2616 - }, - { - "completion_length": 172.21875, - "epoch": 0.8339706819630338, - "grad_norm": 28.24957275390625, - "kl": 0.1044921875, - "learning_rate": 1.660293180369662e-07, - "loss": 0.0042, - "reward": 1.4632399082183838, - "reward_std": 0.04309063404798508, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46323996782302856, - "rewards/pad": 0.0, - "step": 2617 - }, - { - "completion_length": 68.421875, - "epoch": 0.834289356277884, - "grad_norm": 39.480796813964844, - "kl": 0.1953125, - "learning_rate": 1.65710643722116e-07, - "loss": 0.0078, - "reward": 1.8473598957061768, - "reward_std": 0.060732681304216385, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7223598957061768, - "rewards/pad": 0.125, - "step": 2618 - }, - { - "completion_length": 44.375, - "epoch": 0.8346080305927343, - "grad_norm": 25.87619972229004, - "kl": 0.1689453125, - "learning_rate": 1.6539196940726577e-07, - "loss": 0.0068, - "reward": 1.4389697313308716, - "reward_std": 0.056923579424619675, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43896961212158203, - "rewards/pad": 0.0, - "step": 2619 - }, - { - "completion_length": 122.265625, - "epoch": 0.8349267049075845, - "grad_norm": 44.532230377197266, - "kl": 0.1015625, - "learning_rate": 1.6507329509241555e-07, - "loss": 0.0041, - "reward": 1.5837821960449219, - "reward_std": 0.0566696859896183, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4587821066379547, - "rewards/pad": 0.125, - "step": 2620 - }, - { - "completion_length": 91.1875, - "epoch": 0.8352453792224347, - "grad_norm": 44.035194396972656, - "kl": 0.158203125, - "learning_rate": 1.6475462077756533e-07, - "loss": 0.0063, - "reward": 1.5844500064849854, - "reward_std": 0.06715071201324463, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5844499468803406, - "rewards/pad": 0.0, - "step": 2621 - }, - { - "completion_length": 43.6875, - "epoch": 0.8355640535372849, - "grad_norm": 51.73539733886719, - "kl": 0.1650390625, - "learning_rate": 1.6443594646271511e-07, - "loss": 0.0066, - "reward": 1.6584193706512451, - "reward_std": 0.07346348464488983, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6584193706512451, - "rewards/pad": 0.0, - "step": 2622 - }, - { - "completion_length": 124.1875, - "epoch": 0.8358827278521351, - "grad_norm": 128.14300537109375, - "kl": 0.10400390625, - "learning_rate": 1.641172721478649e-07, - "loss": 0.0042, - "reward": 1.4553518295288086, - "reward_std": 0.04990885406732559, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.33035188913345337, - "step": 2623 - }, - { - "completion_length": 70.96875, - "epoch": 0.8362014021669854, - "grad_norm": 43.594383239746094, - "kl": 0.21484375, - "learning_rate": 1.6379859783301465e-07, - "loss": 0.0086, - "reward": 1.6906349658966064, - "reward_std": 0.06027328222990036, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5656350255012512, - "rewards/pad": 0.125, - "step": 2624 - }, - { - "completion_length": 95.84375, - "epoch": 0.8365200764818356, - "grad_norm": 19.044055938720703, - "kl": 0.1416015625, - "learning_rate": 1.6347992351816443e-07, - "loss": 0.0057, - "reward": 1.478894591331482, - "reward_std": 0.07891231775283813, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.47889453172683716, - "rewards/pad": 0.0, - "step": 2625 - }, - { - "completion_length": 124.5, - "epoch": 0.8368387507966858, - "grad_norm": 44.692325592041016, - "kl": 0.09765625, - "learning_rate": 1.631612492033142e-07, - "loss": 0.0039, - "reward": 1.757611632347107, - "reward_std": 0.07381190359592438, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5076116323471069, - "rewards/pad": 0.25, - "step": 2626 - }, - { - "completion_length": 122.0625, - "epoch": 0.837157425111536, - "grad_norm": 18.987163543701172, - "kl": 0.10986328125, - "learning_rate": 1.62842574888464e-07, - "loss": 0.0044, - "reward": 1.2537281513214111, - "reward_std": 0.04036957025527954, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.2537282109260559, - "step": 2627 - }, - { - "completion_length": 146.53125, - "epoch": 0.8374760994263862, - "grad_norm": 20.48886489868164, - "kl": 0.2275390625, - "learning_rate": 1.6252390057361377e-07, - "loss": 0.0091, - "reward": 1.4572899341583252, - "reward_std": 0.0421544574201107, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4572899639606476, - "rewards/pad": 0.0, - "step": 2628 - }, - { - "completion_length": 46.609375, - "epoch": 0.8377947737412365, - "grad_norm": 37.17943572998047, - "kl": 0.197265625, - "learning_rate": 1.6220522625876355e-07, - "loss": 0.0079, - "reward": 1.8263375759124756, - "reward_std": 0.043008752167224884, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5763376951217651, - "step": 2629 - }, - { - "completion_length": 121.6875, - "epoch": 0.8381134480560867, - "grad_norm": 75.12956237792969, - "kl": 0.103515625, - "learning_rate": 1.6188655194391333e-07, - "loss": 0.0042, - "reward": 1.5346864461898804, - "reward_std": 0.07906049489974976, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5346865057945251, - "step": 2630 - }, - { - "completion_length": 122.40625, - "epoch": 0.8384321223709369, - "grad_norm": 50.21953201293945, - "kl": 0.12060546875, - "learning_rate": 1.6156787762906309e-07, - "loss": 0.0048, - "reward": 1.5212500095367432, - "reward_std": 0.061381660401821136, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.396249920129776, - "step": 2631 - }, - { - "completion_length": 98.5, - "epoch": 0.8387507966857871, - "grad_norm": 24.53875732421875, - "kl": 0.11767578125, - "learning_rate": 1.6124920331421287e-07, - "loss": 0.0047, - "reward": 1.530691385269165, - "reward_std": 0.029347026720643044, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4056914448738098, - "step": 2632 - }, - { - "completion_length": 121.625, - "epoch": 0.8390694710006373, - "grad_norm": 51.883243560791016, - "kl": 0.189453125, - "learning_rate": 1.6093052899936262e-07, - "loss": 0.0076, - "reward": 1.6813349723815918, - "reward_std": 0.0806775689125061, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5563348531723022, - "step": 2633 - }, - { - "completion_length": 70.46875, - "epoch": 0.8393881453154876, - "grad_norm": 16.21527862548828, - "kl": 0.2099609375, - "learning_rate": 1.606118546845124e-07, - "loss": 0.0084, - "reward": 1.8099238872528076, - "reward_std": 0.12390229851007462, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7317988276481628, - "rewards/pad": 0.078125, - "step": 2634 - }, - { - "completion_length": 123.828125, - "epoch": 0.8397068196303378, - "grad_norm": 32.55116653442383, - "kl": 0.1640625, - "learning_rate": 1.6029318036966218e-07, - "loss": 0.0066, - "reward": 1.345779538154602, - "reward_std": 0.1039619892835617, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.36140456795692444, - "step": 2635 - }, - { - "completion_length": 122.796875, - "epoch": 0.840025493945188, - "grad_norm": 44.53833770751953, - "kl": 0.12109375, - "learning_rate": 1.5997450605481196e-07, - "loss": 0.0048, - "reward": 1.5953428745269775, - "reward_std": 0.031780026853084564, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.47034284472465515, - "step": 2636 - }, - { - "completion_length": 122.171875, - "epoch": 0.8403441682600382, - "grad_norm": 18.348587036132812, - "kl": 0.1416015625, - "learning_rate": 1.5965583173996174e-07, - "loss": 0.0057, - "reward": 1.5309064388275146, - "reward_std": 0.07383998483419418, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40590640902519226, - "rewards/pad": 0.125, - "step": 2637 - }, - { - "completion_length": 73.125, - "epoch": 0.8406628425748884, - "grad_norm": 40.64803695678711, - "kl": 0.15625, - "learning_rate": 1.5933715742511152e-07, - "loss": 0.0063, - "reward": 1.7465347051620483, - "reward_std": 0.14019320905208588, - "rewards/pad": 0.21875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5277846455574036, - "step": 2638 - }, - { - "completion_length": 98.4375, - "epoch": 0.8409815168897387, - "grad_norm": 25.688995361328125, - "kl": 0.1103515625, - "learning_rate": 1.590184831102613e-07, - "loss": 0.0044, - "reward": 1.6058697700500488, - "reward_std": 0.054219409823417664, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4808697998523712, - "rewards/pad": 0.125, - "step": 2639 - }, - { - "completion_length": 95.125, - "epoch": 0.8413001912045889, - "grad_norm": 127.85604095458984, - "kl": 0.1455078125, - "learning_rate": 1.5869980879541108e-07, - "loss": 0.0058, - "reward": 1.6282551288604736, - "reward_std": 0.05407290905714035, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6282550096511841, - "rewards/pad": 0.0, - "step": 2640 - }, - { - "completion_length": 145.90625, - "epoch": 0.8416188655194391, - "grad_norm": 61.949310302734375, - "kl": 0.1357421875, - "learning_rate": 1.5838113448056086e-07, - "loss": 0.0054, - "reward": 1.39642333984375, - "reward_std": 0.044233374297618866, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3964233100414276, - "step": 2641 - }, - { - "completion_length": 70.25, - "epoch": 0.8419375398342893, - "grad_norm": 65.2403564453125, - "kl": 0.1767578125, - "learning_rate": 1.5806246016571064e-07, - "loss": 0.0071, - "reward": 1.5552221536636353, - "reward_std": 0.07399885356426239, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4302220940589905, - "rewards/pad": 0.125, - "step": 2642 - }, - { - "completion_length": 97.3125, - "epoch": 0.8422562141491395, - "grad_norm": 14.721014022827148, - "kl": 0.2265625, - "learning_rate": 1.577437858508604e-07, - "loss": 0.0091, - "reward": 1.5365911722183228, - "reward_std": 0.0709502175450325, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5365911722183228, - "rewards/pad": 0.0, - "step": 2643 - }, - { - "completion_length": 70.796875, - "epoch": 0.8425748884639898, - "grad_norm": 31.82359504699707, - "kl": 0.2431640625, - "learning_rate": 1.5742511153601018e-07, - "loss": 0.0097, - "reward": 1.594054937362671, - "reward_std": 0.08968573063611984, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4690549075603485, - "step": 2644 - }, - { - "completion_length": 148.984375, - "epoch": 0.84289356277884, - "grad_norm": 11.718220710754395, - "kl": 0.1025390625, - "learning_rate": 1.5710643722115996e-07, - "loss": 0.0041, - "reward": 1.7030255794525146, - "reward_std": 0.05551174283027649, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4530256390571594, - "step": 2645 - }, - { - "completion_length": 120.8125, - "epoch": 0.8432122370936902, - "grad_norm": 38.980865478515625, - "kl": 0.166015625, - "learning_rate": 1.5678776290630974e-07, - "loss": 0.0066, - "reward": 1.4111497402191162, - "reward_std": 0.09590955078601837, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4111497700214386, - "step": 2646 - }, - { - "completion_length": 96.40625, - "epoch": 0.8435309114085404, - "grad_norm": 18.516830444335938, - "kl": 0.193359375, - "learning_rate": 1.5646908859145952e-07, - "loss": 0.0077, - "reward": 1.4889684915542603, - "reward_std": 0.07828620076179504, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.48896846175193787, - "step": 2647 - }, - { - "completion_length": 72.046875, - "epoch": 0.8438495857233907, - "grad_norm": 79.8927993774414, - "kl": 0.193359375, - "learning_rate": 1.561504142766093e-07, - "loss": 0.0077, - "reward": 1.7945959568023682, - "reward_std": 0.05644000321626663, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5445958971977234, - "rewards/pad": 0.25, - "step": 2648 - }, - { - "completion_length": 69.15625, - "epoch": 0.844168260038241, - "grad_norm": 80.28946685791016, - "kl": 0.216796875, - "learning_rate": 1.5583173996175908e-07, - "loss": 0.0087, - "reward": 1.436242699623108, - "reward_std": 0.042065173387527466, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4362426996231079, - "step": 2649 - }, - { - "completion_length": 154.953125, - "epoch": 0.8444869343530912, - "grad_norm": 14.139089584350586, - "kl": 0.083984375, - "learning_rate": 1.5551306564690886e-07, - "loss": 0.0034, - "reward": 1.463057041168213, - "reward_std": 0.051803916692733765, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4630569815635681, - "rewards/pad": 0.0, - "step": 2650 - }, - { - "completion_length": 96.765625, - "epoch": 0.8448056086679414, - "grad_norm": 138.07135009765625, - "kl": 0.130859375, - "learning_rate": 1.5519439133205864e-07, - "loss": 0.0052, - "reward": 1.5896549224853516, - "reward_std": 0.061335496604442596, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5896549224853516, - "step": 2651 - }, - { - "completion_length": 43.84375, - "epoch": 0.8451242829827916, - "grad_norm": 40.37471389770508, - "kl": 0.1953125, - "learning_rate": 1.5487571701720842e-07, - "loss": 0.0078, - "reward": 1.456931710243225, - "reward_std": 0.07474370300769806, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4569316804409027, - "rewards/pad": 0.0, - "step": 2652 - }, - { - "completion_length": 97.296875, - "epoch": 0.8454429572976419, - "grad_norm": 19.235700607299805, - "kl": 0.166015625, - "learning_rate": 1.5455704270235818e-07, - "loss": 0.0066, - "reward": 1.8206788301467896, - "reward_std": 0.08305393159389496, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5706788301467896, - "step": 2653 - }, - { - "completion_length": 121.015625, - "epoch": 0.8457616316124921, - "grad_norm": 15.048810005187988, - "kl": 0.1162109375, - "learning_rate": 1.5423836838750796e-07, - "loss": 0.0046, - "reward": 1.5825250148773193, - "reward_std": 0.047882575541734695, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5825249552726746, - "rewards/pad": 0.0, - "step": 2654 - }, - { - "completion_length": 121.4375, - "epoch": 0.8460803059273423, - "grad_norm": 20.13825798034668, - "kl": 0.1201171875, - "learning_rate": 1.5391969407265774e-07, - "loss": 0.0048, - "reward": 1.6793920993804932, - "reward_std": 0.044683486223220825, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5543920397758484, - "step": 2655 - }, - { - "completion_length": 72.1875, - "epoch": 0.8463989802421925, - "grad_norm": 31.22608184814453, - "kl": 0.1669921875, - "learning_rate": 1.5360101975780752e-07, - "loss": 0.0067, - "reward": 1.6091265678405762, - "reward_std": 0.05492344871163368, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48412656784057617, - "rewards/pad": 0.125, - "step": 2656 - }, - { - "completion_length": 150.453125, - "epoch": 0.8467176545570427, - "grad_norm": 10.927957534790039, - "kl": 0.09521484375, - "learning_rate": 1.532823454429573e-07, - "loss": 0.0038, - "reward": 1.5135812759399414, - "reward_std": 0.05462616682052612, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5135812759399414, - "step": 2657 - }, - { - "completion_length": 43.546875, - "epoch": 0.847036328871893, - "grad_norm": 62.26527404785156, - "kl": 0.259765625, - "learning_rate": 1.5296367112810708e-07, - "loss": 0.0104, - "reward": 1.752439260482788, - "reward_std": 0.09079361706972122, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7524390816688538, - "rewards/pad": 0.0, - "step": 2658 - }, - { - "completion_length": 45.984375, - "epoch": 0.8473550031867432, - "grad_norm": 26.958433151245117, - "kl": 0.251953125, - "learning_rate": 1.5264499681325686e-07, - "loss": 0.0101, - "reward": 1.6483477354049683, - "reward_std": 0.06506000459194183, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5233477354049683, - "rewards/pad": 0.125, - "step": 2659 - }, - { - "completion_length": 173.703125, - "epoch": 0.8476736775015934, - "grad_norm": 40.21539306640625, - "kl": 0.0908203125, - "learning_rate": 1.5232632249840664e-07, - "loss": 0.0036, - "reward": 1.623093843460083, - "reward_std": 0.03879357501864433, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4980939030647278, - "step": 2660 - }, - { - "completion_length": 119.78125, - "epoch": 0.8479923518164436, - "grad_norm": 29.048297882080078, - "kl": 0.1689453125, - "learning_rate": 1.5200764818355642e-07, - "loss": 0.0068, - "reward": 1.4315879344940186, - "reward_std": 0.08094234764575958, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4315878748893738, - "rewards/pad": 0.0, - "step": 2661 - }, - { - "completion_length": 96.78125, - "epoch": 0.8483110261312938, - "grad_norm": 18.212772369384766, - "kl": 0.1005859375, - "learning_rate": 1.5168897386870618e-07, - "loss": 0.004, - "reward": 1.8102524280548096, - "reward_std": 0.1550176441669464, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5915024876594543, - "rewards/pad": 0.234375, - "step": 2662 - }, - { - "completion_length": 148.796875, - "epoch": 0.848629700446144, - "grad_norm": 13.177532196044922, - "kl": 0.1259765625, - "learning_rate": 1.5137029955385593e-07, - "loss": 0.005, - "reward": 1.6542266607284546, - "reward_std": 0.07177316397428513, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5292267203330994, - "rewards/pad": 0.125, - "step": 2663 - }, - { - "completion_length": 123.6875, - "epoch": 0.8489483747609943, - "grad_norm": 68.9928207397461, - "kl": 0.10009765625, - "learning_rate": 1.510516252390057e-07, - "loss": 0.004, - "reward": 1.6512646675109863, - "reward_std": 0.027718177065253258, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5262647271156311, - "step": 2664 - }, - { - "completion_length": 121.484375, - "epoch": 0.8492670490758445, - "grad_norm": 26.726791381835938, - "kl": 0.1796875, - "learning_rate": 1.507329509241555e-07, - "loss": 0.0072, - "reward": 1.5700151920318604, - "reward_std": 0.06693685054779053, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5700151920318604, - "rewards/pad": 0.0, - "step": 2665 - }, - { - "completion_length": 123.0625, - "epoch": 0.8495857233906947, - "grad_norm": 16.982797622680664, - "kl": 0.1591796875, - "learning_rate": 1.5041427660930527e-07, - "loss": 0.0064, - "reward": 1.5845251083374023, - "reward_std": 0.054715149104595184, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5845251083374023, - "rewards/pad": 0.0, - "step": 2666 - }, - { - "completion_length": 120.421875, - "epoch": 0.8499043977055449, - "grad_norm": 47.70309066772461, - "kl": 0.1435546875, - "learning_rate": 1.5009560229445505e-07, - "loss": 0.0057, - "reward": 1.520828366279602, - "reward_std": 0.1224186047911644, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.536453366279602, - "rewards/pad": 0.0, - "step": 2667 - }, - { - "completion_length": 148.5625, - "epoch": 0.8502230720203952, - "grad_norm": 14.824073791503906, - "kl": 0.10400390625, - "learning_rate": 1.4977692797960483e-07, - "loss": 0.0042, - "reward": 1.5155977010726929, - "reward_std": 0.030607666820287704, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5155977010726929, - "step": 2668 - }, - { - "completion_length": 97.890625, - "epoch": 0.8505417463352454, - "grad_norm": 43.689300537109375, - "kl": 0.1767578125, - "learning_rate": 1.494582536647546e-07, - "loss": 0.0071, - "reward": 1.589605450630188, - "reward_std": 0.07949741184711456, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.3552304804325104, - "step": 2669 - }, - { - "completion_length": 121.796875, - "epoch": 0.8508604206500956, - "grad_norm": 24.583927154541016, - "kl": 0.0751953125, - "learning_rate": 1.491395793499044e-07, - "loss": 0.003, - "reward": 1.4888324737548828, - "reward_std": 0.1194739043712616, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4263325333595276, - "step": 2670 - }, - { - "completion_length": 175.15625, - "epoch": 0.8511790949649458, - "grad_norm": 17.891311645507812, - "kl": 0.1005859375, - "learning_rate": 1.4882090503505417e-07, - "loss": 0.004, - "reward": 1.6280725002288818, - "reward_std": 0.04061845690011978, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5030724406242371, - "step": 2671 - }, - { - "completion_length": 96.265625, - "epoch": 0.851497769279796, - "grad_norm": 104.97727966308594, - "kl": 0.1259765625, - "learning_rate": 1.4850223072020395e-07, - "loss": 0.005, - "reward": 1.5070886611938477, - "reward_std": 0.051849115639925, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5070886611938477, - "step": 2672 - }, - { - "completion_length": 97.078125, - "epoch": 0.8518164435946463, - "grad_norm": 57.460472106933594, - "kl": 0.1572265625, - "learning_rate": 1.481835564053537e-07, - "loss": 0.0063, - "reward": 1.5827641487121582, - "reward_std": 0.15324586629867554, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.48901426792144775, - "step": 2673 - }, - { - "completion_length": 145.96875, - "epoch": 0.8521351179094965, - "grad_norm": 22.31479263305664, - "kl": 0.107421875, - "learning_rate": 1.478648820905035e-07, - "loss": 0.0043, - "reward": 1.520214557647705, - "reward_std": 0.05476115643978119, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5202144980430603, - "rewards/pad": 0.0, - "step": 2674 - }, - { - "completion_length": 150.171875, - "epoch": 0.8524537922243467, - "grad_norm": 59.294612884521484, - "kl": 0.0966796875, - "learning_rate": 1.4754620777565327e-07, - "loss": 0.0039, - "reward": 1.6401712894439697, - "reward_std": 0.05010083317756653, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39017125964164734, - "step": 2675 - }, - { - "completion_length": 71.28125, - "epoch": 0.8527724665391969, - "grad_norm": 50.30670166015625, - "kl": 0.11669921875, - "learning_rate": 1.4722753346080305e-07, - "loss": 0.0047, - "reward": 1.7515373229980469, - "reward_std": 0.0303016509860754, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6265373229980469, - "rewards/pad": 0.125, - "step": 2676 - }, - { - "completion_length": 118.40625, - "epoch": 0.8530911408540471, - "grad_norm": 16.006433486938477, - "kl": 0.11962890625, - "learning_rate": 1.4690885914595283e-07, - "loss": 0.0048, - "reward": 1.3631829023361206, - "reward_std": 0.031119568273425102, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.36318284273147583, - "rewards/pad": 0.0, - "step": 2677 - }, - { - "completion_length": 121.171875, - "epoch": 0.8534098151688974, - "grad_norm": 29.49525260925293, - "kl": 0.1162109375, - "learning_rate": 1.465901848311026e-07, - "loss": 0.0046, - "reward": 1.6165872812271118, - "reward_std": 0.04108864441514015, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4915872812271118, - "rewards/pad": 0.125, - "step": 2678 - }, - { - "completion_length": 69.75, - "epoch": 0.8537284894837476, - "grad_norm": 32.46714782714844, - "kl": 0.126953125, - "learning_rate": 1.462715105162524e-07, - "loss": 0.0051, - "reward": 1.6220935583114624, - "reward_std": 0.04669243097305298, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6220934391021729, - "rewards/pad": 0.0, - "step": 2679 - }, - { - "completion_length": 100.8125, - "epoch": 0.8540471637985978, - "grad_norm": 157.86827087402344, - "kl": 0.1259765625, - "learning_rate": 1.4595283620140217e-07, - "loss": 0.005, - "reward": 1.5971333980560303, - "reward_std": 0.1475202441215515, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.96875, - "rewards/iou_glue_reward": 0.5033833980560303, - "step": 2680 - }, - { - "completion_length": 71.59375, - "epoch": 0.854365838113448, - "grad_norm": 23.01113510131836, - "kl": 0.2734375, - "learning_rate": 1.4563416188655195e-07, - "loss": 0.011, - "reward": 1.652078628540039, - "reward_std": 0.11400812119245529, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.6677037477493286, - "rewards/pad": 0.0, - "step": 2681 - }, - { - "completion_length": 148.6875, - "epoch": 0.8546845124282982, - "grad_norm": 15.51531982421875, - "kl": 0.0771484375, - "learning_rate": 1.4531548757170173e-07, - "loss": 0.0031, - "reward": 1.4181718826293945, - "reward_std": 0.09832640737295151, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4337969124317169, - "rewards/pad": 0.0, - "step": 2682 - }, - { - "completion_length": 70.0, - "epoch": 0.8550031867431485, - "grad_norm": 60.99382400512695, - "kl": 0.185546875, - "learning_rate": 1.4499681325685149e-07, - "loss": 0.0074, - "reward": 1.5028945207595825, - "reward_std": 0.06972403824329376, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5028945207595825, - "rewards/pad": 0.0, - "step": 2683 - }, - { - "completion_length": 69.15625, - "epoch": 0.8553218610579987, - "grad_norm": 23.123205184936523, - "kl": 0.18359375, - "learning_rate": 1.4467813894200127e-07, - "loss": 0.0073, - "reward": 1.616814374923706, - "reward_std": 0.09260426461696625, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6168143153190613, - "rewards/pad": 0.0, - "step": 2684 - }, - { - "completion_length": 119.953125, - "epoch": 0.8556405353728489, - "grad_norm": 22.664691925048828, - "kl": 0.1474609375, - "learning_rate": 1.4435946462715105e-07, - "loss": 0.0059, - "reward": 1.5155465602874756, - "reward_std": 0.08761819452047348, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40617164969444275, - "step": 2685 - }, - { - "completion_length": 70.9375, - "epoch": 0.8559592096876991, - "grad_norm": 37.55588912963867, - "kl": 0.142578125, - "learning_rate": 1.4404079031230083e-07, - "loss": 0.0057, - "reward": 1.6823314428329468, - "reward_std": 0.07761171460151672, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5573315620422363, - "rewards/pad": 0.125, - "step": 2686 - }, - { - "completion_length": 70.640625, - "epoch": 0.8562778840025494, - "grad_norm": 34.67982864379883, - "kl": 0.1923828125, - "learning_rate": 1.437221159974506e-07, - "loss": 0.0077, - "reward": 1.8627105951309204, - "reward_std": 0.09182138741016388, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6127105355262756, - "step": 2687 - }, - { - "completion_length": 96.984375, - "epoch": 0.8565965583173997, - "grad_norm": 32.860870361328125, - "kl": 0.1533203125, - "learning_rate": 1.434034416826004e-07, - "loss": 0.0061, - "reward": 1.5270265340805054, - "reward_std": 0.09325046837329865, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5270265340805054, - "rewards/pad": 0.0, - "step": 2688 - }, - { - "completion_length": 67.90625, - "epoch": 0.8569152326322499, - "grad_norm": 19.75588035583496, - "kl": 0.1416015625, - "learning_rate": 1.4308476736775017e-07, - "loss": 0.0057, - "reward": 1.7512493133544922, - "reward_std": 0.11892116069793701, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.6418742537498474, - "step": 2689 - }, - { - "completion_length": 95.5, - "epoch": 0.8572339069471001, - "grad_norm": 20.87849235534668, - "kl": 0.302734375, - "learning_rate": 1.4276609305289995e-07, - "loss": 0.0121, - "reward": 1.7276328802108765, - "reward_std": 0.09207872301340103, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.7276328802108765, - "step": 2690 - }, - { - "completion_length": 122.5625, - "epoch": 0.8575525812619503, - "grad_norm": 20.82269287109375, - "kl": 0.1455078125, - "learning_rate": 1.4244741873804973e-07, - "loss": 0.0058, - "reward": 1.6641173362731934, - "reward_std": 0.11703188717365265, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5547422766685486, - "step": 2691 - }, - { - "completion_length": 97.1875, - "epoch": 0.8578712555768006, - "grad_norm": 20.181957244873047, - "kl": 0.12353515625, - "learning_rate": 1.4212874442319946e-07, - "loss": 0.0049, - "reward": 1.6254770755767822, - "reward_std": 0.09635715186595917, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5317270755767822, - "rewards/pad": 0.09375, - "step": 2692 - }, - { - "completion_length": 121.953125, - "epoch": 0.8581899298916508, - "grad_norm": 11.201703071594238, - "kl": 0.2138671875, - "learning_rate": 1.4181007010834924e-07, - "loss": 0.0086, - "reward": 1.5552585124969482, - "reward_std": 0.078560471534729, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5552584528923035, - "rewards/pad": 0.0, - "step": 2693 - }, - { - "completion_length": 96.765625, - "epoch": 0.858508604206501, - "grad_norm": 62.409873962402344, - "kl": 0.1298828125, - "learning_rate": 1.4149139579349902e-07, - "loss": 0.0052, - "reward": 1.5960705280303955, - "reward_std": 0.04456819221377373, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.596070408821106, - "step": 2694 - }, - { - "completion_length": 120.625, - "epoch": 0.8588272785213512, - "grad_norm": 36.87535095214844, - "kl": 0.1005859375, - "learning_rate": 1.411727214786488e-07, - "loss": 0.004, - "reward": 1.4737019538879395, - "reward_std": 0.0814041867852211, - "rewards/pad": 0.15625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.317452073097229, - "step": 2695 - }, - { - "completion_length": 70.953125, - "epoch": 0.8591459528362014, - "grad_norm": 28.575361251831055, - "kl": 0.203125, - "learning_rate": 1.4085404716379858e-07, - "loss": 0.0081, - "reward": 1.6485555171966553, - "reward_std": 0.09977797418832779, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5235554575920105, - "rewards/pad": 0.125, - "step": 2696 - }, - { - "completion_length": 71.703125, - "epoch": 0.8594646271510517, - "grad_norm": 59.23045349121094, - "kl": 0.1767578125, - "learning_rate": 1.4053537284894836e-07, - "loss": 0.0071, - "reward": 1.758746862411499, - "reward_std": 0.058865562081336975, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.633746862411499, - "rewards/pad": 0.125, - "step": 2697 - }, - { - "completion_length": 97.0625, - "epoch": 0.8597833014659019, - "grad_norm": 22.85202407836914, - "kl": 0.1259765625, - "learning_rate": 1.4021669853409814e-07, - "loss": 0.005, - "reward": 1.8228514194488525, - "reward_std": 0.13548430800437927, - "rewards/pad": 0.1875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6353514790534973, - "step": 2698 - }, - { - "completion_length": 68.171875, - "epoch": 0.8601019757807521, - "grad_norm": 25.938024520874023, - "kl": 0.1591796875, - "learning_rate": 1.3989802421924792e-07, - "loss": 0.0064, - "reward": 1.4957406520843506, - "reward_std": 0.07756452262401581, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4957405626773834, - "rewards/pad": 0.0, - "step": 2699 - }, - { - "completion_length": 123.5625, - "epoch": 0.8604206500956023, - "grad_norm": 91.65367126464844, - "kl": 0.12060546875, - "learning_rate": 1.395793499043977e-07, - "loss": 0.0048, - "reward": 1.576904535293579, - "reward_std": 0.05569456145167351, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4519045352935791, - "rewards/pad": 0.125, - "step": 2700 - }, - { - "completion_length": 123.03125, - "epoch": 0.8607393244104525, - "grad_norm": 13.199487686157227, - "kl": 0.13671875, - "learning_rate": 1.3926067558954748e-07, - "loss": 0.0055, - "reward": 1.589841365814209, - "reward_std": 0.07075409591197968, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.46484124660491943, - "step": 2701 - }, - { - "completion_length": 97.671875, - "epoch": 0.8610579987253028, - "grad_norm": 66.15384674072266, - "kl": 0.142578125, - "learning_rate": 1.3894200127469724e-07, - "loss": 0.0057, - "reward": 1.5517024993896484, - "reward_std": 0.0735410824418068, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42670249938964844, - "rewards/pad": 0.125, - "step": 2702 - }, - { - "completion_length": 95.3125, - "epoch": 0.861376673040153, - "grad_norm": 35.88011169433594, - "kl": 0.1572265625, - "learning_rate": 1.3862332695984702e-07, - "loss": 0.0063, - "reward": 1.6739253997802734, - "reward_std": 0.04082561284303665, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5489252805709839, - "step": 2703 - }, - { - "completion_length": 44.0, - "epoch": 0.8616953473550032, - "grad_norm": 41.33247375488281, - "kl": 0.5, - "learning_rate": 1.383046526449968e-07, - "loss": 0.02, - "reward": 1.6581693887710571, - "reward_std": 0.07377001643180847, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6581693887710571, - "rewards/pad": 0.0, - "step": 2704 - }, - { - "completion_length": 96.625, - "epoch": 0.8620140216698534, - "grad_norm": 30.842103958129883, - "kl": 0.158203125, - "learning_rate": 1.3798597833014658e-07, - "loss": 0.0063, - "reward": 1.63382089138031, - "reward_std": 0.05160008370876312, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6338208913803101, - "rewards/pad": 0.0, - "step": 2705 - }, - { - "completion_length": 44.65625, - "epoch": 0.8623326959847036, - "grad_norm": 78.3980484008789, - "kl": 0.1630859375, - "learning_rate": 1.3766730401529636e-07, - "loss": 0.0065, - "reward": 1.4698817729949951, - "reward_std": 0.06917546689510345, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4698818325996399, - "rewards/pad": 0.0, - "step": 2706 - }, - { - "completion_length": 96.78125, - "epoch": 0.8626513702995539, - "grad_norm": 33.21269607543945, - "kl": 0.1279296875, - "learning_rate": 1.3734862970044614e-07, - "loss": 0.0051, - "reward": 1.4502224922180176, - "reward_std": 0.10222014784812927, - "rewards/pad": 0.09375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3564724624156952, - "step": 2707 - }, - { - "completion_length": 70.390625, - "epoch": 0.8629700446144041, - "grad_norm": 57.733829498291016, - "kl": 0.142578125, - "learning_rate": 1.3702995538559592e-07, - "loss": 0.0057, - "reward": 1.687762975692749, - "reward_std": 0.07852350175380707, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.562762975692749, - "step": 2708 - }, - { - "completion_length": 123.21875, - "epoch": 0.8632887189292543, - "grad_norm": 20.890010833740234, - "kl": 0.119140625, - "learning_rate": 1.367112810707457e-07, - "loss": 0.0048, - "reward": 1.291185975074768, - "reward_std": 0.07491404563188553, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.18181096017360687, - "step": 2709 - }, - { - "completion_length": 73.375, - "epoch": 0.8636073932441045, - "grad_norm": 28.094013214111328, - "kl": 0.30859375, - "learning_rate": 1.3639260675589548e-07, - "loss": 0.0123, - "reward": 1.773735761642456, - "reward_std": 0.05704750865697861, - "rewards/answer_reward": 0.375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.39873576164245605, - "step": 2710 - }, - { - "completion_length": 118.875, - "epoch": 0.8639260675589547, - "grad_norm": 68.48311614990234, - "kl": 0.11328125, - "learning_rate": 1.3607393244104526e-07, - "loss": 0.0045, - "reward": 1.636932134628296, - "reward_std": 0.03979235142469406, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5119320750236511, - "step": 2711 - }, - { - "completion_length": 44.6875, - "epoch": 0.864244741873805, - "grad_norm": 74.5843276977539, - "kl": 0.1767578125, - "learning_rate": 1.3575525812619501e-07, - "loss": 0.0071, - "reward": 1.6634106636047363, - "reward_std": 0.09597359597682953, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42903560400009155, - "rewards/pad": 0.234375, - "step": 2712 - }, - { - "completion_length": 96.3125, - "epoch": 0.8645634161886552, - "grad_norm": 95.82728576660156, - "kl": 0.185546875, - "learning_rate": 1.354365838113448e-07, - "loss": 0.0074, - "reward": 1.50628662109375, - "reward_std": 0.06524436175823212, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5062866806983948, - "rewards/pad": 0.0, - "step": 2713 - }, - { - "completion_length": 143.40625, - "epoch": 0.8648820905035054, - "grad_norm": 12.538618087768555, - "kl": 0.125, - "learning_rate": 1.3511790949649458e-07, - "loss": 0.005, - "reward": 1.5285263061523438, - "reward_std": 0.02601650357246399, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5285263061523438, - "rewards/pad": 0.0, - "step": 2714 - }, - { - "completion_length": 69.453125, - "epoch": 0.8652007648183556, - "grad_norm": 65.76648712158203, - "kl": 0.1396484375, - "learning_rate": 1.3479923518164436e-07, - "loss": 0.0056, - "reward": 1.654858946800232, - "reward_std": 0.04377948120236397, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6548588871955872, - "step": 2715 - }, - { - "completion_length": 97.609375, - "epoch": 0.8655194391332058, - "grad_norm": 172.28298950195312, - "kl": 0.1630859375, - "learning_rate": 1.3448056086679414e-07, - "loss": 0.0065, - "reward": 1.7394400835037231, - "reward_std": 0.09455197304487228, - "rewards/pad": 0.28125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4581901729106903, - "step": 2716 - }, - { - "completion_length": 173.84375, - "epoch": 0.8658381134480561, - "grad_norm": 6.5358781814575195, - "kl": 0.1982421875, - "learning_rate": 1.3416188655194392e-07, - "loss": 0.0079, - "reward": 1.5423023700714111, - "reward_std": 0.12627559900283813, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4329274892807007, - "step": 2717 - }, - { - "completion_length": 71.40625, - "epoch": 0.8661567877629063, - "grad_norm": 25.88863182067871, - "kl": 0.1865234375, - "learning_rate": 1.338432122370937e-07, - "loss": 0.0075, - "reward": 1.7255803346633911, - "reward_std": 0.06208154559135437, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6005802154541016, - "rewards/pad": 0.125, - "step": 2718 - }, - { - "completion_length": 44.8125, - "epoch": 0.8664754620777565, - "grad_norm": 61.54029083251953, - "kl": 0.15625, - "learning_rate": 1.3352453792224348e-07, - "loss": 0.0062, - "reward": 1.6610568761825562, - "reward_std": 0.06639941036701202, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6610568761825562, - "step": 2719 - }, - { - "completion_length": 73.828125, - "epoch": 0.8667941363926067, - "grad_norm": 28.209598541259766, - "kl": 0.220703125, - "learning_rate": 1.3320586360739326e-07, - "loss": 0.0089, - "reward": 1.7774858474731445, - "reward_std": 0.1864936798810959, - "rewards/pad": 0.1875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5899857878684998, - "step": 2720 - }, - { - "completion_length": 147.375, - "epoch": 0.8671128107074569, - "grad_norm": 12.662481307983398, - "kl": 0.11669921875, - "learning_rate": 1.3288718929254304e-07, - "loss": 0.0047, - "reward": 1.5595229864120483, - "reward_std": 0.06435461342334747, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43452298641204834, - "step": 2721 - }, - { - "completion_length": 97.125, - "epoch": 0.8674314850223072, - "grad_norm": 57.771488189697266, - "kl": 0.1123046875, - "learning_rate": 1.325685149776928e-07, - "loss": 0.0045, - "reward": 1.5305076837539673, - "reward_std": 0.05875205621123314, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5305076837539673, - "rewards/pad": 0.0, - "step": 2722 - }, - { - "completion_length": 145.46875, - "epoch": 0.8677501593371574, - "grad_norm": 21.42304229736328, - "kl": 0.09619140625, - "learning_rate": 1.3224984066284255e-07, - "loss": 0.0039, - "reward": 1.5055104494094849, - "reward_std": 0.10674820840358734, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5211354494094849, - "step": 2723 - }, - { - "completion_length": 121.0625, - "epoch": 0.8680688336520076, - "grad_norm": 12.052943229675293, - "kl": 0.248046875, - "learning_rate": 1.3193116634799233e-07, - "loss": 0.0099, - "reward": 1.6499567031860352, - "reward_std": 0.07832995802164078, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5249567031860352, - "rewards/pad": 0.125, - "step": 2724 - }, - { - "completion_length": 70.28125, - "epoch": 0.8683875079668578, - "grad_norm": 19.712549209594727, - "kl": 0.1845703125, - "learning_rate": 1.316124920331421e-07, - "loss": 0.0074, - "reward": 1.6299546957015991, - "reward_std": 0.0473661869764328, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5049546957015991, - "step": 2725 - }, - { - "completion_length": 119.453125, - "epoch": 0.868706182281708, - "grad_norm": 34.98122024536133, - "kl": 0.1494140625, - "learning_rate": 1.312938177182919e-07, - "loss": 0.006, - "reward": 1.393830418586731, - "reward_std": 0.0540420226752758, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.39383038878440857, - "rewards/pad": 0.0, - "step": 2726 - }, - { - "completion_length": 46.34375, - "epoch": 0.8690248565965584, - "grad_norm": 99.46790313720703, - "kl": 0.1982421875, - "learning_rate": 1.3097514340344167e-07, - "loss": 0.0079, - "reward": 1.495451807975769, - "reward_std": 0.12049788236618042, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.511076807975769, - "step": 2727 - }, - { - "completion_length": 93.796875, - "epoch": 0.8693435309114086, - "grad_norm": 28.54528045654297, - "kl": 0.44921875, - "learning_rate": 1.3065646908859145e-07, - "loss": 0.018, - "reward": 1.6903252601623535, - "reward_std": 0.11592370271682739, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6903253197669983, - "step": 2728 - }, - { - "completion_length": 96.765625, - "epoch": 0.8696622052262588, - "grad_norm": 58.99710464477539, - "kl": 0.1591796875, - "learning_rate": 1.3033779477374123e-07, - "loss": 0.0063, - "reward": 1.7372379302978516, - "reward_std": 0.08875305950641632, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5028629302978516, - "step": 2729 - }, - { - "completion_length": 70.90625, - "epoch": 0.869980879541109, - "grad_norm": 33.128196716308594, - "kl": 0.279296875, - "learning_rate": 1.30019120458891e-07, - "loss": 0.0112, - "reward": 1.5718579292297363, - "reward_std": 0.0795237272977829, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46248289942741394, - "rewards/pad": 0.109375, - "step": 2730 - }, - { - "completion_length": 124.3125, - "epoch": 0.8702995538559593, - "grad_norm": 27.280370712280273, - "kl": 0.1396484375, - "learning_rate": 1.297004461440408e-07, - "loss": 0.0056, - "reward": 1.501305103302002, - "reward_std": 0.06377540528774261, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5013052225112915, - "step": 2731 - }, - { - "completion_length": 45.375, - "epoch": 0.8706182281708095, - "grad_norm": 50.033111572265625, - "kl": 0.248046875, - "learning_rate": 1.2938177182919055e-07, - "loss": 0.0099, - "reward": 1.6895687580108643, - "reward_std": 0.14168678224086761, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5958187580108643, - "rewards/pad": 0.09375, - "step": 2732 - }, - { - "completion_length": 119.375, - "epoch": 0.8709369024856597, - "grad_norm": 60.8686637878418, - "kl": 0.10546875, - "learning_rate": 1.2906309751434033e-07, - "loss": 0.0042, - "reward": 1.5451840162277222, - "reward_std": 0.05866240710020065, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5451840162277222, - "step": 2733 - }, - { - "completion_length": 95.75, - "epoch": 0.8712555768005099, - "grad_norm": 36.15638732910156, - "kl": 0.205078125, - "learning_rate": 1.287444231994901e-07, - "loss": 0.0082, - "reward": 1.605086088180542, - "reward_std": 0.06847918778657913, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6050861477851868, - "step": 2734 - }, - { - "completion_length": 122.96875, - "epoch": 0.8715742511153601, - "grad_norm": 9.641300201416016, - "kl": 0.158203125, - "learning_rate": 1.284257488846399e-07, - "loss": 0.0063, - "reward": 1.5682752132415771, - "reward_std": 0.10088510811328888, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4432750344276428, - "step": 2735 - }, - { - "completion_length": 145.671875, - "epoch": 0.8718929254302104, - "grad_norm": 14.491096496582031, - "kl": 0.11767578125, - "learning_rate": 1.2810707456978967e-07, - "loss": 0.0047, - "reward": 1.3433637619018555, - "reward_std": 0.027368104085326195, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.34336382150650024, - "step": 2736 - }, - { - "completion_length": 72.421875, - "epoch": 0.8722115997450606, - "grad_norm": 52.69004440307617, - "kl": 0.1630859375, - "learning_rate": 1.2778840025493945e-07, - "loss": 0.0065, - "reward": 1.489732265472412, - "reward_std": 0.16871726512908936, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3959823250770569, - "rewards/pad": 0.09375, - "step": 2737 - }, - { - "completion_length": 143.859375, - "epoch": 0.8725302740599108, - "grad_norm": 36.90729522705078, - "kl": 0.09912109375, - "learning_rate": 1.2746972594008923e-07, - "loss": 0.004, - "reward": 1.7073585987091064, - "reward_std": 0.05773121491074562, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5823585391044617, - "step": 2738 - }, - { - "completion_length": 94.453125, - "epoch": 0.872848948374761, - "grad_norm": 14.649452209472656, - "kl": 0.15234375, - "learning_rate": 1.27151051625239e-07, - "loss": 0.0061, - "reward": 1.7781200408935547, - "reward_std": 0.07444360852241516, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6531199216842651, - "step": 2739 - }, - { - "completion_length": 69.359375, - "epoch": 0.8731676226896112, - "grad_norm": 364.4878845214844, - "kl": 0.2099609375, - "learning_rate": 1.268323773103888e-07, - "loss": 0.0084, - "reward": 1.5531272888183594, - "reward_std": 0.10912410914897919, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5687524080276489, - "step": 2740 - }, - { - "completion_length": 46.5, - "epoch": 0.8734862970044615, - "grad_norm": 23.947185516357422, - "kl": 0.2470703125, - "learning_rate": 1.2651370299553854e-07, - "loss": 0.0099, - "reward": 1.677107810974121, - "reward_std": 0.12452313303947449, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5677329897880554, - "step": 2741 - }, - { - "completion_length": 122.046875, - "epoch": 0.8738049713193117, - "grad_norm": 17.992095947265625, - "kl": 0.2333984375, - "learning_rate": 1.2619502868068832e-07, - "loss": 0.0093, - "reward": 1.645282506942749, - "reward_std": 0.05429815873503685, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.39528244733810425, - "step": 2742 - }, - { - "completion_length": 71.890625, - "epoch": 0.8741236456341619, - "grad_norm": 45.481292724609375, - "kl": 0.140625, - "learning_rate": 1.258763543658381e-07, - "loss": 0.0056, - "reward": 1.602842926979065, - "reward_std": 0.11617221683263779, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4778429865837097, - "step": 2743 - }, - { - "completion_length": 71.078125, - "epoch": 0.8744423199490121, - "grad_norm": 15.559009552001953, - "kl": 0.23046875, - "learning_rate": 1.2555768005098789e-07, - "loss": 0.0092, - "reward": 1.6246838569641113, - "reward_std": 0.09834360331296921, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5153087973594666, - "rewards/pad": 0.109375, - "step": 2744 - }, - { - "completion_length": 68.6875, - "epoch": 0.8747609942638623, - "grad_norm": 31.50205421447754, - "kl": 0.2490234375, - "learning_rate": 1.2523900573613767e-07, - "loss": 0.01, - "reward": 1.60776948928833, - "reward_std": 0.07542349398136139, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6077694892883301, - "rewards/pad": 0.0, - "step": 2745 - }, - { - "completion_length": 124.0625, - "epoch": 0.8750796685787126, - "grad_norm": 94.65583038330078, - "kl": 0.154296875, - "learning_rate": 1.2492033142128745e-07, - "loss": 0.0062, - "reward": 1.5391860008239746, - "reward_std": 0.052262432873249054, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5391859412193298, - "step": 2746 - }, - { - "completion_length": 98.125, - "epoch": 0.8753983428935628, - "grad_norm": 28.832426071166992, - "kl": 0.1591796875, - "learning_rate": 1.2460165710643723e-07, - "loss": 0.0064, - "reward": 1.5147631168365479, - "reward_std": 0.07860881835222244, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5147630572319031, - "rewards/pad": 0.0, - "step": 2747 - }, - { - "completion_length": 99.171875, - "epoch": 0.875717017208413, - "grad_norm": 24.03593635559082, - "kl": 0.40234375, - "learning_rate": 1.24282982791587e-07, - "loss": 0.0161, - "reward": 1.420696496963501, - "reward_std": 0.09086885303258896, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.43632152676582336, - "step": 2748 - }, - { - "completion_length": 150.703125, - "epoch": 0.8760356915232632, - "grad_norm": 41.519927978515625, - "kl": 0.154296875, - "learning_rate": 1.2396430847673676e-07, - "loss": 0.0062, - "reward": 1.4927181005477905, - "reward_std": 0.12375061213970184, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41459304094314575, - "step": 2749 - }, - { - "completion_length": 69.4375, - "epoch": 0.8763543658381134, - "grad_norm": 83.76514434814453, - "kl": 0.1572265625, - "learning_rate": 1.2364563416188654e-07, - "loss": 0.0063, - "reward": 1.7640998363494873, - "reward_std": 0.05387468636035919, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6390999555587769, - "step": 2750 - }, - { - "completion_length": 122.15625, - "epoch": 0.8766730401529637, - "grad_norm": 19.833681106567383, - "kl": 0.12890625, - "learning_rate": 1.2332695984703632e-07, - "loss": 0.0051, - "reward": 1.7683115005493164, - "reward_std": 0.07841889560222626, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5183114409446716, - "step": 2751 - }, - { - "completion_length": 45.046875, - "epoch": 0.8769917144678139, - "grad_norm": 24.46291160583496, - "kl": 0.15234375, - "learning_rate": 1.230082855321861e-07, - "loss": 0.0061, - "reward": 1.757706880569458, - "reward_std": 0.07475492358207703, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.757706880569458, - "rewards/pad": 0.0, - "step": 2752 - }, - { - "completion_length": 72.140625, - "epoch": 0.8773103887826641, - "grad_norm": 23.918794631958008, - "kl": 0.2060546875, - "learning_rate": 1.2268961121733588e-07, - "loss": 0.0082, - "reward": 1.5892629623413086, - "reward_std": 0.14511264860630035, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.4798879027366638, - "step": 2753 - }, - { - "completion_length": 95.84375, - "epoch": 0.8776290630975143, - "grad_norm": 20.943511962890625, - "kl": 0.17578125, - "learning_rate": 1.2237093690248564e-07, - "loss": 0.007, - "reward": 1.7631535530090332, - "reward_std": 0.1328577697277069, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.7006535530090332, - "step": 2754 - }, - { - "completion_length": 69.328125, - "epoch": 0.8779477374123645, - "grad_norm": 68.40469360351562, - "kl": 0.1904296875, - "learning_rate": 1.2205226258763542e-07, - "loss": 0.0076, - "reward": 1.6983041763305664, - "reward_std": 0.04504488408565521, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6983040571212769, - "rewards/pad": 0.0, - "step": 2755 - }, - { - "completion_length": 44.3125, - "epoch": 0.8782664117272148, - "grad_norm": 48.420501708984375, - "kl": 0.1982421875, - "learning_rate": 1.217335882727852e-07, - "loss": 0.0079, - "reward": 1.797545313835144, - "reward_std": 0.060333847999572754, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.797545313835144, - "rewards/pad": 0.0, - "step": 2756 - }, - { - "completion_length": 147.71875, - "epoch": 0.878585086042065, - "grad_norm": 46.09718322753906, - "kl": 0.08984375, - "learning_rate": 1.2141491395793498e-07, - "loss": 0.0036, - "reward": 1.476060152053833, - "reward_std": 0.07144233584403992, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4760601818561554, - "step": 2757 - }, - { - "completion_length": 93.015625, - "epoch": 0.8789037603569152, - "grad_norm": 20.685373306274414, - "kl": 0.2041015625, - "learning_rate": 1.2109623964308476e-07, - "loss": 0.0081, - "reward": 1.4160687923431396, - "reward_std": 0.10059767961502075, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.41606879234313965, - "rewards/pad": 0.0, - "step": 2758 - }, - { - "completion_length": 70.40625, - "epoch": 0.8792224346717654, - "grad_norm": 82.97880554199219, - "kl": 0.1318359375, - "learning_rate": 1.2077756532823454e-07, - "loss": 0.0053, - "reward": 1.672110915184021, - "reward_std": 0.10492867976427078, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.531485915184021, - "step": 2759 - }, - { - "completion_length": 95.96875, - "epoch": 0.8795411089866156, - "grad_norm": 44.148155212402344, - "kl": 0.10302734375, - "learning_rate": 1.2045889101338432e-07, - "loss": 0.0041, - "reward": 1.7742416858673096, - "reward_std": 0.05053679645061493, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5242416858673096, - "rewards/pad": 0.25, - "step": 2760 - }, - { - "completion_length": 94.015625, - "epoch": 0.8798597833014659, - "grad_norm": 50.04271697998047, - "kl": 0.0986328125, - "learning_rate": 1.201402166985341e-07, - "loss": 0.004, - "reward": 1.66432523727417, - "reward_std": 0.12925195693969727, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5549502372741699, - "step": 2761 - }, - { - "completion_length": 99.40625, - "epoch": 0.8801784576163161, - "grad_norm": 45.97916030883789, - "kl": 0.1357421875, - "learning_rate": 1.1982154238368388e-07, - "loss": 0.0054, - "reward": 1.7068970203399658, - "reward_std": 0.0851178914308548, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45689696073532104, - "step": 2762 - }, - { - "completion_length": 45.015625, - "epoch": 0.8804971319311663, - "grad_norm": 27.48224449157715, - "kl": 0.216796875, - "learning_rate": 1.1950286806883364e-07, - "loss": 0.0087, - "reward": 1.5753300189971924, - "reward_std": 0.08806179463863373, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5753300189971924, - "rewards/pad": 0.0, - "step": 2763 - }, - { - "completion_length": 96.21875, - "epoch": 0.8808158062460165, - "grad_norm": 25.854440689086914, - "kl": 0.1865234375, - "learning_rate": 1.1918419375398342e-07, - "loss": 0.0075, - "reward": 1.5593594312667847, - "reward_std": 0.07620866596698761, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5593595504760742, - "step": 2764 - }, - { - "completion_length": 97.046875, - "epoch": 0.8811344805608667, - "grad_norm": 25.43502426147461, - "kl": 0.12158203125, - "learning_rate": 1.188655194391332e-07, - "loss": 0.0049, - "reward": 1.6679394245147705, - "reward_std": 0.06287582218647003, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.542939305305481, - "step": 2765 - }, - { - "completion_length": 119.40625, - "epoch": 0.8814531548757171, - "grad_norm": 18.379669189453125, - "kl": 0.1318359375, - "learning_rate": 1.1854684512428298e-07, - "loss": 0.0053, - "reward": 1.4637765884399414, - "reward_std": 0.1088210940361023, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4012766480445862, - "rewards/pad": 0.0625, - "step": 2766 - }, - { - "completion_length": 47.203125, - "epoch": 0.8817718291905673, - "grad_norm": 76.71330261230469, - "kl": 0.244140625, - "learning_rate": 1.1822817080943276e-07, - "loss": 0.0097, - "reward": 1.5661251544952393, - "reward_std": 0.05948948487639427, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4411252439022064, - "rewards/pad": 0.125, - "step": 2767 - }, - { - "completion_length": 96.296875, - "epoch": 0.8820905035054175, - "grad_norm": 101.81230926513672, - "kl": 0.146484375, - "learning_rate": 1.1790949649458252e-07, - "loss": 0.0059, - "reward": 1.5366199016571045, - "reward_std": 0.024259870871901512, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2866199314594269, - "rewards/pad": 0.25, - "step": 2768 - }, - { - "completion_length": 124.484375, - "epoch": 0.8824091778202677, - "grad_norm": 28.96718406677246, - "kl": 0.1328125, - "learning_rate": 1.175908221797323e-07, - "loss": 0.0053, - "reward": 1.5327292680740356, - "reward_std": 0.0342310331761837, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.40772929787635803, - "rewards/pad": 0.125, - "step": 2769 - }, - { - "completion_length": 122.203125, - "epoch": 0.882727852135118, - "grad_norm": 14.978941917419434, - "kl": 0.18359375, - "learning_rate": 1.1727214786488209e-07, - "loss": 0.0073, - "reward": 1.6567342281341553, - "reward_std": 0.06245347112417221, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6567343473434448, - "rewards/pad": 0.0, - "step": 2770 - }, - { - "completion_length": 70.90625, - "epoch": 0.8830465264499682, - "grad_norm": 64.56586456298828, - "kl": 0.15625, - "learning_rate": 1.1695347355003187e-07, - "loss": 0.0062, - "reward": 1.6610746383666992, - "reward_std": 0.09375263750553131, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.551699697971344, - "step": 2771 - }, - { - "completion_length": 71.40625, - "epoch": 0.8833652007648184, - "grad_norm": 38.95387268066406, - "kl": 0.2431640625, - "learning_rate": 1.1663479923518165e-07, - "loss": 0.0097, - "reward": 1.73610258102417, - "reward_std": 0.11607648432254791, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6423525810241699, - "rewards/pad": 0.09375, - "step": 2772 - }, - { - "completion_length": 96.0625, - "epoch": 0.8836838750796686, - "grad_norm": 16.487346649169922, - "kl": 0.1923828125, - "learning_rate": 1.1631612492033141e-07, - "loss": 0.0077, - "reward": 1.6498181819915771, - "reward_std": 0.07369641214609146, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6498181819915771, - "rewards/pad": 0.0, - "step": 2773 - }, - { - "completion_length": 72.859375, - "epoch": 0.8840025493945188, - "grad_norm": 25.980937957763672, - "kl": 0.1923828125, - "learning_rate": 1.159974506054812e-07, - "loss": 0.0077, - "reward": 1.79311203956604, - "reward_std": 0.053481630980968475, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.54311203956604, - "step": 2774 - }, - { - "completion_length": 70.96875, - "epoch": 0.884321223709369, - "grad_norm": 53.19056701660156, - "kl": 0.1787109375, - "learning_rate": 1.1567877629063097e-07, - "loss": 0.0071, - "reward": 1.7510408163070679, - "reward_std": 0.12297496944665909, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5166658163070679, - "rewards/pad": 0.25, - "step": 2775 - }, - { - "completion_length": 95.671875, - "epoch": 0.8846398980242193, - "grad_norm": 34.38197708129883, - "kl": 0.216796875, - "learning_rate": 1.1536010197578076e-07, - "loss": 0.0087, - "reward": 1.4682893753051758, - "reward_std": 0.08085277676582336, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.468289315700531, - "rewards/pad": 0.0, - "step": 2776 - }, - { - "completion_length": 174.328125, - "epoch": 0.8849585723390695, - "grad_norm": 10.826027870178223, - "kl": 0.09765625, - "learning_rate": 1.1504142766093052e-07, - "loss": 0.0039, - "reward": 1.53999662399292, - "reward_std": 0.03267857804894447, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5399965047836304, - "step": 2777 - }, - { - "completion_length": 44.890625, - "epoch": 0.8852772466539197, - "grad_norm": 32.9930419921875, - "kl": 0.267578125, - "learning_rate": 1.1472275334608029e-07, - "loss": 0.0107, - "reward": 1.919132947921753, - "reward_std": 0.10893414914608002, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.6847580075263977, - "rewards/pad": 0.25, - "step": 2778 - }, - { - "completion_length": 120.5625, - "epoch": 0.8855959209687699, - "grad_norm": 11.993074417114258, - "kl": 0.1162109375, - "learning_rate": 1.1440407903123007e-07, - "loss": 0.0046, - "reward": 1.4315476417541504, - "reward_std": 0.036456815898418427, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4315475821495056, - "step": 2779 - }, - { - "completion_length": 147.53125, - "epoch": 0.8859145952836202, - "grad_norm": 47.42682647705078, - "kl": 0.10107421875, - "learning_rate": 1.1408540471637985e-07, - "loss": 0.004, - "reward": 1.5162981748580933, - "reward_std": 0.02957623079419136, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5162981152534485, - "rewards/pad": 0.0, - "step": 2780 - }, - { - "completion_length": 121.96875, - "epoch": 0.8862332695984704, - "grad_norm": 15.857872009277344, - "kl": 0.12890625, - "learning_rate": 1.1376673040152963e-07, - "loss": 0.0051, - "reward": 1.446213722229004, - "reward_std": 0.055445119738578796, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4462137818336487, - "step": 2781 - }, - { - "completion_length": 70.796875, - "epoch": 0.8865519439133206, - "grad_norm": 34.068119049072266, - "kl": 0.205078125, - "learning_rate": 1.1344805608667941e-07, - "loss": 0.0082, - "reward": 1.6310408115386963, - "reward_std": 0.10182473808526993, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5060407519340515, - "step": 2782 - }, - { - "completion_length": 69.96875, - "epoch": 0.8868706182281708, - "grad_norm": 31.797401428222656, - "kl": 0.201171875, - "learning_rate": 1.1312938177182918e-07, - "loss": 0.008, - "reward": 1.573502779006958, - "reward_std": 0.05599069595336914, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5735026597976685, - "rewards/pad": 0.0, - "step": 2783 - }, - { - "completion_length": 43.8125, - "epoch": 0.887189292543021, - "grad_norm": 74.26041412353516, - "kl": 0.1953125, - "learning_rate": 1.1281070745697896e-07, - "loss": 0.0078, - "reward": 1.6191648244857788, - "reward_std": 0.07862254977226257, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6191648244857788, - "rewards/pad": 0.0, - "step": 2784 - }, - { - "completion_length": 98.03125, - "epoch": 0.8875079668578713, - "grad_norm": 34.77701187133789, - "kl": 0.16015625, - "learning_rate": 1.1249203314212874e-07, - "loss": 0.0064, - "reward": 1.4505538940429688, - "reward_std": 0.14543873071670532, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.48180389404296875, - "rewards/pad": 0.0, - "step": 2785 - }, - { - "completion_length": 95.296875, - "epoch": 0.8878266411727215, - "grad_norm": 21.708093643188477, - "kl": 0.306640625, - "learning_rate": 1.1217335882727852e-07, - "loss": 0.0122, - "reward": 1.4635813236236572, - "reward_std": 0.08164389431476593, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4635813236236572, - "step": 2786 - }, - { - "completion_length": 69.65625, - "epoch": 0.8881453154875717, - "grad_norm": 33.7183952331543, - "kl": 0.2060546875, - "learning_rate": 1.118546845124283e-07, - "loss": 0.0082, - "reward": 1.6941120624542236, - "reward_std": 0.06286899000406265, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5691120028495789, - "rewards/pad": 0.125, - "step": 2787 - }, - { - "completion_length": 68.359375, - "epoch": 0.8884639898024219, - "grad_norm": 41.23957443237305, - "kl": 0.310546875, - "learning_rate": 1.1153601019757807e-07, - "loss": 0.0124, - "reward": 1.5158627033233643, - "reward_std": 0.14926230907440186, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5314877033233643, - "rewards/pad": 0.0, - "step": 2788 - }, - { - "completion_length": 94.640625, - "epoch": 0.8887826641172721, - "grad_norm": 52.03951644897461, - "kl": 0.125, - "learning_rate": 1.1121733588272785e-07, - "loss": 0.005, - "reward": 1.6361286640167236, - "reward_std": 0.06877190619707108, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5111286640167236, - "rewards/pad": 0.125, - "step": 2789 - }, - { - "completion_length": 145.421875, - "epoch": 0.8891013384321224, - "grad_norm": 33.65622329711914, - "kl": 0.11376953125, - "learning_rate": 1.1089866156787763e-07, - "loss": 0.0046, - "reward": 1.5649499893188477, - "reward_std": 0.04964219778776169, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5649500489234924, - "step": 2790 - }, - { - "completion_length": 147.796875, - "epoch": 0.8894200127469726, - "grad_norm": 24.133140563964844, - "kl": 0.09619140625, - "learning_rate": 1.1057998725302741e-07, - "loss": 0.0038, - "reward": 1.5920072793960571, - "reward_std": 0.03973953425884247, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4670073091983795, - "rewards/pad": 0.125, - "step": 2791 - }, - { - "completion_length": 147.875, - "epoch": 0.8897386870618228, - "grad_norm": 39.576568603515625, - "kl": 0.1767578125, - "learning_rate": 1.1026131293817719e-07, - "loss": 0.0071, - "reward": 1.4278740882873535, - "reward_std": 0.04383498802781105, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42787405848503113, - "step": 2792 - }, - { - "completion_length": 71.4375, - "epoch": 0.890057361376673, - "grad_norm": 16.36480712890625, - "kl": 0.181640625, - "learning_rate": 1.0994263862332694e-07, - "loss": 0.0073, - "reward": 1.9082775115966797, - "reward_std": 0.054589416831731796, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7832775115966797, - "rewards/pad": 0.125, - "step": 2793 - }, - { - "completion_length": 70.328125, - "epoch": 0.8903760356915232, - "grad_norm": 30.750213623046875, - "kl": 0.17578125, - "learning_rate": 1.0962396430847672e-07, - "loss": 0.007, - "reward": 1.5296229124069214, - "reward_std": 0.06360771507024765, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5296229124069214, - "rewards/pad": 0.0, - "step": 2794 - }, - { - "completion_length": 98.46875, - "epoch": 0.8906947100063735, - "grad_norm": 14.531323432922363, - "kl": 0.25, - "learning_rate": 1.093052899936265e-07, - "loss": 0.01, - "reward": 1.6425288915634155, - "reward_std": 0.13553979992866516, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5331540107727051, - "step": 2795 - }, - { - "completion_length": 94.96875, - "epoch": 0.8910133843212237, - "grad_norm": 19.674827575683594, - "kl": 0.1845703125, - "learning_rate": 1.0898661567877629e-07, - "loss": 0.0074, - "reward": 1.7117376327514648, - "reward_std": 0.06816531717777252, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5867376327514648, - "rewards/pad": 0.125, - "step": 2796 - }, - { - "completion_length": 68.328125, - "epoch": 0.8913320586360739, - "grad_norm": 55.083740234375, - "kl": 0.23828125, - "learning_rate": 1.0866794136392607e-07, - "loss": 0.0095, - "reward": 1.8895810842514038, - "reward_std": 0.059445325285196304, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7645809650421143, - "rewards/pad": 0.125, - "step": 2797 - }, - { - "completion_length": 70.953125, - "epoch": 0.8916507329509241, - "grad_norm": 19.55609703063965, - "kl": 0.216796875, - "learning_rate": 1.0834926704907583e-07, - "loss": 0.0087, - "reward": 1.7575838565826416, - "reward_std": 0.055263977497816086, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6325837969779968, - "rewards/pad": 0.125, - "step": 2798 - }, - { - "completion_length": 97.046875, - "epoch": 0.8919694072657743, - "grad_norm": 21.733686447143555, - "kl": 0.1611328125, - "learning_rate": 1.0803059273422561e-07, - "loss": 0.0064, - "reward": 1.7677216529846191, - "reward_std": 0.05408502370119095, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5177216529846191, - "rewards/pad": 0.25, - "step": 2799 - }, - { - "completion_length": 100.375, - "epoch": 0.8922880815806246, - "grad_norm": 57.19150924682617, - "kl": 0.17578125, - "learning_rate": 1.077119184193754e-07, - "loss": 0.007, - "reward": 1.5054199695587158, - "reward_std": 0.08746735006570816, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4897950291633606, - "step": 2800 - }, - { - "completion_length": 126.625, - "epoch": 0.8926067558954748, - "grad_norm": 23.023639678955078, - "kl": 0.1396484375, - "learning_rate": 1.0739324410452518e-07, - "loss": 0.0056, - "reward": 1.4498021602630615, - "reward_std": 0.04989495873451233, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4498022198677063, - "step": 2801 - }, - { - "completion_length": 176.5, - "epoch": 0.892925430210325, - "grad_norm": 52.16209030151367, - "kl": 0.134765625, - "learning_rate": 1.0707456978967496e-07, - "loss": 0.0054, - "reward": 1.4218698740005493, - "reward_std": 0.045519448816776276, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4218699038028717, - "step": 2802 - }, - { - "completion_length": 72.5, - "epoch": 0.8932441045251752, - "grad_norm": 1095.290771484375, - "kl": 0.1279296875, - "learning_rate": 1.0675589547482472e-07, - "loss": 0.0051, - "reward": 1.6382055282592773, - "reward_std": 0.04405108466744423, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5132055282592773, - "rewards/pad": 0.125, - "step": 2803 - }, - { - "completion_length": 124.328125, - "epoch": 0.8935627788400254, - "grad_norm": 20.249889373779297, - "kl": 0.12158203125, - "learning_rate": 1.064372211599745e-07, - "loss": 0.0049, - "reward": 1.6384567022323608, - "reward_std": 0.040937215089797974, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5134565830230713, - "step": 2804 - }, - { - "completion_length": 96.53125, - "epoch": 0.8938814531548758, - "grad_norm": 47.22350311279297, - "kl": 0.1845703125, - "learning_rate": 1.0611854684512428e-07, - "loss": 0.0074, - "reward": 1.4459178447723389, - "reward_std": 0.0687478631734848, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44591790437698364, - "rewards/pad": 0.0, - "step": 2805 - }, - { - "completion_length": 95.203125, - "epoch": 0.894200127469726, - "grad_norm": 48.77071762084961, - "kl": 0.138671875, - "learning_rate": 1.0579987253027406e-07, - "loss": 0.0056, - "reward": 1.64748215675354, - "reward_std": 0.05162367969751358, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.52248215675354, - "rewards/pad": 0.125, - "step": 2806 - }, - { - "completion_length": 98.09375, - "epoch": 0.8945188017845762, - "grad_norm": 33.9787483215332, - "kl": 0.1572265625, - "learning_rate": 1.0548119821542384e-07, - "loss": 0.0063, - "reward": 1.3451802730560303, - "reward_std": 0.04213887080550194, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3451803922653198, - "rewards/pad": 0.0, - "step": 2807 - }, - { - "completion_length": 118.8125, - "epoch": 0.8948374760994264, - "grad_norm": 34.39272689819336, - "kl": 0.111328125, - "learning_rate": 1.051625239005736e-07, - "loss": 0.0045, - "reward": 1.5682514905929565, - "reward_std": 0.04192160815000534, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5682514905929565, - "rewards/pad": 0.0, - "step": 2808 - }, - { - "completion_length": 69.828125, - "epoch": 0.8951561504142767, - "grad_norm": 61.049163818359375, - "kl": 0.27734375, - "learning_rate": 1.0484384958572338e-07, - "loss": 0.0111, - "reward": 1.6074988842010498, - "reward_std": 0.045794710516929626, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48249879479408264, - "rewards/pad": 0.125, - "step": 2809 - }, - { - "completion_length": 97.359375, - "epoch": 0.8954748247291269, - "grad_norm": 41.99542999267578, - "kl": 0.1591796875, - "learning_rate": 1.0452517527087316e-07, - "loss": 0.0064, - "reward": 1.4806034564971924, - "reward_std": 0.062367819249629974, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48060333728790283, - "step": 2810 - }, - { - "completion_length": 148.125, - "epoch": 0.8957934990439771, - "grad_norm": 21.82179069519043, - "kl": 0.09521484375, - "learning_rate": 1.0420650095602294e-07, - "loss": 0.0038, - "reward": 1.3913319110870361, - "reward_std": 0.0555611327290535, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3913319706916809, - "step": 2811 - }, - { - "completion_length": 119.0, - "epoch": 0.8961121733588273, - "grad_norm": 17.92748260498047, - "kl": 0.1201171875, - "learning_rate": 1.0388782664117271e-07, - "loss": 0.0048, - "reward": 1.6690609455108643, - "reward_std": 0.06315435469150543, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5440609455108643, - "step": 2812 - }, - { - "completion_length": 45.0, - "epoch": 0.8964308476736775, - "grad_norm": 93.83169555664062, - "kl": 0.15625, - "learning_rate": 1.0356915232632249e-07, - "loss": 0.0062, - "reward": 1.67413330078125, - "reward_std": 0.06929302215576172, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42413339018821716, - "step": 2813 - }, - { - "completion_length": 97.125, - "epoch": 0.8967495219885278, - "grad_norm": 66.44733428955078, - "kl": 0.189453125, - "learning_rate": 1.0325047801147227e-07, - "loss": 0.0076, - "reward": 1.5241734981536865, - "reward_std": 0.06407955288887024, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5241734981536865, - "step": 2814 - }, - { - "completion_length": 71.5625, - "epoch": 0.897068196303378, - "grad_norm": 67.99653625488281, - "kl": 0.22265625, - "learning_rate": 1.0293180369662205e-07, - "loss": 0.0089, - "reward": 1.6482584476470947, - "reward_std": 0.13813325762748718, - "rewards/answer_reward": 0.078125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5701333284378052, - "step": 2815 - }, - { - "completion_length": 72.0, - "epoch": 0.8973868706182282, - "grad_norm": 80.87406158447266, - "kl": 0.2197265625, - "learning_rate": 1.0261312938177183e-07, - "loss": 0.0088, - "reward": 1.9038026332855225, - "reward_std": 0.06934420764446259, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6538025736808777, - "rewards/pad": 0.25, - "step": 2816 - }, - { - "completion_length": 148.125, - "epoch": 0.8977055449330784, - "grad_norm": 25.08995246887207, - "kl": 0.5, - "learning_rate": 1.022944550669216e-07, - "loss": 0.0201, - "reward": 1.659876823425293, - "reward_std": 0.05532084405422211, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5348767638206482, - "step": 2817 - }, - { - "completion_length": 100.3125, - "epoch": 0.8980242192479286, - "grad_norm": 28.815271377563477, - "kl": 0.1513671875, - "learning_rate": 1.0197578075207138e-07, - "loss": 0.0061, - "reward": 1.5481399297714233, - "reward_std": 0.0830165445804596, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.29813987016677856, - "rewards/pad": 0.25, - "step": 2818 - }, - { - "completion_length": 98.140625, - "epoch": 0.8983428935627789, - "grad_norm": 12.215855598449707, - "kl": 0.146484375, - "learning_rate": 1.0165710643722116e-07, - "loss": 0.0058, - "reward": 1.4812893867492676, - "reward_std": 0.02404611185193062, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3562892973423004, - "rewards/pad": 0.125, - "step": 2819 - }, - { - "completion_length": 97.65625, - "epoch": 0.8986615678776291, - "grad_norm": 21.20149803161621, - "kl": 0.21875, - "learning_rate": 1.0133843212237094e-07, - "loss": 0.0087, - "reward": 1.4777677059173584, - "reward_std": 0.07897792756557465, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.352767676115036, - "step": 2820 - }, - { - "completion_length": 73.75, - "epoch": 0.8989802421924793, - "grad_norm": 28.28120231628418, - "kl": 0.203125, - "learning_rate": 1.0101975780752072e-07, - "loss": 0.0081, - "reward": 1.907674789428711, - "reward_std": 0.0637560784816742, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6576747894287109, - "rewards/pad": 0.25, - "step": 2821 - }, - { - "completion_length": 71.21875, - "epoch": 0.8992989165073295, - "grad_norm": 14.870098114013672, - "kl": 0.1552734375, - "learning_rate": 1.0070108349267049e-07, - "loss": 0.0062, - "reward": 1.5852243900299072, - "reward_std": 0.07287629693746567, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5852242708206177, - "rewards/pad": 0.0, - "step": 2822 - }, - { - "completion_length": 123.28125, - "epoch": 0.8996175908221797, - "grad_norm": 21.250886917114258, - "kl": 0.1455078125, - "learning_rate": 1.0038240917782025e-07, - "loss": 0.0058, - "reward": 1.5068753957748413, - "reward_std": 0.11394312977790833, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3975003957748413, - "step": 2823 - }, - { - "completion_length": 71.40625, - "epoch": 0.89993626513703, - "grad_norm": 36.98274612426758, - "kl": 0.283203125, - "learning_rate": 1.0006373486297003e-07, - "loss": 0.0114, - "reward": 1.643838882446289, - "reward_std": 0.06396117806434631, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3938388228416443, - "rewards/pad": 0.25, - "step": 2824 - }, - { - "completion_length": 149.0, - "epoch": 0.9002549394518802, - "grad_norm": 48.26386260986328, - "kl": 0.07568359375, - "learning_rate": 9.974506054811981e-08, - "loss": 0.003, - "reward": 1.6491787433624268, - "reward_std": 0.04626685753464699, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5241787433624268, - "rewards/pad": 0.125, - "step": 2825 - }, - { - "completion_length": 70.765625, - "epoch": 0.9005736137667304, - "grad_norm": 48.901039123535156, - "kl": 0.1376953125, - "learning_rate": 9.94263862332696e-08, - "loss": 0.0055, - "reward": 1.495769739151001, - "reward_std": 0.05101197585463524, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.495769739151001, - "rewards/pad": 0.0, - "step": 2826 - }, - { - "completion_length": 46.078125, - "epoch": 0.9008922880815806, - "grad_norm": 26.932992935180664, - "kl": 0.181640625, - "learning_rate": 9.910771191841936e-08, - "loss": 0.0073, - "reward": 1.8378021717071533, - "reward_std": 0.05551741272211075, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5878022313117981, - "step": 2827 - }, - { - "completion_length": 96.734375, - "epoch": 0.9012109623964308, - "grad_norm": 30.460529327392578, - "kl": 0.1435546875, - "learning_rate": 9.878903760356914e-08, - "loss": 0.0057, - "reward": 1.732578158378601, - "reward_std": 0.06156246364116669, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6075780987739563, - "rewards/pad": 0.125, - "step": 2828 - }, - { - "completion_length": 97.390625, - "epoch": 0.9015296367112811, - "grad_norm": 63.52983474731445, - "kl": 0.1328125, - "learning_rate": 9.847036328871892e-08, - "loss": 0.0053, - "reward": 1.3585861921310425, - "reward_std": 0.07809005677700043, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.35858625173568726, - "rewards/pad": 0.0, - "step": 2829 - }, - { - "completion_length": 94.453125, - "epoch": 0.9018483110261313, - "grad_norm": 26.720457077026367, - "kl": 0.1533203125, - "learning_rate": 9.81516889738687e-08, - "loss": 0.0062, - "reward": 1.5476949214935303, - "reward_std": 0.04042517766356468, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5476949214935303, - "step": 2830 - }, - { - "completion_length": 21.171875, - "epoch": 0.9021669853409815, - "grad_norm": 135.34568786621094, - "kl": 0.26953125, - "learning_rate": 9.783301465901848e-08, - "loss": 0.0108, - "reward": 1.9858126640319824, - "reward_std": 0.07888701558113098, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6108126044273376, - "rewards/pad": 0.375, - "step": 2831 - }, - { - "completion_length": 96.140625, - "epoch": 0.9024856596558317, - "grad_norm": 23.70723533630371, - "kl": 0.1416015625, - "learning_rate": 9.751434034416825e-08, - "loss": 0.0057, - "reward": 1.6917176246643066, - "reward_std": 0.04848533496260643, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5667175054550171, - "rewards/pad": 0.125, - "step": 2832 - }, - { - "completion_length": 94.546875, - "epoch": 0.9028043339706819, - "grad_norm": 30.213918685913086, - "kl": 0.1513671875, - "learning_rate": 9.719566602931803e-08, - "loss": 0.0061, - "reward": 1.6319092512130737, - "reward_std": 0.10927574336528778, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.631909191608429, - "step": 2833 - }, - { - "completion_length": 119.59375, - "epoch": 0.9031230082855322, - "grad_norm": 73.39154052734375, - "kl": 0.1455078125, - "learning_rate": 9.687699171446781e-08, - "loss": 0.0058, - "reward": 1.679621934890747, - "reward_std": 0.06592319160699844, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5546219944953918, - "step": 2834 - }, - { - "completion_length": 93.828125, - "epoch": 0.9034416826003824, - "grad_norm": 19.510112762451172, - "kl": 0.12353515625, - "learning_rate": 9.655831739961759e-08, - "loss": 0.0049, - "reward": 1.6480538845062256, - "reward_std": 0.054087623953819275, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5230540037155151, - "rewards/pad": 0.125, - "step": 2835 - }, - { - "completion_length": 44.0, - "epoch": 0.9037603569152326, - "grad_norm": 27.508333206176758, - "kl": 0.42578125, - "learning_rate": 9.623964308476737e-08, - "loss": 0.017, - "reward": 1.6644142866134644, - "reward_std": 0.10253306478261948, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43003928661346436, - "step": 2836 - }, - { - "completion_length": 95.4375, - "epoch": 0.9040790312300828, - "grad_norm": 15.999404907226562, - "kl": 0.1259765625, - "learning_rate": 9.592096876991714e-08, - "loss": 0.005, - "reward": 1.4208650588989258, - "reward_std": 0.02377263642847538, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42086514830589294, - "step": 2837 - }, - { - "completion_length": 122.5625, - "epoch": 0.904397705544933, - "grad_norm": 158.2935791015625, - "kl": 0.0947265625, - "learning_rate": 9.560229445506691e-08, - "loss": 0.0038, - "reward": 1.4768131971359253, - "reward_std": 0.09162989258766174, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4768131375312805, - "rewards/pad": 0.0, - "step": 2838 - }, - { - "completion_length": 121.875, - "epoch": 0.9047163798597833, - "grad_norm": 146.5867919921875, - "kl": 0.12060546875, - "learning_rate": 9.528362014021669e-08, - "loss": 0.0048, - "reward": 1.6847409009933472, - "reward_std": 0.10323180258274078, - "rewards/pad": 0.171875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5128659009933472, - "step": 2839 - }, - { - "completion_length": 71.40625, - "epoch": 0.9050350541746335, - "grad_norm": 94.4487075805664, - "kl": 0.232421875, - "learning_rate": 9.496494582536647e-08, - "loss": 0.0093, - "reward": 1.6142141819000244, - "reward_std": 0.06473654508590698, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.489214152097702, - "rewards/pad": 0.125, - "step": 2840 - }, - { - "completion_length": 71.390625, - "epoch": 0.9053537284894837, - "grad_norm": 28.924110412597656, - "kl": 0.166015625, - "learning_rate": 9.464627151051625e-08, - "loss": 0.0066, - "reward": 1.800523281097412, - "reward_std": 0.07699442654848099, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5505234003067017, - "step": 2841 - }, - { - "completion_length": 98.875, - "epoch": 0.9056724028043339, - "grad_norm": 172.80946350097656, - "kl": 0.12158203125, - "learning_rate": 9.432759719566602e-08, - "loss": 0.0049, - "reward": 1.5703397989273071, - "reward_std": 0.06775246560573578, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44533979892730713, - "rewards/pad": 0.125, - "step": 2842 - }, - { - "completion_length": 44.4375, - "epoch": 0.9059910771191841, - "grad_norm": 101.51387023925781, - "kl": 0.263671875, - "learning_rate": 9.40089228808158e-08, - "loss": 0.0106, - "reward": 1.516183853149414, - "reward_std": 0.05424252897500992, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5161838531494141, - "rewards/pad": 0.0, - "step": 2843 - }, - { - "completion_length": 73.25, - "epoch": 0.9063097514340345, - "grad_norm": 31.52298355102539, - "kl": 0.1376953125, - "learning_rate": 9.369024856596558e-08, - "loss": 0.0055, - "reward": 1.6688685417175293, - "reward_std": 0.044357214123010635, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5438685417175293, - "step": 2844 - }, - { - "completion_length": 146.359375, - "epoch": 0.9066284257488847, - "grad_norm": 25.240934371948242, - "kl": 0.150390625, - "learning_rate": 9.337157425111536e-08, - "loss": 0.006, - "reward": 1.5053327083587646, - "reward_std": 0.05326319485902786, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5053327083587646, - "step": 2845 - }, - { - "completion_length": 95.09375, - "epoch": 0.9069471000637349, - "grad_norm": 32.554603576660156, - "kl": 0.16015625, - "learning_rate": 9.305289993626514e-08, - "loss": 0.0064, - "reward": 1.515863299369812, - "reward_std": 0.035547398030757904, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5158632397651672, - "rewards/pad": 0.0, - "step": 2846 - }, - { - "completion_length": 97.96875, - "epoch": 0.9072657743785851, - "grad_norm": 32.66326904296875, - "kl": 0.1982421875, - "learning_rate": 9.27342256214149e-08, - "loss": 0.0079, - "reward": 1.5872151851654053, - "reward_std": 0.07223374396562576, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4622151255607605, - "rewards/pad": 0.125, - "step": 2847 - }, - { - "completion_length": 120.578125, - "epoch": 0.9075844486934354, - "grad_norm": 15.716976165771484, - "kl": 0.150390625, - "learning_rate": 9.241555130656469e-08, - "loss": 0.006, - "reward": 1.4728636741638184, - "reward_std": 0.046222954988479614, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47286370396614075, - "step": 2848 - }, - { - "completion_length": 70.109375, - "epoch": 0.9079031230082856, - "grad_norm": 42.22808837890625, - "kl": 0.1494140625, - "learning_rate": 9.209687699171447e-08, - "loss": 0.006, - "reward": 1.7201340198516846, - "reward_std": 0.06590262055397034, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.47013407945632935, - "step": 2849 - }, - { - "completion_length": 72.703125, - "epoch": 0.9082217973231358, - "grad_norm": 104.17584228515625, - "kl": 0.318359375, - "learning_rate": 9.177820267686425e-08, - "loss": 0.0127, - "reward": 1.8309214115142822, - "reward_std": 0.15780913829803467, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6746714115142822, - "rewards/pad": 0.15625, - "step": 2850 - }, - { - "completion_length": 71.3125, - "epoch": 0.908540471637986, - "grad_norm": 27.67194938659668, - "kl": 0.1513671875, - "learning_rate": 9.145952836201403e-08, - "loss": 0.0061, - "reward": 1.5947067737579346, - "reward_std": 0.05079054832458496, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.46970680356025696, - "rewards/pad": 0.125, - "step": 2851 - }, - { - "completion_length": 149.40625, - "epoch": 0.9088591459528362, - "grad_norm": 34.95130920410156, - "kl": 0.11572265625, - "learning_rate": 9.11408540471638e-08, - "loss": 0.0046, - "reward": 1.626198410987854, - "reward_std": 0.052007220685482025, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.501198410987854, - "rewards/pad": 0.125, - "step": 2852 - }, - { - "completion_length": 172.75, - "epoch": 0.9091778202676865, - "grad_norm": 50.463539123535156, - "kl": 0.11083984375, - "learning_rate": 9.082217973231358e-08, - "loss": 0.0045, - "reward": 1.513413906097412, - "reward_std": 0.05457693338394165, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5134139657020569, - "step": 2853 - }, - { - "completion_length": 47.109375, - "epoch": 0.9094964945825367, - "grad_norm": 190.93516540527344, - "kl": 0.1591796875, - "learning_rate": 9.050350541746334e-08, - "loss": 0.0064, - "reward": 1.531948447227478, - "reward_std": 0.08213237673044205, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.28194835782051086, - "rewards/pad": 0.25, - "step": 2854 - }, - { - "completion_length": 93.71875, - "epoch": 0.9098151688973869, - "grad_norm": 33.00968933105469, - "kl": 0.1435546875, - "learning_rate": 9.018483110261312e-08, - "loss": 0.0057, - "reward": 1.6150282621383667, - "reward_std": 0.11225007474422455, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5056532621383667, - "step": 2855 - }, - { - "completion_length": 121.125, - "epoch": 0.9101338432122371, - "grad_norm": 45.09959411621094, - "kl": 0.125, - "learning_rate": 8.98661567877629e-08, - "loss": 0.005, - "reward": 1.4039934873580933, - "reward_std": 0.05160229280591011, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40399348735809326, - "step": 2856 - }, - { - "completion_length": 96.953125, - "epoch": 0.9104525175270873, - "grad_norm": 49.587623596191406, - "kl": 0.1357421875, - "learning_rate": 8.954748247291267e-08, - "loss": 0.0054, - "reward": 1.6635723114013672, - "reward_std": 0.12691214680671692, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5541973114013672, - "step": 2857 - }, - { - "completion_length": 121.125, - "epoch": 0.9107711918419376, - "grad_norm": 11.192237854003906, - "kl": 0.138671875, - "learning_rate": 8.922880815806245e-08, - "loss": 0.0056, - "reward": 1.44820237159729, - "reward_std": 0.044160421937704086, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44820237159729004, - "step": 2858 - }, - { - "completion_length": 74.921875, - "epoch": 0.9110898661567878, - "grad_norm": 24.94388771057129, - "kl": 0.265625, - "learning_rate": 8.891013384321223e-08, - "loss": 0.0106, - "reward": 1.7816369533538818, - "reward_std": 0.1505826860666275, - "rewards/answer_reward": 0.359375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4222618639469147, - "step": 2859 - }, - { - "completion_length": 148.90625, - "epoch": 0.911408540471638, - "grad_norm": 16.26469612121582, - "kl": 0.115234375, - "learning_rate": 8.859145952836201e-08, - "loss": 0.0046, - "reward": 1.532886028289795, - "reward_std": 0.059405386447906494, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4078860282897949, - "step": 2860 - }, - { - "completion_length": 122.25, - "epoch": 0.9117272147864882, - "grad_norm": 52.324886322021484, - "kl": 0.1943359375, - "learning_rate": 8.827278521351178e-08, - "loss": 0.0078, - "reward": 1.5306769609451294, - "reward_std": 0.0913526713848114, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4056769013404846, - "step": 2861 - }, - { - "completion_length": 97.953125, - "epoch": 0.9120458891013384, - "grad_norm": 56.95656967163086, - "kl": 0.125, - "learning_rate": 8.795411089866156e-08, - "loss": 0.005, - "reward": 1.6419885158538818, - "reward_std": 0.09374599158763885, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5326135754585266, - "step": 2862 - }, - { - "completion_length": 69.828125, - "epoch": 0.9123645634161887, - "grad_norm": 17.65058135986328, - "kl": 0.1513671875, - "learning_rate": 8.763543658381134e-08, - "loss": 0.0061, - "reward": 1.6570945978164673, - "reward_std": 0.1054392158985138, - "rewards/answer_reward": 0.21875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4383445084095001, - "step": 2863 - }, - { - "completion_length": 95.640625, - "epoch": 0.9126832377310389, - "grad_norm": 27.415205001831055, - "kl": 0.169921875, - "learning_rate": 8.731676226896112e-08, - "loss": 0.0068, - "reward": 1.5892078876495361, - "reward_std": 0.10721639543771744, - "rewards/pad": 0.140625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4485829174518585, - "step": 2864 - }, - { - "completion_length": 68.609375, - "epoch": 0.9130019120458891, - "grad_norm": 67.93570709228516, - "kl": 0.205078125, - "learning_rate": 8.69980879541109e-08, - "loss": 0.0082, - "reward": 1.531421184539795, - "reward_std": 0.08368706703186035, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5314211845397949, - "rewards/pad": 0.0, - "step": 2865 - }, - { - "completion_length": 71.671875, - "epoch": 0.9133205863607393, - "grad_norm": 56.584232330322266, - "kl": 0.302734375, - "learning_rate": 8.667941363926067e-08, - "loss": 0.0121, - "reward": 1.8283965587615967, - "reward_std": 0.08539590984582901, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5783965587615967, - "rewards/pad": 0.25, - "step": 2866 - }, - { - "completion_length": 95.90625, - "epoch": 0.9136392606755895, - "grad_norm": 18.290477752685547, - "kl": 0.17578125, - "learning_rate": 8.636073932441045e-08, - "loss": 0.007, - "reward": 1.740995168685913, - "reward_std": 0.10548253357410431, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6159951686859131, - "rewards/pad": 0.125, - "step": 2867 - }, - { - "completion_length": 70.0625, - "epoch": 0.9139579349904398, - "grad_norm": 77.42828369140625, - "kl": 0.1455078125, - "learning_rate": 8.604206500956023e-08, - "loss": 0.0058, - "reward": 1.4377033710479736, - "reward_std": 0.07341436296701431, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4377034902572632, - "rewards/pad": 0.0, - "step": 2868 - }, - { - "completion_length": 70.109375, - "epoch": 0.91427660930529, - "grad_norm": 26.113903045654297, - "kl": 0.2470703125, - "learning_rate": 8.572339069471e-08, - "loss": 0.0099, - "reward": 1.4969959259033203, - "reward_std": 0.14446821808815002, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5126208662986755, - "step": 2869 - }, - { - "completion_length": 96.609375, - "epoch": 0.9145952836201402, - "grad_norm": 110.2703628540039, - "kl": 0.1328125, - "learning_rate": 8.540471637985978e-08, - "loss": 0.0053, - "reward": 1.370757818222046, - "reward_std": 0.17480114102363586, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.2770078778266907, - "step": 2870 - }, - { - "completion_length": 69.96875, - "epoch": 0.9149139579349904, - "grad_norm": 27.96677017211914, - "kl": 0.396484375, - "learning_rate": 8.508604206500955e-08, - "loss": 0.0158, - "reward": 1.8911104202270508, - "reward_std": 0.07958716154098511, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6411104202270508, - "step": 2871 - }, - { - "completion_length": 97.015625, - "epoch": 0.9152326322498406, - "grad_norm": 60.4406623840332, - "kl": 0.86328125, - "learning_rate": 8.476736775015933e-08, - "loss": 0.0347, - "reward": 1.6432859897613525, - "reward_std": 0.04772375524044037, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5182859897613525, - "step": 2872 - }, - { - "completion_length": 122.515625, - "epoch": 0.9155513065646909, - "grad_norm": 52.94770812988281, - "kl": 0.0908203125, - "learning_rate": 8.444869343530911e-08, - "loss": 0.0036, - "reward": 1.4628794193267822, - "reward_std": 0.09584290534257889, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44725432991981506, - "step": 2873 - }, - { - "completion_length": 69.734375, - "epoch": 0.9158699808795411, - "grad_norm": 18.92051124572754, - "kl": 0.1494140625, - "learning_rate": 8.413001912045889e-08, - "loss": 0.006, - "reward": 1.5590465068817139, - "reward_std": 0.10996858775615692, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.5746715068817139, - "step": 2874 - }, - { - "completion_length": 72.265625, - "epoch": 0.9161886551943913, - "grad_norm": 22.928791046142578, - "kl": 0.142578125, - "learning_rate": 8.381134480560867e-08, - "loss": 0.0057, - "reward": 1.6328473091125488, - "reward_std": 0.09314089268445969, - "rewards/pad": 0.234375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3984721601009369, - "step": 2875 - }, - { - "completion_length": 97.296875, - "epoch": 0.9165073295092415, - "grad_norm": 55.735721588134766, - "kl": 0.1591796875, - "learning_rate": 8.349267049075843e-08, - "loss": 0.0064, - "reward": 1.683035135269165, - "reward_std": 0.13315176963806152, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5736602544784546, - "rewards/pad": 0.125, - "step": 2876 - }, - { - "completion_length": 97.1875, - "epoch": 0.9168260038240917, - "grad_norm": 21.763675689697266, - "kl": 0.14453125, - "learning_rate": 8.317399617590822e-08, - "loss": 0.0058, - "reward": 1.701788306236267, - "reward_std": 0.04371989145874977, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5767883062362671, - "rewards/pad": 0.125, - "step": 2877 - }, - { - "completion_length": 123.984375, - "epoch": 0.917144678138942, - "grad_norm": 34.83768081665039, - "kl": 0.11962890625, - "learning_rate": 8.2855321861058e-08, - "loss": 0.0048, - "reward": 1.6477702856063843, - "reward_std": 0.09868469834327698, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5696454048156738, - "rewards/pad": 0.078125, - "step": 2878 - }, - { - "completion_length": 122.109375, - "epoch": 0.9174633524537922, - "grad_norm": 15.59700870513916, - "kl": 0.1923828125, - "learning_rate": 8.253664754620778e-08, - "loss": 0.0077, - "reward": 1.4150080680847168, - "reward_std": 0.045275501906871796, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41500794887542725, - "step": 2879 - }, - { - "completion_length": 70.421875, - "epoch": 0.9177820267686424, - "grad_norm": 33.2744255065918, - "kl": 0.162109375, - "learning_rate": 8.221797323135756e-08, - "loss": 0.0065, - "reward": 1.5689854621887207, - "reward_std": 0.04865889623761177, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5689855217933655, - "rewards/pad": 0.0, - "step": 2880 - }, - { - "completion_length": 71.125, - "epoch": 0.9181007010834926, - "grad_norm": 103.49323272705078, - "kl": 0.11767578125, - "learning_rate": 8.189929891650732e-08, - "loss": 0.0047, - "reward": 1.81866455078125, - "reward_std": 0.08594033122062683, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5842896103858948, - "step": 2881 - }, - { - "completion_length": 70.984375, - "epoch": 0.9184193753983428, - "grad_norm": 20.405597686767578, - "kl": 0.18359375, - "learning_rate": 8.15806246016571e-08, - "loss": 0.0074, - "reward": 1.776833176612854, - "reward_std": 0.06381825357675552, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.651833176612854, - "rewards/pad": 0.125, - "step": 2882 - }, - { - "completion_length": 121.546875, - "epoch": 0.9187380497131931, - "grad_norm": 71.76720428466797, - "kl": 0.2314453125, - "learning_rate": 8.126195028680689e-08, - "loss": 0.0093, - "reward": 1.5695829391479492, - "reward_std": 0.07406845688819885, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5695829391479492, - "step": 2883 - }, - { - "completion_length": 121.515625, - "epoch": 0.9190567240280434, - "grad_norm": 19.956207275390625, - "kl": 0.1142578125, - "learning_rate": 8.094327597195667e-08, - "loss": 0.0046, - "reward": 1.5454630851745605, - "reward_std": 0.042439982295036316, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5454631447792053, - "rewards/pad": 0.0, - "step": 2884 - }, - { - "completion_length": 69.90625, - "epoch": 0.9193753983428936, - "grad_norm": 26.49988555908203, - "kl": 0.2421875, - "learning_rate": 8.062460165710643e-08, - "loss": 0.0097, - "reward": 1.7898268699645996, - "reward_std": 0.09240086376667023, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6648268699645996, - "step": 2885 - }, - { - "completion_length": 69.5, - "epoch": 0.9196940726577438, - "grad_norm": 14.298598289489746, - "kl": 0.1943359375, - "learning_rate": 8.03059273422562e-08, - "loss": 0.0078, - "reward": 1.4611117839813232, - "reward_std": 0.03294749557971954, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.461111843585968, - "rewards/pad": 0.0, - "step": 2886 - }, - { - "completion_length": 73.71875, - "epoch": 0.920012746972594, - "grad_norm": 68.65585327148438, - "kl": 0.2216796875, - "learning_rate": 7.998725302740598e-08, - "loss": 0.0089, - "reward": 1.6945104598999023, - "reward_std": 0.15323123335838318, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.46013540029525757, - "rewards/pad": 0.25, - "step": 2887 - }, - { - "completion_length": 99.375, - "epoch": 0.9203314212874443, - "grad_norm": 82.1524429321289, - "kl": 0.150390625, - "learning_rate": 7.966857871255576e-08, - "loss": 0.006, - "reward": 1.7663450241088867, - "reward_std": 0.06288193166255951, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6413450241088867, - "step": 2888 - }, - { - "completion_length": 97.890625, - "epoch": 0.9206500956022945, - "grad_norm": 38.29640579223633, - "kl": 0.34765625, - "learning_rate": 7.934990439770554e-08, - "loss": 0.0139, - "reward": 1.614652156829834, - "reward_std": 0.08598000556230545, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3802771270275116, - "rewards/pad": 0.234375, - "step": 2889 - }, - { - "completion_length": 72.265625, - "epoch": 0.9209687699171447, - "grad_norm": 22.514677047729492, - "kl": 0.162109375, - "learning_rate": 7.903123008285532e-08, - "loss": 0.0065, - "reward": 1.739565372467041, - "reward_std": 0.0939113050699234, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5051903128623962, - "rewards/pad": 0.234375, - "step": 2890 - }, - { - "completion_length": 72.640625, - "epoch": 0.9212874442319949, - "grad_norm": 27.841285705566406, - "kl": 0.208984375, - "learning_rate": 7.871255576800509e-08, - "loss": 0.0084, - "reward": 1.7863534688949585, - "reward_std": 0.07173985242843628, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5363534688949585, - "step": 2891 - }, - { - "completion_length": 117.75, - "epoch": 0.9216061185468452, - "grad_norm": 30.70351791381836, - "kl": 0.142578125, - "learning_rate": 7.839388145315487e-08, - "loss": 0.0057, - "reward": 1.5791267156600952, - "reward_std": 0.041720978915691376, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5791267156600952, - "step": 2892 - }, - { - "completion_length": 95.484375, - "epoch": 0.9219247928616954, - "grad_norm": 29.678829193115234, - "kl": 0.142578125, - "learning_rate": 7.807520713830465e-08, - "loss": 0.0057, - "reward": 1.6197293996810913, - "reward_std": 0.059633661061525345, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6197293996810913, - "step": 2893 - }, - { - "completion_length": 121.0625, - "epoch": 0.9222434671765456, - "grad_norm": 15.231504440307617, - "kl": 0.115234375, - "learning_rate": 7.775653282345443e-08, - "loss": 0.0046, - "reward": 1.5021556615829468, - "reward_std": 0.06919392943382263, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5021557211875916, - "step": 2894 - }, - { - "completion_length": 43.28125, - "epoch": 0.9225621414913958, - "grad_norm": 66.6026382446289, - "kl": 0.310546875, - "learning_rate": 7.743785850860421e-08, - "loss": 0.0124, - "reward": 1.734342098236084, - "reward_std": 0.08491496741771698, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7343420386314392, - "rewards/pad": 0.0, - "step": 2895 - }, - { - "completion_length": 120.59375, - "epoch": 0.922880815806246, - "grad_norm": 10.146793365478516, - "kl": 0.2197265625, - "learning_rate": 7.711918419375398e-08, - "loss": 0.0088, - "reward": 1.4570331573486328, - "reward_std": 0.08783900737762451, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4570331573486328, - "step": 2896 - }, - { - "completion_length": 68.21875, - "epoch": 0.9231994901210963, - "grad_norm": 218.56849670410156, - "kl": 0.326171875, - "learning_rate": 7.680050987890376e-08, - "loss": 0.013, - "reward": 1.6151440143585205, - "reward_std": 0.058759938925504684, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6151441335678101, - "rewards/pad": 0.0, - "step": 2897 - }, - { - "completion_length": 120.328125, - "epoch": 0.9235181644359465, - "grad_norm": 47.88025665283203, - "kl": 0.2294921875, - "learning_rate": 7.648183556405354e-08, - "loss": 0.0092, - "reward": 1.5233371257781982, - "reward_std": 0.0550679937005043, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5233370661735535, - "step": 2898 - }, - { - "completion_length": 149.4375, - "epoch": 0.9238368387507967, - "grad_norm": 12.78074836730957, - "kl": 0.1220703125, - "learning_rate": 7.616316124920332e-08, - "loss": 0.0049, - "reward": 1.5040568113327026, - "reward_std": 0.03877232223749161, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5040568113327026, - "step": 2899 - }, - { - "completion_length": 94.640625, - "epoch": 0.9241555130656469, - "grad_norm": 41.853092193603516, - "kl": 0.25, - "learning_rate": 7.584448693435309e-08, - "loss": 0.01, - "reward": 1.4440009593963623, - "reward_std": 0.07161448895931244, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4440010190010071, - "rewards/pad": 0.0, - "step": 2900 - }, - { - "completion_length": 70.28125, - "epoch": 0.9244741873804971, - "grad_norm": 26.74795913696289, - "kl": 0.193359375, - "learning_rate": 7.552581261950285e-08, - "loss": 0.0077, - "reward": 1.7943412065505981, - "reward_std": 0.06383783370256424, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5443412661552429, - "step": 2901 - }, - { - "completion_length": 72.40625, - "epoch": 0.9247928616953474, - "grad_norm": 28.704776763916016, - "kl": 0.140625, - "learning_rate": 7.520713830465264e-08, - "loss": 0.0056, - "reward": 1.8878512382507324, - "reward_std": 0.09012124687433243, - "rewards/answer_reward": 0.359375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5284762978553772, - "step": 2902 - }, - { - "completion_length": 100.75, - "epoch": 0.9251115360101976, - "grad_norm": 20.47161293029785, - "kl": 0.1005859375, - "learning_rate": 7.488846398980242e-08, - "loss": 0.004, - "reward": 1.5786961317062378, - "reward_std": 0.0906924456357956, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.4693211615085602, - "step": 2903 - }, - { - "completion_length": 45.625, - "epoch": 0.9254302103250478, - "grad_norm": 31.97844123840332, - "kl": 0.2314453125, - "learning_rate": 7.45697896749522e-08, - "loss": 0.0093, - "reward": 1.838749647140503, - "reward_std": 0.13938641548156738, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5887496471405029, - "rewards/pad": 0.25, - "step": 2904 - }, - { - "completion_length": 68.390625, - "epoch": 0.925748884639898, - "grad_norm": 65.3585433959961, - "kl": 0.13671875, - "learning_rate": 7.425111536010198e-08, - "loss": 0.0055, - "reward": 1.760368824005127, - "reward_std": 0.05466555058956146, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6353688836097717, - "step": 2905 - }, - { - "completion_length": 95.734375, - "epoch": 0.9260675589547482, - "grad_norm": 15.4692964553833, - "kl": 0.1328125, - "learning_rate": 7.393244104525174e-08, - "loss": 0.0053, - "reward": 1.659071922302246, - "reward_std": 0.041400231420993805, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4090718626976013, - "rewards/pad": 0.25, - "step": 2906 - }, - { - "completion_length": 43.453125, - "epoch": 0.9263862332695985, - "grad_norm": 294.6001281738281, - "kl": 0.1875, - "learning_rate": 7.361376673040152e-08, - "loss": 0.0075, - "reward": 1.5708719491958618, - "reward_std": 0.04681427776813507, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.570871889591217, - "rewards/pad": 0.0, - "step": 2907 - }, - { - "completion_length": 94.9375, - "epoch": 0.9267049075844487, - "grad_norm": 48.028587341308594, - "kl": 0.1630859375, - "learning_rate": 7.32950924155513e-08, - "loss": 0.0065, - "reward": 1.6135939359664917, - "reward_std": 0.049565255641937256, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6135939955711365, - "step": 2908 - }, - { - "completion_length": 71.578125, - "epoch": 0.9270235818992989, - "grad_norm": 37.10649871826172, - "kl": 0.154296875, - "learning_rate": 7.297641810070109e-08, - "loss": 0.0062, - "reward": 1.6163232326507568, - "reward_std": 0.07260958850383759, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4913232922554016, - "step": 2909 - }, - { - "completion_length": 92.828125, - "epoch": 0.9273422562141491, - "grad_norm": 154.8212432861328, - "kl": 0.515625, - "learning_rate": 7.265774378585087e-08, - "loss": 0.0206, - "reward": 1.5289678573608398, - "reward_std": 0.1055968776345253, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5289679765701294, - "rewards/pad": 0.0, - "step": 2910 - }, - { - "completion_length": 95.328125, - "epoch": 0.9276609305289993, - "grad_norm": 41.80696105957031, - "kl": 0.279296875, - "learning_rate": 7.233906947100063e-08, - "loss": 0.0112, - "reward": 1.5693634748458862, - "reward_std": 0.04554504156112671, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5693634748458862, - "step": 2911 - }, - { - "completion_length": 44.703125, - "epoch": 0.9279796048438496, - "grad_norm": 450.05029296875, - "kl": 0.27734375, - "learning_rate": 7.202039515615041e-08, - "loss": 0.0111, - "reward": 1.714645266532898, - "reward_std": 0.12786036729812622, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46464529633522034, - "step": 2912 - }, - { - "completion_length": 124.328125, - "epoch": 0.9282982791586998, - "grad_norm": 31.789283752441406, - "kl": 0.12255859375, - "learning_rate": 7.17017208413002e-08, - "loss": 0.0049, - "reward": 1.5354914665222168, - "reward_std": 0.03460142761468887, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5354914665222168, - "step": 2913 - }, - { - "completion_length": 172.9375, - "epoch": 0.92861695347355, - "grad_norm": 36.49752426147461, - "kl": 0.08935546875, - "learning_rate": 7.138304652644997e-08, - "loss": 0.0036, - "reward": 1.5260934829711914, - "reward_std": 0.03774914890527725, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5260934233665466, - "step": 2914 - }, - { - "completion_length": 72.15625, - "epoch": 0.9289356277884002, - "grad_norm": 44.16923904418945, - "kl": 0.1591796875, - "learning_rate": 7.106437221159973e-08, - "loss": 0.0064, - "reward": 1.7683162689208984, - "reward_std": 0.07336032390594482, - "rewards/pad": 0.015625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.7526911497116089, - "step": 2915 - }, - { - "completion_length": 97.96875, - "epoch": 0.9292543021032504, - "grad_norm": 50.224979400634766, - "kl": 0.12890625, - "learning_rate": 7.074569789674951e-08, - "loss": 0.0051, - "reward": 1.6346163749694824, - "reward_std": 0.05000638589262962, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.38461625576019287, - "step": 2916 - }, - { - "completion_length": 96.953125, - "epoch": 0.9295729764181007, - "grad_norm": 92.64608764648438, - "kl": 0.1572265625, - "learning_rate": 7.042702358189929e-08, - "loss": 0.0063, - "reward": 1.6365630626678467, - "reward_std": 0.05900702625513077, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5115630626678467, - "step": 2917 - }, - { - "completion_length": 119.40625, - "epoch": 0.9298916507329509, - "grad_norm": 66.34664916992188, - "kl": 0.1318359375, - "learning_rate": 7.010834926704907e-08, - "loss": 0.0053, - "reward": 1.4824174642562866, - "reward_std": 0.1585683822631836, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.5136675238609314, - "rewards/pad": 0.0, - "step": 2918 - }, - { - "completion_length": 153.40625, - "epoch": 0.9302103250478011, - "grad_norm": 13.153512001037598, - "kl": 0.0927734375, - "learning_rate": 6.978967495219885e-08, - "loss": 0.0037, - "reward": 1.5633649826049805, - "reward_std": 0.05241779237985611, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43836501240730286, - "step": 2919 - }, - { - "completion_length": 96.3125, - "epoch": 0.9305289993626513, - "grad_norm": 17.02448081970215, - "kl": 0.158203125, - "learning_rate": 6.947100063734862e-08, - "loss": 0.0063, - "reward": 1.6069416999816895, - "reward_std": 0.05798110365867615, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48194175958633423, - "rewards/pad": 0.125, - "step": 2920 - }, - { - "completion_length": 118.375, - "epoch": 0.9308476736775015, - "grad_norm": 44.8119010925293, - "kl": 0.12890625, - "learning_rate": 6.91523263224984e-08, - "loss": 0.0051, - "reward": 1.5962743759155273, - "reward_std": 0.09969855844974518, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5962744951248169, - "rewards/pad": 0.0, - "step": 2921 - }, - { - "completion_length": 119.546875, - "epoch": 0.9311663479923518, - "grad_norm": 33.66161346435547, - "kl": 0.150390625, - "learning_rate": 6.883365200764818e-08, - "loss": 0.006, - "reward": 1.5806576013565063, - "reward_std": 0.06134621053934097, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5806576013565063, - "rewards/pad": 0.0, - "step": 2922 - }, - { - "completion_length": 72.015625, - "epoch": 0.9314850223072021, - "grad_norm": 51.13123321533203, - "kl": 0.17578125, - "learning_rate": 6.851497769279796e-08, - "loss": 0.0071, - "reward": 1.6464474201202393, - "reward_std": 0.1740022897720337, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4276975095272064, - "rewards/pad": 0.234375, - "step": 2923 - }, - { - "completion_length": 121.71875, - "epoch": 0.9318036966220523, - "grad_norm": 33.446937561035156, - "kl": 0.201171875, - "learning_rate": 6.819630337794774e-08, - "loss": 0.0081, - "reward": 1.6526374816894531, - "reward_std": 0.136085644364357, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5432624220848083, - "rewards/pad": 0.125, - "step": 2924 - }, - { - "completion_length": 123.46875, - "epoch": 0.9321223709369025, - "grad_norm": 107.50666046142578, - "kl": 0.169921875, - "learning_rate": 6.787762906309751e-08, - "loss": 0.0068, - "reward": 1.502575159072876, - "reward_std": 0.05350276455283165, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5025750994682312, - "step": 2925 - }, - { - "completion_length": 69.59375, - "epoch": 0.9324410452517528, - "grad_norm": 110.6546630859375, - "kl": 0.19140625, - "learning_rate": 6.755895474824729e-08, - "loss": 0.0076, - "reward": 1.6307780742645264, - "reward_std": 0.03509455546736717, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5057780742645264, - "step": 2926 - }, - { - "completion_length": 72.0, - "epoch": 0.932759719566603, - "grad_norm": 67.64140319824219, - "kl": 0.1630859375, - "learning_rate": 6.724028043339707e-08, - "loss": 0.0065, - "reward": 1.6671245098114014, - "reward_std": 0.18406322598457336, - "rewards/answer_reward": 0.1875, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4796244502067566, - "step": 2927 - }, - { - "completion_length": 72.671875, - "epoch": 0.9330783938814532, - "grad_norm": 16.61734390258789, - "kl": 0.171875, - "learning_rate": 6.692160611854685e-08, - "loss": 0.0068, - "reward": 1.606482982635498, - "reward_std": 0.06252360343933105, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.48148295283317566, - "step": 2928 - }, - { - "completion_length": 122.390625, - "epoch": 0.9333970681963034, - "grad_norm": 29.119325637817383, - "kl": 0.2353515625, - "learning_rate": 6.660293180369663e-08, - "loss": 0.0094, - "reward": 1.6139140129089355, - "reward_std": 0.09718403220176697, - "rewards/pad": 0.078125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5357890725135803, - "step": 2929 - }, - { - "completion_length": 122.75, - "epoch": 0.9337157425111536, - "grad_norm": 45.781211853027344, - "kl": 0.173828125, - "learning_rate": 6.62842574888464e-08, - "loss": 0.0069, - "reward": 1.4269945621490479, - "reward_std": 0.04262663424015045, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4269945025444031, - "step": 2930 - }, - { - "completion_length": 69.9375, - "epoch": 0.9340344168260039, - "grad_norm": 25.132644653320312, - "kl": 0.1650390625, - "learning_rate": 6.596558317399616e-08, - "loss": 0.0066, - "reward": 1.6815913915634155, - "reward_std": 0.06493282318115234, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5565913915634155, - "rewards/pad": 0.125, - "step": 2931 - }, - { - "completion_length": 97.828125, - "epoch": 0.9343530911408541, - "grad_norm": 38.049644470214844, - "kl": 0.1298828125, - "learning_rate": 6.564690885914594e-08, - "loss": 0.0052, - "reward": 1.6862627267837524, - "reward_std": 0.14579840004444122, - "rewards/answer_reward": 0.265625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4206376075744629, - "step": 2932 - }, - { - "completion_length": 45.171875, - "epoch": 0.9346717654557043, - "grad_norm": 22.782184600830078, - "kl": 0.1923828125, - "learning_rate": 6.532823454429572e-08, - "loss": 0.0077, - "reward": 1.612952470779419, - "reward_std": 0.07671575248241425, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4879525303840637, - "step": 2933 - }, - { - "completion_length": 122.609375, - "epoch": 0.9349904397705545, - "grad_norm": 38.51630783081055, - "kl": 0.1708984375, - "learning_rate": 6.50095602294455e-08, - "loss": 0.0068, - "reward": 1.7245512008666992, - "reward_std": 0.03792456537485123, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4745512008666992, - "step": 2934 - }, - { - "completion_length": 124.828125, - "epoch": 0.9353091140854047, - "grad_norm": 129.76730346679688, - "kl": 0.1298828125, - "learning_rate": 6.469088591459527e-08, - "loss": 0.0052, - "reward": 1.666576862335205, - "reward_std": 0.16981662809848785, - "rewards/pad": 0.1875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4790768623352051, - "step": 2935 - }, - { - "completion_length": 93.703125, - "epoch": 0.935627788400255, - "grad_norm": 120.7542953491211, - "kl": 0.12451171875, - "learning_rate": 6.437221159974505e-08, - "loss": 0.005, - "reward": 1.6693174839019775, - "reward_std": 0.05616677179932594, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6693174839019775, - "step": 2936 - }, - { - "completion_length": 122.25, - "epoch": 0.9359464627151052, - "grad_norm": 35.39690399169922, - "kl": 0.09033203125, - "learning_rate": 6.405353728489483e-08, - "loss": 0.0036, - "reward": 1.6234962940216064, - "reward_std": 0.03904394805431366, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4984963834285736, - "step": 2937 - }, - { - "completion_length": 45.984375, - "epoch": 0.9362651370299554, - "grad_norm": 987.269287109375, - "kl": 0.1728515625, - "learning_rate": 6.373486297004461e-08, - "loss": 0.0069, - "reward": 1.6837446689605713, - "reward_std": 0.05997525155544281, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5587445497512817, - "rewards/pad": 0.125, - "step": 2938 - }, - { - "completion_length": 92.6875, - "epoch": 0.9365838113448056, - "grad_norm": 42.27017593383789, - "kl": 0.44140625, - "learning_rate": 6.34161886551944e-08, - "loss": 0.0176, - "reward": 1.6802515983581543, - "reward_std": 0.05614824220538139, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6802517175674438, - "rewards/pad": 0.0, - "step": 2939 - }, - { - "completion_length": 145.890625, - "epoch": 0.9369024856596558, - "grad_norm": 27.89042854309082, - "kl": 0.1533203125, - "learning_rate": 6.309751434034416e-08, - "loss": 0.0061, - "reward": 1.5655685663223267, - "reward_std": 0.11378233879804611, - "rewards/format_reward_tg": 0.96875, - "rewards/iou_timestamp_reward": 0.47181859612464905, - "rewards/pad": 0.125, - "step": 2940 - }, - { - "completion_length": 146.578125, - "epoch": 0.9372211599745061, - "grad_norm": 35.095279693603516, - "kl": 0.0791015625, - "learning_rate": 6.277884002549394e-08, - "loss": 0.0032, - "reward": 1.510606050491333, - "reward_std": 0.10268741101026535, - "rewards/answer_reward": 0.03125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4793560206890106, - "step": 2941 - }, - { - "completion_length": 44.765625, - "epoch": 0.9375398342893563, - "grad_norm": 54.84659957885742, - "kl": 0.2421875, - "learning_rate": 6.246016571064372e-08, - "loss": 0.0097, - "reward": 1.5426725149154663, - "reward_std": 0.05223310738801956, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.2926725149154663, - "rewards/pad": 0.25, - "step": 2942 - }, - { - "completion_length": 96.875, - "epoch": 0.9378585086042065, - "grad_norm": 54.6537971496582, - "kl": 0.1455078125, - "learning_rate": 6.21414913957935e-08, - "loss": 0.0058, - "reward": 1.5120577812194824, - "reward_std": 0.02403687685728073, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5120577812194824, - "rewards/pad": 0.0, - "step": 2943 - }, - { - "completion_length": 95.09375, - "epoch": 0.9381771829190567, - "grad_norm": 23.36185073852539, - "kl": 0.12353515625, - "learning_rate": 6.182281708094327e-08, - "loss": 0.0049, - "reward": 1.7600469589233398, - "reward_std": 0.058877404779195786, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5100470185279846, - "step": 2944 - }, - { - "completion_length": 95.484375, - "epoch": 0.9384958572339069, - "grad_norm": 17.236286163330078, - "kl": 0.1162109375, - "learning_rate": 6.150414276609305e-08, - "loss": 0.0047, - "reward": 1.5400110483169556, - "reward_std": 0.07531306147575378, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5400110483169556, - "step": 2945 - }, - { - "completion_length": 95.984375, - "epoch": 0.9388145315487572, - "grad_norm": 21.27970314025879, - "kl": 0.248046875, - "learning_rate": 6.118546845124282e-08, - "loss": 0.0099, - "reward": 1.5612072944641113, - "reward_std": 0.08073738217353821, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5612072944641113, - "step": 2946 - }, - { - "completion_length": 121.6875, - "epoch": 0.9391332058636074, - "grad_norm": 67.81165313720703, - "kl": 0.14453125, - "learning_rate": 6.08667941363926e-08, - "loss": 0.0058, - "reward": 1.5851449966430664, - "reward_std": 0.06143315136432648, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5851449966430664, - "step": 2947 - }, - { - "completion_length": 122.328125, - "epoch": 0.9394518801784576, - "grad_norm": 11.614081382751465, - "kl": 0.1083984375, - "learning_rate": 6.054811982154238e-08, - "loss": 0.0043, - "reward": 1.7119860649108887, - "reward_std": 0.036694057285785675, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5869860649108887, - "step": 2948 - }, - { - "completion_length": 95.203125, - "epoch": 0.9397705544933078, - "grad_norm": 51.55110549926758, - "kl": 0.1484375, - "learning_rate": 6.022944550669216e-08, - "loss": 0.0059, - "reward": 1.535896897315979, - "reward_std": 0.08128312230110168, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5358968377113342, - "step": 2949 - }, - { - "completion_length": 123.015625, - "epoch": 0.940089228808158, - "grad_norm": 29.273845672607422, - "kl": 0.205078125, - "learning_rate": 5.991077119184194e-08, - "loss": 0.0082, - "reward": 1.498410940170288, - "reward_std": 0.06493903696537018, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3734109699726105, - "step": 2950 - }, - { - "completion_length": 118.3125, - "epoch": 0.9404079031230083, - "grad_norm": 19.915864944458008, - "kl": 0.12158203125, - "learning_rate": 5.959209687699171e-08, - "loss": 0.0049, - "reward": 1.4336977005004883, - "reward_std": 0.036178942769765854, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4336977005004883, - "step": 2951 - }, - { - "completion_length": 94.671875, - "epoch": 0.9407265774378585, - "grad_norm": 54.04530334472656, - "kl": 0.12109375, - "learning_rate": 5.927342256214149e-08, - "loss": 0.0048, - "reward": 1.5930801630020142, - "reward_std": 0.03752660006284714, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.468080073595047, - "rewards/pad": 0.125, - "step": 2952 - }, - { - "completion_length": 44.96875, - "epoch": 0.9410452517527087, - "grad_norm": 52.64925765991211, - "kl": 0.1611328125, - "learning_rate": 5.895474824729126e-08, - "loss": 0.0064, - "reward": 1.6158406734466553, - "reward_std": 0.04798610508441925, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4908406138420105, - "rewards/pad": 0.125, - "step": 2953 - }, - { - "completion_length": 147.421875, - "epoch": 0.9413639260675589, - "grad_norm": 35.65459060668945, - "kl": 0.162109375, - "learning_rate": 5.863607393244104e-08, - "loss": 0.0065, - "reward": 1.509047269821167, - "reward_std": 0.051846154034137726, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.509047269821167, - "step": 2954 - }, - { - "completion_length": 123.890625, - "epoch": 0.9416826003824091, - "grad_norm": 88.2826919555664, - "kl": 0.126953125, - "learning_rate": 5.831739961759082e-08, - "loss": 0.0051, - "reward": 1.6294455528259277, - "reward_std": 0.029041368514299393, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.37944549322128296, - "rewards/pad": 0.25, - "step": 2955 - }, - { - "completion_length": 69.296875, - "epoch": 0.9420012746972594, - "grad_norm": 143.51040649414062, - "kl": 0.1435546875, - "learning_rate": 5.79987253027406e-08, - "loss": 0.0058, - "reward": 1.628396987915039, - "reward_std": 0.08412352204322815, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5033970475196838, - "step": 2956 - }, - { - "completion_length": 97.578125, - "epoch": 0.9423199490121096, - "grad_norm": 30.71607208251953, - "kl": 0.251953125, - "learning_rate": 5.768005098789038e-08, - "loss": 0.0101, - "reward": 1.593425989151001, - "reward_std": 0.07173962891101837, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5934259295463562, - "step": 2957 - }, - { - "completion_length": 45.09375, - "epoch": 0.9426386233269598, - "grad_norm": 40.68990707397461, - "kl": 0.150390625, - "learning_rate": 5.7361376673040145e-08, - "loss": 0.006, - "reward": 1.5776606798171997, - "reward_std": 0.09257933497428894, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4526606798171997, - "rewards/pad": 0.125, - "step": 2958 - }, - { - "completion_length": 71.46875, - "epoch": 0.94295729764181, - "grad_norm": 46.51622772216797, - "kl": 0.1767578125, - "learning_rate": 5.7042702358189925e-08, - "loss": 0.0071, - "reward": 1.6765297651290894, - "reward_std": 0.08185146749019623, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5515297055244446, - "rewards/pad": 0.125, - "step": 2959 - }, - { - "completion_length": 70.546875, - "epoch": 0.9432759719566602, - "grad_norm": 14.74997615814209, - "kl": 0.197265625, - "learning_rate": 5.6724028043339706e-08, - "loss": 0.0079, - "reward": 1.6244655847549438, - "reward_std": 0.043617237359285355, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4994656443595886, - "rewards/pad": 0.125, - "step": 2960 - }, - { - "completion_length": 95.28125, - "epoch": 0.9435946462715105, - "grad_norm": 33.05752944946289, - "kl": 0.125, - "learning_rate": 5.640535372848948e-08, - "loss": 0.005, - "reward": 1.6229265928268433, - "reward_std": 0.044810131192207336, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6229265928268433, - "step": 2961 - }, - { - "completion_length": 71.890625, - "epoch": 0.9439133205863608, - "grad_norm": 40.47278594970703, - "kl": 0.1640625, - "learning_rate": 5.608667941363926e-08, - "loss": 0.0066, - "reward": 1.8271048069000244, - "reward_std": 0.0722237378358841, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.577104926109314, - "rewards/pad": 0.25, - "step": 2962 - }, - { - "completion_length": 95.453125, - "epoch": 0.944231994901211, - "grad_norm": 18.345399856567383, - "kl": 0.15234375, - "learning_rate": 5.5768005098789034e-08, - "loss": 0.0061, - "reward": 1.6857061386108398, - "reward_std": 0.07458589226007462, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5607060194015503, - "rewards/pad": 0.125, - "step": 2963 - }, - { - "completion_length": 145.796875, - "epoch": 0.9445506692160612, - "grad_norm": 90.41543579101562, - "kl": 0.1220703125, - "learning_rate": 5.5449330783938815e-08, - "loss": 0.0049, - "reward": 1.5884913206100464, - "reward_std": 0.10379238426685333, - "rewards/pad": 0.0625, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5259913206100464, - "step": 2964 - }, - { - "completion_length": 47.265625, - "epoch": 0.9448693435309115, - "grad_norm": 23.373537063598633, - "kl": 0.251953125, - "learning_rate": 5.5130656469088595e-08, - "loss": 0.0101, - "reward": 1.7719521522521973, - "reward_std": 0.07292000204324722, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5219520330429077, - "step": 2965 - }, - { - "completion_length": 118.671875, - "epoch": 0.9451880178457617, - "grad_norm": 53.581172943115234, - "kl": 0.1484375, - "learning_rate": 5.481198215423836e-08, - "loss": 0.0059, - "reward": 1.7567198276519775, - "reward_std": 0.10402566194534302, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5379699468612671, - "rewards/pad": 0.21875, - "step": 2966 - }, - { - "completion_length": 99.46875, - "epoch": 0.9455066921606119, - "grad_norm": 42.33812713623047, - "kl": 0.119140625, - "learning_rate": 5.449330783938814e-08, - "loss": 0.0048, - "reward": 1.5766716003417969, - "reward_std": 0.046387527137994766, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4516715407371521, - "step": 2967 - }, - { - "completion_length": 95.328125, - "epoch": 0.9458253664754621, - "grad_norm": 17.89882469177246, - "kl": 0.140625, - "learning_rate": 5.417463352453792e-08, - "loss": 0.0056, - "reward": 1.7956726551055908, - "reward_std": 0.06366585195064545, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.545672595500946, - "rewards/pad": 0.25, - "step": 2968 - }, - { - "completion_length": 93.09375, - "epoch": 0.9461440407903123, - "grad_norm": 25.676315307617188, - "kl": 0.390625, - "learning_rate": 5.38559592096877e-08, - "loss": 0.0156, - "reward": 1.665808916091919, - "reward_std": 0.06176231428980827, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5408089756965637, - "rewards/pad": 0.125, - "step": 2969 - }, - { - "completion_length": 69.859375, - "epoch": 0.9464627151051626, - "grad_norm": 29.421112060546875, - "kl": 0.2119140625, - "learning_rate": 5.353728489483748e-08, - "loss": 0.0085, - "reward": 1.5956969261169434, - "reward_std": 0.08274352550506592, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5956969261169434, - "rewards/pad": 0.0, - "step": 2970 - }, - { - "completion_length": 71.625, - "epoch": 0.9467813894200128, - "grad_norm": 43.09214782714844, - "kl": 0.216796875, - "learning_rate": 5.321861057998725e-08, - "loss": 0.0087, - "reward": 1.5480172634124756, - "reward_std": 0.09477528929710388, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.4386422038078308, - "rewards/pad": 0.125, - "step": 2971 - }, - { - "completion_length": 70.75, - "epoch": 0.947100063734863, - "grad_norm": 47.417930603027344, - "kl": 0.18359375, - "learning_rate": 5.289993626513703e-08, - "loss": 0.0073, - "reward": 1.4876710176467896, - "reward_std": 0.039261188358068466, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.36267098784446716, - "rewards/pad": 0.125, - "step": 2972 - }, - { - "completion_length": 96.578125, - "epoch": 0.9474187380497132, - "grad_norm": 73.08779907226562, - "kl": 0.1669921875, - "learning_rate": 5.25812619502868e-08, - "loss": 0.0067, - "reward": 1.6365768909454346, - "reward_std": 0.06127271056175232, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.511576771736145, - "step": 2973 - }, - { - "completion_length": 173.515625, - "epoch": 0.9477374123645634, - "grad_norm": 37.521583557128906, - "kl": 0.11181640625, - "learning_rate": 5.226258763543658e-08, - "loss": 0.0045, - "reward": 1.3070462942123413, - "reward_std": 0.035560242831707, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3070463240146637, - "step": 2974 - }, - { - "completion_length": 150.0, - "epoch": 0.9480560866794137, - "grad_norm": 39.87615203857422, - "kl": 0.134765625, - "learning_rate": 5.1943913320586354e-08, - "loss": 0.0054, - "reward": 1.6112351417541504, - "reward_std": 0.07507481426000595, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5018600225448608, - "step": 2975 - }, - { - "completion_length": 20.921875, - "epoch": 0.9483747609942639, - "grad_norm": 108.36138916015625, - "kl": 0.1845703125, - "learning_rate": 5.1625239005736134e-08, - "loss": 0.0074, - "reward": 1.6514463424682617, - "reward_std": 0.15345437824726105, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5733214020729065, - "rewards/pad": 0.078125, - "step": 2976 - }, - { - "completion_length": 46.828125, - "epoch": 0.9486934353091141, - "grad_norm": 88.67145538330078, - "kl": 0.2119140625, - "learning_rate": 5.1306564690885915e-08, - "loss": 0.0084, - "reward": 1.8383784294128418, - "reward_std": 0.15417829155921936, - "rewards/answer_reward": 0.34375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4946284294128418, - "step": 2977 - }, - { - "completion_length": 94.109375, - "epoch": 0.9490121096239643, - "grad_norm": 93.40848541259766, - "kl": 0.1953125, - "learning_rate": 5.098789037603569e-08, - "loss": 0.0078, - "reward": 1.5676114559173584, - "reward_std": 0.08058977127075195, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5676113963127136, - "rewards/pad": 0.0, - "step": 2978 - }, - { - "completion_length": 71.578125, - "epoch": 0.9493307839388145, - "grad_norm": 140.27362060546875, - "kl": 0.125, - "learning_rate": 5.066921606118547e-08, - "loss": 0.005, - "reward": 1.673303246498108, - "reward_std": 0.04373828321695328, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6733032464981079, - "rewards/pad": 0.0, - "step": 2979 - }, - { - "completion_length": 123.796875, - "epoch": 0.9496494582536648, - "grad_norm": 22.28351593017578, - "kl": 0.1640625, - "learning_rate": 5.035054174633524e-08, - "loss": 0.0066, - "reward": 1.7183589935302734, - "reward_std": 0.09938669204711914, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.48398399353027344, - "rewards/pad": 0.234375, - "step": 2980 - }, - { - "completion_length": 121.203125, - "epoch": 0.949968132568515, - "grad_norm": 24.14664077758789, - "kl": 0.09619140625, - "learning_rate": 5.003186743148502e-08, - "loss": 0.0039, - "reward": 1.4288082122802734, - "reward_std": 0.07892707735300064, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.303808331489563, - "step": 2981 - }, - { - "completion_length": 98.59375, - "epoch": 0.9502868068833652, - "grad_norm": 77.13253021240234, - "kl": 0.10205078125, - "learning_rate": 4.97131931166348e-08, - "loss": 0.0041, - "reward": 1.7554478645324707, - "reward_std": 0.10776199400424957, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5054478049278259, - "step": 2982 - }, - { - "completion_length": 44.890625, - "epoch": 0.9506054811982154, - "grad_norm": 59.162620544433594, - "kl": 0.2158203125, - "learning_rate": 4.939451880178457e-08, - "loss": 0.0086, - "reward": 1.8520599603652954, - "reward_std": 0.06661409139633179, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7270599603652954, - "rewards/pad": 0.125, - "step": 2983 - }, - { - "completion_length": 172.984375, - "epoch": 0.9509241555130656, - "grad_norm": 53.317298889160156, - "kl": 0.07666015625, - "learning_rate": 4.907584448693435e-08, - "loss": 0.0031, - "reward": 1.458888292312622, - "reward_std": 0.11015963554382324, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.3495132327079773, - "step": 2984 - }, - { - "completion_length": 97.71875, - "epoch": 0.9512428298279159, - "grad_norm": 37.27375793457031, - "kl": 0.138671875, - "learning_rate": 4.8757170172084126e-08, - "loss": 0.0056, - "reward": 1.4250479936599731, - "reward_std": 0.08955788612365723, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.34692302346229553, - "rewards/pad": 0.078125, - "step": 2985 - }, - { - "completion_length": 46.3125, - "epoch": 0.9515615041427661, - "grad_norm": 61.8337516784668, - "kl": 0.1806640625, - "learning_rate": 4.8438495857233906e-08, - "loss": 0.0072, - "reward": 1.6398192644119263, - "reward_std": 0.08233866095542908, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5148192644119263, - "rewards/pad": 0.125, - "step": 2986 - }, - { - "completion_length": 72.34375, - "epoch": 0.9518801784576163, - "grad_norm": 34.869503021240234, - "kl": 0.2578125, - "learning_rate": 4.811982154238369e-08, - "loss": 0.0103, - "reward": 1.621230125427246, - "reward_std": 0.04410223290324211, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49623024463653564, - "rewards/pad": 0.125, - "step": 2987 - }, - { - "completion_length": 98.125, - "epoch": 0.9521988527724665, - "grad_norm": 28.22113037109375, - "kl": 0.1865234375, - "learning_rate": 4.7801147227533454e-08, - "loss": 0.0075, - "reward": 1.8525173664093018, - "reward_std": 0.05112370476126671, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6025174260139465, - "step": 2988 - }, - { - "completion_length": 98.296875, - "epoch": 0.9525175270873167, - "grad_norm": 17.279558181762695, - "kl": 0.12451171875, - "learning_rate": 4.7482472912683235e-08, - "loss": 0.005, - "reward": 1.7908687591552734, - "reward_std": 0.05182887613773346, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5408687591552734, - "rewards/pad": 0.25, - "step": 2989 - }, - { - "completion_length": 171.90625, - "epoch": 0.952836201402167, - "grad_norm": 12.004485130310059, - "kl": 0.0791015625, - "learning_rate": 4.716379859783301e-08, - "loss": 0.0032, - "reward": 1.5807535648345947, - "reward_std": 0.030118271708488464, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5807535648345947, - "step": 2990 - }, - { - "completion_length": 147.765625, - "epoch": 0.9531548757170172, - "grad_norm": 15.81447696685791, - "kl": 0.1171875, - "learning_rate": 4.684512428298279e-08, - "loss": 0.0047, - "reward": 1.4999476671218872, - "reward_std": 0.03739581257104874, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4999476373195648, - "step": 2991 - }, - { - "completion_length": 94.203125, - "epoch": 0.9534735500318674, - "grad_norm": 22.1085147857666, - "kl": 0.203125, - "learning_rate": 4.652644996813257e-08, - "loss": 0.0081, - "reward": 1.4721016883850098, - "reward_std": 0.06764543801546097, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4721015691757202, - "step": 2992 - }, - { - "completion_length": 20.765625, - "epoch": 0.9537922243467176, - "grad_norm": 30.846187591552734, - "kl": 0.212890625, - "learning_rate": 4.6207775653282343e-08, - "loss": 0.0085, - "reward": 1.4842817783355713, - "reward_std": 0.08108275383710861, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.37490689754486084, - "step": 2993 - }, - { - "completion_length": 95.71875, - "epoch": 0.9541108986615678, - "grad_norm": 19.16341209411621, - "kl": 0.12060546875, - "learning_rate": 4.5889101338432124e-08, - "loss": 0.0048, - "reward": 1.5571094751358032, - "reward_std": 0.0856865867972374, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4321094751358032, - "rewards/pad": 0.125, - "step": 2994 - }, - { - "completion_length": 122.125, - "epoch": 0.9544295729764181, - "grad_norm": 20.750417709350586, - "kl": 0.10693359375, - "learning_rate": 4.55704270235819e-08, - "loss": 0.0043, - "reward": 1.5284767150878906, - "reward_std": 0.03402542322874069, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.40347665548324585, - "step": 2995 - }, - { - "completion_length": 74.5, - "epoch": 0.9547482472912683, - "grad_norm": 32.03830337524414, - "kl": 0.19921875, - "learning_rate": 4.525175270873167e-08, - "loss": 0.008, - "reward": 1.9529647827148438, - "reward_std": 0.05978967249393463, - "rewards/answer_reward": 0.5, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.45296481251716614, - "step": 2996 - }, - { - "completion_length": 122.1875, - "epoch": 0.9550669216061185, - "grad_norm": 39.445640563964844, - "kl": 0.11669921875, - "learning_rate": 4.493307839388145e-08, - "loss": 0.0047, - "reward": 1.7907742261886597, - "reward_std": 0.034271303564310074, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6657742261886597, - "rewards/pad": 0.125, - "step": 2997 - }, - { - "completion_length": 70.75, - "epoch": 0.9553855959209687, - "grad_norm": 39.62389373779297, - "kl": 0.251953125, - "learning_rate": 4.4614404079031226e-08, - "loss": 0.0101, - "reward": 1.426303744316101, - "reward_std": 0.040360528975725174, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.42630377411842346, - "step": 2998 - }, - { - "completion_length": 95.359375, - "epoch": 0.9557042702358189, - "grad_norm": 50.70322036743164, - "kl": 0.1279296875, - "learning_rate": 4.4295729764181007e-08, - "loss": 0.0051, - "reward": 1.724592685699463, - "reward_std": 0.03771749883890152, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7245926260948181, - "rewards/pad": 0.0, - "step": 2999 - }, - { - "completion_length": 122.640625, - "epoch": 0.9560229445506692, - "grad_norm": 13.841503143310547, - "kl": 0.1064453125, - "learning_rate": 4.397705544933078e-08, - "loss": 0.0043, - "reward": 1.7508357763290405, - "reward_std": 0.045969102531671524, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5008357167243958, - "step": 3000 - }, - { - "completion_length": 123.25, - "epoch": 0.9563416188655195, - "grad_norm": 114.12747192382812, - "kl": 0.1298828125, - "learning_rate": 4.365838113448056e-08, - "loss": 0.0052, - "reward": 1.5129914283752441, - "reward_std": 0.08709900081157684, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5129914879798889, - "rewards/pad": 0.0, - "step": 3001 - }, - { - "completion_length": 121.5625, - "epoch": 0.9566602931803697, - "grad_norm": 177.0281524658203, - "kl": 0.140625, - "learning_rate": 4.3339706819630335e-08, - "loss": 0.0057, - "reward": 1.42633855342865, - "reward_std": 0.1432398110628128, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.36383864283561707, - "rewards/pad": 0.0625, - "step": 3002 - }, - { - "completion_length": 122.046875, - "epoch": 0.9569789674952199, - "grad_norm": 74.1451416015625, - "kl": 0.1376953125, - "learning_rate": 4.3021032504780115e-08, - "loss": 0.0055, - "reward": 1.6103394031524658, - "reward_std": 0.06744758039712906, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48533937335014343, - "step": 3003 - }, - { - "completion_length": 70.140625, - "epoch": 0.9572976418100702, - "grad_norm": 72.7608413696289, - "kl": 0.1982421875, - "learning_rate": 4.270235818992989e-08, - "loss": 0.0079, - "reward": 1.6359548568725586, - "reward_std": 0.08110431581735611, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6359548568725586, - "rewards/pad": 0.0, - "step": 3004 - }, - { - "completion_length": 67.671875, - "epoch": 0.9576163161249204, - "grad_norm": 30.834381103515625, - "kl": 0.189453125, - "learning_rate": 4.238368387507966e-08, - "loss": 0.0076, - "reward": 1.6679446697235107, - "reward_std": 0.06894217431545258, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6679446697235107, - "step": 3005 - }, - { - "completion_length": 97.375, - "epoch": 0.9579349904397706, - "grad_norm": 209.162353515625, - "kl": 0.244140625, - "learning_rate": 4.2065009560229444e-08, - "loss": 0.0098, - "reward": 1.44875168800354, - "reward_std": 0.14680612087249756, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43312662839889526, - "rewards/pad": 0.015625, - "step": 3006 - }, - { - "completion_length": 70.9375, - "epoch": 0.9582536647546208, - "grad_norm": 31.107500076293945, - "kl": 0.1572265625, - "learning_rate": 4.174633524537922e-08, - "loss": 0.0063, - "reward": 1.7754242420196533, - "reward_std": 0.05395086854696274, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6504241228103638, - "rewards/pad": 0.125, - "step": 3007 - }, - { - "completion_length": 73.3125, - "epoch": 0.958572339069471, - "grad_norm": 28.936445236206055, - "kl": 0.12158203125, - "learning_rate": 4.1427660930529e-08, - "loss": 0.0049, - "reward": 1.7380356788635254, - "reward_std": 0.11337875574827194, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 0.984375, - "rewards/iou_glue_reward": 0.5036606788635254, - "step": 3008 - }, - { - "completion_length": 96.328125, - "epoch": 0.9588910133843213, - "grad_norm": 32.57992935180664, - "kl": 0.150390625, - "learning_rate": 4.110898661567878e-08, - "loss": 0.006, - "reward": 1.7709015607833862, - "reward_std": 0.0693637877702713, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6459015607833862, - "rewards/pad": 0.125, - "step": 3009 - }, - { - "completion_length": 94.546875, - "epoch": 0.9592096876991715, - "grad_norm": 20.70581817626953, - "kl": 0.1787109375, - "learning_rate": 4.079031230082855e-08, - "loss": 0.0072, - "reward": 1.4237182140350342, - "reward_std": 0.028100572526454926, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.29871827363967896, - "step": 3010 - }, - { - "completion_length": 122.0625, - "epoch": 0.9595283620140217, - "grad_norm": 17.17173957824707, - "kl": 0.1201171875, - "learning_rate": 4.047163798597833e-08, - "loss": 0.0048, - "reward": 1.610293984413147, - "reward_std": 0.040450602769851685, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.610293984413147, - "rewards/pad": 0.0, - "step": 3011 - }, - { - "completion_length": 72.875, - "epoch": 0.9598470363288719, - "grad_norm": 27.44310760498047, - "kl": 0.189453125, - "learning_rate": 4.01529636711281e-08, - "loss": 0.0076, - "reward": 1.9695830345153809, - "reward_std": 0.05835095793008804, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5945830345153809, - "step": 3012 - }, - { - "completion_length": 43.71875, - "epoch": 0.9601657106437221, - "grad_norm": 22.398454666137695, - "kl": 0.27734375, - "learning_rate": 3.983428935627788e-08, - "loss": 0.0111, - "reward": 1.6686134338378906, - "reward_std": 0.08060306310653687, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6686134934425354, - "step": 3013 - }, - { - "completion_length": 122.265625, - "epoch": 0.9604843849585724, - "grad_norm": 37.84651565551758, - "kl": 0.10986328125, - "learning_rate": 3.951561504142766e-08, - "loss": 0.0044, - "reward": 1.5465751886367798, - "reward_std": 0.09929150342941284, - "rewards/pad": 0.046875, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4997002184391022, - "step": 3014 - }, - { - "completion_length": 147.515625, - "epoch": 0.9608030592734226, - "grad_norm": 38.23558807373047, - "kl": 0.177734375, - "learning_rate": 3.9196940726577435e-08, - "loss": 0.0071, - "reward": 1.5440813302993774, - "reward_std": 0.049021974205970764, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5440813302993774, - "rewards/pad": 0.0, - "step": 3015 - }, - { - "completion_length": 70.96875, - "epoch": 0.9611217335882728, - "grad_norm": 20.668821334838867, - "kl": 0.1923828125, - "learning_rate": 3.8878266411727215e-08, - "loss": 0.0077, - "reward": 1.7165735960006714, - "reward_std": 0.06255701184272766, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5915735363960266, - "rewards/pad": 0.125, - "step": 3016 - }, - { - "completion_length": 95.984375, - "epoch": 0.961440407903123, - "grad_norm": 34.80844497680664, - "kl": 0.17578125, - "learning_rate": 3.855959209687699e-08, - "loss": 0.007, - "reward": 1.5303641557693481, - "reward_std": 0.05917413532733917, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5303641557693481, - "step": 3017 - }, - { - "completion_length": 119.515625, - "epoch": 0.9617590822179732, - "grad_norm": 15.914820671081543, - "kl": 0.1337890625, - "learning_rate": 3.824091778202677e-08, - "loss": 0.0054, - "reward": 1.614253044128418, - "reward_std": 0.062158532440662384, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4892529845237732, - "step": 3018 - }, - { - "completion_length": 172.796875, - "epoch": 0.9620777565328235, - "grad_norm": 11.0382661819458, - "kl": 0.134765625, - "learning_rate": 3.7922243467176544e-08, - "loss": 0.0054, - "reward": 1.4397987127304077, - "reward_std": 0.0498490110039711, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4397987127304077, - "step": 3019 - }, - { - "completion_length": 95.84375, - "epoch": 0.9623964308476737, - "grad_norm": 38.04750061035156, - "kl": 0.2236328125, - "learning_rate": 3.760356915232632e-08, - "loss": 0.0089, - "reward": 1.5318949222564697, - "reward_std": 0.06952402740716934, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5318949818611145, - "step": 3020 - }, - { - "completion_length": 121.5, - "epoch": 0.9627151051625239, - "grad_norm": 210.04891967773438, - "kl": 0.1474609375, - "learning_rate": 3.72848948374761e-08, - "loss": 0.0059, - "reward": 1.5183238983154297, - "reward_std": 0.11033372581005096, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5339489579200745, - "rewards/pad": 0.0, - "step": 3021 - }, - { - "completion_length": 72.609375, - "epoch": 0.9630337794773741, - "grad_norm": 69.69294738769531, - "kl": 0.17578125, - "learning_rate": 3.696622052262587e-08, - "loss": 0.007, - "reward": 1.5531673431396484, - "reward_std": 0.12044639885425568, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4437922239303589, - "step": 3022 - }, - { - "completion_length": 47.328125, - "epoch": 0.9633524537922243, - "grad_norm": 108.01530456542969, - "kl": 0.12890625, - "learning_rate": 3.664754620777565e-08, - "loss": 0.0052, - "reward": 1.7897529602050781, - "reward_std": 0.1433810442686081, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.49287793040275574, - "rewards/pad": 0.296875, - "step": 3023 - }, - { - "completion_length": 124.40625, - "epoch": 0.9636711281070746, - "grad_norm": 23.7318115234375, - "kl": 0.130859375, - "learning_rate": 3.632887189292543e-08, - "loss": 0.0052, - "reward": 1.7141940593719482, - "reward_std": 0.050903551280498505, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4641939401626587, - "rewards/pad": 0.25, - "step": 3024 - }, - { - "completion_length": 120.8125, - "epoch": 0.9639898024219248, - "grad_norm": 27.079105377197266, - "kl": 0.12158203125, - "learning_rate": 3.601019757807521e-08, - "loss": 0.0049, - "reward": 1.682950496673584, - "reward_std": 0.05144767835736275, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5579504370689392, - "step": 3025 - }, - { - "completion_length": 73.046875, - "epoch": 0.964308476736775, - "grad_norm": 126.71540832519531, - "kl": 0.11279296875, - "learning_rate": 3.569152326322499e-08, - "loss": 0.0045, - "reward": 1.7656242847442627, - "reward_std": 0.10512179136276245, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4999992847442627, - "rewards/pad": 0.265625, - "step": 3026 - }, - { - "completion_length": 123.234375, - "epoch": 0.9646271510516252, - "grad_norm": 134.09176635742188, - "kl": 0.4453125, - "learning_rate": 3.5372848948374755e-08, - "loss": 0.0178, - "reward": 1.4624063968658447, - "reward_std": 0.15438668429851532, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.47803133726119995, - "step": 3027 - }, - { - "completion_length": 97.21875, - "epoch": 0.9649458253664754, - "grad_norm": 60.89470672607422, - "kl": 0.21484375, - "learning_rate": 3.5054174633524535e-08, - "loss": 0.0086, - "reward": 1.6267002820968628, - "reward_std": 0.11622709035873413, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5017002820968628, - "step": 3028 - }, - { - "completion_length": 121.078125, - "epoch": 0.9652644996813257, - "grad_norm": 22.674972534179688, - "kl": 0.2236328125, - "learning_rate": 3.473550031867431e-08, - "loss": 0.0089, - "reward": 1.6407219171524048, - "reward_std": 0.03624836355447769, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5157219171524048, - "step": 3029 - }, - { - "completion_length": 147.96875, - "epoch": 0.9655831739961759, - "grad_norm": 10.713055610656738, - "kl": 0.08740234375, - "learning_rate": 3.441682600382409e-08, - "loss": 0.0035, - "reward": 1.5930461883544922, - "reward_std": 0.046371519565582275, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4680461883544922, - "rewards/pad": 0.125, - "step": 3030 - }, - { - "completion_length": 46.796875, - "epoch": 0.9659018483110261, - "grad_norm": 127.0864486694336, - "kl": 0.232421875, - "learning_rate": 3.409815168897387e-08, - "loss": 0.0093, - "reward": 1.6783881187438965, - "reward_std": 0.08810573816299438, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5533881187438965, - "step": 3031 - }, - { - "completion_length": 96.546875, - "epoch": 0.9662205226258763, - "grad_norm": 32.99958419799805, - "kl": 0.2578125, - "learning_rate": 3.3779477374123644e-08, - "loss": 0.0103, - "reward": 1.7011256217956543, - "reward_std": 0.11335960030555725, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5761255025863647, - "rewards/pad": 0.125, - "step": 3032 - }, - { - "completion_length": 120.984375, - "epoch": 0.9665391969407265, - "grad_norm": 68.67292022705078, - "kl": 0.158203125, - "learning_rate": 3.3460803059273424e-08, - "loss": 0.0063, - "reward": 1.6629323959350586, - "reward_std": 0.05116612836718559, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.662932276725769, - "rewards/pad": 0.0, - "step": 3033 - }, - { - "completion_length": 172.46875, - "epoch": 0.9668578712555768, - "grad_norm": 8.863585472106934, - "kl": 0.1259765625, - "learning_rate": 3.31421287444232e-08, - "loss": 0.005, - "reward": 1.4362820386886597, - "reward_std": 0.07784001529216766, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.45190703868865967, - "step": 3034 - }, - { - "completion_length": 71.578125, - "epoch": 0.967176545570427, - "grad_norm": 54.781150817871094, - "kl": 0.1708984375, - "learning_rate": 3.282345442957297e-08, - "loss": 0.0068, - "reward": 1.7650046348571777, - "reward_std": 0.07283900678157806, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5150046348571777, - "step": 3035 - }, - { - "completion_length": 93.625, - "epoch": 0.9674952198852772, - "grad_norm": 31.825538635253906, - "kl": 0.2060546875, - "learning_rate": 3.250478011472275e-08, - "loss": 0.0083, - "reward": 1.537459373474121, - "reward_std": 0.0667218565940857, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5374593734741211, - "step": 3036 - }, - { - "completion_length": 96.640625, - "epoch": 0.9678138942001274, - "grad_norm": 54.06278991699219, - "kl": 0.146484375, - "learning_rate": 3.2186105799872527e-08, - "loss": 0.0059, - "reward": 1.5844182968139648, - "reward_std": 0.05093790963292122, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5844182968139648, - "rewards/pad": 0.0, - "step": 3037 - }, - { - "completion_length": 120.90625, - "epoch": 0.9681325685149776, - "grad_norm": 35.58761215209961, - "kl": 0.1748046875, - "learning_rate": 3.186743148502231e-08, - "loss": 0.007, - "reward": 1.70296311378479, - "reward_std": 0.07070493698120117, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.57796311378479, - "step": 3038 - }, - { - "completion_length": 70.390625, - "epoch": 0.9684512428298279, - "grad_norm": 25.037538528442383, - "kl": 0.1533203125, - "learning_rate": 3.154875717017208e-08, - "loss": 0.0061, - "reward": 1.5511980056762695, - "reward_std": 0.03978591039776802, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5511980056762695, - "step": 3039 - }, - { - "completion_length": 98.046875, - "epoch": 0.9687699171446782, - "grad_norm": 13.006641387939453, - "kl": 0.2080078125, - "learning_rate": 3.123008285532186e-08, - "loss": 0.0083, - "reward": 1.6046113967895508, - "reward_std": 0.0850830078125, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4796113669872284, - "step": 3040 - }, - { - "completion_length": 97.578125, - "epoch": 0.9690885914595284, - "grad_norm": 54.957183837890625, - "kl": 0.2314453125, - "learning_rate": 3.0911408540471635e-08, - "loss": 0.0092, - "reward": 1.4358372688293457, - "reward_std": 0.10187076032161713, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4358373284339905, - "step": 3041 - }, - { - "completion_length": 65.703125, - "epoch": 0.9694072657743786, - "grad_norm": 48.17506408691406, - "kl": 0.21875, - "learning_rate": 3.059273422562141e-08, - "loss": 0.0087, - "reward": 1.597632884979248, - "reward_std": 0.0918826162815094, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.597632884979248, - "step": 3042 - }, - { - "completion_length": 72.078125, - "epoch": 0.9697259400892289, - "grad_norm": 79.25590515136719, - "kl": 0.197265625, - "learning_rate": 3.027405991077119e-08, - "loss": 0.0079, - "reward": 1.6580369472503662, - "reward_std": 0.07974573969841003, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6580369472503662, - "rewards/pad": 0.0, - "step": 3043 - }, - { - "completion_length": 94.71875, - "epoch": 0.9700446144040791, - "grad_norm": 21.131467819213867, - "kl": 0.2275390625, - "learning_rate": 2.995538559592097e-08, - "loss": 0.0091, - "reward": 1.5688925981521606, - "reward_std": 0.061785049736499786, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5688925385475159, - "step": 3044 - }, - { - "completion_length": 94.421875, - "epoch": 0.9703632887189293, - "grad_norm": 27.149314880371094, - "kl": 0.16015625, - "learning_rate": 2.9636711281070744e-08, - "loss": 0.0064, - "reward": 1.515899896621704, - "reward_std": 0.05860135704278946, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5158998966217041, - "step": 3045 - }, - { - "completion_length": 46.0625, - "epoch": 0.9706819630337795, - "grad_norm": 18.18086814880371, - "kl": 0.181640625, - "learning_rate": 2.931803696622052e-08, - "loss": 0.0073, - "reward": 1.745971918106079, - "reward_std": 0.0631784126162529, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4959719181060791, - "rewards/pad": 0.25, - "step": 3046 - }, - { - "completion_length": 95.40625, - "epoch": 0.9710006373486297, - "grad_norm": 34.62543869018555, - "kl": 0.1455078125, - "learning_rate": 2.89993626513703e-08, - "loss": 0.0058, - "reward": 1.6828827857971191, - "reward_std": 0.06356634944677353, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6828826665878296, - "step": 3047 - }, - { - "completion_length": 44.015625, - "epoch": 0.97131931166348, - "grad_norm": 115.41775512695312, - "kl": 0.142578125, - "learning_rate": 2.8680688336520072e-08, - "loss": 0.0057, - "reward": 1.6739516258239746, - "reward_std": 0.04360014945268631, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6739515662193298, - "rewards/pad": 0.0, - "step": 3048 - }, - { - "completion_length": 96.59375, - "epoch": 0.9716379859783302, - "grad_norm": 22.5186710357666, - "kl": 0.1298828125, - "learning_rate": 2.8362014021669853e-08, - "loss": 0.0052, - "reward": 1.5890880823135376, - "reward_std": 0.05285734310746193, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5890881419181824, - "rewards/pad": 0.0, - "step": 3049 - }, - { - "completion_length": 94.109375, - "epoch": 0.9719566602931804, - "grad_norm": 50.44366455078125, - "kl": 0.1875, - "learning_rate": 2.804333970681963e-08, - "loss": 0.0075, - "reward": 1.582545280456543, - "reward_std": 0.04785811901092529, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5825453400611877, - "rewards/pad": 0.0, - "step": 3050 - }, - { - "completion_length": 96.71875, - "epoch": 0.9722753346080306, - "grad_norm": 86.10174560546875, - "kl": 0.16015625, - "learning_rate": 2.7724665391969407e-08, - "loss": 0.0064, - "reward": 1.5000967979431152, - "reward_std": 0.1500009149312973, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4063469171524048, - "rewards/pad": 0.09375, - "step": 3051 - }, - { - "completion_length": 43.96875, - "epoch": 0.9725940089228808, - "grad_norm": 42.23125076293945, - "kl": 0.154296875, - "learning_rate": 2.740599107711918e-08, - "loss": 0.0062, - "reward": 1.674316644668579, - "reward_std": 0.11258865892887115, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5649415850639343, - "step": 3052 - }, - { - "completion_length": 122.953125, - "epoch": 0.9729126832377311, - "grad_norm": 46.929664611816406, - "kl": 0.130859375, - "learning_rate": 2.708731676226896e-08, - "loss": 0.0053, - "reward": 1.5492351055145264, - "reward_std": 0.04716845601797104, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5492352843284607, - "step": 3053 - }, - { - "completion_length": 146.109375, - "epoch": 0.9732313575525813, - "grad_norm": 96.08281707763672, - "kl": 0.10302734375, - "learning_rate": 2.676864244741874e-08, - "loss": 0.0041, - "reward": 1.4895622730255127, - "reward_std": 0.050299882888793945, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4895622730255127, - "rewards/pad": 0.0, - "step": 3054 - }, - { - "completion_length": 98.90625, - "epoch": 0.9735500318674315, - "grad_norm": 26.63153076171875, - "kl": 0.1943359375, - "learning_rate": 2.6449968132568516e-08, - "loss": 0.0078, - "reward": 1.6397548913955688, - "reward_std": 0.06552456319332123, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6397548317909241, - "step": 3055 - }, - { - "completion_length": 70.453125, - "epoch": 0.9738687061822817, - "grad_norm": 77.2447509765625, - "kl": 0.140625, - "learning_rate": 2.613129381771829e-08, - "loss": 0.0056, - "reward": 1.7738982439041138, - "reward_std": 0.07243245840072632, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5238982439041138, - "step": 3056 - }, - { - "completion_length": 146.578125, - "epoch": 0.9741873804971319, - "grad_norm": 23.816730499267578, - "kl": 0.216796875, - "learning_rate": 2.5812619502868067e-08, - "loss": 0.0086, - "reward": 1.64516019821167, - "reward_std": 0.07656388729810715, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6451602578163147, - "rewards/pad": 0.0, - "step": 3057 - }, - { - "completion_length": 97.890625, - "epoch": 0.9745060548119822, - "grad_norm": 24.058162689208984, - "kl": 0.158203125, - "learning_rate": 2.5493945188017844e-08, - "loss": 0.0063, - "reward": 1.6628170013427734, - "reward_std": 0.08006662130355835, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5378170013427734, - "step": 3058 - }, - { - "completion_length": 71.8125, - "epoch": 0.9748247291268324, - "grad_norm": 33.326622009277344, - "kl": 0.1728515625, - "learning_rate": 2.517527087316762e-08, - "loss": 0.0069, - "reward": 1.9145175218582153, - "reward_std": 0.09791958332061768, - "rewards/format_reward_tg": 0.984375, - "rewards/iou_timestamp_reward": 0.5551425814628601, - "rewards/pad": 0.375, - "step": 3059 - }, - { - "completion_length": 96.390625, - "epoch": 0.9751434034416826, - "grad_norm": 20.628767013549805, - "kl": 0.27734375, - "learning_rate": 2.48565965583174e-08, - "loss": 0.0111, - "reward": 1.7663002014160156, - "reward_std": 0.06648366153240204, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6413001418113708, - "step": 3060 - }, - { - "completion_length": 122.984375, - "epoch": 0.9754620777565328, - "grad_norm": 26.692424774169922, - "kl": 0.185546875, - "learning_rate": 2.4537922243467176e-08, - "loss": 0.0074, - "reward": 1.545546054840088, - "reward_std": 0.05895493924617767, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5455459356307983, - "rewards/pad": 0.0, - "step": 3061 - }, - { - "completion_length": 97.125, - "epoch": 0.975780752071383, - "grad_norm": 14.738457679748535, - "kl": 0.2177734375, - "learning_rate": 2.4219247928616953e-08, - "loss": 0.0087, - "reward": 1.71378493309021, - "reward_std": 0.06787759065628052, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5887849926948547, - "step": 3062 - }, - { - "completion_length": 69.828125, - "epoch": 0.9760994263862333, - "grad_norm": 53.15437698364258, - "kl": 0.30859375, - "learning_rate": 2.3900573613766727e-08, - "loss": 0.0123, - "reward": 1.756661295890808, - "reward_std": 0.09398344159126282, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6316612958908081, - "step": 3063 - }, - { - "completion_length": 175.0625, - "epoch": 0.9764181007010835, - "grad_norm": 46.098899841308594, - "kl": 0.12158203125, - "learning_rate": 2.3581899298916504e-08, - "loss": 0.0049, - "reward": 1.59013831615448, - "reward_std": 0.05127100273966789, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5901383757591248, - "step": 3064 - }, - { - "completion_length": 71.875, - "epoch": 0.9767367750159337, - "grad_norm": 54.73717498779297, - "kl": 0.18359375, - "learning_rate": 2.3263224984066285e-08, - "loss": 0.0073, - "reward": 1.608504056930542, - "reward_std": 0.12157782167196274, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.577254056930542, - "rewards/pad": 0.03125, - "step": 3065 - }, - { - "completion_length": 71.15625, - "epoch": 0.9770554493307839, - "grad_norm": 28.98688316345215, - "kl": 0.1572265625, - "learning_rate": 2.2944550669216062e-08, - "loss": 0.0063, - "reward": 1.7561508417129517, - "reward_std": 0.07162339240312576, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5061509013175964, - "rewards/pad": 0.25, - "step": 3066 - }, - { - "completion_length": 96.5625, - "epoch": 0.9773741236456341, - "grad_norm": 27.905614852905273, - "kl": 0.09130859375, - "learning_rate": 2.2625876354365836e-08, - "loss": 0.0037, - "reward": 1.8647595643997192, - "reward_std": 0.03292068839073181, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.614759624004364, - "step": 3067 - }, - { - "completion_length": 97.109375, - "epoch": 0.9776927979604844, - "grad_norm": 125.5517807006836, - "kl": 0.2490234375, - "learning_rate": 2.2307202039515613e-08, - "loss": 0.01, - "reward": 1.5547399520874023, - "reward_std": 0.09220758080482483, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.42973989248275757, - "rewards/pad": 0.125, - "step": 3068 - }, - { - "completion_length": 120.75, - "epoch": 0.9780114722753346, - "grad_norm": 27.46649742126465, - "kl": 0.10546875, - "learning_rate": 2.198852772466539e-08, - "loss": 0.0042, - "reward": 1.6488523483276367, - "reward_std": 0.055029600858688354, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6488524079322815, - "step": 3069 - }, - { - "completion_length": 95.265625, - "epoch": 0.9783301465901848, - "grad_norm": 21.403928756713867, - "kl": 0.234375, - "learning_rate": 2.1669853409815167e-08, - "loss": 0.0094, - "reward": 1.6315056085586548, - "reward_std": 0.08213837444782257, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5065056085586548, - "step": 3070 - }, - { - "completion_length": 70.25, - "epoch": 0.978648820905035, - "grad_norm": 32.960025787353516, - "kl": 0.25, - "learning_rate": 2.1351179094964945e-08, - "loss": 0.01, - "reward": 1.819123387336731, - "reward_std": 0.07433437556028366, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.569123387336731, - "rewards/pad": 0.25, - "step": 3071 - }, - { - "completion_length": 68.5, - "epoch": 0.9789674952198852, - "grad_norm": 67.70450592041016, - "kl": 0.162109375, - "learning_rate": 2.1032504780114722e-08, - "loss": 0.0065, - "reward": 1.6073789596557617, - "reward_std": 0.052327897399663925, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6073789596557617, - "rewards/pad": 0.0, - "step": 3072 - }, - { - "completion_length": 98.578125, - "epoch": 0.9792861695347355, - "grad_norm": 21.891450881958008, - "kl": 0.146484375, - "learning_rate": 2.07138304652645e-08, - "loss": 0.0058, - "reward": 1.4414243698120117, - "reward_std": 0.05215851217508316, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3164243698120117, - "step": 3073 - }, - { - "completion_length": 95.90625, - "epoch": 0.9796048438495857, - "grad_norm": 31.6024227142334, - "kl": 0.109375, - "learning_rate": 2.0395156150414276e-08, - "loss": 0.0044, - "reward": 1.606677770614624, - "reward_std": 0.05546068400144577, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.3566778302192688, - "step": 3074 - }, - { - "completion_length": 120.375, - "epoch": 0.9799235181644359, - "grad_norm": 18.056358337402344, - "kl": 0.1552734375, - "learning_rate": 2.007648183556405e-08, - "loss": 0.0062, - "reward": 1.5160579681396484, - "reward_std": 0.08094209432601929, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39105790853500366, - "step": 3075 - }, - { - "completion_length": 71.1875, - "epoch": 0.9802421924792861, - "grad_norm": 18.132429122924805, - "kl": 0.201171875, - "learning_rate": 1.975780752071383e-08, - "loss": 0.008, - "reward": 1.665654182434082, - "reward_std": 0.04702577367424965, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6656541228294373, - "rewards/pad": 0.0, - "step": 3076 - }, - { - "completion_length": 73.15625, - "epoch": 0.9805608667941363, - "grad_norm": 27.031850814819336, - "kl": 0.220703125, - "learning_rate": 1.9439133205863608e-08, - "loss": 0.0088, - "reward": 1.8380687236785889, - "reward_std": 0.06334608048200607, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5880686640739441, - "step": 3077 - }, - { - "completion_length": 21.171875, - "epoch": 0.9808795411089866, - "grad_norm": 52.4039192199707, - "kl": 0.1904296875, - "learning_rate": 1.9120458891013385e-08, - "loss": 0.0076, - "reward": 1.757773756980896, - "reward_std": 0.11881926655769348, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5077738165855408, - "rewards/pad": 0.25, - "step": 3078 - }, - { - "completion_length": 123.59375, - "epoch": 0.9811982154238368, - "grad_norm": 26.56479835510254, - "kl": 0.1494140625, - "learning_rate": 1.880178457616316e-08, - "loss": 0.006, - "reward": 1.4535199403762817, - "reward_std": 0.0564667209982872, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.32851988077163696, - "rewards/pad": 0.125, - "step": 3079 - }, - { - "completion_length": 120.734375, - "epoch": 0.9815168897386871, - "grad_norm": 8.847326278686523, - "kl": 0.134765625, - "learning_rate": 1.8483110261312936e-08, - "loss": 0.0054, - "reward": 1.3950556516647339, - "reward_std": 0.029516663402318954, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3950556516647339, - "rewards/pad": 0.0, - "step": 3080 - }, - { - "completion_length": 46.59375, - "epoch": 0.9818355640535373, - "grad_norm": 35.24257278442383, - "kl": 0.251953125, - "learning_rate": 1.8164435946462717e-08, - "loss": 0.0101, - "reward": 1.447073221206665, - "reward_std": 0.14961804449558258, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3689482510089874, - "rewards/pad": 0.078125, - "step": 3081 - }, - { - "completion_length": 43.015625, - "epoch": 0.9821542383683876, - "grad_norm": 46.730186462402344, - "kl": 0.1552734375, - "learning_rate": 1.7845761631612494e-08, - "loss": 0.0062, - "reward": 1.702662706375122, - "reward_std": 0.0442371740937233, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7026626467704773, - "rewards/pad": 0.0, - "step": 3082 - }, - { - "completion_length": 94.375, - "epoch": 0.9824729126832378, - "grad_norm": 58.291168212890625, - "kl": 0.111328125, - "learning_rate": 1.7527087316762268e-08, - "loss": 0.0045, - "reward": 1.3969348669052124, - "reward_std": 0.06678808480501175, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.39693483710289, - "step": 3083 - }, - { - "completion_length": 68.984375, - "epoch": 0.982791586998088, - "grad_norm": 16.67777442932129, - "kl": 0.171875, - "learning_rate": 1.7208413001912045e-08, - "loss": 0.0069, - "reward": 1.4315367937088013, - "reward_std": 0.039326176047325134, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.43153685331344604, - "step": 3084 - }, - { - "completion_length": 71.453125, - "epoch": 0.9831102613129382, - "grad_norm": 22.13905143737793, - "kl": 0.1748046875, - "learning_rate": 1.6889738687061822e-08, - "loss": 0.007, - "reward": 1.5601651668548584, - "reward_std": 0.06011567264795303, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5601651072502136, - "rewards/pad": 0.0, - "step": 3085 - }, - { - "completion_length": 144.25, - "epoch": 0.9834289356277884, - "grad_norm": 77.24085235595703, - "kl": 0.2099609375, - "learning_rate": 1.65710643722116e-08, - "loss": 0.0084, - "reward": 1.687404751777649, - "reward_std": 0.05866100266575813, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5624047517776489, - "rewards/pad": 0.125, - "step": 3086 - }, - { - "completion_length": 72.875, - "epoch": 0.9837476099426387, - "grad_norm": 36.1534538269043, - "kl": 0.1728515625, - "learning_rate": 1.6252390057361376e-08, - "loss": 0.0069, - "reward": 1.8389731645584106, - "reward_std": 0.10530993342399597, - "rewards/pad": 0.484375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.35459810495376587, - "step": 3087 - }, - { - "completion_length": 98.140625, - "epoch": 0.9840662842574889, - "grad_norm": 29.45891571044922, - "kl": 0.1796875, - "learning_rate": 1.5933715742511154e-08, - "loss": 0.0072, - "reward": 1.5523386001586914, - "reward_std": 0.05010468512773514, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.4273386001586914, - "step": 3088 - }, - { - "completion_length": 98.21875, - "epoch": 0.9843849585723391, - "grad_norm": 35.70580291748047, - "kl": 0.28125, - "learning_rate": 1.561504142766093e-08, - "loss": 0.0113, - "reward": 1.6309047937393188, - "reward_std": 0.07136161625385284, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5059047937393188, - "rewards/pad": 0.125, - "step": 3089 - }, - { - "completion_length": 96.71875, - "epoch": 0.9847036328871893, - "grad_norm": 59.9937629699707, - "kl": 0.12353515625, - "learning_rate": 1.5296367112810705e-08, - "loss": 0.0049, - "reward": 1.5103379487991333, - "reward_std": 0.035373713821172714, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5103378891944885, - "step": 3090 - }, - { - "completion_length": 142.59375, - "epoch": 0.9850223072020395, - "grad_norm": 18.170406341552734, - "kl": 0.09814453125, - "learning_rate": 1.4977692797960485e-08, - "loss": 0.0039, - "reward": 1.4885005950927734, - "reward_std": 0.03841561824083328, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48850059509277344, - "step": 3091 - }, - { - "completion_length": 120.421875, - "epoch": 0.9853409815168898, - "grad_norm": 9.481536865234375, - "kl": 0.11669921875, - "learning_rate": 1.465901848311026e-08, - "loss": 0.0047, - "reward": 1.4869211912155151, - "reward_std": 0.06074891239404678, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.36192119121551514, - "step": 3092 - }, - { - "completion_length": 119.890625, - "epoch": 0.98565965583174, - "grad_norm": 24.530763626098633, - "kl": 0.3046875, - "learning_rate": 1.4340344168260036e-08, - "loss": 0.0122, - "reward": 1.433422565460205, - "reward_std": 0.08270367980003357, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.43342268466949463, - "rewards/pad": 0.0, - "step": 3093 - }, - { - "completion_length": 96.25, - "epoch": 0.9859783301465902, - "grad_norm": 44.355506896972656, - "kl": 0.1630859375, - "learning_rate": 1.4021669853409815e-08, - "loss": 0.0065, - "reward": 1.6022392511367798, - "reward_std": 0.08567021042108536, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4772392809391022, - "rewards/pad": 0.125, - "step": 3094 - }, - { - "completion_length": 146.703125, - "epoch": 0.9862970044614404, - "grad_norm": 45.989227294921875, - "kl": 0.08251953125, - "learning_rate": 1.370299553855959e-08, - "loss": 0.0033, - "reward": 1.6105356216430664, - "reward_std": 0.029232844710350037, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.48553550243377686, - "step": 3095 - }, - { - "completion_length": 171.828125, - "epoch": 0.9866156787762906, - "grad_norm": 12.362031936645508, - "kl": 0.07373046875, - "learning_rate": 1.338432122370937e-08, - "loss": 0.003, - "reward": 1.6057605743408203, - "reward_std": 0.10795149952173233, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 0.984375, - "rewards/tracking_iou_reward": 0.49638551473617554, - "step": 3096 - }, - { - "completion_length": 122.09375, - "epoch": 0.9869343530911409, - "grad_norm": 87.25640869140625, - "kl": 0.162109375, - "learning_rate": 1.3065646908859145e-08, - "loss": 0.0065, - "reward": 1.665325403213501, - "reward_std": 0.07152172923088074, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.665325403213501, - "step": 3097 - }, - { - "completion_length": 95.171875, - "epoch": 0.9872530274059911, - "grad_norm": 18.827186584472656, - "kl": 0.115234375, - "learning_rate": 1.2746972594008922e-08, - "loss": 0.0046, - "reward": 1.6247881650924683, - "reward_std": 0.046058617532253265, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.4997881054878235, - "rewards/pad": 0.125, - "step": 3098 - }, - { - "completion_length": 72.21875, - "epoch": 0.9875717017208413, - "grad_norm": 37.85802459716797, - "kl": 0.1494140625, - "learning_rate": 1.24282982791587e-08, - "loss": 0.006, - "reward": 1.5816888809204102, - "reward_std": 0.09474524110555649, - "rewards/pad": 0.203125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.37856388092041016, - "step": 3099 - }, - { - "completion_length": 46.625, - "epoch": 0.9878903760356915, - "grad_norm": 39.316165924072266, - "kl": 0.1767578125, - "learning_rate": 1.2109623964308477e-08, - "loss": 0.0071, - "reward": 1.935715913772583, - "reward_std": 0.06848332285881042, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.6857160329818726, - "step": 3100 - }, - { - "completion_length": 96.328125, - "epoch": 0.9882090503505417, - "grad_norm": 41.54635238647461, - "kl": 0.154296875, - "learning_rate": 1.1790949649458252e-08, - "loss": 0.0062, - "reward": 1.4594142436981201, - "reward_std": 0.08696560561656952, - "rewards/answer_reward": 0.015625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.44378918409347534, - "step": 3101 - }, - { - "completion_length": 146.65625, - "epoch": 0.988527724665392, - "grad_norm": 98.6771240234375, - "kl": 0.12890625, - "learning_rate": 1.1472275334608031e-08, - "loss": 0.0052, - "reward": 1.5170422792434692, - "reward_std": 0.06204860284924507, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5170422792434692, - "step": 3102 - }, - { - "completion_length": 71.875, - "epoch": 0.9888463989802422, - "grad_norm": 50.55533981323242, - "kl": 0.16796875, - "learning_rate": 1.1153601019757807e-08, - "loss": 0.0067, - "reward": 1.6279575824737549, - "reward_std": 0.05080032721161842, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5029575824737549, - "step": 3103 - }, - { - "completion_length": 96.328125, - "epoch": 0.9891650732950924, - "grad_norm": 36.75556182861328, - "kl": 0.1806640625, - "learning_rate": 1.0834926704907584e-08, - "loss": 0.0072, - "reward": 1.5686193704605103, - "reward_std": 0.1014842763543129, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.44361937046051025, - "step": 3104 - }, - { - "completion_length": 148.953125, - "epoch": 0.9894837476099426, - "grad_norm": 25.972782135009766, - "kl": 0.07666015625, - "learning_rate": 1.0516252390057361e-08, - "loss": 0.0031, - "reward": 1.6567128896713257, - "reward_std": 0.16708610951900482, - "rewards/pad": 0.203125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4535878896713257, - "step": 3105 - }, - { - "completion_length": 74.109375, - "epoch": 0.9898024219247928, - "grad_norm": 121.40696716308594, - "kl": 0.1552734375, - "learning_rate": 1.0197578075207138e-08, - "loss": 0.0062, - "reward": 1.938371181488037, - "reward_std": 0.04860205575823784, - "rewards/pad": 0.375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5633713603019714, - "step": 3106 - }, - { - "completion_length": 123.828125, - "epoch": 0.9901210962396431, - "grad_norm": 69.93238067626953, - "kl": 0.1259765625, - "learning_rate": 9.878903760356915e-09, - "loss": 0.005, - "reward": 1.55735182762146, - "reward_std": 0.06273900717496872, - "rewards/answer_reward": 0.125, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.43235182762145996, - "step": 3107 - }, - { - "completion_length": 19.15625, - "epoch": 0.9904397705544933, - "grad_norm": 266.4374084472656, - "kl": 0.1943359375, - "learning_rate": 9.560229445506692e-09, - "loss": 0.0077, - "reward": 1.6784332990646362, - "reward_std": 0.08986538648605347, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5690582990646362, - "rewards/pad": 0.109375, - "step": 3108 - }, - { - "completion_length": 46.6875, - "epoch": 0.9907584448693435, - "grad_norm": 136.66653442382812, - "kl": 0.2451171875, - "learning_rate": 9.241555130656468e-09, - "loss": 0.0098, - "reward": 1.7314443588256836, - "reward_std": 0.12061554938554764, - "rewards/answer_reward": 0.25, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.481444388628006, - "step": 3109 - }, - { - "completion_length": 69.359375, - "epoch": 0.9910771191841937, - "grad_norm": 95.51264190673828, - "kl": 0.2451171875, - "learning_rate": 8.922880815806247e-09, - "loss": 0.0098, - "reward": 1.639791488647461, - "reward_std": 0.0745251476764679, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6397914886474609, - "rewards/pad": 0.0, - "step": 3110 - }, - { - "completion_length": 95.03125, - "epoch": 0.9913957934990439, - "grad_norm": 36.415000915527344, - "kl": 0.1376953125, - "learning_rate": 8.604206500956022e-09, - "loss": 0.0055, - "reward": 1.479468822479248, - "reward_std": 0.07794281840324402, - "rewards/answer_reward": 0.0, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.47946885228157043, - "step": 3111 - }, - { - "completion_length": 123.328125, - "epoch": 0.9917144678138942, - "grad_norm": 194.49880981445312, - "kl": 0.0830078125, - "learning_rate": 8.2855321861058e-09, - "loss": 0.0033, - "reward": 1.8002910614013672, - "reward_std": 0.0898490622639656, - "rewards/pad": 0.34375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4565410614013672, - "step": 3112 - }, - { - "completion_length": 44.953125, - "epoch": 0.9920331421287444, - "grad_norm": 53.95032501220703, - "kl": 0.169921875, - "learning_rate": 7.966857871255577e-09, - "loss": 0.0068, - "reward": 1.7282993793487549, - "reward_std": 0.09149537980556488, - "rewards/answer_reward": 0.140625, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5876743793487549, - "step": 3113 - }, - { - "completion_length": 94.234375, - "epoch": 0.9923518164435946, - "grad_norm": 42.89377975463867, - "kl": 0.1708984375, - "learning_rate": 7.648183556405352e-09, - "loss": 0.0068, - "reward": 1.5565345287322998, - "reward_std": 0.07186020910739899, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.556534469127655, - "rewards/pad": 0.0, - "step": 3114 - }, - { - "completion_length": 171.921875, - "epoch": 0.9926704907584448, - "grad_norm": 21.411792755126953, - "kl": 0.091796875, - "learning_rate": 7.32950924155513e-09, - "loss": 0.0037, - "reward": 1.6367301940917969, - "reward_std": 0.048075273633003235, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.6367303133010864, - "step": 3115 - }, - { - "completion_length": 72.515625, - "epoch": 0.992989165073295, - "grad_norm": 23.5406551361084, - "kl": 0.203125, - "learning_rate": 7.0108349267049075e-09, - "loss": 0.0081, - "reward": 1.7331008911132812, - "reward_std": 0.13289625942707062, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.483100950717926, - "rewards/pad": 0.25, - "step": 3116 - }, - { - "completion_length": 72.34375, - "epoch": 0.9933078393881453, - "grad_norm": 43.361385345458984, - "kl": 0.20703125, - "learning_rate": 6.692160611854685e-09, - "loss": 0.0083, - "reward": 1.8268316984176636, - "reward_std": 0.1177218109369278, - "rewards/answer_reward": 0.359375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.46745675802230835, - "step": 3117 - }, - { - "completion_length": 124.40625, - "epoch": 0.9936265137029955, - "grad_norm": 120.89412689208984, - "kl": 0.208984375, - "learning_rate": 6.373486297004461e-09, - "loss": 0.0084, - "reward": 1.4763442277908325, - "reward_std": 0.05512676015496254, - "rewards/pad": 0.0, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.4763442575931549, - "step": 3118 - }, - { - "completion_length": 46.765625, - "epoch": 0.9939451880178458, - "grad_norm": 28.565040588378906, - "kl": 0.1962890625, - "learning_rate": 6.054811982154238e-09, - "loss": 0.0079, - "reward": 1.7104213237762451, - "reward_std": 0.1353614330291748, - "rewards/answer_reward": 0.234375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.47604626417160034, - "step": 3119 - }, - { - "completion_length": 47.34375, - "epoch": 0.994263862332696, - "grad_norm": 512.4242553710938, - "kl": 0.181640625, - "learning_rate": 5.7361376673040155e-09, - "loss": 0.0073, - "reward": 1.5802503824234009, - "reward_std": 0.0615379624068737, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5802503228187561, - "rewards/pad": 0.0, - "step": 3120 - }, - { - "completion_length": 175.359375, - "epoch": 0.9945825366475463, - "grad_norm": 10.934308052062988, - "kl": 0.0849609375, - "learning_rate": 5.417463352453792e-09, - "loss": 0.0034, - "reward": 1.454976201057434, - "reward_std": 0.027619700878858566, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.32997626066207886, - "step": 3121 - }, - { - "completion_length": 97.109375, - "epoch": 0.9949012109623965, - "grad_norm": 40.875038146972656, - "kl": 0.205078125, - "learning_rate": 5.098789037603569e-09, - "loss": 0.0082, - "reward": 1.576786756515503, - "reward_std": 0.09565810859203339, - "rewards/pad": 0.125, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.45178669691085815, - "step": 3122 - }, - { - "completion_length": 45.75, - "epoch": 0.9952198852772467, - "grad_norm": 325.85675048828125, - "kl": 0.2451171875, - "learning_rate": 4.780114722753346e-09, - "loss": 0.0098, - "reward": 1.6433312892913818, - "reward_std": 0.13505569100379944, - "rewards/answer_reward": 0.109375, - "rewards/format_reward_gqa": 1.0, - "rewards/iou_glue_reward": 0.5339564085006714, - "step": 3123 - }, - { - "completion_length": 94.4375, - "epoch": 0.9955385595920969, - "grad_norm": 77.18116760253906, - "kl": 0.1787109375, - "learning_rate": 4.4614404079031234e-09, - "loss": 0.0072, - "reward": 1.4464138746261597, - "reward_std": 0.061967186629772186, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.44641390442848206, - "rewards/pad": 0.0, - "step": 3124 - }, - { - "completion_length": 95.96875, - "epoch": 0.9958572339069471, - "grad_norm": 39.98259353637695, - "kl": 0.140625, - "learning_rate": 4.1427660930529e-09, - "loss": 0.0056, - "reward": 1.5200185775756836, - "reward_std": 0.07784561812877655, - "rewards/pad": 0.109375, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.41064363718032837, - "step": 3125 - }, - { - "completion_length": 73.75, - "epoch": 0.9961759082217974, - "grad_norm": 395.03717041015625, - "kl": 0.1435546875, - "learning_rate": 3.824091778202676e-09, - "loss": 0.0057, - "reward": 1.750535011291504, - "reward_std": 0.07563355565071106, - "rewards/pad": 0.25, - "rewards/tracking_format_reward": 1.0, - "rewards/tracking_iou_reward": 0.5005348920822144, - "step": 3126 - }, - { - "completion_length": 96.828125, - "epoch": 0.9964945825366476, - "grad_norm": 53.452335357666016, - "kl": 0.1494140625, - "learning_rate": 3.5054174633524538e-09, - "loss": 0.006, - "reward": 1.6828091144561768, - "reward_std": 0.06455633789300919, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5578091740608215, - "rewards/pad": 0.125, - "step": 3127 - }, - { - "completion_length": 103.171875, - "epoch": 0.9968132568514978, - "grad_norm": 26.258230209350586, - "kl": 0.12890625, - "learning_rate": 3.1867431485022305e-09, - "loss": 0.0051, - "reward": 1.5044879913330078, - "reward_std": 0.10846585035324097, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.31698793172836304, - "rewards/pad": 0.1875, - "step": 3128 - }, - { - "completion_length": 119.0625, - "epoch": 0.997131931166348, - "grad_norm": 29.402008056640625, - "kl": 0.1259765625, - "learning_rate": 2.8680688336520077e-09, - "loss": 0.005, - "reward": 1.634867548942566, - "reward_std": 0.04931756854057312, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6348674893379211, - "rewards/pad": 0.0, - "step": 3129 - }, - { - "completion_length": 121.484375, - "epoch": 0.9974506054811982, - "grad_norm": 64.99362182617188, - "kl": 0.14453125, - "learning_rate": 2.5493945188017845e-09, - "loss": 0.0058, - "reward": 1.6985540390014648, - "reward_std": 0.10358104854822159, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5735540986061096, - "rewards/pad": 0.125, - "step": 3130 - }, - { - "completion_length": 70.90625, - "epoch": 0.9977692797960485, - "grad_norm": 39.68723678588867, - "kl": 0.158203125, - "learning_rate": 2.2307202039515617e-09, - "loss": 0.0063, - "reward": 1.7205939292907715, - "reward_std": 0.07576900720596313, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7205939888954163, - "rewards/pad": 0.0, - "step": 3131 - }, - { - "completion_length": 95.671875, - "epoch": 0.9980879541108987, - "grad_norm": 24.42562484741211, - "kl": 0.1728515625, - "learning_rate": 1.912045889101338e-09, - "loss": 0.0069, - "reward": 1.6577162742614746, - "reward_std": 0.055983833968639374, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6577163338661194, - "rewards/pad": 0.0, - "step": 3132 - }, - { - "completion_length": 95.125, - "epoch": 0.9984066284257489, - "grad_norm": 42.81901931762695, - "kl": 0.140625, - "learning_rate": 1.5933715742511153e-09, - "loss": 0.0056, - "reward": 1.6087919473648071, - "reward_std": 0.04479397088289261, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.6087919473648071, - "rewards/pad": 0.0, - "step": 3133 - }, - { - "completion_length": 46.484375, - "epoch": 0.9987253027405991, - "grad_norm": 32.559547424316406, - "kl": 0.173828125, - "learning_rate": 1.2746972594008923e-09, - "loss": 0.0069, - "reward": 1.7041927576065063, - "reward_std": 0.05296722427010536, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.45419275760650635, - "rewards/pad": 0.25, - "step": 3134 - }, - { - "completion_length": 145.75, - "epoch": 0.9990439770554493, - "grad_norm": 78.43256378173828, - "kl": 0.1162109375, - "learning_rate": 9.56022944550669e-10, - "loss": 0.0046, - "reward": 1.3788959980010986, - "reward_std": 0.03648217022418976, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.3788960576057434, - "rewards/pad": 0.0, - "step": 3135 - }, - { - "completion_length": 71.140625, - "epoch": 0.9993626513702996, - "grad_norm": 29.7988338470459, - "kl": 0.2080078125, - "learning_rate": 6.373486297004461e-10, - "loss": 0.0083, - "reward": 1.7131028175354004, - "reward_std": 0.06020983308553696, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5881027579307556, - "rewards/pad": 0.125, - "step": 3136 - }, - { - "completion_length": 45.84375, - "epoch": 0.9996813256851498, - "grad_norm": 366.3592529296875, - "kl": 0.26171875, - "learning_rate": 3.1867431485022307e-10, - "loss": 0.0105, - "reward": 1.8288544416427612, - "reward_std": 0.11563456058502197, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.5788545608520508, - "rewards/pad": 0.25, - "step": 3137 - }, - { - "completion_length": 19.0, - "epoch": 1.0, - "grad_norm": 117.4539566040039, - "kl": 0.142578125, - "learning_rate": 0.0, - "loss": 0.0058, - "reward": 1.7700499296188354, - "reward_std": 0.03572399169206619, - "rewards/format_reward_tg": 1.0, - "rewards/iou_timestamp_reward": 0.7700498104095459, - "rewards/pad": 0.0, - "step": 3138 - } - ], - "logging_steps": 1.0, - "max_steps": 3138, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 1000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 0.0, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}