diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9538 @@ +{ + "best_global_step": 2475, + "best_metric": 0.3483333396911621, + "best_model_checkpoint": "/mnt/data/user/zhao_jun/tangjixin/output/model/intern3vl-8b-grpo_v2/v19-20250430-174625/checkpoint-2475", + "epoch": 1.0, + "eval_steps": 250, + "global_step": 2475, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.5, + "completions/mean_length": 292.2916717529297, + "completions/min_length": 175.5, + "epoch": 0.00040404040404040404, + "grad_norm": 2.6534149601732357, + "kl": 0.00283050537109375, + "learning_rate": 1.6129032258064515e-09, + "loss": 0.04529620707035065, + "memory(GiB)": 92.98, + "reward": 0.2083333395421505, + "reward_std": 0.3905205577611923, + "rewards/MultiModalAccuracyORM/mean": 0.2083333395421505, + "rewards/MultiModalAccuracyORM/std": 0.3905205577611923, + "step": 1, + "train_speed(iter/s)": 0.011973 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.25, + "completions/mean_length": 238.60417366027832, + "completions/min_length": 109.75, + "epoch": 0.00202020202020202, + "grad_norm": 1.7382476360832968, + "kl": 0.004979610443115234, + "learning_rate": 8.064516129032257e-09, + "loss": 0.005735308863222599, + "memory(GiB)": 104.19, + "reward": 0.18750000558793545, + "reward_std": 0.1695556379854679, + "rewards/MultiModalAccuracyORM/mean": 0.18750000558793545, + "rewards/MultiModalAccuracyORM/std": 0.1695556379854679, + "step": 5, + "train_speed(iter/s)": 0.026061 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.8, + "completions/mean_length": 493.87501831054686, + "completions/min_length": 266.1, + "epoch": 0.00404040404040404, + "grad_norm": 1.6461868811442486, + "kl": 0.0029445648193359374, + "learning_rate": 1.6129032258064514e-08, + "loss": 0.02294178307056427, + "memory(GiB)": 104.37, + "reward": 0.22500000819563865, + "reward_std": 0.308176326751709, + "rewards/MultiModalAccuracyORM/mean": 0.22500000819563865, + "rewards/MultiModalAccuracyORM/std": 0.308176326751709, + "step": 10, + "train_speed(iter/s)": 0.027382 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.8, + "completions/mean_length": 231.4250061035156, + "completions/min_length": 144.3, + "epoch": 0.006060606060606061, + "grad_norm": 3.6175414067372516, + "kl": 0.0058765411376953125, + "learning_rate": 2.4193548387096773e-08, + "loss": -0.020487520098686218, + "memory(GiB)": 107.13, + "reward": 0.4250000178813934, + "reward_std": 0.37195889055728915, + "rewards/MultiModalAccuracyORM/mean": 0.4250000178813934, + "rewards/MultiModalAccuracyORM/std": 0.37195889055728915, + "step": 15, + "train_speed(iter/s)": 0.031173 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/mean_length": 374.85834045410155, + "completions/min_length": 234.0, + "epoch": 0.00808080808080808, + "grad_norm": 2.0453002988188924, + "kl": 0.0025386810302734375, + "learning_rate": 3.225806451612903e-08, + "loss": 0.018081194162368773, + "memory(GiB)": 110.66, + "reward": 0.2833333373069763, + "reward_std": 0.2855865716934204, + "rewards/MultiModalAccuracyORM/mean": 0.2833333373069763, + "rewards/MultiModalAccuracyORM/std": 0.2855865716934204, + "step": 20, + "train_speed(iter/s)": 0.032111 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.3, + "completions/mean_length": 343.33334197998045, + "completions/min_length": 163.6, + "epoch": 0.010101010101010102, + "grad_norm": 2.0297666321727066, + "kl": 0.005942535400390625, + "learning_rate": 4.032258064516129e-08, + "loss": -0.003527432680130005, + "memory(GiB)": 110.66, + "reward": 0.26666667982935904, + "reward_std": 0.3784792721271515, + "rewards/MultiModalAccuracyORM/mean": 0.26666667982935904, + "rewards/MultiModalAccuracyORM/std": 0.3784792721271515, + "step": 25, + "train_speed(iter/s)": 0.03346 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.7, + "completions/mean_length": 279.9750091552734, + "completions/min_length": 170.9, + "epoch": 0.012121212121212121, + "grad_norm": 1.580858331896628, + "kl": 0.0038494110107421876, + "learning_rate": 4.8387096774193546e-08, + "loss": -0.00242428183555603, + "memory(GiB)": 110.68, + "reward": 0.10000000298023223, + "reward_std": 0.2711698323488235, + "rewards/MultiModalAccuracyORM/mean": 0.10000000298023223, + "rewards/MultiModalAccuracyORM/std": 0.2711698323488235, + "step": 30, + "train_speed(iter/s)": 0.034153 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.8, + "completions/mean_length": 286.36667404174807, + "completions/min_length": 165.1, + "epoch": 0.014141414141414142, + "grad_norm": 1.8379975346697042, + "kl": 0.02647857666015625, + "learning_rate": 5.645161290322581e-08, + "loss": 0.00997340977191925, + "memory(GiB)": 110.68, + "reward": 0.25000000521540644, + "reward_std": 0.2200503796339035, + "rewards/MultiModalAccuracyORM/mean": 0.25000000521540644, + "rewards/MultiModalAccuracyORM/std": 0.2200503796339035, + "step": 35, + "train_speed(iter/s)": 0.034524 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.7, + "completions/mean_length": 407.9500198364258, + "completions/min_length": 231.7, + "epoch": 0.01616161616161616, + "grad_norm": 1.879368475551475, + "kl": 0.00126495361328125, + "learning_rate": 6.451612903225806e-08, + "loss": 0.005544811487197876, + "memory(GiB)": 111.72, + "reward": 0.16666667014360428, + "reward_std": 0.32451151907444, + "rewards/MultiModalAccuracyORM/mean": 0.16666667014360428, + "rewards/MultiModalAccuracyORM/std": 0.32451151907444, + "step": 40, + "train_speed(iter/s)": 0.034576 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.7, + "completions/mean_length": 326.12501068115233, + "completions/min_length": 189.6, + "epoch": 0.01818181818181818, + "grad_norm": 0.7460899635365059, + "kl": 0.0039581298828125, + "learning_rate": 7.258064516129032e-08, + "loss": 0.006708705425262451, + "memory(GiB)": 111.74, + "reward": 0.2083333395421505, + "reward_std": 0.22406027615070342, + "rewards/MultiModalAccuracyORM/mean": 0.2083333395421505, + "rewards/MultiModalAccuracyORM/std": 0.22406027615070342, + "step": 45, + "train_speed(iter/s)": 0.034933 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.3, + "completions/mean_length": 274.28333892822263, + "completions/min_length": 131.9, + "epoch": 0.020202020202020204, + "grad_norm": 2.4079312295812714, + "kl": 0.00251922607421875, + "learning_rate": 8.064516129032257e-08, + "loss": 0.015183356404304505, + "memory(GiB)": 111.74, + "reward": 0.21666667386889457, + "reward_std": 0.25738072395324707, + "rewards/MultiModalAccuracyORM/mean": 0.21666667386889457, + "rewards/MultiModalAccuracyORM/std": 0.25738072395324707, + "step": 50, + "train_speed(iter/s)": 0.035232 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.5, + "completions/mean_length": 365.0666778564453, + "completions/min_length": 191.6, + "epoch": 0.022222222222222223, + "grad_norm": 0.014705836185752576, + "kl": 0.004721450805664063, + "learning_rate": 8.870967741935484e-08, + "loss": 0.01203818917274475, + "memory(GiB)": 111.74, + "reward": 0.32500001043081284, + "reward_std": 0.3044206529855728, + "rewards/MultiModalAccuracyORM/mean": 0.32500001043081284, + "rewards/MultiModalAccuracyORM/std": 0.3044206529855728, + "step": 55, + "train_speed(iter/s)": 0.035135 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.3, + "completions/mean_length": 338.5833435058594, + "completions/min_length": 199.7, + "epoch": 0.024242424242424242, + "grad_norm": 2.6954085340696765, + "kl": 0.0020017623901367188, + "learning_rate": 9.677419354838709e-08, + "loss": -0.005992072820663452, + "memory(GiB)": 111.74, + "reward": 0.18333333507180213, + "reward_std": 0.33354574739933013, + "rewards/MultiModalAccuracyORM/mean": 0.18333333507180213, + "rewards/MultiModalAccuracyORM/std": 0.33354574739933013, + "step": 60, + "train_speed(iter/s)": 0.035177 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.8, + "completions/mean_length": 363.6166793823242, + "completions/min_length": 208.5, + "epoch": 0.026262626262626262, + "grad_norm": 3.0115754925592952, + "kl": 0.0037433624267578123, + "learning_rate": 1.0483870967741934e-07, + "loss": -0.03836339712142944, + "memory(GiB)": 111.74, + "reward": 0.2666666738688946, + "reward_std": 0.4085534304380417, + "rewards/MultiModalAccuracyORM/mean": 0.2666666738688946, + "rewards/MultiModalAccuracyORM/std": 0.4085534304380417, + "step": 65, + "train_speed(iter/s)": 0.035437 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.2, + "completions/mean_length": 377.9750099182129, + "completions/min_length": 204.0, + "epoch": 0.028282828282828285, + "grad_norm": 1.7279437509176054, + "kl": 0.001779937744140625, + "learning_rate": 1.1290322580645162e-07, + "loss": -0.05415753722190857, + "memory(GiB)": 111.74, + "reward": 0.3000000074505806, + "reward_std": 0.30035116374492643, + "rewards/MultiModalAccuracyORM/mean": 0.3000000074505806, + "rewards/MultiModalAccuracyORM/std": 0.30035116374492643, + "step": 70, + "train_speed(iter/s)": 0.035665 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.6, + "completions/mean_length": 242.45834197998047, + "completions/min_length": 116.8, + "epoch": 0.030303030303030304, + "grad_norm": 3.0031072335906597, + "kl": 0.002858734130859375, + "learning_rate": 1.2096774193548387e-07, + "loss": 0.03029954433441162, + "memory(GiB)": 111.74, + "reward": 0.26666667237877845, + "reward_std": 0.36043521761894226, + "rewards/MultiModalAccuracyORM/mean": 0.26666667237877845, + "rewards/MultiModalAccuracyORM/std": 0.36043521761894226, + "step": 75, + "train_speed(iter/s)": 0.036005 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.016666666666666666, + "completions/max_length": 776.6, + "completions/mean_length": 435.741682434082, + "completions/min_length": 231.3, + "epoch": 0.03232323232323232, + "grad_norm": 0.42303978897841893, + "kl": 0.0016681671142578125, + "learning_rate": 1.2903225806451611e-07, + "loss": 0.049380439519882205, + "memory(GiB)": 111.74, + "reward": 0.325000012665987, + "reward_std": 0.3008513689041138, + "rewards/MultiModalAccuracyORM/mean": 0.325000012665987, + "rewards/MultiModalAccuracyORM/std": 0.3008513689041138, + "step": 80, + "train_speed(iter/s)": 0.035635 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.1, + "completions/mean_length": 302.8333435058594, + "completions/min_length": 166.0, + "epoch": 0.03434343434343434, + "grad_norm": 2.6438328703498097, + "kl": 0.00451507568359375, + "learning_rate": 1.3709677419354838e-07, + "loss": -0.0442815363407135, + "memory(GiB)": 111.74, + "reward": 0.2833333402872086, + "reward_std": 0.3933126300573349, + "rewards/MultiModalAccuracyORM/mean": 0.2833333402872086, + "rewards/MultiModalAccuracyORM/std": 0.3933126300573349, + "step": 85, + "train_speed(iter/s)": 0.035979 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/mean_length": 381.02501525878904, + "completions/min_length": 183.3, + "epoch": 0.03636363636363636, + "grad_norm": 1.74840980915549, + "kl": 0.0013660430908203126, + "learning_rate": 1.4516129032258064e-07, + "loss": 0.07182409167289734, + "memory(GiB)": 111.74, + "reward": 0.30000000521540643, + "reward_std": 0.35937642157077787, + "rewards/MultiModalAccuracyORM/mean": 0.30000000521540643, + "rewards/MultiModalAccuracyORM/std": 0.35937642157077787, + "step": 90, + "train_speed(iter/s)": 0.035659 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.2, + "completions/mean_length": 325.55834197998047, + "completions/min_length": 170.8, + "epoch": 0.03838383838383838, + "grad_norm": 0.04177816415582162, + "kl": 0.014581298828125, + "learning_rate": 1.5322580645161288e-07, + "loss": 0.029976147413253783, + "memory(GiB)": 111.74, + "reward": 0.18333333879709243, + "reward_std": 0.2358713388442993, + "rewards/MultiModalAccuracyORM/mean": 0.18333333879709243, + "rewards/MultiModalAccuracyORM/std": 0.2358713388442993, + "step": 95, + "train_speed(iter/s)": 0.035533 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.2, + "completions/mean_length": 339.98334045410155, + "completions/min_length": 187.8, + "epoch": 0.04040404040404041, + "grad_norm": 3.190540630566101, + "kl": 0.004257583618164062, + "learning_rate": 1.6129032258064515e-07, + "loss": 0.0416176974773407, + "memory(GiB)": 111.74, + "reward": 0.28333334252238274, + "reward_std": 0.3247897386550903, + "rewards/MultiModalAccuracyORM/mean": 0.28333334252238274, + "rewards/MultiModalAccuracyORM/std": 0.3247897386550903, + "step": 100, + "train_speed(iter/s)": 0.035677 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 589.7, + "completions/mean_length": 345.52500610351564, + "completions/min_length": 173.9, + "epoch": 0.04242424242424243, + "grad_norm": 3.073635935584006, + "kl": 0.00194549560546875, + "learning_rate": 1.6935483870967741e-07, + "loss": 0.042548298835754395, + "memory(GiB)": 111.74, + "reward": 0.2000000111758709, + "reward_std": 0.2611959934234619, + "rewards/MultiModalAccuracyORM/mean": 0.2000000111758709, + "rewards/MultiModalAccuracyORM/std": 0.2611959934234619, + "step": 105, + "train_speed(iter/s)": 0.035468 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 745.5, + "completions/mean_length": 380.5166748046875, + "completions/min_length": 225.9, + "epoch": 0.044444444444444446, + "grad_norm": 0.9626100429708261, + "kl": 0.0016246795654296874, + "learning_rate": 1.7741935483870968e-07, + "loss": -0.02766646146774292, + "memory(GiB)": 111.74, + "reward": 0.1916666731238365, + "reward_std": 0.3073477536439896, + "rewards/MultiModalAccuracyORM/mean": 0.1916666731238365, + "rewards/MultiModalAccuracyORM/std": 0.3073477536439896, + "step": 110, + "train_speed(iter/s)": 0.035455 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/mean_length": 311.90000915527344, + "completions/min_length": 154.2, + "epoch": 0.046464646464646465, + "grad_norm": 1.342836390340581, + "kl": 0.008540725708007813, + "learning_rate": 1.8548387096774192e-07, + "loss": -0.010879068076610566, + "memory(GiB)": 111.74, + "reward": 0.10000000074505806, + "reward_std": 0.22228264510631562, + "rewards/MultiModalAccuracyORM/mean": 0.10000000074505806, + "rewards/MultiModalAccuracyORM/std": 0.22228264510631562, + "step": 115, + "train_speed(iter/s)": 0.035535 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/mean_length": 288.9583435058594, + "completions/min_length": 165.6, + "epoch": 0.048484848484848485, + "grad_norm": 2.6619939115206135, + "kl": 0.00256195068359375, + "learning_rate": 1.9354838709677418e-07, + "loss": 0.033258992433547976, + "memory(GiB)": 111.74, + "reward": 0.4083333469927311, + "reward_std": 0.40963622033596037, + "rewards/MultiModalAccuracyORM/mean": 0.4083333469927311, + "rewards/MultiModalAccuracyORM/std": 0.40963622033596037, + "step": 120, + "train_speed(iter/s)": 0.035724 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/mean_length": 379.45001220703125, + "completions/min_length": 187.1, + "epoch": 0.050505050505050504, + "grad_norm": 1.321049130692736, + "kl": 0.0020069122314453126, + "learning_rate": 2e-07, + "loss": -0.019822967052459717, + "memory(GiB)": 111.74, + "reward": 0.2916666708886623, + "reward_std": 0.32370694279670714, + "rewards/MultiModalAccuracyORM/mean": 0.2916666708886623, + "rewards/MultiModalAccuracyORM/std": 0.32370694279670714, + "step": 125, + "train_speed(iter/s)": 0.035602 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.7, + "completions/mean_length": 316.5916748046875, + "completions/min_length": 171.4, + "epoch": 0.052525252525252523, + "grad_norm": 2.460967418512405, + "kl": 0.0105987548828125, + "learning_rate": 2e-07, + "loss": 0.0003096837550401688, + "memory(GiB)": 111.74, + "reward": 0.20833333656191827, + "reward_std": 0.29007510244846346, + "rewards/MultiModalAccuracyORM/mean": 0.20833333656191827, + "rewards/MultiModalAccuracyORM/std": 0.29007510244846346, + "step": 130, + "train_speed(iter/s)": 0.035448 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.4, + "completions/mean_length": 387.5166763305664, + "completions/min_length": 184.3, + "epoch": 0.05454545454545454, + "grad_norm": 0.059862028341158974, + "kl": 0.011987686157226562, + "learning_rate": 2e-07, + "loss": -0.011434757709503173, + "memory(GiB)": 111.74, + "reward": 0.1083333358168602, + "reward_std": 0.25866150557994844, + "rewards/MultiModalAccuracyORM/mean": 0.1083333358168602, + "rewards/MultiModalAccuracyORM/std": 0.25866150557994844, + "step": 135, + "train_speed(iter/s)": 0.035278 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.3, + "completions/mean_length": 382.4166778564453, + "completions/min_length": 206.3, + "epoch": 0.05656565656565657, + "grad_norm": 0.8204164270444702, + "kl": 0.002767181396484375, + "learning_rate": 2e-07, + "loss": 0.004211039841175079, + "memory(GiB)": 111.74, + "reward": 0.27500001192092893, + "reward_std": 0.2777498096227646, + "rewards/MultiModalAccuracyORM/mean": 0.27500001192092893, + "rewards/MultiModalAccuracyORM/std": 0.2777498096227646, + "step": 140, + "train_speed(iter/s)": 0.035472 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.4, + "completions/mean_length": 358.13333892822266, + "completions/min_length": 230.0, + "epoch": 0.05858585858585859, + "grad_norm": 2.288187560312466, + "kl": 0.006110763549804688, + "learning_rate": 2e-07, + "loss": -6.483197212219239e-05, + "memory(GiB)": 111.74, + "reward": 0.13333334028720856, + "reward_std": 0.19964569807052612, + "rewards/MultiModalAccuracyORM/mean": 0.13333334028720856, + "rewards/MultiModalAccuracyORM/std": 0.19964569807052612, + "step": 145, + "train_speed(iter/s)": 0.035406 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.8, + "completions/mean_length": 361.4166763305664, + "completions/min_length": 210.5, + "epoch": 0.06060606060606061, + "grad_norm": 0.015594201645230225, + "kl": 0.015087890625, + "learning_rate": 2e-07, + "loss": 0.015390211343765258, + "memory(GiB)": 111.74, + "reward": 0.14166667237877845, + "reward_std": 0.21374862194061278, + "rewards/MultiModalAccuracyORM/mean": 0.14166667237877845, + "rewards/MultiModalAccuracyORM/std": 0.21374862194061278, + "step": 150, + "train_speed(iter/s)": 0.035348 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.4, + "completions/mean_length": 268.90834045410156, + "completions/min_length": 145.7, + "epoch": 0.06262626262626263, + "grad_norm": 1.9984607447420715, + "kl": 0.009865570068359374, + "learning_rate": 2e-07, + "loss": 0.041778740286827085, + "memory(GiB)": 111.74, + "reward": 0.15000000596046448, + "reward_std": 0.2238060563802719, + "rewards/MultiModalAccuracyORM/mean": 0.15000000596046448, + "rewards/MultiModalAccuracyORM/std": 0.2238060563802719, + "step": 155, + "train_speed(iter/s)": 0.035429 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.6, + "completions/mean_length": 307.5416748046875, + "completions/min_length": 163.1, + "epoch": 0.06464646464646465, + "grad_norm": 1.9710039404778148, + "kl": 0.0016231536865234375, + "learning_rate": 2e-07, + "loss": 0.06229003667831421, + "memory(GiB)": 111.74, + "reward": 0.2583333395421505, + "reward_std": 0.35413345992565154, + "rewards/MultiModalAccuracyORM/mean": 0.2583333395421505, + "rewards/MultiModalAccuracyORM/std": 0.35413345992565154, + "step": 160, + "train_speed(iter/s)": 0.035424 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.8, + "completions/mean_length": 392.75000915527346, + "completions/min_length": 207.7, + "epoch": 0.06666666666666667, + "grad_norm": 1.4786377917798241, + "kl": 0.009944915771484375, + "learning_rate": 2e-07, + "loss": 0.01215519905090332, + "memory(GiB)": 111.74, + "reward": 0.24166667237877845, + "reward_std": 0.28784283697605134, + "rewards/MultiModalAccuracyORM/mean": 0.24166667237877845, + "rewards/MultiModalAccuracyORM/std": 0.28784283697605134, + "step": 165, + "train_speed(iter/s)": 0.035279 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.7, + "completions/mean_length": 280.4750061035156, + "completions/min_length": 144.3, + "epoch": 0.06868686868686869, + "grad_norm": 3.7940420455147077, + "kl": 0.019321441650390625, + "learning_rate": 2e-07, + "loss": -0.022571200132369997, + "memory(GiB)": 111.74, + "reward": 0.30833334028720855, + "reward_std": 0.365692725777626, + "rewards/MultiModalAccuracyORM/mean": 0.30833334028720855, + "rewards/MultiModalAccuracyORM/std": 0.365692725777626, + "step": 170, + "train_speed(iter/s)": 0.035381 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.2, + "completions/mean_length": 346.808341217041, + "completions/min_length": 159.3, + "epoch": 0.0707070707070707, + "grad_norm": 1.6037297839480729, + "kl": 0.0017574310302734375, + "learning_rate": 2e-07, + "loss": 0.05014150142669678, + "memory(GiB)": 111.74, + "reward": 0.35000001415610316, + "reward_std": 0.3534030318260193, + "rewards/MultiModalAccuracyORM/mean": 0.35000001415610316, + "rewards/MultiModalAccuracyORM/std": 0.3534030318260193, + "step": 175, + "train_speed(iter/s)": 0.035382 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/mean_length": 324.31666870117186, + "completions/min_length": 202.0, + "epoch": 0.07272727272727272, + "grad_norm": 2.7315358529507865, + "kl": 0.0067108154296875, + "learning_rate": 2e-07, + "loss": 0.017354550957679748, + "memory(GiB)": 111.74, + "reward": 0.10833333730697632, + "reward_std": 0.2448128044605255, + "rewards/MultiModalAccuracyORM/mean": 0.10833333730697632, + "rewards/MultiModalAccuracyORM/std": 0.2448128044605255, + "step": 180, + "train_speed(iter/s)": 0.035416 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.3, + "completions/mean_length": 270.41667861938475, + "completions/min_length": 138.9, + "epoch": 0.07474747474747474, + "grad_norm": 2.314028672730481, + "kl": 0.002983856201171875, + "learning_rate": 2e-07, + "loss": 0.033014419674873355, + "memory(GiB)": 111.74, + "reward": 0.3333333425223827, + "reward_std": 0.2566834628582001, + "rewards/MultiModalAccuracyORM/mean": 0.3333333425223827, + "rewards/MultiModalAccuracyORM/std": 0.2566834628582001, + "step": 185, + "train_speed(iter/s)": 0.035387 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.6, + "completions/mean_length": 341.2416763305664, + "completions/min_length": 181.1, + "epoch": 0.07676767676767676, + "grad_norm": 2.3931438253006387, + "kl": 0.00200347900390625, + "learning_rate": 2e-07, + "loss": 0.038839906454086304, + "memory(GiB)": 111.74, + "reward": 0.17500000596046447, + "reward_std": 0.2684228092432022, + "rewards/MultiModalAccuracyORM/mean": 0.17500000596046447, + "rewards/MultiModalAccuracyORM/std": 0.2684228092432022, + "step": 190, + "train_speed(iter/s)": 0.03545 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.3, + "completions/mean_length": 375.1000045776367, + "completions/min_length": 215.9, + "epoch": 0.07878787878787878, + "grad_norm": 1.8630040945251685, + "kl": 0.002384376525878906, + "learning_rate": 2e-07, + "loss": -0.015469104051589966, + "memory(GiB)": 111.74, + "reward": 0.1583333395421505, + "reward_std": 0.27148365080356596, + "rewards/MultiModalAccuracyORM/mean": 0.1583333395421505, + "rewards/MultiModalAccuracyORM/std": 0.27148365080356596, + "step": 195, + "train_speed(iter/s)": 0.035415 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.2, + "completions/mean_length": 379.8833435058594, + "completions/min_length": 200.8, + "epoch": 0.08080808080808081, + "grad_norm": 2.200570213421646, + "kl": 0.0036174774169921873, + "learning_rate": 2e-07, + "loss": 0.006271684169769287, + "memory(GiB)": 111.74, + "reward": 0.25000000447034837, + "reward_std": 0.42421777844429015, + "rewards/MultiModalAccuracyORM/mean": 0.25000000447034837, + "rewards/MultiModalAccuracyORM/std": 0.42421777844429015, + "step": 200, + "train_speed(iter/s)": 0.035369 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.3, + "completions/mean_length": 345.00001220703126, + "completions/min_length": 174.6, + "epoch": 0.08282828282828283, + "grad_norm": 1.1008615802288388, + "kl": 0.0024932861328125, + "learning_rate": 2e-07, + "loss": 0.006234277784824371, + "memory(GiB)": 111.74, + "reward": 0.16666667237877847, + "reward_std": 0.2938547760248184, + "rewards/MultiModalAccuracyORM/mean": 0.16666667237877847, + "rewards/MultiModalAccuracyORM/std": 0.2938547760248184, + "step": 205, + "train_speed(iter/s)": 0.035338 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.3, + "completions/mean_length": 269.37500762939453, + "completions/min_length": 147.6, + "epoch": 0.08484848484848485, + "grad_norm": 3.476093319706285, + "kl": 0.0026340484619140625, + "learning_rate": 2e-07, + "loss": -0.0015334427356719972, + "memory(GiB)": 111.74, + "reward": 0.25000000447034837, + "reward_std": 0.300192129611969, + "rewards/MultiModalAccuracyORM/mean": 0.25000000447034837, + "rewards/MultiModalAccuracyORM/std": 0.300192129611969, + "step": 210, + "train_speed(iter/s)": 0.035464 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.8, + "completions/mean_length": 285.75000762939453, + "completions/min_length": 148.7, + "epoch": 0.08686868686868687, + "grad_norm": 2.1593026278667984, + "kl": 0.006510162353515625, + "learning_rate": 2e-07, + "loss": -0.015721744298934935, + "memory(GiB)": 111.74, + "reward": 0.21666667088866234, + "reward_std": 0.3470772713422775, + "rewards/MultiModalAccuracyORM/mean": 0.21666667088866234, + "rewards/MultiModalAccuracyORM/std": 0.3470772713422775, + "step": 215, + "train_speed(iter/s)": 0.035439 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.7, + "completions/mean_length": 354.20001220703125, + "completions/min_length": 199.6, + "epoch": 0.08888888888888889, + "grad_norm": 3.7456181210533077, + "kl": 0.004998016357421875, + "learning_rate": 2e-07, + "loss": -0.02768584489822388, + "memory(GiB)": 111.74, + "reward": 0.28333333879709244, + "reward_std": 0.28452777564525605, + "rewards/MultiModalAccuracyORM/mean": 0.28333333879709244, + "rewards/MultiModalAccuracyORM/std": 0.28452777564525605, + "step": 220, + "train_speed(iter/s)": 0.035428 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.5, + "completions/mean_length": 311.5416778564453, + "completions/min_length": 177.8, + "epoch": 0.09090909090909091, + "grad_norm": 2.0378307788473684, + "kl": 0.002862548828125, + "learning_rate": 2e-07, + "loss": 0.003831219673156738, + "memory(GiB)": 111.74, + "reward": 0.4000000111758709, + "reward_std": 0.3752594023942947, + "rewards/MultiModalAccuracyORM/mean": 0.4000000111758709, + "rewards/MultiModalAccuracyORM/std": 0.3752594023942947, + "step": 225, + "train_speed(iter/s)": 0.035407 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.5, + "completions/mean_length": 371.4583435058594, + "completions/min_length": 190.4, + "epoch": 0.09292929292929293, + "grad_norm": 2.1323681326918855, + "kl": 0.0035661697387695313, + "learning_rate": 2e-07, + "loss": 0.0016314834356307983, + "memory(GiB)": 111.74, + "reward": 0.2083333432674408, + "reward_std": 0.3477985322475433, + "rewards/MultiModalAccuracyORM/mean": 0.2083333432674408, + "rewards/MultiModalAccuracyORM/std": 0.3477985322475433, + "step": 230, + "train_speed(iter/s)": 0.035371 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/mean_length": 287.6916778564453, + "completions/min_length": 168.0, + "epoch": 0.09494949494949495, + "grad_norm": 3.249083513364966, + "kl": 0.00834503173828125, + "learning_rate": 2e-07, + "loss": -0.004596877098083496, + "memory(GiB)": 111.74, + "reward": 0.13333333730697633, + "reward_std": 0.19513316750526427, + "rewards/MultiModalAccuracyORM/mean": 0.13333333730697633, + "rewards/MultiModalAccuracyORM/std": 0.19513316750526427, + "step": 235, + "train_speed(iter/s)": 0.03535 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.4, + "completions/mean_length": 316.3583450317383, + "completions/min_length": 173.8, + "epoch": 0.09696969696969697, + "grad_norm": 2.412571205764537, + "kl": 0.005106735229492188, + "learning_rate": 2e-07, + "loss": 0.004295679926872254, + "memory(GiB)": 111.74, + "reward": 0.23333333879709245, + "reward_std": 0.3171865612268448, + "rewards/MultiModalAccuracyORM/mean": 0.23333333879709245, + "rewards/MultiModalAccuracyORM/std": 0.3171865612268448, + "step": 240, + "train_speed(iter/s)": 0.035314 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.3, + "completions/mean_length": 298.17500457763674, + "completions/min_length": 166.5, + "epoch": 0.09898989898989899, + "grad_norm": 1.9493555044308044, + "kl": 0.003982925415039062, + "learning_rate": 2e-07, + "loss": -0.04734513759613037, + "memory(GiB)": 111.74, + "reward": 0.2333333395421505, + "reward_std": 0.3471368670463562, + "rewards/MultiModalAccuracyORM/mean": 0.2333333395421505, + "rewards/MultiModalAccuracyORM/std": 0.3471368670463562, + "step": 245, + "train_speed(iter/s)": 0.035338 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 1.3381064401700158, + "learning_rate": 2e-07, + "loss": -0.013491255044937134, + "memory(GiB)": 111.78, + "step": 250, + "train_speed(iter/s)": 0.035321 + }, + { + "epoch": 0.10101010101010101, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0016666666666666666, + "eval_completions/max_length": 567.88, + "eval_completions/mean_length": 340.8433419799805, + "eval_completions/min_length": 176.68, + "eval_kl": 0.0008290672302246094, + "eval_loss": 0.011471391655504704, + "eval_reward": 0.25833333894610405, + "eval_reward_std": 0.3269642275571823, + "eval_rewards/MultiModalAccuracyORM/mean": 0.25833333894610405, + "eval_rewards/MultiModalAccuracyORM/std": 0.3269642275571823, + "eval_runtime": 589.5277, + "eval_samples_per_second": 0.085, + "eval_steps_per_second": 0.008, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.2, + "completions/mean_length": 405.27917556762696, + "completions/min_length": 229.2, + "epoch": 0.10303030303030303, + "grad_norm": 1.3096626974864818, + "kl": 0.002015495300292969, + "learning_rate": 2e-07, + "loss": 0.022876815497875215, + "memory(GiB)": 113.5, + "reward": 0.21250000447034836, + "reward_std": 0.2526913657784462, + "rewards/MultiModalAccuracyORM/mean": 0.21250000447034836, + "rewards/MultiModalAccuracyORM/std": 0.2526913657784462, + "step": 255, + "train_speed(iter/s)": 0.031791 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.6, + "completions/mean_length": 291.32500915527345, + "completions/min_length": 161.1, + "epoch": 0.10505050505050505, + "grad_norm": 2.7968135195637585, + "kl": 0.0034709930419921874, + "learning_rate": 2e-07, + "loss": 0.02938370406627655, + "memory(GiB)": 113.5, + "reward": 0.2333333410322666, + "reward_std": 0.30821192264556885, + "rewards/MultiModalAccuracyORM/mean": 0.2333333410322666, + "rewards/MultiModalAccuracyORM/std": 0.30821192264556885, + "step": 260, + "train_speed(iter/s)": 0.031882 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.9, + "completions/mean_length": 381.02501220703124, + "completions/min_length": 193.9, + "epoch": 0.10707070707070707, + "grad_norm": 2.2674884321553908, + "kl": 0.0033966064453125, + "learning_rate": 2e-07, + "loss": 0.03137490749359131, + "memory(GiB)": 113.5, + "reward": 0.20000000149011612, + "reward_std": 0.3492949903011322, + "rewards/MultiModalAccuracyORM/mean": 0.20000000149011612, + "rewards/MultiModalAccuracyORM/std": 0.3492949903011322, + "step": 265, + "train_speed(iter/s)": 0.031856 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.5, + "completions/mean_length": 384.0666763305664, + "completions/min_length": 238.4, + "epoch": 0.10909090909090909, + "grad_norm": 1.4757764767450905, + "kl": 0.006084823608398437, + "learning_rate": 2e-07, + "loss": 0.012543919682502746, + "memory(GiB)": 113.5, + "reward": 0.3000000141561031, + "reward_std": 0.42771587073802947, + "rewards/MultiModalAccuracyORM/mean": 0.3000000141561031, + "rewards/MultiModalAccuracyORM/std": 0.42771587073802947, + "step": 270, + "train_speed(iter/s)": 0.031865 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.9, + "completions/mean_length": 362.15000610351564, + "completions/min_length": 202.6, + "epoch": 0.1111111111111111, + "grad_norm": 2.133208686622741, + "kl": 0.004328155517578125, + "learning_rate": 2e-07, + "loss": 0.014178204536437988, + "memory(GiB)": 113.5, + "reward": 0.3083333447575569, + "reward_std": 0.35184402465820314, + "rewards/MultiModalAccuracyORM/mean": 0.3083333447575569, + "rewards/MultiModalAccuracyORM/std": 0.35184402465820314, + "step": 275, + "train_speed(iter/s)": 0.031998 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.6, + "completions/mean_length": 274.9250061035156, + "completions/min_length": 153.3, + "epoch": 0.11313131313131314, + "grad_norm": 2.320837755784546, + "kl": 0.002793121337890625, + "learning_rate": 2e-07, + "loss": -0.002980351448059082, + "memory(GiB)": 113.5, + "reward": 0.2666666738688946, + "reward_std": 0.30639869570732114, + "rewards/MultiModalAccuracyORM/mean": 0.2666666738688946, + "rewards/MultiModalAccuracyORM/std": 0.30639869570732114, + "step": 280, + "train_speed(iter/s)": 0.032128 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03333333333333333, + "completions/max_length": 807.9, + "completions/mean_length": 470.2083465576172, + "completions/min_length": 219.6, + "epoch": 0.11515151515151516, + "grad_norm": 1.5979399011587243, + "kl": 0.006278228759765625, + "learning_rate": 2e-07, + "loss": 0.01850479543209076, + "memory(GiB)": 113.5, + "reward": 0.39166667088866236, + "reward_std": 0.4097074121236801, + "rewards/MultiModalAccuracyORM/mean": 0.39166667088866236, + "rewards/MultiModalAccuracyORM/std": 0.4097074121236801, + "step": 285, + "train_speed(iter/s)": 0.032047 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.6, + "completions/mean_length": 375.62501373291013, + "completions/min_length": 199.8, + "epoch": 0.11717171717171718, + "grad_norm": 1.6711790369238562, + "kl": 0.002816009521484375, + "learning_rate": 2e-07, + "loss": 0.05777819156646728, + "memory(GiB)": 113.5, + "reward": 0.34166667237877846, + "reward_std": 0.34181976318359375, + "rewards/MultiModalAccuracyORM/mean": 0.34166667237877846, + "rewards/MultiModalAccuracyORM/std": 0.34181976318359375, + "step": 290, + "train_speed(iter/s)": 0.032072 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.7, + "completions/mean_length": 373.40001068115237, + "completions/min_length": 222.1, + "epoch": 0.1191919191919192, + "grad_norm": 1.2952752164962844, + "kl": 0.006529617309570313, + "learning_rate": 2e-07, + "loss": 0.02864307165145874, + "memory(GiB)": 113.5, + "reward": 0.21666667386889457, + "reward_std": 0.22631654143333435, + "rewards/MultiModalAccuracyORM/mean": 0.21666667386889457, + "rewards/MultiModalAccuracyORM/std": 0.22631654143333435, + "step": 295, + "train_speed(iter/s)": 0.032146 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.7, + "completions/mean_length": 389.9750091552734, + "completions/min_length": 260.7, + "epoch": 0.12121212121212122, + "grad_norm": 2.5199865002602895, + "kl": 0.00448150634765625, + "learning_rate": 2e-07, + "loss": 0.0044337153434753414, + "memory(GiB)": 113.5, + "reward": 0.3583333417773247, + "reward_std": 0.3886078953742981, + "rewards/MultiModalAccuracyORM/mean": 0.3583333417773247, + "rewards/MultiModalAccuracyORM/std": 0.3886078953742981, + "step": 300, + "train_speed(iter/s)": 0.03218 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.3, + "completions/mean_length": 298.0666748046875, + "completions/min_length": 161.4, + "epoch": 0.12323232323232323, + "grad_norm": 0.04178305906141455, + "kl": 0.00428619384765625, + "learning_rate": 2e-07, + "loss": -0.04246575832366943, + "memory(GiB)": 113.5, + "reward": 0.10000000223517418, + "reward_std": 0.20118070244789124, + "rewards/MultiModalAccuracyORM/mean": 0.10000000223517418, + "rewards/MultiModalAccuracyORM/std": 0.20118070244789124, + "step": 305, + "train_speed(iter/s)": 0.032256 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.8, + "completions/mean_length": 311.39167404174805, + "completions/min_length": 131.0, + "epoch": 0.12525252525252525, + "grad_norm": 0.041069103688074135, + "kl": 0.004656982421875, + "learning_rate": 2e-07, + "loss": 0.024589771032333375, + "memory(GiB)": 113.5, + "reward": 0.23333334401249886, + "reward_std": 0.274494343996048, + "rewards/MultiModalAccuracyORM/mean": 0.23333334401249886, + "rewards/MultiModalAccuracyORM/std": 0.274494343996048, + "step": 310, + "train_speed(iter/s)": 0.032348 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.4, + "completions/mean_length": 349.1750030517578, + "completions/min_length": 191.6, + "epoch": 0.12727272727272726, + "grad_norm": 1.4578057904181938, + "kl": 0.008466339111328125, + "learning_rate": 2e-07, + "loss": 0.019071149826049804, + "memory(GiB)": 113.5, + "reward": 0.18333334103226662, + "reward_std": 0.24637180864810942, + "rewards/MultiModalAccuracyORM/mean": 0.18333334103226662, + "rewards/MultiModalAccuracyORM/std": 0.24637180864810942, + "step": 315, + "train_speed(iter/s)": 0.032385 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.1, + "completions/mean_length": 305.83334426879884, + "completions/min_length": 177.6, + "epoch": 0.1292929292929293, + "grad_norm": 2.0332697577512895, + "kl": 0.003513336181640625, + "learning_rate": 2e-07, + "loss": 0.012425613403320313, + "memory(GiB)": 113.5, + "reward": 0.2583333395421505, + "reward_std": 0.3207202464342117, + "rewards/MultiModalAccuracyORM/mean": 0.2583333395421505, + "rewards/MultiModalAccuracyORM/std": 0.3207202464342117, + "step": 320, + "train_speed(iter/s)": 0.032468 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.6, + "completions/mean_length": 350.608341217041, + "completions/min_length": 207.5, + "epoch": 0.13131313131313133, + "grad_norm": 2.9017059326660206, + "kl": 0.008218002319335938, + "learning_rate": 2e-07, + "loss": -0.007495748996734619, + "memory(GiB)": 113.5, + "reward": 0.24166667237877845, + "reward_std": 0.2847819983959198, + "rewards/MultiModalAccuracyORM/mean": 0.24166667237877845, + "rewards/MultiModalAccuracyORM/std": 0.2847819983959198, + "step": 325, + "train_speed(iter/s)": 0.032489 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.7, + "completions/mean_length": 348.37501220703126, + "completions/min_length": 230.4, + "epoch": 0.13333333333333333, + "grad_norm": 2.0452895180997612, + "kl": 0.00405426025390625, + "learning_rate": 2e-07, + "loss": 0.012925130128860474, + "memory(GiB)": 113.5, + "reward": 0.2250000059604645, + "reward_std": 0.34633229672908783, + "rewards/MultiModalAccuracyORM/mean": 0.2250000059604645, + "rewards/MultiModalAccuracyORM/std": 0.34633229672908783, + "step": 330, + "train_speed(iter/s)": 0.032601 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.4, + "completions/mean_length": 391.80001068115234, + "completions/min_length": 205.6, + "epoch": 0.13535353535353536, + "grad_norm": 2.3689531245965014, + "kl": 0.0037220001220703127, + "learning_rate": 2e-07, + "loss": -0.02884441614151001, + "memory(GiB)": 113.5, + "reward": 0.34166667610406876, + "reward_std": 0.3244759202003479, + "rewards/MultiModalAccuracyORM/mean": 0.34166667610406876, + "rewards/MultiModalAccuracyORM/std": 0.3244759202003479, + "step": 335, + "train_speed(iter/s)": 0.032628 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.7, + "completions/mean_length": 393.9500137329102, + "completions/min_length": 210.7, + "epoch": 0.13737373737373737, + "grad_norm": 3.1268062962961447, + "kl": 0.00513458251953125, + "learning_rate": 2e-07, + "loss": -0.007295359671115875, + "memory(GiB)": 113.5, + "reward": 0.12500000447034837, + "reward_std": 0.2837377518415451, + "rewards/MultiModalAccuracyORM/mean": 0.12500000447034837, + "rewards/MultiModalAccuracyORM/std": 0.2837377518415451, + "step": 340, + "train_speed(iter/s)": 0.032638 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.2, + "completions/mean_length": 325.42500457763674, + "completions/min_length": 202.4, + "epoch": 0.1393939393939394, + "grad_norm": 2.570539853128275, + "kl": 0.010897064208984375, + "learning_rate": 2e-07, + "loss": -0.03583614826202393, + "memory(GiB)": 113.5, + "reward": 0.23333333879709245, + "reward_std": 0.28154108226299285, + "rewards/MultiModalAccuracyORM/mean": 0.23333333879709245, + "rewards/MultiModalAccuracyORM/std": 0.28154108226299285, + "step": 345, + "train_speed(iter/s)": 0.032636 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.3, + "completions/mean_length": 348.71667327880857, + "completions/min_length": 202.0, + "epoch": 0.1414141414141414, + "grad_norm": 1.4744760782673672, + "kl": 0.005255126953125, + "learning_rate": 2e-07, + "loss": 0.06839704513549805, + "memory(GiB)": 113.5, + "reward": 0.3416666738688946, + "reward_std": 0.3267677813768387, + "rewards/MultiModalAccuracyORM/mean": 0.3416666738688946, + "rewards/MultiModalAccuracyORM/std": 0.3267677813768387, + "step": 350, + "train_speed(iter/s)": 0.032723 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 685.7, + "completions/mean_length": 395.8666763305664, + "completions/min_length": 217.6, + "epoch": 0.14343434343434344, + "grad_norm": 0.032365545804024926, + "kl": 0.00413818359375, + "learning_rate": 2e-07, + "loss": -0.008323472738265992, + "memory(GiB)": 113.5, + "reward": 0.24166667610406875, + "reward_std": 0.29187673330307007, + "rewards/MultiModalAccuracyORM/mean": 0.24166667610406875, + "rewards/MultiModalAccuracyORM/std": 0.29187673330307007, + "step": 355, + "train_speed(iter/s)": 0.032744 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.9, + "completions/mean_length": 327.34167633056643, + "completions/min_length": 184.3, + "epoch": 0.14545454545454545, + "grad_norm": 1.1619770767978876, + "kl": 0.01970672607421875, + "learning_rate": 2e-07, + "loss": 0.014476829767227173, + "memory(GiB)": 113.5, + "reward": 0.3916666731238365, + "reward_std": 0.35942656397819517, + "rewards/MultiModalAccuracyORM/mean": 0.3916666731238365, + "rewards/MultiModalAccuracyORM/std": 0.35942656397819517, + "step": 360, + "train_speed(iter/s)": 0.032848 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.8, + "completions/mean_length": 331.90001220703124, + "completions/min_length": 222.0, + "epoch": 0.14747474747474748, + "grad_norm": 1.4073504269814208, + "kl": 0.006307220458984375, + "learning_rate": 2e-07, + "loss": 0.03325994312763214, + "memory(GiB)": 113.5, + "reward": 0.05833333432674408, + "reward_std": 0.16069675385951995, + "rewards/MultiModalAccuracyORM/mean": 0.05833333432674408, + "rewards/MultiModalAccuracyORM/std": 0.16069675385951995, + "step": 365, + "train_speed(iter/s)": 0.032856 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/mean_length": 423.8916839599609, + "completions/min_length": 252.2, + "epoch": 0.1494949494949495, + "grad_norm": 1.4976657581094635, + "kl": 0.006170654296875, + "learning_rate": 2e-07, + "loss": -0.01670956760644913, + "memory(GiB)": 113.5, + "reward": 0.20000000223517417, + "reward_std": 0.21999078392982482, + "rewards/MultiModalAccuracyORM/mean": 0.20000000223517417, + "rewards/MultiModalAccuracyORM/std": 0.21999078392982482, + "step": 370, + "train_speed(iter/s)": 0.032832 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.2, + "completions/mean_length": 363.00001068115233, + "completions/min_length": 182.0, + "epoch": 0.15151515151515152, + "grad_norm": 2.481807345956626, + "kl": 0.0046051025390625, + "learning_rate": 2e-07, + "loss": 0.04444247186183929, + "memory(GiB)": 113.5, + "reward": 0.400000012665987, + "reward_std": 0.3985941380262375, + "rewards/MultiModalAccuracyORM/mean": 0.400000012665987, + "rewards/MultiModalAccuracyORM/std": 0.3985941380262375, + "step": 375, + "train_speed(iter/s)": 0.032805 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.2, + "completions/mean_length": 362.1333435058594, + "completions/min_length": 207.8, + "epoch": 0.15353535353535352, + "grad_norm": 1.225556055703092, + "kl": 0.01065216064453125, + "learning_rate": 2e-07, + "loss": 0.0010599255561828612, + "memory(GiB)": 113.5, + "reward": 0.2250000022351742, + "reward_std": 0.22698737680912018, + "rewards/MultiModalAccuracyORM/mean": 0.2250000022351742, + "rewards/MultiModalAccuracyORM/std": 0.22698737680912018, + "step": 380, + "train_speed(iter/s)": 0.032797 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.5, + "completions/mean_length": 259.9750068664551, + "completions/min_length": 151.0, + "epoch": 0.15555555555555556, + "grad_norm": 3.170333391476991, + "kl": 0.010870361328125, + "learning_rate": 2e-07, + "loss": 0.04853119254112244, + "memory(GiB)": 113.5, + "reward": 0.4500000074505806, + "reward_std": 0.32345272302627565, + "rewards/MultiModalAccuracyORM/mean": 0.4500000074505806, + "rewards/MultiModalAccuracyORM/std": 0.32345272302627565, + "step": 385, + "train_speed(iter/s)": 0.032869 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.9, + "completions/mean_length": 359.0833465576172, + "completions/min_length": 170.4, + "epoch": 0.15757575757575756, + "grad_norm": 1.6322015536148482, + "kl": 0.00597076416015625, + "learning_rate": 2e-07, + "loss": -0.003878127783536911, + "memory(GiB)": 113.5, + "reward": 0.19166667237877846, + "reward_std": 0.3196614503860474, + "rewards/MultiModalAccuracyORM/mean": 0.19166667237877846, + "rewards/MultiModalAccuracyORM/std": 0.3196614503860474, + "step": 390, + "train_speed(iter/s)": 0.032905 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.1, + "completions/mean_length": 429.06668014526366, + "completions/min_length": 281.5, + "epoch": 0.1595959595959596, + "grad_norm": 2.750918910992668, + "kl": 0.059673309326171875, + "learning_rate": 2e-07, + "loss": 0.016079676151275635, + "memory(GiB)": 113.5, + "reward": 0.14166666865348815, + "reward_std": 0.23854664266109465, + "rewards/MultiModalAccuracyORM/mean": 0.14166666865348815, + "rewards/MultiModalAccuracyORM/std": 0.23854664266109465, + "step": 395, + "train_speed(iter/s)": 0.032918 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 722.7, + "completions/mean_length": 381.7416793823242, + "completions/min_length": 187.8, + "epoch": 0.16161616161616163, + "grad_norm": 1.276714724002977, + "kl": 0.004840087890625, + "learning_rate": 2e-07, + "loss": 0.030894118547439575, + "memory(GiB)": 113.5, + "reward": 0.2750000074505806, + "reward_std": 0.21374862194061278, + "rewards/MultiModalAccuracyORM/mean": 0.2750000074505806, + "rewards/MultiModalAccuracyORM/std": 0.21374862194061278, + "step": 400, + "train_speed(iter/s)": 0.032861 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/mean_length": 292.2416748046875, + "completions/min_length": 188.5, + "epoch": 0.16363636363636364, + "grad_norm": 1.285497466986634, + "kl": 0.00401611328125, + "learning_rate": 2e-07, + "loss": -0.00028939247131347655, + "memory(GiB)": 113.5, + "reward": 0.25833333656191826, + "reward_std": 0.2986306995153427, + "rewards/MultiModalAccuracyORM/mean": 0.25833333656191826, + "rewards/MultiModalAccuracyORM/std": 0.2986306995153427, + "step": 405, + "train_speed(iter/s)": 0.032956 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.5, + "completions/mean_length": 332.90001373291017, + "completions/min_length": 195.8, + "epoch": 0.16565656565656567, + "grad_norm": 2.4986293478171695, + "kl": 0.0099639892578125, + "learning_rate": 2e-07, + "loss": 0.01775420904159546, + "memory(GiB)": 113.5, + "reward": 0.14166666939854622, + "reward_std": 0.2355453997850418, + "rewards/MultiModalAccuracyORM/mean": 0.14166666939854622, + "rewards/MultiModalAccuracyORM/std": 0.2355453997850418, + "step": 410, + "train_speed(iter/s)": 0.032979 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.6, + "completions/mean_length": 352.77500915527344, + "completions/min_length": 189.9, + "epoch": 0.16767676767676767, + "grad_norm": 1.8788296454969475, + "kl": 0.00422210693359375, + "learning_rate": 2e-07, + "loss": -0.005545926094055176, + "memory(GiB)": 113.5, + "reward": 0.32500001043081284, + "reward_std": 0.3388330668210983, + "rewards/MultiModalAccuracyORM/mean": 0.32500001043081284, + "rewards/MultiModalAccuracyORM/std": 0.3388330668210983, + "step": 415, + "train_speed(iter/s)": 0.033025 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 690.9, + "completions/mean_length": 414.40001068115237, + "completions/min_length": 239.5, + "epoch": 0.1696969696969697, + "grad_norm": 0.07032446522446908, + "kl": 0.005554962158203125, + "learning_rate": 2e-07, + "loss": -0.002293400466442108, + "memory(GiB)": 113.5, + "reward": 0.20833333879709243, + "reward_std": 0.21973656117916107, + "rewards/MultiModalAccuracyORM/mean": 0.20833333879709243, + "rewards/MultiModalAccuracyORM/std": 0.21973656117916107, + "step": 420, + "train_speed(iter/s)": 0.032985 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.6, + "completions/mean_length": 308.7250091552734, + "completions/min_length": 175.5, + "epoch": 0.1717171717171717, + "grad_norm": 1.4798323094999317, + "kl": 0.00482025146484375, + "learning_rate": 2e-07, + "loss": 0.01790083050727844, + "memory(GiB)": 113.5, + "reward": 0.25000000521540644, + "reward_std": 0.2104335606098175, + "rewards/MultiModalAccuracyORM/mean": 0.25000000521540644, + "rewards/MultiModalAccuracyORM/std": 0.2104335606098175, + "step": 425, + "train_speed(iter/s)": 0.033033 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.9, + "completions/mean_length": 350.28334655761716, + "completions/min_length": 202.6, + "epoch": 0.17373737373737375, + "grad_norm": 1.9633281758859618, + "kl": 0.004430389404296875, + "learning_rate": 2e-07, + "loss": 0.0008227840065956116, + "memory(GiB)": 113.5, + "reward": 0.37500001713633535, + "reward_std": 0.3780064254999161, + "rewards/MultiModalAccuracyORM/mean": 0.37500001713633535, + "rewards/MultiModalAccuracyORM/std": 0.3780064254999161, + "step": 430, + "train_speed(iter/s)": 0.033105 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.3, + "completions/mean_length": 264.40834045410156, + "completions/min_length": 139.7, + "epoch": 0.17575757575757575, + "grad_norm": 1.9529808864934317, + "kl": 0.00596923828125, + "learning_rate": 2e-07, + "loss": -0.06038873791694641, + "memory(GiB)": 113.5, + "reward": 0.3333333387970924, + "reward_std": 0.29837648272514344, + "rewards/MultiModalAccuracyORM/mean": 0.3333333387970924, + "rewards/MultiModalAccuracyORM/std": 0.29837648272514344, + "step": 435, + "train_speed(iter/s)": 0.033193 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.5, + "completions/mean_length": 296.8333374023438, + "completions/min_length": 171.6, + "epoch": 0.17777777777777778, + "grad_norm": 0.03169449948005974, + "kl": 0.00481719970703125, + "learning_rate": 2e-07, + "loss": 0.018176303803920747, + "memory(GiB)": 113.5, + "reward": 0.25000000968575475, + "reward_std": 0.2596701592206955, + "rewards/MultiModalAccuracyORM/mean": 0.25000000968575475, + "rewards/MultiModalAccuracyORM/std": 0.2596701592206955, + "step": 440, + "train_speed(iter/s)": 0.03327 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.5, + "completions/mean_length": 268.50834197998046, + "completions/min_length": 126.3, + "epoch": 0.1797979797979798, + "grad_norm": 2.4262437209194774, + "kl": 0.0057281494140625, + "learning_rate": 2e-07, + "loss": -0.034365218877792356, + "memory(GiB)": 113.5, + "reward": 0.2500000074505806, + "reward_std": 0.38001427948474886, + "rewards/MultiModalAccuracyORM/mean": 0.2500000074505806, + "rewards/MultiModalAccuracyORM/std": 0.38001427948474886, + "step": 445, + "train_speed(iter/s)": 0.033325 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.4, + "completions/mean_length": 337.42501373291014, + "completions/min_length": 194.1, + "epoch": 0.18181818181818182, + "grad_norm": 2.3770604401183997, + "kl": 0.00361785888671875, + "learning_rate": 2e-07, + "loss": -0.010681581497192384, + "memory(GiB)": 113.5, + "reward": 0.2833333358168602, + "reward_std": 0.24490799605846406, + "rewards/MultiModalAccuracyORM/mean": 0.2833333358168602, + "rewards/MultiModalAccuracyORM/std": 0.24490799605846406, + "step": 450, + "train_speed(iter/s)": 0.033355 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 645.6, + "completions/mean_length": 383.0333450317383, + "completions/min_length": 228.9, + "epoch": 0.18383838383838383, + "grad_norm": 1.5212583244692293, + "kl": 0.0044342041015625, + "learning_rate": 2e-07, + "loss": 0.010468679666519164, + "memory(GiB)": 113.5, + "reward": 0.22500000447034835, + "reward_std": 0.29815449118614196, + "rewards/MultiModalAccuracyORM/mean": 0.22500000447034835, + "rewards/MultiModalAccuracyORM/std": 0.29815449118614196, + "step": 455, + "train_speed(iter/s)": 0.033387 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 682.1, + "completions/mean_length": 331.59167556762696, + "completions/min_length": 148.5, + "epoch": 0.18585858585858586, + "grad_norm": 2.3101338751804605, + "kl": 0.005951690673828125, + "learning_rate": 2e-07, + "loss": 0.013955891132354736, + "memory(GiB)": 113.5, + "reward": 0.2083333395421505, + "reward_std": 0.3207202464342117, + "rewards/MultiModalAccuracyORM/mean": 0.2083333395421505, + "rewards/MultiModalAccuracyORM/std": 0.3207202464342117, + "step": 460, + "train_speed(iter/s)": 0.033356 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.1, + "completions/mean_length": 324.4416748046875, + "completions/min_length": 189.3, + "epoch": 0.18787878787878787, + "grad_norm": 1.9306296492930712, + "kl": 0.00476531982421875, + "learning_rate": 2e-07, + "loss": 0.0007774412631988525, + "memory(GiB)": 113.5, + "reward": 0.20833333805203438, + "reward_std": 0.18332210481166838, + "rewards/MultiModalAccuracyORM/mean": 0.20833333805203438, + "rewards/MultiModalAccuracyORM/std": 0.18332210481166838, + "step": 465, + "train_speed(iter/s)": 0.03337 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.5, + "completions/mean_length": 451.75001220703126, + "completions/min_length": 242.0, + "epoch": 0.1898989898989899, + "grad_norm": 2.9489928820712117, + "kl": 0.003478240966796875, + "learning_rate": 2e-07, + "loss": 0.0002551078796386719, + "memory(GiB)": 113.5, + "reward": 0.14166666865348815, + "reward_std": 0.22453648447990418, + "rewards/MultiModalAccuracyORM/mean": 0.14166666865348815, + "rewards/MultiModalAccuracyORM/std": 0.22453648447990418, + "step": 470, + "train_speed(iter/s)": 0.033353 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.2, + "completions/mean_length": 419.60001983642576, + "completions/min_length": 252.7, + "epoch": 0.1919191919191919, + "grad_norm": 1.657148402320105, + "kl": 0.00272979736328125, + "learning_rate": 2e-07, + "loss": -0.02806915044784546, + "memory(GiB)": 113.5, + "reward": 0.25000000894069674, + "reward_std": 0.3011055916547775, + "rewards/MultiModalAccuracyORM/mean": 0.25000000894069674, + "rewards/MultiModalAccuracyORM/std": 0.3011055916547775, + "step": 475, + "train_speed(iter/s)": 0.033331 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.1, + "completions/mean_length": 373.71667633056643, + "completions/min_length": 256.2, + "epoch": 0.19393939393939394, + "grad_norm": 2.869711221257181, + "kl": 0.0064971923828125, + "learning_rate": 2e-07, + "loss": -0.002555108070373535, + "memory(GiB)": 113.5, + "reward": 0.3916666768491268, + "reward_std": 0.2636824816465378, + "rewards/MultiModalAccuracyORM/mean": 0.3916666768491268, + "rewards/MultiModalAccuracyORM/std": 0.2636824816465378, + "step": 480, + "train_speed(iter/s)": 0.033361 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.2, + "completions/mean_length": 391.5833404541016, + "completions/min_length": 218.1, + "epoch": 0.19595959595959597, + "grad_norm": 1.9631879540052586, + "kl": 0.005725860595703125, + "learning_rate": 2e-07, + "loss": 0.0018699795007705688, + "memory(GiB)": 113.5, + "reward": 0.14166667237877845, + "reward_std": 0.15595400035381318, + "rewards/MultiModalAccuracyORM/mean": 0.14166667237877845, + "rewards/MultiModalAccuracyORM/std": 0.15595400035381318, + "step": 485, + "train_speed(iter/s)": 0.033379 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/mean_length": 295.8583442687988, + "completions/min_length": 156.6, + "epoch": 0.19797979797979798, + "grad_norm": 0.037793670384228664, + "kl": 0.0073211669921875, + "learning_rate": 2e-07, + "loss": 0.020484793186187743, + "memory(GiB)": 113.5, + "reward": 0.20000000149011612, + "reward_std": 0.24483142793178558, + "rewards/MultiModalAccuracyORM/mean": 0.20000000149011612, + "rewards/MultiModalAccuracyORM/std": 0.24483142793178558, + "step": 490, + "train_speed(iter/s)": 0.033414 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.9, + "completions/mean_length": 380.616682434082, + "completions/min_length": 220.0, + "epoch": 0.2, + "grad_norm": 2.1512862837965163, + "kl": 0.003929901123046875, + "learning_rate": 2e-07, + "loss": 0.0034599393606185914, + "memory(GiB)": 113.5, + "reward": 0.30000000521540643, + "reward_std": 0.30715312659740446, + "rewards/MultiModalAccuracyORM/mean": 0.30000000521540643, + "rewards/MultiModalAccuracyORM/std": 0.30715312659740446, + "step": 495, + "train_speed(iter/s)": 0.033449 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 2.239910097717952, + "learning_rate": 2e-07, + "loss": 0.014047640562057494, + "memory(GiB)": 113.5, + "step": 500, + "train_speed(iter/s)": 0.033495 + }, + { + "epoch": 0.20202020202020202, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0016666666666666666, + "eval_completions/max_length": 591.26, + "eval_completions/mean_length": 358.19000946044923, + "eval_completions/min_length": 202.24, + "eval_kl": 0.002655487060546875, + "eval_loss": 0.00915438961237669, + "eval_reward": 0.22833333894610405, + "eval_reward_std": 0.28466624081134795, + "eval_rewards/MultiModalAccuracyORM/mean": 0.22833333894610405, + "eval_rewards/MultiModalAccuracyORM/std": 0.28466624081134795, + "eval_runtime": 608.1673, + "eval_samples_per_second": 0.082, + "eval_steps_per_second": 0.008, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.35, + "completions/mean_length": 332.39167404174805, + "completions/min_length": 199.1, + "epoch": 0.20404040404040405, + "grad_norm": 2.3622087713081186, + "kl": 0.004245758056640625, + "learning_rate": 2e-07, + "loss": -0.00013803243637084962, + "memory(GiB)": 113.5, + "reward": 0.3125000067055225, + "reward_std": 0.3219920754432678, + "rewards/MultiModalAccuracyORM/mean": 0.3125000067055225, + "rewards/MultiModalAccuracyORM/std": 0.3219920754432678, + "step": 505, + "train_speed(iter/s)": 0.031802 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.3, + "completions/mean_length": 374.4833450317383, + "completions/min_length": 209.5, + "epoch": 0.20606060606060606, + "grad_norm": 1.7757575475794216, + "kl": 0.006531524658203125, + "learning_rate": 2e-07, + "loss": 0.03503022789955139, + "memory(GiB)": 113.5, + "reward": 0.29166667312383654, + "reward_std": 0.28778324127197263, + "rewards/MultiModalAccuracyORM/mean": 0.29166667312383654, + "rewards/MultiModalAccuracyORM/std": 0.28778324127197263, + "step": 510, + "train_speed(iter/s)": 0.031819 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.2, + "completions/mean_length": 300.96667633056643, + "completions/min_length": 179.3, + "epoch": 0.2080808080808081, + "grad_norm": 2.2727530064482235, + "kl": 0.01416778564453125, + "learning_rate": 2e-07, + "loss": 0.022283512353897094, + "memory(GiB)": 113.5, + "reward": 0.24166667610406875, + "reward_std": 0.3347875773906708, + "rewards/MultiModalAccuracyORM/mean": 0.24166667610406875, + "rewards/MultiModalAccuracyORM/std": 0.3347875773906708, + "step": 515, + "train_speed(iter/s)": 0.03184 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.9, + "completions/mean_length": 341.6000099182129, + "completions/min_length": 175.8, + "epoch": 0.2101010101010101, + "grad_norm": 1.1487867895660082, + "kl": 0.00421295166015625, + "learning_rate": 2e-07, + "loss": 0.04290072023868561, + "memory(GiB)": 113.5, + "reward": 0.3666666761040688, + "reward_std": 0.28399197161197665, + "rewards/MultiModalAccuracyORM/mean": 0.3666666761040688, + "rewards/MultiModalAccuracyORM/std": 0.28399197161197665, + "step": 520, + "train_speed(iter/s)": 0.03186 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.5, + "completions/mean_length": 344.0500091552734, + "completions/min_length": 175.8, + "epoch": 0.21212121212121213, + "grad_norm": 2.2941717609617767, + "kl": 0.0046539306640625, + "learning_rate": 2e-07, + "loss": 0.004269888997077942, + "memory(GiB)": 113.5, + "reward": 0.30833333879709246, + "reward_std": 0.3267677813768387, + "rewards/MultiModalAccuracyORM/mean": 0.30833333879709246, + "rewards/MultiModalAccuracyORM/std": 0.3267677813768387, + "step": 525, + "train_speed(iter/s)": 0.031902 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.2, + "completions/mean_length": 345.83334197998045, + "completions/min_length": 172.9, + "epoch": 0.21414141414141413, + "grad_norm": 1.2948745647020719, + "kl": 0.004862213134765625, + "learning_rate": 2e-07, + "loss": -0.007743622362613678, + "memory(GiB)": 113.5, + "reward": 0.33333333805203436, + "reward_std": 0.25897532403469087, + "rewards/MultiModalAccuracyORM/mean": 0.33333333805203436, + "rewards/MultiModalAccuracyORM/std": 0.25897532403469087, + "step": 530, + "train_speed(iter/s)": 0.031973 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/mean_length": 277.8500061035156, + "completions/min_length": 127.7, + "epoch": 0.21616161616161617, + "grad_norm": 2.820652916445064, + "kl": 0.004701995849609375, + "learning_rate": 2e-07, + "loss": 0.019122210144996644, + "memory(GiB)": 113.5, + "reward": 0.25833334028720856, + "reward_std": 0.38930273354053496, + "rewards/MultiModalAccuracyORM/mean": 0.25833334028720856, + "rewards/MultiModalAccuracyORM/std": 0.38930273354053496, + "step": 535, + "train_speed(iter/s)": 0.032051 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.3, + "completions/mean_length": 311.0666732788086, + "completions/min_length": 155.9, + "epoch": 0.21818181818181817, + "grad_norm": 0.02000320571323216, + "kl": 0.006194305419921875, + "learning_rate": 2e-07, + "loss": 0.023233750462532045, + "memory(GiB)": 113.5, + "reward": 0.29166667237877847, + "reward_std": 0.26298522055149076, + "rewards/MultiModalAccuracyORM/mean": 0.29166667237877847, + "rewards/MultiModalAccuracyORM/std": 0.26298522055149076, + "step": 540, + "train_speed(iter/s)": 0.032099 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.1, + "completions/mean_length": 298.1750099182129, + "completions/min_length": 159.8, + "epoch": 0.2202020202020202, + "grad_norm": 1.7992434949177767, + "kl": 0.00469207763671875, + "learning_rate": 2e-07, + "loss": 0.015616017580032348, + "memory(GiB)": 113.5, + "reward": 0.32500000596046447, + "reward_std": 0.22704697251319886, + "rewards/MultiModalAccuracyORM/mean": 0.32500000596046447, + "rewards/MultiModalAccuracyORM/std": 0.22704697251319886, + "step": 545, + "train_speed(iter/s)": 0.032156 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.6, + "completions/mean_length": 273.84167633056643, + "completions/min_length": 145.6, + "epoch": 0.2222222222222222, + "grad_norm": 2.8559923799679794, + "kl": 0.00508270263671875, + "learning_rate": 2e-07, + "loss": 0.050173360109329226, + "memory(GiB)": 113.5, + "reward": 0.37500001341104505, + "reward_std": 0.33303394317626955, + "rewards/MultiModalAccuracyORM/mean": 0.37500001341104505, + "rewards/MultiModalAccuracyORM/std": 0.33303394317626955, + "step": 550, + "train_speed(iter/s)": 0.032216 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.5, + "completions/mean_length": 421.71667633056643, + "completions/min_length": 240.9, + "epoch": 0.22424242424242424, + "grad_norm": 2.3260782482625366, + "kl": 0.005718994140625, + "learning_rate": 2e-07, + "loss": 0.02654660940170288, + "memory(GiB)": 113.5, + "reward": 0.36666667759418486, + "reward_std": 0.46648178398609164, + "rewards/MultiModalAccuracyORM/mean": 0.36666667759418486, + "rewards/MultiModalAccuracyORM/std": 0.46648178398609164, + "step": 555, + "train_speed(iter/s)": 0.032254 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.9, + "completions/mean_length": 376.9666717529297, + "completions/min_length": 211.5, + "epoch": 0.22626262626262628, + "grad_norm": 1.9191123297699473, + "kl": 0.0039215087890625, + "learning_rate": 2e-07, + "loss": 0.013482053577899934, + "memory(GiB)": 113.5, + "reward": 0.1750000037252903, + "reward_std": 0.3042020261287689, + "rewards/MultiModalAccuracyORM/mean": 0.1750000037252903, + "rewards/MultiModalAccuracyORM/std": 0.3042020261287689, + "step": 560, + "train_speed(iter/s)": 0.032243 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.5, + "completions/mean_length": 348.8333450317383, + "completions/min_length": 184.5, + "epoch": 0.22828282828282828, + "grad_norm": 2.0009650914845873, + "kl": 0.01170501708984375, + "learning_rate": 2e-07, + "loss": 0.035267585515975954, + "memory(GiB)": 113.5, + "reward": 0.3583333410322666, + "reward_std": 0.38205191493034363, + "rewards/MultiModalAccuracyORM/mean": 0.3583333410322666, + "rewards/MultiModalAccuracyORM/std": 0.38205191493034363, + "step": 565, + "train_speed(iter/s)": 0.032295 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.6, + "completions/mean_length": 319.6166748046875, + "completions/min_length": 181.1, + "epoch": 0.23030303030303031, + "grad_norm": 0.1996246857202343, + "kl": 0.005075836181640625, + "learning_rate": 2e-07, + "loss": -0.02471494972705841, + "memory(GiB)": 113.5, + "reward": 0.24166666939854622, + "reward_std": 0.2549654275178909, + "rewards/MultiModalAccuracyORM/mean": 0.24166666939854622, + "rewards/MultiModalAccuracyORM/std": 0.2549654275178909, + "step": 570, + "train_speed(iter/s)": 0.032297 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/mean_length": 338.40833892822263, + "completions/min_length": 187.7, + "epoch": 0.23232323232323232, + "grad_norm": 2.3362669602060033, + "kl": 0.00451202392578125, + "learning_rate": 2e-07, + "loss": 0.03307419717311859, + "memory(GiB)": 113.5, + "reward": 0.2000000037252903, + "reward_std": 0.3081523299217224, + "rewards/MultiModalAccuracyORM/mean": 0.2000000037252903, + "rewards/MultiModalAccuracyORM/std": 0.3081523299217224, + "step": 575, + "train_speed(iter/s)": 0.03234 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.9, + "completions/mean_length": 292.68334197998047, + "completions/min_length": 158.3, + "epoch": 0.23434343434343435, + "grad_norm": 2.8417938649503394, + "kl": 0.014810943603515625, + "learning_rate": 2e-07, + "loss": -0.03590070009231568, + "memory(GiB)": 113.5, + "reward": 0.3500000089406967, + "reward_std": 0.39629932343959806, + "rewards/MultiModalAccuracyORM/mean": 0.3500000089406967, + "rewards/MultiModalAccuracyORM/std": 0.39629932343959806, + "step": 580, + "train_speed(iter/s)": 0.032381 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.8, + "completions/mean_length": 396.57501831054685, + "completions/min_length": 239.6, + "epoch": 0.23636363636363636, + "grad_norm": 0.03715745404820811, + "kl": 0.00491180419921875, + "learning_rate": 2e-07, + "loss": -0.0016106054186820983, + "memory(GiB)": 113.5, + "reward": 0.25000000596046446, + "reward_std": 0.27749558687210085, + "rewards/MultiModalAccuracyORM/mean": 0.25000000596046446, + "rewards/MultiModalAccuracyORM/std": 0.27749558687210085, + "step": 585, + "train_speed(iter/s)": 0.032385 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.1, + "completions/mean_length": 371.41667861938475, + "completions/min_length": 206.0, + "epoch": 0.2383838383838384, + "grad_norm": 2.5904505607936237, + "kl": 0.0038330078125, + "learning_rate": 2e-07, + "loss": -0.0013609230518341064, + "memory(GiB)": 113.5, + "reward": 0.4416666768491268, + "reward_std": 0.3044206529855728, + "rewards/MultiModalAccuracyORM/mean": 0.4416666768491268, + "rewards/MultiModalAccuracyORM/std": 0.3044206529855728, + "step": 590, + "train_speed(iter/s)": 0.032404 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.8, + "completions/mean_length": 332.2250091552734, + "completions/min_length": 175.5, + "epoch": 0.2404040404040404, + "grad_norm": 3.252161568752739, + "kl": 0.00532073974609375, + "learning_rate": 2e-07, + "loss": 0.022338399291038515, + "memory(GiB)": 113.5, + "reward": 0.316666679084301, + "reward_std": 0.35766714811325073, + "rewards/MultiModalAccuracyORM/mean": 0.316666679084301, + "rewards/MultiModalAccuracyORM/std": 0.35766714811325073, + "step": 595, + "train_speed(iter/s)": 0.032438 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.6, + "completions/mean_length": 366.18334045410154, + "completions/min_length": 210.1, + "epoch": 0.24242424242424243, + "grad_norm": 1.7160058152461715, + "kl": 0.0050140380859375, + "learning_rate": 2e-07, + "loss": -0.0045736730098724365, + "memory(GiB)": 113.5, + "reward": 0.3250000074505806, + "reward_std": 0.32682737708091736, + "rewards/MultiModalAccuracyORM/mean": 0.3250000074505806, + "rewards/MultiModalAccuracyORM/std": 0.32682737708091736, + "step": 600, + "train_speed(iter/s)": 0.032452 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.016666666666666666, + "completions/max_length": 968.4, + "completions/mean_length": 407.6500114440918, + "completions/min_length": 221.4, + "epoch": 0.24444444444444444, + "grad_norm": 1.5763528784256282, + "kl": 0.0037322998046875, + "learning_rate": 2e-07, + "loss": 0.003979828953742981, + "memory(GiB)": 113.5, + "reward": 0.30000000819563866, + "reward_std": 0.4196960777044296, + "rewards/MultiModalAccuracyORM/mean": 0.30000000819563866, + "rewards/MultiModalAccuracyORM/std": 0.4196960777044296, + "step": 605, + "train_speed(iter/s)": 0.0324 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.6, + "completions/mean_length": 293.57500610351565, + "completions/min_length": 166.8, + "epoch": 0.24646464646464647, + "grad_norm": 3.2425538671850047, + "kl": 0.009912109375, + "learning_rate": 2e-07, + "loss": 0.024757757782936096, + "memory(GiB)": 113.5, + "reward": 0.20000000670552254, + "reward_std": 0.2184557795524597, + "rewards/MultiModalAccuracyORM/mean": 0.20000000670552254, + "rewards/MultiModalAccuracyORM/std": 0.2184557795524597, + "step": 610, + "train_speed(iter/s)": 0.032454 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.6, + "completions/mean_length": 272.27501068115237, + "completions/min_length": 161.5, + "epoch": 0.24848484848484848, + "grad_norm": 2.9002217301843682, + "kl": 0.006238555908203125, + "learning_rate": 2e-07, + "loss": 0.006809020042419433, + "memory(GiB)": 113.5, + "reward": 0.2083333373069763, + "reward_std": 0.28784283697605134, + "rewards/MultiModalAccuracyORM/mean": 0.2083333373069763, + "rewards/MultiModalAccuracyORM/std": 0.28784283697605134, + "step": 615, + "train_speed(iter/s)": 0.032521 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.6, + "completions/mean_length": 323.6333435058594, + "completions/min_length": 178.6, + "epoch": 0.2505050505050505, + "grad_norm": 1.6543202512317519, + "kl": 0.005059814453125, + "learning_rate": 2e-07, + "loss": -0.013031059503555298, + "memory(GiB)": 113.5, + "reward": 0.21666667088866234, + "reward_std": 0.22625694572925567, + "rewards/MultiModalAccuracyORM/mean": 0.21666667088866234, + "rewards/MultiModalAccuracyORM/std": 0.22625694572925567, + "step": 620, + "train_speed(iter/s)": 0.032557 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.9, + "completions/mean_length": 368.2416793823242, + "completions/min_length": 204.2, + "epoch": 0.25252525252525254, + "grad_norm": 1.9398904097162017, + "kl": 0.00662689208984375, + "learning_rate": 2e-07, + "loss": 0.020694077014923096, + "memory(GiB)": 113.5, + "reward": 0.20833333805203438, + "reward_std": 0.2567190587520599, + "rewards/MultiModalAccuracyORM/mean": 0.20833333805203438, + "rewards/MultiModalAccuracyORM/std": 0.2567190587520599, + "step": 625, + "train_speed(iter/s)": 0.032596 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.2, + "completions/mean_length": 298.1750030517578, + "completions/min_length": 161.6, + "epoch": 0.2545454545454545, + "grad_norm": 3.9909953401900657, + "kl": 0.00722808837890625, + "learning_rate": 2e-07, + "loss": 0.012149769067764282, + "memory(GiB)": 113.5, + "reward": 0.3500000089406967, + "reward_std": 0.21594529151916503, + "rewards/MultiModalAccuracyORM/mean": 0.3500000089406967, + "rewards/MultiModalAccuracyORM/std": 0.21594529151916503, + "step": 630, + "train_speed(iter/s)": 0.03264 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.6, + "completions/mean_length": 335.81667404174806, + "completions/min_length": 203.4, + "epoch": 0.25656565656565655, + "grad_norm": 3.223504719612698, + "kl": 0.00595703125, + "learning_rate": 2e-07, + "loss": 0.0355703592300415, + "memory(GiB)": 113.5, + "reward": 0.39166667833924296, + "reward_std": 0.3838055461645126, + "rewards/MultiModalAccuracyORM/mean": 0.39166667833924296, + "rewards/MultiModalAccuracyORM/std": 0.3838055461645126, + "step": 635, + "train_speed(iter/s)": 0.032663 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/mean_length": 399.683349609375, + "completions/min_length": 215.3, + "epoch": 0.2585858585858586, + "grad_norm": 0.031047629899617252, + "kl": 0.00643310546875, + "learning_rate": 2e-07, + "loss": -0.002796703577041626, + "memory(GiB)": 113.5, + "reward": 0.20000000670552254, + "reward_std": 0.24866367280483245, + "rewards/MultiModalAccuracyORM/mean": 0.20000000670552254, + "rewards/MultiModalAccuracyORM/std": 0.24866367280483245, + "step": 640, + "train_speed(iter/s)": 0.032678 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.7, + "completions/mean_length": 308.32500915527345, + "completions/min_length": 162.9, + "epoch": 0.2606060606060606, + "grad_norm": 2.661462961010607, + "kl": 0.0068939208984375, + "learning_rate": 2e-07, + "loss": 0.006179103255271911, + "memory(GiB)": 113.5, + "reward": 0.25833333656191826, + "reward_std": 0.2652174860239029, + "rewards/MultiModalAccuracyORM/mean": 0.25833333656191826, + "rewards/MultiModalAccuracyORM/std": 0.2652174860239029, + "step": 645, + "train_speed(iter/s)": 0.032733 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.6, + "completions/mean_length": 246.7083396911621, + "completions/min_length": 116.2, + "epoch": 0.26262626262626265, + "grad_norm": 2.282962166826479, + "kl": 0.00615692138671875, + "learning_rate": 2e-07, + "loss": -0.022863130271434783, + "memory(GiB)": 113.5, + "reward": 0.22500000149011612, + "reward_std": 0.25664491653442384, + "rewards/MultiModalAccuracyORM/mean": 0.22500000149011612, + "rewards/MultiModalAccuracyORM/std": 0.25664491653442384, + "step": 650, + "train_speed(iter/s)": 0.032795 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.1, + "completions/mean_length": 367.6666763305664, + "completions/min_length": 205.7, + "epoch": 0.26464646464646463, + "grad_norm": 2.380599579002688, + "kl": 0.0057464599609375, + "learning_rate": 2e-07, + "loss": -0.013085761666297912, + "memory(GiB)": 113.5, + "reward": 0.2583333395421505, + "reward_std": 0.2993255376815796, + "rewards/MultiModalAccuracyORM/mean": 0.2583333395421505, + "rewards/MultiModalAccuracyORM/std": 0.2993255376815796, + "step": 655, + "train_speed(iter/s)": 0.032829 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.3, + "completions/mean_length": 398.7916854858398, + "completions/min_length": 212.0, + "epoch": 0.26666666666666666, + "grad_norm": 2.1060964762409085, + "kl": 0.0065460205078125, + "learning_rate": 2e-07, + "loss": 0.001984366774559021, + "memory(GiB)": 113.5, + "reward": 0.2583333417773247, + "reward_std": 0.35184402465820314, + "rewards/MultiModalAccuracyORM/mean": 0.2583333417773247, + "rewards/MultiModalAccuracyORM/std": 0.35184402465820314, + "step": 660, + "train_speed(iter/s)": 0.032832 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.5, + "completions/mean_length": 379.8500099182129, + "completions/min_length": 222.5, + "epoch": 0.2686868686868687, + "grad_norm": 3.0979902221373083, + "kl": 0.00526275634765625, + "learning_rate": 2e-07, + "loss": -0.00811660885810852, + "memory(GiB)": 113.5, + "reward": 0.2583333358168602, + "reward_std": 0.22446234226226808, + "rewards/MultiModalAccuracyORM/mean": 0.2583333358168602, + "rewards/MultiModalAccuracyORM/std": 0.22446234226226808, + "step": 665, + "train_speed(iter/s)": 0.03283 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.4, + "completions/mean_length": 274.05001068115234, + "completions/min_length": 143.8, + "epoch": 0.27070707070707073, + "grad_norm": 2.886266049614615, + "kl": 0.00730133056640625, + "learning_rate": 2e-07, + "loss": 0.008006072044372559, + "memory(GiB)": 113.5, + "reward": 0.2500000037252903, + "reward_std": 0.3111986219882965, + "rewards/MultiModalAccuracyORM/mean": 0.2500000037252903, + "rewards/MultiModalAccuracyORM/std": 0.3111986219882965, + "step": 670, + "train_speed(iter/s)": 0.032864 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.8, + "completions/mean_length": 304.02500915527344, + "completions/min_length": 156.8, + "epoch": 0.2727272727272727, + "grad_norm": 0.3952238447884339, + "kl": 0.0077239990234375, + "learning_rate": 2e-07, + "loss": 0.036022895574569704, + "memory(GiB)": 113.5, + "reward": 0.37500001192092897, + "reward_std": 0.32858100831508635, + "rewards/MultiModalAccuracyORM/mean": 0.37500001192092897, + "rewards/MultiModalAccuracyORM/std": 0.32858100831508635, + "step": 675, + "train_speed(iter/s)": 0.032929 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.8, + "completions/mean_length": 364.55834197998047, + "completions/min_length": 203.6, + "epoch": 0.27474747474747474, + "grad_norm": 0.0686693440428557, + "kl": 0.00585784912109375, + "learning_rate": 2e-07, + "loss": 0.006394723057746887, + "memory(GiB)": 113.5, + "reward": 0.18333334103226662, + "reward_std": 0.24637180864810942, + "rewards/MultiModalAccuracyORM/mean": 0.18333334103226662, + "rewards/MultiModalAccuracyORM/std": 0.24637180864810942, + "step": 680, + "train_speed(iter/s)": 0.032965 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.4, + "completions/mean_length": 235.7500099182129, + "completions/min_length": 118.5, + "epoch": 0.2767676767676768, + "grad_norm": 2.1797104035382873, + "kl": 0.01773681640625, + "learning_rate": 2e-07, + "loss": 0.008138242363929748, + "memory(GiB)": 113.5, + "reward": 0.26666667610406875, + "reward_std": 0.3862804383039474, + "rewards/MultiModalAccuracyORM/mean": 0.26666667610406875, + "rewards/MultiModalAccuracyORM/std": 0.3862804383039474, + "step": 685, + "train_speed(iter/s)": 0.032993 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.3, + "completions/mean_length": 420.05834503173827, + "completions/min_length": 225.0, + "epoch": 0.2787878787878788, + "grad_norm": 1.2422518482071012, + "kl": 0.005517578125, + "learning_rate": 2e-07, + "loss": -0.025521010160446167, + "memory(GiB)": 113.5, + "reward": 0.07500000223517418, + "reward_std": 0.22218745648860933, + "rewards/MultiModalAccuracyORM/mean": 0.07500000223517418, + "rewards/MultiModalAccuracyORM/std": 0.22218745648860933, + "step": 690, + "train_speed(iter/s)": 0.032968 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.4, + "completions/mean_length": 335.8666763305664, + "completions/min_length": 201.2, + "epoch": 0.2808080808080808, + "grad_norm": 0.8721711597058662, + "kl": 0.007273101806640625, + "learning_rate": 2e-07, + "loss": -0.005113717913627624, + "memory(GiB)": 113.5, + "reward": 0.2083333395421505, + "reward_std": 0.32370694279670714, + "rewards/MultiModalAccuracyORM/mean": 0.2083333395421505, + "rewards/MultiModalAccuracyORM/std": 0.32370694279670714, + "step": 695, + "train_speed(iter/s)": 0.032988 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 995.9, + "completions/mean_length": 439.34168548583983, + "completions/min_length": 226.8, + "epoch": 0.2828282828282828, + "grad_norm": 2.6514991151372906, + "kl": 0.00522308349609375, + "learning_rate": 2e-07, + "loss": 0.03241249620914459, + "memory(GiB)": 113.5, + "reward": 0.12500000298023223, + "reward_std": 0.25916995108127594, + "rewards/MultiModalAccuracyORM/mean": 0.12500000298023223, + "rewards/MultiModalAccuracyORM/std": 0.25916995108127594, + "step": 700, + "train_speed(iter/s)": 0.032951 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/mean_length": 208.48334121704102, + "completions/min_length": 125.4, + "epoch": 0.28484848484848485, + "grad_norm": 2.9111051216276933, + "kl": 0.00951080322265625, + "learning_rate": 2e-07, + "loss": 0.011016063392162323, + "memory(GiB)": 113.5, + "reward": 0.49166668131947516, + "reward_std": 0.3610968828201294, + "rewards/MultiModalAccuracyORM/mean": 0.49166668131947516, + "rewards/MultiModalAccuracyORM/std": 0.3610968828201294, + "step": 705, + "train_speed(iter/s)": 0.033011 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 684.6, + "completions/mean_length": 394.20833587646484, + "completions/min_length": 217.3, + "epoch": 0.2868686868686869, + "grad_norm": 0.9791505372375504, + "kl": 0.0062164306640625, + "learning_rate": 2e-07, + "loss": 0.009031829237937928, + "memory(GiB)": 113.5, + "reward": 0.4000000037252903, + "reward_std": 0.2825257331132889, + "rewards/MultiModalAccuracyORM/mean": 0.4000000037252903, + "rewards/MultiModalAccuracyORM/std": 0.2825257331132889, + "step": 710, + "train_speed(iter/s)": 0.032996 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.9, + "completions/mean_length": 316.80834197998047, + "completions/min_length": 187.2, + "epoch": 0.28888888888888886, + "grad_norm": 2.748998227170115, + "kl": 0.0071197509765625, + "learning_rate": 2e-07, + "loss": -0.04136030673980713, + "memory(GiB)": 113.5, + "reward": 0.17500000745058059, + "reward_std": 0.2551840543746948, + "rewards/MultiModalAccuracyORM/mean": 0.17500000745058059, + "rewards/MultiModalAccuracyORM/std": 0.2551840543746948, + "step": 715, + "train_speed(iter/s)": 0.033011 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/mean_length": 334.85834884643555, + "completions/min_length": 158.8, + "epoch": 0.2909090909090909, + "grad_norm": 2.5712788721497355, + "kl": 0.005682373046875, + "learning_rate": 2e-07, + "loss": 0.014300698041915893, + "memory(GiB)": 113.5, + "reward": 0.4000000089406967, + "reward_std": 0.26816858947277067, + "rewards/MultiModalAccuracyORM/mean": 0.4000000089406967, + "rewards/MultiModalAccuracyORM/std": 0.26816858947277067, + "step": 720, + "train_speed(iter/s)": 0.033012 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 613.2, + "completions/mean_length": 331.2666763305664, + "completions/min_length": 199.3, + "epoch": 0.29292929292929293, + "grad_norm": 1.8581663152703145, + "kl": 0.00664215087890625, + "learning_rate": 2e-07, + "loss": -0.010144461691379548, + "memory(GiB)": 113.5, + "reward": 0.21666667386889457, + "reward_std": 0.31676994562149047, + "rewards/MultiModalAccuracyORM/mean": 0.21666667386889457, + "rewards/MultiModalAccuracyORM/std": 0.31676994562149047, + "step": 725, + "train_speed(iter/s)": 0.033019 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.9, + "completions/mean_length": 326.3500129699707, + "completions/min_length": 181.5, + "epoch": 0.29494949494949496, + "grad_norm": 0.775697379774875, + "kl": 0.00522918701171875, + "learning_rate": 2e-07, + "loss": 0.0003711044788360596, + "memory(GiB)": 113.5, + "reward": 0.2333333373069763, + "reward_std": 0.3189997851848602, + "rewards/MultiModalAccuracyORM/mean": 0.2333333373069763, + "rewards/MultiModalAccuracyORM/std": 0.3189997851848602, + "step": 730, + "train_speed(iter/s)": 0.033016 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.2, + "completions/mean_length": 330.55001068115234, + "completions/min_length": 198.5, + "epoch": 0.296969696969697, + "grad_norm": 1.5273483945564796, + "kl": 0.006597900390625, + "learning_rate": 2e-07, + "loss": 0.012019181251525879, + "memory(GiB)": 113.5, + "reward": 0.34166667312383653, + "reward_std": 0.4374805331230164, + "rewards/MultiModalAccuracyORM/mean": 0.34166667312383653, + "rewards/MultiModalAccuracyORM/std": 0.4374805331230164, + "step": 735, + "train_speed(iter/s)": 0.033045 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.7, + "completions/mean_length": 381.558349609375, + "completions/min_length": 230.5, + "epoch": 0.298989898989899, + "grad_norm": 2.3594893660788374, + "kl": 0.0050323486328125, + "learning_rate": 2e-07, + "loss": 0.01788020133972168, + "memory(GiB)": 113.5, + "reward": 0.20000000596046447, + "reward_std": 0.3330695390701294, + "rewards/MultiModalAccuracyORM/mean": 0.20000000596046447, + "rewards/MultiModalAccuracyORM/std": 0.3330695390701294, + "step": 740, + "train_speed(iter/s)": 0.033058 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.7, + "completions/mean_length": 324.033349609375, + "completions/min_length": 184.4, + "epoch": 0.301010101010101, + "grad_norm": 2.29708410353418, + "kl": 0.00646209716796875, + "learning_rate": 2e-07, + "loss": -0.009415292739868164, + "memory(GiB)": 113.5, + "reward": 0.41666667386889455, + "reward_std": 0.2529277890920639, + "rewards/MultiModalAccuracyORM/mean": 0.41666667386889455, + "rewards/MultiModalAccuracyORM/std": 0.2529277890920639, + "step": 745, + "train_speed(iter/s)": 0.033072 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 2.3159606947695557, + "learning_rate": 2e-07, + "loss": 0.006078800559043885, + "memory(GiB)": 113.5, + "step": 750, + "train_speed(iter/s)": 0.033129 + }, + { + "epoch": 0.30303030303030304, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 539.88, + "eval_completions/mean_length": 336.97334396362305, + "eval_completions/min_length": 192.2, + "eval_kl": 0.00380157470703125, + "eval_loss": 0.01653137058019638, + "eval_reward": 0.2800000062584877, + "eval_reward_std": 0.28693030297756195, + "eval_rewards/MultiModalAccuracyORM/mean": 0.2800000062584877, + "eval_rewards/MultiModalAccuracyORM/std": 0.28693030297756195, + "eval_runtime": 588.5073, + "eval_samples_per_second": 0.085, + "eval_steps_per_second": 0.008, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.004166666666666667, + "completions/max_length": 541.6, + "completions/mean_length": 313.96250953674314, + "completions/min_length": 176.25, + "epoch": 0.30505050505050507, + "grad_norm": 1.6419689379277844, + "kl": 0.008066558837890625, + "learning_rate": 2e-07, + "loss": -0.004991033673286438, + "memory(GiB)": 113.5, + "reward": 0.31250000894069674, + "reward_std": 0.35801745802164076, + "rewards/MultiModalAccuracyORM/mean": 0.31250000894069674, + "rewards/MultiModalAccuracyORM/std": 0.35801745802164076, + "step": 755, + "train_speed(iter/s)": 0.031887 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.9, + "completions/mean_length": 292.9916732788086, + "completions/min_length": 150.9, + "epoch": 0.30707070707070705, + "grad_norm": 2.0844707046825723, + "kl": 0.00627288818359375, + "learning_rate": 2e-07, + "loss": 0.0167288139462471, + "memory(GiB)": 113.5, + "reward": 0.21666666865348816, + "reward_std": 0.3554166704416275, + "rewards/MultiModalAccuracyORM/mean": 0.21666666865348816, + "rewards/MultiModalAccuracyORM/std": 0.3554166704416275, + "step": 760, + "train_speed(iter/s)": 0.031901 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.4, + "completions/mean_length": 296.4166778564453, + "completions/min_length": 161.6, + "epoch": 0.3090909090909091, + "grad_norm": 1.7792521459456232, + "kl": 0.01121368408203125, + "learning_rate": 2e-07, + "loss": 0.017529194056987763, + "memory(GiB)": 113.5, + "reward": 0.4000000134110451, + "reward_std": 0.3734437495470047, + "rewards/MultiModalAccuracyORM/mean": 0.4000000134110451, + "rewards/MultiModalAccuracyORM/std": 0.3734437495470047, + "step": 765, + "train_speed(iter/s)": 0.031933 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.8, + "completions/mean_length": 228.51667175292968, + "completions/min_length": 120.5, + "epoch": 0.3111111111111111, + "grad_norm": 2.1702261558412697, + "kl": 0.0090240478515625, + "learning_rate": 2e-07, + "loss": -0.05565891861915588, + "memory(GiB)": 113.5, + "reward": 0.15000000596046448, + "reward_std": 0.24261613488197326, + "rewards/MultiModalAccuracyORM/mean": 0.15000000596046448, + "rewards/MultiModalAccuracyORM/std": 0.24261613488197326, + "step": 770, + "train_speed(iter/s)": 0.031968 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.4, + "completions/mean_length": 306.7916748046875, + "completions/min_length": 165.3, + "epoch": 0.31313131313131315, + "grad_norm": 1.2794183651984758, + "kl": 0.00740814208984375, + "learning_rate": 2e-07, + "loss": 0.04246864318847656, + "memory(GiB)": 113.5, + "reward": 0.46666667982935905, + "reward_std": 0.4767192959785461, + "rewards/MultiModalAccuracyORM/mean": 0.46666667982935905, + "rewards/MultiModalAccuracyORM/std": 0.4767192959785461, + "step": 775, + "train_speed(iter/s)": 0.032 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.4, + "completions/mean_length": 415.7916763305664, + "completions/min_length": 231.3, + "epoch": 0.3151515151515151, + "grad_norm": 1.1135361589863462, + "kl": 0.00513763427734375, + "learning_rate": 2e-07, + "loss": 0.028287124633789063, + "memory(GiB)": 113.5, + "reward": 0.15000000596046448, + "reward_std": 0.18482151329517366, + "rewards/MultiModalAccuracyORM/mean": 0.15000000596046448, + "rewards/MultiModalAccuracyORM/std": 0.18482151329517366, + "step": 780, + "train_speed(iter/s)": 0.031996 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.5, + "completions/mean_length": 379.56668243408205, + "completions/min_length": 220.9, + "epoch": 0.31717171717171716, + "grad_norm": 2.365467793016899, + "kl": 0.0066986083984375, + "learning_rate": 2e-07, + "loss": -0.014639610052108764, + "memory(GiB)": 113.5, + "reward": 0.1833333358168602, + "reward_std": 0.20363159477710724, + "rewards/MultiModalAccuracyORM/mean": 0.1833333358168602, + "rewards/MultiModalAccuracyORM/std": 0.20363159477710724, + "step": 785, + "train_speed(iter/s)": 0.032005 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.3, + "completions/mean_length": 267.75834503173826, + "completions/min_length": 155.6, + "epoch": 0.3191919191919192, + "grad_norm": 1.7124152646997672, + "kl": 0.00589141845703125, + "learning_rate": 2e-07, + "loss": 0.001770263910293579, + "memory(GiB)": 113.5, + "reward": 0.33333333730697634, + "reward_std": 0.29483942985534667, + "rewards/MultiModalAccuracyORM/mean": 0.33333333730697634, + "rewards/MultiModalAccuracyORM/std": 0.29483942985534667, + "step": 790, + "train_speed(iter/s)": 0.032028 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.7, + "completions/mean_length": 331.6916763305664, + "completions/min_length": 190.2, + "epoch": 0.3212121212121212, + "grad_norm": 2.1658449229692316, + "kl": 0.0065338134765625, + "learning_rate": 2e-07, + "loss": 0.018888431787490844, + "memory(GiB)": 113.5, + "reward": 0.22500001192092894, + "reward_std": 0.3477985322475433, + "rewards/MultiModalAccuracyORM/mean": 0.22500001192092894, + "rewards/MultiModalAccuracyORM/std": 0.3477985322475433, + "step": 795, + "train_speed(iter/s)": 0.032028 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.9, + "completions/mean_length": 319.0250076293945, + "completions/min_length": 159.2, + "epoch": 0.32323232323232326, + "grad_norm": 0.10193444456144864, + "kl": 0.0068878173828125, + "learning_rate": 2e-07, + "loss": 0.008858251571655273, + "memory(GiB)": 113.5, + "reward": 0.1916666693985462, + "reward_std": 0.2567190587520599, + "rewards/MultiModalAccuracyORM/mean": 0.1916666693985462, + "rewards/MultiModalAccuracyORM/std": 0.2567190587520599, + "step": 800, + "train_speed(iter/s)": 0.032045 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.7, + "completions/mean_length": 316.66667556762695, + "completions/min_length": 179.2, + "epoch": 0.32525252525252524, + "grad_norm": 2.315498401390807, + "kl": 0.00754852294921875, + "learning_rate": 2e-07, + "loss": -0.002603813260793686, + "memory(GiB)": 113.5, + "reward": 0.33333333656191827, + "reward_std": 0.2722736746072769, + "rewards/MultiModalAccuracyORM/mean": 0.33333333656191827, + "rewards/MultiModalAccuracyORM/std": 0.2722736746072769, + "step": 805, + "train_speed(iter/s)": 0.03204 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 675.0, + "completions/mean_length": 342.4083450317383, + "completions/min_length": 170.8, + "epoch": 0.32727272727272727, + "grad_norm": 3.1462959818853165, + "kl": 0.006378173828125, + "learning_rate": 2e-07, + "loss": -0.010855591297149659, + "memory(GiB)": 113.5, + "reward": 0.1916666693985462, + "reward_std": 0.3259988039731979, + "rewards/MultiModalAccuracyORM/mean": 0.1916666693985462, + "rewards/MultiModalAccuracyORM/std": 0.3259988039731979, + "step": 810, + "train_speed(iter/s)": 0.032048 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.7, + "completions/mean_length": 386.7166778564453, + "completions/min_length": 189.5, + "epoch": 0.3292929292929293, + "grad_norm": 2.441646562638453, + "kl": 0.008112335205078125, + "learning_rate": 2e-07, + "loss": 0.022695478796958924, + "memory(GiB)": 113.5, + "reward": 0.30833333507180216, + "reward_std": 0.29793586432933805, + "rewards/MultiModalAccuracyORM/mean": 0.30833333507180216, + "rewards/MultiModalAccuracyORM/std": 0.29793586432933805, + "step": 815, + "train_speed(iter/s)": 0.032056 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.7, + "completions/mean_length": 339.0166732788086, + "completions/min_length": 194.4, + "epoch": 0.33131313131313134, + "grad_norm": 1.020422275315147, + "kl": 0.0112762451171875, + "learning_rate": 2e-07, + "loss": 0.05103216171264648, + "memory(GiB)": 113.5, + "reward": 0.22500000670552253, + "reward_std": 0.2956440031528473, + "rewards/MultiModalAccuracyORM/mean": 0.22500000670552253, + "rewards/MultiModalAccuracyORM/std": 0.2956440031528473, + "step": 820, + "train_speed(iter/s)": 0.032046 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.8, + "completions/mean_length": 313.6000061035156, + "completions/min_length": 160.3, + "epoch": 0.3333333333333333, + "grad_norm": 3.530685574433075, + "kl": 0.00774993896484375, + "learning_rate": 2e-07, + "loss": 0.0580863893032074, + "memory(GiB)": 113.5, + "reward": 0.21666667535901069, + "reward_std": 0.31899061501026155, + "rewards/MultiModalAccuracyORM/mean": 0.21666667535901069, + "rewards/MultiModalAccuracyORM/std": 0.31899061501026155, + "step": 825, + "train_speed(iter/s)": 0.03206 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.5, + "completions/mean_length": 246.05833740234374, + "completions/min_length": 134.2, + "epoch": 0.33535353535353535, + "grad_norm": 0.05179156879937817, + "kl": 0.007355499267578125, + "learning_rate": 2e-07, + "loss": 0.0357688844203949, + "memory(GiB)": 113.5, + "reward": 0.2750000074505806, + "reward_std": 0.20817729830741882, + "rewards/MultiModalAccuracyORM/mean": 0.2750000074505806, + "rewards/MultiModalAccuracyORM/std": 0.20817729830741882, + "step": 830, + "train_speed(iter/s)": 0.03208 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/mean_length": 445.533349609375, + "completions/min_length": 285.9, + "epoch": 0.3373737373737374, + "grad_norm": 2.4237696716807413, + "kl": 0.005771636962890625, + "learning_rate": 2e-07, + "loss": 0.0007819652557373047, + "memory(GiB)": 113.5, + "reward": 0.31666666865348814, + "reward_std": 0.3596066445112228, + "rewards/MultiModalAccuracyORM/mean": 0.31666666865348814, + "rewards/MultiModalAccuracyORM/std": 0.3596066445112228, + "step": 835, + "train_speed(iter/s)": 0.032078 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.1, + "completions/mean_length": 283.40834350585936, + "completions/min_length": 164.0, + "epoch": 0.3393939393939394, + "grad_norm": 2.6192745381364615, + "kl": 0.00804901123046875, + "learning_rate": 2e-07, + "loss": 0.04405608177185059, + "memory(GiB)": 113.5, + "reward": 0.43333334401249884, + "reward_std": 0.2840515673160553, + "rewards/MultiModalAccuracyORM/mean": 0.43333334401249884, + "rewards/MultiModalAccuracyORM/std": 0.2840515673160553, + "step": 840, + "train_speed(iter/s)": 0.032104 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.4, + "completions/mean_length": 347.0500091552734, + "completions/min_length": 163.6, + "epoch": 0.3414141414141414, + "grad_norm": 2.7151298756229827, + "kl": 0.00639801025390625, + "learning_rate": 2e-07, + "loss": -0.004790738224983215, + "memory(GiB)": 113.5, + "reward": 0.4333333484828472, + "reward_std": 0.39859413504600527, + "rewards/MultiModalAccuracyORM/mean": 0.4333333484828472, + "rewards/MultiModalAccuracyORM/std": 0.39859413504600527, + "step": 845, + "train_speed(iter/s)": 0.032112 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.8, + "completions/mean_length": 273.2666717529297, + "completions/min_length": 157.8, + "epoch": 0.3434343434343434, + "grad_norm": 1.030614252722568, + "kl": 0.0097747802734375, + "learning_rate": 2e-07, + "loss": 0.0008672773838043213, + "memory(GiB)": 113.5, + "reward": 0.14166667237877845, + "reward_std": 0.28624823689460754, + "rewards/MultiModalAccuracyORM/mean": 0.14166667237877845, + "rewards/MultiModalAccuracyORM/std": 0.28624823689460754, + "step": 850, + "train_speed(iter/s)": 0.032141 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.9, + "completions/mean_length": 196.70833892822264, + "completions/min_length": 103.7, + "epoch": 0.34545454545454546, + "grad_norm": 4.600894892489762, + "kl": 0.00904083251953125, + "learning_rate": 2e-07, + "loss": -0.002990037202835083, + "memory(GiB)": 113.5, + "reward": 0.35000001043081286, + "reward_std": 0.2511145621538162, + "rewards/MultiModalAccuracyORM/mean": 0.35000001043081286, + "rewards/MultiModalAccuracyORM/std": 0.2511145621538162, + "step": 855, + "train_speed(iter/s)": 0.0322 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.7, + "completions/mean_length": 280.9416793823242, + "completions/min_length": 175.6, + "epoch": 0.3474747474747475, + "grad_norm": 2.236927092635949, + "kl": 0.0068603515625, + "learning_rate": 2e-07, + "loss": 0.034914878010749814, + "memory(GiB)": 113.5, + "reward": 0.27500000670552255, + "reward_std": 0.28004167079925535, + "rewards/MultiModalAccuracyORM/mean": 0.27500000670552255, + "rewards/MultiModalAccuracyORM/std": 0.28004167079925535, + "step": 860, + "train_speed(iter/s)": 0.032218 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.4, + "completions/mean_length": 340.1083419799805, + "completions/min_length": 183.8, + "epoch": 0.34949494949494947, + "grad_norm": 3.1857406542737943, + "kl": 0.00835723876953125, + "learning_rate": 2e-07, + "loss": 0.019358628988265993, + "memory(GiB)": 113.5, + "reward": 0.21666667088866234, + "reward_std": 0.25585488975048065, + "rewards/MultiModalAccuracyORM/mean": 0.21666667088866234, + "rewards/MultiModalAccuracyORM/std": 0.25585488975048065, + "step": 865, + "train_speed(iter/s)": 0.032254 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.8, + "completions/mean_length": 342.0833381652832, + "completions/min_length": 181.3, + "epoch": 0.3515151515151515, + "grad_norm": 2.5743781015760714, + "kl": 0.00620880126953125, + "learning_rate": 2e-07, + "loss": -0.019692707061767577, + "memory(GiB)": 113.5, + "reward": 0.2083333358168602, + "reward_std": 0.23004821836948394, + "rewards/MultiModalAccuracyORM/mean": 0.2083333358168602, + "rewards/MultiModalAccuracyORM/std": 0.23004821836948394, + "step": 870, + "train_speed(iter/s)": 0.032261 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.4, + "completions/mean_length": 339.12501220703126, + "completions/min_length": 174.0, + "epoch": 0.35353535353535354, + "grad_norm": 3.184656199614579, + "kl": 0.00756072998046875, + "learning_rate": 2e-07, + "loss": 0.016688653826713563, + "memory(GiB)": 113.5, + "reward": 0.39166667610406875, + "reward_std": 0.37845527231693266, + "rewards/MultiModalAccuracyORM/mean": 0.39166667610406875, + "rewards/MultiModalAccuracyORM/std": 0.37845527231693266, + "step": 875, + "train_speed(iter/s)": 0.032289 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.3, + "completions/mean_length": 336.61668090820314, + "completions/min_length": 199.1, + "epoch": 0.35555555555555557, + "grad_norm": 1.8292091239029376, + "kl": 0.006783294677734375, + "learning_rate": 2e-07, + "loss": -0.0035984992980957033, + "memory(GiB)": 113.5, + "reward": 0.25000000521540644, + "reward_std": 0.353110259771347, + "rewards/MultiModalAccuracyORM/mean": 0.25000000521540644, + "rewards/MultiModalAccuracyORM/std": 0.353110259771347, + "step": 880, + "train_speed(iter/s)": 0.032303 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.7, + "completions/mean_length": 367.0833480834961, + "completions/min_length": 201.2, + "epoch": 0.3575757575757576, + "grad_norm": 2.157154554024042, + "kl": 0.0074066162109375, + "learning_rate": 2e-07, + "loss": -0.012543225288391113, + "memory(GiB)": 113.5, + "reward": 0.2666666693985462, + "reward_std": 0.292328941822052, + "rewards/MultiModalAccuracyORM/mean": 0.2666666693985462, + "rewards/MultiModalAccuracyORM/std": 0.292328941822052, + "step": 885, + "train_speed(iter/s)": 0.032311 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.1, + "completions/mean_length": 368.5666778564453, + "completions/min_length": 197.0, + "epoch": 0.3595959595959596, + "grad_norm": 1.8591339481325562, + "kl": 0.01016082763671875, + "learning_rate": 2e-07, + "loss": -0.015211772918701173, + "memory(GiB)": 113.5, + "reward": 0.22500000670552253, + "reward_std": 0.3802089035511017, + "rewards/MultiModalAccuracyORM/mean": 0.22500000670552253, + "rewards/MultiModalAccuracyORM/std": 0.3802089035511017, + "step": 890, + "train_speed(iter/s)": 0.032317 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.9, + "completions/mean_length": 334.3000091552734, + "completions/min_length": 169.7, + "epoch": 0.3616161616161616, + "grad_norm": 1.891661158050905, + "kl": 0.00751495361328125, + "learning_rate": 2e-07, + "loss": 0.057868242263793945, + "memory(GiB)": 113.5, + "reward": 0.2833333373069763, + "reward_std": 0.36168283224105835, + "rewards/MultiModalAccuracyORM/mean": 0.2833333373069763, + "rewards/MultiModalAccuracyORM/std": 0.36168283224105835, + "step": 895, + "train_speed(iter/s)": 0.03232 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.6, + "completions/mean_length": 367.8083435058594, + "completions/min_length": 193.6, + "epoch": 0.36363636363636365, + "grad_norm": 2.944909454157867, + "kl": 0.0079620361328125, + "learning_rate": 2e-07, + "loss": 0.003379705175757408, + "memory(GiB)": 113.5, + "reward": 0.17500000670552254, + "reward_std": 0.22300148010253906, + "rewards/MultiModalAccuracyORM/mean": 0.17500000670552254, + "rewards/MultiModalAccuracyORM/std": 0.22300148010253906, + "step": 900, + "train_speed(iter/s)": 0.032324 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.4, + "completions/mean_length": 414.4750144958496, + "completions/min_length": 241.5, + "epoch": 0.3656565656565657, + "grad_norm": 1.0572142583091821, + "kl": 0.00611724853515625, + "learning_rate": 2e-07, + "loss": 0.02717306911945343, + "memory(GiB)": 113.5, + "reward": 0.3083333417773247, + "reward_std": 0.27447034418582916, + "rewards/MultiModalAccuracyORM/mean": 0.3083333417773247, + "rewards/MultiModalAccuracyORM/std": 0.27447034418582916, + "step": 905, + "train_speed(iter/s)": 0.03233 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.5, + "completions/mean_length": 329.6416717529297, + "completions/min_length": 166.2, + "epoch": 0.36767676767676766, + "grad_norm": 1.806687314036588, + "kl": 0.0089080810546875, + "learning_rate": 2e-07, + "loss": 0.010141277313232422, + "memory(GiB)": 113.5, + "reward": 0.391666679084301, + "reward_std": 0.40894138514995576, + "rewards/MultiModalAccuracyORM/mean": 0.391666679084301, + "rewards/MultiModalAccuracyORM/std": 0.40894138514995576, + "step": 910, + "train_speed(iter/s)": 0.032344 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.1, + "completions/mean_length": 302.3416748046875, + "completions/min_length": 159.3, + "epoch": 0.3696969696969697, + "grad_norm": 2.825820952489286, + "kl": 0.0180572509765625, + "learning_rate": 2e-07, + "loss": 0.011392435431480408, + "memory(GiB)": 113.5, + "reward": 0.1916666731238365, + "reward_std": 0.33297434747219085, + "rewards/MultiModalAccuracyORM/mean": 0.1916666731238365, + "rewards/MultiModalAccuracyORM/std": 0.33297434747219085, + "step": 915, + "train_speed(iter/s)": 0.032356 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.4, + "completions/mean_length": 327.7416717529297, + "completions/min_length": 178.4, + "epoch": 0.3717171717171717, + "grad_norm": 2.438516683028765, + "kl": 0.00719757080078125, + "learning_rate": 2e-07, + "loss": 0.037606388330459595, + "memory(GiB)": 113.5, + "reward": 0.23333333879709245, + "reward_std": 0.3543280869722366, + "rewards/MultiModalAccuracyORM/mean": 0.23333333879709245, + "rewards/MultiModalAccuracyORM/std": 0.3543280869722366, + "step": 920, + "train_speed(iter/s)": 0.032355 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.9, + "completions/mean_length": 351.9500076293945, + "completions/min_length": 190.7, + "epoch": 0.37373737373737376, + "grad_norm": 2.1782598398370943, + "kl": 0.006235504150390625, + "learning_rate": 2e-07, + "loss": -0.007940790057182312, + "memory(GiB)": 113.5, + "reward": 0.2250000096857548, + "reward_std": 0.3659113526344299, + "rewards/MultiModalAccuracyORM/mean": 0.2250000096857548, + "rewards/MultiModalAccuracyORM/std": 0.3659113526344299, + "step": 925, + "train_speed(iter/s)": 0.032383 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.8, + "completions/mean_length": 379.47501373291016, + "completions/min_length": 206.3, + "epoch": 0.37575757575757573, + "grad_norm": 2.304852196233359, + "kl": 0.0072235107421875, + "learning_rate": 2e-07, + "loss": 0.03286640048027038, + "memory(GiB)": 113.5, + "reward": 0.34166667312383653, + "reward_std": 0.44222086369991304, + "rewards/MultiModalAccuracyORM/mean": 0.34166667312383653, + "rewards/MultiModalAccuracyORM/std": 0.44222086369991304, + "step": 930, + "train_speed(iter/s)": 0.032396 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.6, + "completions/mean_length": 323.35000762939455, + "completions/min_length": 192.7, + "epoch": 0.37777777777777777, + "grad_norm": 2.9791049041845494, + "kl": 0.01037445068359375, + "learning_rate": 2e-07, + "loss": -0.007777485251426697, + "memory(GiB)": 113.5, + "reward": 0.25000000447034837, + "reward_std": 0.35737437903881075, + "rewards/MultiModalAccuracyORM/mean": 0.25000000447034837, + "rewards/MultiModalAccuracyORM/std": 0.35737437903881075, + "step": 935, + "train_speed(iter/s)": 0.032395 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.9, + "completions/mean_length": 280.97500762939455, + "completions/min_length": 138.2, + "epoch": 0.3797979797979798, + "grad_norm": 1.881006300919645, + "kl": 0.013104248046875, + "learning_rate": 2e-07, + "loss": 0.02690579891204834, + "memory(GiB)": 113.5, + "reward": 0.29166667759418485, + "reward_std": 0.337774270772934, + "rewards/MultiModalAccuracyORM/mean": 0.29166667759418485, + "rewards/MultiModalAccuracyORM/std": 0.337774270772934, + "step": 940, + "train_speed(iter/s)": 0.032434 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.1, + "completions/mean_length": 295.9500076293945, + "completions/min_length": 174.9, + "epoch": 0.38181818181818183, + "grad_norm": 3.313912407777126, + "kl": 0.00870513916015625, + "learning_rate": 2e-07, + "loss": -0.032750940322875975, + "memory(GiB)": 113.5, + "reward": 0.40000000819563863, + "reward_std": 0.45158345997333527, + "rewards/MultiModalAccuracyORM/mean": 0.40000000819563863, + "rewards/MultiModalAccuracyORM/std": 0.45158345997333527, + "step": 945, + "train_speed(iter/s)": 0.032469 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.3, + "completions/mean_length": 262.3000038146973, + "completions/min_length": 133.7, + "epoch": 0.3838383838383838, + "grad_norm": 2.7566340852478053, + "kl": 0.00853729248046875, + "learning_rate": 2e-07, + "loss": 0.018448495864868165, + "memory(GiB)": 113.5, + "reward": 0.4083333440124989, + "reward_std": 0.2674977511167526, + "rewards/MultiModalAccuracyORM/mean": 0.4083333440124989, + "rewards/MultiModalAccuracyORM/std": 0.2674977511167526, + "step": 950, + "train_speed(iter/s)": 0.032489 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/mean_length": 363.36668090820314, + "completions/min_length": 190.9, + "epoch": 0.38585858585858585, + "grad_norm": 1.6957648809966595, + "kl": 0.0067291259765625, + "learning_rate": 2e-07, + "loss": -0.02898831069469452, + "memory(GiB)": 113.5, + "reward": 0.2000000074505806, + "reward_std": 0.32902404963970183, + "rewards/MultiModalAccuracyORM/mean": 0.2000000074505806, + "rewards/MultiModalAccuracyORM/std": 0.32902404963970183, + "step": 955, + "train_speed(iter/s)": 0.032506 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.1, + "completions/mean_length": 307.16667022705076, + "completions/min_length": 173.2, + "epoch": 0.3878787878787879, + "grad_norm": 2.6324617057971755, + "kl": 0.0082183837890625, + "learning_rate": 2e-07, + "loss": 0.010876613110303879, + "memory(GiB)": 113.5, + "reward": 0.2916666708886623, + "reward_std": 0.3953502655029297, + "rewards/MultiModalAccuracyORM/mean": 0.2916666708886623, + "rewards/MultiModalAccuracyORM/std": 0.3953502655029297, + "step": 960, + "train_speed(iter/s)": 0.032526 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.3, + "completions/mean_length": 310.78333892822263, + "completions/min_length": 176.0, + "epoch": 0.3898989898989899, + "grad_norm": 0.25204548209314886, + "kl": 0.01051025390625, + "learning_rate": 2e-07, + "loss": 0.05701416730880737, + "memory(GiB)": 113.5, + "reward": 0.2500000029802322, + "reward_std": 0.2885732680559158, + "rewards/MultiModalAccuracyORM/mean": 0.2500000029802322, + "rewards/MultiModalAccuracyORM/std": 0.2885732680559158, + "step": 965, + "train_speed(iter/s)": 0.032526 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.1, + "completions/mean_length": 354.4916687011719, + "completions/min_length": 207.3, + "epoch": 0.39191919191919194, + "grad_norm": 1.8105174117337208, + "kl": 0.00800018310546875, + "learning_rate": 2e-07, + "loss": 0.008932539820671081, + "memory(GiB)": 113.5, + "reward": 0.18333333730697632, + "reward_std": 0.3538196414709091, + "rewards/MultiModalAccuracyORM/mean": 0.18333333730697632, + "rewards/MultiModalAccuracyORM/std": 0.3538196414709091, + "step": 970, + "train_speed(iter/s)": 0.032536 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.5, + "completions/mean_length": 375.68334503173827, + "completions/min_length": 236.0, + "epoch": 0.3939393939393939, + "grad_norm": 1.4251930411180411, + "kl": 0.00720672607421875, + "learning_rate": 2e-07, + "loss": -0.04558621346950531, + "memory(GiB)": 113.5, + "reward": 0.10833333507180214, + "reward_std": 0.2549058347940445, + "rewards/MultiModalAccuracyORM/mean": 0.10833333507180214, + "rewards/MultiModalAccuracyORM/std": 0.2549058347940445, + "step": 975, + "train_speed(iter/s)": 0.032554 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.2, + "completions/mean_length": 440.5833435058594, + "completions/min_length": 218.4, + "epoch": 0.39595959595959596, + "grad_norm": 1.8329415728640532, + "kl": 0.0074310302734375, + "learning_rate": 2e-07, + "loss": -0.004531031847000122, + "memory(GiB)": 113.5, + "reward": 0.2666666738688946, + "reward_std": 0.351182359457016, + "rewards/MultiModalAccuracyORM/mean": 0.2666666738688946, + "rewards/MultiModalAccuracyORM/std": 0.351182359457016, + "step": 980, + "train_speed(iter/s)": 0.032558 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.7, + "completions/mean_length": 267.5916748046875, + "completions/min_length": 165.0, + "epoch": 0.397979797979798, + "grad_norm": 2.742069878873229, + "kl": 0.05179443359375, + "learning_rate": 2e-07, + "loss": 0.019256360828876495, + "memory(GiB)": 113.5, + "reward": 0.33333334028720857, + "reward_std": 0.3274982154369354, + "rewards/MultiModalAccuracyORM/mean": 0.33333334028720857, + "rewards/MultiModalAccuracyORM/std": 0.3274982154369354, + "step": 985, + "train_speed(iter/s)": 0.032587 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.6, + "completions/mean_length": 360.90001068115237, + "completions/min_length": 205.4, + "epoch": 0.4, + "grad_norm": 3.049274715544681, + "kl": 0.00958404541015625, + "learning_rate": 2e-07, + "loss": -0.033705079555511476, + "memory(GiB)": 113.5, + "reward": 0.31666667610406873, + "reward_std": 0.27122942507267, + "rewards/MultiModalAccuracyORM/mean": 0.31666667610406873, + "rewards/MultiModalAccuracyORM/std": 0.27122942507267, + "step": 990, + "train_speed(iter/s)": 0.032615 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.2, + "completions/mean_length": 330.02501220703124, + "completions/min_length": 198.8, + "epoch": 0.402020202020202, + "grad_norm": 2.6515591125640574, + "kl": 0.0110382080078125, + "learning_rate": 2e-07, + "loss": 0.008444187045097352, + "memory(GiB)": 113.5, + "reward": 0.41666667982935907, + "reward_std": 0.4297270834445953, + "rewards/MultiModalAccuracyORM/mean": 0.41666667982935907, + "rewards/MultiModalAccuracyORM/std": 0.4297270834445953, + "step": 995, + "train_speed(iter/s)": 0.032638 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.6423776292289114, + "learning_rate": 2e-07, + "loss": -0.0013245075941085815, + "memory(GiB)": 113.5, + "step": 1000, + "train_speed(iter/s)": 0.032641 + }, + { + "epoch": 0.40404040404040403, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 565.38, + "eval_completions/mean_length": 346.96667633056643, + "eval_completions/min_length": 203.6, + "eval_kl": 0.00558807373046875, + "eval_loss": 0.016358518972992897, + "eval_reward": 0.3083333417773247, + "eval_reward_std": 0.3403226917982101, + "eval_rewards/MultiModalAccuracyORM/mean": 0.3083333417773247, + "eval_rewards/MultiModalAccuracyORM/std": 0.3403226917982101, + "eval_runtime": 586.662, + "eval_samples_per_second": 0.085, + "eval_steps_per_second": 0.009, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.004166666666666667, + "completions/max_length": 608.15, + "completions/mean_length": 370.19167709350586, + "completions/min_length": 202.85, + "epoch": 0.40606060606060607, + "grad_norm": 2.014189773891532, + "kl": 0.009693145751953125, + "learning_rate": 2e-07, + "loss": 0.026693809032440185, + "memory(GiB)": 113.5, + "reward": 0.22500000484287738, + "reward_std": 0.2774069786071777, + "rewards/MultiModalAccuracyORM/mean": 0.22500000484287738, + "rewards/MultiModalAccuracyORM/std": 0.2774069786071777, + "step": 1005, + "train_speed(iter/s)": 0.031849 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.9, + "completions/mean_length": 427.22500762939455, + "completions/min_length": 238.0, + "epoch": 0.4080808080808081, + "grad_norm": 2.2096007474060633, + "kl": 0.00854034423828125, + "learning_rate": 2e-07, + "loss": -0.01839480996131897, + "memory(GiB)": 113.5, + "reward": 0.19166667014360428, + "reward_std": 0.23004821836948394, + "rewards/MultiModalAccuracyORM/mean": 0.19166667014360428, + "rewards/MultiModalAccuracyORM/std": 0.23004821836948394, + "step": 1010, + "train_speed(iter/s)": 0.031846 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.1, + "completions/mean_length": 383.8250106811523, + "completions/min_length": 206.1, + "epoch": 0.4101010101010101, + "grad_norm": 2.360953993727072, + "kl": 0.00855560302734375, + "learning_rate": 2e-07, + "loss": -0.03324509263038635, + "memory(GiB)": 113.5, + "reward": 0.46666667610406876, + "reward_std": 0.36664178371429446, + "rewards/MultiModalAccuracyORM/mean": 0.46666667610406876, + "rewards/MultiModalAccuracyORM/std": 0.36664178371429446, + "step": 1015, + "train_speed(iter/s)": 0.031859 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.4, + "completions/mean_length": 396.28334350585936, + "completions/min_length": 187.6, + "epoch": 0.4121212121212121, + "grad_norm": 1.1532109667394932, + "kl": 0.00652008056640625, + "learning_rate": 2e-07, + "loss": 0.012686711549758912, + "memory(GiB)": 113.5, + "reward": 0.23333333805203438, + "reward_std": 0.3129522502422333, + "rewards/MultiModalAccuracyORM/mean": 0.23333333805203438, + "rewards/MultiModalAccuracyORM/std": 0.3129522502422333, + "step": 1020, + "train_speed(iter/s)": 0.031867 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.3, + "completions/mean_length": 314.48334045410155, + "completions/min_length": 181.4, + "epoch": 0.41414141414141414, + "grad_norm": 2.3285330234433017, + "kl": 0.01016845703125, + "learning_rate": 2e-07, + "loss": -0.00456441193819046, + "memory(GiB)": 113.5, + "reward": 0.35000001043081286, + "reward_std": 0.36670138239860534, + "rewards/MultiModalAccuracyORM/mean": 0.35000001043081286, + "rewards/MultiModalAccuracyORM/std": 0.36670138239860534, + "step": 1025, + "train_speed(iter/s)": 0.03188 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.1, + "completions/mean_length": 417.8000122070313, + "completions/min_length": 228.2, + "epoch": 0.4161616161616162, + "grad_norm": 4.123945619995185, + "kl": 0.0071319580078125, + "learning_rate": 2e-07, + "loss": -0.015000586211681367, + "memory(GiB)": 113.5, + "reward": 0.30833334252238276, + "reward_std": 0.4016164273023605, + "rewards/MultiModalAccuracyORM/mean": 0.30833334252238276, + "rewards/MultiModalAccuracyORM/std": 0.4016164273023605, + "step": 1030, + "train_speed(iter/s)": 0.031877 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.2, + "completions/mean_length": 334.6833435058594, + "completions/min_length": 201.1, + "epoch": 0.41818181818181815, + "grad_norm": 1.0210308419459193, + "kl": 0.00837860107421875, + "learning_rate": 2e-07, + "loss": -0.008147723227739333, + "memory(GiB)": 113.5, + "reward": 0.14166667312383652, + "reward_std": 0.14815283417701722, + "rewards/MultiModalAccuracyORM/mean": 0.14166667312383652, + "rewards/MultiModalAccuracyORM/std": 0.14815283417701722, + "step": 1035, + "train_speed(iter/s)": 0.03191 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.1, + "completions/mean_length": 269.1333374023437, + "completions/min_length": 140.9, + "epoch": 0.4202020202020202, + "grad_norm": 2.4151827408725546, + "kl": 0.01279144287109375, + "learning_rate": 2e-07, + "loss": -0.0017376184463500977, + "memory(GiB)": 113.5, + "reward": 0.5000000074505806, + "reward_std": 0.2591939508914948, + "rewards/MultiModalAccuracyORM/mean": 0.5000000074505806, + "rewards/MultiModalAccuracyORM/std": 0.2591939508914948, + "step": 1040, + "train_speed(iter/s)": 0.031912 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/mean_length": 420.541682434082, + "completions/min_length": 252.2, + "epoch": 0.4222222222222222, + "grad_norm": 1.5651289466382694, + "kl": 0.0089202880859375, + "learning_rate": 2e-07, + "loss": 0.007678426802158356, + "memory(GiB)": 113.5, + "reward": 0.07500000074505805, + "reward_std": 0.17705594301223754, + "rewards/MultiModalAccuracyORM/mean": 0.07500000074505805, + "rewards/MultiModalAccuracyORM/std": 0.17705594301223754, + "step": 1045, + "train_speed(iter/s)": 0.031887 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.3, + "completions/mean_length": 382.8750061035156, + "completions/min_length": 219.2, + "epoch": 0.42424242424242425, + "grad_norm": 1.6669744297788438, + "kl": 0.010504150390625, + "learning_rate": 2e-07, + "loss": 0.04403962194919586, + "memory(GiB)": 113.5, + "reward": 0.1916666716337204, + "reward_std": 0.2908295333385468, + "rewards/MultiModalAccuracyORM/mean": 0.1916666716337204, + "rewards/MultiModalAccuracyORM/std": 0.2908295333385468, + "step": 1050, + "train_speed(iter/s)": 0.031882 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.7, + "completions/mean_length": 340.3916778564453, + "completions/min_length": 203.1, + "epoch": 0.4262626262626263, + "grad_norm": 0.08753771583148155, + "kl": 0.006915283203125, + "learning_rate": 2e-07, + "loss": -0.00030135512351989744, + "memory(GiB)": 113.5, + "reward": 0.3250000074505806, + "reward_std": 0.31046818792819975, + "rewards/MultiModalAccuracyORM/mean": 0.3250000074505806, + "rewards/MultiModalAccuracyORM/std": 0.31046818792819975, + "step": 1055, + "train_speed(iter/s)": 0.031895 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.3, + "completions/mean_length": 407.2916793823242, + "completions/min_length": 253.6, + "epoch": 0.42828282828282827, + "grad_norm": 2.1853625197058877, + "kl": 0.01122894287109375, + "learning_rate": 2e-07, + "loss": -0.009478866308927535, + "memory(GiB)": 113.5, + "reward": 0.2666666679084301, + "reward_std": 0.2940108567476273, + "rewards/MultiModalAccuracyORM/mean": 0.2666666679084301, + "rewards/MultiModalAccuracyORM/std": 0.2940108567476273, + "step": 1060, + "train_speed(iter/s)": 0.03188 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.8, + "completions/mean_length": 379.0666793823242, + "completions/min_length": 175.8, + "epoch": 0.4303030303030303, + "grad_norm": 1.8429441156917366, + "kl": 0.0084381103515625, + "learning_rate": 2e-07, + "loss": 0.008666989207267762, + "memory(GiB)": 113.5, + "reward": 0.3666666731238365, + "reward_std": 0.40242100059986113, + "rewards/MultiModalAccuracyORM/mean": 0.3666666731238365, + "rewards/MultiModalAccuracyORM/std": 0.40242100059986113, + "step": 1065, + "train_speed(iter/s)": 0.031897 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.8, + "completions/mean_length": 411.7250183105469, + "completions/min_length": 222.1, + "epoch": 0.43232323232323233, + "grad_norm": 1.8025359450969856, + "kl": 0.0067352294921875, + "learning_rate": 2e-07, + "loss": -0.020195412635803222, + "memory(GiB)": 113.5, + "reward": 0.1166666716337204, + "reward_std": 0.1745694547891617, + "rewards/MultiModalAccuracyORM/mean": 0.1166666716337204, + "rewards/MultiModalAccuracyORM/std": 0.1745694547891617, + "step": 1070, + "train_speed(iter/s)": 0.031919 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/mean_length": 334.0916748046875, + "completions/min_length": 184.4, + "epoch": 0.43434343434343436, + "grad_norm": 1.6111066415316333, + "kl": 0.00709228515625, + "learning_rate": 2e-07, + "loss": -0.004982185363769531, + "memory(GiB)": 113.5, + "reward": 0.3583333410322666, + "reward_std": 0.27148364782333373, + "rewards/MultiModalAccuracyORM/mean": 0.3583333410322666, + "rewards/MultiModalAccuracyORM/std": 0.27148364782333373, + "step": 1075, + "train_speed(iter/s)": 0.031909 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.8, + "completions/mean_length": 305.34167404174804, + "completions/min_length": 176.1, + "epoch": 0.43636363636363634, + "grad_norm": 2.273654506806337, + "kl": 0.00942840576171875, + "learning_rate": 2e-07, + "loss": -0.0076661787927150725, + "memory(GiB)": 113.5, + "reward": 0.2583333417773247, + "reward_std": 0.2122136175632477, + "rewards/MultiModalAccuracyORM/mean": 0.2583333417773247, + "rewards/MultiModalAccuracyORM/std": 0.2122136175632477, + "step": 1080, + "train_speed(iter/s)": 0.031918 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.9, + "completions/mean_length": 298.60000839233396, + "completions/min_length": 156.5, + "epoch": 0.4383838383838384, + "grad_norm": 2.1394215246213495, + "kl": 0.0081207275390625, + "learning_rate": 2e-07, + "loss": 0.01651126444339752, + "memory(GiB)": 113.5, + "reward": 0.3416666768491268, + "reward_std": 0.4186849981546402, + "rewards/MultiModalAccuracyORM/mean": 0.3416666768491268, + "rewards/MultiModalAccuracyORM/std": 0.4186849981546402, + "step": 1085, + "train_speed(iter/s)": 0.031921 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.4, + "completions/mean_length": 338.9500137329102, + "completions/min_length": 196.0, + "epoch": 0.4404040404040404, + "grad_norm": 2.545454860927846, + "kl": 0.00835113525390625, + "learning_rate": 2e-07, + "loss": 0.04256980717182159, + "memory(GiB)": 113.5, + "reward": 0.3250000014901161, + "reward_std": 0.2712650209665298, + "rewards/MultiModalAccuracyORM/mean": 0.3250000014901161, + "rewards/MultiModalAccuracyORM/std": 0.2712650209665298, + "step": 1090, + "train_speed(iter/s)": 0.031917 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 737.9, + "completions/mean_length": 350.0750045776367, + "completions/min_length": 177.8, + "epoch": 0.44242424242424244, + "grad_norm": 1.1515652768332443, + "kl": 0.0083038330078125, + "learning_rate": 2e-07, + "loss": 0.05727236866950989, + "memory(GiB)": 113.5, + "reward": 0.1333333395421505, + "reward_std": 0.22625694572925567, + "rewards/MultiModalAccuracyORM/mean": 0.1333333395421505, + "rewards/MultiModalAccuracyORM/std": 0.22625694572925567, + "step": 1095, + "train_speed(iter/s)": 0.031885 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.9, + "completions/mean_length": 404.4166793823242, + "completions/min_length": 222.3, + "epoch": 0.4444444444444444, + "grad_norm": 2.0946897692044906, + "kl": 0.0076324462890625, + "learning_rate": 2e-07, + "loss": 0.03506229817867279, + "memory(GiB)": 113.5, + "reward": 0.45833334177732465, + "reward_std": 0.41185393929481506, + "rewards/MultiModalAccuracyORM/mean": 0.45833334177732465, + "rewards/MultiModalAccuracyORM/std": 0.41185393929481506, + "step": 1100, + "train_speed(iter/s)": 0.031869 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.2, + "completions/mean_length": 237.7166732788086, + "completions/min_length": 130.5, + "epoch": 0.44646464646464645, + "grad_norm": 2.7145244594416837, + "kl": 0.0112579345703125, + "learning_rate": 2e-07, + "loss": -0.004306972026824951, + "memory(GiB)": 113.5, + "reward": 0.27500000447034834, + "reward_std": 0.28853767216205595, + "rewards/MultiModalAccuracyORM/mean": 0.27500000447034834, + "rewards/MultiModalAccuracyORM/std": 0.28853767216205595, + "step": 1105, + "train_speed(iter/s)": 0.031861 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.2, + "completions/mean_length": 339.60834350585935, + "completions/min_length": 189.7, + "epoch": 0.4484848484848485, + "grad_norm": 1.990851354822169, + "kl": 0.04109954833984375, + "learning_rate": 2e-07, + "loss": 0.01136043295264244, + "memory(GiB)": 113.5, + "reward": 0.4666666768491268, + "reward_std": 0.29859510362148284, + "rewards/MultiModalAccuracyORM/mean": 0.4666666768491268, + "rewards/MultiModalAccuracyORM/std": 0.29859510362148284, + "step": 1110, + "train_speed(iter/s)": 0.031854 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.4, + "completions/mean_length": 347.5666748046875, + "completions/min_length": 218.3, + "epoch": 0.4505050505050505, + "grad_norm": 2.9268025842966563, + "kl": 0.0082672119140625, + "learning_rate": 2e-07, + "loss": -0.0031023643910884856, + "memory(GiB)": 113.5, + "reward": 0.35000001192092894, + "reward_std": 0.3800142765045166, + "rewards/MultiModalAccuracyORM/mean": 0.35000001192092894, + "rewards/MultiModalAccuracyORM/std": 0.3800142765045166, + "step": 1115, + "train_speed(iter/s)": 0.031867 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.2, + "completions/mean_length": 336.5333419799805, + "completions/min_length": 190.3, + "epoch": 0.45252525252525255, + "grad_norm": 2.361080053690534, + "kl": 0.009368896484375, + "learning_rate": 2e-07, + "loss": -0.007122965157032013, + "memory(GiB)": 113.5, + "reward": 0.4000000074505806, + "reward_std": 0.25241934359073637, + "rewards/MultiModalAccuracyORM/mean": 0.4000000074505806, + "rewards/MultiModalAccuracyORM/std": 0.25241934359073637, + "step": 1120, + "train_speed(iter/s)": 0.031889 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.4, + "completions/mean_length": 387.1333465576172, + "completions/min_length": 233.6, + "epoch": 0.45454545454545453, + "grad_norm": 1.323972233543725, + "kl": 0.008551025390625, + "learning_rate": 2e-07, + "loss": 0.03706555962562561, + "memory(GiB)": 113.5, + "reward": 0.19166667014360428, + "reward_std": 0.3109443962574005, + "rewards/MultiModalAccuracyORM/mean": 0.19166667014360428, + "rewards/MultiModalAccuracyORM/std": 0.3109443962574005, + "step": 1125, + "train_speed(iter/s)": 0.031889 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.3, + "completions/mean_length": 389.6666778564453, + "completions/min_length": 238.9, + "epoch": 0.45656565656565656, + "grad_norm": 0.7314973239006443, + "kl": 0.00838775634765625, + "learning_rate": 2e-07, + "loss": -0.0037152446806430818, + "memory(GiB)": 113.5, + "reward": 0.32500000223517417, + "reward_std": 0.2556006729602814, + "rewards/MultiModalAccuracyORM/mean": 0.32500000223517417, + "rewards/MultiModalAccuracyORM/std": 0.2556006729602814, + "step": 1130, + "train_speed(iter/s)": 0.031898 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.1, + "completions/mean_length": 376.69167175292966, + "completions/min_length": 193.9, + "epoch": 0.4585858585858586, + "grad_norm": 2.6192210055071947, + "kl": 0.008709716796875, + "learning_rate": 2e-07, + "loss": -0.010700675845146179, + "memory(GiB)": 113.5, + "reward": 0.2750000089406967, + "reward_std": 0.3663875609636307, + "rewards/MultiModalAccuracyORM/mean": 0.2750000089406967, + "rewards/MultiModalAccuracyORM/std": 0.3663875609636307, + "step": 1135, + "train_speed(iter/s)": 0.031914 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/mean_length": 373.23334350585935, + "completions/min_length": 224.3, + "epoch": 0.46060606060606063, + "grad_norm": 7.217746674191174, + "kl": 0.05963134765625, + "learning_rate": 2e-07, + "loss": 0.005391424894332886, + "memory(GiB)": 113.5, + "reward": 0.11666666865348815, + "reward_std": 0.255160054564476, + "rewards/MultiModalAccuracyORM/mean": 0.11666666865348815, + "rewards/MultiModalAccuracyORM/std": 0.255160054564476, + "step": 1140, + "train_speed(iter/s)": 0.031929 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.7, + "completions/mean_length": 264.12500686645507, + "completions/min_length": 149.1, + "epoch": 0.4626262626262626, + "grad_norm": 2.7608756487475525, + "kl": 0.014227294921875, + "learning_rate": 2e-07, + "loss": 0.01821192502975464, + "memory(GiB)": 113.5, + "reward": 0.3916666731238365, + "reward_std": 0.29640085995197296, + "rewards/MultiModalAccuracyORM/mean": 0.3916666731238365, + "rewards/MultiModalAccuracyORM/std": 0.29640085995197296, + "step": 1145, + "train_speed(iter/s)": 0.031947 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.1, + "completions/mean_length": 274.7416763305664, + "completions/min_length": 172.5, + "epoch": 0.46464646464646464, + "grad_norm": 2.4711978185790886, + "kl": 0.1343414306640625, + "learning_rate": 2e-07, + "loss": 0.017589953541755677, + "memory(GiB)": 113.5, + "reward": 0.24166667237877845, + "reward_std": 0.28959646821022034, + "rewards/MultiModalAccuracyORM/mean": 0.24166667237877845, + "rewards/MultiModalAccuracyORM/std": 0.28959646821022034, + "step": 1150, + "train_speed(iter/s)": 0.031968 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.7, + "completions/mean_length": 316.32501220703125, + "completions/min_length": 170.1, + "epoch": 0.4666666666666667, + "grad_norm": 2.193137307137183, + "kl": 0.00921173095703125, + "learning_rate": 2e-07, + "loss": 0.03984123468399048, + "memory(GiB)": 113.5, + "reward": 0.25833334103226663, + "reward_std": 0.3578915596008301, + "rewards/MultiModalAccuracyORM/mean": 0.25833334103226663, + "rewards/MultiModalAccuracyORM/std": 0.3578915596008301, + "step": 1155, + "train_speed(iter/s)": 0.031978 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.9, + "completions/mean_length": 312.7916748046875, + "completions/min_length": 192.9, + "epoch": 0.4686868686868687, + "grad_norm": 2.4370225749301886, + "kl": 0.0099822998046875, + "learning_rate": 2e-07, + "loss": 0.04419963359832764, + "memory(GiB)": 113.5, + "reward": 0.36666667759418486, + "reward_std": 0.34560186266899107, + "rewards/MultiModalAccuracyORM/mean": 0.36666667759418486, + "rewards/MultiModalAccuracyORM/std": 0.34560186266899107, + "step": 1160, + "train_speed(iter/s)": 0.031974 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/mean_length": 355.541674041748, + "completions/min_length": 156.5, + "epoch": 0.4707070707070707, + "grad_norm": 3.306493843440885, + "kl": 0.01005859375, + "learning_rate": 2e-07, + "loss": 0.021104392409324647, + "memory(GiB)": 113.5, + "reward": 0.31666667461395265, + "reward_std": 0.37450254559516905, + "rewards/MultiModalAccuracyORM/mean": 0.31666667461395265, + "rewards/MultiModalAccuracyORM/std": 0.37450254559516905, + "step": 1165, + "train_speed(iter/s)": 0.031991 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.5, + "completions/mean_length": 266.8083435058594, + "completions/min_length": 159.0, + "epoch": 0.4727272727272727, + "grad_norm": 3.2381508792172107, + "kl": 0.0092529296875, + "learning_rate": 2e-07, + "loss": 0.0005752682685852051, + "memory(GiB)": 113.5, + "reward": 0.19166667237877846, + "reward_std": 0.29159851372241974, + "rewards/MultiModalAccuracyORM/mean": 0.19166667237877846, + "rewards/MultiModalAccuracyORM/std": 0.29159851372241974, + "step": 1170, + "train_speed(iter/s)": 0.032017 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.1, + "completions/mean_length": 376.39167938232424, + "completions/min_length": 220.4, + "epoch": 0.47474747474747475, + "grad_norm": 1.6637816276606223, + "kl": 0.00755615234375, + "learning_rate": 2e-07, + "loss": 0.04589937329292297, + "memory(GiB)": 113.5, + "reward": 0.3083333417773247, + "reward_std": 0.37851486802101136, + "rewards/MultiModalAccuracyORM/mean": 0.3083333417773247, + "rewards/MultiModalAccuracyORM/std": 0.37851486802101136, + "step": 1175, + "train_speed(iter/s)": 0.032023 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.9, + "completions/mean_length": 448.13334197998046, + "completions/min_length": 217.0, + "epoch": 0.4767676767676768, + "grad_norm": 2.1978602872808075, + "kl": 0.0097900390625, + "learning_rate": 2e-07, + "loss": -0.009159280359745026, + "memory(GiB)": 113.5, + "reward": 0.3166666738688946, + "reward_std": 0.28452777564525605, + "rewards/MultiModalAccuracyORM/mean": 0.3166666738688946, + "rewards/MultiModalAccuracyORM/std": 0.28452777564525605, + "step": 1180, + "train_speed(iter/s)": 0.032033 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.5, + "completions/mean_length": 289.1250053405762, + "completions/min_length": 172.7, + "epoch": 0.47878787878787876, + "grad_norm": 1.3926480253734976, + "kl": 0.0114959716796875, + "learning_rate": 2e-07, + "loss": 0.009945812821388244, + "memory(GiB)": 113.5, + "reward": 0.5000000149011612, + "reward_std": 0.29630566835403443, + "rewards/MultiModalAccuracyORM/mean": 0.5000000149011612, + "rewards/MultiModalAccuracyORM/std": 0.29630566835403443, + "step": 1185, + "train_speed(iter/s)": 0.032058 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.5, + "completions/mean_length": 330.6666732788086, + "completions/min_length": 149.8, + "epoch": 0.4808080808080808, + "grad_norm": 1.9719497919311402, + "kl": 0.0093994140625, + "learning_rate": 2e-07, + "loss": 0.0009687811136245728, + "memory(GiB)": 113.5, + "reward": 0.3083333410322666, + "reward_std": 0.3478672981262207, + "rewards/MultiModalAccuracyORM/mean": 0.3083333410322666, + "rewards/MultiModalAccuracyORM/std": 0.3478672981262207, + "step": 1190, + "train_speed(iter/s)": 0.032072 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.4, + "completions/mean_length": 421.73334350585935, + "completions/min_length": 236.0, + "epoch": 0.48282828282828283, + "grad_norm": 1.243534573096356, + "kl": 0.0077484130859375, + "learning_rate": 2e-07, + "loss": -0.003622010350227356, + "memory(GiB)": 113.5, + "reward": 0.20833333879709243, + "reward_std": 0.29815449118614196, + "rewards/MultiModalAccuracyORM/mean": 0.20833333879709243, + "rewards/MultiModalAccuracyORM/std": 0.29815449118614196, + "step": 1195, + "train_speed(iter/s)": 0.03208 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.5, + "completions/mean_length": 360.5666748046875, + "completions/min_length": 198.9, + "epoch": 0.48484848484848486, + "grad_norm": 2.916364282012391, + "kl": 0.0125274658203125, + "learning_rate": 2e-07, + "loss": -0.021604710817337038, + "memory(GiB)": 113.5, + "reward": 0.22500000447034835, + "reward_std": 0.35037778615951537, + "rewards/MultiModalAccuracyORM/mean": 0.22500000447034835, + "rewards/MultiModalAccuracyORM/std": 0.35037778615951537, + "step": 1200, + "train_speed(iter/s)": 0.032073 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/mean_length": 282.54167556762695, + "completions/min_length": 161.2, + "epoch": 0.4868686868686869, + "grad_norm": 1.8011146439004695, + "kl": 0.0113494873046875, + "learning_rate": 2e-07, + "loss": 0.006717947870492935, + "memory(GiB)": 113.5, + "reward": 0.3333333387970924, + "reward_std": 0.19114727079868316, + "rewards/MultiModalAccuracyORM/mean": 0.3333333387970924, + "rewards/MultiModalAccuracyORM/std": 0.19114727079868316, + "step": 1205, + "train_speed(iter/s)": 0.032089 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.7, + "completions/mean_length": 442.2083435058594, + "completions/min_length": 237.3, + "epoch": 0.4888888888888889, + "grad_norm": 1.4349681794675622, + "kl": 0.0094146728515625, + "learning_rate": 2e-07, + "loss": 0.0006179869174957276, + "memory(GiB)": 113.5, + "reward": 0.2333333410322666, + "reward_std": 0.3762586027383804, + "rewards/MultiModalAccuracyORM/mean": 0.2333333410322666, + "rewards/MultiModalAccuracyORM/std": 0.3762586027383804, + "step": 1210, + "train_speed(iter/s)": 0.032088 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 738.7, + "completions/mean_length": 433.8916748046875, + "completions/min_length": 249.1, + "epoch": 0.4909090909090909, + "grad_norm": 2.547575664275865, + "kl": 0.00975189208984375, + "learning_rate": 2e-07, + "loss": -0.026794981956481934, + "memory(GiB)": 113.5, + "reward": 0.18333333656191825, + "reward_std": 0.20118070244789124, + "rewards/MultiModalAccuracyORM/mean": 0.18333333656191825, + "rewards/MultiModalAccuracyORM/std": 0.20118070244789124, + "step": 1215, + "train_speed(iter/s)": 0.032076 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 627.1, + "completions/mean_length": 359.12500915527346, + "completions/min_length": 197.8, + "epoch": 0.49292929292929294, + "grad_norm": 1.645283475386655, + "kl": 0.0115875244140625, + "learning_rate": 2e-07, + "loss": 0.024244531989097595, + "memory(GiB)": 113.5, + "reward": 0.3333333387970924, + "reward_std": 0.2511145621538162, + "rewards/MultiModalAccuracyORM/mean": 0.3333333387970924, + "rewards/MultiModalAccuracyORM/std": 0.2511145621538162, + "step": 1220, + "train_speed(iter/s)": 0.032088 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.3, + "completions/mean_length": 267.17501068115234, + "completions/min_length": 152.0, + "epoch": 0.494949494949495, + "grad_norm": 1.2460930349970594, + "kl": 0.010992431640625, + "learning_rate": 2e-07, + "loss": 0.007182718813419342, + "memory(GiB)": 113.5, + "reward": 0.2583333358168602, + "reward_std": 0.2536582201719284, + "rewards/MultiModalAccuracyORM/mean": 0.2583333358168602, + "rewards/MultiModalAccuracyORM/std": 0.2536582201719284, + "step": 1225, + "train_speed(iter/s)": 0.032122 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.8, + "completions/mean_length": 405.7416702270508, + "completions/min_length": 233.4, + "epoch": 0.49696969696969695, + "grad_norm": 3.259598192610936, + "kl": 0.0076690673828125, + "learning_rate": 2e-07, + "loss": 0.0032115459442138674, + "memory(GiB)": 113.5, + "reward": 0.33333334177732465, + "reward_std": 0.3470627248287201, + "rewards/MultiModalAccuracyORM/mean": 0.33333334177732465, + "rewards/MultiModalAccuracyORM/std": 0.3470627248287201, + "step": 1230, + "train_speed(iter/s)": 0.032125 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.1, + "completions/mean_length": 306.12501068115233, + "completions/min_length": 188.9, + "epoch": 0.498989898989899, + "grad_norm": 0.7719456506471253, + "kl": 0.01029205322265625, + "learning_rate": 2e-07, + "loss": -0.016546979546546936, + "memory(GiB)": 113.5, + "reward": 0.1916666693985462, + "reward_std": 0.2895223259925842, + "rewards/MultiModalAccuracyORM/mean": 0.1916666693985462, + "rewards/MultiModalAccuracyORM/std": 0.2895223259925842, + "step": 1235, + "train_speed(iter/s)": 0.032159 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.6, + "completions/mean_length": 343.00834503173826, + "completions/min_length": 207.8, + "epoch": 0.501010101010101, + "grad_norm": 1.6531811897726711, + "kl": 0.0093170166015625, + "learning_rate": 2e-07, + "loss": 0.02186596691608429, + "memory(GiB)": 113.5, + "reward": 0.19166667610406876, + "reward_std": 0.2448128044605255, + "rewards/MultiModalAccuracyORM/mean": 0.19166667610406876, + "rewards/MultiModalAccuracyORM/std": 0.2448128044605255, + "step": 1240, + "train_speed(iter/s)": 0.032173 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.6, + "completions/mean_length": 299.2916717529297, + "completions/min_length": 172.4, + "epoch": 0.503030303030303, + "grad_norm": 3.19592384950856, + "kl": 0.0099273681640625, + "learning_rate": 2e-07, + "loss": -0.006601364910602569, + "memory(GiB)": 113.5, + "reward": 0.2083333410322666, + "reward_std": 0.3292782694101334, + "rewards/MultiModalAccuracyORM/mean": 0.2083333410322666, + "rewards/MultiModalAccuracyORM/std": 0.3292782694101334, + "step": 1245, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.912303521551538, + "learning_rate": 2e-07, + "loss": -0.0002701073884963989, + "memory(GiB)": 113.5, + "step": 1250, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.5050505050505051, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 570.74, + "eval_completions/mean_length": 352.94834228515623, + "eval_completions/min_length": 210.42, + "eval_kl": 0.00790496826171875, + "eval_loss": 0.01708856225013733, + "eval_reward": 0.2983333393931389, + "eval_reward_std": 0.3327623122930527, + "eval_rewards/MultiModalAccuracyORM/mean": 0.2983333393931389, + "eval_rewards/MultiModalAccuracyORM/std": 0.3327623122930527, + "eval_runtime": 568.068, + "eval_samples_per_second": 0.088, + "eval_steps_per_second": 0.009, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.8, + "completions/mean_length": 358.1875087738037, + "completions/min_length": 223.5, + "epoch": 0.5070707070707071, + "grad_norm": 1.8888348326711508, + "kl": 0.01190643310546875, + "learning_rate": 2e-07, + "loss": 0.019428746402263643, + "memory(GiB)": 113.5, + "reward": 0.27916667275130747, + "reward_std": 0.38802969008684157, + "rewards/MultiModalAccuracyORM/mean": 0.27916667275130747, + "rewards/MultiModalAccuracyORM/std": 0.38802969008684157, + "step": 1255, + "train_speed(iter/s)": 0.031527 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.7, + "completions/mean_length": 373.8333465576172, + "completions/min_length": 225.6, + "epoch": 0.509090909090909, + "grad_norm": 1.8028043067539863, + "kl": 0.0100677490234375, + "learning_rate": 2e-07, + "loss": 0.027076438069343567, + "memory(GiB)": 113.5, + "reward": 0.19166667386889458, + "reward_std": 0.3207202464342117, + "rewards/MultiModalAccuracyORM/mean": 0.19166667386889458, + "rewards/MultiModalAccuracyORM/std": 0.3207202464342117, + "step": 1260, + "train_speed(iter/s)": 0.031526 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.3, + "completions/mean_length": 397.9250152587891, + "completions/min_length": 204.6, + "epoch": 0.5111111111111111, + "grad_norm": 2.2225728768142723, + "kl": 0.011029052734375, + "learning_rate": 2e-07, + "loss": -0.04860515892505646, + "memory(GiB)": 113.5, + "reward": 0.29166667684912684, + "reward_std": 0.33303394317626955, + "rewards/MultiModalAccuracyORM/mean": 0.29166667684912684, + "rewards/MultiModalAccuracyORM/std": 0.33303394317626955, + "step": 1265, + "train_speed(iter/s)": 0.031521 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.6, + "completions/mean_length": 342.73334350585935, + "completions/min_length": 191.1, + "epoch": 0.5131313131313131, + "grad_norm": 1.467354462173463, + "kl": 0.0107818603515625, + "learning_rate": 2e-07, + "loss": 0.03341347873210907, + "memory(GiB)": 113.5, + "reward": 0.34166667982935905, + "reward_std": 0.2812868595123291, + "rewards/MultiModalAccuracyORM/mean": 0.34166667982935905, + "rewards/MultiModalAccuracyORM/std": 0.2812868595123291, + "step": 1270, + "train_speed(iter/s)": 0.031536 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.8, + "completions/mean_length": 283.72500762939455, + "completions/min_length": 163.8, + "epoch": 0.5151515151515151, + "grad_norm": 3.716342095943599, + "kl": 0.014361572265625, + "learning_rate": 2e-07, + "loss": 0.02838865518569946, + "memory(GiB)": 113.5, + "reward": 0.433333345502615, + "reward_std": 0.3993005663156509, + "rewards/MultiModalAccuracyORM/mean": 0.433333345502615, + "rewards/MultiModalAccuracyORM/std": 0.3993005663156509, + "step": 1275, + "train_speed(iter/s)": 0.031554 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.3, + "completions/mean_length": 302.8583389282227, + "completions/min_length": 167.6, + "epoch": 0.5171717171717172, + "grad_norm": 2.503627797323309, + "kl": 0.0103668212890625, + "learning_rate": 2e-07, + "loss": 0.00705558955669403, + "memory(GiB)": 113.5, + "reward": 0.6333333551883698, + "reward_std": 0.43680969774723055, + "rewards/MultiModalAccuracyORM/mean": 0.6333333551883698, + "rewards/MultiModalAccuracyORM/std": 0.43680969774723055, + "step": 1280, + "train_speed(iter/s)": 0.031583 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.5, + "completions/mean_length": 403.7000076293945, + "completions/min_length": 185.7, + "epoch": 0.5191919191919192, + "grad_norm": 3.2251809838929315, + "kl": 0.0101654052734375, + "learning_rate": 2e-07, + "loss": -0.037446904182434085, + "memory(GiB)": 113.5, + "reward": 0.3083333395421505, + "reward_std": 0.3978011578321457, + "rewards/MultiModalAccuracyORM/mean": 0.3083333395421505, + "rewards/MultiModalAccuracyORM/std": 0.3978011578321457, + "step": 1285, + "train_speed(iter/s)": 0.031592 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.7, + "completions/mean_length": 300.60834045410155, + "completions/min_length": 158.4, + "epoch": 0.5212121212121212, + "grad_norm": 1.7359935662818697, + "kl": 0.0238189697265625, + "learning_rate": 2e-07, + "loss": 0.005645626783370971, + "memory(GiB)": 113.5, + "reward": 0.4916666761040688, + "reward_std": 0.37272491455078127, + "rewards/MultiModalAccuracyORM/mean": 0.4916666761040688, + "rewards/MultiModalAccuracyORM/std": 0.37272491455078127, + "step": 1290, + "train_speed(iter/s)": 0.031618 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 658.6, + "completions/mean_length": 381.3166809082031, + "completions/min_length": 189.5, + "epoch": 0.5232323232323233, + "grad_norm": 2.291063786312688, + "kl": 0.0126251220703125, + "learning_rate": 2e-07, + "loss": 0.03457438945770264, + "memory(GiB)": 113.5, + "reward": 0.29166667312383654, + "reward_std": 0.3760043799877167, + "rewards/MultiModalAccuracyORM/mean": 0.29166667312383654, + "rewards/MultiModalAccuracyORM/std": 0.3760043799877167, + "step": 1295, + "train_speed(iter/s)": 0.031626 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.2, + "completions/mean_length": 334.5833480834961, + "completions/min_length": 191.9, + "epoch": 0.5252525252525253, + "grad_norm": 2.290129258389379, + "kl": 0.0095916748046875, + "learning_rate": 2e-07, + "loss": -0.024787557125091553, + "memory(GiB)": 113.5, + "reward": 0.37500000894069674, + "reward_std": 0.29634126722812654, + "rewards/MultiModalAccuracyORM/mean": 0.37500000894069674, + "rewards/MultiModalAccuracyORM/std": 0.29634126722812654, + "step": 1300, + "train_speed(iter/s)": 0.031656 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.6, + "completions/mean_length": 374.36668395996094, + "completions/min_length": 192.7, + "epoch": 0.5272727272727272, + "grad_norm": 2.0627127342369556, + "kl": 0.0099151611328125, + "learning_rate": 2e-07, + "loss": 0.004585762321949005, + "memory(GiB)": 113.5, + "reward": 0.2250000037252903, + "reward_std": 0.40560232698917387, + "rewards/MultiModalAccuracyORM/mean": 0.2250000037252903, + "rewards/MultiModalAccuracyORM/std": 0.40560232698917387, + "step": 1305, + "train_speed(iter/s)": 0.031669 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 569.9, + "completions/mean_length": 281.84167556762696, + "completions/min_length": 151.0, + "epoch": 0.5292929292929293, + "grad_norm": 1.2890377388651022, + "kl": 0.012725830078125, + "learning_rate": 2e-07, + "loss": -0.00015339255332946777, + "memory(GiB)": 113.5, + "reward": 0.341666679084301, + "reward_std": 0.31068681478500365, + "rewards/MultiModalAccuracyORM/mean": 0.341666679084301, + "rewards/MultiModalAccuracyORM/std": 0.31068681478500365, + "step": 1310, + "train_speed(iter/s)": 0.031675 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.4, + "completions/mean_length": 307.5916717529297, + "completions/min_length": 186.8, + "epoch": 0.5313131313131313, + "grad_norm": 2.0391373518251648, + "kl": 0.0098358154296875, + "learning_rate": 2e-07, + "loss": 0.06573413610458374, + "memory(GiB)": 113.5, + "reward": 0.28333333805203437, + "reward_std": 0.351182359457016, + "rewards/MultiModalAccuracyORM/mean": 0.28333333805203437, + "rewards/MultiModalAccuracyORM/std": 0.351182359457016, + "step": 1315, + "train_speed(iter/s)": 0.031691 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/mean_length": 273.75001068115233, + "completions/min_length": 144.0, + "epoch": 0.5333333333333333, + "grad_norm": 2.059578451448539, + "kl": 0.01175537109375, + "learning_rate": 2e-07, + "loss": 0.04888114631175995, + "memory(GiB)": 113.5, + "reward": 0.3500000096857548, + "reward_std": 0.4166352391242981, + "rewards/MultiModalAccuracyORM/mean": 0.3500000096857548, + "rewards/MultiModalAccuracyORM/std": 0.4166352391242981, + "step": 1320, + "train_speed(iter/s)": 0.031715 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.8, + "completions/mean_length": 376.00000915527346, + "completions/min_length": 193.4, + "epoch": 0.5353535353535354, + "grad_norm": 2.604163597368088, + "kl": 0.014556884765625, + "learning_rate": 2e-07, + "loss": 0.025493156909942628, + "memory(GiB)": 113.5, + "reward": 0.2833333432674408, + "reward_std": 0.33376437425613403, + "rewards/MultiModalAccuracyORM/mean": 0.2833333432674408, + "rewards/MultiModalAccuracyORM/std": 0.33376437425613403, + "step": 1325, + "train_speed(iter/s)": 0.031732 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.3, + "completions/mean_length": 319.8833465576172, + "completions/min_length": 185.2, + "epoch": 0.5373737373737374, + "grad_norm": 2.939920293327845, + "kl": 0.0071502685546875, + "learning_rate": 2e-07, + "loss": -0.0020159482955932617, + "memory(GiB)": 113.5, + "reward": 0.2500000037252903, + "reward_std": 0.33000870048999786, + "rewards/MultiModalAccuracyORM/mean": 0.2500000037252903, + "rewards/MultiModalAccuracyORM/std": 0.33000870048999786, + "step": 1330, + "train_speed(iter/s)": 0.031756 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.3, + "completions/mean_length": 409.9833419799805, + "completions/min_length": 273.6, + "epoch": 0.5393939393939394, + "grad_norm": 1.8600557381971845, + "kl": 0.011651611328125, + "learning_rate": 2e-07, + "loss": 0.014566189050674439, + "memory(GiB)": 113.5, + "reward": 0.2500000029802322, + "reward_std": 0.36642315685749055, + "rewards/MultiModalAccuracyORM/mean": 0.2500000029802322, + "rewards/MultiModalAccuracyORM/std": 0.36642315685749055, + "step": 1335, + "train_speed(iter/s)": 0.031764 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/mean_length": 304.6666717529297, + "completions/min_length": 186.0, + "epoch": 0.5414141414141415, + "grad_norm": 2.250726659882682, + "kl": 0.0128570556640625, + "learning_rate": 2e-07, + "loss": 0.0025543123483657837, + "memory(GiB)": 113.5, + "reward": 0.1416666679084301, + "reward_std": 0.24939410090446473, + "rewards/MultiModalAccuracyORM/mean": 0.1416666679084301, + "rewards/MultiModalAccuracyORM/std": 0.24939410090446473, + "step": 1340, + "train_speed(iter/s)": 0.031764 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.8, + "completions/mean_length": 320.21667633056643, + "completions/min_length": 156.5, + "epoch": 0.5434343434343434, + "grad_norm": 3.49354443152123, + "kl": 0.01195068359375, + "learning_rate": 2e-07, + "loss": -0.026122617721557616, + "memory(GiB)": 113.5, + "reward": 0.10833333656191826, + "reward_std": 0.2714240521192551, + "rewards/MultiModalAccuracyORM/mean": 0.10833333656191826, + "rewards/MultiModalAccuracyORM/std": 0.2714240521192551, + "step": 1345, + "train_speed(iter/s)": 0.031794 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/mean_length": 359.27501220703124, + "completions/min_length": 227.5, + "epoch": 0.5454545454545454, + "grad_norm": 1.3573423115932872, + "kl": 0.008941650390625, + "learning_rate": 2e-07, + "loss": 0.02098418176174164, + "memory(GiB)": 113.5, + "reward": 0.30833334401249884, + "reward_std": 0.3207202464342117, + "rewards/MultiModalAccuracyORM/mean": 0.30833334401249884, + "rewards/MultiModalAccuracyORM/std": 0.3207202464342117, + "step": 1350, + "train_speed(iter/s)": 0.031817 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.3, + "completions/mean_length": 368.5750106811523, + "completions/min_length": 194.4, + "epoch": 0.5474747474747474, + "grad_norm": 1.268246321814541, + "kl": 0.011553955078125, + "learning_rate": 2e-07, + "loss": -0.037621939182281496, + "memory(GiB)": 113.5, + "reward": 0.3083333447575569, + "reward_std": 0.3823301374912262, + "rewards/MultiModalAccuracyORM/mean": 0.3083333447575569, + "rewards/MultiModalAccuracyORM/std": 0.3823301374912262, + "step": 1355, + "train_speed(iter/s)": 0.031821 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.5, + "completions/mean_length": 337.325008392334, + "completions/min_length": 166.7, + "epoch": 0.5494949494949495, + "grad_norm": 1.1405744205796668, + "kl": 0.0093414306640625, + "learning_rate": 2e-07, + "loss": 0.05270506143569946, + "memory(GiB)": 113.5, + "reward": 0.5333333417773247, + "reward_std": 0.30996555387973784, + "rewards/MultiModalAccuracyORM/mean": 0.5333333417773247, + "rewards/MultiModalAccuracyORM/std": 0.30996555387973784, + "step": 1360, + "train_speed(iter/s)": 0.031838 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.6, + "completions/mean_length": 287.0166732788086, + "completions/min_length": 144.9, + "epoch": 0.5515151515151515, + "grad_norm": 2.505246091989759, + "kl": 0.0113037109375, + "learning_rate": 2e-07, + "loss": -0.027878284454345703, + "memory(GiB)": 113.5, + "reward": 0.24166667833924294, + "reward_std": 0.34710127115249634, + "rewards/MultiModalAccuracyORM/mean": 0.24166667833924294, + "rewards/MultiModalAccuracyORM/std": 0.34710127115249634, + "step": 1365, + "train_speed(iter/s)": 0.031858 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.9, + "completions/mean_length": 330.1916793823242, + "completions/min_length": 153.1, + "epoch": 0.5535353535353535, + "grad_norm": 3.131582003300663, + "kl": 0.0138336181640625, + "learning_rate": 2e-07, + "loss": -0.0038233429193496706, + "memory(GiB)": 113.5, + "reward": 0.31666667982935903, + "reward_std": 0.37345829904079436, + "rewards/MultiModalAccuracyORM/mean": 0.31666667982935903, + "rewards/MultiModalAccuracyORM/std": 0.37345829904079436, + "step": 1370, + "train_speed(iter/s)": 0.031869 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.4, + "completions/mean_length": 286.3666702270508, + "completions/min_length": 149.1, + "epoch": 0.5555555555555556, + "grad_norm": 1.9956984289591713, + "kl": 0.0112335205078125, + "learning_rate": 2e-07, + "loss": -0.012190797924995422, + "memory(GiB)": 113.5, + "reward": 0.40833333879709244, + "reward_std": 0.3855446308851242, + "rewards/MultiModalAccuracyORM/mean": 0.40833333879709244, + "rewards/MultiModalAccuracyORM/std": 0.3855446308851242, + "step": 1375, + "train_speed(iter/s)": 0.031897 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.7, + "completions/mean_length": 324.9166763305664, + "completions/min_length": 202.3, + "epoch": 0.5575757575757576, + "grad_norm": 2.1303502921633335, + "kl": 0.00983428955078125, + "learning_rate": 2e-07, + "loss": 0.02664785385131836, + "memory(GiB)": 113.5, + "reward": 0.40833333879709244, + "reward_std": 0.30971133410930635, + "rewards/MultiModalAccuracyORM/mean": 0.40833333879709244, + "rewards/MultiModalAccuracyORM/std": 0.30971133410930635, + "step": 1380, + "train_speed(iter/s)": 0.031915 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.6, + "completions/mean_length": 356.1666793823242, + "completions/min_length": 201.3, + "epoch": 0.5595959595959596, + "grad_norm": 2.9931509831712524, + "kl": 0.012689208984375, + "learning_rate": 2e-07, + "loss": -0.039350539445877075, + "memory(GiB)": 113.5, + "reward": 0.1666666716337204, + "reward_std": 0.2917931377887726, + "rewards/MultiModalAccuracyORM/mean": 0.1666666716337204, + "rewards/MultiModalAccuracyORM/std": 0.2917931377887726, + "step": 1385, + "train_speed(iter/s)": 0.031924 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/mean_length": 356.6083358764648, + "completions/min_length": 193.0, + "epoch": 0.5616161616161616, + "grad_norm": 2.198573582527943, + "kl": 0.008941650390625, + "learning_rate": 2e-07, + "loss": -0.022810643911361693, + "memory(GiB)": 113.5, + "reward": 0.35000001415610316, + "reward_std": 0.32673218548297883, + "rewards/MultiModalAccuracyORM/mean": 0.35000001415610316, + "rewards/MultiModalAccuracyORM/std": 0.32673218548297883, + "step": 1390, + "train_speed(iter/s)": 0.031929 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/mean_length": 314.82500686645506, + "completions/min_length": 182.1, + "epoch": 0.5636363636363636, + "grad_norm": 2.150533068523157, + "kl": 0.0106353759765625, + "learning_rate": 2e-07, + "loss": -0.013728708028793335, + "memory(GiB)": 113.5, + "reward": 0.46666667312383653, + "reward_std": 0.25897532403469087, + "rewards/MultiModalAccuracyORM/mean": 0.46666667312383653, + "rewards/MultiModalAccuracyORM/std": 0.25897532403469087, + "step": 1395, + "train_speed(iter/s)": 0.031954 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.3, + "completions/mean_length": 353.3833465576172, + "completions/min_length": 204.6, + "epoch": 0.5656565656565656, + "grad_norm": 1.940941493471918, + "kl": 0.0095428466796875, + "learning_rate": 2e-07, + "loss": -0.006394821405410767, + "memory(GiB)": 113.5, + "reward": 0.3916666753590107, + "reward_std": 0.34550372064113616, + "rewards/MultiModalAccuracyORM/mean": 0.3916666753590107, + "rewards/MultiModalAccuracyORM/std": 0.34550372064113616, + "step": 1400, + "train_speed(iter/s)": 0.031962 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.4, + "completions/mean_length": 432.6000045776367, + "completions/min_length": 228.0, + "epoch": 0.5676767676767677, + "grad_norm": 2.4378327864764286, + "kl": 0.011553955078125, + "learning_rate": 2e-07, + "loss": 0.04005226194858551, + "memory(GiB)": 113.5, + "reward": 0.2916666746139526, + "reward_std": 0.3370794355869293, + "rewards/MultiModalAccuracyORM/mean": 0.2916666746139526, + "rewards/MultiModalAccuracyORM/std": 0.3370794355869293, + "step": 1405, + "train_speed(iter/s)": 0.031967 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.2, + "completions/mean_length": 384.7333465576172, + "completions/min_length": 220.2, + "epoch": 0.5696969696969697, + "grad_norm": 0.780805540568698, + "kl": 0.01131591796875, + "learning_rate": 2e-07, + "loss": 0.03709500730037689, + "memory(GiB)": 113.5, + "reward": 0.33333333730697634, + "reward_std": 0.3572298943996429, + "rewards/MultiModalAccuracyORM/mean": 0.33333333730697634, + "rewards/MultiModalAccuracyORM/std": 0.3572298943996429, + "step": 1410, + "train_speed(iter/s)": 0.031973 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/mean_length": 321.6333465576172, + "completions/min_length": 164.9, + "epoch": 0.5717171717171717, + "grad_norm": 1.430362343806847, + "kl": 0.0101104736328125, + "learning_rate": 2e-07, + "loss": 0.013754424452781678, + "memory(GiB)": 113.5, + "reward": 0.2583333387970924, + "reward_std": 0.28555097579956057, + "rewards/MultiModalAccuracyORM/mean": 0.2583333387970924, + "rewards/MultiModalAccuracyORM/std": 0.28555097579956057, + "step": 1415, + "train_speed(iter/s)": 0.031992 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 816.5, + "completions/mean_length": 438.3833435058594, + "completions/min_length": 264.8, + "epoch": 0.5737373737373738, + "grad_norm": 1.6263448971015675, + "kl": 0.010430908203125, + "learning_rate": 2e-07, + "loss": -0.0029776930809020997, + "memory(GiB)": 113.5, + "reward": 0.2333333395421505, + "reward_std": 0.3883536756038666, + "rewards/MultiModalAccuracyORM/mean": 0.2333333395421505, + "rewards/MultiModalAccuracyORM/std": 0.3883536756038666, + "step": 1420, + "train_speed(iter/s)": 0.031979 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 766.6, + "completions/mean_length": 387.12500915527346, + "completions/min_length": 211.4, + "epoch": 0.5757575757575758, + "grad_norm": 2.1728432922463274, + "kl": 0.0098236083984375, + "learning_rate": 2e-07, + "loss": -0.004918041825294495, + "memory(GiB)": 113.5, + "reward": 0.23333333656191826, + "reward_std": 0.10697162449359894, + "rewards/MultiModalAccuracyORM/mean": 0.23333333656191826, + "rewards/MultiModalAccuracyORM/std": 0.10697162449359894, + "step": 1425, + "train_speed(iter/s)": 0.031977 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.8, + "completions/mean_length": 388.7416793823242, + "completions/min_length": 235.4, + "epoch": 0.5777777777777777, + "grad_norm": 1.7935893801244052, + "kl": 0.0087493896484375, + "learning_rate": 2e-07, + "loss": 0.04609963297843933, + "memory(GiB)": 113.5, + "reward": 0.3500000134110451, + "reward_std": 0.32297651171684266, + "rewards/MultiModalAccuracyORM/mean": 0.3500000134110451, + "rewards/MultiModalAccuracyORM/std": 0.32297651171684266, + "step": 1430, + "train_speed(iter/s)": 0.031989 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.9, + "completions/mean_length": 398.00001220703126, + "completions/min_length": 208.4, + "epoch": 0.5797979797979798, + "grad_norm": 2.549829840865519, + "kl": 0.010107421875, + "learning_rate": 2e-07, + "loss": -0.0018973067402839662, + "memory(GiB)": 113.5, + "reward": 0.4250000089406967, + "reward_std": 0.3973225235939026, + "rewards/MultiModalAccuracyORM/mean": 0.4250000089406967, + "rewards/MultiModalAccuracyORM/std": 0.3973225235939026, + "step": 1435, + "train_speed(iter/s)": 0.031994 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.2, + "completions/mean_length": 325.5333450317383, + "completions/min_length": 163.1, + "epoch": 0.5818181818181818, + "grad_norm": 2.800120485549645, + "kl": 0.0125640869140625, + "learning_rate": 2e-07, + "loss": -0.016949039697647095, + "memory(GiB)": 113.5, + "reward": 0.4166666716337204, + "reward_std": 0.34232239723205565, + "rewards/MultiModalAccuracyORM/mean": 0.4166666716337204, + "rewards/MultiModalAccuracyORM/std": 0.34232239723205565, + "step": 1440, + "train_speed(iter/s)": 0.032007 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.5, + "completions/mean_length": 359.07500915527345, + "completions/min_length": 200.2, + "epoch": 0.5838383838383838, + "grad_norm": 2.2400645386442526, + "kl": 0.0367034912109375, + "learning_rate": 2e-07, + "loss": 0.027681028842926024, + "memory(GiB)": 113.5, + "reward": 0.29166667312383654, + "reward_std": 0.29815449118614196, + "rewards/MultiModalAccuracyORM/mean": 0.29166667312383654, + "rewards/MultiModalAccuracyORM/std": 0.29815449118614196, + "step": 1445, + "train_speed(iter/s)": 0.032019 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.8, + "completions/mean_length": 411.00001068115233, + "completions/min_length": 244.1, + "epoch": 0.5858585858585859, + "grad_norm": 2.864884904580614, + "kl": 0.009783935546875, + "learning_rate": 2e-07, + "loss": 0.00823460817337036, + "memory(GiB)": 113.5, + "reward": 0.3416666768491268, + "reward_std": 0.3438218057155609, + "rewards/MultiModalAccuracyORM/mean": 0.3416666768491268, + "rewards/MultiModalAccuracyORM/std": 0.3438218057155609, + "step": 1450, + "train_speed(iter/s)": 0.032023 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.1, + "completions/mean_length": 335.3583465576172, + "completions/min_length": 205.1, + "epoch": 0.5878787878787879, + "grad_norm": 1.4688157931726233, + "kl": 0.0092010498046875, + "learning_rate": 2e-07, + "loss": 0.01696823239326477, + "memory(GiB)": 113.5, + "reward": 0.37500000968575475, + "reward_std": 0.35413345992565154, + "rewards/MultiModalAccuracyORM/mean": 0.37500000968575475, + "rewards/MultiModalAccuracyORM/std": 0.35413345992565154, + "step": 1455, + "train_speed(iter/s)": 0.03204 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.3, + "completions/mean_length": 473.9416748046875, + "completions/min_length": 249.7, + "epoch": 0.5898989898989899, + "grad_norm": 1.1646459187041633, + "kl": 0.0099945068359375, + "learning_rate": 2e-07, + "loss": 0.014775393903255463, + "memory(GiB)": 113.5, + "reward": 0.2666666738688946, + "reward_std": 0.30333785712718964, + "rewards/MultiModalAccuracyORM/mean": 0.2666666738688946, + "rewards/MultiModalAccuracyORM/std": 0.30333785712718964, + "step": 1460, + "train_speed(iter/s)": 0.032026 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 701.1, + "completions/mean_length": 384.5083404541016, + "completions/min_length": 204.1, + "epoch": 0.591919191919192, + "grad_norm": 0.04302173761513684, + "kl": 0.012548828125, + "learning_rate": 2e-07, + "loss": -0.001154869794845581, + "memory(GiB)": 113.5, + "reward": 0.3000000141561031, + "reward_std": 0.3127244532108307, + "rewards/MultiModalAccuracyORM/mean": 0.3000000141561031, + "rewards/MultiModalAccuracyORM/std": 0.3127244532108307, + "step": 1465, + "train_speed(iter/s)": 0.032029 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.3, + "completions/mean_length": 352.4416763305664, + "completions/min_length": 195.1, + "epoch": 0.593939393939394, + "grad_norm": 2.051161125641378, + "kl": 0.014813232421875, + "learning_rate": 2e-07, + "loss": 0.0119085431098938, + "memory(GiB)": 113.5, + "reward": 0.3083333395421505, + "reward_std": 0.34488060176372526, + "rewards/MultiModalAccuracyORM/mean": 0.3083333395421505, + "rewards/MultiModalAccuracyORM/std": 0.34488060176372526, + "step": 1470, + "train_speed(iter/s)": 0.03204 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.7, + "completions/mean_length": 466.6416809082031, + "completions/min_length": 251.3, + "epoch": 0.5959595959595959, + "grad_norm": 1.842366669706851, + "kl": 0.01016082763671875, + "learning_rate": 2e-07, + "loss": 0.015132546424865723, + "memory(GiB)": 113.5, + "reward": 0.22500001043081283, + "reward_std": 0.3044206529855728, + "rewards/MultiModalAccuracyORM/mean": 0.22500001043081283, + "rewards/MultiModalAccuracyORM/std": 0.3044206529855728, + "step": 1475, + "train_speed(iter/s)": 0.032046 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.7, + "completions/mean_length": 344.0333480834961, + "completions/min_length": 205.0, + "epoch": 0.597979797979798, + "grad_norm": 0.07710895823869458, + "kl": 0.01250762939453125, + "learning_rate": 2e-07, + "loss": 0.02509859800338745, + "memory(GiB)": 113.5, + "reward": 0.47500001192092894, + "reward_std": 0.2752393215894699, + "rewards/MultiModalAccuracyORM/mean": 0.47500001192092894, + "rewards/MultiModalAccuracyORM/std": 0.2752393215894699, + "step": 1480, + "train_speed(iter/s)": 0.032062 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.5, + "completions/mean_length": 306.80834197998047, + "completions/min_length": 174.4, + "epoch": 0.6, + "grad_norm": 0.084452934933302, + "kl": 0.0158172607421875, + "learning_rate": 2e-07, + "loss": -0.027300435304641723, + "memory(GiB)": 113.5, + "reward": 0.17500000521540643, + "reward_std": 0.24105713069438933, + "rewards/MultiModalAccuracyORM/mean": 0.17500000521540643, + "rewards/MultiModalAccuracyORM/std": 0.24105713069438933, + "step": 1485, + "train_speed(iter/s)": 0.032084 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.7, + "completions/mean_length": 324.4833389282227, + "completions/min_length": 169.9, + "epoch": 0.602020202020202, + "grad_norm": 1.3165133966084028, + "kl": 0.0114501953125, + "learning_rate": 2e-07, + "loss": 0.004012265801429748, + "memory(GiB)": 113.5, + "reward": 0.3916666753590107, + "reward_std": 0.31046818792819975, + "rewards/MultiModalAccuracyORM/mean": 0.3916666753590107, + "rewards/MultiModalAccuracyORM/std": 0.31046818792819975, + "step": 1490, + "train_speed(iter/s)": 0.032103 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.7, + "completions/mean_length": 336.21668243408203, + "completions/min_length": 202.5, + "epoch": 0.604040404040404, + "grad_norm": 3.938520632284254, + "kl": 0.0132232666015625, + "learning_rate": 2e-07, + "loss": -0.02633047103881836, + "memory(GiB)": 113.5, + "reward": 0.3416666708886623, + "reward_std": 0.3149157464504242, + "rewards/MultiModalAccuracyORM/mean": 0.3416666708886623, + "rewards/MultiModalAccuracyORM/std": 0.3149157464504242, + "step": 1495, + "train_speed(iter/s)": 0.032103 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 2.7010910619752164, + "learning_rate": 2e-07, + "loss": 0.023089283704757692, + "memory(GiB)": 113.5, + "step": 1500, + "train_speed(iter/s)": 0.032112 + }, + { + "epoch": 0.6060606060606061, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 598.76, + "eval_completions/mean_length": 375.5383447265625, + "eval_completions/min_length": 218.18, + "eval_kl": 0.00917266845703125, + "eval_loss": -0.012349152937531471, + "eval_reward": 0.32000000730156897, + "eval_reward_std": 0.3092414766550064, + "eval_rewards/MultiModalAccuracyORM/mean": 0.32000000730156897, + "eval_rewards/MultiModalAccuracyORM/std": 0.3092414766550064, + "eval_runtime": 601.161, + "eval_samples_per_second": 0.083, + "eval_steps_per_second": 0.008, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.6, + "completions/mean_length": 392.15001182556153, + "completions/min_length": 216.0, + "epoch": 0.6080808080808081, + "grad_norm": 1.4655160488310728, + "kl": 0.010688018798828126, + "learning_rate": 2e-07, + "loss": 0.00576329231262207, + "memory(GiB)": 113.5, + "reward": 0.40416667349636554, + "reward_std": 0.31379757523536683, + "rewards/MultiModalAccuracyORM/mean": 0.40416667349636554, + "rewards/MultiModalAccuracyORM/std": 0.31379757523536683, + "step": 1505, + "train_speed(iter/s)": 0.031582 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.6, + "completions/mean_length": 392.9166763305664, + "completions/min_length": 185.9, + "epoch": 0.6101010101010101, + "grad_norm": 2.300152870135833, + "kl": 0.0118072509765625, + "learning_rate": 2e-07, + "loss": 0.01058935523033142, + "memory(GiB)": 113.5, + "reward": 0.15833333656191825, + "reward_std": 0.27622397541999816, + "rewards/MultiModalAccuracyORM/mean": 0.15833333656191825, + "rewards/MultiModalAccuracyORM/std": 0.27622397541999816, + "step": 1510, + "train_speed(iter/s)": 0.031594 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.5, + "completions/mean_length": 401.3166809082031, + "completions/min_length": 227.3, + "epoch": 0.6121212121212121, + "grad_norm": 2.0573660536714256, + "kl": 0.01282958984375, + "learning_rate": 2e-07, + "loss": 0.028659382462501527, + "memory(GiB)": 113.5, + "reward": 0.27500000819563863, + "reward_std": 0.3438218057155609, + "rewards/MultiModalAccuracyORM/mean": 0.27500000819563863, + "rewards/MultiModalAccuracyORM/std": 0.3438218057155609, + "step": 1515, + "train_speed(iter/s)": 0.031593 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/mean_length": 390.6333465576172, + "completions/min_length": 240.3, + "epoch": 0.6141414141414141, + "grad_norm": 1.4644802229965364, + "kl": 0.0115325927734375, + "learning_rate": 2e-07, + "loss": 0.009964641928672791, + "memory(GiB)": 113.5, + "reward": 0.20833334624767302, + "reward_std": 0.25113856196403506, + "rewards/MultiModalAccuracyORM/mean": 0.20833334624767302, + "rewards/MultiModalAccuracyORM/std": 0.25113856196403506, + "step": 1520, + "train_speed(iter/s)": 0.031607 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.1, + "completions/mean_length": 338.3666763305664, + "completions/min_length": 187.9, + "epoch": 0.6161616161616161, + "grad_norm": 2.312953380739967, + "kl": 0.011322021484375, + "learning_rate": 2e-07, + "loss": 0.0045973040163516995, + "memory(GiB)": 113.5, + "reward": 0.22500000521540642, + "reward_std": 0.22224704921245575, + "rewards/MultiModalAccuracyORM/mean": 0.22500000521540642, + "rewards/MultiModalAccuracyORM/std": 0.22224704921245575, + "step": 1525, + "train_speed(iter/s)": 0.03163 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.6, + "completions/mean_length": 368.7583465576172, + "completions/min_length": 193.4, + "epoch": 0.6181818181818182, + "grad_norm": 3.0723153433233095, + "kl": 0.0133697509765625, + "learning_rate": 2e-07, + "loss": -0.030410391092300416, + "memory(GiB)": 113.5, + "reward": 0.25000000819563867, + "reward_std": 0.35340302884578706, + "rewards/MultiModalAccuracyORM/mean": 0.25000000819563867, + "rewards/MultiModalAccuracyORM/std": 0.35340302884578706, + "step": 1530, + "train_speed(iter/s)": 0.03164 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.6, + "completions/mean_length": 351.21667633056643, + "completions/min_length": 227.9, + "epoch": 0.6202020202020202, + "grad_norm": 1.4538179467280616, + "kl": 0.01126708984375, + "learning_rate": 2e-07, + "loss": 0.0038071274757385254, + "memory(GiB)": 113.5, + "reward": 0.31666667610406873, + "reward_std": 0.27749558687210085, + "rewards/MultiModalAccuracyORM/mean": 0.31666667610406873, + "rewards/MultiModalAccuracyORM/std": 0.27749558687210085, + "step": 1535, + "train_speed(iter/s)": 0.031636 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.2, + "completions/mean_length": 362.9000076293945, + "completions/min_length": 196.3, + "epoch": 0.6222222222222222, + "grad_norm": 2.3834408729545817, + "kl": 0.011865234375, + "learning_rate": 2e-07, + "loss": -0.007588768005371093, + "memory(GiB)": 113.5, + "reward": 0.2500000029802322, + "reward_std": 0.2885732680559158, + "rewards/MultiModalAccuracyORM/mean": 0.2500000029802322, + "rewards/MultiModalAccuracyORM/std": 0.2885732680559158, + "step": 1540, + "train_speed(iter/s)": 0.031648 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.7, + "completions/mean_length": 418.4583480834961, + "completions/min_length": 246.6, + "epoch": 0.6242424242424243, + "grad_norm": 1.498638277562189, + "kl": 0.0112030029296875, + "learning_rate": 2e-07, + "loss": 0.00476650595664978, + "memory(GiB)": 113.5, + "reward": 0.2583333387970924, + "reward_std": 0.3297544777393341, + "rewards/MultiModalAccuracyORM/mean": 0.2583333387970924, + "rewards/MultiModalAccuracyORM/std": 0.3297544777393341, + "step": 1545, + "train_speed(iter/s)": 0.031653 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.1, + "completions/mean_length": 338.0750076293945, + "completions/min_length": 180.2, + "epoch": 0.6262626262626263, + "grad_norm": 1.3673556260797224, + "kl": 0.012890625, + "learning_rate": 2e-07, + "loss": 0.011944988369941711, + "memory(GiB)": 113.5, + "reward": 0.2666666731238365, + "reward_std": 0.36717758774757386, + "rewards/MultiModalAccuracyORM/mean": 0.2666666731238365, + "rewards/MultiModalAccuracyORM/std": 0.36717758774757386, + "step": 1550, + "train_speed(iter/s)": 0.031667 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/mean_length": 389.5916778564453, + "completions/min_length": 242.8, + "epoch": 0.6282828282828283, + "grad_norm": 2.327729044871898, + "kl": 0.014337158203125, + "learning_rate": 2e-07, + "loss": -0.015535221993923187, + "memory(GiB)": 113.5, + "reward": 0.17500000298023224, + "reward_std": 0.3498097449541092, + "rewards/MultiModalAccuracyORM/mean": 0.17500000298023224, + "rewards/MultiModalAccuracyORM/std": 0.3498097449541092, + "step": 1555, + "train_speed(iter/s)": 0.031684 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/mean_length": 392.79168090820315, + "completions/min_length": 227.0, + "epoch": 0.6303030303030303, + "grad_norm": 0.053806194925700226, + "kl": 0.0107635498046875, + "learning_rate": 2e-07, + "loss": 0.017643353343009947, + "memory(GiB)": 113.5, + "reward": 0.19166667014360428, + "reward_std": 0.3011411875486374, + "rewards/MultiModalAccuracyORM/mean": 0.19166667014360428, + "rewards/MultiModalAccuracyORM/std": 0.3011411875486374, + "step": 1560, + "train_speed(iter/s)": 0.031689 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.9, + "completions/mean_length": 332.08333587646484, + "completions/min_length": 183.7, + "epoch": 0.6323232323232323, + "grad_norm": 0.570186834834556, + "kl": 0.016534423828125, + "learning_rate": 2e-07, + "loss": -0.02576545476913452, + "memory(GiB)": 113.5, + "reward": 0.3000000089406967, + "reward_std": 0.3503421902656555, + "rewards/MultiModalAccuracyORM/mean": 0.3000000089406967, + "rewards/MultiModalAccuracyORM/std": 0.3503421902656555, + "step": 1565, + "train_speed(iter/s)": 0.031696 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.6, + "completions/mean_length": 380.7333465576172, + "completions/min_length": 206.4, + "epoch": 0.6343434343434343, + "grad_norm": 2.2565482508451735, + "kl": 0.0091888427734375, + "learning_rate": 2e-07, + "loss": 0.01603304147720337, + "memory(GiB)": 113.5, + "reward": 0.2833333440124989, + "reward_std": 0.3637146830558777, + "rewards/MultiModalAccuracyORM/mean": 0.2833333440124989, + "rewards/MultiModalAccuracyORM/std": 0.3637146830558777, + "step": 1570, + "train_speed(iter/s)": 0.031705 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.3, + "completions/mean_length": 343.33334197998045, + "completions/min_length": 181.6, + "epoch": 0.6363636363636364, + "grad_norm": 1.578397296268303, + "kl": 0.013397216796875, + "learning_rate": 2e-07, + "loss": 0.04952932298183441, + "memory(GiB)": 113.5, + "reward": 0.4666666738688946, + "reward_std": 0.37498117983341217, + "rewards/MultiModalAccuracyORM/mean": 0.4666666738688946, + "rewards/MultiModalAccuracyORM/std": 0.37498117983341217, + "step": 1575, + "train_speed(iter/s)": 0.031725 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 713.8, + "completions/mean_length": 377.916682434082, + "completions/min_length": 199.8, + "epoch": 0.6383838383838384, + "grad_norm": 1.458403513622403, + "kl": 0.0122344970703125, + "learning_rate": 2e-07, + "loss": 0.016104981303215027, + "memory(GiB)": 113.5, + "reward": 0.4833333417773247, + "reward_std": 0.3252659499645233, + "rewards/MultiModalAccuracyORM/mean": 0.4833333417773247, + "rewards/MultiModalAccuracyORM/std": 0.3252659499645233, + "step": 1580, + "train_speed(iter/s)": 0.031726 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 678.8, + "completions/mean_length": 418.26668243408204, + "completions/min_length": 202.3, + "epoch": 0.6404040404040404, + "grad_norm": 1.8686390380230793, + "kl": 0.013201904296875, + "learning_rate": 2e-07, + "loss": 0.011665409803390503, + "memory(GiB)": 113.5, + "reward": 0.3000000096857548, + "reward_std": 0.2652414858341217, + "rewards/MultiModalAccuracyORM/mean": 0.3000000096857548, + "rewards/MultiModalAccuracyORM/std": 0.2652414858341217, + "step": 1585, + "train_speed(iter/s)": 0.031728 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.2, + "completions/mean_length": 388.46668395996096, + "completions/min_length": 206.4, + "epoch": 0.6424242424242425, + "grad_norm": 1.992138254292841, + "kl": 0.011810302734375, + "learning_rate": 2e-07, + "loss": 0.08419913649559022, + "memory(GiB)": 113.5, + "reward": 0.2916666746139526, + "reward_std": 0.4093579977750778, + "rewards/MultiModalAccuracyORM/mean": 0.2916666746139526, + "rewards/MultiModalAccuracyORM/std": 0.4093579977750778, + "step": 1590, + "train_speed(iter/s)": 0.031721 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.3, + "completions/mean_length": 303.75001220703126, + "completions/min_length": 176.1, + "epoch": 0.6444444444444445, + "grad_norm": 1.636979804109864, + "kl": 0.0161895751953125, + "learning_rate": 2e-07, + "loss": 0.011809319257736206, + "memory(GiB)": 113.5, + "reward": 0.3333333425223827, + "reward_std": 0.25897532403469087, + "rewards/MultiModalAccuracyORM/mean": 0.3333333425223827, + "rewards/MultiModalAccuracyORM/std": 0.25897532403469087, + "step": 1595, + "train_speed(iter/s)": 0.031748 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.9, + "completions/mean_length": 333.1666763305664, + "completions/min_length": 160.1, + "epoch": 0.6464646464646465, + "grad_norm": 2.2869229330092393, + "kl": 0.01339111328125, + "learning_rate": 2e-07, + "loss": 0.005678671598434448, + "memory(GiB)": 113.5, + "reward": 0.3416666753590107, + "reward_std": 0.3189666152000427, + "rewards/MultiModalAccuracyORM/mean": 0.3416666753590107, + "rewards/MultiModalAccuracyORM/std": 0.3189666152000427, + "step": 1600, + "train_speed(iter/s)": 0.031761 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.7, + "completions/mean_length": 279.3166778564453, + "completions/min_length": 174.7, + "epoch": 0.6484848484848484, + "grad_norm": 1.3720767401777028, + "kl": 0.014947509765625, + "learning_rate": 2e-07, + "loss": 0.0007772698998451232, + "memory(GiB)": 113.5, + "reward": 0.2916666753590107, + "reward_std": 0.29786467254161836, + "rewards/MultiModalAccuracyORM/mean": 0.2916666753590107, + "rewards/MultiModalAccuracyORM/std": 0.29786467254161836, + "step": 1605, + "train_speed(iter/s)": 0.031778 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.7, + "completions/mean_length": 286.30834197998047, + "completions/min_length": 167.6, + "epoch": 0.6505050505050505, + "grad_norm": 2.272498565917859, + "kl": 0.0147857666015625, + "learning_rate": 2e-07, + "loss": 0.03825833797454834, + "memory(GiB)": 113.5, + "reward": 0.30833333656191825, + "reward_std": 0.3430673748254776, + "rewards/MultiModalAccuracyORM/mean": 0.30833333656191825, + "rewards/MultiModalAccuracyORM/std": 0.3430673748254776, + "step": 1610, + "train_speed(iter/s)": 0.031791 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.7, + "completions/mean_length": 305.6833435058594, + "completions/min_length": 176.9, + "epoch": 0.6525252525252525, + "grad_norm": 2.101651221741828, + "kl": 0.01689453125, + "learning_rate": 2e-07, + "loss": -0.010073482990264893, + "memory(GiB)": 113.5, + "reward": 0.40833334550261496, + "reward_std": 0.3845028102397919, + "rewards/MultiModalAccuracyORM/mean": 0.40833334550261496, + "rewards/MultiModalAccuracyORM/std": 0.3845028102397919, + "step": 1615, + "train_speed(iter/s)": 0.031812 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.2, + "completions/mean_length": 355.3666732788086, + "completions/min_length": 217.0, + "epoch": 0.6545454545454545, + "grad_norm": 1.7437833008639363, + "kl": 0.014605712890625, + "learning_rate": 2e-07, + "loss": 0.03341163992881775, + "memory(GiB)": 113.5, + "reward": 0.3083333432674408, + "reward_std": 0.3104085922241211, + "rewards/MultiModalAccuracyORM/mean": 0.3083333432674408, + "rewards/MultiModalAccuracyORM/std": 0.3104085922241211, + "step": 1620, + "train_speed(iter/s)": 0.031815 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.7, + "completions/mean_length": 320.88334197998046, + "completions/min_length": 176.8, + "epoch": 0.6565656565656566, + "grad_norm": 2.214426657751653, + "kl": 0.012939453125, + "learning_rate": 2e-07, + "loss": 0.0038519926369190217, + "memory(GiB)": 113.5, + "reward": 0.4500000111758709, + "reward_std": 0.3840597689151764, + "rewards/MultiModalAccuracyORM/mean": 0.4500000111758709, + "rewards/MultiModalAccuracyORM/std": 0.3840597689151764, + "step": 1625, + "train_speed(iter/s)": 0.031834 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.5, + "completions/mean_length": 307.9750091552734, + "completions/min_length": 179.6, + "epoch": 0.6585858585858586, + "grad_norm": 2.3559044349874965, + "kl": 0.011468505859375, + "learning_rate": 2e-07, + "loss": -0.007926353812217712, + "memory(GiB)": 113.5, + "reward": 0.3500000089406967, + "reward_std": 0.21594529151916503, + "rewards/MultiModalAccuracyORM/mean": 0.3500000089406967, + "rewards/MultiModalAccuracyORM/std": 0.21594529151916503, + "step": 1630, + "train_speed(iter/s)": 0.031861 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/mean_length": 272.0000061035156, + "completions/min_length": 140.0, + "epoch": 0.6606060606060606, + "grad_norm": 2.3218216739931163, + "kl": 0.01510009765625, + "learning_rate": 2e-07, + "loss": -0.017690959572792053, + "memory(GiB)": 113.5, + "reward": 0.28333333805203437, + "reward_std": 0.20416739881038665, + "rewards/MultiModalAccuracyORM/mean": 0.28333333805203437, + "rewards/MultiModalAccuracyORM/std": 0.20416739881038665, + "step": 1635, + "train_speed(iter/s)": 0.03188 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/mean_length": 346.666682434082, + "completions/min_length": 213.6, + "epoch": 0.6626262626262627, + "grad_norm": 1.7521312796960462, + "kl": 0.011456298828125, + "learning_rate": 2e-07, + "loss": -0.01213396042585373, + "memory(GiB)": 113.5, + "reward": 0.34166667237877846, + "reward_std": 0.2464074045419693, + "rewards/MultiModalAccuracyORM/mean": 0.34166667237877846, + "rewards/MultiModalAccuracyORM/std": 0.2464074045419693, + "step": 1640, + "train_speed(iter/s)": 0.031896 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/mean_length": 235.60834426879882, + "completions/min_length": 120.5, + "epoch": 0.6646464646464646, + "grad_norm": 2.846675522202014, + "kl": 0.0129638671875, + "learning_rate": 2e-07, + "loss": -0.01681770384311676, + "memory(GiB)": 113.5, + "reward": 0.4000000089406967, + "reward_std": 0.364131298661232, + "rewards/MultiModalAccuracyORM/mean": 0.4000000089406967, + "rewards/MultiModalAccuracyORM/std": 0.364131298661232, + "step": 1645, + "train_speed(iter/s)": 0.031917 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.8, + "completions/mean_length": 355.72500762939455, + "completions/min_length": 171.8, + "epoch": 0.6666666666666666, + "grad_norm": 2.5063086447109546, + "kl": 0.01486053466796875, + "learning_rate": 2e-07, + "loss": 0.005304119735956192, + "memory(GiB)": 113.5, + "reward": 0.3500000134110451, + "reward_std": 0.41141627728939056, + "rewards/MultiModalAccuracyORM/mean": 0.3500000134110451, + "rewards/MultiModalAccuracyORM/std": 0.41141627728939056, + "step": 1650, + "train_speed(iter/s)": 0.031924 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.8, + "completions/mean_length": 412.1666793823242, + "completions/min_length": 210.8, + "epoch": 0.6686868686868687, + "grad_norm": 2.9814971352286297, + "kl": 0.018035888671875, + "learning_rate": 2e-07, + "loss": 0.0013743340969085693, + "memory(GiB)": 113.5, + "reward": 0.2083333373069763, + "reward_std": 0.28402756750583646, + "rewards/MultiModalAccuracyORM/mean": 0.2083333373069763, + "rewards/MultiModalAccuracyORM/std": 0.28402756750583646, + "step": 1655, + "train_speed(iter/s)": 0.031912 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.7, + "completions/mean_length": 315.45000915527345, + "completions/min_length": 164.2, + "epoch": 0.6707070707070707, + "grad_norm": 2.420560710043236, + "kl": 0.013409423828125, + "learning_rate": 2e-07, + "loss": -0.0018982872366905212, + "memory(GiB)": 113.5, + "reward": 0.20000000521540642, + "reward_std": 0.25270916223526, + "rewards/MultiModalAccuracyORM/mean": 0.20000000521540642, + "rewards/MultiModalAccuracyORM/std": 0.25270916223526, + "step": 1660, + "train_speed(iter/s)": 0.031926 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.3, + "completions/mean_length": 373.15834045410156, + "completions/min_length": 187.0, + "epoch": 0.6727272727272727, + "grad_norm": 2.099245716082938, + "kl": 0.0146942138671875, + "learning_rate": 2e-07, + "loss": 0.0194022536277771, + "memory(GiB)": 113.5, + "reward": 0.24166667237877845, + "reward_std": 0.29383077621459963, + "rewards/MultiModalAccuracyORM/mean": 0.24166667237877845, + "rewards/MultiModalAccuracyORM/std": 0.29383077621459963, + "step": 1665, + "train_speed(iter/s)": 0.031939 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.1, + "completions/mean_length": 409.43335113525393, + "completions/min_length": 252.2, + "epoch": 0.6747474747474748, + "grad_norm": 0.8827203782530715, + "kl": 0.01336669921875, + "learning_rate": 2e-07, + "loss": 0.022216227650642396, + "memory(GiB)": 113.5, + "reward": 0.2833333410322666, + "reward_std": 0.23704480826854707, + "rewards/MultiModalAccuracyORM/mean": 0.2833333410322666, + "rewards/MultiModalAccuracyORM/std": 0.23704480826854707, + "step": 1670, + "train_speed(iter/s)": 0.031942 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.7, + "completions/mean_length": 372.35001068115236, + "completions/min_length": 223.6, + "epoch": 0.6767676767676768, + "grad_norm": 2.67307804927538, + "kl": 0.012176513671875, + "learning_rate": 2e-07, + "loss": -0.025462892651557923, + "memory(GiB)": 113.5, + "reward": 0.3583333410322666, + "reward_std": 0.30489686131477356, + "rewards/MultiModalAccuracyORM/mean": 0.3583333410322666, + "rewards/MultiModalAccuracyORM/std": 0.30489686131477356, + "step": 1675, + "train_speed(iter/s)": 0.031947 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.9, + "completions/mean_length": 252.71667709350587, + "completions/min_length": 134.5, + "epoch": 0.6787878787878788, + "grad_norm": 2.948416033259282, + "kl": 0.013824462890625, + "learning_rate": 2e-07, + "loss": 0.007326580584049225, + "memory(GiB)": 113.5, + "reward": 0.5083333514630795, + "reward_std": 0.3945842385292053, + "rewards/MultiModalAccuracyORM/mean": 0.5083333514630795, + "rewards/MultiModalAccuracyORM/std": 0.3945842385292053, + "step": 1680, + "train_speed(iter/s)": 0.031971 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.6, + "completions/mean_length": 367.6166732788086, + "completions/min_length": 225.0, + "epoch": 0.6808080808080809, + "grad_norm": 0.07197046485321759, + "kl": 0.0118194580078125, + "learning_rate": 2e-07, + "loss": 0.03796108365058899, + "memory(GiB)": 113.5, + "reward": 0.23333333879709245, + "reward_std": 0.20995735228061677, + "rewards/MultiModalAccuracyORM/mean": 0.23333333879709245, + "rewards/MultiModalAccuracyORM/std": 0.20995735228061677, + "step": 1685, + "train_speed(iter/s)": 0.031972 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/mean_length": 348.7500061035156, + "completions/min_length": 212.9, + "epoch": 0.6828282828282828, + "grad_norm": 1.5442560082143544, + "kl": 0.01568603515625, + "learning_rate": 2e-07, + "loss": 0.017047417163848878, + "memory(GiB)": 113.5, + "reward": 0.358333345502615, + "reward_std": 0.4405413746833801, + "rewards/MultiModalAccuracyORM/mean": 0.358333345502615, + "rewards/MultiModalAccuracyORM/std": 0.4405413746833801, + "step": 1690, + "train_speed(iter/s)": 0.031977 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.5, + "completions/mean_length": 318.30000228881835, + "completions/min_length": 162.4, + "epoch": 0.6848484848484848, + "grad_norm": 2.822151558666746, + "kl": 0.013775634765625, + "learning_rate": 2e-07, + "loss": 0.03140446245670318, + "memory(GiB)": 113.5, + "reward": 0.3583333469927311, + "reward_std": 0.399324569106102, + "rewards/MultiModalAccuracyORM/mean": 0.3583333469927311, + "rewards/MultiModalAccuracyORM/std": 0.399324569106102, + "step": 1695, + "train_speed(iter/s)": 0.031999 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.9, + "completions/mean_length": 485.1916870117187, + "completions/min_length": 305.5, + "epoch": 0.6868686868686869, + "grad_norm": 0.9869928398468556, + "kl": 0.0103057861328125, + "learning_rate": 2e-07, + "loss": 0.018257686495780946, + "memory(GiB)": 113.5, + "reward": 0.2583333417773247, + "reward_std": 0.29035089910030365, + "rewards/MultiModalAccuracyORM/mean": 0.2583333417773247, + "rewards/MultiModalAccuracyORM/std": 0.29035089910030365, + "step": 1700, + "train_speed(iter/s)": 0.031993 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.9, + "completions/mean_length": 329.5166725158691, + "completions/min_length": 196.6, + "epoch": 0.6888888888888889, + "grad_norm": 2.390331116834798, + "kl": 0.0239990234375, + "learning_rate": 2e-07, + "loss": -0.02088260054588318, + "memory(GiB)": 113.5, + "reward": 0.4750000089406967, + "reward_std": 0.27753118276596067, + "rewards/MultiModalAccuracyORM/mean": 0.4750000089406967, + "rewards/MultiModalAccuracyORM/std": 0.27753118276596067, + "step": 1705, + "train_speed(iter/s)": 0.032012 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.8, + "completions/mean_length": 286.5166694641113, + "completions/min_length": 153.0, + "epoch": 0.6909090909090909, + "grad_norm": 3.070912031712293, + "kl": 0.0171630859375, + "learning_rate": 2e-07, + "loss": 0.00493430495262146, + "memory(GiB)": 113.5, + "reward": 0.45833334028720857, + "reward_std": 0.31192905008792876, + "rewards/MultiModalAccuracyORM/mean": 0.45833334028720857, + "rewards/MultiModalAccuracyORM/std": 0.31192905008792876, + "step": 1710, + "train_speed(iter/s)": 0.032022 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.9, + "completions/mean_length": 243.5666763305664, + "completions/min_length": 117.3, + "epoch": 0.692929292929293, + "grad_norm": 2.7698758058054214, + "kl": 0.0126708984375, + "learning_rate": 2e-07, + "loss": -0.0016166016459465027, + "memory(GiB)": 113.5, + "reward": 0.42500000819563866, + "reward_std": 0.25512445867061617, + "rewards/MultiModalAccuracyORM/mean": 0.42500000819563866, + "rewards/MultiModalAccuracyORM/std": 0.25512445867061617, + "step": 1715, + "train_speed(iter/s)": 0.032047 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.2, + "completions/mean_length": 255.48334503173828, + "completions/min_length": 158.4, + "epoch": 0.694949494949495, + "grad_norm": 2.744690041316947, + "kl": 0.015838623046875, + "learning_rate": 2e-07, + "loss": -0.019546210765838623, + "memory(GiB)": 113.5, + "reward": 0.26666667833924296, + "reward_std": 0.2754935443401337, + "rewards/MultiModalAccuracyORM/mean": 0.26666667833924296, + "rewards/MultiModalAccuracyORM/std": 0.2754935443401337, + "step": 1720, + "train_speed(iter/s)": 0.032071 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.9, + "completions/mean_length": 379.2666839599609, + "completions/min_length": 221.8, + "epoch": 0.696969696969697, + "grad_norm": 1.777431935717832, + "kl": 0.012689208984375, + "learning_rate": 2e-07, + "loss": 0.009233607351779938, + "memory(GiB)": 113.5, + "reward": 0.3083333395421505, + "reward_std": 0.28128685653209684, + "rewards/MultiModalAccuracyORM/mean": 0.3083333395421505, + "rewards/MultiModalAccuracyORM/std": 0.28128685653209684, + "step": 1725, + "train_speed(iter/s)": 0.032079 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/mean_length": 354.1666770935059, + "completions/min_length": 171.8, + "epoch": 0.6989898989898989, + "grad_norm": 3.0383086202130616, + "kl": 0.01773681640625, + "learning_rate": 2e-07, + "loss": 0.03813132643699646, + "memory(GiB)": 113.5, + "reward": 0.31666666865348814, + "reward_std": 0.27938000559806825, + "rewards/MultiModalAccuracyORM/mean": 0.31666666865348814, + "rewards/MultiModalAccuracyORM/std": 0.27938000559806825, + "step": 1730, + "train_speed(iter/s)": 0.032094 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.8, + "completions/mean_length": 351.7666717529297, + "completions/min_length": 157.5, + "epoch": 0.701010101010101, + "grad_norm": 1.8778050454869868, + "kl": 0.014495849609375, + "learning_rate": 2e-07, + "loss": 0.00038725733757019045, + "memory(GiB)": 113.5, + "reward": 0.4833333432674408, + "reward_std": 0.33153211176395414, + "rewards/MultiModalAccuracyORM/mean": 0.4833333432674408, + "rewards/MultiModalAccuracyORM/std": 0.33153211176395414, + "step": 1735, + "train_speed(iter/s)": 0.032098 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.2, + "completions/mean_length": 350.0083404541016, + "completions/min_length": 189.6, + "epoch": 0.703030303030303, + "grad_norm": 2.1562095065119053, + "kl": 0.0187957763671875, + "learning_rate": 2e-07, + "loss": -0.02958904504776001, + "memory(GiB)": 113.5, + "reward": 0.28333333879709244, + "reward_std": 0.3487591862678528, + "rewards/MultiModalAccuracyORM/mean": 0.28333333879709244, + "rewards/MultiModalAccuracyORM/std": 0.3487591862678528, + "step": 1740, + "train_speed(iter/s)": 0.032101 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.2, + "completions/mean_length": 310.36668090820314, + "completions/min_length": 187.0, + "epoch": 0.705050505050505, + "grad_norm": 2.8598050443718797, + "kl": 0.013079833984375, + "learning_rate": 2e-07, + "loss": -0.007939225435256958, + "memory(GiB)": 113.5, + "reward": 0.35000000819563865, + "reward_std": 0.3908045649528503, + "rewards/MultiModalAccuracyORM/mean": 0.35000000819563865, + "rewards/MultiModalAccuracyORM/std": 0.3908045649528503, + "step": 1745, + "train_speed(iter/s)": 0.032116 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 2.0976510908729256, + "learning_rate": 2e-07, + "loss": 0.05007731318473816, + "memory(GiB)": 113.5, + "step": 1750, + "train_speed(iter/s)": 0.03212 + }, + { + "epoch": 0.7070707070707071, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0016666666666666666, + "eval_completions/max_length": 587.9, + "eval_completions/mean_length": 354.56501251220703, + "eval_completions/min_length": 214.28, + "eval_kl": 0.01150848388671875, + "eval_loss": 0.0095694400370121, + "eval_reward": 0.3250000074505806, + "eval_reward_std": 0.32090782165527343, + "eval_rewards/MultiModalAccuracyORM/mean": 0.3250000074505806, + "eval_rewards/MultiModalAccuracyORM/std": 0.32090782165527343, + "eval_runtime": 581.3868, + "eval_samples_per_second": 0.086, + "eval_steps_per_second": 0.009, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.3, + "completions/mean_length": 361.1250114440918, + "completions/min_length": 205.05, + "epoch": 0.7090909090909091, + "grad_norm": 1.92993215123609, + "kl": 0.01456298828125, + "learning_rate": 2e-07, + "loss": 0.013172458112239837, + "memory(GiB)": 113.5, + "reward": 0.3041666720062494, + "reward_std": 0.3517512962222099, + "rewards/MultiModalAccuracyORM/mean": 0.3041666720062494, + "rewards/MultiModalAccuracyORM/std": 0.3517512962222099, + "step": 1755, + "train_speed(iter/s)": 0.031659 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.016666666666666666, + "completions/max_length": 877.0, + "completions/mean_length": 508.4500274658203, + "completions/min_length": 241.1, + "epoch": 0.7111111111111111, + "grad_norm": 1.960501031799004, + "kl": 0.01246490478515625, + "learning_rate": 2e-07, + "loss": -0.01800227165222168, + "memory(GiB)": 113.5, + "reward": 0.24166667461395264, + "reward_std": 0.40063177347183226, + "rewards/MultiModalAccuracyORM/mean": 0.24166667461395264, + "rewards/MultiModalAccuracyORM/std": 0.40063177347183226, + "step": 1760, + "train_speed(iter/s)": 0.031653 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.9, + "completions/mean_length": 389.05833740234374, + "completions/min_length": 194.6, + "epoch": 0.7131313131313132, + "grad_norm": 0.06856184885436768, + "kl": 0.0163360595703125, + "learning_rate": 2e-07, + "loss": 0.05879574418067932, + "memory(GiB)": 113.5, + "reward": 0.45000000223517417, + "reward_std": 0.26600751280784607, + "rewards/MultiModalAccuracyORM/mean": 0.45000000223517417, + "rewards/MultiModalAccuracyORM/std": 0.26600751280784607, + "step": 1765, + "train_speed(iter/s)": 0.031655 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.8, + "completions/mean_length": 364.608349609375, + "completions/min_length": 206.9, + "epoch": 0.7151515151515152, + "grad_norm": 2.5267727464559195, + "kl": 0.013262939453125, + "learning_rate": 2e-07, + "loss": -0.05543935298919678, + "memory(GiB)": 113.5, + "reward": 0.4583333507180214, + "reward_std": 0.349611759185791, + "rewards/MultiModalAccuracyORM/mean": 0.4583333507180214, + "rewards/MultiModalAccuracyORM/std": 0.349611759185791, + "step": 1770, + "train_speed(iter/s)": 0.031665 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.2, + "completions/mean_length": 349.9750091552734, + "completions/min_length": 182.8, + "epoch": 0.7171717171717171, + "grad_norm": 1.8053530203955317, + "kl": 0.016015625, + "learning_rate": 2e-07, + "loss": 0.003249824047088623, + "memory(GiB)": 113.5, + "reward": 0.21666667535901069, + "reward_std": 0.36190145611763, + "rewards/MultiModalAccuracyORM/mean": 0.21666667535901069, + "rewards/MultiModalAccuracyORM/std": 0.36190145611763, + "step": 1775, + "train_speed(iter/s)": 0.031667 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.5, + "completions/mean_length": 257.55000534057615, + "completions/min_length": 140.9, + "epoch": 0.7191919191919192, + "grad_norm": 2.81422482443103, + "kl": 0.0193359375, + "learning_rate": 2e-07, + "loss": -0.02224818170070648, + "memory(GiB)": 113.5, + "reward": 0.30000000819563866, + "reward_std": 0.3563897281885147, + "rewards/MultiModalAccuracyORM/mean": 0.30000000819563866, + "rewards/MultiModalAccuracyORM/std": 0.3563897281885147, + "step": 1780, + "train_speed(iter/s)": 0.031687 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.3, + "completions/mean_length": 291.2166748046875, + "completions/min_length": 164.3, + "epoch": 0.7212121212121212, + "grad_norm": 2.8650400951164525, + "kl": 0.0154205322265625, + "learning_rate": 2e-07, + "loss": -0.02759958803653717, + "memory(GiB)": 113.5, + "reward": 0.21666667014360427, + "reward_std": 0.2892681032419205, + "rewards/MultiModalAccuracyORM/mean": 0.21666667014360427, + "rewards/MultiModalAccuracyORM/std": 0.2892681032419205, + "step": 1785, + "train_speed(iter/s)": 0.031703 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.8, + "completions/mean_length": 382.2583404541016, + "completions/min_length": 222.3, + "epoch": 0.7232323232323232, + "grad_norm": 0.04924378999535635, + "kl": 0.0165863037109375, + "learning_rate": 2e-07, + "loss": 0.002944570779800415, + "memory(GiB)": 113.5, + "reward": 0.1833333395421505, + "reward_std": 0.3059200614690781, + "rewards/MultiModalAccuracyORM/mean": 0.1833333395421505, + "rewards/MultiModalAccuracyORM/std": 0.3059200614690781, + "step": 1790, + "train_speed(iter/s)": 0.031714 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.8, + "completions/mean_length": 361.8166778564453, + "completions/min_length": 215.4, + "epoch": 0.7252525252525253, + "grad_norm": 1.0400490856507523, + "kl": 0.015380859375, + "learning_rate": 2e-07, + "loss": 0.0032314777374267576, + "memory(GiB)": 113.5, + "reward": 0.21666667312383653, + "reward_std": 0.3141998678445816, + "rewards/MultiModalAccuracyORM/mean": 0.21666667312383653, + "rewards/MultiModalAccuracyORM/std": 0.3141998678445816, + "step": 1795, + "train_speed(iter/s)": 0.031727 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.5, + "completions/mean_length": 366.5666793823242, + "completions/min_length": 172.7, + "epoch": 0.7272727272727273, + "grad_norm": 1.579739169824459, + "kl": 0.015093994140625, + "learning_rate": 2e-07, + "loss": -0.01905302405357361, + "memory(GiB)": 113.5, + "reward": 0.3166666753590107, + "reward_std": 0.320466023683548, + "rewards/MultiModalAccuracyORM/mean": 0.3166666753590107, + "rewards/MultiModalAccuracyORM/std": 0.320466023683548, + "step": 1800, + "train_speed(iter/s)": 0.031739 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.7, + "completions/mean_length": 319.3916732788086, + "completions/min_length": 186.6, + "epoch": 0.7292929292929293, + "grad_norm": 3.0687917767271022, + "kl": 0.0137054443359375, + "learning_rate": 2e-07, + "loss": 0.02089669108390808, + "memory(GiB)": 113.5, + "reward": 0.3666666693985462, + "reward_std": 0.47085520029067995, + "rewards/MultiModalAccuracyORM/mean": 0.3666666693985462, + "rewards/MultiModalAccuracyORM/std": 0.47085520029067995, + "step": 1805, + "train_speed(iter/s)": 0.031757 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.7, + "completions/mean_length": 375.0416778564453, + "completions/min_length": 215.7, + "epoch": 0.7313131313131314, + "grad_norm": 1.8933797302813846, + "kl": 0.0150848388671875, + "learning_rate": 2e-07, + "loss": 0.03909637928009033, + "memory(GiB)": 113.5, + "reward": 0.39166667610406875, + "reward_std": 0.34688264429569243, + "rewards/MultiModalAccuracyORM/mean": 0.39166667610406875, + "rewards/MultiModalAccuracyORM/std": 0.34688264429569243, + "step": 1810, + "train_speed(iter/s)": 0.031762 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.9, + "completions/mean_length": 305.2583404541016, + "completions/min_length": 157.4, + "epoch": 0.7333333333333333, + "grad_norm": 1.8206918881783931, + "kl": 0.0149169921875, + "learning_rate": 2e-07, + "loss": -0.016247293353080748, + "memory(GiB)": 113.5, + "reward": 0.2666666701436043, + "reward_std": 0.32451151609420775, + "rewards/MultiModalAccuracyORM/mean": 0.2666666701436043, + "rewards/MultiModalAccuracyORM/std": 0.32451151609420775, + "step": 1815, + "train_speed(iter/s)": 0.031772 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.8, + "completions/mean_length": 322.85000762939455, + "completions/min_length": 183.5, + "epoch": 0.7353535353535353, + "grad_norm": 1.918825325754338, + "kl": 0.01243896484375, + "learning_rate": 2e-07, + "loss": 0.010172617435455323, + "memory(GiB)": 113.5, + "reward": 0.20000000223517417, + "reward_std": 0.21999078392982482, + "rewards/MultiModalAccuracyORM/mean": 0.20000000223517417, + "rewards/MultiModalAccuracyORM/std": 0.21999078392982482, + "step": 1820, + "train_speed(iter/s)": 0.031781 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.5, + "completions/mean_length": 253.28334197998046, + "completions/min_length": 136.0, + "epoch": 0.7373737373737373, + "grad_norm": 2.5646468814628482, + "kl": 0.01630859375, + "learning_rate": 2e-07, + "loss": 0.08878597021102905, + "memory(GiB)": 113.5, + "reward": 0.4000000089406967, + "reward_std": 0.3767348140478134, + "rewards/MultiModalAccuracyORM/mean": 0.4000000089406967, + "rewards/MultiModalAccuracyORM/std": 0.3767348140478134, + "step": 1825, + "train_speed(iter/s)": 0.031805 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 786.5, + "completions/mean_length": 391.63334197998046, + "completions/min_length": 175.8, + "epoch": 0.7393939393939394, + "grad_norm": 2.279597838394587, + "kl": 0.016058349609375, + "learning_rate": 2e-07, + "loss": -0.01255677342414856, + "memory(GiB)": 113.5, + "reward": 0.33333333656191827, + "reward_std": 0.30187161862850187, + "rewards/MultiModalAccuracyORM/mean": 0.33333333656191827, + "rewards/MultiModalAccuracyORM/std": 0.30187161862850187, + "step": 1830, + "train_speed(iter/s)": 0.031799 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/mean_length": 351.30834045410154, + "completions/min_length": 184.7, + "epoch": 0.7414141414141414, + "grad_norm": 1.220249950163537, + "kl": 0.014288330078125, + "learning_rate": 2e-07, + "loss": -0.03182802200317383, + "memory(GiB)": 113.5, + "reward": 0.4250000067055225, + "reward_std": 0.40566191971302035, + "rewards/MultiModalAccuracyORM/mean": 0.4250000067055225, + "rewards/MultiModalAccuracyORM/std": 0.40566191971302035, + "step": 1835, + "train_speed(iter/s)": 0.031813 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.3, + "completions/mean_length": 289.9583404541016, + "completions/min_length": 139.6, + "epoch": 0.7434343434343434, + "grad_norm": 3.7311094209711153, + "kl": 0.019146728515625, + "learning_rate": 2e-07, + "loss": -0.02434406876564026, + "memory(GiB)": 113.5, + "reward": 0.4333333432674408, + "reward_std": 0.3922538310289383, + "rewards/MultiModalAccuracyORM/mean": 0.4333333432674408, + "rewards/MultiModalAccuracyORM/std": 0.3922538310289383, + "step": 1840, + "train_speed(iter/s)": 0.031824 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.3, + "completions/mean_length": 292.6333404541016, + "completions/min_length": 165.6, + "epoch": 0.7454545454545455, + "grad_norm": 2.2491131974096503, + "kl": 0.018963623046875, + "learning_rate": 2e-07, + "loss": -0.029304242134094237, + "memory(GiB)": 113.5, + "reward": 0.4250000096857548, + "reward_std": 0.3370794355869293, + "rewards/MultiModalAccuracyORM/mean": 0.4250000096857548, + "rewards/MultiModalAccuracyORM/std": 0.3370794355869293, + "step": 1845, + "train_speed(iter/s)": 0.031836 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.7, + "completions/mean_length": 369.1416778564453, + "completions/min_length": 209.3, + "epoch": 0.7474747474747475, + "grad_norm": 1.8956894287229566, + "kl": 0.015484619140625, + "learning_rate": 2e-07, + "loss": 0.015110939741134644, + "memory(GiB)": 113.5, + "reward": 0.2916666716337204, + "reward_std": 0.4038462698459625, + "rewards/MultiModalAccuracyORM/mean": 0.2916666716337204, + "rewards/MultiModalAccuracyORM/std": 0.4038462698459625, + "step": 1850, + "train_speed(iter/s)": 0.031843 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/mean_length": 357.9083419799805, + "completions/min_length": 162.4, + "epoch": 0.7494949494949495, + "grad_norm": 2.5484409209581504, + "kl": 0.0146087646484375, + "learning_rate": 2e-07, + "loss": -0.023239874839782716, + "memory(GiB)": 113.5, + "reward": 0.20000000894069672, + "reward_std": 0.31517534554004667, + "rewards/MultiModalAccuracyORM/mean": 0.20000000894069672, + "rewards/MultiModalAccuracyORM/std": 0.31517534554004667, + "step": 1855, + "train_speed(iter/s)": 0.031855 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.3, + "completions/mean_length": 354.6916748046875, + "completions/min_length": 208.8, + "epoch": 0.7515151515151515, + "grad_norm": 2.0846151526365655, + "kl": 0.0114715576171875, + "learning_rate": 2e-07, + "loss": 0.0073637284338474275, + "memory(GiB)": 113.5, + "reward": 0.18333333879709243, + "reward_std": 0.2907939374446869, + "rewards/MultiModalAccuracyORM/mean": 0.18333333879709243, + "rewards/MultiModalAccuracyORM/std": 0.2907939374446869, + "step": 1860, + "train_speed(iter/s)": 0.031861 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.8, + "completions/mean_length": 304.43334045410154, + "completions/min_length": 155.7, + "epoch": 0.7535353535353535, + "grad_norm": 2.0624318263809047, + "kl": 0.01436767578125, + "learning_rate": 2e-07, + "loss": 0.040461289882659915, + "memory(GiB)": 113.5, + "reward": 0.2250000059604645, + "reward_std": 0.28959646821022034, + "rewards/MultiModalAccuracyORM/mean": 0.2250000059604645, + "rewards/MultiModalAccuracyORM/std": 0.28959646821022034, + "step": 1865, + "train_speed(iter/s)": 0.031882 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/mean_length": 348.55001678466795, + "completions/min_length": 185.6, + "epoch": 0.7555555555555555, + "grad_norm": 0.07620984486729401, + "kl": 0.02213134765625, + "learning_rate": 2e-07, + "loss": 0.014231646060943603, + "memory(GiB)": 113.5, + "reward": 0.1916666716337204, + "reward_std": 0.23860623836517333, + "rewards/MultiModalAccuracyORM/mean": 0.1916666716337204, + "rewards/MultiModalAccuracyORM/std": 0.23860623836517333, + "step": 1870, + "train_speed(iter/s)": 0.031881 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 763.2, + "completions/mean_length": 389.03334045410156, + "completions/min_length": 191.9, + "epoch": 0.7575757575757576, + "grad_norm": 2.525300571346317, + "kl": 0.02110443115234375, + "learning_rate": 2e-07, + "loss": 0.0036004871129989625, + "memory(GiB)": 113.5, + "reward": 0.2916666716337204, + "reward_std": 0.41791602075099943, + "rewards/MultiModalAccuracyORM/mean": 0.2916666716337204, + "rewards/MultiModalAccuracyORM/std": 0.41791602075099943, + "step": 1875, + "train_speed(iter/s)": 0.03188 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.2, + "completions/mean_length": 309.6250030517578, + "completions/min_length": 170.7, + "epoch": 0.7595959595959596, + "grad_norm": 1.8476207975789374, + "kl": 0.0132843017578125, + "learning_rate": 2e-07, + "loss": 0.01698073446750641, + "memory(GiB)": 113.5, + "reward": 0.1416666679084301, + "reward_std": 0.24939410090446473, + "rewards/MultiModalAccuracyORM/mean": 0.1416666679084301, + "rewards/MultiModalAccuracyORM/std": 0.24939410090446473, + "step": 1880, + "train_speed(iter/s)": 0.031898 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 651.4, + "completions/mean_length": 324.4583480834961, + "completions/min_length": 173.3, + "epoch": 0.7616161616161616, + "grad_norm": 2.8918258139669333, + "kl": 0.0152374267578125, + "learning_rate": 2e-07, + "loss": -0.01050989031791687, + "memory(GiB)": 113.5, + "reward": 0.3916666731238365, + "reward_std": 0.3340185970067978, + "rewards/MultiModalAccuracyORM/mean": 0.3916666731238365, + "rewards/MultiModalAccuracyORM/std": 0.3340185970067978, + "step": 1885, + "train_speed(iter/s)": 0.031901 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/mean_length": 448.85834350585935, + "completions/min_length": 173.1, + "epoch": 0.7636363636363637, + "grad_norm": 1.4150104336871425, + "kl": 0.0143951416015625, + "learning_rate": 2e-07, + "loss": 0.013275668025016785, + "memory(GiB)": 113.5, + "reward": 0.2500000104308128, + "reward_std": 0.33704383969306945, + "rewards/MultiModalAccuracyORM/mean": 0.2500000104308128, + "rewards/MultiModalAccuracyORM/std": 0.33704383969306945, + "step": 1890, + "train_speed(iter/s)": 0.031898 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.5, + "completions/mean_length": 351.9833450317383, + "completions/min_length": 186.2, + "epoch": 0.7656565656565657, + "grad_norm": 1.8175161534038837, + "kl": 0.014886474609375, + "learning_rate": 2e-07, + "loss": -0.021983048319816588, + "memory(GiB)": 113.5, + "reward": 0.2750000089406967, + "reward_std": 0.30795769989490507, + "rewards/MultiModalAccuracyORM/mean": 0.2750000089406967, + "rewards/MultiModalAccuracyORM/std": 0.30795769989490507, + "step": 1895, + "train_speed(iter/s)": 0.031911 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.1, + "completions/mean_length": 327.0833404541016, + "completions/min_length": 144.5, + "epoch": 0.7676767676767676, + "grad_norm": 2.5158628276606025, + "kl": 0.0138824462890625, + "learning_rate": 2e-07, + "loss": 0.03910906314849853, + "memory(GiB)": 113.5, + "reward": 0.30833333656191825, + "reward_std": 0.3422983974218369, + "rewards/MultiModalAccuracyORM/mean": 0.30833333656191825, + "rewards/MultiModalAccuracyORM/std": 0.3422983974218369, + "step": 1900, + "train_speed(iter/s)": 0.031926 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.3, + "completions/mean_length": 446.5250076293945, + "completions/min_length": 241.6, + "epoch": 0.7696969696969697, + "grad_norm": 2.2012277452389415, + "kl": 0.026849365234375, + "learning_rate": 2e-07, + "loss": 0.0031028717756271364, + "memory(GiB)": 113.5, + "reward": 0.1916666716337204, + "reward_std": 0.2526139706373215, + "rewards/MultiModalAccuracyORM/mean": 0.1916666716337204, + "rewards/MultiModalAccuracyORM/std": 0.2526139706373215, + "step": 1905, + "train_speed(iter/s)": 0.031932 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.5, + "completions/mean_length": 332.84167633056643, + "completions/min_length": 201.7, + "epoch": 0.7717171717171717, + "grad_norm": 1.9126113362129455, + "kl": 0.0396942138671875, + "learning_rate": 2e-07, + "loss": -0.03872146010398865, + "memory(GiB)": 113.5, + "reward": 0.28333334252238274, + "reward_std": 0.22631654143333435, + "rewards/MultiModalAccuracyORM/mean": 0.28333334252238274, + "rewards/MultiModalAccuracyORM/std": 0.22631654143333435, + "step": 1910, + "train_speed(iter/s)": 0.031948 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.2, + "completions/mean_length": 324.3500061035156, + "completions/min_length": 168.6, + "epoch": 0.7737373737373737, + "grad_norm": 3.0923030780646883, + "kl": 0.0193115234375, + "learning_rate": 2e-07, + "loss": -0.00021869316697120667, + "memory(GiB)": 113.5, + "reward": 0.21666667312383653, + "reward_std": 0.3495877593755722, + "rewards/MultiModalAccuracyORM/mean": 0.21666667312383653, + "rewards/MultiModalAccuracyORM/std": 0.3495877593755722, + "step": 1915, + "train_speed(iter/s)": 0.031953 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 851.2, + "completions/mean_length": 429.6666732788086, + "completions/min_length": 246.7, + "epoch": 0.7757575757575758, + "grad_norm": 1.4230705183827115, + "kl": 0.0119659423828125, + "learning_rate": 2e-07, + "loss": -0.007732442766427994, + "memory(GiB)": 113.5, + "reward": 0.28333334177732467, + "reward_std": 0.3922538310289383, + "rewards/MultiModalAccuracyORM/mean": 0.28333334177732467, + "rewards/MultiModalAccuracyORM/std": 0.3922538310289383, + "step": 1920, + "train_speed(iter/s)": 0.03195 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.1, + "completions/mean_length": 327.52501220703124, + "completions/min_length": 145.3, + "epoch": 0.7777777777777778, + "grad_norm": 2.492778960496138, + "kl": 0.0209716796875, + "learning_rate": 2e-07, + "loss": 0.058314287662506105, + "memory(GiB)": 113.5, + "reward": 0.3416666746139526, + "reward_std": 0.3370794355869293, + "rewards/MultiModalAccuracyORM/mean": 0.3416666746139526, + "rewards/MultiModalAccuracyORM/std": 0.3370794355869293, + "step": 1925, + "train_speed(iter/s)": 0.031957 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.2, + "completions/mean_length": 314.4666748046875, + "completions/min_length": 159.8, + "epoch": 0.7797979797979798, + "grad_norm": 1.3216256644694324, + "kl": 0.019390869140625, + "learning_rate": 2e-07, + "loss": 0.003662779927253723, + "memory(GiB)": 113.5, + "reward": 0.40000000447034834, + "reward_std": 0.19031869769096374, + "rewards/MultiModalAccuracyORM/mean": 0.40000000447034834, + "rewards/MultiModalAccuracyORM/std": 0.19031869769096374, + "step": 1930, + "train_speed(iter/s)": 0.031969 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 614.5, + "completions/mean_length": 340.2666778564453, + "completions/min_length": 194.2, + "epoch": 0.7818181818181819, + "grad_norm": 0.16139191599066427, + "kl": 0.0214599609375, + "learning_rate": 2e-07, + "loss": -0.047375884652137754, + "memory(GiB)": 113.5, + "reward": 0.5166666835546494, + "reward_std": 0.33453335165977477, + "rewards/MultiModalAccuracyORM/mean": 0.5166666835546494, + "rewards/MultiModalAccuracyORM/std": 0.33453335165977477, + "step": 1935, + "train_speed(iter/s)": 0.031969 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/mean_length": 385.9500137329102, + "completions/min_length": 210.7, + "epoch": 0.7838383838383839, + "grad_norm": 1.6715004236392428, + "kl": 0.0131805419921875, + "learning_rate": 2e-07, + "loss": -0.010814064741134643, + "memory(GiB)": 113.5, + "reward": 0.2250000037252903, + "reward_std": 0.2325587034225464, + "rewards/MultiModalAccuracyORM/mean": 0.2250000037252903, + "rewards/MultiModalAccuracyORM/std": 0.2325587034225464, + "step": 1940, + "train_speed(iter/s)": 0.031974 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 803.8, + "completions/mean_length": 441.1000091552734, + "completions/min_length": 249.1, + "epoch": 0.7858585858585858, + "grad_norm": 1.1825903834954647, + "kl": 0.0154388427734375, + "learning_rate": 2e-07, + "loss": 0.0033442020416259766, + "memory(GiB)": 113.5, + "reward": 0.20833333656191827, + "reward_std": 0.2938903748989105, + "rewards/MultiModalAccuracyORM/mean": 0.20833333656191827, + "rewards/MultiModalAccuracyORM/std": 0.2938903748989105, + "step": 1945, + "train_speed(iter/s)": 0.031969 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.7, + "completions/mean_length": 325.9666748046875, + "completions/min_length": 175.2, + "epoch": 0.7878787878787878, + "grad_norm": 1.0389835461828303, + "kl": 0.029302978515625, + "learning_rate": 2e-07, + "loss": 0.0020487613976001738, + "memory(GiB)": 113.5, + "reward": 0.31666667461395265, + "reward_std": 0.2074468642473221, + "rewards/MultiModalAccuracyORM/mean": 0.31666667461395265, + "rewards/MultiModalAccuracyORM/std": 0.2074468642473221, + "step": 1950, + "train_speed(iter/s)": 0.031978 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 779.1, + "completions/mean_length": 420.0750106811523, + "completions/min_length": 233.8, + "epoch": 0.7898989898989899, + "grad_norm": 2.259966046846081, + "kl": 0.0141693115234375, + "learning_rate": 2e-07, + "loss": 0.0017102479934692383, + "memory(GiB)": 113.5, + "reward": 0.09166666939854622, + "reward_std": 0.18332210481166838, + "rewards/MultiModalAccuracyORM/mean": 0.09166666939854622, + "rewards/MultiModalAccuracyORM/std": 0.18332210481166838, + "step": 1955, + "train_speed(iter/s)": 0.031981 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.8, + "completions/mean_length": 396.0333450317383, + "completions/min_length": 206.8, + "epoch": 0.7919191919191919, + "grad_norm": 2.0736200850407713, + "kl": 0.0152496337890625, + "learning_rate": 2e-07, + "loss": -0.005099079012870789, + "memory(GiB)": 113.5, + "reward": 0.5583333425223828, + "reward_std": 0.28784283697605134, + "rewards/MultiModalAccuracyORM/mean": 0.5583333425223828, + "rewards/MultiModalAccuracyORM/std": 0.28784283697605134, + "step": 1960, + "train_speed(iter/s)": 0.031988 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 954.2, + "completions/mean_length": 473.5583465576172, + "completions/min_length": 247.3, + "epoch": 0.793939393939394, + "grad_norm": 2.157212917514597, + "kl": 0.0150390625, + "learning_rate": 2e-07, + "loss": 0.024318861961364745, + "memory(GiB)": 113.5, + "reward": 0.21666667535901069, + "reward_std": 0.36190145611763, + "rewards/MultiModalAccuracyORM/mean": 0.21666667535901069, + "rewards/MultiModalAccuracyORM/std": 0.36190145611763, + "step": 1965, + "train_speed(iter/s)": 0.031976 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.4, + "completions/mean_length": 456.8000152587891, + "completions/min_length": 269.1, + "epoch": 0.795959595959596, + "grad_norm": 2.346804141928421, + "kl": 0.0154296875, + "learning_rate": 2e-07, + "loss": 0.011195459961891174, + "memory(GiB)": 113.5, + "reward": 0.1500000037252903, + "reward_std": 0.25897532403469087, + "rewards/MultiModalAccuracyORM/mean": 0.1500000037252903, + "rewards/MultiModalAccuracyORM/std": 0.25897532403469087, + "step": 1970, + "train_speed(iter/s)": 0.031975 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.7, + "completions/mean_length": 426.95000610351565, + "completions/min_length": 288.6, + "epoch": 0.797979797979798, + "grad_norm": 1.937444109706918, + "kl": 0.012738037109375, + "learning_rate": 2e-07, + "loss": 0.050849252939224245, + "memory(GiB)": 113.5, + "reward": 0.33333334401249887, + "reward_std": 0.35569489002227783, + "rewards/MultiModalAccuracyORM/mean": 0.33333334401249887, + "rewards/MultiModalAccuracyORM/std": 0.35569489002227783, + "step": 1975, + "train_speed(iter/s)": 0.031976 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.9, + "completions/mean_length": 308.3166732788086, + "completions/min_length": 159.5, + "epoch": 0.8, + "grad_norm": 1.2310292555448101, + "kl": 0.01746826171875, + "learning_rate": 2e-07, + "loss": 0.021820831298828124, + "memory(GiB)": 113.5, + "reward": 0.25000000149011614, + "reward_std": 0.34010172784328463, + "rewards/MultiModalAccuracyORM/mean": 0.25000000149011614, + "rewards/MultiModalAccuracyORM/std": 0.34010172784328463, + "step": 1980, + "train_speed(iter/s)": 0.031988 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.3, + "completions/mean_length": 397.74167861938474, + "completions/min_length": 229.1, + "epoch": 0.802020202020202, + "grad_norm": 1.2006546705713226, + "kl": 0.012060546875, + "learning_rate": 2e-07, + "loss": -0.00946882963180542, + "memory(GiB)": 113.5, + "reward": 0.3666666768491268, + "reward_std": 0.21775851845741273, + "rewards/MultiModalAccuracyORM/mean": 0.3666666768491268, + "rewards/MultiModalAccuracyORM/std": 0.21775851845741273, + "step": 1985, + "train_speed(iter/s)": 0.031999 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/mean_length": 455.6916870117187, + "completions/min_length": 271.2, + "epoch": 0.804040404040404, + "grad_norm": 1.7247663724146078, + "kl": 0.0120758056640625, + "learning_rate": 2e-07, + "loss": -0.013834655284881592, + "memory(GiB)": 113.5, + "reward": 0.2916666746139526, + "reward_std": 0.34933353662490846, + "rewards/MultiModalAccuracyORM/mean": 0.2916666746139526, + "rewards/MultiModalAccuracyORM/std": 0.34933353662490846, + "step": 1990, + "train_speed(iter/s)": 0.032003 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.3, + "completions/mean_length": 388.1000122070312, + "completions/min_length": 220.5, + "epoch": 0.806060606060606, + "grad_norm": 1.2751536443809328, + "kl": 0.0211517333984375, + "learning_rate": 2e-07, + "loss": 0.026651501655578613, + "memory(GiB)": 113.5, + "reward": 0.2750000096857548, + "reward_std": 0.29452561140060424, + "rewards/MultiModalAccuracyORM/mean": 0.2750000096857548, + "rewards/MultiModalAccuracyORM/std": 0.29452561140060424, + "step": 1995, + "train_speed(iter/s)": 0.032003 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 2.53993588975996, + "learning_rate": 2e-07, + "loss": 0.008918963372707367, + "memory(GiB)": 113.5, + "step": 2000, + "train_speed(iter/s)": 0.032019 + }, + { + "epoch": 0.8080808080808081, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 640.5, + "eval_completions/mean_length": 393.71500930786135, + "eval_completions/min_length": 218.08, + "eval_kl": 0.01480712890625, + "eval_loss": 0.023003682494163513, + "eval_reward": 0.30333334133028983, + "eval_reward_std": 0.2836029249429703, + "eval_rewards/MultiModalAccuracyORM/mean": 0.30333334133028983, + "eval_rewards/MultiModalAccuracyORM/std": 0.2836029249429703, + "eval_runtime": 625.7559, + "eval_samples_per_second": 0.08, + "eval_steps_per_second": 0.008, + "step": 2000 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.1, + "completions/mean_length": 439.12501831054686, + "completions/min_length": 245.5, + "epoch": 0.8101010101010101, + "grad_norm": 1.574060720208308, + "kl": 0.01459503173828125, + "learning_rate": 2e-07, + "loss": -0.005982875823974609, + "memory(GiB)": 113.5, + "reward": 0.33333334103226664, + "reward_std": 0.3096754729747772, + "rewards/MultiModalAccuracyORM/mean": 0.33333334103226664, + "rewards/MultiModalAccuracyORM/std": 0.3096754729747772, + "step": 2005, + "train_speed(iter/s)": 0.031605 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.3, + "completions/mean_length": 312.4916702270508, + "completions/min_length": 184.5, + "epoch": 0.8121212121212121, + "grad_norm": 1.8875313816028536, + "kl": 0.01746826171875, + "learning_rate": 2e-07, + "loss": 0.04548422992229462, + "memory(GiB)": 113.5, + "reward": 0.4666666842997074, + "reward_std": 0.4252053827047348, + "rewards/MultiModalAccuracyORM/mean": 0.4666666842997074, + "rewards/MultiModalAccuracyORM/std": 0.4252053827047348, + "step": 2010, + "train_speed(iter/s)": 0.031626 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.2, + "completions/mean_length": 329.9166778564453, + "completions/min_length": 202.0, + "epoch": 0.8141414141414142, + "grad_norm": 1.858641750452265, + "kl": 0.0157196044921875, + "learning_rate": 2e-07, + "loss": 0.023762321472167967, + "memory(GiB)": 113.5, + "reward": 0.40000001043081285, + "reward_std": 0.3144780844449997, + "rewards/MultiModalAccuracyORM/mean": 0.40000001043081285, + "rewards/MultiModalAccuracyORM/std": 0.3144780844449997, + "step": 2015, + "train_speed(iter/s)": 0.031642 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/mean_length": 422.71667633056643, + "completions/min_length": 200.3, + "epoch": 0.8161616161616162, + "grad_norm": 3.0722357868631334, + "kl": 0.0186279296875, + "learning_rate": 2e-07, + "loss": -0.03257267475128174, + "memory(GiB)": 113.5, + "reward": 0.32500000968575476, + "reward_std": 0.4204265087842941, + "rewards/MultiModalAccuracyORM/mean": 0.32500000968575476, + "rewards/MultiModalAccuracyORM/std": 0.4204265087842941, + "step": 2020, + "train_speed(iter/s)": 0.031653 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.7, + "completions/mean_length": 355.5083511352539, + "completions/min_length": 214.7, + "epoch": 0.8181818181818182, + "grad_norm": 2.729236730716231, + "kl": 0.022601318359375, + "learning_rate": 2e-07, + "loss": -0.003387349843978882, + "memory(GiB)": 113.5, + "reward": 0.4250000074505806, + "reward_std": 0.45008404850959777, + "rewards/MultiModalAccuracyORM/mean": 0.4250000074505806, + "rewards/MultiModalAccuracyORM/std": 0.45008404850959777, + "step": 2025, + "train_speed(iter/s)": 0.031658 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.6, + "completions/mean_length": 421.4000183105469, + "completions/min_length": 253.0, + "epoch": 0.8202020202020202, + "grad_norm": 1.744543874583184, + "kl": 0.0112030029296875, + "learning_rate": 2e-07, + "loss": -0.013242574036121368, + "memory(GiB)": 113.5, + "reward": 0.1083333358168602, + "reward_std": 0.29628167152404783, + "rewards/MultiModalAccuracyORM/mean": 0.1083333358168602, + "rewards/MultiModalAccuracyORM/std": 0.29628167152404783, + "step": 2030, + "train_speed(iter/s)": 0.031663 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.7, + "completions/mean_length": 348.20834197998045, + "completions/min_length": 219.4, + "epoch": 0.8222222222222222, + "grad_norm": 1.3474739820299675, + "kl": 0.018927001953125, + "learning_rate": 2e-07, + "loss": 0.04633485376834869, + "memory(GiB)": 113.5, + "reward": 0.37500000596046446, + "reward_std": 0.27622397541999816, + "rewards/MultiModalAccuracyORM/mean": 0.37500000596046446, + "rewards/MultiModalAccuracyORM/std": 0.27622397541999816, + "step": 2035, + "train_speed(iter/s)": 0.031672 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.4, + "completions/mean_length": 350.50000915527346, + "completions/min_length": 204.4, + "epoch": 0.8242424242424242, + "grad_norm": 1.5018646106657063, + "kl": 0.019403076171875, + "learning_rate": 2e-07, + "loss": 0.030666446685791014, + "memory(GiB)": 113.5, + "reward": 0.49166667014360427, + "reward_std": 0.32050161957740786, + "rewards/MultiModalAccuracyORM/mean": 0.49166667014360427, + "rewards/MultiModalAccuracyORM/std": 0.32050161957740786, + "step": 2040, + "train_speed(iter/s)": 0.031681 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.7, + "completions/mean_length": 343.3500045776367, + "completions/min_length": 214.3, + "epoch": 0.8262626262626263, + "grad_norm": 1.212062767454231, + "kl": 0.0136077880859375, + "learning_rate": 2e-07, + "loss": 0.00010424554347991944, + "memory(GiB)": 113.5, + "reward": 0.2833333402872086, + "reward_std": 0.3485885590314865, + "rewards/MultiModalAccuracyORM/mean": 0.2833333402872086, + "rewards/MultiModalAccuracyORM/std": 0.3485885590314865, + "step": 2045, + "train_speed(iter/s)": 0.031692 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/mean_length": 329.8250106811523, + "completions/min_length": 163.3, + "epoch": 0.8282828282828283, + "grad_norm": 1.4699358421537125, + "kl": 0.015545654296875, + "learning_rate": 2e-07, + "loss": 0.02045893669128418, + "memory(GiB)": 113.5, + "reward": 0.23333333656191826, + "reward_std": 0.21999078392982482, + "rewards/MultiModalAccuracyORM/mean": 0.23333333656191826, + "rewards/MultiModalAccuracyORM/std": 0.21999078392982482, + "step": 2050, + "train_speed(iter/s)": 0.031697 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.3, + "completions/mean_length": 291.33334197998045, + "completions/min_length": 162.8, + "epoch": 0.8303030303030303, + "grad_norm": 1.4524213577819918, + "kl": 0.0140533447265625, + "learning_rate": 2e-07, + "loss": 0.008110976219177246, + "memory(GiB)": 113.5, + "reward": 0.3166666738688946, + "reward_std": 0.20369119048118592, + "rewards/MultiModalAccuracyORM/mean": 0.3166666738688946, + "rewards/MultiModalAccuracyORM/std": 0.20369119048118592, + "step": 2055, + "train_speed(iter/s)": 0.031712 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.6, + "completions/mean_length": 289.6416732788086, + "completions/min_length": 159.3, + "epoch": 0.8323232323232324, + "grad_norm": 4.225291158056462, + "kl": 0.0178955078125, + "learning_rate": 2e-07, + "loss": 0.025725898146629334, + "memory(GiB)": 113.5, + "reward": 0.20000000447034835, + "reward_std": 0.29414459466934206, + "rewards/MultiModalAccuracyORM/mean": 0.20000000447034835, + "rewards/MultiModalAccuracyORM/std": 0.29414459466934206, + "step": 2060, + "train_speed(iter/s)": 0.031722 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.6, + "completions/mean_length": 311.5416793823242, + "completions/min_length": 190.1, + "epoch": 0.8343434343434344, + "grad_norm": 2.6613790818964134, + "kl": 0.021978759765625, + "learning_rate": 2e-07, + "loss": 0.006576963514089584, + "memory(GiB)": 113.5, + "reward": 0.2916666753590107, + "reward_std": 0.40155683159828187, + "rewards/MultiModalAccuracyORM/mean": 0.2916666753590107, + "rewards/MultiModalAccuracyORM/std": 0.40155683159828187, + "step": 2065, + "train_speed(iter/s)": 0.03173 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.6, + "completions/mean_length": 315.6750038146973, + "completions/min_length": 170.0, + "epoch": 0.8363636363636363, + "grad_norm": 1.5051205676406512, + "kl": 0.0233642578125, + "learning_rate": 2e-07, + "loss": 0.09363476037979127, + "memory(GiB)": 113.5, + "reward": 0.508333345502615, + "reward_std": 0.2822715103626251, + "rewards/MultiModalAccuracyORM/mean": 0.508333345502615, + "rewards/MultiModalAccuracyORM/std": 0.2822715103626251, + "step": 2070, + "train_speed(iter/s)": 0.03174 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.5, + "completions/mean_length": 375.75000762939453, + "completions/min_length": 224.4, + "epoch": 0.8383838383838383, + "grad_norm": 1.8057057513107744, + "kl": 0.018963623046875, + "learning_rate": 2e-07, + "loss": -0.023636098206043243, + "memory(GiB)": 113.5, + "reward": 0.4000000111758709, + "reward_std": 0.33306954205036166, + "rewards/MultiModalAccuracyORM/mean": 0.4000000111758709, + "rewards/MultiModalAccuracyORM/std": 0.33306954205036166, + "step": 2075, + "train_speed(iter/s)": 0.031751 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.7, + "completions/mean_length": 414.0416793823242, + "completions/min_length": 194.4, + "epoch": 0.8404040404040404, + "grad_norm": 1.7580953588231485, + "kl": 0.015985107421875, + "learning_rate": 2e-07, + "loss": 0.004860112071037292, + "memory(GiB)": 113.5, + "reward": 0.33333334028720857, + "reward_std": 0.30333785712718964, + "rewards/MultiModalAccuracyORM/mean": 0.33333334028720857, + "rewards/MultiModalAccuracyORM/std": 0.30333785712718964, + "step": 2080, + "train_speed(iter/s)": 0.031761 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.9, + "completions/mean_length": 372.9250122070313, + "completions/min_length": 192.9, + "epoch": 0.8424242424242424, + "grad_norm": 1.6888227745633726, + "kl": 0.018841552734375, + "learning_rate": 2e-07, + "loss": 0.0038746654987335204, + "memory(GiB)": 113.5, + "reward": 0.40000001564621923, + "reward_std": 0.3948384612798691, + "rewards/MultiModalAccuracyORM/mean": 0.40000001564621923, + "rewards/MultiModalAccuracyORM/std": 0.3948384612798691, + "step": 2085, + "train_speed(iter/s)": 0.031778 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.5, + "completions/mean_length": 359.8833480834961, + "completions/min_length": 211.2, + "epoch": 0.8444444444444444, + "grad_norm": 2.0473894442291605, + "kl": 0.0135040283203125, + "learning_rate": 2e-07, + "loss": -0.005132901668548584, + "memory(GiB)": 113.5, + "reward": 0.4833333469927311, + "reward_std": 0.38904850780963895, + "rewards/MultiModalAccuracyORM/mean": 0.4833333469927311, + "rewards/MultiModalAccuracyORM/std": 0.38904850780963895, + "step": 2090, + "train_speed(iter/s)": 0.031799 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.2, + "completions/mean_length": 453.96668243408203, + "completions/min_length": 251.3, + "epoch": 0.8464646464646465, + "grad_norm": 1.9528281428716412, + "kl": 0.015765380859375, + "learning_rate": 2e-07, + "loss": -0.00459083616733551, + "memory(GiB)": 113.5, + "reward": 0.3416666783392429, + "reward_std": 0.4211809396743774, + "rewards/MultiModalAccuracyORM/mean": 0.3416666783392429, + "rewards/MultiModalAccuracyORM/std": 0.4211809396743774, + "step": 2095, + "train_speed(iter/s)": 0.031796 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.1, + "completions/mean_length": 360.11667785644534, + "completions/min_length": 181.9, + "epoch": 0.8484848484848485, + "grad_norm": 1.6610621083165809, + "kl": 0.0178131103515625, + "learning_rate": 2e-07, + "loss": 0.0021423667669296263, + "memory(GiB)": 113.5, + "reward": 0.31666667237877844, + "reward_std": 0.33000870048999786, + "rewards/MultiModalAccuracyORM/mean": 0.31666667237877844, + "rewards/MultiModalAccuracyORM/std": 0.33000870048999786, + "step": 2100, + "train_speed(iter/s)": 0.031806 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.3, + "completions/mean_length": 264.67501068115234, + "completions/min_length": 136.9, + "epoch": 0.8505050505050505, + "grad_norm": 2.0696467764003192, + "kl": 0.163067626953125, + "learning_rate": 2e-07, + "loss": 0.0025389432907104493, + "memory(GiB)": 113.5, + "reward": 0.5000000096857548, + "reward_std": 0.22625694572925567, + "rewards/MultiModalAccuracyORM/mean": 0.5000000096857548, + "rewards/MultiModalAccuracyORM/std": 0.22625694572925567, + "step": 2105, + "train_speed(iter/s)": 0.03182 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.7, + "completions/mean_length": 341.6750091552734, + "completions/min_length": 188.4, + "epoch": 0.8525252525252526, + "grad_norm": 1.7692403149903426, + "kl": 0.0196319580078125, + "learning_rate": 2e-07, + "loss": 0.016690313816070557, + "memory(GiB)": 113.5, + "reward": 0.3583333343267441, + "reward_std": 0.21292004883289337, + "rewards/MultiModalAccuracyORM/mean": 0.3583333343267441, + "rewards/MultiModalAccuracyORM/std": 0.21292004883289337, + "step": 2110, + "train_speed(iter/s)": 0.031829 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.9, + "completions/mean_length": 410.6416839599609, + "completions/min_length": 215.5, + "epoch": 0.8545454545454545, + "grad_norm": 1.0155490841614827, + "kl": 0.0239013671875, + "learning_rate": 2e-07, + "loss": 0.06116962432861328, + "memory(GiB)": 113.5, + "reward": 0.3000000067055225, + "reward_std": 0.3330695390701294, + "rewards/MultiModalAccuracyORM/mean": 0.3000000067055225, + "rewards/MultiModalAccuracyORM/std": 0.3330695390701294, + "step": 2115, + "train_speed(iter/s)": 0.031834 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 756.7, + "completions/mean_length": 427.03334350585936, + "completions/min_length": 240.2, + "epoch": 0.8565656565656565, + "grad_norm": 1.5010329222185153, + "kl": 0.0179931640625, + "learning_rate": 2e-07, + "loss": 0.008207672834396362, + "memory(GiB)": 113.5, + "reward": 0.3000000067055225, + "reward_std": 0.26822818219661715, + "rewards/MultiModalAccuracyORM/mean": 0.3000000067055225, + "rewards/MultiModalAccuracyORM/std": 0.26822818219661715, + "step": 2120, + "train_speed(iter/s)": 0.031834 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/mean_length": 278.35834350585935, + "completions/min_length": 133.5, + "epoch": 0.8585858585858586, + "grad_norm": 2.4751168878714296, + "kl": 0.0186279296875, + "learning_rate": 2e-07, + "loss": 0.002880534529685974, + "memory(GiB)": 113.5, + "reward": 0.44166667833924295, + "reward_std": 0.26897315979003905, + "rewards/MultiModalAccuracyORM/mean": 0.44166667833924295, + "rewards/MultiModalAccuracyORM/std": 0.26897315979003905, + "step": 2125, + "train_speed(iter/s)": 0.03185 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 778.6, + "completions/mean_length": 397.366682434082, + "completions/min_length": 238.8, + "epoch": 0.8606060606060606, + "grad_norm": 2.5358484406016406, + "kl": 0.024908447265625, + "learning_rate": 2e-07, + "loss": -0.008894717693328858, + "memory(GiB)": 113.5, + "reward": 0.4166666753590107, + "reward_std": 0.39010730385780334, + "rewards/MultiModalAccuracyORM/mean": 0.4166666753590107, + "rewards/MultiModalAccuracyORM/std": 0.39010730385780334, + "step": 2130, + "train_speed(iter/s)": 0.031849 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.6, + "completions/mean_length": 347.92500762939454, + "completions/min_length": 210.0, + "epoch": 0.8626262626262626, + "grad_norm": 1.4480874521635712, + "kl": 0.013836669921875, + "learning_rate": 2e-07, + "loss": -0.02624996304512024, + "memory(GiB)": 113.5, + "reward": 0.24166666865348815, + "reward_std": 0.2815766751766205, + "rewards/MultiModalAccuracyORM/mean": 0.24166666865348815, + "rewards/MultiModalAccuracyORM/std": 0.2815766751766205, + "step": 2135, + "train_speed(iter/s)": 0.031857 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 792.0, + "completions/mean_length": 399.3750129699707, + "completions/min_length": 233.7, + "epoch": 0.8646464646464647, + "grad_norm": 2.3120304434709595, + "kl": 0.01986083984375, + "learning_rate": 2e-07, + "loss": -0.004719728231430053, + "memory(GiB)": 113.5, + "reward": 0.22500000149011612, + "reward_std": 0.22384164929389955, + "rewards/MultiModalAccuracyORM/mean": 0.22500000149011612, + "rewards/MultiModalAccuracyORM/std": 0.22384164929389955, + "step": 2140, + "train_speed(iter/s)": 0.03185 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.2, + "completions/mean_length": 441.508349609375, + "completions/min_length": 236.5, + "epoch": 0.8666666666666667, + "grad_norm": 2.17037271282662, + "kl": 0.01793212890625, + "learning_rate": 2e-07, + "loss": -0.012784427404403687, + "memory(GiB)": 113.5, + "reward": 0.23333333432674408, + "reward_std": 0.2581467509269714, + "rewards/MultiModalAccuracyORM/mean": 0.23333333432674408, + "rewards/MultiModalAccuracyORM/std": 0.2581467509269714, + "step": 2145, + "train_speed(iter/s)": 0.03185 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.1, + "completions/mean_length": 304.8583419799805, + "completions/min_length": 197.2, + "epoch": 0.8686868686868687, + "grad_norm": 2.4684483286798313, + "kl": 0.018658447265625, + "learning_rate": 2e-07, + "loss": -0.013285607099533081, + "memory(GiB)": 113.5, + "reward": 0.4250000067055225, + "reward_std": 0.3696640759706497, + "rewards/MultiModalAccuracyORM/mean": 0.4250000067055225, + "rewards/MultiModalAccuracyORM/std": 0.3696640759706497, + "step": 2150, + "train_speed(iter/s)": 0.031865 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.6, + "completions/mean_length": 357.44168243408205, + "completions/min_length": 206.1, + "epoch": 0.8707070707070707, + "grad_norm": 2.4866065792724794, + "kl": 0.01856689453125, + "learning_rate": 2e-07, + "loss": -0.014015734195709229, + "memory(GiB)": 113.5, + "reward": 0.23333334177732468, + "reward_std": 0.24436976611614228, + "rewards/MultiModalAccuracyORM/mean": 0.23333334177732468, + "rewards/MultiModalAccuracyORM/std": 0.24436976611614228, + "step": 2155, + "train_speed(iter/s)": 0.03187 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.2, + "completions/mean_length": 316.7916778564453, + "completions/min_length": 156.8, + "epoch": 0.8727272727272727, + "grad_norm": 0.10342060356020474, + "kl": 0.019561767578125, + "learning_rate": 2e-07, + "loss": 0.015072919428348541, + "memory(GiB)": 113.5, + "reward": 0.5250000067055225, + "reward_std": 0.23303491175174712, + "rewards/MultiModalAccuracyORM/mean": 0.5250000067055225, + "rewards/MultiModalAccuracyORM/std": 0.23303491175174712, + "step": 2160, + "train_speed(iter/s)": 0.031885 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.5, + "completions/mean_length": 359.40834350585936, + "completions/min_length": 213.1, + "epoch": 0.8747474747474747, + "grad_norm": 2.001531798018373, + "kl": 0.02108154296875, + "learning_rate": 2e-07, + "loss": 0.03580483496189117, + "memory(GiB)": 113.5, + "reward": 0.2916666708886623, + "reward_std": 0.37593023777008056, + "rewards/MultiModalAccuracyORM/mean": 0.2916666708886623, + "rewards/MultiModalAccuracyORM/std": 0.37593023777008056, + "step": 2165, + "train_speed(iter/s)": 0.03189 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.1, + "completions/mean_length": 376.5666778564453, + "completions/min_length": 215.1, + "epoch": 0.8767676767676768, + "grad_norm": 1.7513379048306494, + "kl": 0.01739501953125, + "learning_rate": 2e-07, + "loss": -0.001603315770626068, + "memory(GiB)": 113.5, + "reward": 0.35833333656191824, + "reward_std": 0.27927026748657224, + "rewards/MultiModalAccuracyORM/mean": 0.35833333656191824, + "rewards/MultiModalAccuracyORM/std": 0.27927026748657224, + "step": 2170, + "train_speed(iter/s)": 0.031901 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.4, + "completions/mean_length": 326.4833419799805, + "completions/min_length": 178.7, + "epoch": 0.8787878787878788, + "grad_norm": 1.8257626566364757, + "kl": 0.01832275390625, + "learning_rate": 2e-07, + "loss": -0.0064360305666923525, + "memory(GiB)": 113.5, + "reward": 0.4000000096857548, + "reward_std": 0.2528681933879852, + "rewards/MultiModalAccuracyORM/mean": 0.4000000096857548, + "rewards/MultiModalAccuracyORM/std": 0.2528681933879852, + "step": 2175, + "train_speed(iter/s)": 0.031922 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.1, + "completions/mean_length": 443.8500183105469, + "completions/min_length": 252.4, + "epoch": 0.8808080808080808, + "grad_norm": 2.0080312898238777, + "kl": 0.0168212890625, + "learning_rate": 2e-07, + "loss": 0.003071814775466919, + "memory(GiB)": 113.5, + "reward": 0.25000001341104505, + "reward_std": 0.27749558687210085, + "rewards/MultiModalAccuracyORM/mean": 0.25000001341104505, + "rewards/MultiModalAccuracyORM/std": 0.27749558687210085, + "step": 2180, + "train_speed(iter/s)": 0.031939 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.9, + "completions/mean_length": 286.4000068664551, + "completions/min_length": 137.1, + "epoch": 0.8828282828282829, + "grad_norm": 2.6881230452510176, + "kl": 0.0223968505859375, + "learning_rate": 2e-07, + "loss": 0.0006526708602905273, + "memory(GiB)": 113.5, + "reward": 0.5333333387970924, + "reward_std": 0.20369119048118592, + "rewards/MultiModalAccuracyORM/mean": 0.5333333387970924, + "rewards/MultiModalAccuracyORM/std": 0.20369119048118592, + "step": 2185, + "train_speed(iter/s)": 0.031947 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.5, + "completions/mean_length": 402.3166793823242, + "completions/min_length": 210.6, + "epoch": 0.8848484848484849, + "grad_norm": 2.1762586962994126, + "kl": 0.02080078125, + "learning_rate": 2e-07, + "loss": 0.01941031664609909, + "memory(GiB)": 113.5, + "reward": 0.13333333805203437, + "reward_std": 0.28399197161197665, + "rewards/MultiModalAccuracyORM/mean": 0.13333333805203437, + "rewards/MultiModalAccuracyORM/std": 0.28399197161197665, + "step": 2190, + "train_speed(iter/s)": 0.031947 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.7, + "completions/mean_length": 376.9000160217285, + "completions/min_length": 233.4, + "epoch": 0.8868686868686869, + "grad_norm": 1.5099724310943463, + "kl": 0.013751220703125, + "learning_rate": 2e-07, + "loss": 0.018771827220916748, + "memory(GiB)": 113.5, + "reward": 0.3083333395421505, + "reward_std": 0.22406027615070342, + "rewards/MultiModalAccuracyORM/mean": 0.3083333395421505, + "rewards/MultiModalAccuracyORM/std": 0.22406027615070342, + "step": 2195, + "train_speed(iter/s)": 0.031954 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 564.0, + "completions/mean_length": 270.4416732788086, + "completions/min_length": 142.9, + "epoch": 0.8888888888888888, + "grad_norm": 2.2743682671690997, + "kl": 0.023577880859375, + "learning_rate": 2e-07, + "loss": 0.025069376826286315, + "memory(GiB)": 113.5, + "reward": 0.41666666939854624, + "reward_std": 0.34936913251876833, + "rewards/MultiModalAccuracyORM/mean": 0.41666666939854624, + "rewards/MultiModalAccuracyORM/std": 0.34936913251876833, + "step": 2200, + "train_speed(iter/s)": 0.031957 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.7, + "completions/mean_length": 299.5250076293945, + "completions/min_length": 173.3, + "epoch": 0.8909090909090909, + "grad_norm": 1.7147767163429606, + "kl": 0.015533447265625, + "learning_rate": 2e-07, + "loss": -0.01650981158018112, + "memory(GiB)": 113.5, + "reward": 0.3666666761040688, + "reward_std": 0.26142621636390684, + "rewards/MultiModalAccuracyORM/mean": 0.3666666761040688, + "rewards/MultiModalAccuracyORM/std": 0.26142621636390684, + "step": 2205, + "train_speed(iter/s)": 0.031967 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 678.7, + "completions/mean_length": 303.3583396911621, + "completions/min_length": 148.9, + "epoch": 0.8929292929292929, + "grad_norm": 0.07977564640032515, + "kl": 0.022882080078125, + "learning_rate": 2e-07, + "loss": -0.015148724615573882, + "memory(GiB)": 113.5, + "reward": 0.2750000052154064, + "reward_std": 0.2333131343126297, + "rewards/MultiModalAccuracyORM/mean": 0.2750000052154064, + "rewards/MultiModalAccuracyORM/std": 0.2333131343126297, + "step": 2210, + "train_speed(iter/s)": 0.031965 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.5, + "completions/mean_length": 388.18333740234374, + "completions/min_length": 198.5, + "epoch": 0.8949494949494949, + "grad_norm": 0.8099900753838608, + "kl": 0.0284912109375, + "learning_rate": 2e-07, + "loss": 0.00753181129693985, + "memory(GiB)": 113.5, + "reward": 0.416666679084301, + "reward_std": 0.34156554043292997, + "rewards/MultiModalAccuracyORM/mean": 0.416666679084301, + "rewards/MultiModalAccuracyORM/std": 0.34156554043292997, + "step": 2215, + "train_speed(iter/s)": 0.03197 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 779.3, + "completions/mean_length": 450.6333526611328, + "completions/min_length": 239.0, + "epoch": 0.896969696969697, + "grad_norm": 1.7310208765669708, + "kl": 0.0198486328125, + "learning_rate": 2e-07, + "loss": -0.004081086814403534, + "memory(GiB)": 113.5, + "reward": 0.2500000074505806, + "reward_std": 0.3800142765045166, + "rewards/MultiModalAccuracyORM/mean": 0.2500000074505806, + "rewards/MultiModalAccuracyORM/std": 0.3800142765045166, + "step": 2220, + "train_speed(iter/s)": 0.03197 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 676.4, + "completions/mean_length": 360.75001220703126, + "completions/min_length": 217.2, + "epoch": 0.898989898989899, + "grad_norm": 2.1020973545612702, + "kl": 0.01806640625, + "learning_rate": 2e-07, + "loss": 0.03712728023529053, + "memory(GiB)": 113.5, + "reward": 0.3000000134110451, + "reward_std": 0.32673218548297883, + "rewards/MultiModalAccuracyORM/mean": 0.3000000134110451, + "rewards/MultiModalAccuracyORM/std": 0.32673218548297883, + "step": 2225, + "train_speed(iter/s)": 0.031975 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 820.2, + "completions/mean_length": 446.9583396911621, + "completions/min_length": 246.3, + "epoch": 0.901010101010101, + "grad_norm": 1.2524422904219505, + "kl": 0.016302490234375, + "learning_rate": 2e-07, + "loss": -0.02771589457988739, + "memory(GiB)": 113.5, + "reward": 0.32500000596046447, + "reward_std": 0.31088480055332185, + "rewards/MultiModalAccuracyORM/mean": 0.32500000596046447, + "rewards/MultiModalAccuracyORM/std": 0.31088480055332185, + "step": 2230, + "train_speed(iter/s)": 0.031963 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.1, + "completions/mean_length": 314.425008392334, + "completions/min_length": 211.4, + "epoch": 0.9030303030303031, + "grad_norm": 1.941568088508899, + "kl": 0.0159149169921875, + "learning_rate": 2e-07, + "loss": 0.03777821063995361, + "memory(GiB)": 113.5, + "reward": 0.450000011920929, + "reward_std": 0.391499400138855, + "rewards/MultiModalAccuracyORM/mean": 0.450000011920929, + "rewards/MultiModalAccuracyORM/std": 0.391499400138855, + "step": 2235, + "train_speed(iter/s)": 0.031975 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.2, + "completions/mean_length": 344.6333435058594, + "completions/min_length": 198.5, + "epoch": 0.9050505050505051, + "grad_norm": 2.0763848655087673, + "kl": 0.019061279296875, + "learning_rate": 2e-07, + "loss": -0.0011584073305130004, + "memory(GiB)": 113.5, + "reward": 0.40000000670552255, + "reward_std": 0.34407602846622465, + "rewards/MultiModalAccuracyORM/mean": 0.40000000670552255, + "rewards/MultiModalAccuracyORM/std": 0.34407602846622465, + "step": 2240, + "train_speed(iter/s)": 0.031993 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.4, + "completions/mean_length": 319.63334197998046, + "completions/min_length": 160.3, + "epoch": 0.907070707070707, + "grad_norm": 2.59722388457303, + "kl": 0.022515869140625, + "learning_rate": 2e-07, + "loss": -0.017506715655326844, + "memory(GiB)": 113.5, + "reward": 0.27500001043081285, + "reward_std": 0.3227818846702576, + "rewards/MultiModalAccuracyORM/mean": 0.27500001043081285, + "rewards/MultiModalAccuracyORM/std": 0.3227818846702576, + "step": 2245, + "train_speed(iter/s)": 0.031998 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.3781323461012882, + "learning_rate": 2e-07, + "loss": 0.01341366171836853, + "memory(GiB)": 113.5, + "step": 2250, + "train_speed(iter/s)": 0.032003 + }, + { + "epoch": 0.9090909090909091, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0016666666666666666, + "eval_completions/max_length": 642.72, + "eval_completions/mean_length": 376.58501220703124, + "eval_completions/min_length": 201.48, + "eval_kl": 0.01755615234375, + "eval_loss": 0.022878510877490044, + "eval_reward": 0.3366666728258133, + "eval_reward_std": 0.29963068544864657, + "eval_rewards/MultiModalAccuracyORM/mean": 0.3366666728258133, + "eval_rewards/MultiModalAccuracyORM/std": 0.29963068544864657, + "eval_runtime": 620.6156, + "eval_samples_per_second": 0.081, + "eval_steps_per_second": 0.008, + "step": 2250 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.55, + "completions/mean_length": 408.7458442687988, + "completions/min_length": 214.2, + "epoch": 0.9111111111111111, + "grad_norm": 0.0911724129495613, + "kl": 0.01767578125, + "learning_rate": 2e-07, + "loss": 0.05687015056610108, + "memory(GiB)": 113.5, + "reward": 0.3166666738688946, + "reward_std": 0.32789033353328706, + "rewards/MultiModalAccuracyORM/mean": 0.3166666738688946, + "rewards/MultiModalAccuracyORM/std": 0.32789033353328706, + "step": 2255, + "train_speed(iter/s)": 0.031634 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 646.3, + "completions/mean_length": 310.35000762939455, + "completions/min_length": 147.8, + "epoch": 0.9131313131313131, + "grad_norm": 2.3215328543725837, + "kl": 0.02152099609375, + "learning_rate": 2e-07, + "loss": -0.02131924331188202, + "memory(GiB)": 113.5, + "reward": 0.3750000037252903, + "reward_std": 0.2659719169139862, + "rewards/MultiModalAccuracyORM/mean": 0.3750000037252903, + "rewards/MultiModalAccuracyORM/std": 0.2659719169139862, + "step": 2260, + "train_speed(iter/s)": 0.03164 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.3, + "completions/mean_length": 371.5416793823242, + "completions/min_length": 220.7, + "epoch": 0.9151515151515152, + "grad_norm": 2.126621344773754, + "kl": 0.018896484375, + "learning_rate": 2e-07, + "loss": 0.024756547808647156, + "memory(GiB)": 113.5, + "reward": 0.2750000052154064, + "reward_std": 0.2619264245033264, + "rewards/MultiModalAccuracyORM/mean": 0.2750000052154064, + "rewards/MultiModalAccuracyORM/std": 0.2619264245033264, + "step": 2265, + "train_speed(iter/s)": 0.031637 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.4, + "completions/mean_length": 415.3916778564453, + "completions/min_length": 254.9, + "epoch": 0.9171717171717172, + "grad_norm": 2.9790243495572137, + "kl": 0.0215087890625, + "learning_rate": 2e-07, + "loss": -0.012356171011924743, + "memory(GiB)": 113.5, + "reward": 0.1666666679084301, + "reward_std": 0.27520077526569364, + "rewards/MultiModalAccuracyORM/mean": 0.1666666679084301, + "rewards/MultiModalAccuracyORM/std": 0.27520077526569364, + "step": 2270, + "train_speed(iter/s)": 0.031641 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 771.9, + "completions/mean_length": 425.84167633056643, + "completions/min_length": 242.2, + "epoch": 0.9191919191919192, + "grad_norm": 1.0861715717638791, + "kl": 0.0269012451171875, + "learning_rate": 2e-07, + "loss": 0.010645134747028351, + "memory(GiB)": 113.5, + "reward": 0.2583333425223827, + "reward_std": 0.30260742604732516, + "rewards/MultiModalAccuracyORM/mean": 0.2583333425223827, + "rewards/MultiModalAccuracyORM/std": 0.30260742604732516, + "step": 2275, + "train_speed(iter/s)": 0.031636 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.9, + "completions/mean_length": 271.05000762939454, + "completions/min_length": 149.0, + "epoch": 0.9212121212121213, + "grad_norm": 0.05924941442962051, + "kl": 0.0225830078125, + "learning_rate": 2e-07, + "loss": 0.031935521960258485, + "memory(GiB)": 113.5, + "reward": 0.28333333805203437, + "reward_std": 0.304396653175354, + "rewards/MultiModalAccuracyORM/mean": 0.28333333805203437, + "rewards/MultiModalAccuracyORM/std": 0.304396653175354, + "step": 2280, + "train_speed(iter/s)": 0.031652 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.7, + "completions/mean_length": 391.0416763305664, + "completions/min_length": 203.9, + "epoch": 0.9232323232323232, + "grad_norm": 2.8587723324821566, + "kl": 0.024896240234375, + "learning_rate": 2e-07, + "loss": 0.017455708980560303, + "memory(GiB)": 113.5, + "reward": 0.33333333805203436, + "reward_std": 0.29177859127521516, + "rewards/MultiModalAccuracyORM/mean": 0.33333333805203436, + "rewards/MultiModalAccuracyORM/std": 0.29177859127521516, + "step": 2285, + "train_speed(iter/s)": 0.031656 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.5, + "completions/mean_length": 356.05834197998047, + "completions/min_length": 179.4, + "epoch": 0.9252525252525252, + "grad_norm": 3.219718675307709, + "kl": 0.0145965576171875, + "learning_rate": 2e-07, + "loss": -0.032944440841674805, + "memory(GiB)": 113.5, + "reward": 0.3000000067055225, + "reward_std": 0.2840515673160553, + "rewards/MultiModalAccuracyORM/mean": 0.3000000067055225, + "rewards/MultiModalAccuracyORM/std": 0.2840515673160553, + "step": 2290, + "train_speed(iter/s)": 0.031663 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.9, + "completions/mean_length": 448.50000762939453, + "completions/min_length": 247.4, + "epoch": 0.9272727272727272, + "grad_norm": 0.8757317301258869, + "kl": 0.0155517578125, + "learning_rate": 2e-07, + "loss": -0.008566761016845703, + "memory(GiB)": 113.5, + "reward": 0.2833333432674408, + "reward_std": 0.27596975266933443, + "rewards/MultiModalAccuracyORM/mean": 0.2833333432674408, + "rewards/MultiModalAccuracyORM/std": 0.27596975266933443, + "step": 2295, + "train_speed(iter/s)": 0.031671 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.8, + "completions/mean_length": 507.6833526611328, + "completions/min_length": 320.1, + "epoch": 0.9292929292929293, + "grad_norm": 0.9362166291898165, + "kl": 0.02156982421875, + "learning_rate": 2e-07, + "loss": 0.018462255597114563, + "memory(GiB)": 113.5, + "reward": 0.3083333373069763, + "reward_std": 0.2464074045419693, + "rewards/MultiModalAccuracyORM/mean": 0.3083333373069763, + "rewards/MultiModalAccuracyORM/std": 0.2464074045419693, + "step": 2300, + "train_speed(iter/s)": 0.03166 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.9, + "completions/mean_length": 257.65000381469724, + "completions/min_length": 128.1, + "epoch": 0.9313131313131313, + "grad_norm": 2.658818675138077, + "kl": 0.029730224609375, + "learning_rate": 2e-07, + "loss": 0.023678554594516753, + "memory(GiB)": 113.5, + "reward": 0.30000000521540643, + "reward_std": 0.15821026563644408, + "rewards/MultiModalAccuracyORM/mean": 0.30000000521540643, + "rewards/MultiModalAccuracyORM/std": 0.15821026563644408, + "step": 2305, + "train_speed(iter/s)": 0.031671 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 800.1, + "completions/mean_length": 461.5750137329102, + "completions/min_length": 264.1, + "epoch": 0.9333333333333333, + "grad_norm": 1.525329758838897, + "kl": 0.0238433837890625, + "learning_rate": 2e-07, + "loss": 0.016385090351104737, + "memory(GiB)": 113.5, + "reward": 0.1666666693985462, + "reward_std": 0.3190022110939026, + "rewards/MultiModalAccuracyORM/mean": 0.1666666693985462, + "rewards/MultiModalAccuracyORM/std": 0.3190022110939026, + "step": 2310, + "train_speed(iter/s)": 0.031667 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/mean_length": 360.6000061035156, + "completions/min_length": 211.2, + "epoch": 0.9353535353535354, + "grad_norm": 2.555254446139955, + "kl": 0.01925048828125, + "learning_rate": 2e-07, + "loss": -0.025304621458053587, + "memory(GiB)": 113.5, + "reward": 0.24166667088866234, + "reward_std": 0.309637188911438, + "rewards/MultiModalAccuracyORM/mean": 0.24166667088866234, + "rewards/MultiModalAccuracyORM/std": 0.309637188911438, + "step": 2315, + "train_speed(iter/s)": 0.031675 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/mean_length": 406.8916778564453, + "completions/min_length": 230.6, + "epoch": 0.9373737373737374, + "grad_norm": 1.8305198392785023, + "kl": 0.0230133056640625, + "learning_rate": 2e-07, + "loss": -0.014680406451225281, + "memory(GiB)": 113.5, + "reward": 0.3500000037252903, + "reward_std": 0.3111986190080643, + "rewards/MultiModalAccuracyORM/mean": 0.3500000037252903, + "rewards/MultiModalAccuracyORM/std": 0.3111986190080643, + "step": 2320, + "train_speed(iter/s)": 0.031676 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.8, + "completions/mean_length": 282.15833892822263, + "completions/min_length": 162.0, + "epoch": 0.9393939393939394, + "grad_norm": 2.852841229555507, + "kl": 0.02640380859375, + "learning_rate": 2e-07, + "loss": 0.0326883852481842, + "memory(GiB)": 113.5, + "reward": 0.3083333432674408, + "reward_std": 0.4167425513267517, + "rewards/MultiModalAccuracyORM/mean": 0.3083333432674408, + "rewards/MultiModalAccuracyORM/std": 0.4167425513267517, + "step": 2325, + "train_speed(iter/s)": 0.031687 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 790.1, + "completions/mean_length": 384.29168395996095, + "completions/min_length": 177.1, + "epoch": 0.9414141414141414, + "grad_norm": 2.0178414254144723, + "kl": 0.018560791015625, + "learning_rate": 2e-07, + "loss": 0.008831435441970825, + "memory(GiB)": 113.5, + "reward": 0.38333334028720856, + "reward_std": 0.36893364489078523, + "rewards/MultiModalAccuracyORM/mean": 0.38333334028720856, + "rewards/MultiModalAccuracyORM/std": 0.36893364489078523, + "step": 2330, + "train_speed(iter/s)": 0.031685 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 679.0, + "completions/mean_length": 368.05834503173827, + "completions/min_length": 206.5, + "epoch": 0.9434343434343434, + "grad_norm": 1.5228784773962754, + "kl": 0.015521240234375, + "learning_rate": 2e-07, + "loss": -0.008360534906387329, + "memory(GiB)": 113.5, + "reward": 0.3083333373069763, + "reward_std": 0.3352662086486816, + "rewards/MultiModalAccuracyORM/mean": 0.3083333373069763, + "rewards/MultiModalAccuracyORM/std": 0.3352662086486816, + "step": 2335, + "train_speed(iter/s)": 0.031686 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 1030.2, + "completions/mean_length": 481.2666900634766, + "completions/min_length": 237.4, + "epoch": 0.9454545454545454, + "grad_norm": 1.418697346446445, + "kl": 0.0329925537109375, + "learning_rate": 2e-07, + "loss": 0.0726934552192688, + "memory(GiB)": 113.5, + "reward": 0.2833333395421505, + "reward_std": 0.3713845372200012, + "rewards/MultiModalAccuracyORM/mean": 0.2833333395421505, + "rewards/MultiModalAccuracyORM/std": 0.3713845372200012, + "step": 2340, + "train_speed(iter/s)": 0.031672 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 668.9, + "completions/mean_length": 321.70001220703125, + "completions/min_length": 150.2, + "epoch": 0.9474747474747475, + "grad_norm": 2.0538342098414333, + "kl": 0.03148193359375, + "learning_rate": 2e-07, + "loss": 0.035471782088279724, + "memory(GiB)": 113.5, + "reward": 0.29166667610406877, + "reward_std": 0.1973894327878952, + "rewards/MultiModalAccuracyORM/mean": 0.29166667610406877, + "rewards/MultiModalAccuracyORM/std": 0.1973894327878952, + "step": 2345, + "train_speed(iter/s)": 0.031674 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.4, + "completions/mean_length": 358.76668243408204, + "completions/min_length": 218.4, + "epoch": 0.9494949494949495, + "grad_norm": 2.6339218970926903, + "kl": 0.0245635986328125, + "learning_rate": 2e-07, + "loss": 0.004336267709732056, + "memory(GiB)": 113.5, + "reward": 0.36666667833924294, + "reward_std": 0.32297651171684266, + "rewards/MultiModalAccuracyORM/mean": 0.36666667833924294, + "rewards/MultiModalAccuracyORM/std": 0.32297651171684266, + "step": 2350, + "train_speed(iter/s)": 0.031691 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.4, + "completions/mean_length": 282.98334274291994, + "completions/min_length": 158.3, + "epoch": 0.9515151515151515, + "grad_norm": 2.0291656458591145, + "kl": 0.020587158203125, + "learning_rate": 2e-07, + "loss": -0.05831232666969299, + "memory(GiB)": 113.5, + "reward": 0.5000000081956386, + "reward_std": 0.3330099433660507, + "rewards/MultiModalAccuracyORM/mean": 0.5000000081956386, + "rewards/MultiModalAccuracyORM/std": 0.3330099433660507, + "step": 2355, + "train_speed(iter/s)": 0.031702 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.3, + "completions/mean_length": 282.77500610351564, + "completions/min_length": 139.0, + "epoch": 0.9535353535353536, + "grad_norm": 0.11573786538748869, + "kl": 0.0304229736328125, + "learning_rate": 2e-07, + "loss": 0.03489102721214295, + "memory(GiB)": 113.5, + "reward": 0.24166666939854622, + "reward_std": 0.2355453997850418, + "rewards/MultiModalAccuracyORM/mean": 0.24166666939854622, + "rewards/MultiModalAccuracyORM/std": 0.2355453997850418, + "step": 2360, + "train_speed(iter/s)": 0.031718 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.4, + "completions/mean_length": 412.06668243408205, + "completions/min_length": 245.3, + "epoch": 0.9555555555555556, + "grad_norm": 2.17622948866824, + "kl": 0.019061279296875, + "learning_rate": 2e-07, + "loss": 0.005562397837638855, + "memory(GiB)": 113.5, + "reward": 0.4250000111758709, + "reward_std": 0.45383972525596616, + "rewards/MultiModalAccuracyORM/mean": 0.4250000111758709, + "rewards/MultiModalAccuracyORM/std": 0.45383972525596616, + "step": 2365, + "train_speed(iter/s)": 0.031722 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.1, + "completions/mean_length": 371.3166793823242, + "completions/min_length": 208.3, + "epoch": 0.9575757575757575, + "grad_norm": 2.3917395292059282, + "kl": 0.031591796875, + "learning_rate": 2e-07, + "loss": 0.00018071085214614867, + "memory(GiB)": 113.5, + "reward": 0.291666679084301, + "reward_std": 0.26498726308345794, + "rewards/MultiModalAccuracyORM/mean": 0.291666679084301, + "rewards/MultiModalAccuracyORM/std": 0.26498726308345794, + "step": 2370, + "train_speed(iter/s)": 0.031735 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.6, + "completions/mean_length": 301.4333435058594, + "completions/min_length": 178.5, + "epoch": 0.9595959595959596, + "grad_norm": 3.5970167327213822, + "kl": 0.02054443359375, + "learning_rate": 2e-07, + "loss": 0.01565767079591751, + "memory(GiB)": 113.5, + "reward": 0.44166667833924295, + "reward_std": 0.26897316575050356, + "rewards/MultiModalAccuracyORM/mean": 0.44166667833924295, + "rewards/MultiModalAccuracyORM/std": 0.26897316575050356, + "step": 2375, + "train_speed(iter/s)": 0.031751 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.8, + "completions/mean_length": 327.98334350585935, + "completions/min_length": 184.6, + "epoch": 0.9616161616161616, + "grad_norm": 2.197976826013823, + "kl": 0.021331787109375, + "learning_rate": 2e-07, + "loss": 0.005569913983345031, + "memory(GiB)": 113.5, + "reward": 0.43333334028720855, + "reward_std": 0.3840597689151764, + "rewards/MultiModalAccuracyORM/mean": 0.43333334028720855, + "rewards/MultiModalAccuracyORM/std": 0.3840597689151764, + "step": 2380, + "train_speed(iter/s)": 0.031761 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.016666666666666666, + "completions/max_length": 708.0, + "completions/mean_length": 293.06667556762693, + "completions/min_length": 155.5, + "epoch": 0.9636363636363636, + "grad_norm": 2.126614857423257, + "kl": 0.0335205078125, + "learning_rate": 2e-07, + "loss": 0.0018027305603027343, + "memory(GiB)": 113.5, + "reward": 0.416666679084301, + "reward_std": 0.3855114609003067, + "rewards/MultiModalAccuracyORM/mean": 0.416666679084301, + "rewards/MultiModalAccuracyORM/std": 0.3855114609003067, + "step": 2385, + "train_speed(iter/s)": 0.031758 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.6, + "completions/mean_length": 374.27501220703124, + "completions/min_length": 226.1, + "epoch": 0.9656565656565657, + "grad_norm": 2.285791825740683, + "kl": 0.03223876953125, + "learning_rate": 2e-07, + "loss": -0.007699564099311829, + "memory(GiB)": 113.5, + "reward": 0.14166667088866233, + "reward_std": 0.3000969380140305, + "rewards/MultiModalAccuracyORM/mean": 0.14166667088866233, + "rewards/MultiModalAccuracyORM/std": 0.3000969380140305, + "step": 2390, + "train_speed(iter/s)": 0.031759 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 601.7, + "completions/mean_length": 297.45834197998045, + "completions/min_length": 152.0, + "epoch": 0.9676767676767677, + "grad_norm": 2.926559087104104, + "kl": 0.0413818359375, + "learning_rate": 2e-07, + "loss": 0.04997736811637878, + "memory(GiB)": 113.5, + "reward": 0.31666667610406873, + "reward_std": 0.3687034219503403, + "rewards/MultiModalAccuracyORM/mean": 0.31666667610406873, + "rewards/MultiModalAccuracyORM/std": 0.3687034219503403, + "step": 2395, + "train_speed(iter/s)": 0.031761 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.1, + "completions/mean_length": 422.52500915527344, + "completions/min_length": 260.5, + "epoch": 0.9696969696969697, + "grad_norm": 1.0391142999786047, + "kl": 0.02857666015625, + "learning_rate": 2e-07, + "loss": -0.008375594019889831, + "memory(GiB)": 113.5, + "reward": 0.45000000670552254, + "reward_std": 0.34407602846622465, + "rewards/MultiModalAccuracyORM/mean": 0.45000000670552254, + "rewards/MultiModalAccuracyORM/std": 0.34407602846622465, + "step": 2400, + "train_speed(iter/s)": 0.031765 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 726.7, + "completions/mean_length": 369.96667938232423, + "completions/min_length": 166.7, + "epoch": 0.9717171717171718, + "grad_norm": 1.9044448293066447, + "kl": 0.034771728515625, + "learning_rate": 2e-07, + "loss": 0.041448038816452024, + "memory(GiB)": 113.5, + "reward": 0.3666666761040688, + "reward_std": 0.3330695390701294, + "rewards/MultiModalAccuracyORM/mean": 0.3666666761040688, + "rewards/MultiModalAccuracyORM/std": 0.3330695390701294, + "step": 2405, + "train_speed(iter/s)": 0.031765 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.016666666666666666, + "completions/max_length": 860.8, + "completions/mean_length": 442.18335113525393, + "completions/min_length": 199.9, + "epoch": 0.9737373737373738, + "grad_norm": 1.1043100849775993, + "kl": 0.0294525146484375, + "learning_rate": 2e-07, + "loss": 0.011988846212625503, + "memory(GiB)": 113.5, + "reward": 0.3166666731238365, + "reward_std": 0.383000972867012, + "rewards/MultiModalAccuracyORM/mean": 0.3166666731238365, + "rewards/MultiModalAccuracyORM/std": 0.383000972867012, + "step": 2410, + "train_speed(iter/s)": 0.031763 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03333333333333333, + "completions/max_length": 964.9, + "completions/mean_length": 423.55834503173827, + "completions/min_length": 182.4, + "epoch": 0.9757575757575757, + "grad_norm": 3.0380086133675968, + "kl": 0.037750244140625, + "learning_rate": 2e-07, + "loss": 0.02129605710506439, + "memory(GiB)": 113.5, + "reward": 0.5166666716337204, + "reward_std": 0.2104335606098175, + "rewards/MultiModalAccuracyORM/mean": 0.5166666716337204, + "rewards/MultiModalAccuracyORM/std": 0.2104335606098175, + "step": 2415, + "train_speed(iter/s)": 0.031748 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/mean_length": 316.23334274291994, + "completions/min_length": 185.4, + "epoch": 0.9777777777777777, + "grad_norm": 3.286741279330765, + "kl": 0.03631591796875, + "learning_rate": 2e-07, + "loss": 0.01842118501663208, + "memory(GiB)": 113.5, + "reward": 0.4916666768491268, + "reward_std": 0.3266936391592026, + "rewards/MultiModalAccuracyORM/mean": 0.4916666768491268, + "rewards/MultiModalAccuracyORM/std": 0.3266936391592026, + "step": 2420, + "train_speed(iter/s)": 0.031765 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.4, + "completions/mean_length": 382.6333435058594, + "completions/min_length": 208.4, + "epoch": 0.9797979797979798, + "grad_norm": 3.1686862418479125, + "kl": 0.05205078125, + "learning_rate": 2e-07, + "loss": 0.011619596928358077, + "memory(GiB)": 113.5, + "reward": 0.3166666731238365, + "reward_std": 0.32526837289333344, + "rewards/MultiModalAccuracyORM/mean": 0.3166666731238365, + "rewards/MultiModalAccuracyORM/std": 0.32526837289333344, + "step": 2425, + "train_speed(iter/s)": 0.031766 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.9, + "completions/mean_length": 343.9250144958496, + "completions/min_length": 189.7, + "epoch": 0.9818181818181818, + "grad_norm": 1.5585214145494302, + "kl": 0.034375, + "learning_rate": 2e-07, + "loss": 0.0014587238430976868, + "memory(GiB)": 113.5, + "reward": 0.17500000596046447, + "reward_std": 0.3244759202003479, + "rewards/MultiModalAccuracyORM/mean": 0.17500000596046447, + "rewards/MultiModalAccuracyORM/std": 0.3244759202003479, + "step": 2430, + "train_speed(iter/s)": 0.031779 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.2, + "completions/mean_length": 378.7916778564453, + "completions/min_length": 209.2, + "epoch": 0.9838383838383838, + "grad_norm": 2.73232643290958, + "kl": 0.04532470703125, + "learning_rate": 2e-07, + "loss": 0.062485653162002566, + "memory(GiB)": 113.5, + "reward": 0.4666666828095913, + "reward_std": 0.4470617562532425, + "rewards/MultiModalAccuracyORM/mean": 0.4666666828095913, + "rewards/MultiModalAccuracyORM/std": 0.4470617562532425, + "step": 2435, + "train_speed(iter/s)": 0.031786 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 738.5, + "completions/mean_length": 341.6166717529297, + "completions/min_length": 163.9, + "epoch": 0.9858585858585859, + "grad_norm": 1.3853546154126857, + "kl": 0.0336669921875, + "learning_rate": 2e-07, + "loss": -0.028276541829109193, + "memory(GiB)": 113.5, + "reward": 0.14166667088866233, + "reward_std": 0.3000969380140305, + "rewards/MultiModalAccuracyORM/mean": 0.14166667088866233, + "rewards/MultiModalAccuracyORM/std": 0.3000969380140305, + "step": 2440, + "train_speed(iter/s)": 0.03179 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03333333333333333, + "completions/max_length": 914.9, + "completions/mean_length": 449.3500152587891, + "completions/min_length": 241.3, + "epoch": 0.9878787878787879, + "grad_norm": 2.2456582209413893, + "kl": 0.03409423828125, + "learning_rate": 2e-07, + "loss": -0.0006516605615615844, + "memory(GiB)": 113.5, + "reward": 0.2083333358168602, + "reward_std": 0.32050161957740786, + "rewards/MultiModalAccuracyORM/mean": 0.2083333358168602, + "rewards/MultiModalAccuracyORM/std": 0.32050161957740786, + "step": 2445, + "train_speed(iter/s)": 0.031782 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.008333333333333333, + "completions/max_length": 626.9, + "completions/mean_length": 322.7333419799805, + "completions/min_length": 203.0, + "epoch": 0.98989898989899, + "grad_norm": 2.6524648003013103, + "kl": 0.03165283203125, + "learning_rate": 2e-07, + "loss": -0.008733100444078445, + "memory(GiB)": 113.5, + "reward": 0.6416666708886624, + "reward_std": 0.15824586153030396, + "rewards/MultiModalAccuracyORM/mean": 0.6416666708886624, + "rewards/MultiModalAccuracyORM/std": 0.15824586153030396, + "step": 2450, + "train_speed(iter/s)": 0.031782 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.016666666666666666, + "completions/max_length": 881.4, + "completions/mean_length": 330.15001068115237, + "completions/min_length": 158.1, + "epoch": 0.9919191919191919, + "grad_norm": 1.7910216600269697, + "kl": 0.05638427734375, + "learning_rate": 2e-07, + "loss": -0.0065705299377441405, + "memory(GiB)": 113.5, + "reward": 0.2333333395421505, + "reward_std": 0.2815410792827606, + "rewards/MultiModalAccuracyORM/mean": 0.2333333395421505, + "rewards/MultiModalAccuracyORM/std": 0.2815410792827606, + "step": 2455, + "train_speed(iter/s)": 0.031777 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.016666666666666666, + "completions/max_length": 851.2, + "completions/mean_length": 378.46667633056643, + "completions/min_length": 185.4, + "epoch": 0.9939393939393939, + "grad_norm": 2.5045509391063856, + "kl": 0.04505615234375, + "learning_rate": 2e-07, + "loss": -0.008945465087890625, + "memory(GiB)": 113.5, + "reward": 0.40833334252238274, + "reward_std": 0.3794672876596451, + "rewards/MultiModalAccuracyORM/mean": 0.40833334252238274, + "rewards/MultiModalAccuracyORM/std": 0.3794672876596451, + "step": 2460, + "train_speed(iter/s)": 0.031779 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03333333333333333, + "completions/max_length": 879.8, + "completions/mean_length": 371.80834045410154, + "completions/min_length": 158.4, + "epoch": 0.9959595959595959, + "grad_norm": 3.572517250532036, + "kl": 0.051068115234375, + "learning_rate": 2e-07, + "loss": -0.013737475872039795, + "memory(GiB)": 113.5, + "reward": 0.31666667088866235, + "reward_std": 0.29408499896526336, + "rewards/MultiModalAccuracyORM/mean": 0.31666667088866235, + "rewards/MultiModalAccuracyORM/std": 0.29408499896526336, + "step": 2465, + "train_speed(iter/s)": 0.031766 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.03333333333333333, + "completions/max_length": 1149.5, + "completions/mean_length": 445.8500122070312, + "completions/min_length": 184.4, + "epoch": 0.997979797979798, + "grad_norm": 2.2904897334575254, + "kl": 0.04656982421875, + "learning_rate": 2e-07, + "loss": -0.032226094603538515, + "memory(GiB)": 113.5, + "reward": 0.3416666775941849, + "reward_std": 0.4094175934791565, + "rewards/MultiModalAccuracyORM/mean": 0.3416666775941849, + "rewards/MultiModalAccuracyORM/std": 0.4094175934791565, + "step": 2470, + "train_speed(iter/s)": 0.031754 + }, + { + "epoch": 1.0, + "grad_norm": 1.4731764005283963, + "learning_rate": 2e-07, + "loss": 0.061235594749450686, + "memory(GiB)": 113.5, + "step": 2475, + "train_speed(iter/s)": 0.031746 + }, + { + "epoch": 1.0, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.018333333333333333, + "eval_completions/max_length": 787.14, + "eval_completions/mean_length": 378.51834548950194, + "eval_completions/min_length": 186.72, + "eval_kl": 0.040185546875, + "eval_loss": 0.029814261943101883, + "eval_reward": 0.3483333396911621, + "eval_reward_std": 0.3004326641559601, + "eval_rewards/MultiModalAccuracyORM/mean": 0.3483333396911621, + "eval_rewards/MultiModalAccuracyORM/std": 0.3004326641559601, + "eval_runtime": 729.694, + "eval_samples_per_second": 0.069, + "eval_steps_per_second": 0.007, + "step": 2475 + } + ], + "logging_steps": 5, + "max_steps": 2475, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}