{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.015350136616215884, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 171.46875, "completions/min_length": 94.0, "epoch": 1.5350136616215885e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.6439049243927002, "kl": 0.0, "learning_rate": 7.67341927562922e-10, "loss": -0.005220063030719757, "memory(GiB)": 66.78, "reward": 0.6083196997642517, "reward_std": 0.09502242505550385, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8503483533859253, "rewards/PlanningActionSetORM/std": 0.11198445409536362, "rewards/RMReward/mean": 0.5478124618530273, "rewards/RMReward/std": 0.17501583695411682, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 1, "train_speed(iter/s)": 0.007965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/mean_length": 62.75, "completions/min_length": 9.0, "epoch": 3.070027323243177e-05, "frac_reward_zero_std": 0.0, "grad_norm": 13.04994010925293, "kl": 0.0, "learning_rate": 1.534683855125844e-09, "loss": -0.1081731989979744, "memory(GiB)": 70.84, "reward": 0.8214062452316284, "reward_std": 0.1765439212322235, "rewards/MathAnswerFormat/mean": 0.8125, "rewards/MathAnswerFormat/std": 0.40311288833618164, "rewards/PlanningActionSetORM/mean": 0.854687511920929, "rewards/PlanningActionSetORM/std": 0.08421581238508224, "rewards/RMReward/mean": 0.690625011920929, "rewards/RMReward/std": 0.23109792172908783, "rewards/SpatialReasoningORM/mean": 0.9249999523162842, "rewards/SpatialReasoningORM/std": 0.16124515235424042, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 2, "train_speed(iter/s)": 0.011589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/mean_length": 118.59375, "completions/min_length": 66.0, "epoch": 4.605040984864765e-05, "frac_reward_zero_std": 0.0, "grad_norm": 3.4015135765075684, "kl": 0.0004466597456485033, "learning_rate": 2.3020257826887663e-09, "loss": 0.028353292495012283, "memory(GiB)": 70.84, "reward": 0.6434758305549622, "reward_std": 0.08584562689065933, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7686290740966797, "rewards/PlanningActionSetORM/std": 0.16826193034648895, "rewards/RMReward/mean": 0.6121875047683716, "rewards/RMReward/std": 0.23330354690551758, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 3, "train_speed(iter/s)": 0.013065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/mean_length": 55.6875, "completions/min_length": 2.0, "epoch": 6.140054646486354e-05, "frac_reward_zero_std": 0.0, "grad_norm": 28.406143188476562, "kl": 0.00018094007100444287, "learning_rate": 3.069367710251688e-09, "loss": 0.03656066209077835, "memory(GiB)": 70.84, "reward": 0.3340460956096649, "reward_std": 0.10788336396217346, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": 0.13371719419956207, "rewards/VisualPerceptionAccuracy/std": 0.07326673716306686, "step": 4, "train_speed(iter/s)": 0.016481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 89.84375, "completions/min_length": 9.0, "epoch": 7.675068308107942e-05, "frac_reward_zero_std": 0.0, "grad_norm": 15.731834411621094, "kl": 0.0002569019852671772, "learning_rate": 3.8367096378146105e-09, "loss": -0.08646176755428314, "memory(GiB)": 70.84, "reward": 0.47304946184158325, "reward_std": 0.1811455637216568, "rewards/MathAnswerFormat/mean": 0.75, "rewards/MathAnswerFormat/std": 0.44721361994743347, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8624999523162842, "rewards/SpatialReasoningORM/std": 0.2801785171031952, "rewards/VisualPerceptionAccuracy/mean": 0.08922401070594788, "rewards/VisualPerceptionAccuracy/std": 0.07629070430994034, "step": 5, "train_speed(iter/s)": 0.019901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/mean_length": 59.15625, "completions/min_length": 9.0, "epoch": 9.21008196972953e-05, "frac_reward_zero_std": 0.0, "grad_norm": 15.050105094909668, "kl": 0.000460678682429716, "learning_rate": 4.6040515653775326e-09, "loss": -0.051835887134075165, "memory(GiB)": 70.84, "reward": 0.7484375238418579, "reward_std": 0.16370533406734467, "rewards/MathAnswerFormat/mean": 0.75, "rewards/MathAnswerFormat/std": 0.44721361994743347, "rewards/PlanningActionSetORM/mean": 0.7218749523162842, "rewards/PlanningActionSetORM/std": 0.09264002740383148, "rewards/RMReward/mean": 0.5750000476837158, "rewards/RMReward/std": 0.15705625712871552, "rewards/SpatialReasoningORM/mean": 0.8999999761581421, "rewards/SpatialReasoningORM/std": 0.17888543009757996, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 6, "train_speed(iter/s)": 0.020304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/mean_length": 258.46875, "completions/min_length": 130.0, "epoch": 0.00010745095631351119, "frac_reward_zero_std": 0.0, "grad_norm": 2.1093313694000244, "kl": 0.00032479516812600195, "learning_rate": 5.3713934929404555e-09, "loss": 0.039831630885601044, "memory(GiB)": 70.84, "reward": 0.13934914767742157, "reward_std": 0.09283407032489777, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.13934914767742157, "rewards/VisualPerceptionAccuracy/std": 0.12846265733242035, "step": 7, "train_speed(iter/s)": 0.022011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 196.65625, "completions/min_length": 83.0, "epoch": 0.00012280109292972708, "frac_reward_zero_std": 0.0, "grad_norm": 2.241070032119751, "kl": 0.00024401751579716802, "learning_rate": 6.138735420503376e-09, "loss": -0.00960574671626091, "memory(GiB)": 70.84, "reward": 0.38931921124458313, "reward_std": 0.15384644269943237, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8291666507720947, "rewards/PlanningActionSetORM/std": 0.09761033207178116, "rewards/RMReward/mean": 0.550000011920929, "rewards/RMReward/std": 0.13165612518787384, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.172805055975914, "rewards/VisualPerceptionAccuracy/std": 0.18633483350276947, "step": 8, "train_speed(iter/s)": 0.02068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/mean_length": 56.21875, "completions/min_length": 2.0, "epoch": 0.00013815122954594295, "frac_reward_zero_std": 0.0, "grad_norm": 39.13835525512695, "kl": 6.87121573719196e-05, "learning_rate": 6.906077348066299e-09, "loss": -0.011473052203655243, "memory(GiB)": 70.84, "reward": 0.5293750166893005, "reward_std": 0.17129796743392944, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.824999988079071, "rewards/PlanningActionSetORM/std": 0.12292726337909698, "rewards/RMReward/mean": 0.4937499761581421, "rewards/RMReward/std": 0.17876894772052765, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 9, "train_speed(iter/s)": 0.021471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/mean_length": 294.9375, "completions/min_length": 126.0, "epoch": 0.00015350136616215884, "frac_reward_zero_std": 0.0, "grad_norm": 1.6598471403121948, "kl": 0.000295196776278317, "learning_rate": 7.673419275629221e-09, "loss": -0.014461830258369446, "memory(GiB)": 74.56, "reward": 0.3696648180484772, "reward_std": 0.12604056298732758, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8622976541519165, "rewards/PlanningActionSetORM/std": 0.06692205369472504, "rewards/RMReward/mean": 0.574999988079071, "rewards/RMReward/std": 0.14832396805286407, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.10687004774808884, "rewards/VisualPerceptionAccuracy/std": 0.12838509678840637, "step": 10, "train_speed(iter/s)": 0.021285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/mean_length": 71.6875, "completions/min_length": 2.0, "epoch": 0.00016885150277837473, "frac_reward_zero_std": 0.0, "grad_norm": 35.04463577270508, "kl": 0.00013995537301525474, "learning_rate": 8.440761203192144e-09, "loss": -0.01680171489715576, "memory(GiB)": 74.56, "reward": 0.5192690491676331, "reward_std": 0.18369005620479584, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5192690491676331, "rewards/VisualPerceptionAccuracy/std": 0.4663350582122803, "step": 11, "train_speed(iter/s)": 0.02297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 168.09375, "completions/min_length": 91.0, "epoch": 0.0001842016393945906, "frac_reward_zero_std": 0.0, "grad_norm": 3.068265438079834, "kl": 0.0004193384665995836, "learning_rate": 9.208103130755065e-09, "loss": -0.05210035294294357, "memory(GiB)": 74.56, "reward": 0.3758252263069153, "reward_std": 0.1342829465866089, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8100818395614624, "rewards/PlanningActionSetORM/std": 0.11051847785711288, "rewards/RMReward/mean": 0.5406249761581421, "rewards/RMReward/std": 0.10036392509937286, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1571340262889862, "rewards/VisualPerceptionAccuracy/std": 0.17745813727378845, "step": 12, "train_speed(iter/s)": 0.022399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.28125, "completions/min_length": 2.0, "epoch": 0.0001995517760108065, "frac_reward_zero_std": 0.0, "grad_norm": 93.41316223144531, "kl": 0.00030381945543922484, "learning_rate": 9.975445058317988e-09, "loss": -0.09939659386873245, "memory(GiB)": 74.56, "reward": 0.4818750023841858, "reward_std": 0.3661068081855774, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.22500000894069672, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": 0.75, "rewards/VisualPerceptionAccuracy/std": 0.44721361994743347, "step": 13, "train_speed(iter/s)": 0.02408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/mean_length": 67.78125, "completions/min_length": 8.0, "epoch": 0.00021490191262702238, "frac_reward_zero_std": 0.0, "grad_norm": 4.099529266357422, "kl": 0.00012786286242771894, "learning_rate": 1.0742786985880911e-08, "loss": -0.05250461399555206, "memory(GiB)": 74.56, "reward": 0.2827959358692169, "reward_std": 0.17935225367546082, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8592092990875244, "rewards/PlanningActionSetORM/std": 0.12379583716392517, "rewards/RMReward/mean": 0.359375, "rewards/RMReward/std": 0.13193275034427643, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 14, "train_speed(iter/s)": 0.023807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/mean_length": 52.0625, "completions/min_length": 2.0, "epoch": 0.00023025204924323828, "frac_reward_zero_std": 0.0, "grad_norm": 64.00300598144531, "kl": 0.0008142703445628285, "learning_rate": 1.1510128913443832e-08, "loss": -0.029547356069087982, "memory(GiB)": 74.56, "reward": 0.6258958578109741, "reward_std": 0.1712857186794281, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7958333492279053, "rewards/PlanningActionSetORM/std": 0.08845379203557968, "rewards/RMReward/mean": 0.7868750095367432, "rewards/RMReward/std": 0.13123612105846405, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 15, "train_speed(iter/s)": 0.02384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/mean_length": 50.875, "completions/min_length": 2.0, "epoch": 0.00024560218585945417, "frac_reward_zero_std": 0.0, "grad_norm": 51.15801239013672, "kl": 5.5475444241892546e-05, "learning_rate": 1.2277470841006752e-08, "loss": 0.05699886381626129, "memory(GiB)": 74.56, "reward": 0.6021875143051147, "reward_std": 0.16578692197799683, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.890625, "rewards/PlanningActionSetORM/std": 0.08974629640579224, "rewards/RMReward/mean": 0.659375011920929, "rewards/RMReward/std": 0.16453848779201508, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 16, "train_speed(iter/s)": 0.02374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/mean_length": 3.09375, "completions/min_length": 2.0, "epoch": 0.00026095232247567003, "frac_reward_zero_std": 0.0, "grad_norm": 44.94822692871094, "kl": 0.0011160714784637094, "learning_rate": 1.3044812768569675e-08, "loss": 0.04431448131799698, "memory(GiB)": 74.56, "reward": 0.5165625214576721, "reward_std": 0.1685960292816162, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5437500476837158, "rewards/SpatialReasoningORM/std": 0.17768675088882446, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 17, "train_speed(iter/s)": 0.025088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/mean_length": 109.84375, "completions/min_length": 62.0, "epoch": 0.0002763024590918859, "frac_reward_zero_std": 0.0, "grad_norm": 3.1549973487854004, "kl": 0.00017547917377669364, "learning_rate": 1.3812154696132598e-08, "loss": 0.0023469068109989166, "memory(GiB)": 74.56, "reward": 0.3477204740047455, "reward_std": 0.1505986452102661, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.824999988079071, "rewards/PlanningActionSetORM/std": 0.1538698375225067, "rewards/RMReward/mean": 0.48750001192092896, "rewards/RMReward/std": 0.10723806172609329, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1404409557580948, "rewards/VisualPerceptionAccuracy/std": 0.1991497427225113, "step": 18, "train_speed(iter/s)": 0.024761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/mean_length": 84.5625, "completions/min_length": 2.0, "epoch": 0.0002916525957081018, "frac_reward_zero_std": 0.0, "grad_norm": 28.165565490722656, "kl": 0.00015484774485230446, "learning_rate": 1.4579496623695519e-08, "loss": -0.04130588844418526, "memory(GiB)": 74.56, "reward": 0.5794588327407837, "reward_std": 0.1388310343027115, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8852137327194214, "rewards/PlanningActionSetORM/std": 0.09414609521627426, "rewards/RMReward/mean": 0.559374988079071, "rewards/RMReward/std": 0.1551544964313507, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 19, "train_speed(iter/s)": 0.023122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/mean_length": 116.375, "completions/min_length": 57.0, "epoch": 0.0003070027323243177, "frac_reward_zero_std": 0.0, "grad_norm": 2.1256558895111084, "kl": 2.7268815756542608e-05, "learning_rate": 1.5346838551258442e-08, "loss": 0.07961555570363998, "memory(GiB)": 74.56, "reward": 0.6395330429077148, "reward_std": 0.11179035156965256, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8239149451255798, "rewards/PlanningActionSetORM/std": 0.12152258306741714, "rewards/RMReward/mean": 0.5934375524520874, "rewards/RMReward/std": 0.1561349630355835, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 20, "train_speed(iter/s)": 0.0216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.28125, "completions/min_length": 2.0, "epoch": 0.00032235286894053355, "frac_reward_zero_std": 0.0, "grad_norm": 80.61385345458984, "kl": 0.0019965278916060925, "learning_rate": 1.6114180478821365e-08, "loss": -0.06285464763641357, "memory(GiB)": 74.56, "reward": 0.37406250834465027, "reward_std": 0.21375000476837158, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.39375001192092896, "rewards/SpatialReasoningORM/std": 0.2895352244377136, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 21, "train_speed(iter/s)": 0.022578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/mean_length": 95.4375, "completions/min_length": 12.0, "epoch": 0.00033770300555674947, "frac_reward_zero_std": 0.0, "grad_norm": 4.1570611000061035, "kl": 0.00013559102080762386, "learning_rate": 1.6881522406384288e-08, "loss": -0.05971769988536835, "memory(GiB)": 74.56, "reward": 0.7701696157455444, "reward_std": 0.20829859375953674, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8704466819763184, "rewards/PlanningActionSetORM/std": 0.07358107715845108, "rewards/RMReward/mean": 0.5656249523162842, "rewards/RMReward/std": 0.19554944336414337, "rewards/SpatialReasoningORM/mean": 0.9125000238418579, "rewards/SpatialReasoningORM/std": 0.26299554109573364, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 22, "train_speed(iter/s)": 0.019943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.46875, "completions/min_length": 2.0, "epoch": 0.00035305314217296533, "frac_reward_zero_std": 0.0, "grad_norm": 86.50675201416016, "kl": 8.877841173671186e-05, "learning_rate": 1.7648864333947207e-08, "loss": -0.03406687080860138, "memory(GiB)": 74.56, "reward": 0.4453125, "reward_std": 0.21375000476837158, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.46875, "rewards/SpatialReasoningORM/std": 0.2520080804824829, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 23, "train_speed(iter/s)": 0.020779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/mean_length": 62.4375, "completions/min_length": 8.0, "epoch": 0.0003684032787891812, "frac_reward_zero_std": 0.0, "grad_norm": 6.273565769195557, "kl": 0.0006925835623405874, "learning_rate": 1.841620626151013e-08, "loss": 0.003011047840118408, "memory(GiB)": 74.56, "reward": 0.8189583420753479, "reward_std": 0.16511324048042297, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8083333373069763, "rewards/PlanningActionSetORM/std": 0.06324554979801178, "rewards/RMReward/mean": 0.703125, "rewards/RMReward/std": 0.09393038600683212, "rewards/SpatialReasoningORM/mean": 0.9125000238418579, "rewards/SpatialReasoningORM/std": 0.26299554109573364, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 24, "train_speed(iter/s)": 0.020804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 231.78125, "completions/min_length": 84.0, "epoch": 0.0003837534154053971, "frac_reward_zero_std": 0.0, "grad_norm": 2.3816356658935547, "kl": 0.00018827947496902198, "learning_rate": 1.9183548189073053e-08, "loss": 0.1166524887084961, "memory(GiB)": 74.56, "reward": 0.4689823091030121, "reward_std": 0.18674570322036743, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.854687511920929, "rewards/PlanningActionSetORM/std": 0.09138688445091248, "rewards/RMReward/mean": 0.6031249761581421, "rewards/RMReward/std": 0.148849755525589, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.28452712297439575, "rewards/VisualPerceptionAccuracy/std": 0.24678021669387817, "step": 25, "train_speed(iter/s)": 0.020547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 112.875, "completions/min_length": 2.0, "epoch": 0.000399103552021613, "frac_reward_zero_std": 0.0, "grad_norm": 47.62897491455078, "kl": 0.0002811551094055176, "learning_rate": 1.9950890116635976e-08, "loss": 0.01882201060652733, "memory(GiB)": 74.56, "reward": 0.495017945766449, "reward_std": 0.1514635980129242, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.822054386138916, "rewards/PlanningActionSetORM/std": 0.12903809547424316, "rewards/RMReward/mean": 0.453125, "rewards/RMReward/std": 0.09568829089403152, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 26, "train_speed(iter/s)": 0.020231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 90.0, "completions/min_length": 12.0, "epoch": 0.0004144536886378289, "frac_reward_zero_std": 0.0, "grad_norm": 4.569225788116455, "kl": 8.136236283462495e-05, "learning_rate": 2.0718232044198896e-08, "loss": -0.01788545399904251, "memory(GiB)": 74.56, "reward": 0.7211570143699646, "reward_std": 0.20564742386341095, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8771949410438538, "rewards/PlanningActionSetORM/std": 0.0961606353521347, "rewards/RMReward/mean": 0.515625, "rewards/RMReward/std": 0.09953014552593231, "rewards/SpatialReasoningORM/mean": 0.8500000238418579, "rewards/SpatialReasoningORM/std": 0.3464101552963257, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 27, "train_speed(iter/s)": 0.020234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 14.25, "completions/min_length": 8.0, "epoch": 0.00042980382525404477, "frac_reward_zero_std": 0.0, "grad_norm": 9.211813926696777, "kl": 9.697020868770778e-05, "learning_rate": 2.1485573971761822e-08, "loss": -0.0068480633199214935, "memory(GiB)": 74.56, "reward": 0.46531248092651367, "reward_std": 0.4704556167125702, "rewards/MathAnswerFormat/mean": 0.875, "rewards/MathAnswerFormat/std": 0.33601075410842896, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.4437499940395355, "rewards/SpatialReasoningORM/std": 0.48919782042503357, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 28, "train_speed(iter/s)": 0.020874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/mean_length": 45.3125, "completions/min_length": 2.0, "epoch": 0.00044515396187026063, "frac_reward_zero_std": 0.0, "grad_norm": 30.062990188598633, "kl": 0.0001500367361586541, "learning_rate": 2.225291589932474e-08, "loss": -0.08666250109672546, "memory(GiB)": 74.56, "reward": 0.6151562929153442, "reward_std": 0.1641741842031479, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8671875, "rewards/PlanningActionSetORM/std": 0.20028842985630035, "rewards/RMReward/mean": 0.6531250476837158, "rewards/RMReward/std": 0.19362226128578186, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 29, "train_speed(iter/s)": 0.020826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/mean_length": 70.25, "completions/min_length": 3.0, "epoch": 0.00046050409848647655, "frac_reward_zero_std": 0.5, "grad_norm": 1.2602778673171997, "kl": 1.41699674713891e-05, "learning_rate": 2.3020257826887664e-08, "loss": 0.00800991803407669, "memory(GiB)": 74.56, "reward": 0.6820312738418579, "reward_std": 0.03544161468744278, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8203125, "rewards/PlanningActionSetORM/std": 0.097181037068367, "rewards/RMReward/mean": 0.7874999642372131, "rewards/RMReward/std": 0.08266398310661316, "rewards/SpatialReasoningORM/mean": 0.6000000238418579, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 30, "train_speed(iter/s)": 0.020207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/mean_length": 55.21875, "completions/min_length": 2.0, "epoch": 0.0004758542351026924, "frac_reward_zero_std": 0.0, "grad_norm": 34.869468688964844, "kl": -0.0007516290061175823, "learning_rate": 2.3787599754450584e-08, "loss": 0.04839706793427467, "memory(GiB)": 74.56, "reward": 0.6120312213897705, "reward_std": 0.1127019077539444, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8609374761581421, "rewards/PlanningActionSetORM/std": 0.1296619176864624, "rewards/RMReward/mean": 0.6468749642372131, "rewards/RMReward/std": 0.10403324663639069, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 31, "train_speed(iter/s)": 0.020342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2049.0, "completions/mean_length": 488.46875, "completions/min_length": 14.0, "epoch": 0.0004912043717189083, "frac_reward_zero_std": 0.0, "grad_norm": 5.0215559005737305, "kl": 0.00011140385322505608, "learning_rate": 2.4554941682013504e-08, "loss": 0.06015278398990631, "memory(GiB)": 74.56, "reward": 0.35019341111183167, "reward_std": 0.3668678104877472, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": 0.17538684606552124, "rewards/VisualPerceptionAccuracy/std": 0.2431577444076538, "step": 32, "train_speed(iter/s)": 0.019934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 105.46875, "completions/min_length": 78.0, "epoch": 0.0005065545083351242, "frac_reward_zero_std": 0.0, "grad_norm": 3.0038836002349854, "kl": 0.00042627734364941716, "learning_rate": 2.532228360957643e-08, "loss": 0.03598878160119057, "memory(GiB)": 74.56, "reward": 0.6217812895774841, "reward_std": 0.10193012654781342, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8351562023162842, "rewards/PlanningActionSetORM/std": 0.10115227103233337, "rewards/RMReward/mean": 0.5684375166893005, "rewards/RMReward/std": 0.12051258981227875, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 33, "train_speed(iter/s)": 0.019709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 129.375, "completions/min_length": 63.0, "epoch": 0.0005219046449513401, "frac_reward_zero_std": 0.0, "grad_norm": 3.108196973800659, "kl": 0.00015249731950461864, "learning_rate": 2.608962553713935e-08, "loss": -0.04906691983342171, "memory(GiB)": 74.56, "reward": 0.5049455165863037, "reward_std": 0.14439092576503754, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8869791626930237, "rewards/PlanningActionSetORM/std": 0.1471327543258667, "rewards/RMReward/mean": 0.7093750238418579, "rewards/RMReward/std": 0.10834936797618866, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2649952173233032, "rewards/VisualPerceptionAccuracy/std": 0.19434969127178192, "step": 34, "train_speed(iter/s)": 0.019708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 114.71875, "completions/min_length": 49.0, "epoch": 0.0005372547815675559, "frac_reward_zero_std": 0.0, "grad_norm": 3.471479654312134, "kl": 0.0002840460219886154, "learning_rate": 2.6856967464702276e-08, "loss": -0.0877828299999237, "memory(GiB)": 74.56, "reward": 0.7825223207473755, "reward_std": 0.10029308497905731, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9063615798950195, "rewards/PlanningActionSetORM/std": 0.1269582211971283, "rewards/RMReward/mean": 0.7515625357627869, "rewards/RMReward/std": 0.11534266918897629, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 35, "train_speed(iter/s)": 0.019766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/mean_length": 56.09375, "completions/min_length": 2.0, "epoch": 0.0005526049181837718, "frac_reward_zero_std": 0.0, "grad_norm": 51.54420852661133, "kl": 8.37029074318707e-05, "learning_rate": 2.7624309392265195e-08, "loss": 0.007109135389328003, "memory(GiB)": 74.56, "reward": 0.5402708053588867, "reward_std": 0.1897229254245758, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7958333492279053, "rewards/PlanningActionSetORM/std": 0.06101001799106598, "rewards/RMReward/mean": 0.6618750095367432, "rewards/RMReward/std": 0.12802180647850037, "rewards/SpatialReasoningORM/mean": 0.4125000238418579, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 36, "train_speed(iter/s)": 0.019741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 163.3125, "completions/min_length": 81.0, "epoch": 0.0005679550547999878, "frac_reward_zero_std": 0.0, "grad_norm": 2.249736785888672, "kl": 0.0001282807206735015, "learning_rate": 2.8391651319828118e-08, "loss": -0.02889895625412464, "memory(GiB)": 74.56, "reward": 0.6317557096481323, "reward_std": 0.10724844038486481, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8150286078453064, "rewards/PlanningActionSetORM/std": 0.16118313372135162, "rewards/RMReward/mean": 0.5859375, "rewards/RMReward/std": 0.22260712087154388, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 37, "train_speed(iter/s)": 0.018981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/mean_length": 61.375, "completions/min_length": 2.0, "epoch": 0.0005833051914162036, "frac_reward_zero_std": 0.0, "grad_norm": 55.96392822265625, "kl": 5.035347930970602e-05, "learning_rate": 2.9158993247391038e-08, "loss": -0.08906198292970657, "memory(GiB)": 74.56, "reward": 0.4391555190086365, "reward_std": 0.20495834946632385, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7696800827980042, "rewards/PlanningActionSetORM/std": 0.16446468234062195, "rewards/RMReward/mean": 0.59375, "rewards/RMReward/std": 0.1276388168334961, "rewards/SpatialReasoningORM/mean": 0.26250001788139343, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 38, "train_speed(iter/s)": 0.019053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 9.28125, "completions/min_length": 2.0, "epoch": 0.0005986553280324195, "frac_reward_zero_std": 0.0, "grad_norm": 23.317447662353516, "kl": -2.2194602934177965e-05, "learning_rate": 2.9926335174953964e-08, "loss": -0.01588663086295128, "memory(GiB)": 74.56, "reward": 0.5043749809265137, "reward_std": 0.125, "rewards/MathAnswerFormat/mean": 0.46875, "rewards/MathAnswerFormat/std": 0.507007360458374, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5062500238418579, "rewards/SpatialReasoningORM/std": 0.49248382449150085, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 39, "train_speed(iter/s)": 0.019484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 179.3125, "completions/min_length": 120.0, "epoch": 0.0006140054646486354, "frac_reward_zero_std": 0.0, "grad_norm": 2.869744300842285, "kl": 8.743777289055288e-05, "learning_rate": 3.0693677102516884e-08, "loss": 0.04952360689640045, "memory(GiB)": 74.56, "reward": 0.6350415945053101, "reward_std": 0.11919644474983215, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8452080488204956, "rewards/PlanningActionSetORM/std": 0.1337539702653885, "rewards/RMReward/mean": 0.5824999809265137, "rewards/RMReward/std": 0.1288660168647766, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 40, "train_speed(iter/s)": 0.019317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 206.34375, "completions/min_length": 86.0, "epoch": 0.0006293556012648512, "frac_reward_zero_std": 0.0, "grad_norm": 2.366194725036621, "kl": 0.00041389258694835007, "learning_rate": 3.146101903007981e-08, "loss": -0.09540648013353348, "memory(GiB)": 74.56, "reward": 0.6623520851135254, "reward_std": 0.11996833980083466, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.799260675907135, "rewards/PlanningActionSetORM/std": 0.17727366089820862, "rewards/RMReward/mean": 0.628125011920929, "rewards/RMReward/std": 0.16260851919651031, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 41, "train_speed(iter/s)": 0.019115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/mean_length": 113.0, "completions/min_length": 85.0, "epoch": 0.0006447057378810671, "frac_reward_zero_std": 0.0, "grad_norm": 2.5600149631500244, "kl": 0.00018218421610072255, "learning_rate": 3.222836095764273e-08, "loss": -0.001065429300069809, "memory(GiB)": 74.56, "reward": 0.7871905565261841, "reward_std": 0.07638944685459137, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8872023820877075, "rewards/PlanningActionSetORM/std": 0.079099141061306, "rewards/RMReward/mean": 0.7621874809265137, "rewards/RMReward/std": 0.09075789898633957, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 42, "train_speed(iter/s)": 0.019111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 105.0625, "completions/min_length": 2.0, "epoch": 0.0006600558744972831, "frac_reward_zero_std": 0.0, "grad_norm": 89.39250946044922, "kl": 0.00013227242743596435, "learning_rate": 3.299570288520565e-08, "loss": 0.017392326146364212, "memory(GiB)": 74.56, "reward": 0.18990503251552582, "reward_std": 0.18608039617538452, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.26250001788139343, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": 0.1304350644350052, "rewards/VisualPerceptionAccuracy/std": 0.0801226869225502, "step": 43, "train_speed(iter/s)": 0.019162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/mean_length": 79.125, "completions/min_length": 3.0, "epoch": 0.0006754060111134989, "frac_reward_zero_std": 0.0, "grad_norm": 41.29188537597656, "kl": 8.725299267098308e-06, "learning_rate": 3.3763044812768575e-08, "loss": -0.04470200091600418, "memory(GiB)": 74.56, "reward": 0.5123640894889832, "reward_std": 0.17156317830085754, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8355159163475037, "rewards/PlanningActionSetORM/std": 0.08903082460165024, "rewards/RMReward/mean": 0.4931250214576721, "rewards/RMReward/std": 0.14008182287216187, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 44, "train_speed(iter/s)": 0.018956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 8.40625, "completions/min_length": 2.0, "epoch": 0.0006907561477297148, "frac_reward_zero_std": 0.0, "grad_norm": 38.78861618041992, "kl": 9.34829076868482e-05, "learning_rate": 3.4530386740331495e-08, "loss": 0.048999637365341187, "memory(GiB)": 74.56, "reward": 0.5712499618530273, "reward_std": 0.33484601974487305, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5750000476837158, "rewards/SpatialReasoningORM/std": 0.3793032765388489, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 45, "train_speed(iter/s)": 0.01933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 92.375, "completions/min_length": 8.0, "epoch": 0.0007061062843459307, "frac_reward_zero_std": 0.0, "grad_norm": 16.458236694335938, "kl": 0.00036403670674189925, "learning_rate": 3.5297728667894415e-08, "loss": -0.10070043057203293, "memory(GiB)": 74.56, "reward": 0.7075357437133789, "reward_std": 0.21786822378635406, "rewards/MathAnswerFormat/mean": 0.75, "rewards/MathAnswerFormat/std": 0.44721361994743347, "rewards/PlanningActionSetORM/mean": 0.7128573656082153, "rewards/PlanningActionSetORM/std": 0.1629800647497177, "rewards/RMReward/mean": 0.59375, "rewards/RMReward/std": 0.09810708463191986, "rewards/SpatialReasoningORM/mean": 0.7999999523162842, "rewards/SpatialReasoningORM/std": 0.35023805499076843, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 46, "train_speed(iter/s)": 0.019352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 8.75, "completions/min_length": 2.0, "epoch": 0.0007214564209621465, "frac_reward_zero_std": 0.0, "grad_norm": 64.55854034423828, "kl": 0.00014001716044731438, "learning_rate": 3.606507059545734e-08, "loss": -0.12120739370584488, "memory(GiB)": 74.56, "reward": 0.36937499046325684, "reward_std": 0.38778895139694214, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.36250001192092896, "rewards/SpatialReasoningORM/std": 0.43828845024108887, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 47, "train_speed(iter/s)": 0.019711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 111.25, "completions/min_length": 52.0, "epoch": 0.0007368065575783624, "frac_reward_zero_std": 0.0, "grad_norm": 4.010896682739258, "kl": 0.0004115339834243059, "learning_rate": 3.683241252302026e-08, "loss": -0.03696002811193466, "memory(GiB)": 74.56, "reward": 0.5649553537368774, "reward_std": 0.09591395407915115, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.774776816368103, "rewards/PlanningActionSetORM/std": 0.19071711599826813, "rewards/RMReward/mean": 0.512499988079071, "rewards/RMReward/std": 0.178253173828125, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 48, "train_speed(iter/s)": 0.019314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/mean_length": 110.125, "completions/min_length": 74.0, "epoch": 0.0007521566941945784, "frac_reward_zero_std": 0.0, "grad_norm": 3.010282039642334, "kl": 0.0003967389930039644, "learning_rate": 3.759975445058318e-08, "loss": -0.03775809705257416, "memory(GiB)": 74.56, "reward": 0.3337979018688202, "reward_std": 0.09125732630491257, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7691963911056519, "rewards/PlanningActionSetORM/std": 0.09637895971536636, "rewards/RMReward/mean": 0.59375, "rewards/RMReward/std": 0.125, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.03875652328133583, "rewards/VisualPerceptionAccuracy/std": 0.08941391855478287, "step": 49, "train_speed(iter/s)": 0.019266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 152.15625, "completions/min_length": 76.0, "epoch": 0.0007675068308107942, "frac_reward_zero_std": 0.0, "grad_norm": 1.9200758934020996, "kl": 5.987838812870905e-05, "learning_rate": 3.8367096378146106e-08, "loss": 0.0124688521027565, "memory(GiB)": 74.56, "reward": 0.6147935390472412, "reward_std": 0.1049710363149643, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.855217456817627, "rewards/PlanningActionSetORM/std": 0.11035355925559998, "rewards/RMReward/mean": 0.5546875, "rewards/RMReward/std": 0.13461975753307343, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 50, "train_speed(iter/s)": 0.019139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/mean_length": 63.53125, "completions/min_length": 2.0, "epoch": 0.0007828569674270101, "frac_reward_zero_std": 0.0, "grad_norm": 41.37568283081055, "kl": 9.253063035430387e-05, "learning_rate": 3.913443830570903e-08, "loss": 0.022256948053836823, "memory(GiB)": 74.56, "reward": 0.5816406607627869, "reward_std": 0.15703155100345612, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8601562976837158, "rewards/PlanningActionSetORM/std": 0.14173991978168488, "rewards/RMReward/mean": 0.6156250238418579, "rewards/RMReward/std": 0.14458994567394257, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 51, "train_speed(iter/s)": 0.019161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/mean_length": 104.21875, "completions/min_length": 75.0, "epoch": 0.000798207104043226, "frac_reward_zero_std": 0.0, "grad_norm": 2.855074167251587, "kl": 0.0003993879072368145, "learning_rate": 3.990178023327195e-08, "loss": -0.004363805055618286, "memory(GiB)": 74.56, "reward": 0.7010416984558105, "reward_std": 0.09478209912776947, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8239582777023315, "rewards/PlanningActionSetORM/std": 0.11065036058425903, "rewards/RMReward/mean": 0.6703125238418579, "rewards/RMReward/std": 0.10840439051389694, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 52, "train_speed(iter/s)": 0.018948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 112.78125, "completions/min_length": 77.0, "epoch": 0.0008135572406594418, "frac_reward_zero_std": 0.0, "grad_norm": 2.8230040073394775, "kl": 0.00021881239081267267, "learning_rate": 4.066912216083487e-08, "loss": -0.04131322354078293, "memory(GiB)": 74.56, "reward": 0.2625662386417389, "reward_std": 0.10582676529884338, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.706250011920929, "rewards/PlanningActionSetORM/std": 0.10057703405618668, "rewards/RMReward/mean": 0.4468749761581421, "rewards/RMReward/std": 0.152171790599823, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.026382487267255783, "rewards/VisualPerceptionAccuracy/std": 0.09163423627614975, "step": 53, "train_speed(iter/s)": 0.018901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/mean_length": 93.375, "completions/min_length": 2.0, "epoch": 0.0008289073772756578, "frac_reward_zero_std": 0.0, "grad_norm": 46.0244026184082, "kl": 0.0003166758397128433, "learning_rate": 4.143646408839779e-08, "loss": 0.0026415474712848663, "memory(GiB)": 74.56, "reward": 0.49659407138824463, "reward_std": 0.19913743436336517, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8846906423568726, "rewards/PlanningActionSetORM/std": 0.08303117752075195, "rewards/RMReward/mean": 0.574999988079071, "rewards/RMReward/std": 0.13165612518787384, "rewards/SpatialReasoningORM/mean": 0.3750000298023224, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 54, "train_speed(iter/s)": 0.018877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/mean_length": 71.6875, "completions/min_length": 10.0, "epoch": 0.0008442575138918737, "frac_reward_zero_std": 0.0, "grad_norm": 4.854836463928223, "kl": 5.3892232244834304e-05, "learning_rate": 4.220380601596071e-08, "loss": -0.008120701648294926, "memory(GiB)": 74.56, "reward": 0.8306249976158142, "reward_std": 0.2081890106201172, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.921875, "rewards/PlanningActionSetORM/std": 0.04376653581857681, "rewards/RMReward/mean": 0.778124988079071, "rewards/RMReward/std": 0.1032291129231453, "rewards/SpatialReasoningORM/mean": 0.8500000238418579, "rewards/SpatialReasoningORM/std": 0.3464101552963257, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 55, "train_speed(iter/s)": 0.018925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/mean_length": 57.09375, "completions/min_length": 9.0, "epoch": 0.0008596076505080895, "frac_reward_zero_std": 0.0, "grad_norm": 6.719241619110107, "kl": 0.00020634793327189982, "learning_rate": 4.2971147943523644e-08, "loss": -0.005203314125537872, "memory(GiB)": 74.56, "reward": 0.8323437571525574, "reward_std": 0.18455752730369568, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.9515625238418579, "rewards/PlanningActionSetORM/std": 0.05735035613179207, "rewards/RMReward/mean": 0.7749999761581421, "rewards/RMReward/std": 0.04082484170794487, "rewards/SpatialReasoningORM/mean": 0.8500000238418579, "rewards/SpatialReasoningORM/std": 0.3464101552963257, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 56, "train_speed(iter/s)": 0.018952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/mean_length": 354.3125, "completions/min_length": 103.0, "epoch": 0.0008749577871243054, "frac_reward_zero_std": 0.0, "grad_norm": 1.7548335790634155, "kl": 0.00021877250401303172, "learning_rate": 4.3738489871086563e-08, "loss": -0.008655533194541931, "memory(GiB)": 74.56, "reward": 0.39088135957717896, "reward_std": 0.13952192664146423, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8394650220870972, "rewards/PlanningActionSetORM/std": 0.11288365721702576, "rewards/RMReward/mean": 0.5562499761581421, "rewards/RMReward/std": 0.1376892775297165, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.16886970400810242, "rewards/VisualPerceptionAccuracy/std": 0.15545348823070526, "step": 57, "train_speed(iter/s)": 0.018921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 67.53125, "completions/min_length": 8.0, "epoch": 0.0008903079237405213, "frac_reward_zero_std": 0.0, "grad_norm": 16.299053192138672, "kl": 5.2117553423158824e-05, "learning_rate": 4.450583179864948e-08, "loss": -0.029804818332195282, "memory(GiB)": 74.56, "reward": 0.8627194762229919, "reward_std": 0.10290978848934174, "rewards/MathAnswerFormat/mean": 0.875, "rewards/MathAnswerFormat/std": 0.3415650427341461, "rewards/PlanningActionSetORM/mean": 0.8334449529647827, "rewards/PlanningActionSetORM/std": 0.10079808533191681, "rewards/RMReward/mean": 0.765625, "rewards/RMReward/std": 0.05977387726306915, "rewards/SpatialReasoningORM/mean": 0.949999988079071, "rewards/SpatialReasoningORM/std": 0.1366260051727295, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 58, "train_speed(iter/s)": 0.018969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 88.21875, "completions/min_length": 2.0, "epoch": 0.0009056580603567371, "frac_reward_zero_std": 0.0, "grad_norm": 28.269424438476562, "kl": 0.00010144778934773058, "learning_rate": 4.52731737262124e-08, "loss": -0.04423141106963158, "memory(GiB)": 74.56, "reward": 0.02347831055521965, "reward_std": 0.08490069210529327, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.03750000149011612, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": 0.011331619694828987, "rewards/VisualPerceptionAccuracy/std": 0.027301384136080742, "step": 59, "train_speed(iter/s)": 0.019229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.53125, "completions/min_length": 2.0, "epoch": 0.0009210081969729531, "frac_reward_zero_std": 0.0, "grad_norm": 42.2840690612793, "kl": 0.0, "learning_rate": 4.604051565377533e-08, "loss": 0.04163754731416702, "memory(GiB)": 74.56, "reward": 0.5165625214576721, "reward_std": 0.1685960292816162, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5437500476837158, "rewards/SpatialReasoningORM/std": 0.17768675088882446, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 60, "train_speed(iter/s)": 0.019531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 113.0625, "completions/min_length": 2.0, "epoch": 0.000936358333589169, "frac_reward_zero_std": 0.0, "grad_norm": 50.345394134521484, "kl": 0.00044172193156555295, "learning_rate": 4.680785758133825e-08, "loss": -0.10938027501106262, "memory(GiB)": 74.56, "reward": 0.6050000190734863, "reward_std": 0.15403063595294952, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.859375, "rewards/PlanningActionSetORM/std": 0.11278770118951797, "rewards/RMReward/mean": 0.71875, "rewards/RMReward/std": 0.07719024270772934, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 61, "train_speed(iter/s)": 0.019405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/mean_length": 53.34375, "completions/min_length": 2.0, "epoch": 0.0009517084702053848, "frac_reward_zero_std": 0.0, "grad_norm": 72.07744598388672, "kl": 0.00019370345398783684, "learning_rate": 4.757519950890117e-08, "loss": 0.08670385181903839, "memory(GiB)": 74.56, "reward": 0.4730878174304962, "reward_std": 0.1943226456642151, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7277529835700989, "rewards/PlanningActionSetORM/std": 0.10906452685594559, "rewards/RMReward/mean": 0.6000000238418579, "rewards/RMReward/std": 0.1154700517654419, "rewards/SpatialReasoningORM/mean": 0.3375000059604645, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 62, "train_speed(iter/s)": 0.019396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/mean_length": 55.34375, "completions/min_length": 9.0, "epoch": 0.0009670586068216007, "frac_reward_zero_std": 0.0, "grad_norm": 9.797662734985352, "kl": 5.900153337279335e-05, "learning_rate": 4.834254143646409e-08, "loss": -0.0400274284183979, "memory(GiB)": 74.56, "reward": 0.8781249523162842, "reward_std": 0.1235293373465538, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8031250238418579, "rewards/PlanningActionSetORM/std": 0.09573666006326675, "rewards/RMReward/mean": 0.778124988079071, "rewards/RMReward/std": 0.15913176536560059, "rewards/SpatialReasoningORM/mean": 0.9750000238418579, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 63, "train_speed(iter/s)": 0.019271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 127.75, "completions/min_length": 69.0, "epoch": 0.0009824087434378167, "frac_reward_zero_std": 0.0, "grad_norm": 2.7171733379364014, "kl": 0.00022787405760027468, "learning_rate": 4.910988336402701e-08, "loss": -0.027487270534038544, "memory(GiB)": 74.56, "reward": 0.3710458278656006, "reward_std": 0.10979422926902771, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8400297164916992, "rewards/PlanningActionSetORM/std": 0.08260323852300644, "rewards/RMReward/mean": 0.643750011920929, "rewards/RMReward/std": 0.13524669408798218, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.059085749089717865, "rewards/VisualPerceptionAccuracy/std": 0.1044643223285675, "step": 64, "train_speed(iter/s)": 0.019231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/mean_length": 107.84375, "completions/min_length": 77.0, "epoch": 0.0009977588800540325, "frac_reward_zero_std": 0.0, "grad_norm": 3.2193350791931152, "kl": 0.0002200156741309911, "learning_rate": 4.987722529158994e-08, "loss": -0.007620919495820999, "memory(GiB)": 74.56, "reward": 0.6157737970352173, "reward_std": 0.13784848153591156, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7913690209388733, "rewards/PlanningActionSetORM/std": 0.12260878831148148, "rewards/RMReward/mean": 0.5718749761581421, "rewards/RMReward/std": 0.2015814334154129, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 65, "train_speed(iter/s)": 0.019059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/mean_length": 9.0, "completions/min_length": 2.0, "epoch": 0.0010131090166702484, "frac_reward_zero_std": 0.0, "grad_norm": 39.00764465332031, "kl": 1.52587890625e-05, "learning_rate": 5.064456721915286e-08, "loss": -0.023415762931108475, "memory(GiB)": 74.56, "reward": 0.7359374761581421, "reward_std": 0.15109604597091675, "rewards/MathAnswerFormat/mean": 0.46875, "rewards/MathAnswerFormat/std": 0.507007360458374, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.2782433331012726, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 66, "train_speed(iter/s)": 0.019313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 9.5, "completions/min_length": 2.0, "epoch": 0.0010284591532864643, "frac_reward_zero_std": 0.0, "grad_norm": 54.022499084472656, "kl": 0.0009602864738553762, "learning_rate": 5.141190914671578e-08, "loss": 0.004911486059427261, "memory(GiB)": 74.56, "reward": 0.7315624952316284, "reward_std": 0.18649035692214966, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.7437500357627869, "rewards/SpatialReasoningORM/std": 0.3099817931652069, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 67, "train_speed(iter/s)": 0.019569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/mean_length": 7.875, "completions/min_length": 2.0, "epoch": 0.0010438092899026801, "frac_reward_zero_std": 0.0, "grad_norm": 40.649574279785156, "kl": 0.0, "learning_rate": 5.21792510742787e-08, "loss": -0.10592072457075119, "memory(GiB)": 74.56, "reward": 0.5221874713897705, "reward_std": 0.15109604597091675, "rewards/MathAnswerFormat/mean": 0.46875, "rewards/MathAnswerFormat/std": 0.507007360458374, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.48393547534942627, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 68, "train_speed(iter/s)": 0.019814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/mean_length": 64.0625, "completions/min_length": 2.0, "epoch": 0.001059159426518896, "frac_reward_zero_std": 0.0, "grad_norm": 52.373714447021484, "kl": 4.5134049287298694e-05, "learning_rate": 5.294659300184163e-08, "loss": 0.024791929870843887, "memory(GiB)": 74.56, "reward": 0.7113956212997437, "reward_std": 0.26104211807250977, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8264565467834473, "rewards/PlanningActionSetORM/std": 0.14452791213989258, "rewards/RMReward/mean": 0.5562499761581421, "rewards/RMReward/std": 0.14930395781993866, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.8125, "rewards/VisualPerceptionAccuracy/std": 0.40311288833618164, "step": 69, "train_speed(iter/s)": 0.019846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/mean_length": 133.40625, "completions/min_length": 9.0, "epoch": 0.0010745095631351119, "frac_reward_zero_std": 0.0, "grad_norm": 5.0381245613098145, "kl": 4.103544051758945e-05, "learning_rate": 5.371393492940455e-08, "loss": 0.009107174351811409, "memory(GiB)": 74.56, "reward": 0.7463743090629578, "reward_std": 0.23879502713680267, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.816243052482605, "rewards/PlanningActionSetORM/std": 0.11325016617774963, "rewards/RMReward/mean": 0.6681250333786011, "rewards/RMReward/std": 0.11484591662883759, "rewards/SpatialReasoningORM/mean": 0.7875000238418579, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 70, "train_speed(iter/s)": 0.019822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 156.59375, "completions/min_length": 92.0, "epoch": 0.0010898596997513277, "frac_reward_zero_std": 0.0, "grad_norm": 3.138021945953369, "kl": 0.00018558744341135025, "learning_rate": 5.448127685696747e-08, "loss": -0.033286452293395996, "memory(GiB)": 74.56, "reward": 0.3870367705821991, "reward_std": 0.05457737296819687, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8552083373069763, "rewards/PlanningActionSetORM/std": 0.06182973459362984, "rewards/RMReward/mean": 0.7406250238418579, "rewards/RMReward/std": 0.09168560057878494, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.010531838051974773, "rewards/VisualPerceptionAccuracy/std": 0.028803959488868713, "step": 71, "train_speed(iter/s)": 0.019876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 62.78125, "completions/min_length": 2.0, "epoch": 0.0011052098363675436, "frac_reward_zero_std": 0.0, "grad_norm": 72.23681640625, "kl": 5.212300311541185e-05, "learning_rate": 5.524861878453039e-08, "loss": -0.061280686408281326, "memory(GiB)": 74.56, "reward": 0.46516743302345276, "reward_std": 0.20077869296073914, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7797991037368774, "rewards/PlanningActionSetORM/std": 0.06266719102859497, "rewards/RMReward/mean": 0.65625, "rewards/RMReward/std": 0.1289379745721817, "rewards/SpatialReasoningORM/mean": 0.26250001788139343, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 72, "train_speed(iter/s)": 0.019931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1767.0, "completions/mean_length": 345.53125, "completions/min_length": 9.0, "epoch": 0.0011205599729837595, "frac_reward_zero_std": 0.0, "grad_norm": 6.115102291107178, "kl": 0.0001352687831968069, "learning_rate": 5.601596071209331e-08, "loss": 0.04145657271146774, "memory(GiB)": 74.56, "reward": 0.4820995032787323, "reward_std": 0.2376091480255127, "rewards/MathAnswerFormat/mean": 0.875, "rewards/MathAnswerFormat/std": 0.3415650427341461, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.887499988079071, "rewards/SpatialReasoningORM/std": 0.27294689416885376, "rewards/VisualPerceptionAccuracy/mean": 0.07732396572828293, "rewards/VisualPerceptionAccuracy/std": 0.20844198763370514, "step": 73, "train_speed(iter/s)": 0.019975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 146.59375, "completions/min_length": 80.0, "epoch": 0.0011359101095999755, "frac_reward_zero_std": 0.0, "grad_norm": 1.9625319242477417, "kl": 0.0002331490395590663, "learning_rate": 5.6783302639656236e-08, "loss": 0.01657807268202305, "memory(GiB)": 74.56, "reward": 0.7193994522094727, "reward_std": 0.06818827241659164, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8594973087310791, "rewards/PlanningActionSetORM/std": 0.07851463556289673, "rewards/RMReward/mean": 0.6843750476837158, "rewards/RMReward/std": 0.11460837721824646, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 74, "train_speed(iter/s)": 0.019934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 179.21875, "completions/min_length": 93.0, "epoch": 0.0011512602462161914, "frac_reward_zero_std": 0.0, "grad_norm": 1.3887053728103638, "kl": 0.0001976771600311622, "learning_rate": 5.7550644567219156e-08, "loss": -0.0017392374575138092, "memory(GiB)": 78.3, "reward": 0.7110389471054077, "reward_std": 0.099724680185318, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8939446210861206, "rewards/PlanningActionSetORM/std": 0.10789339244365692, "rewards/RMReward/mean": 0.6653125286102295, "rewards/RMReward/std": 0.1487145870923996, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 75, "train_speed(iter/s)": 0.019553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 148.53125, "completions/min_length": 80.0, "epoch": 0.0011666103828324073, "frac_reward_zero_std": 0.0, "grad_norm": 1.9149986505508423, "kl": 2.9032064048806205e-05, "learning_rate": 5.8317986494782076e-08, "loss": -0.023981526494026184, "memory(GiB)": 78.3, "reward": 0.724513053894043, "reward_std": 0.1256641149520874, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9063148498535156, "rewards/PlanningActionSetORM/std": 0.08611778169870377, "rewards/RMReward/mean": 0.6790624856948853, "rewards/RMReward/std": 0.1525769829750061, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 76, "train_speed(iter/s)": 0.019428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/mean_length": 61.84375, "completions/min_length": 8.0, "epoch": 0.0011819605194486231, "frac_reward_zero_std": 0.0, "grad_norm": 8.604411125183105, "kl": 0.0002695315342862159, "learning_rate": 5.9085328422344995e-08, "loss": -0.0472334548830986, "memory(GiB)": 78.3, "reward": 0.897656261920929, "reward_std": 0.07142694294452667, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.9859374761581421, "rewards/PlanningActionSetORM/std": 0.038696203380823135, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.040311299264431, "rewards/SpatialReasoningORM/mean": 0.9750000238418579, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 77, "train_speed(iter/s)": 0.019455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/mean_length": 52.40625, "completions/min_length": 2.0, "epoch": 0.001197310656064839, "frac_reward_zero_std": 0.0, "grad_norm": 39.07120895385742, "kl": 0.0001801247417461127, "learning_rate": 5.985267034990793e-08, "loss": 0.1396024525165558, "memory(GiB)": 78.3, "reward": 0.49609375, "reward_std": 0.1653904914855957, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8703124523162842, "rewards/PlanningActionSetORM/std": 0.07649550586938858, "rewards/RMReward/mean": 0.800000011920929, "rewards/RMReward/std": 0.06324554979801178, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.28722813725471497, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 78, "train_speed(iter/s)": 0.019495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 366.90625, "completions/min_length": 101.0, "epoch": 0.0012126607926810549, "frac_reward_zero_std": 0.0, "grad_norm": 2.2584340572357178, "kl": 0.00023505027638748288, "learning_rate": 6.062001227747084e-08, "loss": -0.02326786518096924, "memory(GiB)": 78.3, "reward": 0.4350515604019165, "reward_std": 0.12140677869319916, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9095982313156128, "rewards/PlanningActionSetORM/std": 0.04923156648874283, "rewards/RMReward/mean": 0.637499988079071, "rewards/RMReward/std": 0.12583057582378387, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1781834363937378, "rewards/VisualPerceptionAccuracy/std": 0.14249278604984283, "step": 79, "train_speed(iter/s)": 0.019522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 199.4375, "completions/min_length": 100.0, "epoch": 0.0012280109292972707, "frac_reward_zero_std": 0.0, "grad_norm": 1.491513729095459, "kl": 0.00014259286399465054, "learning_rate": 6.138735420503377e-08, "loss": -0.07974611222743988, "memory(GiB)": 78.3, "reward": 0.6648682951927185, "reward_std": 0.13118110597133636, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8868415951728821, "rewards/PlanningActionSetORM/std": 0.1315804421901703, "rewards/RMReward/mean": 0.609375, "rewards/RMReward/std": 0.15157106518745422, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 80, "train_speed(iter/s)": 0.019425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.9375, "completions/min_length": 2.0, "epoch": 0.0012433610659134866, "frac_reward_zero_std": 0.5, "grad_norm": 63.24433898925781, "kl": 0.0005030776374042034, "learning_rate": 6.21546961325967e-08, "loss": -0.01269946713000536, "memory(GiB)": 78.3, "reward": 0.35624998807907104, "reward_std": 0.12745587527751923, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.2951216399669647, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 81, "train_speed(iter/s)": 0.019392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 107.28125, "completions/min_length": 2.0, "epoch": 0.0012587112025297025, "frac_reward_zero_std": 0.0, "grad_norm": 61.67808532714844, "kl": 1.530706686025951e-05, "learning_rate": 6.292203806015962e-08, "loss": 0.012424934655427933, "memory(GiB)": 78.3, "reward": 0.6790379285812378, "reward_std": 0.2821255326271057, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8428795337677002, "rewards/PlanningActionSetORM/std": 0.058311231434345245, "rewards/RMReward/mean": 0.5493749976158142, "rewards/RMReward/std": 0.14516513049602509, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.75, "rewards/VisualPerceptionAccuracy/std": 0.44721361994743347, "step": 82, "train_speed(iter/s)": 0.019322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 118.4375, "completions/min_length": 2.0, "epoch": 0.0012740613391459183, "frac_reward_zero_std": 0.0, "grad_norm": 120.64942932128906, "kl": 0.00014379521599039435, "learning_rate": 6.368937998772253e-08, "loss": -0.0709470734000206, "memory(GiB)": 78.3, "reward": 0.2764449119567871, "reward_std": 0.2160906195640564, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": 0.19663986563682556, "rewards/VisualPerceptionAccuracy/std": 0.14718122780323029, "step": 83, "train_speed(iter/s)": 0.019511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 14.84375, "completions/min_length": 13.0, "epoch": 0.0012894114757621342, "frac_reward_zero_std": 0.0, "grad_norm": 9.287599563598633, "kl": 0.0, "learning_rate": 6.445672191528546e-08, "loss": -0.018693141639232635, "memory(GiB)": 78.3, "reward": 0.2578125, "reward_std": 0.40390509366989136, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.21875, "rewards/SpatialReasoningORM/std": 0.420013427734375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 84, "train_speed(iter/s)": 0.019534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 14.71875, "completions/min_length": 9.0, "epoch": 0.0013047616123783503, "frac_reward_zero_std": 0.0, "grad_norm": 27.34572982788086, "kl": 3.100198591710068e-05, "learning_rate": 6.522406384284837e-08, "loss": -0.1403048038482666, "memory(GiB)": 78.3, "reward": 0.9328124523162842, "reward_std": 0.14990092813968658, "rewards/MathAnswerFormat/mean": 0.84375, "rewards/MathAnswerFormat/std": 0.3689020276069641, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.14756080508232117, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 85, "train_speed(iter/s)": 0.019735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 426.09375, "completions/min_length": 42.0, "epoch": 0.0013201117489945661, "frac_reward_zero_std": 0.0, "grad_norm": 2.731105089187622, "kl": 0.0001968408760149032, "learning_rate": 6.59914057704113e-08, "loss": -0.15881671011447906, "memory(GiB)": 82.18, "reward": 0.37824586033821106, "reward_std": 0.22695089876651764, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7215636372566223, "rewards/PlanningActionSetORM/std": 0.28719133138656616, "rewards/RMReward/mean": 0.4906249940395355, "rewards/RMReward/std": 0.22302372753620148, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.21967893838882446, "rewards/VisualPerceptionAccuracy/std": 0.22640661895275116, "step": 86, "train_speed(iter/s)": 0.019528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/mean_length": 100.25, "completions/min_length": 78.0, "epoch": 0.001335461885610782, "frac_reward_zero_std": 0.0, "grad_norm": 2.7042953968048096, "kl": 0.0003124857903458178, "learning_rate": 6.675874769797422e-08, "loss": 0.0024520959705114365, "memory(GiB)": 82.18, "reward": 0.7755357027053833, "reward_std": 0.08899518847465515, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8714286088943481, "rewards/PlanningActionSetORM/std": 0.12791834771633148, "rewards/RMReward/mean": 0.7515624761581421, "rewards/RMReward/std": 0.1027715727686882, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 87, "train_speed(iter/s)": 0.019539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/mean_length": 49.25, "completions/min_length": 2.0, "epoch": 0.0013508120222269979, "frac_reward_zero_std": 0.0, "grad_norm": 81.14804077148438, "kl": 0.00034118699841201305, "learning_rate": 6.752608962553715e-08, "loss": -0.044201888144016266, "memory(GiB)": 82.18, "reward": 0.49888020753860474, "reward_std": 0.21724028885364532, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9169270992279053, "rewards/PlanningActionSetORM/std": 0.10453235357999802, "rewards/RMReward/mean": 0.706250011920929, "rewards/RMReward/std": 0.16214706003665924, "rewards/SpatialReasoningORM/mean": 0.26250001788139343, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 88, "train_speed(iter/s)": 0.019554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 8.59375, "completions/min_length": 3.0, "epoch": 0.0013661621588432137, "frac_reward_zero_std": 0.0, "grad_norm": 33.9430046081543, "kl": 0.00029207643819972873, "learning_rate": 6.829343155310006e-08, "loss": -0.054649848490953445, "memory(GiB)": 82.18, "reward": 0.5871875286102295, "reward_std": 0.30854079127311707, "rewards/MathAnswerFormat/mean": 0.34375, "rewards/MathAnswerFormat/std": 0.4825586974620819, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6000000238418579, "rewards/SpatialReasoningORM/std": 0.34077709913253784, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 89, "train_speed(iter/s)": 0.019746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 119.6875, "completions/min_length": 2.0, "epoch": 0.0013815122954594296, "frac_reward_zero_std": 0.0, "grad_norm": 77.52767944335938, "kl": 3.3153508411487564e-05, "learning_rate": 6.906077348066299e-08, "loss": 0.09260334074497223, "memory(GiB)": 82.18, "reward": 0.48525556921958923, "reward_std": 0.16834107041358948, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8644305467605591, "rewards/PlanningActionSetORM/std": 0.14789721369743347, "rewards/RMReward/mean": 0.4181250035762787, "rewards/RMReward/std": 0.12051106244325638, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 90, "train_speed(iter/s)": 0.019753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 8.875, "completions/min_length": 2.0, "epoch": 0.0013968624320756455, "frac_reward_zero_std": 0.0, "grad_norm": 122.79671478271484, "kl": 0.006890069227665663, "learning_rate": 6.982811540822592e-08, "loss": -0.0028531700372695923, "memory(GiB)": 82.18, "reward": 0.6690624952316284, "reward_std": 0.209869846701622, "rewards/MathAnswerFormat/mean": 0.4375, "rewards/MathAnswerFormat/std": 0.504016101360321, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6812499761581421, "rewards/SpatialReasoningORM/std": 0.35143712162971497, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 91, "train_speed(iter/s)": 0.019923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/mean_length": 148.46875, "completions/min_length": 87.0, "epoch": 0.0014122125686918613, "frac_reward_zero_std": 0.0, "grad_norm": 3.645397186279297, "kl": 0.00025802815798670053, "learning_rate": 7.059545733578883e-08, "loss": -0.014777705073356628, "memory(GiB)": 82.18, "reward": 0.408976674079895, "reward_std": 0.0995732992887497, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7966145873069763, "rewards/PlanningActionSetORM/std": 0.1262824833393097, "rewards/RMReward/mean": 0.7124999761581421, "rewards/RMReward/std": 0.07852812856435776, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08863045275211334, "rewards/VisualPerceptionAccuracy/std": 0.12878955900669098, "step": 92, "train_speed(iter/s)": 0.019963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/mean_length": 105.3125, "completions/min_length": 79.0, "epoch": 0.0014275627053080772, "frac_reward_zero_std": 0.0, "grad_norm": 2.489121198654175, "kl": 7.342440221691504e-05, "learning_rate": 7.136279926335176e-08, "loss": -0.027500953525304794, "memory(GiB)": 82.18, "reward": 0.7587500214576721, "reward_std": 0.07647714763879776, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.13335011899471283, "rewards/RMReward/mean": 0.729687511920929, "rewards/RMReward/std": 0.08216758817434311, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 93, "train_speed(iter/s)": 0.019822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 149.15625, "completions/min_length": 2.0, "epoch": 0.001442912841924293, "frac_reward_zero_std": 0.0, "grad_norm": 89.86445617675781, "kl": 0.001968811033293605, "learning_rate": 7.213014119091468e-08, "loss": -0.0824211984872818, "memory(GiB)": 82.18, "reward": 0.48780009150505066, "reward_std": 0.2214755117893219, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7842509746551514, "rewards/PlanningActionSetORM/std": 0.1793862134218216, "rewards/RMReward/mean": 0.578125, "rewards/RMReward/std": 0.16829413175582886, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 94, "train_speed(iter/s)": 0.019611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 14.9375, "completions/min_length": 9.0, "epoch": 0.001458262978540509, "frac_reward_zero_std": 0.0, "grad_norm": 9.709818840026855, "kl": 3.616898175096139e-05, "learning_rate": 7.28974831184776e-08, "loss": 0.0041303858160972595, "memory(GiB)": 82.18, "reward": 0.7653124332427979, "reward_std": 0.3708881437778473, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.24593468010425568, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.7562500238418579, "rewards/SpatialReasoningORM/std": 0.4180889427661896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 95, "train_speed(iter/s)": 0.019788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 8.90625, "completions/min_length": 2.0, "epoch": 0.0014736131151567248, "frac_reward_zero_std": 0.0, "grad_norm": 43.570350646972656, "kl": 0.00016649911412969232, "learning_rate": 7.366482504604052e-08, "loss": -0.02151501178741455, "memory(GiB)": 82.18, "reward": 0.6899999976158142, "reward_std": 0.25958943367004395, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.7000000476837158, "rewards/SpatialReasoningORM/std": 0.3292219340801239, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 96, "train_speed(iter/s)": 0.019967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 221.53125, "completions/min_length": 115.0, "epoch": 0.0014889632517729409, "frac_reward_zero_std": 0.0, "grad_norm": 1.7271405458450317, "kl": 0.00016891756968107074, "learning_rate": 7.443216697360345e-08, "loss": -0.019815191626548767, "memory(GiB)": 82.18, "reward": 0.4570621848106384, "reward_std": 0.1294994354248047, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9150015711784363, "rewards/PlanningActionSetORM/std": 0.07327351719141006, "rewards/RMReward/mean": 0.6112499833106995, "rewards/RMReward/std": 0.1585717648267746, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.24212408065795898, "rewards/VisualPerceptionAccuracy/std": 0.12274396419525146, "step": 97, "train_speed(iter/s)": 0.019874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/mean_length": 101.21875, "completions/min_length": 71.0, "epoch": 0.0015043133883891567, "frac_reward_zero_std": 0.0, "grad_norm": 2.8247640132904053, "kl": 6.199958443176001e-05, "learning_rate": 7.519950890116636e-08, "loss": 0.014684464782476425, "memory(GiB)": 82.18, "reward": 0.661286473274231, "reward_std": 0.12337689101696014, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7326822876930237, "rewards/PlanningActionSetORM/std": 0.19304700195789337, "rewards/RMReward/mean": 0.6434375047683716, "rewards/RMReward/std": 0.15415972471237183, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 98, "train_speed(iter/s)": 0.019766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/mean_length": 129.78125, "completions/min_length": 9.0, "epoch": 0.0015196635250053726, "frac_reward_zero_std": 0.0, "grad_norm": 18.758859634399414, "kl": 0.00011788208939833567, "learning_rate": 7.596685082872929e-08, "loss": -0.06128770858049393, "memory(GiB)": 82.18, "reward": 0.46427589654922485, "reward_std": 0.14353355765342712, "rewards/MathAnswerFormat/mean": 0.6875, "rewards/MathAnswerFormat/std": 0.4787135720252991, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.1914854198694229, "rewards/VisualPerceptionAccuracy/mean": 0.06292679160833359, "rewards/VisualPerceptionAccuracy/std": 0.08122027665376663, "step": 99, "train_speed(iter/s)": 0.019894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 9.21875, "completions/min_length": 2.0, "epoch": 0.0015350136616215885, "frac_reward_zero_std": 0.0, "grad_norm": 38.812740325927734, "kl": 1.2057496860506944e-05, "learning_rate": 7.673419275629221e-08, "loss": -0.0795946717262268, "memory(GiB)": 82.18, "reward": 0.6825000047683716, "reward_std": 0.1901833713054657, "rewards/MathAnswerFormat/mean": 0.46875, "rewards/MathAnswerFormat/std": 0.507007360458374, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6937500238418579, "rewards/SpatialReasoningORM/std": 0.3555436432361603, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 100, "train_speed(iter/s)": 0.020067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 167.15625, "completions/min_length": 64.0, "epoch": 0.0015503637982378043, "frac_reward_zero_std": 0.0, "grad_norm": 3.7087602615356445, "kl": 0.0005591105436906219, "learning_rate": 7.750153468385514e-08, "loss": -0.04469139873981476, "memory(GiB)": 82.18, "reward": 0.5832118391990662, "reward_std": 0.10276127606630325, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7535590529441833, "rewards/PlanningActionSetORM/std": 0.1613553911447525, "rewards/RMReward/mean": 0.5406249761581421, "rewards/RMReward/std": 0.18813066184520721, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 101, "train_speed(iter/s)": 0.019592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/mean_length": 53.03125, "completions/min_length": 2.0, "epoch": 0.0015657139348540202, "frac_reward_zero_std": 0.0, "grad_norm": 65.56649780273438, "kl": -0.0003256081254221499, "learning_rate": 7.826887661141806e-08, "loss": -0.072847381234169, "memory(GiB)": 82.18, "reward": 0.6715848445892334, "reward_std": 0.31660088896751404, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8783482313156128, "rewards/PlanningActionSetORM/std": 0.11036017537117004, "rewards/RMReward/mean": 0.7562499642372131, "rewards/RMReward/std": 0.13022416830062866, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5625, "rewards/VisualPerceptionAccuracy/std": 0.5123475790023804, "step": 102, "train_speed(iter/s)": 0.019638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/mean_length": 195.15625, "completions/min_length": 12.0, "epoch": 0.001581064071470236, "frac_reward_zero_std": 0.0, "grad_norm": 4.6511335372924805, "kl": 0.0004973018076270819, "learning_rate": 7.903621853898098e-08, "loss": 0.04434728994965553, "memory(GiB)": 82.18, "reward": 0.44073063135147095, "reward_std": 0.21731974184513092, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8500000238418579, "rewards/SpatialReasoningORM/std": 0.3464101552963257, "rewards/VisualPerceptionAccuracy/mean": 0.027086254209280014, "rewards/VisualPerceptionAccuracy/std": 0.10291734337806702, "step": 103, "train_speed(iter/s)": 0.019751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 8.28125, "completions/min_length": 2.0, "epoch": 0.001596414208086452, "frac_reward_zero_std": 0.0, "grad_norm": 47.300445556640625, "kl": 0.002316112630069256, "learning_rate": 7.98035604665439e-08, "loss": -0.11996720731258392, "memory(GiB)": 82.18, "reward": 0.4996874928474426, "reward_std": 0.21103808283805847, "rewards/MathAnswerFormat/mean": 0.375, "rewards/MathAnswerFormat/std": 0.49186936020851135, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5062499642372131, "rewards/SpatialReasoningORM/std": 0.4514760673046112, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 104, "train_speed(iter/s)": 0.019916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/mean_length": 94.25, "completions/min_length": 8.0, "epoch": 0.0016117643447026678, "frac_reward_zero_std": 0.0, "grad_norm": 10.616294860839844, "kl": 0.0010274938540533185, "learning_rate": 8.057090239410682e-08, "loss": -0.042313046753406525, "memory(GiB)": 82.18, "reward": 0.6409035921096802, "reward_std": 0.26418304443359375, "rewards/MathAnswerFormat/mean": 0.8125, "rewards/MathAnswerFormat/std": 0.40311288833618164, "rewards/PlanningActionSetORM/mean": 0.8234104514122009, "rewards/PlanningActionSetORM/std": 0.12024476379156113, "rewards/RMReward/mean": 0.5143749713897705, "rewards/RMReward/std": 0.12366454303264618, "rewards/SpatialReasoningORM/mean": 0.699999988079071, "rewards/SpatialReasoningORM/std": 0.43817806243896484, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 105, "train_speed(iter/s)": 0.01985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 293.0625, "completions/min_length": 93.0, "epoch": 0.0016271144813188837, "frac_reward_zero_std": 0.0, "grad_norm": 2.586669921875, "kl": 0.0006750813918188214, "learning_rate": 8.133824432166974e-08, "loss": 0.0630704015493393, "memory(GiB)": 82.18, "reward": 0.3264327645301819, "reward_std": 0.08492051064968109, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7500743865966797, "rewards/PlanningActionSetORM/std": 0.14463084936141968, "rewards/RMReward/mean": 0.5874999761581421, "rewards/RMReward/std": 0.12315302342176437, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.032850634306669235, "rewards/VisualPerceptionAccuracy/std": 0.06136137247085571, "step": 106, "train_speed(iter/s)": 0.019715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 103.59375, "completions/min_length": 12.0, "epoch": 0.0016424646179350995, "frac_reward_zero_std": 0.0, "grad_norm": 9.206119537353516, "kl": 0.0005291325505822897, "learning_rate": 8.210558624923267e-08, "loss": -0.08533339947462082, "memory(GiB)": 82.18, "reward": 0.5478838682174683, "reward_std": 0.10704190284013748, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9750000238418579, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": 0.12264269590377808, "rewards/VisualPerceptionAccuracy/std": 0.10658379644155502, "step": 107, "train_speed(iter/s)": 0.019845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/mean_length": 58.125, "completions/min_length": 14.0, "epoch": 0.0016578147545513156, "frac_reward_zero_std": 0.0, "grad_norm": 5.644015312194824, "kl": -6.729706456098938e-06, "learning_rate": 8.287292817679558e-08, "loss": -0.058429114520549774, "memory(GiB)": 82.18, "reward": 0.6390624642372131, "reward_std": 0.2780872583389282, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9406249523162842, "rewards/PlanningActionSetORM/std": 0.14516513049602509, "rewards/RMReward/mean": 0.706250011920929, "rewards/RMReward/std": 0.07274384051561356, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 108, "train_speed(iter/s)": 0.019805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/mean_length": 132.53125, "completions/min_length": 78.0, "epoch": 0.0016731648911675315, "frac_reward_zero_std": 0.0, "grad_norm": 2.327810287475586, "kl": 0.00010555786138866097, "learning_rate": 8.364027010435851e-08, "loss": -0.07902313023805618, "memory(GiB)": 82.18, "reward": 0.6075791716575623, "reward_std": 0.1217886358499527, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8628957271575928, "rewards/PlanningActionSetORM/std": 0.08007115125656128, "rewards/RMReward/mean": 0.543749988079071, "rewards/RMReward/std": 0.1610199213027954, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 109, "train_speed(iter/s)": 0.019616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/mean_length": 97.40625, "completions/min_length": 17.0, "epoch": 0.0016885150277837473, "frac_reward_zero_std": 0.0, "grad_norm": 3.777162790298462, "kl": 0.00019811117090284824, "learning_rate": 8.440761203192142e-08, "loss": -0.09474502503871918, "memory(GiB)": 82.18, "reward": 0.7245937585830688, "reward_std": 0.14105163514614105, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.85546875, "rewards/PlanningActionSetORM/std": 0.15270279347896576, "rewards/RMReward/mean": 0.6918749809265137, "rewards/RMReward/std": 0.16008944809436798, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 110, "train_speed(iter/s)": 0.019595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/mean_length": 62.40625, "completions/min_length": 3.0, "epoch": 0.0017038651643999632, "frac_reward_zero_std": 0.0, "grad_norm": 48.44222640991211, "kl": 0.0006041243905201554, "learning_rate": 8.517495395948436e-08, "loss": -0.12332025170326233, "memory(GiB)": 82.18, "reward": 0.6343526840209961, "reward_std": 0.09818544238805771, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.7841517925262451, "rewards/PlanningActionSetORM/std": 0.11287382245063782, "rewards/RMReward/mean": 0.643750011920929, "rewards/RMReward/std": 0.1046820655465126, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 111, "train_speed(iter/s)": 0.019628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/mean_length": 71.9375, "completions/min_length": 2.0, "epoch": 0.001719215301016179, "frac_reward_zero_std": 0.0, "grad_norm": 56.01914978027344, "kl": -0.00011587928747758269, "learning_rate": 8.594229588704729e-08, "loss": -0.09185831248760223, "memory(GiB)": 82.18, "reward": 0.574799120426178, "reward_std": 0.19555345177650452, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8323661088943481, "rewards/PlanningActionSetORM/std": 0.20347800850868225, "rewards/RMReward/mean": 0.6500000357627869, "rewards/RMReward/std": 0.17701224982738495, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 112, "train_speed(iter/s)": 0.019654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 178.53125, "completions/min_length": 102.0, "epoch": 0.001734565437632395, "frac_reward_zero_std": 0.0, "grad_norm": 1.992669701576233, "kl": 0.00011102802818641067, "learning_rate": 8.67096378146102e-08, "loss": -0.029365047812461853, "memory(GiB)": 82.18, "reward": 0.6022767424583435, "reward_std": 0.11098171770572662, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8688836097717285, "rewards/PlanningActionSetORM/std": 0.11198482662439346, "rewards/RMReward/mean": 0.5356249809265137, "rewards/RMReward/std": 0.12913952767848969, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 113, "train_speed(iter/s)": 0.019629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/mean_length": 318.0625, "completions/min_length": 72.0, "epoch": 0.0017499155742486108, "frac_reward_zero_std": 0.0, "grad_norm": 1.816124439239502, "kl": 0.00018151798576582223, "learning_rate": 8.747697974217313e-08, "loss": -0.05335157364606857, "memory(GiB)": 82.18, "reward": 0.460837721824646, "reward_std": 0.16000378131866455, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9684523940086365, "rewards/PlanningActionSetORM/std": 0.0499148964881897, "rewards/RMReward/mean": 0.6749999523162842, "rewards/RMReward/std": 0.11547006666660309, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1879849135875702, "rewards/VisualPerceptionAccuracy/std": 0.2250523865222931, "step": 114, "train_speed(iter/s)": 0.019695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 169.65625, "completions/min_length": 66.0, "epoch": 0.0017652657108648267, "frac_reward_zero_std": 0.0, "grad_norm": 2.6225225925445557, "kl": 0.0002967852633446455, "learning_rate": 8.824432166973604e-08, "loss": 0.019657675176858902, "memory(GiB)": 82.18, "reward": 0.4984452724456787, "reward_std": 0.12182480096817017, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8765624761581421, "rewards/PlanningActionSetORM/std": 0.15206049382686615, "rewards/RMReward/mean": 0.734375, "rewards/RMReward/std": 0.117924764752388, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.23407801985740662, "rewards/VisualPerceptionAccuracy/std": 0.1322050392627716, "step": 115, "train_speed(iter/s)": 0.019737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/mean_length": 15.78125, "completions/min_length": 13.0, "epoch": 0.0017806158474810425, "frac_reward_zero_std": 0.0, "grad_norm": 9.275872230529785, "kl": 0.0009802766144275665, "learning_rate": 8.901166359729897e-08, "loss": -0.011837862432003021, "memory(GiB)": 82.18, "reward": 0.4818750023841858, "reward_std": 0.4026891589164734, "rewards/MathAnswerFormat/mean": 0.96875, "rewards/MathAnswerFormat/std": 0.1767766922712326, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.45625001192092896, "rewards/SpatialReasoningORM/std": 0.4983440041542053, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 116, "train_speed(iter/s)": 0.019884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/mean_length": 234.84375, "completions/min_length": 133.0, "epoch": 0.0017959659840972584, "frac_reward_zero_std": 0.0, "grad_norm": 3.3879446983337402, "kl": 0.00028994533931836486, "learning_rate": 8.977900552486188e-08, "loss": 0.021553047001361847, "memory(GiB)": 82.18, "reward": 0.34076911211013794, "reward_std": 0.10122103244066238, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8180803060531616, "rewards/PlanningActionSetORM/std": 0.1264815628528595, "rewards/RMReward/mean": 0.5743749737739563, "rewards/RMReward/std": 0.11592921614646912, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.05842213332653046, "rewards/VisualPerceptionAccuracy/std": 0.09915997833013535, "step": 117, "train_speed(iter/s)": 0.019859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 113.125, "completions/min_length": 14.0, "epoch": 0.0018113161207134743, "frac_reward_zero_std": 0.0, "grad_norm": 6.3578362464904785, "kl": 4.042750515509397e-05, "learning_rate": 9.05463474524248e-08, "loss": -0.013486707583069801, "memory(GiB)": 82.18, "reward": 0.6057157516479492, "reward_std": 0.32296591997146606, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9259076714515686, "rewards/PlanningActionSetORM/std": 0.04485679045319557, "rewards/RMReward/mean": 0.4781249761581421, "rewards/RMReward/std": 0.20733928680419922, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 118, "train_speed(iter/s)": 0.019817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 59.53125, "completions/min_length": 2.0, "epoch": 0.0018266662573296903, "frac_reward_zero_std": 0.0, "grad_norm": 52.08045196533203, "kl": 0.00016026495723053813, "learning_rate": 9.131368937998772e-08, "loss": 0.014851607382297516, "memory(GiB)": 82.18, "reward": 0.5289843678474426, "reward_std": 0.1483142077922821, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.6742187738418579, "rewards/PlanningActionSetORM/std": 0.17579330503940582, "rewards/RMReward/mean": 0.5750000476837158, "rewards/RMReward/std": 0.09486832469701767, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 119, "train_speed(iter/s)": 0.019828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 71.34375, "completions/min_length": 9.0, "epoch": 0.0018420163939459062, "frac_reward_zero_std": 0.0, "grad_norm": 10.7200927734375, "kl": 0.003627514000982046, "learning_rate": 9.208103130755066e-08, "loss": 0.006818599998950958, "memory(GiB)": 82.18, "reward": 0.7631175518035889, "reward_std": 0.1308908611536026, "rewards/MathAnswerFormat/mean": 0.875, "rewards/MathAnswerFormat/std": 0.3415650427341461, "rewards/PlanningActionSetORM/mean": 0.8499255776405334, "rewards/PlanningActionSetORM/std": 0.10233984887599945, "rewards/RMReward/mean": 0.512499988079071, "rewards/RMReward/std": 0.128452330827713, "rewards/SpatialReasoningORM/mean": 0.949999988079071, "rewards/SpatialReasoningORM/std": 0.1366260051727295, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 120, "train_speed(iter/s)": 0.019711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/mean_length": 46.75, "completions/min_length": 2.0, "epoch": 0.001857366530562122, "frac_reward_zero_std": 0.0, "grad_norm": 64.76050567626953, "kl": 4.774014814756811e-05, "learning_rate": 9.284837323511358e-08, "loss": 0.10275628417730331, "memory(GiB)": 82.18, "reward": 0.585364580154419, "reward_std": 0.1790791004896164, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8723958134651184, "rewards/PlanningActionSetORM/std": 0.09558391571044922, "rewards/RMReward/mean": 0.7999999523162842, "rewards/RMReward/std": 0.07958224415779114, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 121, "train_speed(iter/s)": 0.01977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/mean_length": 48.375, "completions/min_length": 2.0, "epoch": 0.001872716667178338, "frac_reward_zero_std": 0.0, "grad_norm": 29.08493423461914, "kl": -7.976996130309999e-06, "learning_rate": 9.36157151626765e-08, "loss": -0.05950348079204559, "memory(GiB)": 82.18, "reward": 0.38734373450279236, "reward_std": 0.10193051397800446, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8578125238418579, "rewards/PlanningActionSetORM/std": 0.05404634401202202, "rewards/RMReward/mean": 0.7093750238418579, "rewards/RMReward/std": 0.07352721691131592, "rewards/SpatialReasoningORM/mean": 0.03750000149011612, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 122, "train_speed(iter/s)": 0.019744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/mean_length": 8.0625, "completions/min_length": 2.0, "epoch": 0.0018880668037945538, "frac_reward_zero_std": 0.0, "grad_norm": 98.54278564453125, "kl": 2.5699013349367306e-05, "learning_rate": 9.438305709023942e-08, "loss": 0.009627696126699448, "memory(GiB)": 82.18, "reward": 0.6009374856948853, "reward_std": 0.3082624673843384, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.606249988079071, "rewards/SpatialReasoningORM/std": 0.4203972816467285, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 123, "train_speed(iter/s)": 0.019883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.65625, "completions/min_length": 2.0, "epoch": 0.0019034169404107697, "frac_reward_zero_std": 0.0, "grad_norm": 131.2174530029297, "kl": 0.0014549794141203165, "learning_rate": 9.515039901780234e-08, "loss": 0.06130177527666092, "memory(GiB)": 82.18, "reward": 0.3740624785423279, "reward_std": 0.2789333462715149, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.39375001192092896, "rewards/SpatialReasoningORM/std": 0.2895352244377136, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 124, "train_speed(iter/s)": 0.020021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 154.15625, "completions/min_length": 89.0, "epoch": 0.0019187670770269855, "frac_reward_zero_std": 0.0, "grad_norm": 1.7219343185424805, "kl": 4.234005609760061e-05, "learning_rate": 9.591774094536526e-08, "loss": 0.014502383768558502, "memory(GiB)": 82.18, "reward": 0.7073860168457031, "reward_std": 0.09043677151203156, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.855679988861084, "rewards/PlanningActionSetORM/std": 0.09819028526544571, "rewards/RMReward/mean": 0.6703125238418579, "rewards/RMReward/std": 0.17499279975891113, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 125, "train_speed(iter/s)": 0.019999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 14.875, "completions/min_length": 9.0, "epoch": 0.0019341172136432014, "frac_reward_zero_std": 0.0, "grad_norm": 9.664033889770508, "kl": 0.0054778726771473885, "learning_rate": 9.668508287292818e-08, "loss": -0.059893012046813965, "memory(GiB)": 82.18, "reward": 0.6287499666213989, "reward_std": 0.43721771240234375, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.24593468010425568, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.612500011920929, "rewards/SpatialReasoningORM/std": 0.48709142208099365, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 126, "train_speed(iter/s)": 0.020137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 75.53125, "completions/min_length": 12.0, "epoch": 0.0019494673502594173, "frac_reward_zero_std": 0.0, "grad_norm": 11.15774917602539, "kl": 0.0029099665116518736, "learning_rate": 9.74524248004911e-08, "loss": -0.024370083585381508, "memory(GiB)": 82.18, "reward": 0.8393080234527588, "reward_std": 0.10491829365491867, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8899554014205933, "rewards/PlanningActionSetORM/std": 0.04688515514135361, "rewards/RMReward/mean": 0.659375011920929, "rewards/RMReward/std": 0.1254574954509735, "rewards/SpatialReasoningORM/mean": 0.9750000238418579, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 127, "train_speed(iter/s)": 0.020091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/mean_length": 170.03125, "completions/min_length": 2.0, "epoch": 0.0019648174868756333, "frac_reward_zero_std": 0.0, "grad_norm": 37.41450500488281, "kl": 0.0015543372137472034, "learning_rate": 9.821976672805401e-08, "loss": 0.07742704451084137, "memory(GiB)": 82.18, "reward": 0.08115171641111374, "reward_std": 0.1579836755990982, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.03750000149011612, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": 0.1266784369945526, "rewards/VisualPerceptionAccuracy/std": 0.17346735298633575, "step": 128, "train_speed(iter/s)": 0.020197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 96.125, "completions/min_length": 2.0, "epoch": 0.001980167623491849, "frac_reward_zero_std": 0.0, "grad_norm": 22.843854904174805, "kl": -8.919787069316953e-05, "learning_rate": 9.898710865561695e-08, "loss": -0.04122789204120636, "memory(GiB)": 82.18, "reward": 0.6098771095275879, "reward_std": 0.13340596854686737, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8518961668014526, "rewards/PlanningActionSetORM/std": 0.08278250694274902, "rewards/RMReward/mean": 0.643750011920929, "rewards/RMReward/std": 0.15370425581932068, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 129, "train_speed(iter/s)": 0.020127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 123.03125, "completions/min_length": 73.0, "epoch": 0.001995517760108065, "frac_reward_zero_std": 0.0, "grad_norm": 2.5013742446899414, "kl": 0.00034209073055535555, "learning_rate": 9.975445058317988e-08, "loss": 0.045389845967292786, "memory(GiB)": 82.18, "reward": 0.4453216791152954, "reward_std": 0.14072731137275696, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8968750238418579, "rewards/PlanningActionSetORM/std": 0.0858980342745781, "rewards/RMReward/mean": 0.793749988079071, "rewards/RMReward/std": 0.08539125323295593, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07626834511756897, "rewards/VisualPerceptionAccuracy/std": 0.20845824480056763, "step": 130, "train_speed(iter/s)": 0.020097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 9.21875, "completions/min_length": 3.0, "epoch": 0.0020108678967242807, "frac_reward_zero_std": 0.0, "grad_norm": 28.902130126953125, "kl": 0.006253058556467295, "learning_rate": 1.0052179251074279e-07, "loss": -0.04050467163324356, "memory(GiB)": 82.18, "reward": 0.3306249976158142, "reward_std": 0.26102763414382935, "rewards/MathAnswerFormat/mean": 0.4375, "rewards/MathAnswerFormat/std": 0.504016101360321, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.32500001788139343, "rewards/SpatialReasoningORM/std": 0.3436051607131958, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 131, "train_speed(iter/s)": 0.020226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/mean_length": 115.3125, "completions/min_length": 59.0, "epoch": 0.002026218033340497, "frac_reward_zero_std": 0.0, "grad_norm": 3.0190813541412354, "kl": 0.0006078595179133117, "learning_rate": 1.0128913443830572e-07, "loss": -0.07973203808069229, "memory(GiB)": 82.18, "reward": 0.451358437538147, "reward_std": 0.08747811615467072, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9322172403335571, "rewards/PlanningActionSetORM/std": 0.04245728626847267, "rewards/RMReward/mean": 0.8031250238418579, "rewards/RMReward/std": 0.09031196683645248, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07377346605062485, "rewards/VisualPerceptionAccuracy/std": 0.10179219394922256, "step": 132, "train_speed(iter/s)": 0.02023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/mean_length": 60.8125, "completions/min_length": 15.0, "epoch": 0.0020415681699567124, "frac_reward_zero_std": 0.0, "grad_norm": 9.70386791229248, "kl": 0.0001419289328623563, "learning_rate": 1.0205647636586863e-07, "loss": -0.0009736251085996628, "memory(GiB)": 82.18, "reward": 0.76953125, "reward_std": 0.2576833963394165, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7703125476837158, "rewards/PlanningActionSetORM/std": 0.15083007514476776, "rewards/RMReward/mean": 0.778124988079071, "rewards/RMReward/std": 0.1032291129231453, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 133, "train_speed(iter/s)": 0.020268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/mean_length": 148.8125, "completions/min_length": 80.0, "epoch": 0.0020569183065729285, "frac_reward_zero_std": 0.0, "grad_norm": 3.2088935375213623, "kl": 0.0004048725822940469, "learning_rate": 1.0282381829343156e-07, "loss": 0.02563592791557312, "memory(GiB)": 82.18, "reward": 0.6034122705459595, "reward_std": 0.1429830640554428, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7920612692832947, "rewards/PlanningActionSetORM/std": 0.12020208686590195, "rewards/RMReward/mean": 0.5562499761581421, "rewards/RMReward/std": 0.170270174741745, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 134, "train_speed(iter/s)": 0.02014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/mean_length": 138.875, "completions/min_length": 71.0, "epoch": 0.002072268443189144, "frac_reward_zero_std": 0.0, "grad_norm": 1.9490602016448975, "kl": 0.00013991931336931884, "learning_rate": 1.0359116022099448e-07, "loss": -0.00520208477973938, "memory(GiB)": 82.18, "reward": 0.6275057792663574, "reward_std": 0.12142471224069595, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8625287413597107, "rewards/PlanningActionSetORM/std": 0.14513854682445526, "rewards/RMReward/mean": 0.5687499642372131, "rewards/RMReward/std": 0.12684127688407898, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 135, "train_speed(iter/s)": 0.020024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/mean_length": 57.09375, "completions/min_length": 14.0, "epoch": 0.0020876185798053603, "frac_reward_zero_std": 0.0, "grad_norm": 4.604619979858398, "kl": 0.002148229628801346, "learning_rate": 1.043585021485574e-07, "loss": -0.006219390779733658, "memory(GiB)": 82.18, "reward": 0.7808854579925537, "reward_std": 0.19576534628868103, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8057291507720947, "rewards/PlanningActionSetORM/std": 0.14500059187412262, "rewards/RMReward/mean": 0.574999988079071, "rewards/RMReward/std": 0.17795130610466003, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 136, "train_speed(iter/s)": 0.020021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 171.15625, "completions/min_length": 91.0, "epoch": 0.0021029687164215763, "frac_reward_zero_std": 0.0, "grad_norm": 2.7364706993103027, "kl": 0.00020853491150774062, "learning_rate": 1.0512584407612032e-07, "loss": 0.0114082470536232, "memory(GiB)": 82.18, "reward": 0.357754111289978, "reward_std": 0.10306087136268616, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9241383075714111, "rewards/PlanningActionSetORM/std": 0.08676137775182724, "rewards/RMReward/mean": 0.6499999761581421, "rewards/RMReward/std": 0.211344912648201, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.010680579580366611, "rewards/VisualPerceptionAccuracy/std": 0.030018232762813568, "step": 137, "train_speed(iter/s)": 0.019985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 167.9375, "completions/min_length": 93.0, "epoch": 0.002118318853037792, "frac_reward_zero_std": 0.0, "grad_norm": 2.0081429481506348, "kl": 0.00011954510409850627, "learning_rate": 1.0589318600368326e-07, "loss": 0.01969953626394272, "memory(GiB)": 82.18, "reward": 0.6481867432594299, "reward_std": 0.11065022647380829, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.760933518409729, "rewards/PlanningActionSetORM/std": 0.17349568009376526, "rewards/RMReward/mean": 0.6200000047683716, "rewards/RMReward/std": 0.18110769987106323, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 138, "train_speed(iter/s)": 0.019971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/mean_length": 114.4375, "completions/min_length": 73.0, "epoch": 0.002133668989654008, "frac_reward_zero_std": 0.0, "grad_norm": 2.5787594318389893, "kl": 0.00023936911020427942, "learning_rate": 1.0666052793124618e-07, "loss": 0.060448840260505676, "memory(GiB)": 82.18, "reward": 0.7087641954421997, "reward_std": 0.11108424514532089, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8513206839561462, "rewards/PlanningActionSetORM/std": 0.09195593744516373, "rewards/RMReward/mean": 0.6731250286102295, "rewards/RMReward/std": 0.13778050243854523, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 139, "train_speed(iter/s)": 0.019843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 164.8125, "completions/min_length": 59.0, "epoch": 0.0021490191262702237, "frac_reward_zero_std": 0.0, "grad_norm": 2.7568485736846924, "kl": 0.00011691721738316119, "learning_rate": 1.074278698588091e-07, "loss": -0.04659303277730942, "memory(GiB)": 82.18, "reward": 0.7123794555664062, "reward_std": 0.10290376096963882, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9306474328041077, "rewards/PlanningActionSetORM/std": 0.09916840493679047, "rewards/RMReward/mean": 0.6578124761581421, "rewards/RMReward/std": 0.17464682459831238, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 140, "train_speed(iter/s)": 0.019796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 117.09375, "completions/min_length": 2.0, "epoch": 0.00216436926288644, "frac_reward_zero_std": 0.0, "grad_norm": 43.41000747680664, "kl": -0.00024806265719234943, "learning_rate": 1.0819521178637202e-07, "loss": 0.028756991028785706, "memory(GiB)": 86.07, "reward": 0.35886555910110474, "reward_std": 0.16856202483177185, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8199054002761841, "rewards/PlanningActionSetORM/std": 0.12555718421936035, "rewards/RMReward/mean": 0.6031249761581421, "rewards/RMReward/std": 0.1657998412847519, "rewards/SpatialReasoningORM/mean": 0.07500000298023224, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 141, "train_speed(iter/s)": 0.019712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 99.40625, "completions/min_length": 14.0, "epoch": 0.0021797193995026555, "frac_reward_zero_std": 0.0, "grad_norm": 4.792597770690918, "kl": 0.00011172753875143826, "learning_rate": 1.0896255371393494e-07, "loss": -0.0035244375467300415, "memory(GiB)": 86.07, "reward": 0.8374479413032532, "reward_std": 0.18372361361980438, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8463541865348816, "rewards/PlanningActionSetORM/std": 0.09169033914804459, "rewards/RMReward/mean": 0.7062499523162842, "rewards/RMReward/std": 0.14705441892147064, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 142, "train_speed(iter/s)": 0.019688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/mean_length": 153.0, "completions/min_length": 71.0, "epoch": 0.0021950695361188715, "frac_reward_zero_std": 0.0, "grad_norm": 2.912574529647827, "kl": 0.00026096662622876465, "learning_rate": 1.0972989564149786e-07, "loss": -0.028247270733118057, "memory(GiB)": 86.07, "reward": 0.7584226727485657, "reward_std": 0.1117933988571167, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8796131014823914, "rewards/PlanningActionSetORM/std": 0.11057230830192566, "rewards/RMReward/mean": 0.7281249761581421, "rewards/RMReward/std": 0.12759405374526978, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 143, "train_speed(iter/s)": 0.019645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 114.65625, "completions/min_length": 2.0, "epoch": 0.002210419672735087, "frac_reward_zero_std": 0.0, "grad_norm": 52.242401123046875, "kl": 0.0008967918111011386, "learning_rate": 1.1049723756906078e-07, "loss": 0.03817521408200264, "memory(GiB)": 86.07, "reward": 0.5233045220375061, "reward_std": 0.19403719902038574, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9205449819564819, "rewards/PlanningActionSetORM/std": 0.04307356849312782, "rewards/RMReward/mean": 0.543749988079071, "rewards/RMReward/std": 0.16720746457576752, "rewards/SpatialReasoningORM/mean": 0.45000001788139343, "rewards/SpatialReasoningORM/std": 0.2683281898498535, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 144, "train_speed(iter/s)": 0.019617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 167.03125, "completions/min_length": 77.0, "epoch": 0.0022257698093513033, "frac_reward_zero_std": 0.0, "grad_norm": 3.4162955284118652, "kl": 0.0004002060159109533, "learning_rate": 1.112645794966237e-07, "loss": -0.14850929379463196, "memory(GiB)": 86.07, "reward": 0.7706300020217896, "reward_std": 0.0851755142211914, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8343998193740845, "rewards/PlanningActionSetORM/std": 0.16110707819461823, "rewards/RMReward/mean": 0.7546875476837158, "rewards/RMReward/std": 0.08362682908773422, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 145, "train_speed(iter/s)": 0.019558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 159.6875, "completions/min_length": 71.0, "epoch": 0.002241119945967519, "frac_reward_zero_std": 0.0, "grad_norm": 1.9468125104904175, "kl": 0.0003353680076543242, "learning_rate": 1.1203192142418662e-07, "loss": -0.03705673664808273, "memory(GiB)": 86.07, "reward": 0.5350024700164795, "reward_std": 0.10424958169460297, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8512622117996216, "rewards/PlanningActionSetORM/std": 0.12266240268945694, "rewards/RMReward/mean": 0.4559375047683716, "rewards/RMReward/std": 0.14232763648033142, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 146, "train_speed(iter/s)": 0.019538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 140.34375, "completions/min_length": 74.0, "epoch": 0.002256470082583735, "frac_reward_zero_std": 0.0, "grad_norm": 2.446741819381714, "kl": 8.249818347394466e-05, "learning_rate": 1.1279926335174956e-07, "loss": 0.015683989971876144, "memory(GiB)": 86.07, "reward": 0.65450519323349, "reward_std": 0.11368487775325775, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8225260376930237, "rewards/PlanningActionSetORM/std": 0.16091854870319366, "rewards/RMReward/mean": 0.612500011920929, "rewards/RMReward/std": 0.1361924558877945, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 147, "train_speed(iter/s)": 0.019508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.53125, "completions/min_length": 2.0, "epoch": 0.002271820219199951, "frac_reward_zero_std": 0.0, "grad_norm": 78.85929870605469, "kl": 0.00016276039241347462, "learning_rate": 1.1356660527931247e-07, "loss": 0.11459395289421082, "memory(GiB)": 86.07, "reward": 0.47749999165534973, "reward_std": 0.32901233434677124, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.15000000596046448, "rewards/SpatialReasoningORM/std": 0.2683281898498535, "rewards/VisualPerceptionAccuracy/mean": 0.8125, "rewards/VisualPerceptionAccuracy/std": 0.40311288833618164, "step": 148, "train_speed(iter/s)": 0.019632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/mean_length": 119.625, "completions/min_length": 72.0, "epoch": 0.0022871703558161667, "frac_reward_zero_std": 0.0, "grad_norm": 3.0381650924682617, "kl": 0.0006759357056580484, "learning_rate": 1.143339472068754e-07, "loss": 0.028140880167484283, "memory(GiB)": 86.07, "reward": 0.7091666460037231, "reward_std": 0.1168891191482544, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8458333015441895, "rewards/PlanningActionSetORM/std": 0.12353263795375824, "rewards/RMReward/mean": 0.675000011920929, "rewards/RMReward/std": 0.12951521575450897, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 149, "train_speed(iter/s)": 0.019578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 85.21875, "completions/min_length": 2.0, "epoch": 0.002302520492432383, "frac_reward_zero_std": 0.0, "grad_norm": 30.874263763427734, "kl": 0.0016872722189873457, "learning_rate": 1.1510128913443831e-07, "loss": -0.21352557837963104, "memory(GiB)": 86.07, "reward": 0.036787454038858414, "reward_std": 0.07687192410230637, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.07044991105794907, "rewards/VisualPerceptionAccuracy/std": 0.1412438601255417, "step": 150, "train_speed(iter/s)": 0.019681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.5, "completions/min_length": 2.0, "epoch": 0.0023178706290485985, "frac_reward_zero_std": 1.0, "grad_norm": 9.553929703542963e-05, "kl": 0.0, "learning_rate": 1.1586863106200124e-07, "loss": 0.0, "memory(GiB)": 86.07, "reward": 0.5699999928474426, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6000000238418579, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 151, "train_speed(iter/s)": 0.019782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 207.5625, "completions/min_length": 58.0, "epoch": 0.0023332207656648145, "frac_reward_zero_std": 0.0, "grad_norm": 3.1556665897369385, "kl": 0.00028360000578686595, "learning_rate": 1.1663597298956415e-07, "loss": -0.061553411185741425, "memory(GiB)": 86.07, "reward": 0.3472975790500641, "reward_std": 0.10757724940776825, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8932291865348816, "rewards/PlanningActionSetORM/std": 0.15788587927818298, "rewards/RMReward/mean": 0.6000000238418579, "rewards/RMReward/std": 0.14605934917926788, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.03594931587576866, "rewards/VisualPerceptionAccuracy/std": 0.08929122984409332, "step": 152, "train_speed(iter/s)": 0.0198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.875, "completions/min_length": 2.0, "epoch": 0.00234857090228103, "frac_reward_zero_std": 0.0, "grad_norm": 46.32199478149414, "kl": 0.0, "learning_rate": 1.1740331491712708e-07, "loss": -0.03203430399298668, "memory(GiB)": 86.07, "reward": 0.35624998807907104, "reward_std": 0.20768335461616516, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.2951216399669647, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 153, "train_speed(iter/s)": 0.019921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/mean_length": 121.0, "completions/min_length": 2.0, "epoch": 0.0023639210388972463, "frac_reward_zero_std": 0.0, "grad_norm": 79.7428970336914, "kl": 1.0506317266845144e-05, "learning_rate": 1.1817065684468999e-07, "loss": 0.09244424849748611, "memory(GiB)": 86.07, "reward": 0.3832343816757202, "reward_std": 0.21296697854995728, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7667186260223389, "rewards/PlanningActionSetORM/std": 0.15033815801143646, "rewards/RMReward/mean": 0.543749988079071, "rewards/RMReward/std": 0.18337120115756989, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 154, "train_speed(iter/s)": 0.01985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 85.03125, "completions/min_length": 14.0, "epoch": 0.002379271175513462, "frac_reward_zero_std": 0.0, "grad_norm": 4.4873857498168945, "kl": 0.00017323797510471195, "learning_rate": 1.1893799877225293e-07, "loss": -0.03547799587249756, "memory(GiB)": 86.07, "reward": 0.47755274176597595, "reward_std": 0.13134823739528656, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.014480462297797203, "rewards/VisualPerceptionAccuracy/std": 0.025196490809321404, "step": 155, "train_speed(iter/s)": 0.01986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/mean_length": 125.71875, "completions/min_length": 86.0, "epoch": 0.002394621312129678, "frac_reward_zero_std": 0.0, "grad_norm": 2.9835965633392334, "kl": 0.00039333413587883115, "learning_rate": 1.1970534069981586e-07, "loss": -0.00157972052693367, "memory(GiB)": 86.07, "reward": 0.7211830615997314, "reward_std": 0.1075335294008255, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8559151887893677, "rewards/PlanningActionSetORM/std": 0.12541404366493225, "rewards/RMReward/mean": 0.6875, "rewards/RMReward/std": 0.13678333163261414, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 156, "train_speed(iter/s)": 0.019875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 117.625, "completions/min_length": 72.0, "epoch": 0.0024099714487458936, "frac_reward_zero_std": 0.0, "grad_norm": 3.199180841445923, "kl": 0.0003329627506900579, "learning_rate": 1.2047268262737878e-07, "loss": -0.03770986944437027, "memory(GiB)": 86.07, "reward": 0.4493650794029236, "reward_std": 0.193276047706604, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7690972089767456, "rewards/PlanningActionSetORM/std": 0.15548236668109894, "rewards/RMReward/mean": 0.721875011920929, "rewards/RMReward/std": 0.1110086441040039, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1674107015132904, "rewards/VisualPerceptionAccuracy/std": 0.28084835410118103, "step": 157, "train_speed(iter/s)": 0.019884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 177.875, "completions/min_length": 2.0, "epoch": 0.0024253215853621097, "frac_reward_zero_std": 0.0, "grad_norm": 43.55288314819336, "kl": 0.00020078114175703377, "learning_rate": 1.2124002455494168e-07, "loss": 0.40374088287353516, "memory(GiB)": 86.07, "reward": 0.3639407455921173, "reward_std": 0.2882622480392456, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3639407455921173, "rewards/VisualPerceptionAccuracy/std": 0.4728466868400574, "step": 158, "train_speed(iter/s)": 0.019976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/mean_length": 141.40625, "completions/min_length": 86.0, "epoch": 0.002440671721978326, "frac_reward_zero_std": 0.0, "grad_norm": 2.1604208946228027, "kl": 0.0012266155099496245, "learning_rate": 1.220073664825046e-07, "loss": 0.014137417078018188, "memory(GiB)": 86.07, "reward": 0.5472016334533691, "reward_std": 0.12547503411769867, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8185081481933594, "rewards/PlanningActionSetORM/std": 0.12435317784547806, "rewards/RMReward/mean": 0.4793749749660492, "rewards/RMReward/std": 0.16694478690624237, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 159, "train_speed(iter/s)": 0.019801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/mean_length": 7.96875, "completions/min_length": 2.0, "epoch": 0.0024560218585945415, "frac_reward_zero_std": 0.0, "grad_norm": 43.65503692626953, "kl": 0.010644784197211266, "learning_rate": 1.2277470841006753e-07, "loss": 0.10762228816747665, "memory(GiB)": 86.07, "reward": 0.6943750381469727, "reward_std": 0.23711106181144714, "rewards/MathAnswerFormat/mean": 0.46875, "rewards/MathAnswerFormat/std": 0.507007360458374, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.706250011920929, "rewards/SpatialReasoningORM/std": 0.3004700541496277, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 160, "train_speed(iter/s)": 0.019898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 143.5, "completions/min_length": 73.0, "epoch": 0.0024713719952107575, "frac_reward_zero_std": 0.0, "grad_norm": 3.245065927505493, "kl": 0.0005752092693001032, "learning_rate": 1.2354205033763046e-07, "loss": -0.024067385122179985, "memory(GiB)": 86.07, "reward": 0.41970717906951904, "reward_std": 0.10182783752679825, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8697916269302368, "rewards/PlanningActionSetORM/std": 0.07384136319160461, "rewards/RMReward/mean": 0.6812499761581421, "rewards/RMReward/std": 0.10626226663589478, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1204560250043869, "rewards/VisualPerceptionAccuracy/std": 0.11653956770896912, "step": 161, "train_speed(iter/s)": 0.019914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 135.90625, "completions/min_length": 82.0, "epoch": 0.002486722131826973, "frac_reward_zero_std": 0.0, "grad_norm": 2.45306134223938, "kl": 0.000405239115934819, "learning_rate": 1.243093922651934e-07, "loss": -0.02457614615559578, "memory(GiB)": 86.07, "reward": 0.642075777053833, "reward_std": 0.17048686742782593, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8491286039352417, "rewards/PlanningActionSetORM/std": 0.10417639464139938, "rewards/RMReward/mean": 0.5903125405311584, "rewards/RMReward/std": 0.22100189328193665, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 162, "train_speed(iter/s)": 0.019634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 198.875, "completions/min_length": 84.0, "epoch": 0.0025020722684431893, "frac_reward_zero_std": 0.0, "grad_norm": 1.7383829355239868, "kl": 0.0005607217899523675, "learning_rate": 1.250767341927563e-07, "loss": 0.09069711714982986, "memory(GiB)": 86.07, "reward": 0.42620745301246643, "reward_std": 0.14278507232666016, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9192708134651184, "rewards/PlanningActionSetORM/std": 0.06910263746976852, "rewards/RMReward/mean": 0.7768750190734863, "rewards/RMReward/std": 0.23038284480571747, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.04706066846847534, "rewards/VisualPerceptionAccuracy/std": 0.1051437109708786, "step": 163, "train_speed(iter/s)": 0.019614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/mean_length": 53.21875, "completions/min_length": 2.0, "epoch": 0.002517422405059405, "frac_reward_zero_std": 0.0, "grad_norm": 58.881187438964844, "kl": 0.0005240394966676831, "learning_rate": 1.2584407612031924e-07, "loss": 0.019235342741012573, "memory(GiB)": 86.07, "reward": 0.6488646268844604, "reward_std": 0.1720786690711975, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8755208253860474, "rewards/PlanningActionSetORM/std": 0.051050879061222076, "rewards/RMReward/mean": 0.8243749737739563, "rewards/RMReward/std": 0.14296706020832062, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 164, "train_speed(iter/s)": 0.019626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/mean_length": 162.84375, "completions/min_length": 8.0, "epoch": 0.002532772541675621, "frac_reward_zero_std": 0.0, "grad_norm": 9.719059944152832, "kl": 0.03667553886771202, "learning_rate": 1.2661141804788217e-07, "loss": -0.0448753647506237, "memory(GiB)": 86.07, "reward": 0.4214525818824768, "reward_std": 0.2657294273376465, "rewards/MathAnswerFormat/mean": 0.625, "rewards/MathAnswerFormat/std": 0.5, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.3862210214138031, "rewards/VisualPerceptionAccuracy/mean": 0.15853026509284973, "rewards/VisualPerceptionAccuracy/std": 0.15413551032543182, "step": 165, "train_speed(iter/s)": 0.019714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.875, "completions/min_length": 2.0, "epoch": 0.0025481226782918366, "frac_reward_zero_std": 0.0, "grad_norm": 60.57771682739258, "kl": 0.00036991003435105085, "learning_rate": 1.2737875997544507e-07, "loss": -0.042265843600034714, "memory(GiB)": 86.07, "reward": 0.30281248688697815, "reward_std": 0.24234303832054138, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.3187500238418579, "rewards/SpatialReasoningORM/std": 0.30420443415641785, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 166, "train_speed(iter/s)": 0.019825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/mean_length": 102.625, "completions/min_length": 2.0, "epoch": 0.0025634728149080527, "frac_reward_zero_std": 0.0, "grad_norm": 49.494422912597656, "kl": 0.0006331161130219698, "learning_rate": 1.28146101903008e-07, "loss": -0.0733124166727066, "memory(GiB)": 86.07, "reward": 0.4631495475769043, "reward_std": 0.24267098307609558, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8158702850341797, "rewards/PlanningActionSetORM/std": 0.10564067214727402, "rewards/RMReward/mean": 0.5531250238418579, "rewards/RMReward/std": 0.2355622798204422, "rewards/SpatialReasoningORM/mean": 0.3375000059604645, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 167, "train_speed(iter/s)": 0.019711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 88.6875, "completions/min_length": 9.0, "epoch": 0.0025788229515242684, "frac_reward_zero_std": 0.0, "grad_norm": 13.892064094543457, "kl": 0.019336983561515808, "learning_rate": 1.2891344383057092e-07, "loss": -0.06860271841287613, "memory(GiB)": 86.07, "reward": 0.7200061082839966, "reward_std": 0.1328490674495697, "rewards/MathAnswerFormat/mean": 0.8125, "rewards/MathAnswerFormat/std": 0.40311288833618164, "rewards/PlanningActionSetORM/mean": 0.8181862831115723, "rewards/PlanningActionSetORM/std": 0.11600729078054428, "rewards/RMReward/mean": 0.44624999165534973, "rewards/RMReward/std": 0.11729592829942703, "rewards/SpatialReasoningORM/mean": 0.9249999523162842, "rewards/SpatialReasoningORM/std": 0.16124515235424042, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 168, "train_speed(iter/s)": 0.019666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/mean_length": 103.46875, "completions/min_length": 80.0, "epoch": 0.0025941730881404845, "frac_reward_zero_std": 0.0, "grad_norm": 2.839535713195801, "kl": 0.0005613848916254938, "learning_rate": 1.2968078575813384e-07, "loss": 0.00016102194786071777, "memory(GiB)": 86.07, "reward": 0.7705208659172058, "reward_std": 0.10289403796195984, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8901041746139526, "rewards/PlanningActionSetORM/std": 0.1071677878499031, "rewards/RMReward/mean": 0.7406250238418579, "rewards/RMReward/std": 0.13821294903755188, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 169, "train_speed(iter/s)": 0.01965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/mean_length": 59.125, "completions/min_length": 2.0, "epoch": 0.0026095232247567005, "frac_reward_zero_std": 0.0, "grad_norm": 24.05998992919922, "kl": 0.0002340175851713866, "learning_rate": 1.3044812768569674e-07, "loss": -0.03769712150096893, "memory(GiB)": 86.07, "reward": 0.8607738018035889, "reward_std": 0.16189581155776978, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8702380657196045, "rewards/PlanningActionSetORM/std": 0.0820559710264206, "rewards/RMReward/mean": 0.762499988079071, "rewards/RMReward/std": 0.08266398310661316, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.9375, "rewards/VisualPerceptionAccuracy/std": 0.25, "step": 170, "train_speed(iter/s)": 0.019646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/mean_length": 2.8125, "completions/min_length": 2.0, "epoch": 0.002624873361372916, "frac_reward_zero_std": 0.0, "grad_norm": 74.74657440185547, "kl": 0.0007255358505062759, "learning_rate": 1.3121546961325967e-07, "loss": -0.03305456414818764, "memory(GiB)": 86.07, "reward": 0.3162499964237213, "reward_std": 0.2601962089538574, "rewards/MathAnswerFormat/mean": 0.03125, "rewards/MathAnswerFormat/std": 0.1767766922712326, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.33125001192092896, "rewards/SpatialReasoningORM/std": 0.32372578978538513, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 171, "train_speed(iter/s)": 0.019753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/mean_length": 50.875, "completions/min_length": 3.0, "epoch": 0.0026402234979891323, "frac_reward_zero_std": 0.0, "grad_norm": 18.929349899291992, "kl": 9.765474533196539e-05, "learning_rate": 1.319828115408226e-07, "loss": 0.03397456929087639, "memory(GiB)": 86.07, "reward": 0.3127901554107666, "reward_std": 0.12149116396903992, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7372767925262451, "rewards/PlanningActionSetORM/std": 0.18836915493011475, "rewards/RMReward/mean": 0.5531250238418579, "rewards/RMReward/std": 0.09911063313484192, "rewards/SpatialReasoningORM/mean": 0.03750000149011612, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 172, "train_speed(iter/s)": 0.019782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/mean_length": 112.875, "completions/min_length": 49.0, "epoch": 0.002655573634605348, "frac_reward_zero_std": 0.0, "grad_norm": 3.242769241333008, "kl": 0.0006365178851410747, "learning_rate": 1.3275015346838552e-07, "loss": 0.06334810703992844, "memory(GiB)": 86.07, "reward": 0.694337785243988, "reward_std": 0.11498609185218811, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8966889977455139, "rewards/PlanningActionSetORM/std": 0.11771944165229797, "rewards/RMReward/mean": 0.643750011920929, "rewards/RMReward/std": 0.15900394320487976, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 173, "train_speed(iter/s)": 0.019772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 140.875, "completions/min_length": 71.0, "epoch": 0.002670923771221564, "frac_reward_zero_std": 0.0, "grad_norm": 2.489987850189209, "kl": 0.0005869677988812327, "learning_rate": 1.3351749539594845e-07, "loss": 0.0215819850564003, "memory(GiB)": 86.07, "reward": 0.7136582732200623, "reward_std": 0.12073652446269989, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8895410895347595, "rewards/PlanningActionSetORM/std": 0.13047410547733307, "rewards/RMReward/mean": 0.6696875095367432, "rewards/RMReward/std": 0.17051126062870026, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 174, "train_speed(iter/s)": 0.019685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/mean_length": 166.21875, "completions/min_length": 100.0, "epoch": 0.0026862739078377797, "frac_reward_zero_std": 0.0, "grad_norm": 1.4268220663070679, "kl": 0.00040890302625484765, "learning_rate": 1.3428483732351138e-07, "loss": -0.001752672716975212, "memory(GiB)": 86.07, "reward": 0.6301761865615845, "reward_std": 0.10907380282878876, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8083807229995728, "rewards/PlanningActionSetORM/std": 0.06963157653808594, "rewards/RMReward/mean": 0.5856249928474426, "rewards/RMReward/std": 0.1365458071231842, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 175, "train_speed(iter/s)": 0.019633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/mean_length": 71.6875, "completions/min_length": 15.0, "epoch": 0.0027016240444539957, "frac_reward_zero_std": 0.0, "grad_norm": 4.497776031494141, "kl": 0.00021599960746243596, "learning_rate": 1.350521792510743e-07, "loss": -0.0014239326119422913, "memory(GiB)": 86.07, "reward": 0.8023201823234558, "reward_std": 0.19645912945270538, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.841951847076416, "rewards/PlanningActionSetORM/std": 0.06891711801290512, "rewards/RMReward/mean": 0.6937500238418579, "rewards/RMReward/std": 0.08341662585735321, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 176, "train_speed(iter/s)": 0.019651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 96.59375, "completions/min_length": 55.0, "epoch": 0.0027169741810702114, "frac_reward_zero_std": 0.0, "grad_norm": 3.299380302429199, "kl": 0.0003169958363287151, "learning_rate": 1.358195211786372e-07, "loss": 0.06699047982692719, "memory(GiB)": 86.07, "reward": 0.6268649101257324, "reward_std": 0.09237731248140335, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9140625, "rewards/PlanningActionSetORM/std": 0.09187949448823929, "rewards/RMReward/mean": 0.7337499856948853, "rewards/RMReward/std": 0.17289207875728607, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.483917236328125, "rewards/VisualPerceptionAccuracy/std": 0.03608458861708641, "step": 177, "train_speed(iter/s)": 0.019677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/mean_length": 54.5, "completions/min_length": 2.0, "epoch": 0.0027323243176864275, "frac_reward_zero_std": 0.0, "grad_norm": 50.43584060668945, "kl": 0.0008460129029117525, "learning_rate": 1.3658686310620013e-07, "loss": 0.043660975992679596, "memory(GiB)": 86.07, "reward": 0.6192187666893005, "reward_std": 0.1365564465522766, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8890624642372131, "rewards/PlanningActionSetORM/std": 0.0735803097486496, "rewards/RMReward/mean": 0.7468750476837158, "rewards/RMReward/std": 0.04989573359489441, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 178, "train_speed(iter/s)": 0.019536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/mean_length": 57.4375, "completions/min_length": 14.0, "epoch": 0.002747674454302643, "frac_reward_zero_std": 0.0, "grad_norm": 6.33228874206543, "kl": 0.004761481191962957, "learning_rate": 1.3735420503376305e-07, "loss": -0.012319600209593773, "memory(GiB)": 86.07, "reward": 0.6237499713897705, "reward_std": 0.28443285822868347, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8718750476837158, "rewards/PlanningActionSetORM/std": 0.08508574217557907, "rewards/RMReward/mean": 0.7593749761581421, "rewards/RMReward/std": 0.09168560802936554, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 179, "train_speed(iter/s)": 0.019564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 139.125, "completions/min_length": 71.0, "epoch": 0.002763024590918859, "frac_reward_zero_std": 0.0, "grad_norm": 2.1063451766967773, "kl": 0.00027557997964322567, "learning_rate": 1.3812154696132598e-07, "loss": -0.10572461038827896, "memory(GiB)": 86.07, "reward": 0.39581194519996643, "reward_std": 0.058097176253795624, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9589589834213257, "rewards/PlanningActionSetORM/std": 0.10669484734535217, "rewards/RMReward/mean": 0.7337499856948853, "rewards/RMReward/std": 0.10111874341964722, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.012832128442823887, "rewards/VisualPerceptionAccuracy/std": 0.028818344697356224, "step": 180, "train_speed(iter/s)": 0.019465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/mean_length": 66.0625, "completions/min_length": 3.0, "epoch": 0.0027783747275350753, "frac_reward_zero_std": 0.0, "grad_norm": 91.76191711425781, "kl": 0.0011735500302165747, "learning_rate": 1.3888888888888888e-07, "loss": 0.0024172570556402206, "memory(GiB)": 86.07, "reward": 0.45359376072883606, "reward_std": 0.20240771770477295, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8421875238418579, "rewards/PlanningActionSetORM/std": 0.09206824749708176, "rewards/RMReward/mean": 0.65625, "rewards/RMReward/std": 0.1400892585515976, "rewards/SpatialReasoningORM/mean": 0.22500000894069672, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 181, "train_speed(iter/s)": 0.019442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/mean_length": 106.84375, "completions/min_length": 78.0, "epoch": 0.002793724864151291, "frac_reward_zero_std": 0.0, "grad_norm": 3.006871461868286, "kl": 0.0009073324035853148, "learning_rate": 1.3965623081645183e-07, "loss": 0.026378195732831955, "memory(GiB)": 86.07, "reward": 0.7963764667510986, "reward_std": 0.09751053154468536, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8943824172019958, "rewards/PlanningActionSetORM/std": 0.10906713455915451, "rewards/RMReward/mean": 0.7718750238418579, "rewards/RMReward/std": 0.1069650948047638, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 182, "train_speed(iter/s)": 0.019461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/mean_length": 108.125, "completions/min_length": 77.0, "epoch": 0.002809075000767507, "frac_reward_zero_std": 0.0, "grad_norm": 2.123296022415161, "kl": 0.0005134689854457974, "learning_rate": 1.4042357274401476e-07, "loss": -0.006487447768449783, "memory(GiB)": 86.07, "reward": 0.7429948449134827, "reward_std": 0.11961928009986877, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8149739503860474, "rewards/PlanningActionSetORM/std": 0.08887399733066559, "rewards/RMReward/mean": 0.7250000238418579, "rewards/RMReward/std": 0.15811389684677124, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 183, "train_speed(iter/s)": 0.019409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/mean_length": 191.125, "completions/min_length": 68.0, "epoch": 0.0028244251373837227, "frac_reward_zero_std": 0.0, "grad_norm": 3.496533155441284, "kl": 0.0008764683734625578, "learning_rate": 1.4119091467157766e-07, "loss": -0.18733027577400208, "memory(GiB)": 86.07, "reward": 0.2501721978187561, "reward_std": 0.08050265163183212, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7369791865348816, "rewards/PlanningActionSetORM/std": 0.20514151453971863, "rewards/RMReward/mean": 0.43437498807907104, "rewards/RMReward/std": 0.1399032473564148, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.005448571871966124, "rewards/VisualPerceptionAccuracy/std": 0.019213248044252396, "step": 184, "train_speed(iter/s)": 0.019376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/mean_length": 250.34375, "completions/min_length": 9.0, "epoch": 0.0028397752739999387, "frac_reward_zero_std": 0.0, "grad_norm": 5.434896469116211, "kl": 0.02487398311495781, "learning_rate": 1.4195825659914058e-07, "loss": 0.026796061545610428, "memory(GiB)": 86.07, "reward": 0.17107875645160675, "reward_std": 0.2759225070476532, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.17653250694274902, "rewards/VisualPerceptionAccuracy/std": 0.22590088844299316, "step": 185, "train_speed(iter/s)": 0.019439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.5, "completions/min_length": 2.0, "epoch": 0.0028551254106161544, "frac_reward_zero_std": 0.0, "grad_norm": 50.88827133178711, "kl": -0.0006103515625, "learning_rate": 1.427255985267035e-07, "loss": -4.842877388000488e-07, "memory(GiB)": 86.07, "reward": 0.534375011920929, "reward_std": 0.14249999821186066, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.14756081998348236, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 186, "train_speed(iter/s)": 0.019537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/mean_length": 58.3125, "completions/min_length": 2.0, "epoch": 0.0028704755472323705, "frac_reward_zero_std": 0.0, "grad_norm": 61.53984832763672, "kl": 0.00015083412290550768, "learning_rate": 1.4349294045426644e-07, "loss": 0.03545809164643288, "memory(GiB)": 86.07, "reward": 0.47898438572883606, "reward_std": 0.15672244131565094, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.774218738079071, "rewards/PlanningActionSetORM/std": 0.07429013401269913, "rewards/RMReward/mean": 0.42500001192092896, "rewards/RMReward/std": 0.09831921011209488, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 187, "train_speed(iter/s)": 0.019528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 9.0625, "completions/min_length": 2.0, "epoch": 0.002885825683848586, "frac_reward_zero_std": 0.0, "grad_norm": 26.7220458984375, "kl": 6.401975406333804e-05, "learning_rate": 1.4426028238182936e-07, "loss": -0.05911121517419815, "memory(GiB)": 86.07, "reward": 0.5296875238418579, "reward_std": 0.31653892993927, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.53125, "rewards/SpatialReasoningORM/std": 0.3754030168056488, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 188, "train_speed(iter/s)": 0.019616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 8.71875, "completions/min_length": 2.0, "epoch": 0.002901175820464802, "frac_reward_zero_std": 0.0, "grad_norm": 68.50410461425781, "kl": 0.0007803559419699013, "learning_rate": 1.4502762430939226e-07, "loss": -0.018325600773096085, "memory(GiB)": 86.07, "reward": 0.5712499618530273, "reward_std": 0.33484601974487305, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5750000476837158, "rewards/SpatialReasoningORM/std": 0.3793032765388489, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 189, "train_speed(iter/s)": 0.019705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/mean_length": 141.96875, "completions/min_length": 84.0, "epoch": 0.002916525957081018, "frac_reward_zero_std": 0.0, "grad_norm": 1.9695392847061157, "kl": 0.00031497114105150104, "learning_rate": 1.457949662369552e-07, "loss": 0.004786044359207153, "memory(GiB)": 86.07, "reward": 0.6608043909072876, "reward_std": 0.08771329373121262, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8665218353271484, "rewards/PlanningActionSetORM/std": 0.07403690367937088, "rewards/RMReward/mean": 0.609375, "rewards/RMReward/std": 0.17890234291553497, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 190, "train_speed(iter/s)": 0.019698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 165.375, "completions/min_length": 81.0, "epoch": 0.002931876093697234, "frac_reward_zero_std": 0.0, "grad_norm": 2.4280545711517334, "kl": 0.0002445073041599244, "learning_rate": 1.4656230816451812e-07, "loss": 0.06451451778411865, "memory(GiB)": 86.07, "reward": 0.3478536307811737, "reward_std": 0.10739007592201233, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7599674463272095, "rewards/PlanningActionSetORM/std": 0.07156157493591309, "rewards/RMReward/mean": 0.550000011920929, "rewards/RMReward/std": 0.1390443593263626, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.10371367633342743, "rewards/VisualPerceptionAccuracy/std": 0.10802417248487473, "step": 191, "train_speed(iter/s)": 0.019656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/mean_length": 42.8125, "completions/min_length": 2.0, "epoch": 0.0029472262303134496, "frac_reward_zero_std": 0.0, "grad_norm": 56.467376708984375, "kl": 0.00020461613894440234, "learning_rate": 1.4732965009208104e-07, "loss": -0.1312604695558548, "memory(GiB)": 86.07, "reward": 0.4478646218776703, "reward_std": 0.13576708734035492, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8723958134651184, "rewards/PlanningActionSetORM/std": 0.0931306704878807, "rewards/RMReward/mean": 0.8125, "rewards/RMReward/std": 0.08850612491369247, "rewards/SpatialReasoningORM/mean": 0.07500000298023224, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 192, "train_speed(iter/s)": 0.019677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 204.65625, "completions/min_length": 64.0, "epoch": 0.0029625763669296657, "frac_reward_zero_std": 0.0, "grad_norm": 2.5708839893341064, "kl": 0.00029129101312719285, "learning_rate": 1.4809699201964397e-07, "loss": 0.01226760819554329, "memory(GiB)": 86.07, "reward": 0.15644283592700958, "reward_std": 0.12625691294670105, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.15644283592700958, "rewards/VisualPerceptionAccuracy/std": 0.19397322833538055, "step": 193, "train_speed(iter/s)": 0.019746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 115.65625, "completions/min_length": 2.0, "epoch": 0.0029779265035458817, "frac_reward_zero_std": 0.0, "grad_norm": 45.93134689331055, "kl": 0.00043410190846771, "learning_rate": 1.488643339472069e-07, "loss": -0.03489500284194946, "memory(GiB)": 86.07, "reward": 0.6197049021720886, "reward_std": 0.140390083193779, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8407988548278809, "rewards/PlanningActionSetORM/std": 0.08558391779661179, "rewards/RMReward/mean": 0.715624988079071, "rewards/RMReward/std": 0.10119082778692245, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 194, "train_speed(iter/s)": 0.019712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 175.65625, "completions/min_length": 94.0, "epoch": 0.0029932766401620974, "frac_reward_zero_std": 0.0, "grad_norm": 3.160855531692505, "kl": 0.0008639338193461299, "learning_rate": 1.4963167587476982e-07, "loss": 0.06763618439435959, "memory(GiB)": 86.07, "reward": 0.5157697200775146, "reward_std": 0.09845881164073944, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8552083373069763, "rewards/PlanningActionSetORM/std": 0.07232097536325455, "rewards/RMReward/mean": 0.8118749856948853, "rewards/RMReward/std": 0.09446119517087936, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.21099771559238434, "rewards/VisualPerceptionAccuracy/std": 0.12273997068405151, "step": 195, "train_speed(iter/s)": 0.019692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/mean_length": 57.0625, "completions/min_length": 3.0, "epoch": 0.0030086267767783135, "frac_reward_zero_std": 0.0, "grad_norm": 40.37871551513672, "kl": 0.0009451184305362403, "learning_rate": 1.5039901780233272e-07, "loss": -0.06459716707468033, "memory(GiB)": 86.07, "reward": 0.5778645873069763, "reward_std": 0.1935117095708847, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9567708373069763, "rewards/PlanningActionSetORM/std": 0.10315480083227158, "rewards/RMReward/mean": 0.715624988079071, "rewards/RMReward/std": 0.13870683312416077, "rewards/SpatialReasoningORM/mean": 0.4125000238418579, "rewards/SpatialReasoningORM/std": 0.28722813725471497, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 196, "train_speed(iter/s)": 0.019698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 103.53125, "completions/min_length": 51.0, "epoch": 0.003023976913394529, "frac_reward_zero_std": 0.0, "grad_norm": 3.0366902351379395, "kl": 0.0005397515487857163, "learning_rate": 1.5116635972989565e-07, "loss": 0.09202316403388977, "memory(GiB)": 86.07, "reward": 0.73213791847229, "reward_std": 0.0990568995475769, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8481894731521606, "rewards/PlanningActionSetORM/std": 0.1315530687570572, "rewards/RMReward/mean": 0.703125, "rewards/RMReward/std": 0.10993950068950653, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 197, "train_speed(iter/s)": 0.019641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/mean_length": 153.65625, "completions/min_length": 67.0, "epoch": 0.003039327050010745, "frac_reward_zero_std": 0.0, "grad_norm": 2.40971302986145, "kl": 0.0027892671059817076, "learning_rate": 1.5193370165745857e-07, "loss": -0.12365936487913132, "memory(GiB)": 86.07, "reward": 0.38106268644332886, "reward_std": 0.14527641236782074, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8390682339668274, "rewards/PlanningActionSetORM/std": 0.09587598592042923, "rewards/RMReward/mean": 0.4156249761581421, "rewards/RMReward/std": 0.1220911517739296, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2618117034435272, "rewards/VisualPerceptionAccuracy/std": 0.18244947493076324, "step": 198, "train_speed(iter/s)": 0.019628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 185.53125, "completions/min_length": 85.0, "epoch": 0.003054677186626961, "frac_reward_zero_std": 0.0, "grad_norm": 2.7426233291625977, "kl": 0.0003959039750043303, "learning_rate": 1.527010435850215e-07, "loss": -0.09707143902778625, "memory(GiB)": 86.07, "reward": 0.4150947332382202, "reward_std": 0.11309171468019485, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7690104246139526, "rewards/PlanningActionSetORM/std": 0.11391567438840866, "rewards/RMReward/mean": 0.5718749761581421, "rewards/RMReward/std": 0.11827757209539413, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2188873589038849, "rewards/VisualPerceptionAccuracy/std": 0.12423893064260483, "step": 199, "train_speed(iter/s)": 0.019564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 123.3125, "completions/min_length": 2.0, "epoch": 0.003070027323243177, "frac_reward_zero_std": 0.0, "grad_norm": 56.9891471862793, "kl": 0.0001718083512969315, "learning_rate": 1.5346838551258443e-07, "loss": 0.1028764545917511, "memory(GiB)": 86.07, "reward": 0.25741684436798096, "reward_std": 0.2715482711791992, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.15000000596046448, "rewards/SpatialReasoningORM/std": 0.2683281898498535, "rewards/VisualPerceptionAccuracy/mean": 0.37233367562294006, "rewards/VisualPerceptionAccuracy/std": 0.2881848216056824, "step": 200, "train_speed(iter/s)": 0.019631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/mean_length": 555.875, "completions/min_length": 119.0, "epoch": 0.0030853774598593926, "frac_reward_zero_std": 0.0, "grad_norm": 1.3889249563217163, "kl": 0.0003022163291461766, "learning_rate": 1.5423572744014735e-07, "loss": 0.0766560360789299, "memory(GiB)": 86.07, "reward": 0.3135453164577484, "reward_std": 0.13177496194839478, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8426666259765625, "rewards/PlanningActionSetORM/std": 0.09181860834360123, "rewards/RMReward/mean": 0.515625, "rewards/RMReward/std": 0.15244536101818085, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.04605727270245552, "rewards/VisualPerceptionAccuracy/std": 0.13642750680446625, "step": 201, "train_speed(iter/s)": 0.019475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/mean_length": 45.375, "completions/min_length": 2.0, "epoch": 0.0031007275964756087, "frac_reward_zero_std": 0.0, "grad_norm": 86.77337646484375, "kl": 0.0012112524127587676, "learning_rate": 1.5500306936771028e-07, "loss": 0.12275737524032593, "memory(GiB)": 86.07, "reward": 0.6272395849227905, "reward_std": 0.16926854848861694, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9442708492279053, "rewards/PlanningActionSetORM/std": 0.06643841415643692, "rewards/RMReward/mean": 0.7531249523162842, "rewards/RMReward/std": 0.12578918039798737, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 202, "train_speed(iter/s)": 0.019425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.6875, "completions/min_length": 2.0, "epoch": 0.0031160777330918243, "frac_reward_zero_std": 0.0, "grad_norm": 39.40776443481445, "kl": -0.00017755682347342372, "learning_rate": 1.5577041129527318e-07, "loss": -0.049878500401973724, "memory(GiB)": 86.07, "reward": 0.2671875059604645, "reward_std": 0.1685960292816162, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.2812500298023224, "rewards/SpatialReasoningORM/std": 0.30420440435409546, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 203, "train_speed(iter/s)": 0.019452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/mean_length": 72.25, "completions/min_length": 2.0, "epoch": 0.0031314278697080404, "frac_reward_zero_std": 0.0, "grad_norm": 107.17088317871094, "kl": 0.001117939013056457, "learning_rate": 1.5653775322283613e-07, "loss": 0.12528467178344727, "memory(GiB)": 86.07, "reward": 0.43454986810684204, "reward_std": 0.20760604739189148, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7329984903335571, "rewards/PlanningActionSetORM/std": 0.16012591123580933, "rewards/RMReward/mean": 0.546875, "rewards/RMReward/std": 0.153263121843338, "rewards/SpatialReasoningORM/mean": 0.30000001192092896, "rewards/SpatialReasoningORM/std": 0.3098386824131012, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 204, "train_speed(iter/s)": 0.019452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/mean_length": 51.125, "completions/min_length": 3.0, "epoch": 0.0031467780063242565, "frac_reward_zero_std": 0.0, "grad_norm": 20.892126083374023, "kl": 0.0006054036784917116, "learning_rate": 1.5730509515039906e-07, "loss": 0.01016203686594963, "memory(GiB)": 86.07, "reward": 0.6496875286102295, "reward_std": 0.1302838921546936, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.887499988079071, "rewards/PlanningActionSetORM/std": 0.08366600424051285, "rewards/RMReward/mean": 0.734375, "rewards/RMReward/std": 0.1399032473564148, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 205, "train_speed(iter/s)": 0.019466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/mean_length": 178.25, "completions/min_length": 2.0, "epoch": 0.003162128142940472, "frac_reward_zero_std": 0.0, "grad_norm": 124.34306335449219, "kl": 0.0002360454382142052, "learning_rate": 1.5807243707796196e-07, "loss": 0.10065513849258423, "memory(GiB)": 90.94, "reward": 0.369777113199234, "reward_std": 0.21382814645767212, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7408961057662964, "rewards/PlanningActionSetORM/std": 0.17388704419136047, "rewards/RMReward/mean": 0.42750000953674316, "rewards/RMReward/std": 0.1567375808954239, "rewards/SpatialReasoningORM/mean": 0.26249998807907104, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 206, "train_speed(iter/s)": 0.01942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 142.34375, "completions/min_length": 2.0, "epoch": 0.003177478279556688, "frac_reward_zero_std": 0.0, "grad_norm": 31.376218795776367, "kl": 0.00015798605454619974, "learning_rate": 1.5883977900552488e-07, "loss": -0.0047413669526577, "memory(GiB)": 90.94, "reward": 0.3106735944747925, "reward_std": 0.18632298707962036, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8692361116409302, "rewards/PlanningActionSetORM/std": 0.14223520457744598, "rewards/RMReward/mean": 0.48124998807907104, "rewards/RMReward/std": 0.14705440402030945, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0625, "rewards/VisualPerceptionAccuracy/std": 0.25, "step": 207, "train_speed(iter/s)": 0.019389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/mean_length": 258.75, "completions/min_length": 69.0, "epoch": 0.003192828416172904, "frac_reward_zero_std": 0.0, "grad_norm": 2.886488914489746, "kl": 0.00034138973569497466, "learning_rate": 1.596071209330878e-07, "loss": 0.07783769071102142, "memory(GiB)": 90.94, "reward": 0.41910696029663086, "reward_std": 0.051772069185972214, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9401041865348816, "rewards/PlanningActionSetORM/std": 0.13605618476867676, "rewards/RMReward/mean": 0.8093750476837158, "rewards/RMReward/std": 0.09868932515382767, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0026930617168545723, "rewards/VisualPerceptionAccuracy/std": 0.008740812540054321, "step": 208, "train_speed(iter/s)": 0.019383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/mean_length": 90.03125, "completions/min_length": 2.0, "epoch": 0.00320817855278912, "frac_reward_zero_std": 0.0, "grad_norm": 39.659854888916016, "kl": 0.0008060346590355039, "learning_rate": 1.6037446286065073e-07, "loss": -0.06425384432077408, "memory(GiB)": 90.94, "reward": 0.3850120007991791, "reward_std": 0.17081184685230255, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8401199579238892, "rewards/PlanningActionSetORM/std": 0.08143284171819687, "rewards/RMReward/mean": 0.5743750333786011, "rewards/RMReward/std": 0.10538619756698608, "rewards/SpatialReasoningORM/mean": 0.15000000596046448, "rewards/SpatialReasoningORM/std": 0.26832816004753113, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 209, "train_speed(iter/s)": 0.019368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 9.09375, "completions/min_length": 2.0, "epoch": 0.0032235286894053356, "frac_reward_zero_std": 0.0, "grad_norm": 80.17959594726562, "kl": 0.00014287017984315753, "learning_rate": 1.6114180478821363e-07, "loss": 0.09301380068063736, "memory(GiB)": 90.94, "reward": 0.3100000023841858, "reward_std": 0.32791197299957275, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.30000001192092896, "rewards/SpatialReasoningORM/std": 0.3627849221229553, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 210, "train_speed(iter/s)": 0.019448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 98.625, "completions/min_length": 2.0, "epoch": 0.0032388788260215517, "frac_reward_zero_std": 0.0, "grad_norm": 23.96139907836914, "kl": -7.491221185773611e-05, "learning_rate": 1.6190914671577656e-07, "loss": -0.03564847260713577, "memory(GiB)": 90.94, "reward": 0.5787662267684937, "reward_std": 0.14771077036857605, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8407867550849915, "rewards/PlanningActionSetORM/std": 0.08060789853334427, "rewards/RMReward/mean": 0.5687500238418579, "rewards/RMReward/std": 0.19224552810192108, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 211, "train_speed(iter/s)": 0.019448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/mean_length": 2.0, "completions/min_length": 2.0, "epoch": 0.0032542289626377673, "frac_reward_zero_std": 0.0, "grad_norm": 190.78184509277344, "kl": 0.012939453125, "learning_rate": 1.626764886433395e-07, "loss": 1.292303204536438e-05, "memory(GiB)": 90.94, "reward": 0.4996874928474426, "reward_std": 0.36962586641311646, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.26250001788139343, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": 0.75, "rewards/VisualPerceptionAccuracy/std": 0.44721361994743347, "step": 212, "train_speed(iter/s)": 0.019526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 86.9375, "completions/min_length": 2.0, "epoch": 0.0032695790992539834, "frac_reward_zero_std": 0.0, "grad_norm": 70.06358337402344, "kl": 0.0007935972535051405, "learning_rate": 1.634438305709024e-07, "loss": 0.022338490933179855, "memory(GiB)": 90.94, "reward": 0.1826002597808838, "reward_std": 0.24219703674316406, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": 0.18707554042339325, "rewards/VisualPerceptionAccuracy/std": 0.21152736246585846, "step": 213, "train_speed(iter/s)": 0.019601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/mean_length": 94.125, "completions/min_length": 78.0, "epoch": 0.003284929235870199, "frac_reward_zero_std": 0.0, "grad_norm": 2.5437140464782715, "kl": 0.0007214924553409219, "learning_rate": 1.6421117249846534e-07, "loss": 0.009701840579509735, "memory(GiB)": 90.94, "reward": 0.7770833969116211, "reward_std": 0.11795198917388916, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8604166507720947, "rewards/PlanningActionSetORM/std": 0.0870668962597847, "rewards/RMReward/mean": 0.7562500238418579, "rewards/RMReward/std": 0.14521953463554382, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 214, "train_speed(iter/s)": 0.01954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 156.4375, "completions/min_length": 2.0, "epoch": 0.003300279372486415, "frac_reward_zero_std": 0.0, "grad_norm": 39.20625686645508, "kl": 0.0003769997856579721, "learning_rate": 1.6497851442602824e-07, "loss": 0.03246922045946121, "memory(GiB)": 90.94, "reward": 0.5422136783599854, "reward_std": 0.17996945977210999, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8002617359161377, "rewards/PlanningActionSetORM/std": 0.17011895775794983, "rewards/RMReward/mean": 0.6656249761581421, "rewards/RMReward/std": 0.10119082778692245, "rewards/SpatialReasoningORM/mean": 0.4125000238418579, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 215, "train_speed(iter/s)": 0.019484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/mean_length": 124.53125, "completions/min_length": 94.0, "epoch": 0.003315629509102631, "frac_reward_zero_std": 0.0, "grad_norm": 2.275956153869629, "kl": 0.0014798138290643692, "learning_rate": 1.6574585635359117e-07, "loss": 0.001329369843006134, "memory(GiB)": 90.94, "reward": 0.7925550937652588, "reward_std": 0.0739617720246315, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9015253186225891, "rewards/PlanningActionSetORM/std": 0.07896449416875839, "rewards/RMReward/mean": 0.7653124928474426, "rewards/RMReward/std": 0.13471555709838867, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 216, "train_speed(iter/s)": 0.019505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 96.71875, "completions/min_length": 2.0, "epoch": 0.003330979645718847, "frac_reward_zero_std": 0.0, "grad_norm": 34.025386810302734, "kl": 0.0002724931691773236, "learning_rate": 1.665131982811541e-07, "loss": -0.012248929589986801, "memory(GiB)": 90.94, "reward": 0.5812852382659912, "reward_std": 0.150128573179245, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8409776091575623, "rewards/PlanningActionSetORM/std": 0.09172411262989044, "rewards/RMReward/mean": 0.5750000476837158, "rewards/RMReward/std": 0.18885621428489685, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 217, "train_speed(iter/s)": 0.019479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/mean_length": 219.5625, "completions/min_length": 79.0, "epoch": 0.003346329782335063, "frac_reward_zero_std": 0.0, "grad_norm": 2.3950655460357666, "kl": 0.0009909409563988447, "learning_rate": 1.6728054020871702e-07, "loss": -0.008894715458154678, "memory(GiB)": 90.94, "reward": 0.4195752441883087, "reward_std": 0.0872134119272232, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8807291984558105, "rewards/PlanningActionSetORM/std": 0.13702252507209778, "rewards/RMReward/mean": 0.75, "rewards/RMReward/std": 0.108012355864048, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06300461292266846, "rewards/VisualPerceptionAccuracy/std": 0.08362846821546555, "step": 218, "train_speed(iter/s)": 0.019441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/mean_length": 47.875, "completions/min_length": 3.0, "epoch": 0.0033616799189512786, "frac_reward_zero_std": 0.0, "grad_norm": 54.31106948852539, "kl": 0.0010306134354323149, "learning_rate": 1.6804788213627992e-07, "loss": 0.047377459704875946, "memory(GiB)": 90.94, "reward": 0.5820833444595337, "reward_std": 0.18810708820819855, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9489582777023315, "rewards/PlanningActionSetORM/std": 0.08055794984102249, "rewards/RMReward/mean": 0.7281249761581421, "rewards/RMReward/std": 0.12106300890445709, "rewards/SpatialReasoningORM/mean": 0.4125000238418579, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 219, "train_speed(iter/s)": 0.019453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/mean_length": 108.125, "completions/min_length": 80.0, "epoch": 0.0033770300555674947, "frac_reward_zero_std": 0.0, "grad_norm": 3.3438496589660645, "kl": 0.0017168624326586723, "learning_rate": 1.6881522406384284e-07, "loss": -0.01470818929374218, "memory(GiB)": 90.94, "reward": 0.6246996521949768, "reward_std": 0.12458019703626633, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8572482466697693, "rewards/PlanningActionSetORM/std": 0.10761914402246475, "rewards/RMReward/mean": 0.5665624737739563, "rewards/RMReward/std": 0.1700020730495453, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 220, "train_speed(iter/s)": 0.019358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 147.75, "completions/min_length": 76.0, "epoch": 0.0033923801921837103, "frac_reward_zero_std": 0.0, "grad_norm": 2.24381685256958, "kl": 0.0022021415643393993, "learning_rate": 1.6958256599140577e-07, "loss": 0.04815671220421791, "memory(GiB)": 90.94, "reward": 0.6242994070053101, "reward_std": 0.09841793775558472, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8664970397949219, "rewards/PlanningActionSetORM/std": 0.0839245617389679, "rewards/RMReward/mean": 0.5637500286102295, "rewards/RMReward/std": 0.11605754494667053, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 221, "train_speed(iter/s)": 0.019311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/mean_length": 51.4375, "completions/min_length": 2.0, "epoch": 0.0034077303287999264, "frac_reward_zero_std": 0.0, "grad_norm": 68.19841003417969, "kl": 0.001185521250590682, "learning_rate": 1.7034990791896872e-07, "loss": 0.006483782082796097, "memory(GiB)": 90.94, "reward": 0.4652083218097687, "reward_std": 0.16319477558135986, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7895833253860474, "rewards/PlanningActionSetORM/std": 0.07995948195457458, "rewards/RMReward/mean": 0.7875000238418579, "rewards/RMReward/std": 0.07852812856435776, "rewards/SpatialReasoningORM/mean": 0.15000000596046448, "rewards/SpatialReasoningORM/std": 0.2683281898498535, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 222, "train_speed(iter/s)": 0.01933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/mean_length": 271.03125, "completions/min_length": 81.0, "epoch": 0.003423080465416142, "frac_reward_zero_std": 0.0, "grad_norm": 2.336064100265503, "kl": 0.0004601888940669596, "learning_rate": 1.7111724984653165e-07, "loss": 0.059883661568164825, "memory(GiB)": 90.94, "reward": 0.38645249605178833, "reward_std": 0.06517988443374634, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8802083134651184, "rewards/PlanningActionSetORM/std": 0.09845449030399323, "rewards/RMReward/mean": 0.7312500476837158, "rewards/RMReward/std": 0.1138346791267395, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.011863265186548233, "rewards/VisualPerceptionAccuracy/std": 0.03259043022990227, "step": 223, "train_speed(iter/s)": 0.019326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 173.40625, "completions/min_length": 58.0, "epoch": 0.003438430602032358, "frac_reward_zero_std": 0.0, "grad_norm": 2.0374767780303955, "kl": 0.0006202008808031678, "learning_rate": 1.7188459177409458e-07, "loss": -0.09627828747034073, "memory(GiB)": 90.94, "reward": 0.3924455940723419, "reward_std": 0.058525554835796356, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.932812511920929, "rewards/PlanningActionSetORM/std": 0.0721026062965393, "rewards/RMReward/mean": 0.7250000238418579, "rewards/RMReward/std": 0.07527727633714676, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.01832868903875351, "rewards/VisualPerceptionAccuracy/std": 0.0607270710170269, "step": 224, "train_speed(iter/s)": 0.019333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 157.6875, "completions/min_length": 104.0, "epoch": 0.0034537807386485738, "frac_reward_zero_std": 0.0, "grad_norm": 2.1315934658050537, "kl": 0.0008958314429037273, "learning_rate": 1.7265193370165747e-07, "loss": 0.004652518779039383, "memory(GiB)": 90.94, "reward": 0.5749682784080505, "reward_std": 0.13023102283477783, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7948412895202637, "rewards/PlanningActionSetORM/std": 0.14286969602108002, "rewards/RMReward/mean": 0.5199999809265137, "rewards/RMReward/std": 0.15078289806842804, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 225, "train_speed(iter/s)": 0.019227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/mean_length": 68.5, "completions/min_length": 3.0, "epoch": 0.00346913087526479, "frac_reward_zero_std": 0.0, "grad_norm": 16.507898330688477, "kl": 0.002057056175544858, "learning_rate": 1.734192756292204e-07, "loss": -0.06336319446563721, "memory(GiB)": 90.94, "reward": 0.669122040271759, "reward_std": 0.14005860686302185, "rewards/MathAnswerFormat/mean": 0.125, "rewards/MathAnswerFormat/std": 0.3415650427341461, "rewards/PlanningActionSetORM/mean": 0.784970223903656, "rewards/PlanningActionSetORM/std": 0.1594804972410202, "rewards/RMReward/mean": 0.6968749761581421, "rewards/RMReward/std": 0.1454518884420395, "rewards/SpatialReasoningORM/mean": 0.6500000357627869, "rewards/SpatialReasoningORM/std": 0.1366260051727295, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 226, "train_speed(iter/s)": 0.019227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/mean_length": 53.1875, "completions/min_length": 14.0, "epoch": 0.003484481011881006, "frac_reward_zero_std": 0.0, "grad_norm": 5.846789836883545, "kl": 0.001945829950273037, "learning_rate": 1.7418661755678333e-07, "loss": -0.03549434244632721, "memory(GiB)": 90.94, "reward": 0.8286458253860474, "reward_std": 0.20867522060871124, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9302083253860474, "rewards/PlanningActionSetORM/std": 0.11417476832866669, "rewards/RMReward/mean": 0.737500011920929, "rewards/RMReward/std": 0.10408329963684082, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 227, "train_speed(iter/s)": 0.019231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 141.96875, "completions/min_length": 91.0, "epoch": 0.0034998311484972216, "frac_reward_zero_std": 0.0, "grad_norm": 2.6448171138763428, "kl": 0.0034110182896256447, "learning_rate": 1.7495395948434625e-07, "loss": 0.0572403222322464, "memory(GiB)": 90.94, "reward": 0.627470850944519, "reward_std": 0.14988009631633759, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7936042547225952, "rewards/PlanningActionSetORM/std": 0.17298156023025513, "rewards/RMReward/mean": 0.5859375, "rewards/RMReward/std": 0.18149663507938385, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 228, "train_speed(iter/s)": 0.019132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/mean_length": 259.1875, "completions/min_length": 110.0, "epoch": 0.0035151812851134377, "frac_reward_zero_std": 0.0, "grad_norm": 1.4416896104812622, "kl": 0.0004899115883745253, "learning_rate": 1.7572130141190915e-07, "loss": -0.0659305602312088, "memory(GiB)": 90.94, "reward": 0.49332696199417114, "reward_std": 0.16918328404426575, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9207386374473572, "rewards/PlanningActionSetORM/std": 0.06562773138284683, "rewards/RMReward/mean": 0.606249988079071, "rewards/RMReward/std": 0.13149778544902802, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.31750616431236267, "rewards/VisualPerceptionAccuracy/std": 0.22674508392810822, "step": 229, "train_speed(iter/s)": 0.019083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/mean_length": 86.15625, "completions/min_length": 2.0, "epoch": 0.0035305314217296533, "frac_reward_zero_std": 0.0, "grad_norm": 26.12569808959961, "kl": 0.005622648634016514, "learning_rate": 1.7648864333947208e-07, "loss": -0.18016719818115234, "memory(GiB)": 90.94, "reward": 0.5924134254455566, "reward_std": 0.11691781878471375, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8147597908973694, "rewards/PlanningActionSetORM/std": 0.13980983197689056, "rewards/RMReward/mean": 0.53125, "rewards/RMReward/std": 0.14930394291877747, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 230, "train_speed(iter/s)": 0.019063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/mean_length": 62.71875, "completions/min_length": 14.0, "epoch": 0.0035458815583458694, "frac_reward_zero_std": 0.0, "grad_norm": 3.9431369304656982, "kl": 0.0022773821838200092, "learning_rate": 1.77255985267035e-07, "loss": 0.04388347268104553, "memory(GiB)": 90.94, "reward": 0.8080431222915649, "reward_std": 0.1915871948003769, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8898065090179443, "rewards/PlanningActionSetORM/std": 0.12495952844619751, "rewards/RMReward/mean": 0.621874988079071, "rewards/RMReward/std": 0.17317500710487366, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 231, "train_speed(iter/s)": 0.01908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 148.1875, "completions/min_length": 64.0, "epoch": 0.003561231694962085, "frac_reward_zero_std": 0.0, "grad_norm": 2.7046117782592773, "kl": 0.001043042866513133, "learning_rate": 1.7802332719459793e-07, "loss": 0.007399236783385277, "memory(GiB)": 90.94, "reward": 0.3225271701812744, "reward_std": 0.11777284741401672, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7641245126724243, "rewards/PlanningActionSetORM/std": 0.16467730700969696, "rewards/RMReward/mean": 0.46562498807907104, "rewards/RMReward/std": 0.12873584032058716, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.11972939223051071, "rewards/VisualPerceptionAccuracy/std": 0.12650275230407715, "step": 232, "train_speed(iter/s)": 0.019062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 145.40625, "completions/min_length": 83.0, "epoch": 0.003576581831578301, "frac_reward_zero_std": 0.0, "grad_norm": 3.394200086593628, "kl": 0.0031977996695786715, "learning_rate": 1.7879066912216083e-07, "loss": -0.0007325997576117516, "memory(GiB)": 90.94, "reward": 0.5159088373184204, "reward_std": 0.1370037943124771, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9020832777023315, "rewards/PlanningActionSetORM/std": 0.10656679421663284, "rewards/RMReward/mean": 0.815625011920929, "rewards/RMReward/std": 0.08702250570058823, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1989009976387024, "rewards/VisualPerceptionAccuracy/std": 0.20478305220603943, "step": 233, "train_speed(iter/s)": 0.01907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/mean_length": 115.40625, "completions/min_length": 62.0, "epoch": 0.0035919319681945168, "frac_reward_zero_std": 0.0, "grad_norm": 2.339165449142456, "kl": 0.0015582253690809011, "learning_rate": 1.7955801104972376e-07, "loss": 0.029492072761058807, "memory(GiB)": 90.94, "reward": 0.45918020606040955, "reward_std": 0.1208207905292511, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8885416388511658, "rewards/PlanningActionSetORM/std": 0.06425318866968155, "rewards/RMReward/mean": 0.8087500333786011, "rewards/RMReward/std": 0.09394147247076035, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09365208446979523, "rewards/VisualPerceptionAccuracy/std": 0.16438713669776917, "step": 234, "train_speed(iter/s)": 0.019096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/mean_length": 53.5625, "completions/min_length": 2.0, "epoch": 0.003607282104810733, "frac_reward_zero_std": 0.0, "grad_norm": 32.55020523071289, "kl": 0.003055587410926819, "learning_rate": 1.8032535297728668e-07, "loss": 0.0491468571126461, "memory(GiB)": 90.94, "reward": 0.67301344871521, "reward_std": 0.10360530018806458, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9832589626312256, "rewards/PlanningActionSetORM/std": 0.04586134850978851, "rewards/RMReward/mean": 0.7687499523162842, "rewards/RMReward/std": 0.07274384796619415, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 235, "train_speed(iter/s)": 0.019118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/mean_length": 58.6875, "completions/min_length": 14.0, "epoch": 0.0036226322414269485, "frac_reward_zero_std": 0.0, "grad_norm": 5.463932991027832, "kl": 0.003162928158417344, "learning_rate": 1.810926949048496e-07, "loss": 0.06677967309951782, "memory(GiB)": 90.94, "reward": 0.602308988571167, "reward_std": 0.2867494523525238, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8824652433395386, "rewards/PlanningActionSetORM/std": 0.1360590159893036, "rewards/RMReward/mean": 0.703125, "rewards/RMReward/std": 0.10718948394060135, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 236, "train_speed(iter/s)": 0.019143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/mean_length": 72.375, "completions/min_length": 3.0, "epoch": 0.0036379823780431646, "frac_reward_zero_std": 0.0, "grad_norm": 31.501205444335938, "kl": 0.0006902614841237664, "learning_rate": 1.8186003683241254e-07, "loss": -0.02801324427127838, "memory(GiB)": 90.94, "reward": 0.5756832957267761, "reward_std": 0.11006864160299301, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7599577903747559, "rewards/PlanningActionSetORM/std": 0.14237286150455475, "rewards/RMReward/mean": 0.5812499523162842, "rewards/RMReward/std": 0.09639329463243484, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 237, "train_speed(iter/s)": 0.019139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.03125, "completions/min_length": 2.0, "epoch": 0.0036533325146593807, "frac_reward_zero_std": 0.0, "grad_norm": 45.71311569213867, "kl": -4.0690109017305076e-05, "learning_rate": 1.8262737875997544e-07, "loss": 0.04163753613829613, "memory(GiB)": 90.94, "reward": 0.534375011920929, "reward_std": 0.14249999821186066, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.14756081998348236, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 238, "train_speed(iter/s)": 0.019214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/mean_length": 71.71875, "completions/min_length": 14.0, "epoch": 0.0036686826512755963, "frac_reward_zero_std": 0.0, "grad_norm": 9.917619705200195, "kl": 0.0016695134108886123, "learning_rate": 1.8339472068753836e-07, "loss": 0.01710267923772335, "memory(GiB)": 90.94, "reward": 0.592138409614563, "reward_std": 0.30921080708503723, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8270089626312256, "rewards/PlanningActionSetORM/std": 0.08449051529169083, "rewards/RMReward/mean": 0.5431250333786011, "rewards/RMReward/std": 0.15606489777565002, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 239, "train_speed(iter/s)": 0.01921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/mean_length": 54.4375, "completions/min_length": 2.0, "epoch": 0.0036840327878918124, "frac_reward_zero_std": 0.0, "grad_norm": 44.760921478271484, "kl": 0.0013475407613441348, "learning_rate": 1.8416206261510132e-07, "loss": -0.007807694375514984, "memory(GiB)": 90.94, "reward": 0.6133333444595337, "reward_std": 0.14886508882045746, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8520833253860474, "rewards/PlanningActionSetORM/std": 0.09176762402057648, "rewards/RMReward/mean": 0.6968749761581421, "rewards/RMReward/std": 0.12037269026041031, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 240, "train_speed(iter/s)": 0.019218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 139.1875, "completions/min_length": 84.0, "epoch": 0.003699382924508028, "frac_reward_zero_std": 0.0, "grad_norm": 2.4549787044525146, "kl": 0.0012272015446797013, "learning_rate": 1.8492940454266424e-07, "loss": 0.026076029986143112, "memory(GiB)": 90.94, "reward": 0.4446350336074829, "reward_std": 0.0903635025024414, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.957812488079071, "rewards/PlanningActionSetORM/std": 0.05680284649133682, "rewards/RMReward/mean": 0.7593749761581421, "rewards/RMReward/std": 0.08003906160593033, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0902075469493866, "rewards/VisualPerceptionAccuracy/std": 0.11267752945423126, "step": 241, "train_speed(iter/s)": 0.019221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.03125, "completions/min_length": 2.0, "epoch": 0.003714733061124244, "frac_reward_zero_std": 0.0, "grad_norm": 69.26010131835938, "kl": 0.0013292101211845875, "learning_rate": 1.8569674647022717e-07, "loss": 0.020827386528253555, "memory(GiB)": 90.94, "reward": 0.4453125, "reward_std": 0.21375000476837158, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.46875, "rewards/SpatialReasoningORM/std": 0.2520080804824829, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 242, "train_speed(iter/s)": 0.019294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/mean_length": 113.125, "completions/min_length": 72.0, "epoch": 0.0037300831977404598, "frac_reward_zero_std": 0.0, "grad_norm": 2.734302520751953, "kl": 0.0026716715656220913, "learning_rate": 1.8646408839779007e-07, "loss": 0.08693645894527435, "memory(GiB)": 90.94, "reward": 0.685467004776001, "reward_std": 0.15305611491203308, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8773351907730103, "rewards/PlanningActionSetORM/std": 0.16490191221237183, "rewards/RMReward/mean": 0.637499988079071, "rewards/RMReward/std": 0.23589226603507996, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 243, "train_speed(iter/s)": 0.019314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 164.15625, "completions/min_length": 64.0, "epoch": 0.003745433334356676, "frac_reward_zero_std": 0.0, "grad_norm": 1.970035433769226, "kl": 0.0023012394085526466, "learning_rate": 1.87231430325353e-07, "loss": -0.03757743909955025, "memory(GiB)": 90.94, "reward": 0.6516821384429932, "reward_std": 0.12402042746543884, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8271604776382446, "rewards/PlanningActionSetORM/std": 0.09623291343450546, "rewards/RMReward/mean": 0.6078125238418579, "rewards/RMReward/std": 0.152986079454422, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 244, "train_speed(iter/s)": 0.019294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1272.0, "completions/mean_length": 285.78125, "completions/min_length": 14.0, "epoch": 0.0037607834709728915, "frac_reward_zero_std": 0.0, "grad_norm": 4.432549953460693, "kl": 0.0004330520750954747, "learning_rate": 1.8799877225291592e-07, "loss": 0.049782197922468185, "memory(GiB)": 90.94, "reward": 0.4812195599079132, "reward_std": 0.2672712802886963, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.14056412875652313, "rewards/VisualPerceptionAccuracy/std": 0.15158532559871674, "step": 245, "train_speed(iter/s)": 0.019296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/mean_length": 48.75, "completions/min_length": 2.0, "epoch": 0.0037761336075891076, "frac_reward_zero_std": 0.0, "grad_norm": 87.52311706542969, "kl": 0.006335910875350237, "learning_rate": 1.8876611418047885e-07, "loss": 0.10507100820541382, "memory(GiB)": 90.94, "reward": 0.5905208587646484, "reward_std": 0.166447252035141, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9083333611488342, "rewards/PlanningActionSetORM/std": 0.1174970418214798, "rewards/RMReward/mean": 0.7593749761581421, "rewards/RMReward/std": 0.07122442126274109, "rewards/SpatialReasoningORM/mean": 0.4125000238418579, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 246, "train_speed(iter/s)": 0.019317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/mean_length": 60.1875, "completions/min_length": 3.0, "epoch": 0.0037914837442053232, "frac_reward_zero_std": 0.0, "grad_norm": 65.44864654541016, "kl": 0.03489091247320175, "learning_rate": 1.8953345610804177e-07, "loss": 0.25798606872558594, "memory(GiB)": 90.94, "reward": 0.57833331823349, "reward_std": 0.19815027713775635, "rewards/MathAnswerFormat/mean": 0.3125, "rewards/MathAnswerFormat/std": 0.4787135720252991, "rewards/PlanningActionSetORM/mean": 0.8458333015441895, "rewards/PlanningActionSetORM/std": 0.12583057582378387, "rewards/RMReward/mean": 0.7250000238418579, "rewards/RMReward/std": 0.16532796621322632, "rewards/SpatialReasoningORM/mean": 0.4125000238418579, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 247, "train_speed(iter/s)": 0.019265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/mean_length": 64.09375, "completions/min_length": 2.0, "epoch": 0.0038068338808215393, "frac_reward_zero_std": 0.0, "grad_norm": 23.85407066345215, "kl": 0.0019519373308867216, "learning_rate": 1.9030079803560467e-07, "loss": -0.03764723613858223, "memory(GiB)": 90.94, "reward": 0.39020833373069763, "reward_std": 0.12261278927326202, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8989583253860474, "rewards/PlanningActionSetORM/std": 0.07588216662406921, "rewards/RMReward/mean": 0.706250011920929, "rewards/RMReward/std": 0.1195477694272995, "rewards/SpatialReasoningORM/mean": 0.03750000149011612, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 248, "train_speed(iter/s)": 0.019288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.0, "completions/mean_length": 301.09375, "completions/min_length": 91.0, "epoch": 0.0038221840174377554, "frac_reward_zero_std": 0.0, "grad_norm": 3.058047294616699, "kl": 0.00039830664172768593, "learning_rate": 1.910681399631676e-07, "loss": -0.08224748820066452, "memory(GiB)": 90.94, "reward": 0.12765388190746307, "reward_std": 0.12820056080818176, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.12765388190746307, "rewards/VisualPerceptionAccuracy/std": 0.13804976642131805, "step": 249, "train_speed(iter/s)": 0.019254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 58.84375, "completions/min_length": 14.0, "epoch": 0.003837534154053971, "frac_reward_zero_std": 0.0, "grad_norm": 3.626688003540039, "kl": 0.0030743887182325125, "learning_rate": 1.9183548189073052e-07, "loss": 0.003402289003133774, "memory(GiB)": 90.94, "reward": 0.9127083420753479, "reward_std": 0.15619485080242157, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9114583134651184, "rewards/PlanningActionSetORM/std": 0.05977388471364975, "rewards/RMReward/mean": 0.878125011920929, "rewards/RMReward/std": 0.08360372483730316, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 250, "train_speed(iter/s)": 0.019259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 435.15625, "completions/min_length": 14.0, "epoch": 0.003852884290670187, "frac_reward_zero_std": 0.0, "grad_norm": 3.405315637588501, "kl": 0.0010635718936100602, "learning_rate": 1.9260282381829345e-07, "loss": -0.16066676378250122, "memory(GiB)": 90.94, "reward": 0.47360968589782715, "reward_std": 0.1311608850955963, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.006594335660338402, "rewards/VisualPerceptionAccuracy/std": 0.02482178993523121, "step": 251, "train_speed(iter/s)": 0.019268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/mean_length": 164.9375, "completions/min_length": 112.0, "epoch": 0.0038682344272864028, "frac_reward_zero_std": 0.0, "grad_norm": 1.8321030139923096, "kl": 0.0022901450283825397, "learning_rate": 1.9337016574585635e-07, "loss": 0.009990103542804718, "memory(GiB)": 90.94, "reward": 0.6150603294372559, "reward_std": 0.11014272272586823, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8328015208244324, "rewards/PlanningActionSetORM/std": 0.14249320328235626, "rewards/RMReward/mean": 0.5606250166893005, "rewards/RMReward/std": 0.1204812228679657, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 252, "train_speed(iter/s)": 0.019252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1449.0, "completions/mean_length": 419.03125, "completions/min_length": 134.0, "epoch": 0.003883584563902619, "frac_reward_zero_std": 0.0, "grad_norm": 1.2052509784698486, "kl": 0.0006554588908329606, "learning_rate": 1.9413750767341928e-07, "loss": -0.058480240404605865, "memory(GiB)": 90.94, "reward": 0.31020545959472656, "reward_std": 0.12648232281208038, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8284969925880432, "rewards/PlanningActionSetORM/std": 0.09630770236253738, "rewards/RMReward/mean": 0.453125, "rewards/RMReward/std": 0.09393038600683212, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09221149235963821, "rewards/VisualPerceptionAccuracy/std": 0.1715877801179886, "step": 253, "train_speed(iter/s)": 0.019233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 9.4375, "completions/min_length": 3.0, "epoch": 0.0038989347005188345, "frac_reward_zero_std": 0.0, "grad_norm": 87.54324340820312, "kl": 0.00419653533026576, "learning_rate": 1.949048496009822e-07, "loss": -0.012961160391569138, "memory(GiB)": 90.94, "reward": 0.6009374856948853, "reward_std": 0.3082624673843384, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.606249988079071, "rewards/SpatialReasoningORM/std": 0.4203972816467285, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 254, "train_speed(iter/s)": 0.019247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 8.8125, "completions/min_length": 2.0, "epoch": 0.00391428483713505, "frac_reward_zero_std": 0.0, "grad_norm": 62.231563568115234, "kl": 0.000353582960087806, "learning_rate": 1.9567219152854513e-07, "loss": -0.012925218790769577, "memory(GiB)": 90.94, "reward": 0.6662499904632568, "reward_std": 0.2551833689212799, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.675000011920929, "rewards/SpatialReasoningORM/std": 0.37588605284690857, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 255, "train_speed(iter/s)": 0.019312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 77.46875, "completions/min_length": 2.0, "epoch": 0.003929634973751267, "frac_reward_zero_std": 0.0, "grad_norm": 38.36839294433594, "kl": 0.0023981204722076654, "learning_rate": 1.9643953345610803e-07, "loss": 0.06926409900188446, "memory(GiB)": 90.94, "reward": 0.5674367547035217, "reward_std": 0.1377987563610077, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8649924993515015, "rewards/PlanningActionSetORM/std": 0.10665303468704224, "rewards/RMReward/mean": 0.534375011920929, "rewards/RMReward/std": 0.15244534611701965, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 256, "train_speed(iter/s)": 0.019288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 169.0625, "completions/min_length": 94.0, "epoch": 0.003944985110367482, "frac_reward_zero_std": 0.0, "grad_norm": 1.7662898302078247, "kl": 0.0024605176877230406, "learning_rate": 1.97206875383671e-07, "loss": -0.03758067637681961, "memory(GiB)": 90.94, "reward": 0.6391618251800537, "reward_std": 0.1341453492641449, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9283090829849243, "rewards/PlanningActionSetORM/std": 0.08920546621084213, "rewards/RMReward/mean": 0.5668749809265137, "rewards/RMReward/std": 0.16602443158626556, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 257, "train_speed(iter/s)": 0.019257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/mean_length": 51.34375, "completions/min_length": 2.0, "epoch": 0.003960335246983698, "frac_reward_zero_std": 0.0, "grad_norm": 34.08930969238281, "kl": 0.00013798859436064959, "learning_rate": 1.979742173112339e-07, "loss": -0.030640248209238052, "memory(GiB)": 90.94, "reward": 0.605218768119812, "reward_std": 0.1350884586572647, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8203125, "rewards/PlanningActionSetORM/std": 0.12623350322246552, "rewards/RMReward/mean": 0.6399999856948853, "rewards/RMReward/std": 0.14696939289569855, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 258, "train_speed(iter/s)": 0.019274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/mean_length": 50.90625, "completions/min_length": 3.0, "epoch": 0.003975685383599914, "frac_reward_zero_std": 0.0, "grad_norm": 30.377870559692383, "kl": 0.0021107119973748922, "learning_rate": 1.9874155923879683e-07, "loss": -0.046536438167095184, "memory(GiB)": 90.94, "reward": 0.6449479460716248, "reward_std": 0.14279107749462128, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9057291746139526, "rewards/PlanningActionSetORM/std": 0.1654040813446045, "rewards/RMReward/mean": 0.7624999284744263, "rewards/RMReward/std": 0.10246951878070831, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 259, "train_speed(iter/s)": 0.019277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 48.1875, "completions/min_length": 2.0, "epoch": 0.00399103552021613, "frac_reward_zero_std": 0.0, "grad_norm": 80.34373474121094, "kl": 0.003366063814610243, "learning_rate": 1.9950890116635976e-07, "loss": 0.018026482313871384, "memory(GiB)": 90.94, "reward": 0.5729687809944153, "reward_std": 0.17280206084251404, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9578125476837158, "rewards/PlanningActionSetORM/std": 0.07286904007196426, "rewards/RMReward/mean": 0.703125, "rewards/RMReward/std": 0.09031195938587189, "rewards/SpatialReasoningORM/mean": 0.4125000238418579, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 260, "train_speed(iter/s)": 0.019293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 133.25, "completions/min_length": 78.0, "epoch": 0.004006385656832346, "frac_reward_zero_std": 0.0, "grad_norm": 2.3938798904418945, "kl": 0.003361078444868326, "learning_rate": 2.0027624309392269e-07, "loss": -0.010775186121463776, "memory(GiB)": 90.94, "reward": 0.6299813985824585, "reward_std": 0.13326598703861237, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8811569809913635, "rewards/PlanningActionSetORM/std": 0.13768930733203888, "rewards/RMReward/mean": 0.567187488079071, "rewards/RMReward/std": 0.16294139623641968, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 261, "train_speed(iter/s)": 0.019279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/mean_length": 240.6875, "completions/min_length": 2.0, "epoch": 0.0040217357934485614, "frac_reward_zero_std": 0.0, "grad_norm": 44.10367202758789, "kl": 0.00031322764698415995, "learning_rate": 2.0104358502148559e-07, "loss": 0.09446556121110916, "memory(GiB)": 90.94, "reward": 0.26009106636047363, "reward_std": 0.12208374589681625, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": 0.02143213339149952, "rewards/VisualPerceptionAccuracy/std": 0.04947543144226074, "step": 262, "train_speed(iter/s)": 0.01932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/mean_length": 229.03125, "completions/min_length": 76.0, "epoch": 0.004037085930064778, "frac_reward_zero_std": 0.0, "grad_norm": 2.649672508239746, "kl": 0.001913513639010489, "learning_rate": 2.018109269490485e-07, "loss": -0.016193069517612457, "memory(GiB)": 90.94, "reward": 0.47444698214530945, "reward_std": 0.14663633704185486, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8536458015441895, "rewards/PlanningActionSetORM/std": 0.10341255366802216, "rewards/RMReward/mean": 0.7174999713897705, "rewards/RMReward/std": 0.18042543530464172, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2041648030281067, "rewards/VisualPerceptionAccuracy/std": 0.14750835299491882, "step": 263, "train_speed(iter/s)": 0.019299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 183.28125, "completions/min_length": 90.0, "epoch": 0.004052436066680994, "frac_reward_zero_std": 0.0, "grad_norm": 2.2597944736480713, "kl": 0.00426459452137351, "learning_rate": 2.0257826887661144e-07, "loss": -0.027839675545692444, "memory(GiB)": 90.94, "reward": 0.6934385299682617, "reward_std": 0.13624471426010132, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8734422922134399, "rewards/PlanningActionSetORM/std": 0.14676573872566223, "rewards/RMReward/mean": 0.6484375, "rewards/RMReward/std": 0.16138732433319092, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 264, "train_speed(iter/s)": 0.019233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/mean_length": 91.8125, "completions/min_length": 74.0, "epoch": 0.004067786203297209, "frac_reward_zero_std": 0.0, "grad_norm": 3.02262806892395, "kl": 0.00789661519229412, "learning_rate": 2.0334561080417437e-07, "loss": -0.004618646577000618, "memory(GiB)": 90.94, "reward": 0.8504687547683716, "reward_std": 0.06678232550621033, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.96484375, "rewards/PlanningActionSetORM/std": 0.059181030839681625, "rewards/RMReward/mean": 0.8218749761581421, "rewards/RMReward/std": 0.08025915175676346, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 265, "train_speed(iter/s)": 0.019246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1360.0, "completions/mean_length": 236.15625, "completions/min_length": 2.0, "epoch": 0.004083136339913425, "frac_reward_zero_std": 0.0, "grad_norm": 92.64744567871094, "kl": 0.00019491557031869888, "learning_rate": 2.0411295273173726e-07, "loss": 0.04752116650342941, "memory(GiB)": 90.94, "reward": 0.22341804206371307, "reward_std": 0.182271808385849, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": 0.0905860960483551, "rewards/VisualPerceptionAccuracy/std": 0.07954363524913788, "step": 266, "train_speed(iter/s)": 0.019278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/mean_length": 230.125, "completions/min_length": 3.0, "epoch": 0.004098486476529641, "frac_reward_zero_std": 0.0, "grad_norm": 22.5261287689209, "kl": 0.0003322142001707107, "learning_rate": 2.048802946593002e-07, "loss": 0.02045576274394989, "memory(GiB)": 90.94, "reward": 0.2758575975894928, "reward_std": 0.1351737678050995, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": 0.05296517536044121, "rewards/VisualPerceptionAccuracy/std": 0.07565546780824661, "step": 267, "train_speed(iter/s)": 0.019319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/mean_length": 50.375, "completions/min_length": 2.0, "epoch": 0.004113836613145857, "frac_reward_zero_std": 0.0, "grad_norm": 62.34941482543945, "kl": 0.003186930436640978, "learning_rate": 2.0564763658686312e-07, "loss": 0.09295766055583954, "memory(GiB)": 90.94, "reward": 0.46656250953674316, "reward_std": 0.16861772537231445, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.925000011920929, "rewards/PlanningActionSetORM/std": 0.12483322620391846, "rewards/RMReward/mean": 0.7124999761581421, "rewards/RMReward/std": 0.07416198402643204, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 268, "train_speed(iter/s)": 0.019341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 204.03125, "completions/min_length": 106.0, "epoch": 0.004129186749762073, "frac_reward_zero_std": 0.0, "grad_norm": 2.3141582012176514, "kl": 0.0023843450471758842, "learning_rate": 2.0641497851442604e-07, "loss": -0.04055456817150116, "memory(GiB)": 90.94, "reward": 0.49482905864715576, "reward_std": 0.16359147429466248, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9246894717216492, "rewards/PlanningActionSetORM/std": 0.03484374284744263, "rewards/RMReward/mean": 0.6337500214576721, "rewards/RMReward/std": 0.2396351397037506, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2977202236652374, "rewards/VisualPerceptionAccuracy/std": 0.13347814977169037, "step": 269, "train_speed(iter/s)": 0.019314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/mean_length": 104.0, "completions/min_length": 76.0, "epoch": 0.004144536886378288, "frac_reward_zero_std": 0.0, "grad_norm": 3.03014874458313, "kl": 0.012069194577634335, "learning_rate": 2.0718232044198897e-07, "loss": 0.03420072793960571, "memory(GiB)": 90.94, "reward": 0.8208333253860474, "reward_std": 0.08142074942588806, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9479166865348816, "rewards/PlanningActionSetORM/std": 0.08724681288003922, "rewards/RMReward/mean": 0.7890625, "rewards/RMReward/std": 0.09134688228368759, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 270, "train_speed(iter/s)": 0.019308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/mean_length": 48.5625, "completions/min_length": 2.0, "epoch": 0.004159887022994505, "frac_reward_zero_std": 0.0, "grad_norm": 37.423179626464844, "kl": 0.007000117562711239, "learning_rate": 2.0794966236955187e-07, "loss": 0.01467430591583252, "memory(GiB)": 90.94, "reward": 0.6281770467758179, "reward_std": 0.12247772514820099, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8098958730697632, "rewards/PlanningActionSetORM/std": 0.10102340579032898, "rewards/RMReward/mean": 0.7000000476837158, "rewards/RMReward/std": 0.11547006666660309, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 271, "train_speed(iter/s)": 0.019324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/mean_length": 96.03125, "completions/min_length": 64.0, "epoch": 0.0041752371596107205, "frac_reward_zero_std": 0.0, "grad_norm": 2.9272332191467285, "kl": 0.0067407917231321335, "learning_rate": 2.087170042971148e-07, "loss": -0.0033142901957035065, "memory(GiB)": 90.94, "reward": 0.734333336353302, "reward_std": 0.11643511056900024, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8916666507720947, "rewards/PlanningActionSetORM/std": 0.09620952606201172, "rewards/RMReward/mean": 0.6950000524520874, "rewards/RMReward/std": 0.14264777302742004, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 272, "train_speed(iter/s)": 0.019246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 171.4375, "completions/min_length": 84.0, "epoch": 0.004190587296226936, "frac_reward_zero_std": 0.0, "grad_norm": 3.413234233856201, "kl": 0.0024913805536925793, "learning_rate": 2.0948434622467772e-07, "loss": -0.07400219887495041, "memory(GiB)": 90.94, "reward": 0.3815958797931671, "reward_std": 0.11227552592754364, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8615342974662781, "rewards/PlanningActionSetORM/std": 0.10239546000957489, "rewards/RMReward/mean": 0.6556249856948853, "rewards/RMReward/std": 0.15331749618053436, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06638484448194504, "rewards/VisualPerceptionAccuracy/std": 0.10263143479824066, "step": 273, "train_speed(iter/s)": 0.019227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/mean_length": 62.0, "completions/min_length": 14.0, "epoch": 0.004205937432843153, "frac_reward_zero_std": 0.0, "grad_norm": 7.651458740234375, "kl": 0.006923839915543795, "learning_rate": 2.1025168815224065e-07, "loss": 0.026057859882712364, "memory(GiB)": 90.94, "reward": 0.7535937428474426, "reward_std": 0.262770414352417, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8578125238418579, "rewards/PlanningActionSetORM/std": 0.11572912335395813, "rewards/RMReward/mean": 0.7906249761581421, "rewards/RMReward/std": 0.0841006264090538, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 274, "train_speed(iter/s)": 0.019226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 139.625, "completions/min_length": 59.0, "epoch": 0.004221287569459368, "frac_reward_zero_std": 0.0, "grad_norm": 2.3022541999816895, "kl": 0.012522250413894653, "learning_rate": 2.110190300798036e-07, "loss": -0.05266432464122772, "memory(GiB)": 90.94, "reward": 0.777942419052124, "reward_std": 0.13165144622325897, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9159618616104126, "rewards/PlanningActionSetORM/std": 0.08076345175504684, "rewards/RMReward/mean": 0.7434375286102295, "rewards/RMReward/std": 0.20468084514141083, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 275, "train_speed(iter/s)": 0.019156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 67.34375, "completions/min_length": 2.0, "epoch": 0.004236637706075584, "frac_reward_zero_std": 0.0, "grad_norm": 25.717750549316406, "kl": 0.010901699773967266, "learning_rate": 2.1178637200736653e-07, "loss": -0.15618613362312317, "memory(GiB)": 90.94, "reward": 0.5876822471618652, "reward_std": 0.13713057339191437, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.7799479365348816, "rewards/PlanningActionSetORM/std": 0.09959661960601807, "rewards/RMReward/mean": 0.528124988079071, "rewards/RMReward/std": 0.19913876056671143, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 276, "train_speed(iter/s)": 0.019148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 111.0, "completions/min_length": 2.0, "epoch": 0.0042519878426918, "frac_reward_zero_std": 0.0, "grad_norm": 32.59693908691406, "kl": 0.012220650911331177, "learning_rate": 2.1255371393492943e-07, "loss": 0.014745496213436127, "memory(GiB)": 90.94, "reward": 0.6306638717651367, "reward_std": 0.19012776017189026, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.9785140156745911, "rewards/PlanningActionSetORM/std": 0.02267991192638874, "rewards/RMReward/mean": 0.675000011920929, "rewards/RMReward/std": 0.18348479270935059, "rewards/SpatialReasoningORM/mean": 0.550000011920929, "rewards/SpatialReasoningORM/std": 0.23664319515228271, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 277, "train_speed(iter/s)": 0.019128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/mean_length": 103.59375, "completions/min_length": 14.0, "epoch": 0.004267337979308016, "frac_reward_zero_std": 0.0, "grad_norm": 4.575436115264893, "kl": 0.004939780570566654, "learning_rate": 2.1332105586249235e-07, "loss": 0.0421413816511631, "memory(GiB)": 90.94, "reward": 0.7449913024902344, "reward_std": 0.26459890604019165, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9280381798744202, "rewards/PlanningActionSetORM/std": 0.08287563174962997, "rewards/RMReward/mean": 0.6031249761581421, "rewards/RMReward/std": 0.17075200378894806, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 278, "train_speed(iter/s)": 0.019116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 9.46875, "completions/min_length": 3.0, "epoch": 0.004282688115924232, "frac_reward_zero_std": 0.0, "grad_norm": 37.84542465209961, "kl": 0.000662878795992583, "learning_rate": 2.1408839779005528e-07, "loss": 0.011359557509422302, "memory(GiB)": 90.94, "reward": 0.7196875214576721, "reward_std": 0.21609602868556976, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.7312500476837158, "rewards/SpatialReasoningORM/std": 0.3073691725730896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 279, "train_speed(iter/s)": 0.019161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 68.1875, "completions/min_length": 3.0, "epoch": 0.0042980382525404474, "frac_reward_zero_std": 0.0, "grad_norm": 47.28864669799805, "kl": 0.00624704547226429, "learning_rate": 2.148557397176182e-07, "loss": 0.019774597138166428, "memory(GiB)": 90.94, "reward": 0.5704166889190674, "reward_std": 0.18321770429611206, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8010417222976685, "rewards/PlanningActionSetORM/std": 0.10077822208404541, "rewards/RMReward/mean": 0.6468750238418579, "rewards/RMReward/std": 0.15755291283130646, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 280, "train_speed(iter/s)": 0.019147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/mean_length": 101.0625, "completions/min_length": 64.0, "epoch": 0.004313388389156663, "frac_reward_zero_std": 0.0, "grad_norm": 3.2915806770324707, "kl": 0.015668261796236038, "learning_rate": 2.156230816451811e-07, "loss": -0.039537377655506134, "memory(GiB)": 90.94, "reward": 0.7462500333786011, "reward_std": 0.11148083209991455, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8687499761581421, "rewards/PlanningActionSetORM/std": 0.1277388036251068, "rewards/RMReward/mean": 0.7156249284744263, "rewards/RMReward/std": 0.12727762758731842, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 281, "train_speed(iter/s)": 0.01915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/mean_length": 111.5625, "completions/min_length": 81.0, "epoch": 0.00432873852577288, "frac_reward_zero_std": 0.0, "grad_norm": 2.127493381500244, "kl": 0.017569687217473984, "learning_rate": 2.1639042357274403e-07, "loss": -0.014709195122122765, "memory(GiB)": 90.94, "reward": 0.7838541865348816, "reward_std": 0.08654268085956573, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9130207896232605, "rewards/PlanningActionSetORM/std": 0.08874718844890594, "rewards/RMReward/mean": 0.7515624761581421, "rewards/RMReward/std": 0.10355330258607864, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 282, "train_speed(iter/s)": 0.01905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/mean_length": 145.46875, "completions/min_length": 70.0, "epoch": 0.004344088662389095, "frac_reward_zero_std": 0.0, "grad_norm": 2.6278841495513916, "kl": 0.01021304726600647, "learning_rate": 2.1715776550030696e-07, "loss": 0.08572164922952652, "memory(GiB)": 90.94, "reward": 0.6888116002082825, "reward_std": 0.136011004447937, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.857807993888855, "rewards/PlanningActionSetORM/std": 0.1436854898929596, "rewards/RMReward/mean": 0.6465624570846558, "rewards/RMReward/std": 0.1489448845386505, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 283, "train_speed(iter/s)": 0.018951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/mean_length": 55.8125, "completions/min_length": 2.0, "epoch": 0.004359438799005311, "frac_reward_zero_std": 0.0, "grad_norm": 28.34572410583496, "kl": 0.00940506812185049, "learning_rate": 2.1792510742786988e-07, "loss": -0.13618123531341553, "memory(GiB)": 90.94, "reward": 0.2984778583049774, "reward_std": 0.0539114885032177, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": 8.074486686382443e-05, "rewards/VisualPerceptionAccuracy/std": 0.0003229794674552977, "step": 284, "train_speed(iter/s)": 0.019009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 69.375, "completions/min_length": 3.0, "epoch": 0.004374788935621527, "frac_reward_zero_std": 0.0, "grad_norm": 40.802825927734375, "kl": 0.016090987250208855, "learning_rate": 2.1869244935543278e-07, "loss": 0.03002220392227173, "memory(GiB)": 90.94, "reward": 0.48250001668930054, "reward_std": 0.17979006469249725, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.9437500238418579, "rewards/PlanningActionSetORM/std": 0.15478479862213135, "rewards/RMReward/mean": 0.7437499761581421, "rewards/RMReward/std": 0.09810709208250046, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 285, "train_speed(iter/s)": 0.018977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/mean_length": 57.59375, "completions/min_length": 14.0, "epoch": 0.004390139072237743, "frac_reward_zero_std": 0.0, "grad_norm": 4.290514945983887, "kl": 0.007906317710876465, "learning_rate": 2.194597912829957e-07, "loss": 0.006825929507613182, "memory(GiB)": 90.94, "reward": 0.8169479370117188, "reward_std": 0.19714485108852386, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7088541984558105, "rewards/PlanningActionSetORM/std": 0.1907215565443039, "rewards/RMReward/mean": 0.6893749833106995, "rewards/RMReward/std": 0.1749083250761032, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 286, "train_speed(iter/s)": 0.01894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/mean_length": 108.875, "completions/min_length": 77.0, "epoch": 0.004405489208853959, "frac_reward_zero_std": 0.0, "grad_norm": 3.433044672012329, "kl": 0.00047818367602303624, "learning_rate": 2.2022713321055864e-07, "loss": 0.023030489683151245, "memory(GiB)": 90.94, "reward": 0.031509146094322205, "reward_std": 0.037214022129774094, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.031509146094322205, "rewards/VisualPerceptionAccuracy/std": 0.05076908320188522, "step": 287, "train_speed(iter/s)": 0.018998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/mean_length": 51.1875, "completions/min_length": 2.0, "epoch": 0.004420839345470174, "frac_reward_zero_std": 0.0, "grad_norm": 85.22905731201172, "kl": 0.023947831243276596, "learning_rate": 2.2099447513812156e-07, "loss": 0.05989567190408707, "memory(GiB)": 90.94, "reward": 0.4371354281902313, "reward_std": 0.12845101952552795, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9151041507720947, "rewards/PlanningActionSetORM/std": 0.08548439294099808, "rewards/RMReward/mean": 0.7749999761581421, "rewards/RMReward/std": 0.06582807004451752, "rewards/SpatialReasoningORM/mean": 0.07500000298023224, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 288, "train_speed(iter/s)": 0.019004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 88.65625, "completions/min_length": 2.0, "epoch": 0.004436189482086391, "frac_reward_zero_std": 0.0, "grad_norm": 36.180137634277344, "kl": 0.006133736111223698, "learning_rate": 2.217618170656845e-07, "loss": 0.10316378623247147, "memory(GiB)": 90.94, "reward": 0.5948908925056458, "reward_std": 0.16324685513973236, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9551587104797363, "rewards/PlanningActionSetORM/std": 0.10269322246313095, "rewards/RMReward/mean": 0.625, "rewards/RMReward/std": 0.16431677341461182, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 289, "train_speed(iter/s)": 0.018985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/mean_length": 84.8125, "completions/min_length": 2.0, "epoch": 0.0044515396187026065, "frac_reward_zero_std": 0.0, "grad_norm": 27.364362716674805, "kl": 0.008981171995401382, "learning_rate": 2.225291589932474e-07, "loss": 0.011336345225572586, "memory(GiB)": 90.94, "reward": 0.38134875893592834, "reward_std": 0.18040983378887177, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9447375535964966, "rewards/PlanningActionSetORM/std": 0.0786079615354538, "rewards/RMReward/mean": 0.628125011920929, "rewards/RMReward/std": 0.19913877546787262, "rewards/SpatialReasoningORM/mean": 0.07500000298023224, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 290, "train_speed(iter/s)": 0.018973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 451.53125, "completions/min_length": 84.0, "epoch": 0.004466889755318822, "frac_reward_zero_std": 0.0, "grad_norm": 1.540185570716858, "kl": 0.016622476279735565, "learning_rate": 2.2329650092081031e-07, "loss": -0.0646008551120758, "memory(GiB)": 90.94, "reward": 0.42119866609573364, "reward_std": 0.14481797814369202, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9877232313156128, "rewards/PlanningActionSetORM/std": 0.034943319857120514, "rewards/RMReward/mean": 0.699999988079071, "rewards/RMReward/std": 0.13291601836681366, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08485272526741028, "rewards/VisualPerceptionAccuracy/std": 0.17921128869056702, "step": 291, "train_speed(iter/s)": 0.018934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/mean_length": 116.78125, "completions/min_length": 77.0, "epoch": 0.004482239891935038, "frac_reward_zero_std": 0.0, "grad_norm": 1.9782805442810059, "kl": 0.02893809601664543, "learning_rate": 2.2406384284837324e-07, "loss": 0.00045480579137802124, "memory(GiB)": 90.94, "reward": 0.77260422706604, "reward_std": 0.09275975823402405, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9380208253860474, "rewards/PlanningActionSetORM/std": 0.07429289072751999, "rewards/RMReward/mean": 0.731249988079071, "rewards/RMReward/std": 0.12098386883735657, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 292, "train_speed(iter/s)": 0.018933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 142.03125, "completions/min_length": 53.0, "epoch": 0.004497590028551254, "frac_reward_zero_std": 0.0, "grad_norm": 2.391779899597168, "kl": 0.006527619902044535, "learning_rate": 2.248311847759362e-07, "loss": 0.0003433041274547577, "memory(GiB)": 90.94, "reward": 0.3751055598258972, "reward_std": 0.08658356964588165, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9098958373069763, "rewards/PlanningActionSetORM/std": 0.11495646089315414, "rewards/RMReward/mean": 0.6468750238418579, "rewards/RMReward/std": 0.10873323678970337, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.05073194205760956, "rewards/VisualPerceptionAccuracy/std": 0.0846281349658966, "step": 293, "train_speed(iter/s)": 0.018932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 3.03125, "completions/min_length": 2.0, "epoch": 0.00451294016516747, "frac_reward_zero_std": 0.0, "grad_norm": 51.34758377075195, "kl": 0.02663722261786461, "learning_rate": 2.2559852670349912e-07, "loss": -0.0866217166185379, "memory(GiB)": 90.94, "reward": 0.2628124952316284, "reward_std": 0.20399034023284912, "rewards/MathAnswerFormat/mean": 0.03125, "rewards/MathAnswerFormat/std": 0.1767766922712326, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.27500003576278687, "rewards/SpatialReasoningORM/std": 0.3242858350276947, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 294, "train_speed(iter/s)": 0.018992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 166.125, "completions/min_length": 118.0, "epoch": 0.004528290301783686, "frac_reward_zero_std": 0.0, "grad_norm": 1.7398126125335693, "kl": 0.017446331679821014, "learning_rate": 2.2636586863106202e-07, "loss": -0.027451656758785248, "memory(GiB)": 90.94, "reward": 0.6764092445373535, "reward_std": 0.1356877237558365, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9082961082458496, "rewards/PlanningActionSetORM/std": 0.14453664422035217, "rewards/RMReward/mean": 0.6184375286102295, "rewards/RMReward/std": 0.1627606451511383, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 295, "train_speed(iter/s)": 0.018891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 246.90625, "completions/min_length": 131.0, "epoch": 0.004543640438399902, "frac_reward_zero_std": 0.0, "grad_norm": 1.9146353006362915, "kl": 0.006993584334850311, "learning_rate": 2.2713321055862495e-07, "loss": -0.023410577327013016, "memory(GiB)": 90.94, "reward": 0.3648238182067871, "reward_std": 0.08842189610004425, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9740372896194458, "rewards/PlanningActionSetORM/std": 0.023805202916264534, "rewards/RMReward/mean": 0.5718750357627869, "rewards/RMReward/std": 0.09123001992702484, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07734018564224243, "rewards/VisualPerceptionAccuracy/std": 0.10156368464231491, "step": 296, "train_speed(iter/s)": 0.018887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/mean_length": 119.9375, "completions/min_length": 62.0, "epoch": 0.004558990575016118, "frac_reward_zero_std": 0.0, "grad_norm": 2.181473731994629, "kl": 0.01889645867049694, "learning_rate": 2.2790055248618787e-07, "loss": 0.005326882004737854, "memory(GiB)": 90.94, "reward": 0.7760416865348816, "reward_std": 0.08284921944141388, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9114583730697632, "rewards/PlanningActionSetORM/std": 0.08684437721967697, "rewards/RMReward/mean": 0.7421875, "rewards/RMReward/std": 0.1397632658481598, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 297, "train_speed(iter/s)": 0.018809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 199.59375, "completions/min_length": 87.0, "epoch": 0.0045743407116323334, "frac_reward_zero_std": 0.0, "grad_norm": 2.051180362701416, "kl": 0.00862662773579359, "learning_rate": 2.286678944137508e-07, "loss": 0.06962089240550995, "memory(GiB)": 90.94, "reward": 0.4185774326324463, "reward_std": 0.1678203046321869, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8548137545585632, "rewards/PlanningActionSetORM/std": 0.11535577476024628, "rewards/RMReward/mean": 0.643750011920929, "rewards/RMReward/std": 0.1459166556596756, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.15119203925132751, "rewards/VisualPerceptionAccuracy/std": 0.20764976739883423, "step": 298, "train_speed(iter/s)": 0.018797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/mean_length": 90.5625, "completions/min_length": 2.0, "epoch": 0.004589690848248549, "frac_reward_zero_std": 0.0, "grad_norm": 58.684593200683594, "kl": 0.05018091946840286, "learning_rate": 2.2943523634131372e-07, "loss": -0.22895704209804535, "memory(GiB)": 90.94, "reward": 0.6528744697570801, "reward_std": 0.12760013341903687, "rewards/MathAnswerFormat/mean": 0.125, "rewards/MathAnswerFormat/std": 0.3415650427341461, "rewards/PlanningActionSetORM/mean": 0.9099950790405273, "rewards/PlanningActionSetORM/std": 0.08701256662607193, "rewards/RMReward/mean": 0.625, "rewards/RMReward/std": 0.12516656517982483, "rewards/SpatialReasoningORM/mean": 0.6500000357627869, "rewards/SpatialReasoningORM/std": 0.1366260051727295, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 299, "train_speed(iter/s)": 0.018739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 668.59375, "completions/min_length": 255.0, "epoch": 0.004605040984864766, "frac_reward_zero_std": 0.0, "grad_norm": 1.1403841972351074, "kl": 0.0007478682091459632, "learning_rate": 2.3020257826887662e-07, "loss": -0.08340039849281311, "memory(GiB)": 90.94, "reward": 0.167714461684227, "reward_std": 0.14854028820991516, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.167714461684227, "rewards/VisualPerceptionAccuracy/std": 0.1501591056585312, "step": 300, "train_speed(iter/s)": 0.018752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 441.96875, "completions/min_length": 76.0, "epoch": 0.004620391121480981, "frac_reward_zero_std": 0.0, "grad_norm": 1.5064010620117188, "kl": 0.015082769095897675, "learning_rate": 2.3096992019643955e-07, "loss": -0.02110329270362854, "memory(GiB)": 90.94, "reward": 0.421865850687027, "reward_std": 0.05187619850039482, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9546874761581421, "rewards/PlanningActionSetORM/std": 0.07428478449583054, "rewards/RMReward/mean": 0.8031249642372131, "rewards/RMReward/std": 0.07846176624298096, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.010294157080352306, "rewards/VisualPerceptionAccuracy/std": 0.041176628321409225, "step": 301, "train_speed(iter/s)": 0.018668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 60.9375, "completions/min_length": 2.0, "epoch": 0.004635741258097197, "frac_reward_zero_std": 0.0, "grad_norm": 60.72127151489258, "kl": 0.013738203793764114, "learning_rate": 2.3173726212400248e-07, "loss": 0.05754382908344269, "memory(GiB)": 90.94, "reward": 0.5249999761581421, "reward_std": 0.34960058331489563, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8874999284744263, "rewards/PlanningActionSetORM/std": 0.12505553662776947, "rewards/RMReward/mean": 0.621874988079071, "rewards/RMReward/std": 0.23449857532978058, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.375, "rewards/VisualPerceptionAccuracy/std": 0.5, "step": 302, "train_speed(iter/s)": 0.018662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/mean_length": 63.5625, "completions/min_length": 3.0, "epoch": 0.0046510913947134126, "frac_reward_zero_std": 0.0, "grad_norm": 61.2994384765625, "kl": 0.10279148817062378, "learning_rate": 2.325046040515654e-07, "loss": -0.17179100215435028, "memory(GiB)": 90.94, "reward": 0.7458853721618652, "reward_std": 0.14808256924152374, "rewards/MathAnswerFormat/mean": 0.3125, "rewards/MathAnswerFormat/std": 0.4787135720252991, "rewards/PlanningActionSetORM/mean": 0.9744791984558105, "rewards/PlanningActionSetORM/std": 0.04629502817988396, "rewards/RMReward/mean": 0.7406250238418579, "rewards/RMReward/std": 0.11286976933479309, "rewards/SpatialReasoningORM/mean": 0.7250000238418579, "rewards/SpatialReasoningORM/std": 0.1914854198694229, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 303, "train_speed(iter/s)": 0.018626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 100.5, "completions/min_length": 2.0, "epoch": 0.004666441531329629, "frac_reward_zero_std": 0.0, "grad_norm": 56.58872604370117, "kl": 0.08021632581949234, "learning_rate": 2.332719459791283e-07, "loss": -0.24041670560836792, "memory(GiB)": 90.94, "reward": 0.4816341996192932, "reward_std": 0.1865314543247223, "rewards/MathAnswerFormat/mean": 0.25, "rewards/MathAnswerFormat/std": 0.44721361994743347, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.7000000476837158, "rewards/SpatialReasoningORM/std": 0.17888543009757996, "rewards/VisualPerceptionAccuracy/mean": 0.28576838970184326, "rewards/VisualPerceptionAccuracy/std": 0.18076108396053314, "step": 304, "train_speed(iter/s)": 0.018585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2006.0, "completions/mean_length": 462.9375, "completions/min_length": 88.0, "epoch": 0.004681791667945845, "frac_reward_zero_std": 0.0, "grad_norm": 2.227092742919922, "kl": 0.011624148115515709, "learning_rate": 2.3403928790669123e-07, "loss": -0.0358729213476181, "memory(GiB)": 90.94, "reward": 0.48735255002975464, "reward_std": 0.1400866061449051, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9124999642372131, "rewards/PlanningActionSetORM/std": 0.07187952846288681, "rewards/RMReward/mean": 0.734375, "rewards/RMReward/std": 0.08508574962615967, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.20470502972602844, "rewards/VisualPerceptionAccuracy/std": 0.20714351534843445, "step": 305, "train_speed(iter/s)": 0.018589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 137.4375, "completions/min_length": 80.0, "epoch": 0.00469714180456206, "frac_reward_zero_std": 0.0, "grad_norm": 2.0748798847198486, "kl": 0.02477068267762661, "learning_rate": 2.3480662983425416e-07, "loss": -0.02005063369870186, "memory(GiB)": 90.94, "reward": 0.8325895667076111, "reward_std": 0.09119254350662231, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9391977787017822, "rewards/PlanningActionSetORM/std": 0.06911656260490417, "rewards/RMReward/mean": 0.8059375286102295, "rewards/RMReward/std": 0.1263599395751953, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 306, "train_speed(iter/s)": 0.018533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 166.65625, "completions/min_length": 2.0, "epoch": 0.004712491941178277, "frac_reward_zero_std": 0.0, "grad_norm": 75.88184356689453, "kl": 0.012628353200852871, "learning_rate": 2.3557397176181708e-07, "loss": 0.028420981019735336, "memory(GiB)": 90.94, "reward": 0.2565680146217346, "reward_std": 0.21493792533874512, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": 0.1568860113620758, "rewards/VisualPerceptionAccuracy/std": 0.14487583935260773, "step": 307, "train_speed(iter/s)": 0.018539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/mean_length": 137.3125, "completions/min_length": 101.0, "epoch": 0.0047278420777944925, "frac_reward_zero_std": 0.0, "grad_norm": 1.7581294775009155, "kl": 0.02435879223048687, "learning_rate": 2.3634131368937998e-07, "loss": -0.026516973972320557, "memory(GiB)": 90.94, "reward": 0.8048355579376221, "reward_std": 0.1041460633277893, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9566778540611267, "rewards/PlanningActionSetORM/std": 0.06445877999067307, "rewards/RMReward/mean": 0.7668749690055847, "rewards/RMReward/std": 0.14794151484966278, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 308, "train_speed(iter/s)": 0.018524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 3.15625, "completions/min_length": 2.0, "epoch": 0.004743192214410708, "frac_reward_zero_std": 0.0, "grad_norm": 48.973575592041016, "kl": 0.11380545794963837, "learning_rate": 2.371086556169429e-07, "loss": -0.14899861812591553, "memory(GiB)": 90.94, "reward": 0.297187477350235, "reward_std": 0.10896115750074387, "rewards/MathAnswerFormat/mean": 0.125, "rewards/MathAnswerFormat/std": 0.33601075410842896, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.3062500059604645, "rewards/SpatialReasoningORM/std": 0.3444841802120209, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 309, "train_speed(iter/s)": 0.018517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/mean_length": 258.71875, "completions/min_length": 124.0, "epoch": 0.004758542351026924, "frac_reward_zero_std": 0.0, "grad_norm": 1.882107138633728, "kl": 0.004486497491598129, "learning_rate": 2.3787599754450586e-07, "loss": -0.005069933831691742, "memory(GiB)": 90.94, "reward": 0.43333661556243896, "reward_std": 0.17716188728809357, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9273617267608643, "rewards/PlanningActionSetORM/std": 0.034542616456747055, "rewards/RMReward/mean": 0.5306249856948853, "rewards/RMReward/std": 0.1816762238740921, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.25670087337493896, "rewards/VisualPerceptionAccuracy/std": 0.2075355052947998, "step": 310, "train_speed(iter/s)": 0.018516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/mean_length": 54.65625, "completions/min_length": 2.0, "epoch": 0.00477389248764314, "frac_reward_zero_std": 0.0, "grad_norm": 69.2463150024414, "kl": 0.12362534552812576, "learning_rate": 2.386433394720688e-07, "loss": -0.2277590036392212, "memory(GiB)": 90.94, "reward": 0.7017187476158142, "reward_std": 0.1461257040500641, "rewards/MathAnswerFormat/mean": 0.125, "rewards/MathAnswerFormat/std": 0.3415650427341461, "rewards/PlanningActionSetORM/mean": 0.8609374761581421, "rewards/PlanningActionSetORM/std": 0.15082240104675293, "rewards/RMReward/mean": 0.7593749761581421, "rewards/RMReward/std": 0.16352242231369019, "rewards/SpatialReasoningORM/mean": 0.6500000357627869, "rewards/SpatialReasoningORM/std": 0.1366260051727295, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 311, "train_speed(iter/s)": 0.018494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/mean_length": 131.0625, "completions/min_length": 85.0, "epoch": 0.004789242624259356, "frac_reward_zero_std": 0.0, "grad_norm": 2.0512075424194336, "kl": 0.038791075348854065, "learning_rate": 2.394106813996317e-07, "loss": 0.0051405602134764194, "memory(GiB)": 90.94, "reward": 0.7015451192855835, "reward_std": 0.12382762879133224, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9202256798744202, "rewards/PlanningActionSetORM/std": 0.09414877742528915, "rewards/RMReward/mean": 0.6468750238418579, "rewards/RMReward/std": 0.1887725591659546, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 312, "train_speed(iter/s)": 0.018485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 119.875, "completions/min_length": 3.0, "epoch": 0.004804592760875572, "frac_reward_zero_std": 0.0, "grad_norm": 37.544898986816406, "kl": 0.009047829546034336, "learning_rate": 2.4017802332719464e-07, "loss": 0.10455597937107086, "memory(GiB)": 90.94, "reward": 0.5796875357627869, "reward_std": 0.22260180115699768, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7093750238418579, "rewards/RMReward/std": 0.21542111039161682, "rewards/SpatialReasoningORM/mean": 0.4125000238418579, "rewards/SpatialReasoningORM/std": 0.28722813725471497, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 313, "train_speed(iter/s)": 0.018462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 218.03125, "completions/min_length": 92.0, "epoch": 0.004819942897491787, "frac_reward_zero_std": 0.0, "grad_norm": 1.4642857313156128, "kl": 0.025621727108955383, "learning_rate": 2.4094536525475757e-07, "loss": -0.044281188398599625, "memory(GiB)": 90.94, "reward": 0.7071774005889893, "reward_std": 0.10186073184013367, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9196368455886841, "rewards/PlanningActionSetORM/std": 0.11031360179185867, "rewards/RMReward/mean": 0.6540625095367432, "rewards/RMReward/std": 0.18389791250228882, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 314, "train_speed(iter/s)": 0.018412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/mean_length": 102.84375, "completions/min_length": 75.0, "epoch": 0.004835293034108004, "frac_reward_zero_std": 0.0, "grad_norm": 2.329535722732544, "kl": 0.0572013258934021, "learning_rate": 2.417127071823205e-07, "loss": -0.004357520490884781, "memory(GiB)": 90.94, "reward": 0.7693750262260437, "reward_std": 0.09427641332149506, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9031250476837158, "rewards/PlanningActionSetORM/std": 0.11382384598255157, "rewards/RMReward/mean": 0.7359374761581421, "rewards/RMReward/std": 0.12328459322452545, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 315, "train_speed(iter/s)": 0.018408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 5.34375, "completions/min_length": 2.0, "epoch": 0.0048506431707242195, "frac_reward_zero_std": 0.0, "grad_norm": 37.76637268066406, "kl": 0.34547412395477295, "learning_rate": 2.4248004910988336e-07, "loss": -0.08520107716321945, "memory(GiB)": 90.94, "reward": 0.5415624976158142, "reward_std": 0.16803036630153656, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.543749988079071, "rewards/SpatialReasoningORM/std": 0.4744691252708435, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 316, "train_speed(iter/s)": 0.018462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 3.625, "completions/min_length": 2.0, "epoch": 0.004865993307340435, "frac_reward_zero_std": 0.0, "grad_norm": 83.2608642578125, "kl": 0.26400554180145264, "learning_rate": 2.432473910374463e-07, "loss": -0.2099490761756897, "memory(GiB)": 90.94, "reward": 0.6031249761581421, "reward_std": 0.21172964572906494, "rewards/MathAnswerFormat/mean": 0.1875, "rewards/MathAnswerFormat/std": 0.3965577781200409, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.21997065842151642, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 317, "train_speed(iter/s)": 0.018517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 3.125, "completions/min_length": 2.0, "epoch": 0.004881343443956652, "frac_reward_zero_std": 0.0, "grad_norm": 73.52694702148438, "kl": 0.29166755080223083, "learning_rate": 2.440147329650092e-07, "loss": -0.28391721844673157, "memory(GiB)": 90.94, "reward": 0.6156250238418579, "reward_std": 0.3656988739967346, "rewards/MathAnswerFormat/mean": 0.375, "rewards/MathAnswerFormat/std": 0.5, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.19999998807907104, "rewards/VisualPerceptionAccuracy/mean": 0.5, "rewards/VisualPerceptionAccuracy/std": 0.5163977742195129, "step": 318, "train_speed(iter/s)": 0.018571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/mean_length": 63.09375, "completions/min_length": 15.0, "epoch": 0.004896693580572867, "frac_reward_zero_std": 0.0, "grad_norm": 4.059166431427002, "kl": 0.01514272391796112, "learning_rate": 2.4478207489257214e-07, "loss": -0.025901587679982185, "memory(GiB)": 90.94, "reward": 0.7610937356948853, "reward_std": 0.255577951669693, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8890624642372131, "rewards/PlanningActionSetORM/std": 0.10366964340209961, "rewards/RMReward/mean": 0.6531250476837158, "rewards/RMReward/std": 0.1564914733171463, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 319, "train_speed(iter/s)": 0.01857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/mean_length": 119.78125, "completions/min_length": 14.0, "epoch": 0.004912043717189083, "frac_reward_zero_std": 0.0, "grad_norm": 6.466460227966309, "kl": 0.009371803142130375, "learning_rate": 2.4554941682013507e-07, "loss": 0.059241969138383865, "memory(GiB)": 90.94, "reward": 0.7331423759460449, "reward_std": 0.24879232048988342, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8626735806465149, "rewards/PlanningActionSetORM/std": 0.17267553508281708, "rewards/RMReward/mean": 0.515625, "rewards/RMReward/std": 0.1903669834136963, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 320, "train_speed(iter/s)": 0.018564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 6.9375, "completions/min_length": 2.0, "epoch": 0.0049273938538052986, "frac_reward_zero_std": 0.0, "grad_norm": 55.65595626831055, "kl": 0.8107352256774902, "learning_rate": 2.46316758747698e-07, "loss": -0.14457279443740845, "memory(GiB)": 90.94, "reward": 0.4978124797344208, "reward_std": 0.2160358428955078, "rewards/MathAnswerFormat/mean": 0.8125, "rewards/MathAnswerFormat/std": 0.3965577781200409, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.48124998807907104, "rewards/SpatialReasoningORM/std": 0.4761658012866974, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 321, "train_speed(iter/s)": 0.018561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/mean_length": 6.625, "completions/min_length": 2.0, "epoch": 0.004942743990421515, "frac_reward_zero_std": 0.0, "grad_norm": 56.691410064697266, "kl": 0.8449750542640686, "learning_rate": 2.470841006752609e-07, "loss": 0.1449773609638214, "memory(GiB)": 90.94, "reward": 0.6534374952316284, "reward_std": 0.27516689896583557, "rewards/MathAnswerFormat/mean": 0.71875, "rewards/MathAnswerFormat/std": 0.45680341124534607, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6499999761581421, "rewards/SpatialReasoningORM/std": 0.4158163070678711, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 322, "train_speed(iter/s)": 0.01861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 7.53125, "completions/min_length": 3.0, "epoch": 0.004958094127037731, "frac_reward_zero_std": 0.0, "grad_norm": 39.76944351196289, "kl": 0.6782459020614624, "learning_rate": 2.4785144260282385e-07, "loss": 0.047997843474149704, "memory(GiB)": 90.94, "reward": 0.5765625238418579, "reward_std": 0.3690255582332611, "rewards/MathAnswerFormat/mean": 0.84375, "rewards/MathAnswerFormat/std": 0.3689020276069641, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.46402865648269653, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 323, "train_speed(iter/s)": 0.018664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/mean_length": 77.15625, "completions/min_length": 15.0, "epoch": 0.004973444263653946, "frac_reward_zero_std": 0.0, "grad_norm": 3.8141930103302, "kl": 0.022982986643910408, "learning_rate": 2.486187845303868e-07, "loss": 0.005099453032016754, "memory(GiB)": 90.94, "reward": 0.873824417591095, "reward_std": 0.1537667214870453, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.985119104385376, "rewards/PlanningActionSetORM/std": 0.032091375440359116, "rewards/RMReward/mean": 0.7625000476837158, "rewards/RMReward/std": 0.08660254627466202, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 324, "train_speed(iter/s)": 0.018631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/mean_length": 55.5625, "completions/min_length": 8.0, "epoch": 0.004988794400270162, "frac_reward_zero_std": 0.0, "grad_norm": 24.765745162963867, "kl": 0.3491430878639221, "learning_rate": 2.4938612645794965e-07, "loss": 0.035843439400196075, "memory(GiB)": 90.94, "reward": 0.639046847820282, "reward_std": 0.3086775839328766, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8804687261581421, "rewards/PlanningActionSetORM/std": 0.13881707191467285, "rewards/RMReward/mean": 0.7212499976158142, "rewards/RMReward/std": 0.1388944536447525, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 325, "train_speed(iter/s)": 0.018632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 10.5, "completions/min_length": 2.0, "epoch": 0.0050041445368863785, "frac_reward_zero_std": 0.0, "grad_norm": 24.16642951965332, "kl": 0.39112770557403564, "learning_rate": 2.501534683855126e-07, "loss": -0.03907117247581482, "memory(GiB)": 90.94, "reward": 0.49031245708465576, "reward_std": 0.40324753522872925, "rewards/MathAnswerFormat/mean": 0.78125, "rewards/MathAnswerFormat/std": 0.420013427734375, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.4749999940395355, "rewards/SpatialReasoningORM/std": 0.4508057236671448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 326, "train_speed(iter/s)": 0.018681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/mean_length": 53.75, "completions/min_length": 8.0, "epoch": 0.005019494673502594, "frac_reward_zero_std": 0.0, "grad_norm": 10.639087677001953, "kl": 0.39416438341140747, "learning_rate": 2.509208103130755e-07, "loss": 0.027470186352729797, "memory(GiB)": 90.94, "reward": 0.8722395896911621, "reward_std": 0.15872883796691895, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9692708253860474, "rewards/PlanningActionSetORM/std": 0.0674755647778511, "rewards/RMReward/mean": 0.762499988079071, "rewards/RMReward/std": 0.09036961197853088, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 327, "train_speed(iter/s)": 0.018703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 139.84375, "completions/min_length": 96.0, "epoch": 0.00503484481011881, "frac_reward_zero_std": 0.0, "grad_norm": 1.73220694065094, "kl": 0.030575353652238846, "learning_rate": 2.516881522406385e-07, "loss": -0.027906153351068497, "memory(GiB)": 90.94, "reward": 0.6979092359542847, "reward_std": 0.11245512217283249, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9645460844039917, "rewards/PlanningActionSetORM/std": 0.06889300048351288, "rewards/RMReward/mean": 0.6312500238418579, "rewards/RMReward/std": 0.1925005316734314, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 328, "train_speed(iter/s)": 0.018677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 152.5625, "completions/min_length": 94.0, "epoch": 0.005050194946735026, "frac_reward_zero_std": 0.0, "grad_norm": 1.7942774295806885, "kl": 0.030015017837285995, "learning_rate": 2.524554941682014e-07, "loss": 0.013897361233830452, "memory(GiB)": 90.94, "reward": 0.7316173315048218, "reward_std": 0.1398654580116272, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9280866384506226, "rewards/PlanningActionSetORM/std": 0.06629309058189392, "rewards/RMReward/mean": 0.6825000047683716, "rewards/RMReward/std": 0.17383715510368347, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 329, "train_speed(iter/s)": 0.018659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/mean_length": 107.84375, "completions/min_length": 76.0, "epoch": 0.005065545083351242, "frac_reward_zero_std": 0.0, "grad_norm": 3.412503957748413, "kl": 0.042393676936626434, "learning_rate": 2.5322283609576433e-07, "loss": 0.029820241034030914, "memory(GiB)": 90.94, "reward": 0.7446205615997314, "reward_std": 0.12860970199108124, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9043526649475098, "rewards/PlanningActionSetORM/std": 0.14169993996620178, "rewards/RMReward/mean": 0.7046874761581421, "rewards/RMReward/std": 0.15931271016597748, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 330, "train_speed(iter/s)": 0.018665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/mean_length": 16.09375, "completions/min_length": 14.0, "epoch": 0.005080895219967458, "frac_reward_zero_std": 0.0, "grad_norm": 7.147068500518799, "kl": 0.004083638545125723, "learning_rate": 2.539901780233272e-07, "loss": -0.013546787202358246, "memory(GiB)": 90.94, "reward": 0.940625011920929, "reward_std": 0.23749999701976776, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.24593468010425568, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 331, "train_speed(iter/s)": 0.018714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 219.0625, "completions/min_length": 120.0, "epoch": 0.005096245356583673, "frac_reward_zero_std": 0.0, "grad_norm": 2.115833282470703, "kl": 0.00136750063393265, "learning_rate": 2.5475751995089013e-07, "loss": 0.0305488221347332, "memory(GiB)": 90.94, "reward": 0.1882385015487671, "reward_std": 0.20122528076171875, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1882385015487671, "rewards/VisualPerceptionAccuracy/std": 0.2181689739227295, "step": 332, "train_speed(iter/s)": 0.018758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/mean_length": 120.78125, "completions/min_length": 8.0, "epoch": 0.00511159549319989, "frac_reward_zero_std": 0.0, "grad_norm": 14.181467056274414, "kl": 0.49287810921669006, "learning_rate": 2.5552486187845306e-07, "loss": 0.02963319793343544, "memory(GiB)": 90.94, "reward": 0.8227767944335938, "reward_std": 0.2947598397731781, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.980893075466156, "rewards/PlanningActionSetORM/std": 0.03226381540298462, "rewards/RMReward/mean": 0.784375011920929, "rewards/RMReward/std": 0.2534552812576294, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 333, "train_speed(iter/s)": 0.018726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/mean_length": 57.40625, "completions/min_length": 8.0, "epoch": 0.0051269456298161055, "frac_reward_zero_std": 0.0, "grad_norm": 10.445649147033691, "kl": 0.49090543389320374, "learning_rate": 2.56292203806016e-07, "loss": 0.015447739511728287, "memory(GiB)": 90.94, "reward": 0.8411383628845215, "reward_std": 0.16545026004314423, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8582589626312256, "rewards/PlanningActionSetORM/std": 0.18158279359340668, "rewards/RMReward/mean": 0.7124999761581421, "rewards/RMReward/std": 0.07852812856435776, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 334, "train_speed(iter/s)": 0.018744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 156.53125, "completions/min_length": 59.0, "epoch": 0.005142295766432321, "frac_reward_zero_std": 0.0, "grad_norm": 2.6798758506774902, "kl": 0.011836092919111252, "learning_rate": 2.570595457335789e-07, "loss": -0.08109622448682785, "memory(GiB)": 90.94, "reward": 0.5614436268806458, "reward_std": 0.1696593463420868, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9919143319129944, "rewards/PlanningActionSetORM/std": 0.017446067184209824, "rewards/RMReward/mean": 0.9049999713897705, "rewards/RMReward/std": 0.154012992978096, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2005043774843216, "rewards/VisualPerceptionAccuracy/std": 0.2135176807641983, "step": 335, "train_speed(iter/s)": 0.018688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/mean_length": 61.59375, "completions/min_length": 14.0, "epoch": 0.005157645903048537, "frac_reward_zero_std": 0.0, "grad_norm": 5.3191094398498535, "kl": 0.020233457908034325, "learning_rate": 2.5782688766114184e-07, "loss": -0.0014830529689788818, "memory(GiB)": 90.94, "reward": 0.7931250333786011, "reward_std": 0.24099653959274292, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9937499761581421, "rewards/PlanningActionSetORM/std": 0.025000005960464478, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.06800736486911774, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 336, "train_speed(iter/s)": 0.018704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 129.53125, "completions/min_length": 31.0, "epoch": 0.005172996039664753, "frac_reward_zero_std": 0.0, "grad_norm": 2.4941675662994385, "kl": 0.04488375037908554, "learning_rate": 2.5859422958870476e-07, "loss": -0.059857845306396484, "memory(GiB)": 90.94, "reward": 0.763136088848114, "reward_std": 0.13809242844581604, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9431805610656738, "rewards/PlanningActionSetORM/std": 0.15614719688892365, "rewards/RMReward/mean": 0.7181249856948853, "rewards/RMReward/std": 0.1696949303150177, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 337, "train_speed(iter/s)": 0.018681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 117.96875, "completions/min_length": 8.0, "epoch": 0.005188346176280969, "frac_reward_zero_std": 0.0, "grad_norm": 14.514723777770996, "kl": 0.32571181654930115, "learning_rate": 2.593615715162677e-07, "loss": 0.02252466231584549, "memory(GiB)": 90.94, "reward": 0.5032856464385986, "reward_std": 0.21158647537231445, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.12532123923301697, "rewards/VisualPerceptionAccuracy/std": 0.09868615120649338, "step": 338, "train_speed(iter/s)": 0.018725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/mean_length": 106.375, "completions/min_length": 8.0, "epoch": 0.005203696312897185, "frac_reward_zero_std": 0.0, "grad_norm": 18.40239906311035, "kl": 0.5957368016242981, "learning_rate": 2.6012891344383056e-07, "loss": 0.03751669079065323, "memory(GiB)": 90.94, "reward": 0.3859526515007019, "reward_std": 0.25605887174606323, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": 0.06878028810024261, "rewards/VisualPerceptionAccuracy/std": 0.05733989179134369, "step": 339, "train_speed(iter/s)": 0.018762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 185.6875, "completions/min_length": 78.0, "epoch": 0.005219046449513401, "frac_reward_zero_std": 0.0, "grad_norm": 1.5062036514282227, "kl": 0.02235727570950985, "learning_rate": 2.608962553713935e-07, "loss": -0.016113460063934326, "memory(GiB)": 90.94, "reward": 0.7802306413650513, "reward_std": 0.16332074999809265, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9761532545089722, "rewards/PlanningActionSetORM/std": 0.07543820142745972, "rewards/RMReward/mean": 0.731249988079071, "rewards/RMReward/std": 0.20703357458114624, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 340, "train_speed(iter/s)": 0.018682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 105.375, "completions/min_length": 8.0, "epoch": 0.005234396586129617, "frac_reward_zero_std": 0.0, "grad_norm": 10.904153823852539, "kl": 0.5950703024864197, "learning_rate": 2.616635972989564e-07, "loss": 0.0020531564950942993, "memory(GiB)": 90.94, "reward": 0.8916676044464111, "reward_std": 0.15556196868419647, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9035511016845703, "rewards/PlanningActionSetORM/std": 0.028178995475172997, "rewards/RMReward/mean": 0.8274999856948853, "rewards/RMReward/std": 0.09183318167924881, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 341, "train_speed(iter/s)": 0.018657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 126.625, "completions/min_length": 66.0, "epoch": 0.005249746722745832, "frac_reward_zero_std": 0.0, "grad_norm": 2.2567250728607178, "kl": 0.021947788074612617, "learning_rate": 2.6243093922651934e-07, "loss": 0.007912321016192436, "memory(GiB)": 90.94, "reward": 0.38794073462486267, "reward_std": 0.13632646203041077, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9515625238418579, "rewards/PlanningActionSetORM/std": 0.11563906818628311, "rewards/RMReward/mean": 0.6081249713897705, "rewards/RMReward/std": 0.19332937896251678, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09906892478466034, "rewards/VisualPerceptionAccuracy/std": 0.11737886071205139, "step": 342, "train_speed(iter/s)": 0.01863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 193.59375, "completions/min_length": 72.0, "epoch": 0.005265096859362048, "frac_reward_zero_std": 0.0, "grad_norm": 2.054049015045166, "kl": 0.026317952200770378, "learning_rate": 2.6319828115408227e-07, "loss": 0.035576559603214264, "memory(GiB)": 90.94, "reward": 0.7837036848068237, "reward_std": 0.1219165027141571, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8897684812545776, "rewards/PlanningActionSetORM/std": 0.15041889250278473, "rewards/RMReward/mean": 0.7571874856948853, "rewards/RMReward/std": 0.1503809094429016, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 343, "train_speed(iter/s)": 0.018616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/mean_length": 78.875, "completions/min_length": 8.0, "epoch": 0.0052804469959782645, "frac_reward_zero_std": 0.0, "grad_norm": 13.146554946899414, "kl": 0.23900313675403595, "learning_rate": 2.639656230816452e-07, "loss": -0.010339796543121338, "memory(GiB)": 90.94, "reward": 0.4078125059604645, "reward_std": 0.1909274458885193, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9937499761581421, "rewards/PlanningActionSetORM/std": 0.025000005960464478, "rewards/RMReward/mean": 0.6343749761581421, "rewards/RMReward/std": 0.17674723267555237, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 344, "train_speed(iter/s)": 0.01861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/mean_length": 11.0625, "completions/min_length": 8.0, "epoch": 0.00529579713259448, "frac_reward_zero_std": 0.0, "grad_norm": 23.12652015686035, "kl": 0.5650503635406494, "learning_rate": 2.647329650092081e-07, "loss": 0.007832512259483337, "memory(GiB)": 90.94, "reward": 0.6437499523162842, "reward_std": 0.45771539211273193, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.49186936020851135, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 345, "train_speed(iter/s)": 0.018657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/mean_length": 59.6875, "completions/min_length": 8.0, "epoch": 0.005311147269210696, "frac_reward_zero_std": 0.0, "grad_norm": 7.7051520347595215, "kl": 0.2803463041782379, "learning_rate": 2.6550030693677105e-07, "loss": 0.010003305971622467, "memory(GiB)": 90.94, "reward": 0.44311755895614624, "reward_std": 0.1777723729610443, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9343006014823914, "rewards/PlanningActionSetORM/std": 0.09089004993438721, "rewards/RMReward/mean": 0.737500011920929, "rewards/RMReward/std": 0.13723459839820862, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 346, "train_speed(iter/s)": 0.018658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/mean_length": 50.125, "completions/min_length": 8.0, "epoch": 0.0053264974058269115, "frac_reward_zero_std": 0.0, "grad_norm": 7.9852728843688965, "kl": 0.29858291149139404, "learning_rate": 2.6626764886433397e-07, "loss": 0.012045308947563171, "memory(GiB)": 90.94, "reward": 0.8425520658493042, "reward_std": 0.17694485187530518, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8348958492279053, "rewards/PlanningActionSetORM/std": 0.03958333283662796, "rewards/RMReward/mean": 0.7218749523162842, "rewards/RMReward/std": 0.1460236757993698, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 347, "train_speed(iter/s)": 0.018646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/mean_length": 117.4375, "completions/min_length": 13.0, "epoch": 0.005341847542443128, "frac_reward_zero_std": 0.0, "grad_norm": 2.8707973957061768, "kl": 0.061439741402864456, "learning_rate": 2.670349907918969e-07, "loss": -0.09103171527385712, "memory(GiB)": 90.94, "reward": 0.6763070821762085, "reward_std": 0.11358202248811722, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9190351963043213, "rewards/PlanningActionSetORM/std": 0.16236673295497894, "rewards/RMReward/mean": 0.6156250238418579, "rewards/RMReward/std": 0.18510568141937256, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 348, "train_speed(iter/s)": 0.018629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/mean_length": 284.71875, "completions/min_length": 128.0, "epoch": 0.005357197679059344, "frac_reward_zero_std": 0.0, "grad_norm": 1.7439560890197754, "kl": 0.017627330496907234, "learning_rate": 2.678023327194598e-07, "loss": 0.06376723200082779, "memory(GiB)": 90.94, "reward": 0.39391112327575684, "reward_std": 0.1673855185508728, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9519176483154297, "rewards/PlanningActionSetORM/std": 0.10918691754341125, "rewards/RMReward/mean": 0.640625, "rewards/RMReward/std": 0.17721809446811676, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08493875712156296, "rewards/VisualPerceptionAccuracy/std": 0.18869100511074066, "step": 349, "train_speed(iter/s)": 0.018603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/mean_length": 176.3125, "completions/min_length": 19.0, "epoch": 0.005372547815675559, "frac_reward_zero_std": 0.0, "grad_norm": 2.253011703491211, "kl": 0.03262466937303543, "learning_rate": 2.6856967464702275e-07, "loss": -0.0913693830370903, "memory(GiB)": 90.94, "reward": 0.8012098073959351, "reward_std": 0.1726333200931549, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9485490918159485, "rewards/PlanningActionSetORM/std": 0.156251460313797, "rewards/RMReward/mean": 0.7643749713897705, "rewards/RMReward/std": 0.23478111624717712, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 350, "train_speed(iter/s)": 0.018591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/mean_length": 123.96875, "completions/min_length": 82.0, "epoch": 0.005387897952291776, "frac_reward_zero_std": 0.0, "grad_norm": 2.0037851333618164, "kl": 0.03359844908118248, "learning_rate": 2.693370165745857e-07, "loss": 0.05935042351484299, "memory(GiB)": 90.94, "reward": 0.7387509346008301, "reward_std": 0.1115424782037735, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375045299530029, "rewards/PlanningActionSetORM/std": 0.12485206127166748, "rewards/RMReward/mean": 0.6890624761581421, "rewards/RMReward/std": 0.14958779513835907, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 351, "train_speed(iter/s)": 0.018598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/mean_length": 60.40625, "completions/min_length": 15.0, "epoch": 0.0054032480889079915, "frac_reward_zero_std": 0.0, "grad_norm": 3.409604787826538, "kl": 0.028886564075946808, "learning_rate": 2.701043585021486e-07, "loss": 0.02111688256263733, "memory(GiB)": 90.94, "reward": 0.8614062666893005, "reward_std": 0.15789683163166046, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9234374761581421, "rewards/PlanningActionSetORM/std": 0.1070314347743988, "rewards/RMReward/mean": 0.7468750476837158, "rewards/RMReward/std": 0.08055794984102249, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 352, "train_speed(iter/s)": 0.018614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/mean_length": 117.75, "completions/min_length": 65.0, "epoch": 0.005418598225524207, "frac_reward_zero_std": 0.0, "grad_norm": 2.3513882160186768, "kl": 0.04966942220926285, "learning_rate": 2.7087170042971153e-07, "loss": -0.04872560873627663, "memory(GiB)": 90.94, "reward": 0.7409374713897705, "reward_std": 0.11522465199232101, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8796875476837158, "rewards/PlanningActionSetORM/std": 0.1328778713941574, "rewards/RMReward/mean": 0.706250011920929, "rewards/RMReward/std": 0.14577379822731018, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 353, "train_speed(iter/s)": 0.0186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 123.53125, "completions/min_length": 77.0, "epoch": 0.005433948362140423, "frac_reward_zero_std": 0.0, "grad_norm": 2.1784842014312744, "kl": 0.025875072926282883, "learning_rate": 2.716390423572744e-07, "loss": -0.006058782339096069, "memory(GiB)": 90.94, "reward": 0.41199782490730286, "reward_std": 0.028976568952202797, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7781250476837158, "rewards/RMReward/std": 0.06823673099279404, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.001495571224950254, "rewards/VisualPerceptionAccuracy/std": 0.003363769967108965, "step": 354, "train_speed(iter/s)": 0.018581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 136.875, "completions/min_length": 59.0, "epoch": 0.005449298498756639, "frac_reward_zero_std": 0.0, "grad_norm": 2.689237594604492, "kl": 0.029217194765806198, "learning_rate": 2.7240638428483733e-07, "loss": 0.04970578849315643, "memory(GiB)": 90.94, "reward": 0.38980650901794434, "reward_std": 0.11163654923439026, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9362102746963501, "rewards/PlanningActionSetORM/std": 0.11541472375392914, "rewards/RMReward/mean": 0.5843750238418579, "rewards/RMReward/std": 0.1719677895307541, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.12487097084522247, "rewards/VisualPerceptionAccuracy/std": 0.07383424788713455, "step": 355, "train_speed(iter/s)": 0.018596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 159.25, "completions/min_length": 14.0, "epoch": 0.005464648635372855, "frac_reward_zero_std": 0.0, "grad_norm": 9.160409927368164, "kl": 0.0032501842360943556, "learning_rate": 2.7317372621240025e-07, "loss": 0.07472329586744308, "memory(GiB)": 90.94, "reward": 0.38999781012535095, "reward_std": 0.3051791191101074, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.13624566793441772, "rewards/VisualPerceptionAccuracy/std": 0.13535825908184052, "step": 356, "train_speed(iter/s)": 0.018637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/mean_length": 103.5625, "completions/min_length": 70.0, "epoch": 0.005479998771989071, "frac_reward_zero_std": 0.0, "grad_norm": 2.5257959365844727, "kl": 0.0482555627822876, "learning_rate": 2.739410681399632e-07, "loss": 0.0019192248582839966, "memory(GiB)": 90.94, "reward": 0.7696458697319031, "reward_std": 0.0705762654542923, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9182291626930237, "rewards/PlanningActionSetORM/std": 0.08133181184530258, "rewards/RMReward/mean": 0.7324999570846558, "rewards/RMReward/std": 0.11039518564939499, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 357, "train_speed(iter/s)": 0.018641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/mean_length": 201.8125, "completions/min_length": 128.0, "epoch": 0.005495348908605286, "frac_reward_zero_std": 0.0, "grad_norm": 1.395135521888733, "kl": 0.01670236699283123, "learning_rate": 2.747084100675261e-07, "loss": 0.027523232623934746, "memory(GiB)": 90.94, "reward": 0.4602298140525818, "reward_std": 0.056122321635484695, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8895833492279053, "rewards/PlanningActionSetORM/std": 0.03298428654670715, "rewards/RMReward/mean": 0.9181250333786011, "rewards/RMReward/std": 0.1073448583483696, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.00804293341934681, "rewards/VisualPerceptionAccuracy/std": 0.023530589416623116, "step": 358, "train_speed(iter/s)": 0.018629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/mean_length": 11.46875, "completions/min_length": 8.0, "epoch": 0.005510699045221503, "frac_reward_zero_std": 0.0, "grad_norm": 20.41734504699707, "kl": 0.5899683237075806, "learning_rate": 2.7547575199508903e-07, "loss": -0.002775849774479866, "memory(GiB)": 90.94, "reward": 0.792187511920929, "reward_std": 0.38963234424591064, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.78125, "rewards/SpatialReasoningORM/std": 0.420013427734375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 359, "train_speed(iter/s)": 0.018673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 188.40625, "completions/min_length": 131.0, "epoch": 0.005526049181837718, "frac_reward_zero_std": 0.0, "grad_norm": 1.2878599166870117, "kl": 0.04074227809906006, "learning_rate": 2.7624309392265196e-07, "loss": -0.0149923637509346, "memory(GiB)": 90.94, "reward": 0.8557677865028381, "reward_std": 0.09729248285293579, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8763389587402344, "rewards/PlanningActionSetORM/std": 0.035575322806835175, "rewards/RMReward/mean": 0.8506249785423279, "rewards/RMReward/std": 0.14745010435581207, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 360, "train_speed(iter/s)": 0.018631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 454.59375, "completions/min_length": 13.0, "epoch": 0.005541399318453934, "frac_reward_zero_std": 0.0, "grad_norm": 3.8303887844085693, "kl": 0.0017015081830322742, "learning_rate": 2.770104358502149e-07, "loss": 0.004403959959745407, "memory(GiB)": 90.94, "reward": 0.12346434593200684, "reward_std": 0.19705575704574585, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.13755369186401367, "rewards/VisualPerceptionAccuracy/std": 0.15661151707172394, "step": 361, "train_speed(iter/s)": 0.018637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 237.78125, "completions/min_length": 125.0, "epoch": 0.0055567494550701505, "frac_reward_zero_std": 0.0, "grad_norm": 1.7315607070922852, "kl": 0.02706998772919178, "learning_rate": 2.7777777777777776e-07, "loss": -0.014040226116776466, "memory(GiB)": 90.94, "reward": 0.5349205732345581, "reward_std": 0.12061825394630432, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.98828125, "rewards/PlanningActionSetORM/std": 0.033994100987911224, "rewards/RMReward/mean": 0.8493750095367432, "rewards/RMReward/std": 0.1677485853433609, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.19268494844436646, "rewards/VisualPerceptionAccuracy/std": 0.10514933615922928, "step": 362, "train_speed(iter/s)": 0.018627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 426.78125, "completions/min_length": 88.0, "epoch": 0.005572099591686366, "frac_reward_zero_std": 0.0, "grad_norm": 1.8682568073272705, "kl": 0.023917239159345627, "learning_rate": 2.7854511970534074e-07, "loss": 0.05158315598964691, "memory(GiB)": 90.94, "reward": 0.39200934767723083, "reward_std": 0.15267488360404968, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8578327894210815, "rewards/PlanningActionSetORM/std": 0.06173335760831833, "rewards/RMReward/mean": 0.6356250047683716, "rewards/RMReward/std": 0.24096940457820892, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.10395212471485138, "rewards/VisualPerceptionAccuracy/std": 0.11066805571317673, "step": 363, "train_speed(iter/s)": 0.018575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 73.6875, "completions/min_length": 14.0, "epoch": 0.005587449728302582, "frac_reward_zero_std": 0.0, "grad_norm": 14.789115905761719, "kl": 0.036816105246543884, "learning_rate": 2.7931246163290366e-07, "loss": -0.003858394455164671, "memory(GiB)": 90.94, "reward": 0.7825000286102295, "reward_std": 0.25207069516181946, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.753125011920929, "rewards/RMReward/std": 0.09911063313484192, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 364, "train_speed(iter/s)": 0.018582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/mean_length": 11.3125, "completions/min_length": 8.0, "epoch": 0.0056027998649187975, "frac_reward_zero_std": 0.0, "grad_norm": 16.3826847076416, "kl": 0.32743358612060547, "learning_rate": 2.800798035604666e-07, "loss": -0.03142614662647247, "memory(GiB)": 90.94, "reward": 0.6734374761581421, "reward_std": 0.45579153299331665, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.65625, "rewards/SpatialReasoningORM/std": 0.4825586974620819, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 365, "train_speed(iter/s)": 0.018625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 65.59375, "completions/min_length": 8.0, "epoch": 0.005618150001535014, "frac_reward_zero_std": 0.0, "grad_norm": 8.535293579101562, "kl": 0.5225857496261597, "learning_rate": 2.808471454880295e-07, "loss": 0.0024574175477027893, "memory(GiB)": 90.94, "reward": 0.12931717932224274, "reward_std": 0.15877670049667358, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.14925935864448547, "rewards/VisualPerceptionAccuracy/std": 0.08005338907241821, "step": 366, "train_speed(iter/s)": 0.018669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.46875, "completions/min_length": 8.0, "epoch": 0.00563350013815123, "frac_reward_zero_std": 0.0, "grad_norm": 27.61190414428711, "kl": 0.6816034317016602, "learning_rate": 2.8161448741559244e-07, "loss": -0.010028313845396042, "memory(GiB)": 90.94, "reward": 0.8515625, "reward_std": 0.3311764597892761, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.84375, "rewards/SpatialReasoningORM/std": 0.3689020276069641, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 367, "train_speed(iter/s)": 0.018716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/mean_length": 182.8125, "completions/min_length": 14.0, "epoch": 0.005648850274767445, "frac_reward_zero_std": 0.0, "grad_norm": 7.628173351287842, "kl": 0.013892881572246552, "learning_rate": 2.823818293431553e-07, "loss": -0.03440096601843834, "memory(GiB)": 90.94, "reward": 0.7932065725326538, "reward_std": 0.2741246223449707, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9101909399032593, "rewards/PlanningActionSetORM/std": 0.12460445612668991, "rewards/RMReward/mean": 0.7281249761581421, "rewards/RMReward/std": 0.20163394510746002, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 368, "train_speed(iter/s)": 0.018702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 161.125, "completions/min_length": 105.0, "epoch": 0.005664200411383661, "frac_reward_zero_std": 0.0, "grad_norm": 2.1131131649017334, "kl": 0.03422679752111435, "learning_rate": 2.8314917127071824e-07, "loss": 0.08159883320331573, "memory(GiB)": 90.94, "reward": 0.7908333539962769, "reward_std": 0.11803670227527618, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9479166865348816, "rewards/PlanningActionSetORM/std": 0.10528402030467987, "rewards/RMReward/mean": 0.7515624761581421, "rewards/RMReward/std": 0.16775768995285034, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 369, "train_speed(iter/s)": 0.018682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/mean_length": 55.8125, "completions/min_length": 8.0, "epoch": 0.0056795505479998775, "frac_reward_zero_std": 0.0, "grad_norm": 31.585386276245117, "kl": 0.5473757982254028, "learning_rate": 2.8391651319828117e-07, "loss": -0.0048963166773319244, "memory(GiB)": 90.94, "reward": 0.7582812309265137, "reward_std": 0.2770772874355316, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9046874642372131, "rewards/PlanningActionSetORM/std": 0.057892683893442154, "rewards/RMReward/mean": 0.7906249761581421, "rewards/RMReward/std": 0.1214066818356514, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 370, "train_speed(iter/s)": 0.01866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 143.34375, "completions/min_length": 14.0, "epoch": 0.005694900684616093, "frac_reward_zero_std": 0.0, "grad_norm": 4.196388244628906, "kl": 0.02200176566839218, "learning_rate": 2.846838551258441e-07, "loss": 0.012642137706279755, "memory(GiB)": 90.94, "reward": 0.4581249952316284, "reward_std": 0.23331844806671143, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9249999523162842, "rewards/PlanningActionSetORM/std": 0.16124515235424042, "rewards/RMReward/mean": 0.703125, "rewards/RMReward/std": 0.1543467491865158, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 371, "train_speed(iter/s)": 0.018591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/mean_length": 100.78125, "completions/min_length": 89.0, "epoch": 0.005710250821232309, "frac_reward_zero_std": 0.0, "grad_norm": 2.089395523071289, "kl": 0.07693469524383545, "learning_rate": 2.85451197053407e-07, "loss": 0.004139772616326809, "memory(GiB)": 90.94, "reward": 0.9087083339691162, "reward_std": 0.11719280481338501, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9385416507720947, "rewards/PlanningActionSetORM/std": 0.07712782174348831, "rewards/RMReward/mean": 0.9012500047683716, "rewards/RMReward/std": 0.1535126268863678, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 372, "train_speed(iter/s)": 0.018549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/mean_length": 49.46875, "completions/min_length": 8.0, "epoch": 0.005725600957848525, "frac_reward_zero_std": 0.0, "grad_norm": 20.530757904052734, "kl": 0.351406991481781, "learning_rate": 2.8621853898096995e-07, "loss": -0.005925949662923813, "memory(GiB)": 90.94, "reward": 0.8868750333786011, "reward_std": 0.18828821182250977, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8656250238418579, "rewards/RMReward/std": 0.06511207669973373, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 373, "train_speed(iter/s)": 0.01855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/mean_length": 127.84375, "completions/min_length": 66.0, "epoch": 0.005740951094464741, "frac_reward_zero_std": 0.0, "grad_norm": 2.3212063312530518, "kl": 0.03713265061378479, "learning_rate": 2.869858809085329e-07, "loss": 0.025446007028222084, "memory(GiB)": 90.94, "reward": 0.8466145992279053, "reward_std": 0.09506843239068985, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9643229246139526, "rewards/PlanningActionSetORM/std": 0.0973721593618393, "rewards/RMReward/mean": 0.817187488079071, "rewards/RMReward/std": 0.17376725375652313, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 374, "train_speed(iter/s)": 0.018548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/mean_length": 71.15625, "completions/min_length": 13.0, "epoch": 0.005756301231080957, "frac_reward_zero_std": 0.0, "grad_norm": 4.694615364074707, "kl": 0.016498874872922897, "learning_rate": 2.877532228360958e-07, "loss": -0.00246397964656353, "memory(GiB)": 90.94, "reward": 0.8654761910438538, "reward_std": 0.19288775324821472, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9610118865966797, "rewards/PlanningActionSetORM/std": 0.04849882051348686, "rewards/RMReward/mean": 0.8218749761581421, "rewards/RMReward/std": 0.07520804554224014, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 375, "train_speed(iter/s)": 0.018552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/mean_length": 353.03125, "completions/min_length": 82.0, "epoch": 0.005771651367697172, "frac_reward_zero_std": 0.0, "grad_norm": 1.53122079372406, "kl": 0.01245222520083189, "learning_rate": 2.8852056476365873e-07, "loss": 0.26973453164100647, "memory(GiB)": 90.94, "reward": 0.4467134475708008, "reward_std": 0.16698651015758514, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9137298464775085, "rewards/PlanningActionSetORM/std": 0.09274698048830032, "rewards/RMReward/mean": 0.5399999618530273, "rewards/RMReward/std": 0.14764824509620667, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2786809504032135, "rewards/VisualPerceptionAccuracy/std": 0.20531971752643585, "step": 376, "train_speed(iter/s)": 0.018525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/mean_length": 175.65625, "completions/min_length": 57.0, "epoch": 0.005787001504313389, "frac_reward_zero_std": 0.0, "grad_norm": 2.941678285598755, "kl": 0.015690188854932785, "learning_rate": 2.892879066912216e-07, "loss": 0.020065680146217346, "memory(GiB)": 90.94, "reward": 0.3847016990184784, "reward_std": 0.09546282142400742, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9057291150093079, "rewards/PlanningActionSetORM/std": 0.08359508216381073, "rewards/RMReward/mean": 0.703125, "rewards/RMReward/std": 0.16172894835472107, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0257575660943985, "rewards/VisualPerceptionAccuracy/std": 0.058763813227415085, "step": 377, "train_speed(iter/s)": 0.018514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 362.46875, "completions/min_length": 89.0, "epoch": 0.005802351640929604, "frac_reward_zero_std": 0.0, "grad_norm": 1.8783869743347168, "kl": 0.033071406185626984, "learning_rate": 2.900552486187845e-07, "loss": 0.18317781388759613, "memory(GiB)": 90.94, "reward": 0.7078977823257446, "reward_std": 0.1543302685022354, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9144889116287231, "rewards/PlanningActionSetORM/std": 0.19527816772460938, "rewards/RMReward/mean": 0.65625, "rewards/RMReward/std": 0.22991934418678284, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 378, "train_speed(iter/s)": 0.018489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1477.0, "completions/mean_length": 438.1875, "completions/min_length": 93.0, "epoch": 0.00581770177754582, "frac_reward_zero_std": 0.0, "grad_norm": 1.4911257028579712, "kl": 0.030362393707036972, "learning_rate": 2.9082259054634745e-07, "loss": 0.08561024069786072, "memory(GiB)": 90.94, "reward": 0.45817017555236816, "reward_std": 0.1817014068365097, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9899839758872986, "rewards/PlanningActionSetORM/std": 0.027394000440835953, "rewards/RMReward/mean": 0.7956249713897705, "rewards/RMReward/std": 0.22721412777900696, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08184356242418289, "rewards/VisualPerceptionAccuracy/std": 0.1790134459733963, "step": 379, "train_speed(iter/s)": 0.018477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 11.5, "completions/min_length": 8.0, "epoch": 0.005833051914162036, "frac_reward_zero_std": 0.0, "grad_norm": 23.15418815612793, "kl": 0.5718016028404236, "learning_rate": 2.915899324739104e-07, "loss": 0.011349480599164963, "memory(GiB)": 90.94, "reward": 0.614062488079071, "reward_std": 0.4289786219596863, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.59375, "rewards/SpatialReasoningORM/std": 0.49899089336395264, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 380, "train_speed(iter/s)": 0.01852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 185.375, "completions/min_length": 89.0, "epoch": 0.005848402050778252, "frac_reward_zero_std": 0.0, "grad_norm": 1.8710023164749146, "kl": 0.027933495119214058, "learning_rate": 2.9235727440147336e-07, "loss": 0.0029417872428894043, "memory(GiB)": 90.94, "reward": 0.7976459264755249, "reward_std": 0.10135656595230103, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.931979775428772, "rewards/PlanningActionSetORM/std": 0.038511987775564194, "rewards/RMReward/mean": 0.7640625238418579, "rewards/RMReward/std": 0.12778155505657196, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 381, "train_speed(iter/s)": 0.01851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/mean_length": 117.5, "completions/min_length": 83.0, "epoch": 0.005863752187394468, "frac_reward_zero_std": 0.0, "grad_norm": 2.209796905517578, "kl": 0.03650067746639252, "learning_rate": 2.9312461632903623e-07, "loss": 0.041489824652671814, "memory(GiB)": 90.94, "reward": 0.8359062671661377, "reward_std": 0.08735167235136032, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.907031238079071, "rewards/PlanningActionSetORM/std": 0.11832744628190994, "rewards/RMReward/mean": 0.8181250095367432, "rewards/RMReward/std": 0.1034388542175293, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 382, "train_speed(iter/s)": 0.01851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/mean_length": 145.0, "completions/min_length": 14.0, "epoch": 0.0058791023240106835, "frac_reward_zero_std": 0.0, "grad_norm": 6.057709217071533, "kl": 0.00440499372780323, "learning_rate": 2.9389195825659916e-07, "loss": -0.039056792855262756, "memory(GiB)": 90.94, "reward": 0.5933678150177002, "reward_std": 0.26009315252304077, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.3054855465888977, "rewards/VisualPerceptionAccuracy/std": 0.195699542760849, "step": 383, "train_speed(iter/s)": 0.018546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 104.21875, "completions/min_length": 60.0, "epoch": 0.005894452460626899, "frac_reward_zero_std": 0.0, "grad_norm": 2.2952022552490234, "kl": 0.06400223076343536, "learning_rate": 2.946593001841621e-07, "loss": 0.04043339937925339, "memory(GiB)": 90.94, "reward": 0.754520058631897, "reward_std": 0.09964326024055481, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9226003885269165, "rewards/PlanningActionSetORM/std": 0.06387092918157578, "rewards/RMReward/mean": 0.7124999761581421, "rewards/RMReward/std": 0.12508061528205872, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 384, "train_speed(iter/s)": 0.01851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 124.0625, "completions/min_length": 59.0, "epoch": 0.005909802597243116, "frac_reward_zero_std": 0.0, "grad_norm": 1.955779790878296, "kl": 0.049032628536224365, "learning_rate": 2.95426642111725e-07, "loss": 0.014814062044024467, "memory(GiB)": 90.94, "reward": 0.9230625033378601, "reward_std": 0.049928247928619385, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9453125, "rewards/PlanningActionSetORM/std": 0.08362683653831482, "rewards/RMReward/mean": 0.9175000190734863, "rewards/RMReward/std": 0.1020120158791542, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 385, "train_speed(iter/s)": 0.018492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.005925152733859331, "frac_reward_zero_std": 0.0, "grad_norm": 18.348342895507812, "kl": 1.056640625, "learning_rate": 2.9619398403928794e-07, "loss": 0.0010547041893005371, "memory(GiB)": 90.94, "reward": 0.31718748807907104, "reward_std": 0.36403894424438477, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.28125, "rewards/SpatialReasoningORM/std": 0.45680341124534607, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 386, "train_speed(iter/s)": 0.018499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 12.1875, "completions/min_length": 8.0, "epoch": 0.005940502870475547, "frac_reward_zero_std": 0.0, "grad_norm": 10.77245807647705, "kl": 0.4699430465698242, "learning_rate": 2.9696132596685086e-07, "loss": 0.011829286813735962, "memory(GiB)": 90.94, "reward": 0.9109375476837158, "reward_std": 0.28099340200424194, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.90625, "rewards/SpatialReasoningORM/std": 0.2961445748806, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 387, "train_speed(iter/s)": 0.01854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 177.59375, "completions/min_length": 99.0, "epoch": 0.0059558530070917635, "frac_reward_zero_std": 0.0, "grad_norm": 2.0894315242767334, "kl": 0.04057314991950989, "learning_rate": 2.977286678944138e-07, "loss": 0.16333869099617004, "memory(GiB)": 90.94, "reward": 0.8651456832885742, "reward_std": 0.11832542717456818, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8857283592224121, "rewards/PlanningActionSetORM/std": 0.023494554683566093, "rewards/RMReward/mean": 0.8600000143051147, "rewards/RMReward/std": 0.17511285841464996, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 388, "train_speed(iter/s)": 0.018534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 109.0625, "completions/min_length": 13.0, "epoch": 0.005971203143707979, "frac_reward_zero_std": 0.0, "grad_norm": 9.436638832092285, "kl": 0.03664268180727959, "learning_rate": 2.984960098219767e-07, "loss": 0.05124032869935036, "memory(GiB)": 90.94, "reward": 0.7944284677505493, "reward_std": 0.27934545278549194, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8942847847938538, "rewards/PlanningActionSetORM/std": 0.016295991837978363, "rewards/RMReward/mean": 0.809374988079071, "rewards/RMReward/std": 0.1685415357351303, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 389, "train_speed(iter/s)": 0.018489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/mean_length": 55.75, "completions/min_length": 8.0, "epoch": 0.005986553280324195, "frac_reward_zero_std": 0.0, "grad_norm": 12.649961471557617, "kl": 0.5495590567588806, "learning_rate": 2.9926335174953964e-07, "loss": 0.002511851489543915, "memory(GiB)": 90.94, "reward": 0.7689375281333923, "reward_std": 0.29813671112060547, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9418749809265137, "rewards/RMReward/std": 0.13692910969257355, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 390, "train_speed(iter/s)": 0.018444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 134.09375, "completions/min_length": 58.0, "epoch": 0.00600190341694041, "frac_reward_zero_std": 0.0, "grad_norm": 2.294677495956421, "kl": 0.01687207818031311, "learning_rate": 3.000306936771025e-07, "loss": -0.056572072207927704, "memory(GiB)": 90.94, "reward": 0.47023189067840576, "reward_std": 0.08916576951742172, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9352678656578064, "rewards/PlanningActionSetORM/std": 0.12598581612110138, "rewards/RMReward/mean": 0.8462499976158142, "rewards/RMReward/std": 0.08139409869909286, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0764101967215538, "rewards/VisualPerceptionAccuracy/std": 0.10336505621671677, "step": 391, "train_speed(iter/s)": 0.018454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/mean_length": 66.1875, "completions/min_length": 8.0, "epoch": 0.006017253553556627, "frac_reward_zero_std": 0.0, "grad_norm": 16.60491943359375, "kl": 0.38505056500434875, "learning_rate": 3.0079803560466544e-07, "loss": 0.03004731982946396, "memory(GiB)": 90.94, "reward": 0.6903645992279053, "reward_std": 0.2832197844982147, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8848958015441895, "rewards/PlanningActionSetORM/std": 0.11891558021306992, "rewards/RMReward/mean": 0.7000000476837158, "rewards/RMReward/std": 0.1197219118475914, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 392, "train_speed(iter/s)": 0.018413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/mean_length": 181.65625, "completions/min_length": 8.0, "epoch": 0.006032603690172843, "frac_reward_zero_std": 0.0, "grad_norm": 22.77491569519043, "kl": 0.473269522190094, "learning_rate": 3.0156537753222837e-07, "loss": 0.07097290456295013, "memory(GiB)": 90.94, "reward": 0.4510815739631653, "reward_std": 0.30734455585479736, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.1396631896495819, "rewards/VisualPerceptionAccuracy/std": 0.18983621895313263, "step": 393, "train_speed(iter/s)": 0.01844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/mean_length": 353.0, "completions/min_length": 78.0, "epoch": 0.006047953826789058, "frac_reward_zero_std": 0.0, "grad_norm": 2.2713217735290527, "kl": 0.003326139645650983, "learning_rate": 3.023327194597913e-07, "loss": 0.062340348958969116, "memory(GiB)": 90.94, "reward": 0.25322920083999634, "reward_std": 0.20994427800178528, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.25322920083999634, "rewards/VisualPerceptionAccuracy/std": 0.21455639600753784, "step": 394, "train_speed(iter/s)": 0.018466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 115.1875, "completions/min_length": 89.0, "epoch": 0.006063303963405274, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296849966049194, "kl": 0.038350846618413925, "learning_rate": 3.031000613873542e-07, "loss": 0.021338922902941704, "memory(GiB)": 90.94, "reward": 0.8377083539962769, "reward_std": 0.08494054526090622, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9385416507720947, "rewards/PlanningActionSetORM/std": 0.12439368665218353, "rewards/RMReward/mean": 0.8125, "rewards/RMReward/std": 0.09158109873533249, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 395, "train_speed(iter/s)": 0.018463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/mean_length": 136.25, "completions/min_length": 84.0, "epoch": 0.00607865410002149, "frac_reward_zero_std": 0.0, "grad_norm": 1.7444660663604736, "kl": 0.03918404504656792, "learning_rate": 3.0386740331491715e-07, "loss": 0.05049965903162956, "memory(GiB)": 90.94, "reward": 0.7757291793823242, "reward_std": 0.08309069275856018, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9786458015441895, "rewards/PlanningActionSetORM/std": 0.03878112882375717, "rewards/RMReward/mean": 0.7250000238418579, "rewards/RMReward/std": 0.1099853366613388, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 396, "train_speed(iter/s)": 0.018449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/mean_length": 167.90625, "completions/min_length": 100.0, "epoch": 0.006094004236637706, "frac_reward_zero_std": 0.0, "grad_norm": 1.647379994392395, "kl": 0.03723808377981186, "learning_rate": 3.0463474524248007e-07, "loss": 0.0032631303183734417, "memory(GiB)": 90.94, "reward": 0.8683437705039978, "reward_std": 0.10120312124490738, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.94921875, "rewards/PlanningActionSetORM/std": 0.11403245478868484, "rewards/RMReward/mean": 0.8481249809265137, "rewards/RMReward/std": 0.15392577648162842, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 397, "train_speed(iter/s)": 0.0184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/mean_length": 280.125, "completions/min_length": 101.0, "epoch": 0.006109354373253922, "frac_reward_zero_std": 0.0, "grad_norm": 1.663416862487793, "kl": 0.020363468676805496, "learning_rate": 3.05402087170043e-07, "loss": 0.1078886017203331, "memory(GiB)": 90.94, "reward": 0.5716498494148254, "reward_std": 0.130666121840477, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9362500309944153, "rewards/RMReward/std": 0.09492979943752289, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.19429966807365417, "rewards/VisualPerceptionAccuracy/std": 0.18538840115070343, "step": 398, "train_speed(iter/s)": 0.018371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/mean_length": 57.34375, "completions/min_length": 13.0, "epoch": 0.006124704509870138, "frac_reward_zero_std": 0.0, "grad_norm": 11.24315071105957, "kl": 0.02605253830552101, "learning_rate": 3.061694290976059e-07, "loss": -0.019960418343544006, "memory(GiB)": 90.94, "reward": 0.6896875500679016, "reward_std": 0.27153125405311584, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7437499761581421, "rewards/RMReward/std": 0.0704154446721077, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 399, "train_speed(iter/s)": 0.01838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/mean_length": 148.0, "completions/min_length": 8.0, "epoch": 0.006140054646486354, "frac_reward_zero_std": 0.0, "grad_norm": 14.609026908874512, "kl": 0.380066454410553, "learning_rate": 3.0693677102516885e-07, "loss": -0.0007484368979930878, "memory(GiB)": 90.94, "reward": 0.2691839635372162, "reward_std": 0.291811466217041, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.07274292409420013, "rewards/VisualPerceptionAccuracy/std": 0.096892811357975, "step": 400, "train_speed(iter/s)": 0.018412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 167.84375, "completions/min_length": 81.0, "epoch": 0.0061554047831025695, "frac_reward_zero_std": 0.0, "grad_norm": 1.9157052040100098, "kl": 0.029728878289461136, "learning_rate": 3.077041129527318e-07, "loss": -0.036470651626586914, "memory(GiB)": 90.94, "reward": 0.45043808221817017, "reward_std": 0.09102524071931839, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9437500238418579, "rewards/PlanningActionSetORM/std": 0.13149777054786682, "rewards/RMReward/mean": 0.796875, "rewards/RMReward/std": 0.06182974576950073, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07462608814239502, "rewards/VisualPerceptionAccuracy/std": 0.1295306533575058, "step": 401, "train_speed(iter/s)": 0.01835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 97.0, "completions/min_length": 8.0, "epoch": 0.006170754919718785, "frac_reward_zero_std": 0.0, "grad_norm": 33.036312103271484, "kl": 0.3270963430404663, "learning_rate": 3.084714548802947e-07, "loss": 0.03452030569314957, "memory(GiB)": 90.94, "reward": 0.7340625524520874, "reward_std": 0.3018200993537903, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.7124999761581421, "rewards/RMReward/std": 0.17078252136707306, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 402, "train_speed(iter/s)": 0.018345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 62.46875, "completions/min_length": 8.0, "epoch": 0.006186105056335002, "frac_reward_zero_std": 0.0, "grad_norm": 11.456995010375977, "kl": 0.5141586065292358, "learning_rate": 3.0923879680785763e-07, "loss": 0.07156050950288773, "memory(GiB)": 90.94, "reward": 0.8352603912353516, "reward_std": 0.1740274578332901, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9244791269302368, "rewards/PlanningActionSetORM/std": 0.13449041545391083, "rewards/RMReward/mean": 0.6812499761581421, "rewards/RMReward/std": 0.13022416830062866, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 403, "train_speed(iter/s)": 0.01831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 161.40625, "completions/min_length": 8.0, "epoch": 0.006201455192951217, "frac_reward_zero_std": 0.0, "grad_norm": 22.137344360351562, "kl": 0.46241533756256104, "learning_rate": 3.1000613873542056e-07, "loss": -0.01859777793288231, "memory(GiB)": 90.94, "reward": 0.8331249952316284, "reward_std": 0.18399962782859802, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.737500011920929, "rewards/RMReward/std": 0.03415650874376297, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 404, "train_speed(iter/s)": 0.018302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 64.0, "completions/min_length": 8.0, "epoch": 0.006216805329567433, "frac_reward_zero_std": 0.0, "grad_norm": 11.475534439086914, "kl": 0.4223152995109558, "learning_rate": 3.107734806629835e-07, "loss": 0.04991454631090164, "memory(GiB)": 90.94, "reward": 0.445901095867157, "reward_std": 0.22273221611976624, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.06992724537849426, "rewards/VisualPerceptionAccuracy/std": 0.06250718981027603, "step": 405, "train_speed(iter/s)": 0.018341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 167.78125, "completions/min_length": 91.0, "epoch": 0.006232155466183649, "frac_reward_zero_std": 0.0, "grad_norm": 2.0546066761016846, "kl": 0.030902855098247528, "learning_rate": 3.1154082259054635e-07, "loss": -0.052081163972616196, "memory(GiB)": 90.94, "reward": 0.49937906861305237, "reward_std": 0.052248597145080566, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9656250476837158, "rewards/RMReward/std": 0.07238496840000153, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.02625810168683529, "rewards/VisualPerceptionAccuracy/std": 0.04658921808004379, "step": 406, "train_speed(iter/s)": 0.018319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/mean_length": 45.59375, "completions/min_length": 8.0, "epoch": 0.006247505602799865, "frac_reward_zero_std": 0.0, "grad_norm": 14.895936965942383, "kl": 0.5329199433326721, "learning_rate": 3.123081645181093e-07, "loss": 0.008434869349002838, "memory(GiB)": 90.94, "reward": 0.49031248688697815, "reward_std": 0.29470348358154297, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.84375, "rewards/PlanningActionSetORM/std": 0.10704360902309418, "rewards/RMReward/mean": 0.581250011920929, "rewards/RMReward/std": 0.17499999701976776, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 407, "train_speed(iter/s)": 0.018319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 120.90625, "completions/min_length": 8.0, "epoch": 0.006262855739416081, "frac_reward_zero_std": 0.0, "grad_norm": 18.479127883911133, "kl": 0.33036985993385315, "learning_rate": 3.1307550644567226e-07, "loss": -0.012032397091388702, "memory(GiB)": 90.94, "reward": 0.6104261875152588, "reward_std": 0.327831894159317, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9886363744735718, "rewards/PlanningActionSetORM/std": 0.031051358208060265, "rewards/RMReward/mean": 0.6968749761581421, "rewards/RMReward/std": 0.20854157209396362, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 408, "train_speed(iter/s)": 0.018317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/mean_length": 176.4375, "completions/min_length": 99.0, "epoch": 0.006278205876032296, "frac_reward_zero_std": 0.0, "grad_norm": 1.4658422470092773, "kl": 0.03360779583454132, "learning_rate": 3.138428483732352e-07, "loss": 0.015895027667284012, "memory(GiB)": 90.94, "reward": 0.8815000057220459, "reward_std": 0.04624518007040024, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9624999761581421, "rewards/PlanningActionSetORM/std": 0.11845782399177551, "rewards/RMReward/mean": 0.8612500429153442, "rewards/RMReward/std": 0.13729530572891235, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 409, "train_speed(iter/s)": 0.01829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/mean_length": 92.78125, "completions/min_length": 14.0, "epoch": 0.006293556012648513, "frac_reward_zero_std": 0.0, "grad_norm": 7.37828254699707, "kl": 0.009296853095293045, "learning_rate": 3.146101903007981e-07, "loss": 0.0818178653717041, "memory(GiB)": 90.94, "reward": 0.13442909717559814, "reward_std": 0.22534462809562683, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.04073317348957062, "rewards/VisualPerceptionAccuracy/std": 0.06773202121257782, "step": 410, "train_speed(iter/s)": 0.018327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/mean_length": 46.46875, "completions/min_length": 8.0, "epoch": 0.006308906149264729, "frac_reward_zero_std": 0.0, "grad_norm": 10.504820823669434, "kl": 0.462871789932251, "learning_rate": 3.15377532228361e-07, "loss": -0.014697653241455555, "memory(GiB)": 90.94, "reward": 0.9021874666213989, "reward_std": 0.1618342101573944, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8812500238418579, "rewards/PlanningActionSetORM/std": 0.1857791393995285, "rewards/RMReward/mean": 0.859375, "rewards/RMReward/std": 0.10680004209280014, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 411, "train_speed(iter/s)": 0.018333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/mean_length": 110.9375, "completions/min_length": 71.0, "epoch": 0.006324256285880944, "frac_reward_zero_std": 0.0, "grad_norm": 2.516894578933716, "kl": 0.057775214314460754, "learning_rate": 3.161448741559239e-07, "loss": -0.057245880365371704, "memory(GiB)": 90.94, "reward": 0.78104168176651, "reward_std": 0.06412281841039658, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9114583134651184, "rewards/PlanningActionSetORM/std": 0.1295800805091858, "rewards/RMReward/mean": 0.7484375238418579, "rewards/RMReward/std": 0.0723838061094284, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 412, "train_speed(iter/s)": 0.018323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/mean_length": 199.21875, "completions/min_length": 8.0, "epoch": 0.00633960642249716, "frac_reward_zero_std": 0.0, "grad_norm": 7.33229923248291, "kl": 0.39557209610939026, "learning_rate": 3.1691221608348684e-07, "loss": 0.019423924386501312, "memory(GiB)": 90.94, "reward": 0.8009037971496582, "reward_std": 0.1784418523311615, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9309132099151611, "rewards/PlanningActionSetORM/std": 0.09386121481657028, "rewards/RMReward/mean": 0.59375, "rewards/RMReward/std": 0.14361406862735748, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 413, "train_speed(iter/s)": 0.018317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/mean_length": 243.5, "completions/min_length": 85.0, "epoch": 0.006354956559113376, "frac_reward_zero_std": 0.0, "grad_norm": 2.439452648162842, "kl": 0.0020993193611502647, "learning_rate": 3.1767955801104976e-07, "loss": 0.08024100959300995, "memory(GiB)": 90.94, "reward": 0.3093307912349701, "reward_std": 0.19878503680229187, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3093307912349701, "rewards/VisualPerceptionAccuracy/std": 0.2373688966035843, "step": 414, "train_speed(iter/s)": 0.018348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/mean_length": 162.21875, "completions/min_length": 98.0, "epoch": 0.006370306695729592, "frac_reward_zero_std": 0.0, "grad_norm": 1.6075108051300049, "kl": 0.03576330840587616, "learning_rate": 3.184468999386127e-07, "loss": 0.021036282181739807, "memory(GiB)": 90.94, "reward": 0.8453705906867981, "reward_std": 0.10477820038795471, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9431029558181763, "rewards/PlanningActionSetORM/std": 0.054689712822437286, "rewards/RMReward/mean": 0.8209375143051147, "rewards/RMReward/std": 0.147503063082695, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 415, "train_speed(iter/s)": 0.018308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/mean_length": 246.8125, "completions/min_length": 133.0, "epoch": 0.006385656832345808, "frac_reward_zero_std": 0.0, "grad_norm": 1.544028401374817, "kl": 0.0132091473788023, "learning_rate": 3.192142418661756e-07, "loss": 0.0415426567196846, "memory(GiB)": 90.94, "reward": 0.5277831554412842, "reward_std": 0.1539703607559204, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9028359651565552, "rewards/PlanningActionSetORM/std": 0.09040442854166031, "rewards/RMReward/mean": 0.7306250333786011, "rewards/RMReward/std": 0.193061962723732, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.29049915075302124, "rewards/VisualPerceptionAccuracy/std": 0.14584361016750336, "step": 416, "train_speed(iter/s)": 0.018288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/mean_length": 167.03125, "completions/min_length": 13.0, "epoch": 0.006401006968962023, "frac_reward_zero_std": 0.0, "grad_norm": 3.871736764907837, "kl": 0.018365520983934402, "learning_rate": 3.1998158379373854e-07, "loss": 0.03001544252038002, "memory(GiB)": 90.94, "reward": 0.8113020658493042, "reward_std": 0.17677396535873413, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9723958373069763, "rewards/PlanningActionSetORM/std": 0.09984796494245529, "rewards/RMReward/mean": 0.609375, "rewards/RMReward/std": 0.13689261674880981, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 417, "train_speed(iter/s)": 0.018289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 138.0, "completions/min_length": 94.0, "epoch": 0.00641635710557824, "frac_reward_zero_std": 0.0, "grad_norm": 2.718168020248413, "kl": 0.03750133514404297, "learning_rate": 3.2074892572130147e-07, "loss": -0.02173341065645218, "memory(GiB)": 90.94, "reward": 0.8448958396911621, "reward_std": 0.11097903549671173, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8994791507720947, "rewards/PlanningActionSetORM/std": 0.13308002054691315, "rewards/RMReward/mean": 0.8312499523162842, "rewards/RMReward/std": 0.13955573737621307, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 418, "train_speed(iter/s)": 0.018285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/mean_length": 125.3125, "completions/min_length": 2.0, "epoch": 0.0064317072421944555, "frac_reward_zero_std": 0.0, "grad_norm": 33.98247528076172, "kl": 0.5578848719596863, "learning_rate": 3.215162676488644e-07, "loss": -0.12338078022003174, "memory(GiB)": 90.94, "reward": 0.9720624685287476, "reward_std": 0.10347190499305725, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9637500047683716, "rewards/RMReward/std": 0.12430473417043686, "rewards/SpatialReasoningORM/mean": 0.9750000238418579, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 419, "train_speed(iter/s)": 0.018276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 111.59375, "completions/min_length": 72.0, "epoch": 0.006447057378810671, "frac_reward_zero_std": 0.0, "grad_norm": 2.593883991241455, "kl": 0.02866745926439762, "learning_rate": 3.2228360957642727e-07, "loss": -0.04794745147228241, "memory(GiB)": 90.94, "reward": 0.47434553503990173, "reward_std": 0.1573198288679123, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8385416865348816, "rewards/PlanningActionSetORM/std": 0.08175590634346008, "rewards/RMReward/mean": 0.8612500429153442, "rewards/RMReward/std": 0.23053200542926788, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09198273718357086, "rewards/VisualPerceptionAccuracy/std": 0.11436720937490463, "step": 420, "train_speed(iter/s)": 0.018277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 211.46875, "completions/min_length": 89.0, "epoch": 0.006462407515426888, "frac_reward_zero_std": 0.0, "grad_norm": 1.5184441804885864, "kl": 0.03373841941356659, "learning_rate": 3.230509515039902e-07, "loss": 0.053357698023319244, "memory(GiB)": 90.94, "reward": 0.7854286432266235, "reward_std": 0.10934267193078995, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9396434426307678, "rewards/PlanningActionSetORM/std": 0.04531920328736305, "rewards/RMReward/mean": 0.7468750476837158, "rewards/RMReward/std": 0.14024028182029724, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 421, "train_speed(iter/s)": 0.018228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 409.4375, "completions/min_length": 8.0, "epoch": 0.006477757652043103, "frac_reward_zero_std": 0.0, "grad_norm": 7.839863300323486, "kl": 0.510494589805603, "learning_rate": 3.238182934315531e-07, "loss": 0.06202063709497452, "memory(GiB)": 90.94, "reward": 0.054994408041238785, "reward_std": 0.11963509023189545, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.0006138152675703168, "rewards/VisualPerceptionAccuracy/std": 0.0017701799515634775, "step": 422, "train_speed(iter/s)": 0.018238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/mean_length": 53.15625, "completions/min_length": 8.0, "epoch": 0.006493107788659319, "frac_reward_zero_std": 0.0, "grad_norm": 22.63603973388672, "kl": 0.3495277166366577, "learning_rate": 3.2458563535911605e-07, "loss": -0.014134325087070465, "memory(GiB)": 90.94, "reward": 0.8540624976158142, "reward_std": 0.21999193727970123, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.971875011920929, "rewards/PlanningActionSetORM/std": 0.050723932683467865, "rewards/RMReward/mean": 0.7906249761581421, "rewards/RMReward/std": 0.14167891442775726, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 423, "train_speed(iter/s)": 0.018237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/mean_length": 8.0625, "completions/min_length": 8.0, "epoch": 0.006508457925275535, "frac_reward_zero_std": 0.0, "grad_norm": 64.53112030029297, "kl": 0.844841480255127, "learning_rate": 3.25352977286679e-07, "loss": -0.011483464390039444, "memory(GiB)": 90.94, "reward": 0.34687501192092896, "reward_std": 0.4499264359474182, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4709290862083435, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 424, "train_speed(iter/s)": 0.018274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 182.875, "completions/min_length": 70.0, "epoch": 0.006523808061891751, "frac_reward_zero_std": 0.0, "grad_norm": 2.0009684562683105, "kl": 0.03480922803282738, "learning_rate": 3.261203192142419e-07, "loss": 0.02686426416039467, "memory(GiB)": 90.94, "reward": 0.7008333206176758, "reward_std": 0.12455851584672928, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9791666865348816, "rewards/PlanningActionSetORM/std": 0.07701881974935532, "rewards/RMReward/mean": 0.6312499642372131, "rewards/RMReward/std": 0.1740179806947708, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 425, "train_speed(iter/s)": 0.018244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 11.46875, "completions/min_length": 8.0, "epoch": 0.006539158198507967, "frac_reward_zero_std": 0.0, "grad_norm": 19.47463607788086, "kl": 0.3696393668651581, "learning_rate": 3.268876611418048e-07, "loss": -0.016289927065372467, "memory(GiB)": 90.94, "reward": 0.8515625, "reward_std": 0.3311764597892761, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.84375, "rewards/SpatialReasoningORM/std": 0.3689020276069641, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 426, "train_speed(iter/s)": 0.018281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 286.21875, "completions/min_length": 136.0, "epoch": 0.006554508335124182, "frac_reward_zero_std": 0.0, "grad_norm": 1.1574347019195557, "kl": 0.017789466306567192, "learning_rate": 3.2765500306936775e-07, "loss": -0.019201695919036865, "memory(GiB)": 90.94, "reward": 0.5460934042930603, "reward_std": 0.05886281281709671, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9624999761581421, "rewards/RMReward/std": 0.046547479927539825, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.12218676507472992, "rewards/VisualPerceptionAccuracy/std": 0.08048762381076813, "step": 427, "train_speed(iter/s)": 0.018258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 11.71875, "completions/min_length": 8.0, "epoch": 0.006569858471740398, "frac_reward_zero_std": 0.0, "grad_norm": 13.878012657165527, "kl": 0.318430632352829, "learning_rate": 3.284223449969307e-07, "loss": -0.010111071169376373, "memory(GiB)": 90.94, "reward": 0.22812499105930328, "reward_std": 0.34613892436027527, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.3965577781200409, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 428, "train_speed(iter/s)": 0.018294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/mean_length": 160.15625, "completions/min_length": 8.0, "epoch": 0.006585208608356615, "frac_reward_zero_std": 0.0, "grad_norm": 11.016892433166504, "kl": 0.39191749691963196, "learning_rate": 3.2918968692449355e-07, "loss": 0.05816631019115448, "memory(GiB)": 90.94, "reward": 0.7816250324249268, "reward_std": 0.2568724751472473, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.6087499856948853, "rewards/RMReward/std": 0.2248518019914627, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 429, "train_speed(iter/s)": 0.018288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/mean_length": 63.84375, "completions/min_length": 8.0, "epoch": 0.00660055874497283, "frac_reward_zero_std": 0.0, "grad_norm": 20.92963218688965, "kl": 0.5265624523162842, "learning_rate": 3.299570288520565e-07, "loss": 0.0211641825735569, "memory(GiB)": 90.94, "reward": 0.71484375, "reward_std": 0.3015652000904083, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9546874761581421, "rewards/PlanningActionSetORM/std": 0.053400032222270966, "rewards/RMReward/mean": 0.7437499761581421, "rewards/RMReward/std": 0.16520188748836517, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 430, "train_speed(iter/s)": 0.018282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/mean_length": 43.40625, "completions/min_length": 8.0, "epoch": 0.006615908881589046, "frac_reward_zero_std": 0.0, "grad_norm": 20.77644157409668, "kl": 0.5330261588096619, "learning_rate": 3.307243707796194e-07, "loss": 0.0011790990829467773, "memory(GiB)": 90.94, "reward": 0.846750020980835, "reward_std": 0.2404787689447403, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9137499928474426, "rewards/RMReward/std": 0.07013082504272461, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 431, "train_speed(iter/s)": 0.018294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/mean_length": 156.9375, "completions/min_length": 14.0, "epoch": 0.006631259018205262, "frac_reward_zero_std": 0.0, "grad_norm": 10.107465744018555, "kl": 0.005128385499119759, "learning_rate": 3.3149171270718233e-07, "loss": 0.02990737557411194, "memory(GiB)": 90.94, "reward": 0.20276470482349396, "reward_std": 0.24925002455711365, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.17740440368652344, "rewards/VisualPerceptionAccuracy/std": 0.11554279923439026, "step": 432, "train_speed(iter/s)": 0.018317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 12.25, "completions/min_length": 8.0, "epoch": 0.006646609154821478, "frac_reward_zero_std": 0.0, "grad_norm": 15.304509162902832, "kl": 0.4433211088180542, "learning_rate": 3.3225905463474526e-07, "loss": -0.02753804624080658, "memory(GiB)": 90.94, "reward": 0.5249999761581421, "reward_std": 0.38295724987983704, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 433, "train_speed(iter/s)": 0.018325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 57.5625, "completions/min_length": 13.0, "epoch": 0.006661959291437694, "frac_reward_zero_std": 0.0, "grad_norm": 9.575979232788086, "kl": 0.064034603536129, "learning_rate": 3.330263965623082e-07, "loss": 0.02398866042494774, "memory(GiB)": 90.94, "reward": 0.8266249895095825, "reward_std": 0.19772109389305115, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7150000333786011, "rewards/RMReward/std": 0.08869423717260361, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 434, "train_speed(iter/s)": 0.018335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/mean_length": 94.4375, "completions/min_length": 56.0, "epoch": 0.006677309428053909, "frac_reward_zero_std": 0.0, "grad_norm": 2.9626452922821045, "kl": 0.05225694179534912, "learning_rate": 3.337937384898711e-07, "loss": -0.00020651239901781082, "memory(GiB)": 90.94, "reward": 0.8266146183013916, "reward_std": 0.0893506407737732, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9580729007720947, "rewards/PlanningActionSetORM/std": 0.06604157388210297, "rewards/RMReward/mean": 0.7937500476837158, "rewards/RMReward/std": 0.107575923204422, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 435, "train_speed(iter/s)": 0.018342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 160.875, "completions/min_length": 97.0, "epoch": 0.006692659564670126, "frac_reward_zero_std": 0.0, "grad_norm": 1.4893878698349, "kl": 0.03518208861351013, "learning_rate": 3.3456108041743404e-07, "loss": 0.046877212822437286, "memory(GiB)": 90.94, "reward": 0.8310712575912476, "reward_std": 0.11047312617301941, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9253562688827515, "rewards/PlanningActionSetORM/std": 0.04822499305009842, "rewards/RMReward/mean": 0.8075000047683716, "rewards/RMReward/std": 0.1557292938232422, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 436, "train_speed(iter/s)": 0.018334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 177.28125, "completions/min_length": 91.0, "epoch": 0.0067080097012863415, "frac_reward_zero_std": 0.0, "grad_norm": 1.4621970653533936, "kl": 0.030785633251070976, "learning_rate": 3.353284223449969e-07, "loss": -0.013830430805683136, "memory(GiB)": 90.94, "reward": 0.8753750324249268, "reward_std": 0.06950300186872482, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9593750238418579, "rewards/PlanningActionSetORM/std": 0.06041782721877098, "rewards/RMReward/mean": 0.8543750047683716, "rewards/RMReward/std": 0.15698647499084473, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 437, "train_speed(iter/s)": 0.018323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 104.375, "completions/min_length": 8.0, "epoch": 0.006723359837902557, "frac_reward_zero_std": 0.0, "grad_norm": 12.747922897338867, "kl": 0.20023055374622345, "learning_rate": 3.3609576427255984e-07, "loss": -0.05126720294356346, "memory(GiB)": 90.94, "reward": 0.43861088156700134, "reward_std": 0.22532418370246887, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.05534680560231209, "rewards/VisualPerceptionAccuracy/std": 0.06769111752510071, "step": 438, "train_speed(iter/s)": 0.018356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 161.4375, "completions/min_length": 84.0, "epoch": 0.006738709974518773, "frac_reward_zero_std": 0.0, "grad_norm": 1.7481765747070312, "kl": 0.0340152382850647, "learning_rate": 3.3686310620012276e-07, "loss": 0.018074385821819305, "memory(GiB)": 90.94, "reward": 0.7947925925254822, "reward_std": 0.060593631118535995, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9427129030227661, "rewards/PlanningActionSetORM/std": 0.05571141093969345, "rewards/RMReward/mean": 0.7578125, "rewards/RMReward/std": 0.09681157022714615, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 439, "train_speed(iter/s)": 0.018344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/mean_length": 257.125, "completions/min_length": 92.0, "epoch": 0.006754060111134989, "frac_reward_zero_std": 0.0, "grad_norm": 1.6133842468261719, "kl": 0.029811905696988106, "learning_rate": 3.376304481276857e-07, "loss": -0.006214462220668793, "memory(GiB)": 90.94, "reward": 0.48185500502586365, "reward_std": 0.08539271354675293, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9689732193946838, "rewards/PlanningActionSetORM/std": 0.048527978360652924, "rewards/RMReward/mean": 0.831250011920929, "rewards/RMReward/std": 0.08341661840677261, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1049153208732605, "rewards/VisualPerceptionAccuracy/std": 0.10162444412708282, "step": 440, "train_speed(iter/s)": 0.018327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/mean_length": 217.0, "completions/min_length": 94.0, "epoch": 0.006769410247751205, "frac_reward_zero_std": 0.0, "grad_norm": 3.9639394283294678, "kl": 0.022620396688580513, "learning_rate": 3.383977900552486e-07, "loss": -0.07641912996768951, "memory(GiB)": 90.94, "reward": 0.5813014507293701, "reward_std": 0.12228038161993027, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8843749761581421, "rewards/RMReward/std": 0.20953817665576935, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.25510281324386597, "rewards/VisualPerceptionAccuracy/std": 0.0769302174448967, "step": 441, "train_speed(iter/s)": 0.018298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/mean_length": 91.21875, "completions/min_length": 15.0, "epoch": 0.006784760384367421, "frac_reward_zero_std": 0.5, "grad_norm": 0.8943551778793335, "kl": 0.023813841864466667, "learning_rate": 3.3916513198281154e-07, "loss": 0.010051384568214417, "memory(GiB)": 90.94, "reward": 0.8425000309944153, "reward_std": 0.0744311660528183, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.606249988079071, "rewards/RMReward/std": 0.18607795238494873, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 442, "train_speed(iter/s)": 0.018248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/mean_length": 58.0625, "completions/min_length": 8.0, "epoch": 0.006800110520983637, "frac_reward_zero_std": 0.0, "grad_norm": 16.475439071655273, "kl": 0.4403173327445984, "learning_rate": 3.399324739103745e-07, "loss": 0.053055353462696075, "memory(GiB)": 90.94, "reward": 0.7053884267807007, "reward_std": 0.2786800265312195, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8707588911056519, "rewards/PlanningActionSetORM/std": 0.020463118329644203, "rewards/RMReward/mean": 0.9637500047683716, "rewards/RMReward/std": 0.0843702182173729, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 443, "train_speed(iter/s)": 0.018244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/mean_length": 59.5625, "completions/min_length": 8.0, "epoch": 0.006815460657599853, "frac_reward_zero_std": 0.0, "grad_norm": 21.505491256713867, "kl": 0.4139498174190521, "learning_rate": 3.4069981583793745e-07, "loss": 0.00869518518447876, "memory(GiB)": 90.94, "reward": 0.6198660731315613, "reward_std": 0.2702493667602539, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9580357074737549, "rewards/PlanningActionSetORM/std": 0.07489915192127228, "rewards/RMReward/mean": 0.7281249761581421, "rewards/RMReward/std": 0.0657489001750946, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 444, "train_speed(iter/s)": 0.018247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/mean_length": 119.53125, "completions/min_length": 73.0, "epoch": 0.0068308107942160684, "frac_reward_zero_std": 0.0, "grad_norm": 1.828930139541626, "kl": 0.034878723323345184, "learning_rate": 3.4146715776550037e-07, "loss": 0.019720233976840973, "memory(GiB)": 90.94, "reward": 0.783750057220459, "reward_std": 0.07322828471660614, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.0707106739282608, "rewards/RMReward/mean": 0.7328125238418579, "rewards/RMReward/std": 0.11260075867176056, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 445, "train_speed(iter/s)": 0.018226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/mean_length": 95.0, "completions/min_length": 8.0, "epoch": 0.006846160930832284, "frac_reward_zero_std": 0.0, "grad_norm": 18.263641357421875, "kl": 0.5243222117424011, "learning_rate": 3.422344996930633e-07, "loss": 0.05784045159816742, "memory(GiB)": 90.94, "reward": 0.8519999980926514, "reward_std": 0.3124074637889862, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9268749952316284, "rewards/RMReward/std": 0.249952495098114, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 446, "train_speed(iter/s)": 0.018182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2049.0, "completions/mean_length": 556.125, "completions/min_length": 7.0, "epoch": 0.006861511067448501, "frac_reward_zero_std": 0.0, "grad_norm": 21.579559326171875, "kl": 0.5960561633110046, "learning_rate": 3.430018416206262e-07, "loss": -0.10673418641090393, "memory(GiB)": 90.94, "reward": 0.1627943366765976, "reward_std": 0.25476324558258057, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.09746367484331131, "rewards/VisualPerceptionAccuracy/std": 0.12656927108764648, "step": 447, "train_speed(iter/s)": 0.01819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 157.0625, "completions/min_length": 101.0, "epoch": 0.006876861204064716, "frac_reward_zero_std": 0.0, "grad_norm": 2.289201021194458, "kl": 0.04352878779172897, "learning_rate": 3.4376918354818915e-07, "loss": 0.051787376403808594, "memory(GiB)": 90.94, "reward": 0.9166642427444458, "reward_std": 0.07650645822286606, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.870820939540863, "rewards/PlanningActionSetORM/std": 0.03242003917694092, "rewards/RMReward/mean": 0.9281249642372131, "rewards/RMReward/std": 0.14454461634159088, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 448, "train_speed(iter/s)": 0.018181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/mean_length": 103.15625, "completions/min_length": 13.0, "epoch": 0.006892211340680932, "frac_reward_zero_std": 0.0, "grad_norm": 4.726511001586914, "kl": 0.01823529042303562, "learning_rate": 3.44536525475752e-07, "loss": 0.052858103066682816, "memory(GiB)": 90.94, "reward": 0.4995400309562683, "reward_std": 0.1811564713716507, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.05845504254102707, "rewards/VisualPerceptionAccuracy/std": 0.12481295317411423, "step": 449, "train_speed(iter/s)": 0.018215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 223.5, "completions/min_length": 169.0, "epoch": 0.0069075614772971475, "frac_reward_zero_std": 0.0, "grad_norm": 1.103893756866455, "kl": 0.037038955837488174, "learning_rate": 3.4530386740331495e-07, "loss": 0.07007520645856857, "memory(GiB)": 90.94, "reward": 0.8632500171661377, "reward_std": 0.11125003546476364, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.0707106739282608, "rewards/RMReward/mean": 0.8321875333786011, "rewards/RMReward/std": 0.20725135505199432, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 450, "train_speed(iter/s)": 0.018182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/mean_length": 211.09375, "completions/min_length": 87.0, "epoch": 0.006922911613913364, "frac_reward_zero_std": 0.0, "grad_norm": 1.8920814990997314, "kl": 0.004733399488031864, "learning_rate": 3.460712093308779e-07, "loss": 0.10953105241060257, "memory(GiB)": 90.94, "reward": 0.27917909622192383, "reward_std": 0.17157042026519775, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.27917909622192383, "rewards/VisualPerceptionAccuracy/std": 0.1886773556470871, "step": 451, "train_speed(iter/s)": 0.018185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/mean_length": 63.875, "completions/min_length": 8.0, "epoch": 0.00693826175052958, "frac_reward_zero_std": 0.0, "grad_norm": 20.136615753173828, "kl": 0.5280872583389282, "learning_rate": 3.468385512584408e-07, "loss": 0.0275874026119709, "memory(GiB)": 90.94, "reward": 0.8855952620506287, "reward_std": 0.15083220601081848, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9528273940086365, "rewards/PlanningActionSetORM/std": 0.05291373282670975, "rewards/RMReward/mean": 0.800000011920929, "rewards/RMReward/std": 0.0707106739282608, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 452, "train_speed(iter/s)": 0.018175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/mean_length": 159.9375, "completions/min_length": 81.0, "epoch": 0.006953611887145795, "frac_reward_zero_std": 0.0, "grad_norm": 2.4283792972564697, "kl": 0.027989938855171204, "learning_rate": 3.4760589318600373e-07, "loss": 0.07850561290979385, "memory(GiB)": 90.94, "reward": 0.5317444801330566, "reward_std": 0.1347244679927826, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9937499761581421, "rewards/PlanningActionSetORM/std": 0.025000005960464478, "rewards/RMReward/mean": 0.7250000238418579, "rewards/RMReward/std": 0.15275251865386963, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2847388982772827, "rewards/VisualPerceptionAccuracy/std": 0.14736290276050568, "step": 453, "train_speed(iter/s)": 0.018186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/mean_length": 105.71875, "completions/min_length": 82.0, "epoch": 0.006968962023762012, "frac_reward_zero_std": 0.0, "grad_norm": 2.1322977542877197, "kl": 0.033115774393081665, "learning_rate": 3.4837323511356665e-07, "loss": 0.029290199279785156, "memory(GiB)": 90.94, "reward": 0.8378385305404663, "reward_std": 0.06578869372606277, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9141927361488342, "rewards/PlanningActionSetORM/std": 0.0652131512761116, "rewards/RMReward/mean": 0.8187500238418579, "rewards/RMReward/std": 0.09223916381597519, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 454, "train_speed(iter/s)": 0.018188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 103.4375, "completions/min_length": 8.0, "epoch": 0.0069843121603782275, "frac_reward_zero_std": 0.0, "grad_norm": 9.701863288879395, "kl": 0.49328920245170593, "learning_rate": 3.491405770411296e-07, "loss": -0.10587182641029358, "memory(GiB)": 90.94, "reward": 0.911062479019165, "reward_std": 0.18551474809646606, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.17078252136707306, "rewards/RMReward/mean": 0.8674999475479126, "rewards/RMReward/std": 0.13399004936218262, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 455, "train_speed(iter/s)": 0.018177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 137.625, "completions/min_length": 8.0, "epoch": 0.006999662296994443, "frac_reward_zero_std": 0.0, "grad_norm": 14.561565399169922, "kl": 0.45517703890800476, "learning_rate": 3.499079189686925e-07, "loss": 0.014804720878601074, "memory(GiB)": 90.94, "reward": 0.8418701887130737, "reward_std": 0.23244859278202057, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9230769276618958, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9950000047683716, "rewards/RMReward/std": 0.012649113312363625, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 456, "train_speed(iter/s)": 0.018169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 102.78125, "completions/min_length": 8.0, "epoch": 0.007015012433610659, "frac_reward_zero_std": 0.0, "grad_norm": 20.356271743774414, "kl": 0.5445412397384644, "learning_rate": 3.506752608962554e-07, "loss": -0.0010832510888576508, "memory(GiB)": 90.94, "reward": 0.6795138716697693, "reward_std": 0.26659753918647766, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8888888955116272, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.96875, "rewards/RMReward/std": 0.07274384051561356, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 457, "train_speed(iter/s)": 0.018168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/mean_length": 93.40625, "completions/min_length": 8.0, "epoch": 0.007030362570226875, "frac_reward_zero_std": 0.0, "grad_norm": 8.954508781433105, "kl": 0.536548912525177, "learning_rate": 3.514426028238183e-07, "loss": 0.024058230221271515, "memory(GiB)": 90.94, "reward": 0.6396293044090271, "reward_std": 0.1723722666501999, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.33863359689712524, "rewards/VisualPerceptionAccuracy/std": 0.10724452883005142, "step": 458, "train_speed(iter/s)": 0.018202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 247.6875, "completions/min_length": 8.0, "epoch": 0.007045712706843091, "frac_reward_zero_std": 0.0, "grad_norm": 10.884978294372559, "kl": 0.26344066858291626, "learning_rate": 3.5220994475138123e-07, "loss": -0.02189285308122635, "memory(GiB)": 90.94, "reward": 0.48831433057785034, "reward_std": 0.15281063318252563, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.03600362315773964, "rewards/VisualPerceptionAccuracy/std": 0.0681212842464447, "step": 459, "train_speed(iter/s)": 0.018207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 137.71875, "completions/min_length": 90.0, "epoch": 0.007061062843459307, "frac_reward_zero_std": 0.0, "grad_norm": 2.10494327545166, "kl": 0.018574967980384827, "learning_rate": 3.5297728667894416e-07, "loss": -0.014836106449365616, "memory(GiB)": 90.94, "reward": 0.42624804377555847, "reward_std": 0.08848299086093903, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8661458492279053, "rewards/PlanningActionSetORM/std": 0.029418550431728363, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.0359397754073143, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.05426688492298126, "rewards/VisualPerceptionAccuracy/std": 0.1482885628938675, "step": 460, "train_speed(iter/s)": 0.018222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 152.40625, "completions/min_length": 80.0, "epoch": 0.007076412980075522, "frac_reward_zero_std": 0.0, "grad_norm": 2.7770447731018066, "kl": 0.021128252148628235, "learning_rate": 3.537446286065071e-07, "loss": 0.04932726174592972, "memory(GiB)": 90.94, "reward": 0.48929765820503235, "reward_std": 0.15182551741600037, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.952343761920929, "rewards/PlanningActionSetORM/std": 0.10249174386262894, "rewards/RMReward/mean": 0.6812499761581421, "rewards/RMReward/std": 0.1046820655465126, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.24312657117843628, "rewards/VisualPerceptionAccuracy/std": 0.21642425656318665, "step": 461, "train_speed(iter/s)": 0.01821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/mean_length": 167.90625, "completions/min_length": 107.0, "epoch": 0.007091763116691739, "frac_reward_zero_std": 0.0, "grad_norm": 1.174901008605957, "kl": 0.04031512886285782, "learning_rate": 3.5451197053407e-07, "loss": -0.016450103372335434, "memory(GiB)": 90.94, "reward": 0.8595138788223267, "reward_std": 0.07652066648006439, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9163194894790649, "rewards/PlanningActionSetORM/std": 0.09893837571144104, "rewards/RMReward/mean": 0.8453124761581421, "rewards/RMReward/std": 0.16261936724185944, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 462, "train_speed(iter/s)": 0.018171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 5.3125, "completions/min_length": 2.0, "epoch": 0.0071071132533079544, "frac_reward_zero_std": 0.0, "grad_norm": 61.282039642333984, "kl": 0.37166160345077515, "learning_rate": 3.5527931246163294e-07, "loss": -0.018016137182712555, "memory(GiB)": 90.94, "reward": 0.6656249761581421, "reward_std": 0.4768567681312561, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.6875, "rewards/VisualPerceptionAccuracy/std": 0.4787135720252991, "step": 463, "train_speed(iter/s)": 0.018208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 114.84375, "completions/min_length": 8.0, "epoch": 0.00712246338992417, "frac_reward_zero_std": 0.0, "grad_norm": 19.09225082397461, "kl": 0.38450127840042114, "learning_rate": 3.5604665438919586e-07, "loss": -0.04172120988368988, "memory(GiB)": 90.94, "reward": 0.7839027643203735, "reward_std": 0.330003023147583, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9590277671813965, "rewards/PlanningActionSetORM/std": 0.05469236895442009, "rewards/RMReward/mean": 0.7668750286102295, "rewards/RMReward/std": 0.28115758299827576, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 464, "train_speed(iter/s)": 0.018214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 192.34375, "completions/min_length": 96.0, "epoch": 0.007137813526540387, "frac_reward_zero_std": 0.0, "grad_norm": 1.8650486469268799, "kl": 0.009099919348955154, "learning_rate": 3.568139963167588e-07, "loss": -0.06984692811965942, "memory(GiB)": 90.94, "reward": 0.5326652526855469, "reward_std": 0.10282003879547119, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8973958492279053, "rewards/PlanningActionSetORM/std": 0.17763149738311768, "rewards/RMReward/mean": 0.7749999761581421, "rewards/RMReward/std": 0.114017553627491, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.26585137844085693, "rewards/VisualPerceptionAccuracy/std": 0.10637662559747696, "step": 465, "train_speed(iter/s)": 0.018209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/mean_length": 214.28125, "completions/min_length": 121.0, "epoch": 0.007153163663156602, "frac_reward_zero_std": 0.0, "grad_norm": 1.5123875141143799, "kl": 0.03109843283891678, "learning_rate": 3.5758133824432166e-07, "loss": 0.08160799741744995, "memory(GiB)": 90.94, "reward": 0.7946094274520874, "reward_std": 0.09810801595449448, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.935546875, "rewards/PlanningActionSetORM/std": 0.14711983501911163, "rewards/RMReward/mean": 0.7593749761581421, "rewards/RMReward/std": 0.13879522681236267, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 466, "train_speed(iter/s)": 0.018164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/mean_length": 59.4375, "completions/min_length": 14.0, "epoch": 0.007168513799772818, "frac_reward_zero_std": 0.0, "grad_norm": 4.125462055206299, "kl": 0.004773234948515892, "learning_rate": 3.583486801718846e-07, "loss": 0.0012445859611034393, "memory(GiB)": 90.94, "reward": 0.09880012273788452, "reward_std": 0.1426105946302414, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.08822524547576904, "rewards/VisualPerceptionAccuracy/std": 0.04772118851542473, "step": 467, "train_speed(iter/s)": 0.018197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 473.1875, "completions/min_length": 87.0, "epoch": 0.0071838639363890336, "frac_reward_zero_std": 0.0, "grad_norm": 2.157089948654175, "kl": 0.032170381397008896, "learning_rate": 3.591160220994475e-07, "loss": 0.06682373583316803, "memory(GiB)": 90.94, "reward": 0.4120972156524658, "reward_std": 0.14433889091014862, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.981249988079071, "rewards/PlanningActionSetORM/std": 0.04330127313733101, "rewards/RMReward/mean": 0.690625011920929, "rewards/RMReward/std": 0.21542109549045563, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07544441521167755, "rewards/VisualPerceptionAccuracy/std": 0.11044104397296906, "step": 468, "train_speed(iter/s)": 0.018198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 193.625, "completions/min_length": 93.0, "epoch": 0.00719921407300525, "frac_reward_zero_std": 0.0, "grad_norm": 2.3402676582336426, "kl": 0.03786081075668335, "learning_rate": 3.5988336402701044e-07, "loss": 0.004110131412744522, "memory(GiB)": 90.94, "reward": 0.8273237347602844, "reward_std": 0.07852253317832947, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9241186380386353, "rewards/PlanningActionSetORM/std": 0.10861173272132874, "rewards/RMReward/mean": 0.8031250238418579, "rewards/RMReward/std": 0.13496564328670502, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 469, "train_speed(iter/s)": 0.018168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/mean_length": 52.6875, "completions/min_length": 8.0, "epoch": 0.007214564209621466, "frac_reward_zero_std": 0.0, "grad_norm": 17.424392700195312, "kl": 0.32991042733192444, "learning_rate": 3.6065070595457337e-07, "loss": 0.016230342909693718, "memory(GiB)": 90.94, "reward": 0.6306509971618652, "reward_std": 0.3538142442703247, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8096354007720947, "rewards/PlanningActionSetORM/std": 0.17207689583301544, "rewards/RMReward/mean": 0.643750011920929, "rewards/RMReward/std": 0.2574716806411743, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 470, "train_speed(iter/s)": 0.018174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/mean_length": 63.25, "completions/min_length": 8.0, "epoch": 0.007229914346237681, "frac_reward_zero_std": 0.0, "grad_norm": 10.490078926086426, "kl": 0.6276254057884216, "learning_rate": 3.614180478821363e-07, "loss": 0.007671169936656952, "memory(GiB)": 90.94, "reward": 0.9128124713897705, "reward_std": 0.18836800754070282, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8562500476837158, "rewards/RMReward/std": 0.17404502630233765, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 471, "train_speed(iter/s)": 0.018162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/mean_length": 11.53125, "completions/min_length": 8.0, "epoch": 0.007245264482853897, "frac_reward_zero_std": 0.0, "grad_norm": 20.694486618041992, "kl": 0.530807614326477, "learning_rate": 3.621853898096992e-07, "loss": -0.017904195934534073, "memory(GiB)": 90.94, "reward": 0.9109375476837158, "reward_std": 0.28099340200424194, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.90625, "rewards/SpatialReasoningORM/std": 0.2961445748806, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 472, "train_speed(iter/s)": 0.018196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/mean_length": 181.59375, "completions/min_length": 8.0, "epoch": 0.0072606146194701135, "frac_reward_zero_std": 0.0, "grad_norm": 11.048949241638184, "kl": 0.435551255941391, "learning_rate": 3.6295273173726215e-07, "loss": 0.026004888117313385, "memory(GiB)": 90.94, "reward": 0.5476908683776855, "reward_std": 0.24830859899520874, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.21413177251815796, "rewards/VisualPerceptionAccuracy/std": 0.17213042080402374, "step": 473, "train_speed(iter/s)": 0.018223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/mean_length": 97.6875, "completions/min_length": 85.0, "epoch": 0.007275964756086329, "frac_reward_zero_std": 0.0, "grad_norm": 2.361966371536255, "kl": 0.03633915260434151, "learning_rate": 3.6372007366482507e-07, "loss": 0.0011094510555267334, "memory(GiB)": 90.94, "reward": 0.831250011920929, "reward_std": 0.03892969340085983, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7890625596046448, "rewards/RMReward/std": 0.05496976152062416, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 474, "train_speed(iter/s)": 0.018224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 155.53125, "completions/min_length": 95.0, "epoch": 0.007291314892702545, "frac_reward_zero_std": 0.0, "grad_norm": 1.682361364364624, "kl": 0.01645374670624733, "learning_rate": 3.6448741559238795e-07, "loss": 0.005363356322050095, "memory(GiB)": 90.94, "reward": 0.47253289818763733, "reward_std": 0.07024925202131271, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8712500333786011, "rewards/RMReward/std": 0.09032349288463593, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.04806584119796753, "rewards/VisualPerceptionAccuracy/std": 0.0682397335767746, "step": 475, "train_speed(iter/s)": 0.018203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/mean_length": 68.1875, "completions/min_length": 8.0, "epoch": 0.007306665029318761, "frac_reward_zero_std": 0.0, "grad_norm": 15.020299911499023, "kl": 0.44431447982788086, "learning_rate": 3.6525475751995087e-07, "loss": -0.0047599636018276215, "memory(GiB)": 90.94, "reward": 0.7365103960037231, "reward_std": 0.2711319327354431, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8463541269302368, "rewards/PlanningActionSetORM/std": 0.062358636409044266, "rewards/RMReward/mean": 0.8250000476837158, "rewards/RMReward/std": 0.0774596631526947, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 476, "train_speed(iter/s)": 0.018208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 107.0, "completions/min_length": 8.0, "epoch": 0.007322015165934977, "frac_reward_zero_std": 0.0, "grad_norm": 9.827051162719727, "kl": 0.47035151720046997, "learning_rate": 3.660220994475138e-07, "loss": 0.08360697329044342, "memory(GiB)": 90.94, "reward": 0.6475819945335388, "reward_std": 0.19482964277267456, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.35453900694847107, "rewards/VisualPerceptionAccuracy/std": 0.15215930342674255, "step": 477, "train_speed(iter/s)": 0.018235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/mean_length": 57.03125, "completions/min_length": 8.0, "epoch": 0.007337365302551193, "frac_reward_zero_std": 0.0, "grad_norm": 29.29713249206543, "kl": 0.6033676862716675, "learning_rate": 3.667894413750767e-07, "loss": 0.08121505379676819, "memory(GiB)": 90.94, "reward": 0.8726562857627869, "reward_std": 0.1533605456352234, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9734375476837158, "rewards/PlanningActionSetORM/std": 0.04784415662288666, "rewards/RMReward/mean": 0.762499988079071, "rewards/RMReward/std": 0.08465616405010223, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 478, "train_speed(iter/s)": 0.018234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.34375, "completions/min_length": 8.0, "epoch": 0.007352715439167408, "frac_reward_zero_std": 0.0, "grad_norm": 28.547094345092773, "kl": 1.187800407409668, "learning_rate": 3.675567833026397e-07, "loss": -0.02327066659927368, "memory(GiB)": 90.94, "reward": 0.614062488079071, "reward_std": 0.47267788648605347, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.59375, "rewards/SpatialReasoningORM/std": 0.49899089336395264, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 479, "train_speed(iter/s)": 0.018269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.007368065575783625, "frac_reward_zero_std": 0.0, "grad_norm": 27.915693283081055, "kl": 0.7109375, "learning_rate": 3.6832412523020263e-07, "loss": 0.0007108859717845917, "memory(GiB)": 90.94, "reward": 0.7328125238418579, "reward_std": 0.40560847520828247, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.71875, "rewards/SpatialReasoningORM/std": 0.45680341124534607, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 480, "train_speed(iter/s)": 0.018302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/mean_length": 165.8125, "completions/min_length": 86.0, "epoch": 0.0073834157123998405, "frac_reward_zero_std": 0.0, "grad_norm": 1.8981467485427856, "kl": 0.047303326427936554, "learning_rate": 3.6909146715776556e-07, "loss": 0.007365290075540543, "memory(GiB)": 90.94, "reward": 0.906166672706604, "reward_std": 0.05740443244576454, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9520833492279053, "rewards/PlanningActionSetORM/std": 0.08701542764902115, "rewards/RMReward/mean": 0.8946874737739563, "rewards/RMReward/std": 0.13711272180080414, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 481, "train_speed(iter/s)": 0.018294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/mean_length": 53.5, "completions/min_length": 8.0, "epoch": 0.007398765849016056, "frac_reward_zero_std": 0.0, "grad_norm": 8.032788276672363, "kl": 0.4309026598930359, "learning_rate": 3.698588090853285e-07, "loss": 0.03551745414733887, "memory(GiB)": 90.94, "reward": 0.8849478960037231, "reward_std": 0.15444281697273254, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8463541269302368, "rewards/PlanningActionSetORM/std": 0.10727987438440323, "rewards/RMReward/mean": 0.824999988079071, "rewards/RMReward/std": 0.08366600424051285, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 482, "train_speed(iter/s)": 0.018304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 62.09375, "completions/min_length": 8.0, "epoch": 0.007414115985632272, "frac_reward_zero_std": 0.0, "grad_norm": 11.0903902053833, "kl": 0.5092900395393372, "learning_rate": 3.706261510128914e-07, "loss": 0.030292324721813202, "memory(GiB)": 90.94, "reward": 0.9003125429153442, "reward_std": 0.1566973179578781, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.831250011920929, "rewards/RMReward/std": 0.08539125323295593, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 483, "train_speed(iter/s)": 0.018316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/mean_length": 98.84375, "completions/min_length": 71.0, "epoch": 0.007429466122248488, "frac_reward_zero_std": 0.0, "grad_norm": 2.720700979232788, "kl": 0.08403855562210083, "learning_rate": 3.7139349294045434e-07, "loss": -0.013936825096607208, "memory(GiB)": 90.94, "reward": 0.8404687643051147, "reward_std": 0.10930022597312927, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8648437261581421, "rewards/PlanningActionSetORM/std": 0.049537938088178635, "rewards/RMReward/mean": 0.8343750238418579, "rewards/RMReward/std": 0.14615362882614136, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 484, "train_speed(iter/s)": 0.018303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/mean_length": 112.1875, "completions/min_length": 95.0, "epoch": 0.007444816258864704, "frac_reward_zero_std": 0.0, "grad_norm": 2.3716351985931396, "kl": 0.08349833637475967, "learning_rate": 3.7216083486801726e-07, "loss": 0.03870212286710739, "memory(GiB)": 90.94, "reward": 0.8634063005447388, "reward_std": 0.0751008689403534, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8820312023162842, "rewards/PlanningActionSetORM/std": 0.04055558145046234, "rewards/RMReward/mean": 0.8587499856948853, "rewards/RMReward/std": 0.11454678326845169, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 485, "train_speed(iter/s)": 0.018288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/mean_length": 268.90625, "completions/min_length": 95.0, "epoch": 0.0074601663954809196, "frac_reward_zero_std": 0.0, "grad_norm": 2.112240791320801, "kl": 0.014795554801821709, "learning_rate": 3.7292817679558014e-07, "loss": 0.06592080742120743, "memory(GiB)": 90.94, "reward": 0.44497817754745483, "reward_std": 0.06284588575363159, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8984375, "rewards/PlanningActionSetORM/std": 0.06355361640453339, "rewards/RMReward/mean": 0.859375, "rewards/RMReward/std": 0.07576002925634384, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.02276882529258728, "rewards/VisualPerceptionAccuracy/std": 0.06037288159132004, "step": 486, "train_speed(iter/s)": 0.018262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 85.90625, "completions/min_length": 8.0, "epoch": 0.007475516532097136, "frac_reward_zero_std": 0.0, "grad_norm": 25.642934799194336, "kl": 0.47585442662239075, "learning_rate": 3.7369551872314306e-07, "loss": -0.04100598022341728, "memory(GiB)": 90.94, "reward": 0.2729528844356537, "reward_std": 0.2823214530944824, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.13965578377246857, "rewards/VisualPerceptionAccuracy/std": 0.08964291214942932, "step": 487, "train_speed(iter/s)": 0.018293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 187.9375, "completions/min_length": 8.0, "epoch": 0.007490866668713352, "frac_reward_zero_std": 0.0, "grad_norm": 22.729101181030273, "kl": 0.4849288761615753, "learning_rate": 3.74462860650706e-07, "loss": 0.008791293948888779, "memory(GiB)": 90.94, "reward": 0.7220953702926636, "reward_std": 0.2850942611694336, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9428290128707886, "rewards/PlanningActionSetORM/std": 0.012357220984995365, "rewards/RMReward/mean": 0.690625011920929, "rewards/RMReward/std": 0.1440124362707138, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 488, "train_speed(iter/s)": 0.018281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/mean_length": 101.625, "completions/min_length": 77.0, "epoch": 0.007506216805329567, "frac_reward_zero_std": 0.0, "grad_norm": 2.4634218215942383, "kl": 0.04712187871336937, "learning_rate": 3.752302025782689e-07, "loss": -0.015105579048395157, "memory(GiB)": 90.94, "reward": 0.88364577293396, "reward_std": 0.060211148113012314, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9307291507720947, "rewards/PlanningActionSetORM/std": 0.08738631755113602, "rewards/RMReward/mean": 0.871874988079071, "rewards/RMReward/std": 0.07718587666749954, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 489, "train_speed(iter/s)": 0.018285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.007521566941945783, "frac_reward_zero_std": 0.0, "grad_norm": 12.717827796936035, "kl": 0.94287109375, "learning_rate": 3.7599754450583184e-07, "loss": 0.000941544771194458, "memory(GiB)": 90.94, "reward": 0.940625011920929, "reward_std": 0.23749999701976776, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.24593468010425568, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 490, "train_speed(iter/s)": 0.01832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/mean_length": 183.59375, "completions/min_length": 87.0, "epoch": 0.0075369170785619995, "frac_reward_zero_std": 0.0, "grad_norm": 1.3733196258544922, "kl": 0.04080360382795334, "learning_rate": 3.7676488643339477e-07, "loss": -0.0012495936825871468, "memory(GiB)": 90.94, "reward": 0.9072083234786987, "reward_std": 0.08984646201133728, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9635416269302368, "rewards/PlanningActionSetORM/std": 0.05826609581708908, "rewards/RMReward/mean": 0.8931249380111694, "rewards/RMReward/std": 0.11354798078536987, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 491, "train_speed(iter/s)": 0.018301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/mean_length": 255.0, "completions/min_length": 94.0, "epoch": 0.007552267215178215, "frac_reward_zero_std": 0.0, "grad_norm": 1.2978448867797852, "kl": 0.03136986494064331, "learning_rate": 3.775322283609577e-07, "loss": 0.017874358221888542, "memory(GiB)": 90.94, "reward": 0.7688315510749817, "reward_std": 0.10794184356927872, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.994157612323761, "rewards/PlanningActionSetORM/std": 0.020231280475854874, "rewards/RMReward/mean": 0.7124999761581421, "rewards/RMReward/std": 0.1631346195936203, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 492, "train_speed(iter/s)": 0.018257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/mean_length": 40.0625, "completions/min_length": 8.0, "epoch": 0.007567617351794431, "frac_reward_zero_std": 0.0, "grad_norm": 13.275485038757324, "kl": 0.4841064512729645, "learning_rate": 3.782995702885206e-07, "loss": 0.05171171575784683, "memory(GiB)": 90.94, "reward": 0.8378124833106995, "reward_std": 0.22047334909439087, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.96875, "rewards/PlanningActionSetORM/std": 0.06718549132347107, "rewards/RMReward/mean": 0.824999988079071, "rewards/RMReward/std": 0.06324554979801178, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 493, "train_speed(iter/s)": 0.018265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/mean_length": 114.375, "completions/min_length": 82.0, "epoch": 0.0075829674884106465, "frac_reward_zero_std": 0.0, "grad_norm": 2.042163610458374, "kl": 0.04552840441465378, "learning_rate": 3.7906691221608354e-07, "loss": -0.014347271993756294, "memory(GiB)": 90.94, "reward": 0.7846354246139526, "reward_std": 0.0832681953907013, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9606770873069763, "rewards/PlanningActionSetORM/std": 0.07623635232448578, "rewards/RMReward/mean": 0.7406250238418579, "rewards/RMReward/std": 0.08929608017206192, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 494, "train_speed(iter/s)": 0.018247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/mean_length": 104.875, "completions/min_length": 84.0, "epoch": 0.007598317625026863, "frac_reward_zero_std": 0.0, "grad_norm": 1.782818078994751, "kl": 0.06692086905241013, "learning_rate": 3.798342541436464e-07, "loss": -0.008175402879714966, "memory(GiB)": 90.94, "reward": 0.8994063138961792, "reward_std": 0.05258680135011673, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.938281238079071, "rewards/PlanningActionSetORM/std": 0.08744838088750839, "rewards/RMReward/mean": 0.8896875381469727, "rewards/RMReward/std": 0.0960337445139885, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 495, "train_speed(iter/s)": 0.01821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 198.4375, "completions/min_length": 94.0, "epoch": 0.007613667761643079, "frac_reward_zero_std": 0.0, "grad_norm": 1.4624470472335815, "kl": 0.0353463739156723, "learning_rate": 3.8060159607120934e-07, "loss": 0.07602844387292862, "memory(GiB)": 90.94, "reward": 0.8519389629364014, "reward_std": 0.12139710783958435, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.940944492816925, "rewards/PlanningActionSetORM/std": 0.04683314263820648, "rewards/RMReward/mean": 0.8296874761581421, "rewards/RMReward/std": 0.15956562757492065, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 496, "train_speed(iter/s)": 0.018168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 11.9375, "completions/min_length": 8.0, "epoch": 0.007629017898259294, "frac_reward_zero_std": 0.0, "grad_norm": 21.298112869262695, "kl": 0.6092252135276794, "learning_rate": 3.8136893799877227e-07, "loss": 0.009607329964637756, "memory(GiB)": 90.94, "reward": 0.5546875, "reward_std": 0.3537220358848572, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.53125, "rewards/SpatialReasoningORM/std": 0.507007360458374, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 497, "train_speed(iter/s)": 0.018166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/mean_length": 146.9375, "completions/min_length": 110.0, "epoch": 0.007644368034875511, "frac_reward_zero_std": 0.0, "grad_norm": 1.0378997325897217, "kl": 0.06910277903079987, "learning_rate": 3.821362799263352e-07, "loss": -0.0017558857798576355, "memory(GiB)": 90.94, "reward": 0.9782500267028809, "reward_std": 0.034862808883190155, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9728125333786011, "rewards/RMReward/std": 0.04887339100241661, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 498, "train_speed(iter/s)": 0.01812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/mean_length": 69.59375, "completions/min_length": 8.0, "epoch": 0.0076597181714917265, "frac_reward_zero_std": 0.0, "grad_norm": 10.201099395751953, "kl": 0.45204952359199524, "learning_rate": 3.829036218538981e-07, "loss": -0.012082880362868309, "memory(GiB)": 90.94, "reward": 0.5733798146247864, "reward_std": 0.216994509100914, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.2061346471309662, "rewards/VisualPerceptionAccuracy/std": 0.19648902118206024, "step": 499, "train_speed(iter/s)": 0.018152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/mean_length": 276.375, "completions/min_length": 115.0, "epoch": 0.007675068308107942, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609904289245605, "kl": 0.029964450746774673, "learning_rate": 3.8367096378146105e-07, "loss": 0.05528225004673004, "memory(GiB)": 90.94, "reward": 0.48027902841567993, "reward_std": 0.05574224144220352, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9124999642372131, "rewards/RMReward/std": 0.08465616405010223, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.030558079481124878, "rewards/VisualPerceptionAccuracy/std": 0.043759554624557495, "step": 500, "train_speed(iter/s)": 0.018144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 109.21875, "completions/min_length": 8.0, "epoch": 0.007690418444724158, "frac_reward_zero_std": 0.0, "grad_norm": 10.98415470123291, "kl": 0.6293608546257019, "learning_rate": 3.84438305709024e-07, "loss": 0.033635213971138, "memory(GiB)": 90.94, "reward": 0.13100461661815643, "reward_std": 0.20465627312660217, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.09325923025608063, "rewards/VisualPerceptionAccuracy/std": 0.0848257839679718, "step": 501, "train_speed(iter/s)": 0.018122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/mean_length": 59.625, "completions/min_length": 8.0, "epoch": 0.007705768581340374, "frac_reward_zero_std": 0.0, "grad_norm": 29.417001724243164, "kl": 0.43923190236091614, "learning_rate": 3.852056476365869e-07, "loss": 0.0016805008053779602, "memory(GiB)": 90.94, "reward": 0.621666669845581, "reward_std": 0.2774459719657898, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8666666746139526, "rewards/PlanningActionSetORM/std": 0.14631271362304688, "rewards/RMReward/mean": 0.6812499761581421, "rewards/RMReward/std": 0.07274384796619415, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 502, "train_speed(iter/s)": 0.018134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.00772111871795659, "frac_reward_zero_std": 1.0, "grad_norm": 0.049059394747018814, "kl": 1.0341796875, "learning_rate": 3.8597298956414983e-07, "loss": 0.0010315505787730217, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 503, "train_speed(iter/s)": 0.018162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 121.65625, "completions/min_length": 8.0, "epoch": 0.0077364688545728056, "frac_reward_zero_std": 0.0, "grad_norm": 35.213294982910156, "kl": 0.7556869387626648, "learning_rate": 3.867403314917127e-07, "loss": 0.0009124651551246643, "memory(GiB)": 90.94, "reward": 0.5763750076293945, "reward_std": 0.18258708715438843, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9800000190734863, "rewards/RMReward/std": 0.05085927993059158, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 504, "train_speed(iter/s)": 0.018158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 114.53125, "completions/min_length": 81.0, "epoch": 0.007751818991189021, "frac_reward_zero_std": 0.0, "grad_norm": 2.700779438018799, "kl": 0.028433866798877716, "learning_rate": 3.8750767341927563e-07, "loss": -0.0024089477956295013, "memory(GiB)": 90.94, "reward": 0.4250760078430176, "reward_std": 0.02847634255886078, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.03415650874376297, "rewards/RMReward/mean": 0.784375011920929, "rewards/RMReward/std": 0.047324247658252716, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.025151975452899933, "rewards/VisualPerceptionAccuracy/std": 0.01935836300253868, "step": 505, "train_speed(iter/s)": 0.018137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 160.09375, "completions/min_length": 104.0, "epoch": 0.007767169127805238, "frac_reward_zero_std": 0.0, "grad_norm": 1.6606138944625854, "kl": 0.03092845343053341, "learning_rate": 3.8827501534683855e-07, "loss": 0.03267665579915047, "memory(GiB)": 90.94, "reward": 0.8012499809265137, "reward_std": 0.09933389723300934, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7515624761581421, "rewards/RMReward/std": 0.15682554244995117, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 506, "train_speed(iter/s)": 0.018093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/mean_length": 53.625, "completions/min_length": 8.0, "epoch": 0.007782519264421453, "frac_reward_zero_std": 0.0, "grad_norm": 9.694007873535156, "kl": 0.5532850027084351, "learning_rate": 3.890423572744015e-07, "loss": 0.0006832145154476166, "memory(GiB)": 90.94, "reward": 0.890749990940094, "reward_std": 0.2018512487411499, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9937499761581421, "rewards/PlanningActionSetORM/std": 0.025000005960464478, "rewards/RMReward/mean": 0.8768750429153442, "rewards/RMReward/std": 0.09665876626968384, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 507, "train_speed(iter/s)": 0.0181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/mean_length": 274.34375, "completions/min_length": 106.0, "epoch": 0.007797869401037669, "frac_reward_zero_std": 0.0, "grad_norm": 1.4346359968185425, "kl": 0.06616972386837006, "learning_rate": 3.898096992019644e-07, "loss": -0.06717785447835922, "memory(GiB)": 90.94, "reward": 0.5464950799942017, "reward_std": 0.05495380610227585, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.996874988079071, "rewards/RMReward/std": 0.012500002980232239, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09549014270305634, "rewards/VisualPerceptionAccuracy/std": 0.09990762174129486, "step": 508, "train_speed(iter/s)": 0.018102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 138.8125, "completions/min_length": 8.0, "epoch": 0.007813219537653885, "frac_reward_zero_std": 0.0, "grad_norm": 55.92293167114258, "kl": 0.44918814301490784, "learning_rate": 3.9057704112952733e-07, "loss": 0.050406765192747116, "memory(GiB)": 90.94, "reward": 0.18836939334869385, "reward_std": 0.259202778339386, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.08923877775669098, "rewards/VisualPerceptionAccuracy/std": 0.09355267137289047, "step": 509, "train_speed(iter/s)": 0.018131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/mean_length": 133.5625, "completions/min_length": 95.0, "epoch": 0.0078285696742701, "frac_reward_zero_std": 0.0, "grad_norm": 3.923942804336548, "kl": 0.065409816801548, "learning_rate": 3.9134438305709026e-07, "loss": -0.010537831112742424, "memory(GiB)": 90.94, "reward": 0.9120312929153442, "reward_std": 0.14050261676311493, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9164062738418579, "rewards/PlanningActionSetORM/std": 0.14488723874092102, "rewards/RMReward/mean": 0.910937488079071, "rewards/RMReward/std": 0.15932030975818634, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 510, "train_speed(iter/s)": 0.018075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/mean_length": 120.09375, "completions/min_length": 86.0, "epoch": 0.007843919810886318, "frac_reward_zero_std": 0.0, "grad_norm": 2.4544317722320557, "kl": 0.06205087900161743, "learning_rate": 3.921117249846532e-07, "loss": 0.008962363004684448, "memory(GiB)": 90.94, "reward": 0.8463749885559082, "reward_std": 0.0659213662147522, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.940625011920929, "rewards/PlanningActionSetORM/std": 0.06082431226968765, "rewards/RMReward/mean": 0.8228124976158142, "rewards/RMReward/std": 0.11543002724647522, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 511, "train_speed(iter/s)": 0.018042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/mean_length": 225.34375, "completions/min_length": 84.0, "epoch": 0.007859269947502533, "frac_reward_zero_std": 0.0, "grad_norm": 3.9833686351776123, "kl": 0.018828772008419037, "learning_rate": 3.9287906691221606e-07, "loss": 0.030827108770608902, "memory(GiB)": 90.94, "reward": 0.4633163511753082, "reward_std": 0.06970712542533875, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9337325692176819, "rewards/PlanningActionSetORM/std": 0.13254648447036743, "rewards/RMReward/mean": 0.706250011920929, "rewards/RMReward/std": 0.13149777054786682, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.17488618195056915, "rewards/VisualPerceptionAccuracy/std": 0.03206339478492737, "step": 512, "train_speed(iter/s)": 0.018035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/mean_length": 120.875, "completions/min_length": 91.0, "epoch": 0.007874620084118749, "frac_reward_zero_std": 0.0, "grad_norm": 2.270031452178955, "kl": 0.02126414328813553, "learning_rate": 3.93646408839779e-07, "loss": -0.05840768665075302, "memory(GiB)": 90.94, "reward": 0.4626438021659851, "reward_std": 0.08485860377550125, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9567708373069763, "rewards/PlanningActionSetORM/std": 0.05138983577489853, "rewards/RMReward/mean": 0.6531250476837158, "rewards/RMReward/std": 0.16779825091362, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.21143338084220886, "rewards/VisualPerceptionAccuracy/std": 0.03670687973499298, "step": 513, "train_speed(iter/s)": 0.018047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/mean_length": 106.53125, "completions/min_length": 80.0, "epoch": 0.007889970220734965, "frac_reward_zero_std": 0.0, "grad_norm": 2.182873487472534, "kl": 0.059764910489320755, "learning_rate": 3.94413750767342e-07, "loss": 0.006160896271467209, "memory(GiB)": 90.94, "reward": 0.8323854207992554, "reward_std": 0.06478796154260635, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8419270515441895, "rewards/PlanningActionSetORM/std": 0.06627858430147171, "rewards/RMReward/mean": 0.8299999833106995, "rewards/RMReward/std": 0.15868410468101501, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 514, "train_speed(iter/s)": 0.017982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/mean_length": 56.65625, "completions/min_length": 8.0, "epoch": 0.00790532035735118, "frac_reward_zero_std": 0.0, "grad_norm": 35.71538543701172, "kl": 0.5599426627159119, "learning_rate": 3.951810926949049e-07, "loss": 0.0055130645632743835, "memory(GiB)": 90.94, "reward": 0.7374999523162842, "reward_std": 0.2715771198272705, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9437500238418579, "rewards/PlanningActionSetORM/std": 0.058807604014873505, "rewards/RMReward/mean": 0.8031250238418579, "rewards/RMReward/std": 0.08260094374418259, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 515, "train_speed(iter/s)": 0.017988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 192.0, "completions/min_length": 97.0, "epoch": 0.007920670493967396, "frac_reward_zero_std": 0.0, "grad_norm": 1.849278211593628, "kl": 0.039799392223358154, "learning_rate": 3.959484346224678e-07, "loss": 0.02079147845506668, "memory(GiB)": 90.94, "reward": 0.8915953040122986, "reward_std": 0.07552343606948853, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9392263889312744, "rewards/PlanningActionSetORM/std": 0.10607850551605225, "rewards/RMReward/mean": 0.879687488079071, "rewards/RMReward/std": 0.11560455709695816, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 516, "train_speed(iter/s)": 0.017975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/mean_length": 157.09375, "completions/min_length": 8.0, "epoch": 0.007936020630583612, "frac_reward_zero_std": 0.0, "grad_norm": 28.664125442504883, "kl": 0.5445536375045776, "learning_rate": 3.9671577655003074e-07, "loss": -0.008291486650705338, "memory(GiB)": 90.94, "reward": 0.33746838569641113, "reward_std": 0.3080594539642334, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": 0.14993682503700256, "rewards/VisualPerceptionAccuracy/std": 0.12554101645946503, "step": 517, "train_speed(iter/s)": 0.017992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 195.6875, "completions/min_length": 148.0, "epoch": 0.007951370767199827, "frac_reward_zero_std": 0.0, "grad_norm": 0.5170093178749084, "kl": 0.0532982274889946, "learning_rate": 3.9748311847759367e-07, "loss": -0.0005689263343811035, "memory(GiB)": 90.94, "reward": 0.9965000152587891, "reward_std": 0.010092873126268387, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9956250190734863, "rewards/RMReward/std": 0.012427208945155144, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 518, "train_speed(iter/s)": 0.01799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 391.125, "completions/min_length": 86.0, "epoch": 0.007966720903816045, "frac_reward_zero_std": 0.0, "grad_norm": 1.546562910079956, "kl": 0.0737360492348671, "learning_rate": 3.982504604051566e-07, "loss": -0.009704146534204483, "memory(GiB)": 90.94, "reward": 0.5261733531951904, "reward_std": 0.12544366717338562, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8723958134651184, "rewards/PlanningActionSetORM/std": 0.010416671633720398, "rewards/RMReward/mean": 0.9125000238418579, "rewards/RMReward/std": 0.09036961197853088, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.14786753058433533, "rewards/VisualPerceptionAccuracy/std": 0.17756988108158112, "step": 519, "train_speed(iter/s)": 0.017951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/mean_length": 62.875, "completions/min_length": 14.0, "epoch": 0.00798207104043226, "frac_reward_zero_std": 0.0, "grad_norm": 8.43814754486084, "kl": 0.07590651512145996, "learning_rate": 3.990178023327195e-07, "loss": -0.01165139302611351, "memory(GiB)": 90.94, "reward": 0.9013854265213013, "reward_std": 0.21892493963241577, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8776041865348816, "rewards/PlanningActionSetORM/std": 0.010416671633720398, "rewards/RMReward/mean": 0.9325000047683716, "rewards/RMReward/std": 0.1425716131925583, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 520, "train_speed(iter/s)": 0.017912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/mean_length": 216.15625, "completions/min_length": 210.0, "epoch": 0.007997421177048476, "frac_reward_zero_std": 0.0, "grad_norm": 0.4951765537261963, "kl": 0.04130841791629791, "learning_rate": 3.9978514426028245e-07, "loss": -0.0005464479327201843, "memory(GiB)": 90.94, "reward": 0.9630277752876282, "reward_std": 0.009500985965132713, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8888888955116272, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9815624952316284, "rewards/RMReward/std": 0.023432733491063118, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 521, "train_speed(iter/s)": 0.017909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/mean_length": 160.96875, "completions/min_length": 155.0, "epoch": 0.008012771313664692, "frac_reward_zero_std": 0.0, "grad_norm": 1.1437360048294067, "kl": 0.06901523470878601, "learning_rate": 4.0055248618784537e-07, "loss": -0.0016966909170150757, "memory(GiB)": 90.94, "reward": 0.9482499957084656, "reward_std": 0.036885932087898254, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9353125095367432, "rewards/RMReward/std": 0.08304620534181595, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 522, "train_speed(iter/s)": 0.017883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 241.59375, "completions/min_length": 106.0, "epoch": 0.008028121450280907, "frac_reward_zero_std": 0.0, "grad_norm": 1.1971970796585083, "kl": 0.029242580756545067, "learning_rate": 4.013198281154083e-07, "loss": -0.03280522674322128, "memory(GiB)": 90.94, "reward": 0.7945045232772827, "reward_std": 0.06285648792982101, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9600224494934082, "rewards/PlanningActionSetORM/std": 0.07743117958307266, "rewards/RMReward/mean": 0.753125011920929, "rewards/RMReward/std": 0.07613390684127808, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 523, "train_speed(iter/s)": 0.017858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.008043471586897123, "frac_reward_zero_std": 0.0, "grad_norm": 14.7977876663208, "kl": 1.01318359375, "learning_rate": 4.0208717004297117e-07, "loss": 0.0010116882622241974, "memory(GiB)": 90.94, "reward": 0.940625011920929, "reward_std": 0.23749999701976776, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.24593468010425568, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 524, "train_speed(iter/s)": 0.017864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 144.4375, "completions/min_length": 105.0, "epoch": 0.008058821723513339, "frac_reward_zero_std": 0.0, "grad_norm": 1.6535013914108276, "kl": 0.0446917749941349, "learning_rate": 4.028545119705341e-07, "loss": 0.0007055588066577911, "memory(GiB)": 90.94, "reward": 0.6009480953216553, "reward_std": 0.0859188586473465, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9931249618530273, "rewards/RMReward/std": 0.024958305060863495, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.23239609599113464, "rewards/VisualPerceptionAccuracy/std": 0.15187108516693115, "step": 525, "train_speed(iter/s)": 0.017837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/mean_length": 179.09375, "completions/min_length": 90.0, "epoch": 0.008074171860129556, "frac_reward_zero_std": 0.0, "grad_norm": 2.356734275817871, "kl": 0.026568841189146042, "learning_rate": 4.03621853898097e-07, "loss": 0.04634793475270271, "memory(GiB)": 90.94, "reward": 0.5827791094779968, "reward_std": 0.14064820110797882, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.762499988079071, "rewards/RMReward/std": 0.21252451837062836, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3555581867694855, "rewards/VisualPerceptionAccuracy/std": 0.11127682030200958, "step": 526, "train_speed(iter/s)": 0.017831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 213.0, "completions/min_length": 182.0, "epoch": 0.008089521996745772, "frac_reward_zero_std": 0.0, "grad_norm": 0.33243420720100403, "kl": 0.05148990452289581, "learning_rate": 4.0438919582565995e-07, "loss": -0.006415821611881256, "memory(GiB)": 90.94, "reward": 0.8691713809967041, "reward_std": 0.18153896927833557, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9558566808700562, "rewards/PlanningActionSetORM/std": 0.04502082243561745, "rewards/RMReward/mean": 0.8474999666213989, "rewards/RMReward/std": 0.24120663106441498, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 527, "train_speed(iter/s)": 0.017719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/mean_length": 11.28125, "completions/min_length": 8.0, "epoch": 0.008104872133361987, "frac_reward_zero_std": 0.0, "grad_norm": 37.223541259765625, "kl": 0.47955751419067383, "learning_rate": 4.051565377532229e-07, "loss": -0.01786630228161812, "memory(GiB)": 90.94, "reward": 0.7328125238418579, "reward_std": 0.36403894424438477, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.71875, "rewards/SpatialReasoningORM/std": 0.45680341124534607, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 528, "train_speed(iter/s)": 0.017748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/mean_length": 70.78125, "completions/min_length": 8.0, "epoch": 0.008120222269978203, "frac_reward_zero_std": 0.0, "grad_norm": 13.62794017791748, "kl": 0.49371394515037537, "learning_rate": 4.059238796807858e-07, "loss": 0.0009810999035835266, "memory(GiB)": 90.94, "reward": 0.8965625166893005, "reward_std": 0.14374998211860657, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8156249523162842, "rewards/RMReward/std": 0.0625, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 529, "train_speed(iter/s)": 0.017759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 248.125, "completions/min_length": 88.0, "epoch": 0.008135572406594418, "frac_reward_zero_std": 0.0, "grad_norm": 1.5026217699050903, "kl": 0.05468415468931198, "learning_rate": 4.0669122160834873e-07, "loss": 0.01594775915145874, "memory(GiB)": 90.94, "reward": 0.8088710904121399, "reward_std": 0.06763684004545212, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9756053686141968, "rewards/PlanningActionSetORM/std": 0.02645184099674225, "rewards/RMReward/mean": 0.7671874761581421, "rewards/RMReward/std": 0.10519519448280334, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 530, "train_speed(iter/s)": 0.017729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/mean_length": 49.84375, "completions/min_length": 8.0, "epoch": 0.008150922543210634, "frac_reward_zero_std": 0.0, "grad_norm": 10.495291709899902, "kl": 0.5271108150482178, "learning_rate": 4.0745856353591166e-07, "loss": 0.001964941620826721, "memory(GiB)": 90.94, "reward": 0.864062488079071, "reward_std": 0.19389086961746216, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.984375, "rewards/PlanningActionSetORM/std": 0.042695630341768265, "rewards/RMReward/mean": 0.8125, "rewards/RMReward/std": 0.07637625932693481, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 531, "train_speed(iter/s)": 0.017737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 173.1875, "completions/min_length": 78.0, "epoch": 0.00816627267982685, "frac_reward_zero_std": 0.0, "grad_norm": 1.787888765335083, "kl": 0.05399623513221741, "learning_rate": 4.0822590546347453e-07, "loss": -0.012808017432689667, "memory(GiB)": 90.94, "reward": 0.47223159670829773, "reward_std": 0.1022137850522995, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9156249761581421, "rewards/RMReward/std": 0.20389437675476074, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.011963209137320518, "rewards/VisualPerceptionAccuracy/std": 0.04131205752491951, "step": 532, "train_speed(iter/s)": 0.017704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 153.375, "completions/min_length": 8.0, "epoch": 0.008181622816443067, "frac_reward_zero_std": 0.0, "grad_norm": 19.599157333374023, "kl": 0.731479287147522, "learning_rate": 4.0899324739103746e-07, "loss": -0.0010098889470100403, "memory(GiB)": 90.94, "reward": 0.4021875262260437, "reward_std": 0.19318118691444397, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.6187499761581421, "rewards/RMReward/std": 0.18607793748378754, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 533, "train_speed(iter/s)": 0.017671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/mean_length": 179.9375, "completions/min_length": 119.0, "epoch": 0.008196972953059283, "frac_reward_zero_std": 0.0, "grad_norm": 1.0761127471923828, "kl": 0.11048981547355652, "learning_rate": 4.097605893186004e-07, "loss": -0.004910711199045181, "memory(GiB)": 90.94, "reward": 0.9847500324249268, "reward_std": 0.04049193859100342, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9809374809265137, "rewards/RMReward/std": 0.05126524344086647, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 534, "train_speed(iter/s)": 0.017663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/mean_length": 128.40625, "completions/min_length": 96.0, "epoch": 0.008212323089675498, "frac_reward_zero_std": 0.0, "grad_norm": 1.678385853767395, "kl": 0.06412062793970108, "learning_rate": 4.105279312461633e-07, "loss": -0.007236500736325979, "memory(GiB)": 90.94, "reward": 0.8986250162124634, "reward_std": 0.021434586495161057, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9906250238418579, "rewards/PlanningActionSetORM/std": 0.029614463448524475, "rewards/RMReward/mean": 0.8756250143051147, "rewards/RMReward/std": 0.12923941016197205, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 535, "train_speed(iter/s)": 0.01766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/mean_length": 276.4375, "completions/min_length": 83.0, "epoch": 0.008227673226291714, "frac_reward_zero_std": 0.0, "grad_norm": 2.2167770862579346, "kl": 0.017640898004174232, "learning_rate": 4.1129527317372623e-07, "loss": 0.00647609680891037, "memory(GiB)": 90.94, "reward": 0.37901195883750916, "reward_std": 0.0970657467842102, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9539843797683716, "rewards/PlanningActionSetORM/std": 0.005125355441123247, "rewards/RMReward/mean": 0.6843750476837158, "rewards/RMReward/std": 0.15569067001342773, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.01972706988453865, "rewards/VisualPerceptionAccuracy/std": 0.06964111328125, "step": 536, "train_speed(iter/s)": 0.017653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/mean_length": 95.875, "completions/min_length": 72.0, "epoch": 0.00824302336290793, "frac_reward_zero_std": 0.0, "grad_norm": 2.43375825881958, "kl": 0.06031361222267151, "learning_rate": 4.1206261510128916e-07, "loss": 0.022382553666830063, "memory(GiB)": 90.94, "reward": 0.8424999713897705, "reward_std": 0.06390365958213806, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.03360108286142349, "rewards/RMReward/mean": 0.8062499761581421, "rewards/RMReward/std": 0.08957785367965698, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 537, "train_speed(iter/s)": 0.017653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/mean_length": 365.25, "completions/min_length": 8.0, "epoch": 0.008258373499524145, "frac_reward_zero_std": 0.0, "grad_norm": 37.34550857543945, "kl": 0.4057350754737854, "learning_rate": 4.128299570288521e-07, "loss": -0.0608179047703743, "memory(GiB)": 90.94, "reward": 0.37568992376327515, "reward_std": 0.3067273497581482, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.16700479388237, "rewards/VisualPerceptionAccuracy/std": 0.12672454118728638, "step": 538, "train_speed(iter/s)": 0.017643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 182.15625, "completions/min_length": 90.0, "epoch": 0.008273723636140361, "frac_reward_zero_std": 0.0, "grad_norm": 1.5986415147781372, "kl": 0.04752251133322716, "learning_rate": 4.13597298956415e-07, "loss": 0.0352596789598465, "memory(GiB)": 90.94, "reward": 0.7570680379867554, "reward_std": 0.14470338821411133, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9665902256965637, "rewards/PlanningActionSetORM/std": 0.05676404759287834, "rewards/RMReward/mean": 0.7046874761581421, "rewards/RMReward/std": 0.17150160670280457, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 539, "train_speed(iter/s)": 0.017635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/mean_length": 244.53125, "completions/min_length": 85.0, "epoch": 0.008289073772756577, "frac_reward_zero_std": 0.0, "grad_norm": 1.3954708576202393, "kl": 0.032912515103816986, "learning_rate": 4.1436464088397794e-07, "loss": -0.03983844444155693, "memory(GiB)": 90.94, "reward": 0.7881758213043213, "reward_std": 0.08922916650772095, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9471291303634644, "rewards/PlanningActionSetORM/std": 0.07591656595468521, "rewards/RMReward/mean": 0.7484375238418579, "rewards/RMReward/std": 0.12147240340709686, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 540, "train_speed(iter/s)": 0.017579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/mean_length": 231.15625, "completions/min_length": 83.0, "epoch": 0.008304423909372794, "frac_reward_zero_std": 0.0, "grad_norm": 1.8748215436935425, "kl": 0.028243456035852432, "learning_rate": 4.151319828115408e-07, "loss": -0.035128939896821976, "memory(GiB)": 90.94, "reward": 0.5312725305557251, "reward_std": 0.08987291157245636, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9739583134651184, "rewards/PlanningActionSetORM/std": 0.05667279660701752, "rewards/RMReward/mean": 0.765625, "rewards/RMReward/std": 0.05072394013404846, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2552533745765686, "rewards/VisualPerceptionAccuracy/std": 0.13210569322109222, "step": 541, "train_speed(iter/s)": 0.017589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/mean_length": 73.875, "completions/min_length": 8.0, "epoch": 0.00831977404598901, "frac_reward_zero_std": 0.0, "grad_norm": 21.475322723388672, "kl": 0.6201650500297546, "learning_rate": 4.1589932473910374e-07, "loss": -0.025495830923318863, "memory(GiB)": 90.94, "reward": 0.6416249871253967, "reward_std": 0.3154315948486328, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.846250057220459, "rewards/RMReward/std": 0.19482900202274323, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 542, "train_speed(iter/s)": 0.017584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/mean_length": 194.625, "completions/min_length": 109.0, "epoch": 0.008335124182605225, "frac_reward_zero_std": 0.0, "grad_norm": 1.659984827041626, "kl": 0.034968823194503784, "learning_rate": 4.1666666666666667e-07, "loss": -0.024810679256916046, "memory(GiB)": 90.94, "reward": 0.456966757774353, "reward_std": 0.04549291729927063, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9090909361839294, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8843749761581421, "rewards/RMReward/std": 0.07685212790966034, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.024615373462438583, "rewards/VisualPerceptionAccuracy/std": 0.029504159465432167, "step": 543, "train_speed(iter/s)": 0.017573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 466.15625, "completions/min_length": 121.0, "epoch": 0.008350474319221441, "frac_reward_zero_std": 0.0, "grad_norm": 0.8944506645202637, "kl": 0.01988881267607212, "learning_rate": 4.174340085942296e-07, "loss": -0.06208252161741257, "memory(GiB)": 90.94, "reward": 0.5134647488594055, "reward_std": 0.17686957120895386, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9479347467422485, "rewards/PlanningActionSetORM/std": 0.016269050538539886, "rewards/RMReward/mean": 0.7893750071525574, "rewards/RMReward/std": 0.1988120973110199, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.20584258437156677, "rewards/VisualPerceptionAccuracy/std": 0.19511884450912476, "step": 544, "train_speed(iter/s)": 0.017548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/mean_length": 41.21875, "completions/min_length": 13.0, "epoch": 0.008365824455837657, "frac_reward_zero_std": 0.0, "grad_norm": 9.110894203186035, "kl": 0.20733514428138733, "learning_rate": 4.182013505217925e-07, "loss": 0.02045547217130661, "memory(GiB)": 90.94, "reward": 0.7743749618530273, "reward_std": 0.23851656913757324, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.96875, "rewards/PlanningActionSetORM/std": 0.06718549132347107, "rewards/RMReward/mean": 0.7406250238418579, "rewards/RMReward/std": 0.0663795918226242, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 545, "train_speed(iter/s)": 0.017558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/mean_length": 61.03125, "completions/min_length": 13.0, "epoch": 0.008381174592453872, "frac_reward_zero_std": 0.0, "grad_norm": 5.228725433349609, "kl": 0.08796176314353943, "learning_rate": 4.1896869244935544e-07, "loss": -0.020294256508350372, "memory(GiB)": 90.94, "reward": 0.7087500095367432, "reward_std": 0.2815977931022644, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8656250238418579, "rewards/RMReward/std": 0.09077214449644089, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 546, "train_speed(iter/s)": 0.017537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/mean_length": 111.8125, "completions/min_length": 99.0, "epoch": 0.008396524729070088, "frac_reward_zero_std": 0.0, "grad_norm": 1.550759196281433, "kl": 0.08538154512643814, "learning_rate": 4.1973603437691837e-07, "loss": 0.0029104165732860565, "memory(GiB)": 90.94, "reward": 0.8971250057220459, "reward_std": 0.09565763175487518, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.01767767407000065, "rewards/RMReward/mean": 0.8721874952316284, "rewards/RMReward/std": 0.12830746173858643, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 547, "train_speed(iter/s)": 0.017537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/mean_length": 94.4375, "completions/min_length": 8.0, "epoch": 0.008411874865686305, "frac_reward_zero_std": 0.0, "grad_norm": 28.22283363342285, "kl": 0.6111122965812683, "learning_rate": 4.205033763044813e-07, "loss": 0.0008160993456840515, "memory(GiB)": 90.94, "reward": 0.9038749933242798, "reward_std": 0.19985809922218323, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9081250429153442, "rewards/RMReward/std": 0.09403678774833679, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 548, "train_speed(iter/s)": 0.017523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 154.65625, "completions/min_length": 99.0, "epoch": 0.008427225002302521, "frac_reward_zero_std": 0.0, "grad_norm": 1.474994421005249, "kl": 0.058652184903621674, "learning_rate": 4.212707182320443e-07, "loss": -0.008277103304862976, "memory(GiB)": 90.94, "reward": 0.902013897895813, "reward_std": 0.05243222415447235, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9350694417953491, "rewards/PlanningActionSetORM/std": 0.0546601228415966, "rewards/RMReward/mean": 0.893750011920929, "rewards/RMReward/std": 0.10906493663787842, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 549, "train_speed(iter/s)": 0.017506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/mean_length": 39.15625, "completions/min_length": 8.0, "epoch": 0.008442575138918737, "frac_reward_zero_std": 0.0, "grad_norm": 29.138206481933594, "kl": 0.45258742570877075, "learning_rate": 4.220380601596072e-07, "loss": -0.005821805447340012, "memory(GiB)": 90.94, "reward": 0.7728124856948853, "reward_std": 0.23309358954429626, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.96875, "rewards/PlanningActionSetORM/std": 0.06718549132347107, "rewards/RMReward/mean": 0.6625000238418579, "rewards/RMReward/std": 0.1056724488735199, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 550, "train_speed(iter/s)": 0.01751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/mean_length": 116.78125, "completions/min_length": 90.0, "epoch": 0.008457925275534952, "frac_reward_zero_std": 0.0, "grad_norm": 2.328751802444458, "kl": 0.09372660517692566, "learning_rate": 4.2280540208717013e-07, "loss": 0.009069398045539856, "memory(GiB)": 90.94, "reward": 0.7472916841506958, "reward_std": 0.07466714084148407, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9802083373069763, "rewards/PlanningActionSetORM/std": 0.0485336109995842, "rewards/RMReward/mean": 0.6890624761581421, "rewards/RMReward/std": 0.11052248626947403, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 551, "train_speed(iter/s)": 0.017511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/mean_length": 98.71875, "completions/min_length": 84.0, "epoch": 0.008473275412151168, "frac_reward_zero_std": 0.0, "grad_norm": 2.284254789352417, "kl": 0.03993955999612808, "learning_rate": 4.2357274401473305e-07, "loss": -0.00237111933529377, "memory(GiB)": 90.94, "reward": 0.45559537410736084, "reward_std": 0.05033411085605621, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.840624988079071, "rewards/RMReward/std": 0.05836308375000954, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.03869073465466499, "rewards/VisualPerceptionAccuracy/std": 0.053977787494659424, "step": 552, "train_speed(iter/s)": 0.017518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/mean_length": 205.6875, "completions/min_length": 8.0, "epoch": 0.008488625548767384, "frac_reward_zero_std": 0.0, "grad_norm": 15.340797424316406, "kl": 0.5963845252990723, "learning_rate": 4.2434008594229593e-07, "loss": -0.11856265366077423, "memory(GiB)": 90.94, "reward": 0.5422555208206177, "reward_std": 0.22082078456878662, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.14388608932495117, "rewards/VisualPerceptionAccuracy/std": 0.2041415423154831, "step": 553, "train_speed(iter/s)": 0.017524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/mean_length": 100.8125, "completions/min_length": 90.0, "epoch": 0.0085039756853836, "frac_reward_zero_std": 0.0, "grad_norm": 1.7532166242599487, "kl": 0.11114498972892761, "learning_rate": 4.2510742786985885e-07, "loss": -0.008238507434725761, "memory(GiB)": 90.94, "reward": 0.8685937523841858, "reward_std": 0.04371711611747742, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.992968738079071, "rewards/PlanningActionSetORM/std": 0.027849232777953148, "rewards/RMReward/mean": 0.8375000357627869, "rewards/RMReward/std": 0.05819876492023468, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 554, "train_speed(iter/s)": 0.017527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/mean_length": 182.0625, "completions/min_length": 8.0, "epoch": 0.008519325821999817, "frac_reward_zero_std": 0.0, "grad_norm": 27.311548233032227, "kl": 0.4621959626674652, "learning_rate": 4.258747697974218e-07, "loss": 0.03306617587804794, "memory(GiB)": 90.94, "reward": 0.633276104927063, "reward_std": 0.23294416069984436, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.38530221581459045, "rewards/VisualPerceptionAccuracy/std": 0.1414015293121338, "step": 555, "train_speed(iter/s)": 0.017549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/mean_length": 115.71875, "completions/min_length": 95.0, "epoch": 0.008534675958616032, "frac_reward_zero_std": 0.0, "grad_norm": 2.382946491241455, "kl": 0.07324966788291931, "learning_rate": 4.266421117249847e-07, "loss": -0.012430686503648758, "memory(GiB)": 90.94, "reward": 0.9316249489784241, "reward_std": 0.052777811884880066, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9906250238418579, "rewards/PlanningActionSetORM/std": 0.029614463448524475, "rewards/RMReward/mean": 0.9168750047683716, "rewards/RMReward/std": 0.07818247377872467, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 556, "train_speed(iter/s)": 0.017514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/mean_length": 63.46875, "completions/min_length": 8.0, "epoch": 0.008550026095232248, "frac_reward_zero_std": 0.0, "grad_norm": 40.4548225402832, "kl": 0.5408037304878235, "learning_rate": 4.2740945365254763e-07, "loss": 0.0006609782576560974, "memory(GiB)": 90.94, "reward": 0.6694374680519104, "reward_std": 0.2339209020137787, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9900000095367432, "rewards/RMReward/std": 0.016329936683177948, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 557, "train_speed(iter/s)": 0.017517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 63.5, "completions/min_length": 8.0, "epoch": 0.008565376231848464, "frac_reward_zero_std": 0.0, "grad_norm": 20.03569793701172, "kl": 0.45225226879119873, "learning_rate": 4.2817679558011056e-07, "loss": 0.04373787343502045, "memory(GiB)": 90.94, "reward": 0.7837499976158142, "reward_std": 0.24811024963855743, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9249999523162842, "rewards/PlanningActionSetORM/std": 0.16124515235424042, "rewards/RMReward/mean": 0.7749999761581421, "rewards/RMReward/std": 0.09309493750333786, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 558, "train_speed(iter/s)": 0.017506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/mean_length": 166.0, "completions/min_length": 139.0, "epoch": 0.00858072636846468, "frac_reward_zero_std": 0.0, "grad_norm": 0.7790631651878357, "kl": 0.07847721874713898, "learning_rate": 4.289441375076735e-07, "loss": 0.0012160874903202057, "memory(GiB)": 90.94, "reward": 0.9547499418258667, "reward_std": 0.06468808650970459, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9434374570846558, "rewards/RMReward/std": 0.11746267974376678, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 559, "train_speed(iter/s)": 0.017486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 135.3125, "completions/min_length": 67.0, "epoch": 0.008596076505080895, "frac_reward_zero_std": 0.0, "grad_norm": 2.1699206829071045, "kl": 0.06706250458955765, "learning_rate": 4.297114794352364e-07, "loss": -0.04026159644126892, "memory(GiB)": 90.94, "reward": 0.38476017117500305, "reward_std": 0.06113031506538391, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.706250011920929, "rewards/RMReward/std": 0.13022416830062866, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.004520324524492025, "rewards/VisualPerceptionAccuracy/std": 0.0180812980979681, "step": 560, "train_speed(iter/s)": 0.017487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/mean_length": 8.125, "completions/min_length": 8.0, "epoch": 0.00861142664169711, "frac_reward_zero_std": 0.0, "grad_norm": 32.29910659790039, "kl": 0.8874655365943909, "learning_rate": 4.304788213627993e-07, "loss": 0.01558925211429596, "memory(GiB)": 90.94, "reward": 0.4062499701976776, "reward_std": 0.39974337816238403, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.49186936020851135, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 561, "train_speed(iter/s)": 0.017516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/mean_length": 110.3125, "completions/min_length": 85.0, "epoch": 0.008626776778313326, "frac_reward_zero_std": 0.0, "grad_norm": 2.0639595985412598, "kl": 0.07236211746931076, "learning_rate": 4.312461632903622e-07, "loss": 0.05358774960041046, "memory(GiB)": 90.94, "reward": 0.7767187356948853, "reward_std": 0.10688184201717377, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9898437261581421, "rewards/PlanningActionSetORM/std": 0.03229112550616264, "rewards/RMReward/mean": 0.7234375476837158, "rewards/RMReward/std": 0.13969111442565918, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 562, "train_speed(iter/s)": 0.017502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 110.5, "completions/min_length": 84.0, "epoch": 0.008642126914929544, "frac_reward_zero_std": 0.0, "grad_norm": 2.2335782051086426, "kl": 0.0811225101351738, "learning_rate": 4.3201350521792514e-07, "loss": 0.034636419266462326, "memory(GiB)": 90.94, "reward": 0.8261979818344116, "reward_std": 0.05919802933931351, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9559895992279053, "rewards/PlanningActionSetORM/std": 0.05836152657866478, "rewards/RMReward/mean": 0.793749988079071, "rewards/RMReward/std": 0.08590129762887955, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 563, "train_speed(iter/s)": 0.017477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 180.875, "completions/min_length": 83.0, "epoch": 0.00865747705154576, "frac_reward_zero_std": 0.0, "grad_norm": 1.4052962064743042, "kl": 0.0607801117002964, "learning_rate": 4.3278084714548806e-07, "loss": -0.013690892606973648, "memory(GiB)": 90.94, "reward": 0.8767187595367432, "reward_std": 0.03310541436076164, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.99609375, "rewards/PlanningActionSetORM/std": 0.022097086533904076, "rewards/RMReward/mean": 0.8468749523162842, "rewards/RMReward/std": 0.1573866754770279, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 564, "train_speed(iter/s)": 0.017481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 179.40625, "completions/min_length": 102.0, "epoch": 0.008672827188161975, "frac_reward_zero_std": 0.0, "grad_norm": 1.6774941682815552, "kl": 0.051788121461868286, "learning_rate": 4.33548189073051e-07, "loss": -0.013615619391202927, "memory(GiB)": 90.94, "reward": 0.9334999918937683, "reward_std": 0.08713552355766296, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9168750047683716, "rewards/RMReward/std": 0.13653399050235748, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 565, "train_speed(iter/s)": 0.017459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/mean_length": 146.71875, "completions/min_length": 8.0, "epoch": 0.00868817732477819, "frac_reward_zero_std": 0.0, "grad_norm": 65.97832489013672, "kl": 0.2312832772731781, "learning_rate": 4.343155310006139e-07, "loss": 0.06458880007266998, "memory(GiB)": 90.94, "reward": 0.7815868258476257, "reward_std": 0.26885733008384705, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9939931631088257, "rewards/PlanningActionSetORM/std": 0.016498729586601257, "rewards/RMReward/mean": 0.6781249642372131, "rewards/RMReward/std": 0.19058573246002197, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 566, "train_speed(iter/s)": 0.017441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 149.4375, "completions/min_length": 8.0, "epoch": 0.008703527461394406, "frac_reward_zero_std": 0.0, "grad_norm": 32.040733337402344, "kl": 0.42528268694877625, "learning_rate": 4.3508287292817684e-07, "loss": -0.006122194230556488, "memory(GiB)": 90.94, "reward": 0.587565004825592, "reward_std": 0.2678382992744446, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9319004416465759, "rewards/PlanningActionSetORM/std": 0.004790589679032564, "rewards/RMReward/mean": 0.7281249761581421, "rewards/RMReward/std": 0.07520805299282074, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 567, "train_speed(iter/s)": 0.017437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/mean_length": 173.25, "completions/min_length": 111.0, "epoch": 0.008718877598010622, "frac_reward_zero_std": 0.0, "grad_norm": 1.4327222108840942, "kl": 0.04361701011657715, "learning_rate": 4.3585021485573977e-07, "loss": 0.045531343668699265, "memory(GiB)": 90.94, "reward": 0.8679227828979492, "reward_std": 0.048548340797424316, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8721137046813965, "rewards/PlanningActionSetORM/std": 0.097069151699543, "rewards/RMReward/mean": 0.8668749928474426, "rewards/RMReward/std": 0.1550741195678711, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 568, "train_speed(iter/s)": 0.017439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/mean_length": 57.6875, "completions/min_length": 8.0, "epoch": 0.008734227734626837, "frac_reward_zero_std": 0.0, "grad_norm": 30.25725555419922, "kl": 0.5894811749458313, "learning_rate": 4.366175567833027e-07, "loss": -0.02346336841583252, "memory(GiB)": 90.94, "reward": 0.7975000143051147, "reward_std": 0.23688730597496033, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7906249761581421, "rewards/RMReward/std": 0.061152148991823196, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 569, "train_speed(iter/s)": 0.017453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 234.28125, "completions/min_length": 125.0, "epoch": 0.008749577871243055, "frac_reward_zero_std": 0.0, "grad_norm": 1.3292571306228638, "kl": 0.025541206821799278, "learning_rate": 4.3738489871086557e-07, "loss": -0.012296713888645172, "memory(GiB)": 90.94, "reward": 0.5427807569503784, "reward_std": 0.10222195088863373, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8187500238418579, "rewards/RMReward/std": 0.06020797789096832, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.23056155443191528, "rewards/VisualPerceptionAccuracy/std": 0.15627756714820862, "step": 570, "train_speed(iter/s)": 0.017457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/mean_length": 90.25, "completions/min_length": 67.0, "epoch": 0.00876492800785927, "frac_reward_zero_std": 0.0, "grad_norm": 2.989969253540039, "kl": 0.1062743216753006, "learning_rate": 4.381522406384285e-07, "loss": -0.005557361990213394, "memory(GiB)": 90.94, "reward": 0.866812527179718, "reward_std": 0.08185769617557526, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9765625, "rewards/PlanningActionSetORM/std": 0.09753772616386414, "rewards/RMReward/mean": 0.8393750190734863, "rewards/RMReward/std": 0.13734668493270874, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 571, "train_speed(iter/s)": 0.017438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/mean_length": 139.875, "completions/min_length": 8.0, "epoch": 0.008780278144475486, "frac_reward_zero_std": 0.0, "grad_norm": 24.3915958404541, "kl": 0.47152605652809143, "learning_rate": 4.389195825659914e-07, "loss": -0.053417470306158066, "memory(GiB)": 90.94, "reward": 0.6467581391334534, "reward_std": 0.18393921852111816, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.35289129614830017, "rewards/VisualPerceptionAccuracy/std": 0.13037846982479095, "step": 572, "train_speed(iter/s)": 0.017436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/mean_length": 144.46875, "completions/min_length": 14.0, "epoch": 0.008795628281091702, "frac_reward_zero_std": 0.0, "grad_norm": 6.441849708557129, "kl": 0.21305912733078003, "learning_rate": 4.3968692449355435e-07, "loss": -0.21266570687294006, "memory(GiB)": 90.94, "reward": 0.5670245885848999, "reward_std": 0.19326087832450867, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.19342416524887085, "rewards/VisualPerceptionAccuracy/std": 0.14902174472808838, "step": 573, "train_speed(iter/s)": 0.017458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/mean_length": 61.59375, "completions/min_length": 13.0, "epoch": 0.008810978417707917, "frac_reward_zero_std": 0.0, "grad_norm": 8.118145942687988, "kl": 0.25410497188568115, "learning_rate": 4.4045426642111727e-07, "loss": -0.007235966622829437, "memory(GiB)": 90.94, "reward": 0.6116249561309814, "reward_std": 0.23926392197608948, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9937499761581421, "rewards/PlanningActionSetORM/std": 0.025000005960464478, "rewards/RMReward/mean": 0.9212499856948853, "rewards/RMReward/std": 0.06238321587443352, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 574, "train_speed(iter/s)": 0.01747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 146.6875, "completions/min_length": 105.0, "epoch": 0.008826328554324133, "frac_reward_zero_std": 0.0, "grad_norm": 1.8825360536575317, "kl": 0.06384100019931793, "learning_rate": 4.412216083486802e-07, "loss": 0.08740770071744919, "memory(GiB)": 90.94, "reward": 0.8283437490463257, "reward_std": 0.07117746770381927, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.98046875, "rewards/PlanningActionSetORM/std": 0.04151097312569618, "rewards/RMReward/mean": 0.7903125286102295, "rewards/RMReward/std": 0.1254793256521225, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 575, "train_speed(iter/s)": 0.017473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 170.15625, "completions/min_length": 8.0, "epoch": 0.008841678690940349, "frac_reward_zero_std": 0.0, "grad_norm": 18.822328567504883, "kl": 0.3741300404071808, "learning_rate": 4.419889502762431e-07, "loss": 0.023307379335165024, "memory(GiB)": 90.94, "reward": 0.30243220925331116, "reward_std": 0.3270210027694702, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.19861441850662231, "rewards/VisualPerceptionAccuracy/std": 0.17904198169708252, "step": 576, "train_speed(iter/s)": 0.017497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/mean_length": 54.96875, "completions/min_length": 8.0, "epoch": 0.008857028827556566, "frac_reward_zero_std": 0.0, "grad_norm": 17.96384620666504, "kl": 0.8515878915786743, "learning_rate": 4.4275629220380605e-07, "loss": 0.0010462142527103424, "memory(GiB)": 90.94, "reward": 0.9241250157356262, "reward_std": 0.17307545244693756, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9900000095367432, "rewards/RMReward/std": 0.02708013541996479, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 577, "train_speed(iter/s)": 0.017502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/mean_length": 140.71875, "completions/min_length": 127.0, "epoch": 0.008872378964172782, "frac_reward_zero_std": 0.0, "grad_norm": 1.653166651725769, "kl": 0.08731576800346375, "learning_rate": 4.43523634131369e-07, "loss": 0.0009134579449892044, "memory(GiB)": 90.94, "reward": 0.9182500243186951, "reward_std": 0.04546841233968735, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8978124856948853, "rewards/RMReward/std": 0.12808097898960114, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 578, "train_speed(iter/s)": 0.017483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 117.9375, "completions/min_length": 8.0, "epoch": 0.008887729100788997, "frac_reward_zero_std": 0.0, "grad_norm": 12.71835994720459, "kl": 0.4601368010044098, "learning_rate": 4.4429097605893185e-07, "loss": -0.00019210577011108398, "memory(GiB)": 90.94, "reward": 0.963437557220459, "reward_std": 0.1376245766878128, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.96875, "rewards/PlanningActionSetORM/std": 0.125, "rewards/RMReward/mean": 0.9906250238418579, "rewards/RMReward/std": 0.03749999403953552, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 579, "train_speed(iter/s)": 0.017481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 185.125, "completions/min_length": 84.0, "epoch": 0.008903079237405213, "frac_reward_zero_std": 0.0, "grad_norm": 1.9480174779891968, "kl": 0.052940186113119125, "learning_rate": 4.450583179864948e-07, "loss": 0.005497166886925697, "memory(GiB)": 90.94, "reward": 0.5519827604293823, "reward_std": 0.1890845149755478, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8581249713897705, "rewards/RMReward/std": 0.12666063010692596, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.21746546030044556, "rewards/VisualPerceptionAccuracy/std": 0.27684053778648376, "step": 580, "train_speed(iter/s)": 0.017493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 209.34375, "completions/min_length": 104.0, "epoch": 0.008918429374021429, "frac_reward_zero_std": 0.0, "grad_norm": 0.3525387942790985, "kl": 0.08008112013339996, "learning_rate": 4.458256599140577e-07, "loss": 0.0059462375938892365, "memory(GiB)": 90.94, "reward": 0.9267500638961792, "reward_std": 0.05439918860793114, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.9240624904632568, "rewards/RMReward/std": 0.10251230001449585, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 581, "train_speed(iter/s)": 0.017491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/mean_length": 103.0, "completions/min_length": 8.0, "epoch": 0.008933779510637644, "frac_reward_zero_std": 0.0, "grad_norm": 58.874534606933594, "kl": 0.3435724377632141, "learning_rate": 4.4659300184162063e-07, "loss": 0.002290060743689537, "memory(GiB)": 90.94, "reward": 0.8131250143051147, "reward_std": 0.2520487606525421, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9781250357627869, "rewards/RMReward/std": 0.03637193143367767, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 582, "train_speed(iter/s)": 0.017484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 176.71875, "completions/min_length": 104.0, "epoch": 0.00894912964725386, "frac_reward_zero_std": 0.0, "grad_norm": 1.1037920713424683, "kl": 0.06844905018806458, "learning_rate": 4.4736034376918356e-07, "loss": 0.019016560167074203, "memory(GiB)": 90.94, "reward": 0.9326249957084656, "reward_std": 0.12186820805072784, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.01767767407000065, "rewards/RMReward/mean": 0.9165624976158142, "rewards/RMReward/std": 0.1866445541381836, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 583, "train_speed(iter/s)": 0.017476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 192.40625, "completions/min_length": 99.0, "epoch": 0.008964479783870076, "frac_reward_zero_std": 0.0, "grad_norm": 2.342719793319702, "kl": 0.02092200517654419, "learning_rate": 4.481276856967465e-07, "loss": -0.030644044280052185, "memory(GiB)": 90.94, "reward": 0.5619251132011414, "reward_std": 0.06645327806472778, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9993749856948853, "rewards/RMReward/std": 0.002499997615814209, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.12435022741556168, "rewards/VisualPerceptionAccuracy/std": 0.13090656697750092, "step": 584, "train_speed(iter/s)": 0.017477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/mean_length": 155.84375, "completions/min_length": 105.0, "epoch": 0.008979829920486293, "frac_reward_zero_std": 0.0, "grad_norm": 1.5347084999084473, "kl": 0.12025482207536697, "learning_rate": 4.4889502762430946e-07, "loss": 0.0009217485785484314, "memory(GiB)": 90.94, "reward": 0.867888867855072, "reward_std": 0.049890048801898956, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.8487499952316284, "rewards/RMReward/std": 0.1532813161611557, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 585, "train_speed(iter/s)": 0.017444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 126.875, "completions/min_length": 95.0, "epoch": 0.008995180057102509, "frac_reward_zero_std": 0.0, "grad_norm": 1.0479369163513184, "kl": 0.11135934293270111, "learning_rate": 4.496623695518724e-07, "loss": 0.002398252487182617, "memory(GiB)": 90.94, "reward": 0.7927500009536743, "reward_std": 0.09846886992454529, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7409375309944153, "rewards/RMReward/std": 0.26336970925331116, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 586, "train_speed(iter/s)": 0.017437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/mean_length": 121.0, "completions/min_length": 8.0, "epoch": 0.009010530193718724, "frac_reward_zero_std": 0.0, "grad_norm": 36.21721649169922, "kl": 0.4586048126220703, "learning_rate": 4.504297114794353e-07, "loss": 0.01121777668595314, "memory(GiB)": 90.94, "reward": 0.9398750066757202, "reward_std": 0.16385585069656372, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9981249570846558, "rewards/RMReward/std": 0.0040311249904334545, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 587, "train_speed(iter/s)": 0.017446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/mean_length": 56.4375, "completions/min_length": 8.0, "epoch": 0.00902588033033494, "frac_reward_zero_std": 0.0, "grad_norm": 22.352365493774414, "kl": 0.4933631122112274, "learning_rate": 4.5119705340699824e-07, "loss": -0.002086438238620758, "memory(GiB)": 90.94, "reward": 0.8953125476837158, "reward_std": 0.13874998688697815, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8125, "rewards/RMReward/std": 0.05000000819563866, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 588, "train_speed(iter/s)": 0.017445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/mean_length": 60.0, "completions/min_length": 8.0, "epoch": 0.009041230466951156, "frac_reward_zero_std": 0.0, "grad_norm": 24.598888397216797, "kl": 0.34119725227355957, "learning_rate": 4.5196439533456117e-07, "loss": -0.05185960978269577, "memory(GiB)": 90.94, "reward": 0.5712500214576721, "reward_std": 0.2835601270198822, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8187500238418579, "rewards/RMReward/std": 0.17783419787883759, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 589, "train_speed(iter/s)": 0.017416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/mean_length": 130.78125, "completions/min_length": 98.0, "epoch": 0.009056580603567371, "frac_reward_zero_std": 0.0, "grad_norm": 1.1347893476486206, "kl": 0.09094809740781784, "learning_rate": 4.5273173726212404e-07, "loss": 0.002010021358728409, "memory(GiB)": 90.94, "reward": 0.8047499656677246, "reward_std": 0.09914431720972061, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9937499761581421, "rewards/PlanningActionSetORM/std": 0.024593474343419075, "rewards/RMReward/mean": 0.7574999928474426, "rewards/RMReward/std": 0.1854201704263687, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 590, "train_speed(iter/s)": 0.01739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 107.53125, "completions/min_length": 80.0, "epoch": 0.009071930740183587, "frac_reward_zero_std": 0.0, "grad_norm": 2.549037218093872, "kl": 0.10580957680940628, "learning_rate": 4.5349907918968697e-07, "loss": 0.05764421820640564, "memory(GiB)": 90.94, "reward": 0.874218761920929, "reward_std": 0.050314806401729584, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.96484375, "rewards/PlanningActionSetORM/std": 0.07028915733098984, "rewards/RMReward/mean": 0.8515625, "rewards/RMReward/std": 0.06535802781581879, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 591, "train_speed(iter/s)": 0.017386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/mean_length": 58.6875, "completions/min_length": 8.0, "epoch": 0.009087280876799804, "frac_reward_zero_std": 0.0, "grad_norm": 47.3668327331543, "kl": 0.5560216307640076, "learning_rate": 4.542664211172499e-07, "loss": -0.0030506588518619537, "memory(GiB)": 90.94, "reward": 0.5811458826065063, "reward_std": 0.2985491156578064, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8958333730697632, "rewards/PlanningActionSetORM/std": 0.11702326685190201, "rewards/RMReward/mean": 0.6468750238418579, "rewards/RMReward/std": 0.12970319390296936, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 592, "train_speed(iter/s)": 0.017392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/mean_length": 141.5625, "completions/min_length": 117.0, "epoch": 0.00910263101341602, "frac_reward_zero_std": 0.0, "grad_norm": 0.6120684742927551, "kl": 0.08954606205224991, "learning_rate": 4.550337630448128e-07, "loss": -0.004499765112996101, "memory(GiB)": 90.94, "reward": 0.8925000429153442, "reward_std": 0.04278245195746422, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8656250238418579, "rewards/RMReward/std": 0.11670026183128357, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 593, "train_speed(iter/s)": 0.017367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 271.75, "completions/min_length": 96.0, "epoch": 0.009117981150032236, "frac_reward_zero_std": 0.0, "grad_norm": 1.3321164846420288, "kl": 0.07901132106781006, "learning_rate": 4.5580110497237574e-07, "loss": -0.1565173715353012, "memory(GiB)": 90.94, "reward": 0.4044298529624939, "reward_std": 0.1307089775800705, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.5093749761581421, "rewards/RMReward/std": 0.1724516898393631, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.20135968923568726, "rewards/VisualPerceptionAccuracy/std": 0.12345661967992783, "step": 594, "train_speed(iter/s)": 0.017359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/mean_length": 83.625, "completions/min_length": 8.0, "epoch": 0.009133331286648451, "frac_reward_zero_std": 0.0, "grad_norm": 40.33875274658203, "kl": 0.5838109254837036, "learning_rate": 4.5656844689993867e-07, "loss": -0.004807736724615097, "memory(GiB)": 90.94, "reward": 0.6866250038146973, "reward_std": 0.2532902956008911, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9587500095367432, "rewards/RMReward/std": 0.039475735276937485, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 595, "train_speed(iter/s)": 0.017358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 129.96875, "completions/min_length": 88.0, "epoch": 0.009148681423264667, "frac_reward_zero_std": 0.0, "grad_norm": 2.3301827907562256, "kl": 0.08425301313400269, "learning_rate": 4.573357888275016e-07, "loss": -0.014724839478731155, "memory(GiB)": 90.94, "reward": 0.7857500314712524, "reward_std": 0.08010542392730713, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.925000011920929, "rewards/PlanningActionSetORM/std": 0.09713642299175262, "rewards/RMReward/mean": 0.7509374618530273, "rewards/RMReward/std": 0.12282584607601166, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 596, "train_speed(iter/s)": 0.017351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/mean_length": 130.40625, "completions/min_length": 60.0, "epoch": 0.009164031559880883, "frac_reward_zero_std": 0.0, "grad_norm": 2.407496929168701, "kl": 0.03327801823616028, "learning_rate": 4.581031307550645e-07, "loss": -0.005645018070936203, "memory(GiB)": 90.94, "reward": 0.50644850730896, "reward_std": 0.07055769115686417, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9493750333786011, "rewards/RMReward/std": 0.016520196571946144, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.05339701473712921, "rewards/VisualPerceptionAccuracy/std": 0.12789922952651978, "step": 597, "train_speed(iter/s)": 0.017341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/mean_length": 271.84375, "completions/min_length": 2.0, "epoch": 0.009179381696497098, "frac_reward_zero_std": 0.0, "grad_norm": 99.80846405029297, "kl": 0.014450881630182266, "learning_rate": 4.5887047268262745e-07, "loss": 0.15046417713165283, "memory(GiB)": 90.94, "reward": 0.31651127338409424, "reward_std": 0.33079996705055237, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.31651127338409424, "rewards/VisualPerceptionAccuracy/std": 0.3910312354564667, "step": 598, "train_speed(iter/s)": 0.017356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 200.28125, "completions/min_length": 8.0, "epoch": 0.009194731833113316, "frac_reward_zero_std": 0.0, "grad_norm": 34.47993087768555, "kl": 0.4442765712738037, "learning_rate": 4.596378146101903e-07, "loss": 0.030552756041288376, "memory(GiB)": 90.94, "reward": 0.7645272016525269, "reward_std": 0.28800609707832336, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9471473097801208, "rewards/PlanningActionSetORM/std": 0.00355541636236012, "rewards/RMReward/mean": 0.7956249713897705, "rewards/RMReward/std": 0.15152420103549957, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 599, "train_speed(iter/s)": 0.017352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/mean_length": 207.6875, "completions/min_length": 90.0, "epoch": 0.009210081969729531, "frac_reward_zero_std": 0.0, "grad_norm": 1.0973620414733887, "kl": 0.06108545884490013, "learning_rate": 4.6040515653775325e-07, "loss": -0.1446862816810608, "memory(GiB)": 90.94, "reward": 0.8406842350959778, "reward_std": 0.06976573169231415, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9934210777282715, "rewards/PlanningActionSetORM/std": 0.01768476888537407, "rewards/RMReward/mean": 0.8025000095367432, "rewards/RMReward/std": 0.2280350774526596, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 600, "train_speed(iter/s)": 0.017327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/mean_length": 136.25, "completions/min_length": 13.0, "epoch": 0.009225432106345747, "frac_reward_zero_std": 0.0, "grad_norm": 14.133525848388672, "kl": 0.05291406810283661, "learning_rate": 4.611724984653162e-07, "loss": 0.00985398143529892, "memory(GiB)": 90.94, "reward": 0.3519226312637329, "reward_std": 0.228829488158226, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": 0.0007202774286270142, "rewards/VisualPerceptionAccuracy/std": 0.0028811099473387003, "step": 601, "train_speed(iter/s)": 0.017314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 152.1875, "completions/min_length": 8.0, "epoch": 0.009240782242961963, "frac_reward_zero_std": 0.0, "grad_norm": 27.11223793029785, "kl": 0.42544639110565186, "learning_rate": 4.619398403928791e-07, "loss": 0.011867377907037735, "memory(GiB)": 90.94, "reward": 0.8286948800086975, "reward_std": 0.27843618392944336, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9338235855102539, "rewards/PlanningActionSetORM/std": 0.0019607841968536377, "rewards/RMReward/mean": 0.9593750238418579, "rewards/RMReward/std": 0.1280868947505951, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 602, "train_speed(iter/s)": 0.017307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 121.03125, "completions/min_length": 8.0, "epoch": 0.009256132379578178, "frac_reward_zero_std": 0.0, "grad_norm": 51.08638000488281, "kl": 0.22635866701602936, "learning_rate": 4.6270718232044203e-07, "loss": -0.06590534001588821, "memory(GiB)": 90.94, "reward": 0.33687329292297363, "reward_std": 0.2774176597595215, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": 0.14874663949012756, "rewards/VisualPerceptionAccuracy/std": 0.0642574280500412, "step": 603, "train_speed(iter/s)": 0.017317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2049.0, "completions/mean_length": 628.21875, "completions/min_length": 216.0, "epoch": 0.009271482516194394, "frac_reward_zero_std": 0.0, "grad_norm": 1.2735247611999512, "kl": 0.012090755626559258, "learning_rate": 4.6347452424800495e-07, "loss": -0.027233945205807686, "memory(GiB)": 90.94, "reward": 0.24681934714317322, "reward_std": 0.17978903651237488, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.24681934714317322, "rewards/VisualPerceptionAccuracy/std": 0.17948441207408905, "step": 604, "train_speed(iter/s)": 0.017325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/mean_length": 85.15625, "completions/min_length": 8.0, "epoch": 0.00928683265281061, "frac_reward_zero_std": 0.0, "grad_norm": 35.17012023925781, "kl": 0.3396991491317749, "learning_rate": 4.642418661755679e-07, "loss": 0.002721088472753763, "memory(GiB)": 90.94, "reward": 0.7728124856948853, "reward_std": 0.279270201921463, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8031250238418579, "rewards/RMReward/std": 0.12970317900180817, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 605, "train_speed(iter/s)": 0.017319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/mean_length": 129.40625, "completions/min_length": 92.0, "epoch": 0.009302182789426825, "frac_reward_zero_std": 0.0, "grad_norm": 2.8840413093566895, "kl": 0.0679096207022667, "learning_rate": 4.650092081031308e-07, "loss": 0.023494787514209747, "memory(GiB)": 90.94, "reward": 0.5271163582801819, "reward_std": 0.17083440721035004, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9479166865348816, "rewards/PlanningActionSetORM/std": 0.10481465607881546, "rewards/RMReward/mean": 0.828125, "rewards/RMReward/std": 0.12512493133544922, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2021494209766388, "rewards/VisualPerceptionAccuracy/std": 0.23732168972492218, "step": 606, "train_speed(iter/s)": 0.017319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/mean_length": 309.0, "completions/min_length": 162.0, "epoch": 0.009317532926043042, "frac_reward_zero_std": 0.0, "grad_norm": 1.0900659561157227, "kl": 0.03459532558917999, "learning_rate": 4.657765500306937e-07, "loss": -0.014526225626468658, "memory(GiB)": 90.94, "reward": 0.5067848563194275, "reward_std": 0.09481371194124222, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9025000333786011, "rewards/RMReward/std": 0.10661457479000092, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09156971424818039, "rewards/VisualPerceptionAccuracy/std": 0.10433577746152878, "step": 607, "train_speed(iter/s)": 0.017309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/mean_length": 138.75, "completions/min_length": 69.0, "epoch": 0.009332883062659258, "frac_reward_zero_std": 0.0, "grad_norm": 2.3556864261627197, "kl": 0.0610874705016613, "learning_rate": 4.665438919582566e-07, "loss": 0.07710473239421844, "memory(GiB)": 90.94, "reward": 0.6680654883384705, "reward_std": 0.14243462681770325, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9403273463249207, "rewards/PlanningActionSetORM/std": 0.10256687551736832, "rewards/RMReward/mean": 0.6000000238418579, "rewards/RMReward/std": 0.17133253812789917, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 608, "train_speed(iter/s)": 0.01728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/mean_length": 225.65625, "completions/min_length": 86.0, "epoch": 0.009348233199275474, "frac_reward_zero_std": 0.0, "grad_norm": 1.5993715524673462, "kl": 0.07990635931491852, "learning_rate": 4.6731123388581953e-07, "loss": -0.04980270937085152, "memory(GiB)": 90.94, "reward": 0.5053103566169739, "reward_std": 0.041737303137779236, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9437500238418579, "rewards/RMReward/std": 0.017078254371881485, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.05562075227499008, "rewards/VisualPerceptionAccuracy/std": 0.06981198489665985, "step": 609, "train_speed(iter/s)": 0.017284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 134.28125, "completions/min_length": 103.0, "epoch": 0.00936358333589169, "frac_reward_zero_std": 0.0, "grad_norm": 1.9346609115600586, "kl": 0.09240606427192688, "learning_rate": 4.6807857581338246e-07, "loss": 0.04608698934316635, "memory(GiB)": 90.94, "reward": 0.7616374492645264, "reward_std": 0.06798338890075684, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9281870126724243, "rewards/PlanningActionSetORM/std": 0.10665634274482727, "rewards/RMReward/mean": 0.7200000286102295, "rewards/RMReward/std": 0.11623113602399826, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 610, "train_speed(iter/s)": 0.017289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/mean_length": 104.8125, "completions/min_length": 72.0, "epoch": 0.009378933472507905, "frac_reward_zero_std": 0.0, "grad_norm": 1.9158326387405396, "kl": 0.11491276323795319, "learning_rate": 4.688459177409454e-07, "loss": -0.03935626894235611, "memory(GiB)": 90.94, "reward": 0.7992187738418579, "reward_std": 0.07561680674552917, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9085937738418579, "rewards/PlanningActionSetORM/std": 0.09767650067806244, "rewards/RMReward/mean": 0.7718750238418579, "rewards/RMReward/std": 0.10993950068950653, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 611, "train_speed(iter/s)": 0.017276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/mean_length": 147.09375, "completions/min_length": 91.0, "epoch": 0.00939428360912412, "frac_reward_zero_std": 0.0, "grad_norm": 1.2587531805038452, "kl": 0.09362341463565826, "learning_rate": 4.696132596685083e-07, "loss": -0.00787162035703659, "memory(GiB)": 90.94, "reward": 0.9187499284744263, "reward_std": 0.05474440008401871, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8984375, "rewards/RMReward/std": 0.06778091937303543, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 612, "train_speed(iter/s)": 0.017238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 81.5, "completions/min_length": 8.0, "epoch": 0.009409633745740336, "frac_reward_zero_std": 0.0, "grad_norm": 19.435728073120117, "kl": 0.41806167364120483, "learning_rate": 4.7038060159607124e-07, "loss": 0.009538061916828156, "memory(GiB)": 90.94, "reward": 0.4921875, "reward_std": 0.14691615104675293, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.84375, "rewards/RMReward/std": 0.0704154297709465, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 613, "train_speed(iter/s)": 0.017242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 111.875, "completions/min_length": 91.0, "epoch": 0.009424983882356554, "frac_reward_zero_std": 0.0, "grad_norm": 3.4082539081573486, "kl": 0.061993785202503204, "learning_rate": 4.7114794352363416e-07, "loss": 0.011808447539806366, "memory(GiB)": 90.94, "reward": 0.5533921718597412, "reward_std": 0.09131033718585968, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.981249988079071, "rewards/PlanningActionSetORM/std": 0.05123474821448326, "rewards/RMReward/mean": 1.0, "rewards/RMReward/std": 0.0, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.11053426563739777, "rewards/VisualPerceptionAccuracy/std": 0.1723737269639969, "step": 614, "train_speed(iter/s)": 0.017241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/mean_length": 58.1875, "completions/min_length": 8.0, "epoch": 0.00944033401897277, "frac_reward_zero_std": 0.0, "grad_norm": 35.315887451171875, "kl": 0.807303786277771, "learning_rate": 4.719152854511971e-07, "loss": -0.013342161662876606, "memory(GiB)": 90.94, "reward": 0.8448660373687744, "reward_std": 0.21468135714530945, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9892857074737549, "rewards/PlanningActionSetORM/std": 0.02973809465765953, "rewards/RMReward/mean": 0.8374999761581421, "rewards/RMReward/std": 0.056273143738508224, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 615, "train_speed(iter/s)": 0.017238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 277.375, "completions/min_length": 127.0, "epoch": 0.009455684155588985, "frac_reward_zero_std": 0.0, "grad_norm": 0.8955515623092651, "kl": 0.0511813759803772, "learning_rate": 4.7268262737875996e-07, "loss": 0.054938171058893204, "memory(GiB)": 90.94, "reward": 0.8118830919265747, "reward_std": 0.09461420029401779, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9656651616096497, "rewards/PlanningActionSetORM/std": 0.036614157259464264, "rewards/RMReward/mean": 0.7734375, "rewards/RMReward/std": 0.12571369111537933, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 616, "train_speed(iter/s)": 0.017216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 136.875, "completions/min_length": 94.0, "epoch": 0.0094710342922052, "frac_reward_zero_std": 0.0, "grad_norm": 1.0992785692214966, "kl": 0.07812923192977905, "learning_rate": 4.734499693063229e-07, "loss": 0.02326810359954834, "memory(GiB)": 90.94, "reward": 0.7925000190734863, "reward_std": 0.05655529722571373, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9624999761581421, "rewards/PlanningActionSetORM/std": 0.11845782399177551, "rewards/RMReward/mean": 0.75, "rewards/RMReward/std": 0.0832795575261116, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 617, "train_speed(iter/s)": 0.017196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 91.25, "completions/min_length": 8.0, "epoch": 0.009486384428821416, "frac_reward_zero_std": 0.0, "grad_norm": 30.45676612854004, "kl": 0.5183064937591553, "learning_rate": 4.742173112338858e-07, "loss": -0.02953854575753212, "memory(GiB)": 90.94, "reward": 0.4062679708003998, "reward_std": 0.32646462321281433, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.22816091775894165, "rewards/VisualPerceptionAccuracy/std": 0.16619910299777985, "step": 618, "train_speed(iter/s)": 0.01722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 76.625, "completions/min_length": 8.0, "epoch": 0.009501734565437632, "frac_reward_zero_std": 0.0, "grad_norm": 13.065408706665039, "kl": 0.6590441465377808, "learning_rate": 4.7498465316144874e-07, "loss": 0.042958177626132965, "memory(GiB)": 90.94, "reward": 0.4646875262260437, "reward_std": 0.20515987277030945, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7749999761581421, "rewards/RMReward/std": 0.21602468192577362, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 619, "train_speed(iter/s)": 0.017215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/mean_length": 59.40625, "completions/min_length": 8.0, "epoch": 0.009517084702053848, "frac_reward_zero_std": 0.0, "grad_norm": 59.381385803222656, "kl": 0.5833268761634827, "learning_rate": 4.757519950890117e-07, "loss": -0.013209369033575058, "memory(GiB)": 90.94, "reward": 0.6159374713897705, "reward_std": 0.25826501846313477, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.856249988079071, "rewards/RMReward/std": 0.07719024270772934, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 620, "train_speed(iter/s)": 0.017216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/mean_length": 57.0625, "completions/min_length": 8.0, "epoch": 0.009532434838670063, "frac_reward_zero_std": 0.0, "grad_norm": 9.812414169311523, "kl": 0.49791303277015686, "learning_rate": 4.7651933701657465e-07, "loss": 0.011210789903998375, "memory(GiB)": 90.94, "reward": 0.5404375195503235, "reward_std": 0.12079939246177673, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9956250190734863, "rewards/RMReward/std": 0.005123470444232225, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 621, "train_speed(iter/s)": 0.017197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 188.5, "completions/min_length": 95.0, "epoch": 0.00954778497528628, "frac_reward_zero_std": 0.0, "grad_norm": 2.2387166023254395, "kl": 0.06921859830617905, "learning_rate": 4.772866789441376e-07, "loss": 0.023870810866355896, "memory(GiB)": 90.94, "reward": 0.5462729334831238, "reward_std": 0.05513911694288254, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.9299999475479126, "rewards/RMReward/std": 0.049531131982803345, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.15354596078395844, "rewards/VisualPerceptionAccuracy/std": 0.06309632211923599, "step": 622, "train_speed(iter/s)": 0.017195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/mean_length": 105.875, "completions/min_length": 100.0, "epoch": 0.009563135111902496, "frac_reward_zero_std": 0.0, "grad_norm": 1.5164318084716797, "kl": 0.12020145356655121, "learning_rate": 4.780540208717004e-07, "loss": 0.008200649172067642, "memory(GiB)": 90.94, "reward": 0.8531250357627869, "reward_std": 0.04681898280978203, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.01767767407000065, "rewards/RMReward/mean": 0.817187488079071, "rewards/RMReward/std": 0.07362653315067291, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 623, "train_speed(iter/s)": 0.017193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 184.125, "completions/min_length": 106.0, "epoch": 0.009578485248518712, "frac_reward_zero_std": 0.0, "grad_norm": 0.8766635060310364, "kl": 0.06956459581851959, "learning_rate": 4.788213627992634e-07, "loss": 0.001507822424173355, "memory(GiB)": 90.94, "reward": 0.9465577006340027, "reward_std": 0.03820435330271721, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9615384340286255, "rewards/PlanningActionSetORM/std": 0.03907695785164833, "rewards/RMReward/mean": 0.9428125023841858, "rewards/RMReward/std": 0.07701002806425095, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 624, "train_speed(iter/s)": 0.017167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/mean_length": 120.625, "completions/min_length": 76.0, "epoch": 0.009593835385134928, "frac_reward_zero_std": 0.0, "grad_norm": 2.611072063446045, "kl": 0.048769038170576096, "learning_rate": 4.795887047268263e-07, "loss": -0.017616480588912964, "memory(GiB)": 90.94, "reward": 0.4842023551464081, "reward_std": 0.13116416335105896, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9727678298950195, "rewards/PlanningActionSetORM/std": 0.10892858356237411, "rewards/RMReward/mean": 0.8743749856948853, "rewards/RMReward/std": 0.14778220653533936, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07435113191604614, "rewards/VisualPerceptionAccuracy/std": 0.13193535804748535, "step": 625, "train_speed(iter/s)": 0.017164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/mean_length": 235.90625, "completions/min_length": 98.0, "epoch": 0.009609185521751143, "frac_reward_zero_std": 0.0, "grad_norm": 2.445434093475342, "kl": 0.021940739825367928, "learning_rate": 4.803560466543893e-07, "loss": 0.09565643221139908, "memory(GiB)": 90.94, "reward": 0.3417258858680725, "reward_std": 0.11365124583244324, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3417258858680725, "rewards/VisualPerceptionAccuracy/std": 0.12439797818660736, "step": 626, "train_speed(iter/s)": 0.017178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/mean_length": 11.1875, "completions/min_length": 8.0, "epoch": 0.009624535658367359, "frac_reward_zero_std": 0.0, "grad_norm": 25.523723602294922, "kl": 0.48496612906455994, "learning_rate": 4.811233885819522e-07, "loss": 0.013758648186922073, "memory(GiB)": 90.94, "reward": 0.46562498807907104, "reward_std": 0.4188675582408905, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.504016101360321, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 627, "train_speed(iter/s)": 0.017202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/mean_length": 130.0, "completions/min_length": 83.0, "epoch": 0.009639885794983575, "frac_reward_zero_std": 0.0, "grad_norm": 2.285313367843628, "kl": 0.08535836637020111, "learning_rate": 4.818907305095151e-07, "loss": -0.001759018748998642, "memory(GiB)": 90.94, "reward": 0.8066146373748779, "reward_std": 0.06544861197471619, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9643229246139526, "rewards/PlanningActionSetORM/std": 0.08893118798732758, "rewards/RMReward/mean": 0.7671875357627869, "rewards/RMReward/std": 0.0857691764831543, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 628, "train_speed(iter/s)": 0.017206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/mean_length": 57.25, "completions/min_length": 8.0, "epoch": 0.009655235931599792, "frac_reward_zero_std": 0.0, "grad_norm": 18.553747177124023, "kl": 0.5007448792457581, "learning_rate": 4.82658072437078e-07, "loss": 0.003341319039463997, "memory(GiB)": 90.94, "reward": 0.7953125238418579, "reward_std": 0.2567998170852661, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.859375, "rewards/RMReward/std": 0.07352720201015472, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 629, "train_speed(iter/s)": 0.017211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 69.1875, "completions/min_length": 8.0, "epoch": 0.009670586068216008, "frac_reward_zero_std": 0.0, "grad_norm": 11.981245994567871, "kl": 0.546134889125824, "learning_rate": 4.83425414364641e-07, "loss": -0.0068529509007930756, "memory(GiB)": 90.94, "reward": 0.8506250381469727, "reward_std": 0.1864646077156067, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7750000357627869, "rewards/RMReward/std": 0.060553014278411865, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 630, "train_speed(iter/s)": 0.017217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/mean_length": 70.3125, "completions/min_length": 8.0, "epoch": 0.009685936204832223, "frac_reward_zero_std": 0.0, "grad_norm": 41.562435150146484, "kl": 0.6004064083099365, "learning_rate": 4.841927562922039e-07, "loss": 0.00568948220461607, "memory(GiB)": 90.94, "reward": 0.7518228888511658, "reward_std": 0.27480560541152954, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8994791507720947, "rewards/PlanningActionSetORM/std": 0.007739241700619459, "rewards/RMReward/mean": 0.8500000238418579, "rewards/RMReward/std": 0.09309493005275726, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 631, "train_speed(iter/s)": 0.017201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 308.21875, "completions/min_length": 8.0, "epoch": 0.009701286341448439, "frac_reward_zero_std": 0.0, "grad_norm": 53.35213088989258, "kl": 0.5919615030288696, "learning_rate": 4.849600982197667e-07, "loss": -0.012130755931138992, "memory(GiB)": 90.94, "reward": 0.45299553871154785, "reward_std": 0.227819561958313, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.08411611616611481, "rewards/VisualPerceptionAccuracy/std": 0.07268186658620834, "step": 632, "train_speed(iter/s)": 0.017208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.009716636478064655, "frac_reward_zero_std": 1.0, "grad_norm": 1.9692892237799242e-05, "kl": 0.87109375, "learning_rate": 4.857274401473297e-07, "loss": 0.0008700303733348846, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 633, "train_speed(iter/s)": 0.017231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 195.375, "completions/min_length": 110.0, "epoch": 0.00973198661468087, "frac_reward_zero_std": 0.0, "grad_norm": 1.48134183883667, "kl": 0.07816536724567413, "learning_rate": 4.864947820748926e-07, "loss": 0.01650015451014042, "memory(GiB)": 90.94, "reward": 0.8423076868057251, "reward_std": 0.03692292422056198, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9615384340286255, "rewards/PlanningActionSetORM/std": 0.03907695785164833, "rewards/RMReward/mean": 0.8125, "rewards/RMReward/std": 0.07295601814985275, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 634, "train_speed(iter/s)": 0.017228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/mean_length": 14.375, "completions/min_length": 13.0, "epoch": 0.009747336751297086, "frac_reward_zero_std": 0.0, "grad_norm": 9.651278495788574, "kl": 0.12524919211864471, "learning_rate": 4.872621240024556e-07, "loss": -0.01705031841993332, "memory(GiB)": 90.94, "reward": 0.8515625, "reward_std": 0.3537220358848572, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.84375, "rewards/SpatialReasoningORM/std": 0.3689020276069641, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 635, "train_speed(iter/s)": 0.017223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/mean_length": 54.21875, "completions/min_length": 8.0, "epoch": 0.009762686887913303, "frac_reward_zero_std": 0.0, "grad_norm": 11.392610549926758, "kl": 0.6810470223426819, "learning_rate": 4.880294659300184e-07, "loss": 0.00024308264255523682, "memory(GiB)": 90.94, "reward": 0.9440624713897705, "reward_std": 0.1328330934047699, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.934374988079071, "rewards/RMReward/std": 0.035207707434892654, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 636, "train_speed(iter/s)": 0.017231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/mean_length": 164.53125, "completions/min_length": 98.0, "epoch": 0.009778037024529519, "frac_reward_zero_std": 0.0, "grad_norm": 1.9152065515518188, "kl": 0.07437677681446075, "learning_rate": 4.887968078575814e-07, "loss": 0.1530182659626007, "memory(GiB)": 90.94, "reward": 0.5647285580635071, "reward_std": 0.07897253334522247, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8999999761581421, "rewards/RMReward/std": 0.05773501843214035, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2094571888446808, "rewards/VisualPerceptionAccuracy/std": 0.1117570623755455, "step": 637, "train_speed(iter/s)": 0.017234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/mean_length": 184.0625, "completions/min_length": 184.0, "epoch": 0.009793387161145735, "frac_reward_zero_std": 0.0, "grad_norm": 0.13090746104717255, "kl": 0.07003574073314667, "learning_rate": 4.895641497851443e-07, "loss": 7.279962301254272e-05, "memory(GiB)": 90.94, "reward": 0.9282500147819519, "reward_std": 0.02962104231119156, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9103124737739563, "rewards/RMReward/std": 0.10243752598762512, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 638, "train_speed(iter/s)": 0.017207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 197.125, "completions/min_length": 105.0, "epoch": 0.00980873729776195, "frac_reward_zero_std": 0.0, "grad_norm": 0.8798655271530151, "kl": 0.07213570177555084, "learning_rate": 4.903314917127073e-07, "loss": 0.0013218000531196594, "memory(GiB)": 90.94, "reward": 0.914557695388794, "reward_std": 0.04775324836373329, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9615384340286255, "rewards/PlanningActionSetORM/std": 0.03907695785164833, "rewards/RMReward/mean": 0.9028124809265137, "rewards/RMReward/std": 0.1208634003996849, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 639, "train_speed(iter/s)": 0.017157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/mean_length": 59.25, "completions/min_length": 13.0, "epoch": 0.009824087434378166, "frac_reward_zero_std": 0.0, "grad_norm": 9.801433563232422, "kl": 0.24938684701919556, "learning_rate": 4.910988336402701e-07, "loss": 0.01869812235236168, "memory(GiB)": 90.94, "reward": 0.8215624690055847, "reward_std": 0.24371886253356934, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9249999523162842, "rewards/RMReward/std": 0.04082481563091278, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 640, "train_speed(iter/s)": 0.017159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 279.90625, "completions/min_length": 104.0, "epoch": 0.009839437570994381, "frac_reward_zero_std": 0.0, "grad_norm": 1.2106349468231201, "kl": 0.05153132602572441, "learning_rate": 4.91866175567833e-07, "loss": 0.06044390797615051, "memory(GiB)": 90.94, "reward": 0.6739914417266846, "reward_std": 0.14348994195461273, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.988707423210144, "rewards/PlanningActionSetORM/std": 0.033984165638685226, "rewards/RMReward/mean": 0.5953125357627869, "rewards/RMReward/std": 0.18594110012054443, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 641, "train_speed(iter/s)": 0.017149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/mean_length": 63.59375, "completions/min_length": 8.0, "epoch": 0.009854787707610597, "frac_reward_zero_std": 0.0, "grad_norm": 20.102745056152344, "kl": 0.5030133128166199, "learning_rate": 4.92633517495396e-07, "loss": 0.001702653244137764, "memory(GiB)": 90.94, "reward": 0.944812536239624, "reward_std": 0.14188438653945923, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9362499713897705, "rewards/RMReward/std": 0.05783597007393837, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 642, "train_speed(iter/s)": 0.017153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/mean_length": 11.59375, "completions/min_length": 8.0, "epoch": 0.009870137844226813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009738627704791725, "kl": 0.3333434462547302, "learning_rate": 4.934008594229589e-07, "loss": 0.00033352436730638146, "memory(GiB)": 90.94, "reward": 0.5249999761581421, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 643, "train_speed(iter/s)": 0.01715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/mean_length": 105.53125, "completions/min_length": 101.0, "epoch": 0.00988548798084303, "frac_reward_zero_std": 0.0, "grad_norm": 1.6221320629119873, "kl": 0.08714481443166733, "learning_rate": 4.941682013505218e-07, "loss": -0.004448361694812775, "memory(GiB)": 90.94, "reward": 0.9287499785423279, "reward_std": 0.047100719064474106, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9109375476837158, "rewards/RMReward/std": 0.06925218552350998, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 644, "train_speed(iter/s)": 0.017143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/mean_length": 170.78125, "completions/min_length": 68.0, "epoch": 0.009900838117459246, "frac_reward_zero_std": 0.0, "grad_norm": 1.2141804695129395, "kl": 0.06951069086790085, "learning_rate": 4.949355432780847e-07, "loss": 0.004145216196775436, "memory(GiB)": 90.94, "reward": 0.9622499942779541, "reward_std": 0.027527760714292526, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9528125524520874, "rewards/RMReward/std": 0.06269405037164688, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 645, "train_speed(iter/s)": 0.017143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/mean_length": 60.09375, "completions/min_length": 8.0, "epoch": 0.009916188254075461, "frac_reward_zero_std": 0.0, "grad_norm": 27.300107955932617, "kl": 0.3186866343021393, "learning_rate": 4.957028852056477e-07, "loss": -0.017425253987312317, "memory(GiB)": 90.94, "reward": 0.8671875, "reward_std": 0.17975232005119324, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.878125011920929, "rewards/PlanningActionSetORM/std": 0.008539117872714996, "rewards/RMReward/mean": 0.846875011920929, "rewards/RMReward/std": 0.042695626616477966, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 646, "train_speed(iter/s)": 0.017149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/mean_length": 107.875, "completions/min_length": 88.0, "epoch": 0.009931538390691677, "frac_reward_zero_std": 0.0, "grad_norm": 2.5125558376312256, "kl": 0.06432423740625381, "learning_rate": 4.964702271332106e-07, "loss": -0.005381416529417038, "memory(GiB)": 90.94, "reward": 0.5855178236961365, "reward_std": 0.07119166105985641, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8999999761581421, "rewards/RMReward/std": 0.07958223670721054, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2510357201099396, "rewards/VisualPerceptionAccuracy/std": 0.07871754467487335, "step": 647, "train_speed(iter/s)": 0.017156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 250.125, "completions/min_length": 232.0, "epoch": 0.009946888527307893, "frac_reward_zero_std": 0.0, "grad_norm": 0.4704650342464447, "kl": 0.04234021529555321, "learning_rate": 4.972375690607735e-07, "loss": -0.0005712881684303284, "memory(GiB)": 90.94, "reward": 0.9293076992034912, "reward_std": 0.10372859239578247, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9615384340286255, "rewards/PlanningActionSetORM/std": 0.03907695785164833, "rewards/RMReward/mean": 0.9212499856948853, "rewards/RMReward/std": 0.15363866090774536, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 648, "train_speed(iter/s)": 0.017136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/mean_length": 111.4375, "completions/min_length": 98.0, "epoch": 0.009962238663924108, "frac_reward_zero_std": 0.0, "grad_norm": 1.5230727195739746, "kl": 0.09221327304840088, "learning_rate": 4.980049109883364e-07, "loss": 0.03243201971054077, "memory(GiB)": 90.94, "reward": 0.9269999265670776, "reward_std": 0.050426237285137177, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9087499380111694, "rewards/RMReward/std": 0.06318175792694092, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 649, "train_speed(iter/s)": 0.017108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.009977588800540324, "frac_reward_zero_std": 0.0, "grad_norm": 76.33328247070312, "kl": 0.91796875, "learning_rate": 4.987722529158993e-07, "loss": 0.0009177252650260925, "memory(GiB)": 90.94, "reward": 0.8218749761581421, "reward_std": 0.3746698498725891, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.3965577781200409, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 650, "train_speed(iter/s)": 0.017132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 180.3125, "completions/min_length": 100.0, "epoch": 0.009992938937156541, "frac_reward_zero_std": 0.0, "grad_norm": 1.2542628049850464, "kl": 0.06914187967777252, "learning_rate": 4.995395948434623e-07, "loss": 0.0018538013100624084, "memory(GiB)": 90.94, "reward": 0.8756591081619263, "reward_std": 0.06788256019353867, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9545454978942871, "rewards/PlanningActionSetORM/std": 0.046181850135326385, "rewards/RMReward/mean": 0.8559374809265137, "rewards/RMReward/std": 0.10865332186222076, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 651, "train_speed(iter/s)": 0.017101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 14.96875, "completions/min_length": 13.0, "epoch": 0.010008289073772757, "frac_reward_zero_std": 0.0, "grad_norm": 9.796310424804688, "kl": 0.2706412672996521, "learning_rate": 5.003069367710251e-07, "loss": -0.019946686923503876, "memory(GiB)": 90.94, "reward": 0.9109375476837158, "reward_std": 0.28099340200424194, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.90625, "rewards/SpatialReasoningORM/std": 0.2961445748806, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 652, "train_speed(iter/s)": 0.017122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/mean_length": 122.0, "completions/min_length": 13.0, "epoch": 0.010023639210388973, "frac_reward_zero_std": 1.0, "grad_norm": 0.005644581280648708, "kl": 0.20460790395736694, "learning_rate": 5.010742786985881e-07, "loss": 0.00020425673574209213, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 1.0, "rewards/RMReward/std": 0.0, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 653, "train_speed(iter/s)": 0.017101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 130.65625, "completions/min_length": 97.0, "epoch": 0.010038989347005188, "frac_reward_zero_std": 0.0, "grad_norm": 1.9933488368988037, "kl": 0.07577458769083023, "learning_rate": 5.01841620626151e-07, "loss": 0.046118151396512985, "memory(GiB)": 90.94, "reward": 0.8535000085830688, "reward_std": 0.10319438576698303, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.0707106739282608, "rewards/RMReward/mean": 0.8199999928474426, "rewards/RMReward/std": 0.14655175805091858, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 654, "train_speed(iter/s)": 0.0171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 202.5625, "completions/min_length": 90.0, "epoch": 0.010054339483621404, "frac_reward_zero_std": 0.0, "grad_norm": 2.2902190685272217, "kl": 0.06022392213344574, "learning_rate": 5.02608962553714e-07, "loss": 0.017109137028455734, "memory(GiB)": 90.94, "reward": 0.6315159797668457, "reward_std": 0.0899578407406807, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.909375011920929, "rewards/RMReward/std": 0.06381940096616745, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3355320394039154, "rewards/VisualPerceptionAccuracy/std": 0.12886019051074982, "step": 655, "train_speed(iter/s)": 0.0171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 186.6875, "completions/min_length": 114.0, "epoch": 0.01006968962023762, "frac_reward_zero_std": 0.0, "grad_norm": 1.1140108108520508, "kl": 0.05195700377225876, "learning_rate": 5.03376304481277e-07, "loss": 0.02172229439020157, "memory(GiB)": 90.94, "reward": 0.5008558034896851, "reward_std": 0.12808506190776825, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9375, "rewards/RMReward/std": 0.25, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.05171159654855728, "rewards/VisualPerceptionAccuracy/std": 0.056170135736465454, "step": 656, "train_speed(iter/s)": 0.017106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/mean_length": 136.25, "completions/min_length": 69.0, "epoch": 0.010085039756853835, "frac_reward_zero_std": 0.0, "grad_norm": 2.34753680229187, "kl": 0.056657012552022934, "learning_rate": 5.041436464088398e-07, "loss": -0.07328768074512482, "memory(GiB)": 90.94, "reward": 0.5274672508239746, "reward_std": 0.100489042699337, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.0573730543255806, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.22993451356887817, "rewards/VisualPerceptionAccuracy/std": 0.1550796777009964, "step": 657, "train_speed(iter/s)": 0.017109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 169.34375, "completions/min_length": 8.0, "epoch": 0.010100389893470053, "frac_reward_zero_std": 0.0, "grad_norm": 27.696086883544922, "kl": 0.658991813659668, "learning_rate": 5.049109883364028e-07, "loss": -0.04490796476602554, "memory(GiB)": 90.94, "reward": 0.8453899025917053, "reward_std": 0.1858254224061966, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9351489543914795, "rewards/PlanningActionSetORM/std": 0.012636031024158001, "rewards/RMReward/mean": 0.778124988079071, "rewards/RMReward/std": 0.05764475092291832, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 658, "train_speed(iter/s)": 0.017103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/mean_length": 11.0, "completions/min_length": 8.0, "epoch": 0.010115740030086268, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018130317330360413, "kl": 0.5284597873687744, "learning_rate": 5.056783302639657e-07, "loss": 0.0005271573318168521, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 659, "train_speed(iter/s)": 0.017083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/mean_length": 94.15625, "completions/min_length": 65.0, "epoch": 0.010131090166702484, "frac_reward_zero_std": 0.0, "grad_norm": 3.0150625705718994, "kl": 0.07146899402141571, "learning_rate": 5.064456721915287e-07, "loss": -0.007521394640207291, "memory(GiB)": 90.94, "reward": 0.5004476308822632, "reward_std": 0.060295552015304565, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9356249570846558, "rewards/RMReward/std": 0.044116321951150894, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0523952841758728, "rewards/VisualPerceptionAccuracy/std": 0.08529803901910782, "step": 660, "train_speed(iter/s)": 0.017088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 131.28125, "completions/min_length": 79.0, "epoch": 0.0101464403033187, "frac_reward_zero_std": 0.0, "grad_norm": 2.231421947479248, "kl": 0.041397225111722946, "learning_rate": 5.072130141190915e-07, "loss": -0.060193516314029694, "memory(GiB)": 90.94, "reward": 0.4574809670448303, "reward_std": 0.08939924836158752, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8708333373069763, "rewards/PlanningActionSetORM/std": 0.15942604839801788, "rewards/RMReward/mean": 0.7625000476837158, "rewards/RMReward/std": 0.08465617150068283, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.13079522550106049, "rewards/VisualPerceptionAccuracy/std": 0.11052616685628891, "step": 661, "train_speed(iter/s)": 0.017094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 473.5625, "completions/min_length": 190.0, "epoch": 0.010161790439934915, "frac_reward_zero_std": 0.0, "grad_norm": 1.0559390783309937, "kl": 0.024475643411278725, "learning_rate": 5.079803560466544e-07, "loss": 0.00122736394405365, "memory(GiB)": 90.94, "reward": 0.5152161121368408, "reward_std": 0.10940476506948471, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9477577209472656, "rewards/PlanningActionSetORM/std": 0.0024482335429638624, "rewards/RMReward/mean": 0.8262500166893005, "rewards/RMReward/std": 0.13657110929489136, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.17988070845603943, "rewards/VisualPerceptionAccuracy/std": 0.10968736559152603, "step": 662, "train_speed(iter/s)": 0.017072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/mean_length": 120.40625, "completions/min_length": 13.0, "epoch": 0.010177140576551131, "frac_reward_zero_std": 0.0, "grad_norm": 9.732826232910156, "kl": 0.16078534722328186, "learning_rate": 5.087476979742174e-07, "loss": -0.02522641234099865, "memory(GiB)": 90.94, "reward": 0.8498125076293945, "reward_std": 0.2324419766664505, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9956250190734863, "rewards/RMReward/std": 0.012632632628083229, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 663, "train_speed(iter/s)": 0.017065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/mean_length": 428.53125, "completions/min_length": 292.0, "epoch": 0.010192490713167347, "frac_reward_zero_std": 0.0, "grad_norm": 0.9747546911239624, "kl": 0.021288715302944183, "learning_rate": 5.095150399017803e-07, "loss": -0.07350832223892212, "memory(GiB)": 90.94, "reward": 0.49757376313209534, "reward_std": 0.14588820934295654, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9553714990615845, "rewards/PlanningActionSetORM/std": 0.002865873510017991, "rewards/RMReward/mean": 0.7768750190734863, "rewards/RMReward/std": 0.22747071087360382, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.18257319927215576, "rewards/VisualPerceptionAccuracy/std": 0.10950319468975067, "step": 664, "train_speed(iter/s)": 0.017051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/mean_length": 322.3125, "completions/min_length": 239.0, "epoch": 0.010207840849783562, "frac_reward_zero_std": 0.0, "grad_norm": 0.27842414379119873, "kl": 0.03725852444767952, "learning_rate": 5.102823818293432e-07, "loss": 0.008966408669948578, "memory(GiB)": 90.94, "reward": 0.7787500619888306, "reward_std": 0.06399586796760559, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.0707106739282608, "rewards/RMReward/mean": 0.7265625, "rewards/RMReward/std": 0.0983610674738884, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 665, "train_speed(iter/s)": 0.017047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/mean_length": 94.34375, "completions/min_length": 76.0, "epoch": 0.01022319098639978, "frac_reward_zero_std": 0.0, "grad_norm": 1.699456810951233, "kl": 0.12400448322296143, "learning_rate": 5.110497237569061e-07, "loss": 0.015509601682424545, "memory(GiB)": 90.94, "reward": 0.9274687767028809, "reward_std": 0.07409239560365677, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.99609375, "rewards/PlanningActionSetORM/std": 0.022097086533904076, "rewards/RMReward/mean": 0.9103125333786011, "rewards/RMReward/std": 0.14427760243415833, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 666, "train_speed(iter/s)": 0.017019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 226.625, "completions/min_length": 173.0, "epoch": 0.010238541123015995, "frac_reward_zero_std": 0.0, "grad_norm": 0.23925645649433136, "kl": 0.036750033497810364, "learning_rate": 5.118170656844691e-07, "loss": 0.012410003691911697, "memory(GiB)": 90.94, "reward": 0.878000020980835, "reward_std": 0.050312209874391556, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8474999666213989, "rewards/RMReward/std": 0.17492854595184326, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 667, "train_speed(iter/s)": 0.016997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/mean_length": 118.40625, "completions/min_length": 106.0, "epoch": 0.010253891259632211, "frac_reward_zero_std": 0.0, "grad_norm": 1.4198265075683594, "kl": 0.0721539556980133, "learning_rate": 5.12584407612032e-07, "loss": 0.003192078322172165, "memory(GiB)": 90.94, "reward": 0.887499988079071, "reward_std": 0.05208711698651314, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.859375, "rewards/RMReward/std": 0.06530015915632248, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 668, "train_speed(iter/s)": 0.016983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/mean_length": 311.15625, "completions/min_length": 72.0, "epoch": 0.010269241396248427, "frac_reward_zero_std": 0.0, "grad_norm": 4.39170503616333, "kl": 0.040532492101192474, "learning_rate": 5.133517495395948e-07, "loss": -0.05750482529401779, "memory(GiB)": 90.94, "reward": 0.21937736868858337, "reward_std": 0.10467880964279175, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.21937736868858337, "rewards/VisualPerceptionAccuracy/std": 0.14220963418483734, "step": 669, "train_speed(iter/s)": 0.016998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/mean_length": 151.125, "completions/min_length": 97.0, "epoch": 0.010284591532864642, "frac_reward_zero_std": 0.0, "grad_norm": 1.324571967124939, "kl": 0.08654100447893143, "learning_rate": 5.141190914671578e-07, "loss": 0.0010358989238739014, "memory(GiB)": 90.94, "reward": 0.9214999675750732, "reward_std": 0.03406501188874245, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9018750190734863, "rewards/RMReward/std": 0.10929207503795624, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 670, "train_speed(iter/s)": 0.016977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/mean_length": 113.0, "completions/min_length": 80.0, "epoch": 0.010299941669480858, "frac_reward_zero_std": 0.0, "grad_norm": 1.7148760557174683, "kl": 0.13047845661640167, "learning_rate": 5.148864333947207e-07, "loss": -0.004992213100194931, "memory(GiB)": 90.94, "reward": 0.8575000166893005, "reward_std": 0.06019943207502365, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.0707106739282608, "rewards/RMReward/mean": 0.824999988079071, "rewards/RMReward/std": 0.06956083327531815, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 671, "train_speed(iter/s)": 0.016965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1840.0, "completions/mean_length": 416.21875, "completions/min_length": 110.0, "epoch": 0.010315291806097074, "frac_reward_zero_std": 0.0, "grad_norm": 1.1780213117599487, "kl": 0.05496516823768616, "learning_rate": 5.156537753222837e-07, "loss": -0.011045651510357857, "memory(GiB)": 90.94, "reward": 0.48110687732696533, "reward_std": 0.0659768208861351, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8406250476837158, "rewards/RMReward/std": 0.06381940096616745, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08971376717090607, "rewards/VisualPerceptionAccuracy/std": 0.08089815080165863, "step": 672, "train_speed(iter/s)": 0.016968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/mean_length": 60.4375, "completions/min_length": 8.0, "epoch": 0.010330641942713291, "frac_reward_zero_std": 0.0, "grad_norm": 32.48237991333008, "kl": 0.4599156379699707, "learning_rate": 5.164211172498465e-07, "loss": 0.022727176547050476, "memory(GiB)": 90.94, "reward": 0.9465625286102295, "reward_std": 0.14168031513690948, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.940625011920929, "rewards/RMReward/std": 0.0573258176445961, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 673, "train_speed(iter/s)": 0.016971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 288.96875, "completions/min_length": 62.0, "epoch": 0.010345992079329507, "frac_reward_zero_std": 0.0, "grad_norm": 2.059417963027954, "kl": 0.08306089043617249, "learning_rate": 5.171884591774095e-07, "loss": -0.07285396754741669, "memory(GiB)": 90.94, "reward": 0.5145534873008728, "reward_std": 0.05928546190261841, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.893750011920929, "rewards/RMReward/std": 0.0573730394244194, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.11410706490278244, "rewards/VisualPerceptionAccuracy/std": 0.0726725161075592, "step": 674, "train_speed(iter/s)": 0.01695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/mean_length": 137.5, "completions/min_length": 8.0, "epoch": 0.010361342215945722, "frac_reward_zero_std": 0.0, "grad_norm": 43.71916580200195, "kl": 0.47122249007225037, "learning_rate": 5.179558011049724e-07, "loss": 0.0004702955484390259, "memory(GiB)": 90.94, "reward": 0.5591250061988831, "reward_std": 0.26218169927597046, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9368749856948853, "rewards/RMReward/std": 0.24984578788280487, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 675, "train_speed(iter/s)": 0.016956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 444.125, "completions/min_length": 149.0, "epoch": 0.010376692352561938, "frac_reward_zero_std": 0.0, "grad_norm": 1.5147590637207031, "kl": 0.020249057561159134, "learning_rate": 5.187231430325354e-07, "loss": -0.20711761713027954, "memory(GiB)": 90.94, "reward": 0.24464958906173706, "reward_std": 0.1106705367565155, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.24464958906173706, "rewards/VisualPerceptionAccuracy/std": 0.13091789186000824, "step": 676, "train_speed(iter/s)": 0.016957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 184.0, "completions/min_length": 8.0, "epoch": 0.010392042489178153, "frac_reward_zero_std": 0.0, "grad_norm": 20.811405181884766, "kl": 0.62446129322052, "learning_rate": 5.194904849600983e-07, "loss": 0.014448363333940506, "memory(GiB)": 90.94, "reward": 0.5198957920074463, "reward_std": 0.2211986929178238, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.0991666242480278, "rewards/VisualPerceptionAccuracy/std": 0.20489738881587982, "step": 677, "train_speed(iter/s)": 0.016976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/mean_length": 257.5625, "completions/min_length": 93.0, "epoch": 0.01040739262579437, "frac_reward_zero_std": 0.0, "grad_norm": 2.4415009021759033, "kl": 0.02762497588992119, "learning_rate": 5.202578268876611e-07, "loss": -0.031959354877471924, "memory(GiB)": 90.94, "reward": 0.33955156803131104, "reward_std": 0.20887506008148193, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.33955156803131104, "rewards/VisualPerceptionAccuracy/std": 0.23089800775051117, "step": 678, "train_speed(iter/s)": 0.016991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 210.40625, "completions/min_length": 8.0, "epoch": 0.010422742762410585, "frac_reward_zero_std": 0.0, "grad_norm": 36.27178192138672, "kl": 0.3668454587459564, "learning_rate": 5.210251688152241e-07, "loss": 0.006564922630786896, "memory(GiB)": 90.94, "reward": 0.6021875143051147, "reward_std": 0.28346922993659973, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8218750357627869, "rewards/RMReward/std": 0.14020074903964996, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 679, "train_speed(iter/s)": 0.016969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/mean_length": 123.1875, "completions/min_length": 105.0, "epoch": 0.010438092899026802, "frac_reward_zero_std": 0.0, "grad_norm": 0.8178883790969849, "kl": 0.09692177176475525, "learning_rate": 5.21792510742787e-07, "loss": 0.00020097196102142334, "memory(GiB)": 90.94, "reward": 0.9662500023841858, "reward_std": 0.0256030336022377, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.957812488079071, "rewards/RMReward/std": 0.048708103597164154, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 680, "train_speed(iter/s)": 0.016968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/mean_length": 130.1875, "completions/min_length": 8.0, "epoch": 0.010453443035643018, "frac_reward_zero_std": 0.5, "grad_norm": 1.3572877645492554, "kl": 0.5266994833946228, "learning_rate": 5.2255985267035e-07, "loss": 0.04474405199289322, "memory(GiB)": 90.94, "reward": 0.6542326807975769, "reward_std": 0.10620979219675064, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.3084653317928314, "rewards/VisualPerceptionAccuracy/std": 0.21241958439350128, "step": 681, "train_speed(iter/s)": 0.016962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/mean_length": 154.84375, "completions/min_length": 81.0, "epoch": 0.010468793172259233, "frac_reward_zero_std": 0.0, "grad_norm": 2.0977158546447754, "kl": 0.048909205943346024, "learning_rate": 5.233271945979128e-07, "loss": -0.0023884885013103485, "memory(GiB)": 90.94, "reward": 0.5414240956306458, "reward_std": 0.14822056889533997, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9114583134651184, "rewards/PlanningActionSetORM/std": 0.0727677121758461, "rewards/RMReward/mean": 0.8499999642372131, "rewards/RMReward/std": 0.08755949884653091, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.22055649757385254, "rewards/VisualPerceptionAccuracy/std": 0.22038106620311737, "step": 682, "train_speed(iter/s)": 0.016966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 159.96875, "completions/min_length": 104.0, "epoch": 0.010484143308875449, "frac_reward_zero_std": 0.0, "grad_norm": 2.353926181793213, "kl": 0.0713505893945694, "learning_rate": 5.240945365254758e-07, "loss": -0.11513775587081909, "memory(GiB)": 90.94, "reward": 0.5560294389724731, "reward_std": 0.08920067548751831, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9075000286102295, "rewards/RMReward/std": 0.0859844908118248, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.18605884909629822, "rewards/VisualPerceptionAccuracy/std": 0.10961377620697021, "step": 683, "train_speed(iter/s)": 0.016964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.010499493445491665, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031291977502405643, "kl": 0.884765625, "learning_rate": 5.248618784530387e-07, "loss": 0.0008858293294906616, "memory(GiB)": 90.94, "reward": 0.5249999761581421, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 684, "train_speed(iter/s)": 0.01697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 152.21875, "completions/min_length": 113.0, "epoch": 0.01051484358210788, "frac_reward_zero_std": 0.0, "grad_norm": 1.5977879762649536, "kl": 0.07406497001647949, "learning_rate": 5.256292203806017e-07, "loss": 0.11598330736160278, "memory(GiB)": 90.94, "reward": 0.8804791569709778, "reward_std": 0.05890025943517685, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9973958730697632, "rewards/PlanningActionSetORM/std": 0.014731387607753277, "rewards/RMReward/mean": 0.8512499928474426, "rewards/RMReward/std": 0.07819537073373795, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 685, "train_speed(iter/s)": 0.016974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/mean_length": 118.375, "completions/min_length": 49.0, "epoch": 0.010530193718724096, "frac_reward_zero_std": 0.0, "grad_norm": 2.4566457271575928, "kl": 0.0752955824136734, "learning_rate": 5.263965623081645e-07, "loss": -0.05736635625362396, "memory(GiB)": 90.94, "reward": 0.5478272438049316, "reward_std": 0.08121301978826523, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9556249976158142, "rewards/RMReward/std": 0.03520771488547325, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.13115453720092773, "rewards/VisualPerceptionAccuracy/std": 0.13425986468791962, "step": 686, "train_speed(iter/s)": 0.016968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 183.28125, "completions/min_length": 102.0, "epoch": 0.010545543855340312, "frac_reward_zero_std": 0.0, "grad_norm": 1.3336987495422363, "kl": 0.06981254369020462, "learning_rate": 5.271639042357274e-07, "loss": -0.05769607052206993, "memory(GiB)": 90.94, "reward": 0.6826601028442383, "reward_std": 0.09087351709604263, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9462500214576721, "rewards/RMReward/std": 0.020615534856915474, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.40832024812698364, "rewards/VisualPerceptionAccuracy/std": 0.165254607796669, "step": 687, "train_speed(iter/s)": 0.016973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/mean_length": 155.4375, "completions/min_length": 80.0, "epoch": 0.010560893991956529, "frac_reward_zero_std": 0.0, "grad_norm": 1.760880947113037, "kl": 0.0696030855178833, "learning_rate": 5.279312461632904e-07, "loss": -0.029585443437099457, "memory(GiB)": 90.94, "reward": 0.6096692085266113, "reward_std": 0.11351755261421204, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.921875, "rewards/PlanningActionSetORM/std": 0.0625, "rewards/RMReward/mean": 0.8218749761581421, "rewards/RMReward/std": 0.0546770878136158, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3774634599685669, "rewards/VisualPerceptionAccuracy/std": 0.18556727468967438, "step": 688, "train_speed(iter/s)": 0.016982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/mean_length": 124.84375, "completions/min_length": 104.0, "epoch": 0.010576244128572745, "frac_reward_zero_std": 0.0, "grad_norm": 2.009856939315796, "kl": 0.07321737706661224, "learning_rate": 5.286985880908533e-07, "loss": 0.007375746965408325, "memory(GiB)": 90.94, "reward": 0.8468749523162842, "reward_std": 0.06627137213945389, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.01767767407000065, "rewards/RMReward/mean": 0.8093750476837158, "rewards/RMReward/std": 0.10273478180170059, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 689, "train_speed(iter/s)": 0.016969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/mean_length": 176.0, "completions/min_length": 104.0, "epoch": 0.01059159426518896, "frac_reward_zero_std": 0.0, "grad_norm": 1.1321582794189453, "kl": 0.056034818291664124, "learning_rate": 5.294659300184162e-07, "loss": 5.58244064450264e-05, "memory(GiB)": 90.94, "reward": 0.8382499814033508, "reward_std": 0.06797965615987778, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.8134375214576721, "rewards/RMReward/std": 0.1667451113462448, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 690, "train_speed(iter/s)": 0.016942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 135.5625, "completions/min_length": 97.0, "epoch": 0.010606944401805176, "frac_reward_zero_std": 0.0, "grad_norm": 3.2629549503326416, "kl": 0.03053409792482853, "learning_rate": 5.302332719459792e-07, "loss": -0.04649539664387703, "memory(GiB)": 90.94, "reward": 0.34070253372192383, "reward_std": 0.11151659488677979, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.34070253372192383, "rewards/VisualPerceptionAccuracy/std": 0.17310844361782074, "step": 691, "train_speed(iter/s)": 0.016962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 98.96875, "completions/min_length": 13.0, "epoch": 0.010622294538421392, "frac_reward_zero_std": 0.0, "grad_norm": 6.796281337738037, "kl": 0.17195507884025574, "learning_rate": 5.310006138735421e-07, "loss": -0.01692352071404457, "memory(GiB)": 90.94, "reward": 0.8606250286102295, "reward_std": 0.1801319420337677, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.8062500357627869, "rewards/RMReward/std": 0.051234759390354156, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 692, "train_speed(iter/s)": 0.016956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 89.6875, "completions/min_length": 8.0, "epoch": 0.010637644675037607, "frac_reward_zero_std": 0.0, "grad_norm": 30.898000717163086, "kl": 0.6322103142738342, "learning_rate": 5.317679558011051e-07, "loss": -0.0031740814447402954, "memory(GiB)": 90.94, "reward": 0.3018067479133606, "reward_std": 0.2995935082435608, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.3754885196685791, "rewards/VisualPerceptionAccuracy/std": 0.21622979640960693, "step": 693, "train_speed(iter/s)": 0.016976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/mean_length": 153.34375, "completions/min_length": 95.0, "epoch": 0.010652994811653823, "frac_reward_zero_std": 0.0, "grad_norm": 2.1792354583740234, "kl": 0.04263375699520111, "learning_rate": 5.325352977286679e-07, "loss": 0.00651375949382782, "memory(GiB)": 90.94, "reward": 0.4474964737892151, "reward_std": 0.06842278689146042, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8999999761581421, "rewards/PlanningActionSetORM/std": 0.17888543009757996, "rewards/RMReward/mean": 0.8031250238418579, "rewards/RMReward/std": 0.07630803436040878, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07249289751052856, "rewards/VisualPerceptionAccuracy/std": 0.05688726529479027, "step": 694, "train_speed(iter/s)": 0.016957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 518.875, "completions/min_length": 198.0, "epoch": 0.01066834494827004, "frac_reward_zero_std": 0.0, "grad_norm": 0.758905291557312, "kl": 0.034779325127601624, "learning_rate": 5.333026396562309e-07, "loss": -0.0918072909116745, "memory(GiB)": 90.94, "reward": 0.5029605627059937, "reward_std": 0.1045476645231247, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8888888955116272, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8218749761581421, "rewards/RMReward/std": 0.0815858393907547, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1706433743238449, "rewards/VisualPerceptionAccuracy/std": 0.1438266485929489, "step": 695, "train_speed(iter/s)": 0.016944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/mean_length": 116.3125, "completions/min_length": 97.0, "epoch": 0.010683695084886256, "frac_reward_zero_std": 0.0, "grad_norm": 3.247995376586914, "kl": 0.06514608860015869, "learning_rate": 5.340699815837938e-07, "loss": -0.02344723604619503, "memory(GiB)": 90.94, "reward": 0.6994637846946716, "reward_std": 0.05501112341880798, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9468749761581421, "rewards/PlanningActionSetORM/std": 0.1280868947505951, "rewards/RMReward/mean": 0.8374999761581421, "rewards/RMReward/std": 0.07637625932693481, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5395525693893433, "rewards/VisualPerceptionAccuracy/std": 0.03410841152071953, "step": 696, "train_speed(iter/s)": 0.016946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 270.25, "completions/min_length": 104.0, "epoch": 0.010699045221502472, "frac_reward_zero_std": 0.0, "grad_norm": 1.5173593759536743, "kl": 0.0606059804558754, "learning_rate": 5.348373235113568e-07, "loss": 0.002852506935596466, "memory(GiB)": 90.94, "reward": 0.7997812628746033, "reward_std": 0.12309609353542328, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.94140625, "rewards/PlanningActionSetORM/std": 0.06135544553399086, "rewards/RMReward/mean": 0.7643749713897705, "rewards/RMReward/std": 0.1744842529296875, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 697, "train_speed(iter/s)": 0.016932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/mean_length": 53.5, "completions/min_length": 8.0, "epoch": 0.010714395358118687, "frac_reward_zero_std": 0.0, "grad_norm": 36.8167610168457, "kl": 0.6124213337898254, "learning_rate": 5.356046654389196e-07, "loss": -0.0031759440898895264, "memory(GiB)": 90.94, "reward": 0.47468748688697815, "reward_std": 0.19204188883304596, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8781249523162842, "rewards/PlanningActionSetORM/std": 0.08343743532896042, "rewards/RMReward/mean": 0.7562500238418579, "rewards/RMReward/std": 0.0793200358748436, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 698, "train_speed(iter/s)": 0.016941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/mean_length": 121.3125, "completions/min_length": 99.0, "epoch": 0.010729745494734903, "frac_reward_zero_std": 0.0, "grad_norm": 1.8143417835235596, "kl": 0.1056319996714592, "learning_rate": 5.363720073664825e-07, "loss": 0.013440673239529133, "memory(GiB)": 90.94, "reward": 0.8823660612106323, "reward_std": 0.055492669343948364, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9743303656578064, "rewards/PlanningActionSetORM/std": 0.07510198652744293, "rewards/RMReward/mean": 0.859375, "rewards/RMReward/std": 0.06278160959482193, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 699, "train_speed(iter/s)": 0.016945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/mean_length": 152.625, "completions/min_length": 102.0, "epoch": 0.010745095631351119, "frac_reward_zero_std": 0.0, "grad_norm": 1.9401131868362427, "kl": 0.05768316239118576, "learning_rate": 5.371393492940455e-07, "loss": -0.002343691885471344, "memory(GiB)": 90.94, "reward": 0.5323766469955444, "reward_std": 0.07395513355731964, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9943749904632568, "rewards/RMReward/std": 0.012632631696760654, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06925326585769653, "rewards/VisualPerceptionAccuracy/std": 0.13780416548252106, "step": 700, "train_speed(iter/s)": 0.01693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 159.28125, "completions/min_length": 107.0, "epoch": 0.010760445767967334, "frac_reward_zero_std": 0.0, "grad_norm": 1.9274342060089111, "kl": 0.06559186428785324, "learning_rate": 5.379066912216084e-07, "loss": 0.02851003035902977, "memory(GiB)": 90.94, "reward": 0.5740333199501038, "reward_std": 0.10174842178821564, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.890625, "rewards/RMReward/std": 0.06381939351558685, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.23556670546531677, "rewards/VisualPerceptionAccuracy/std": 0.1524413526058197, "step": 701, "train_speed(iter/s)": 0.016921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 237.9375, "completions/min_length": 117.0, "epoch": 0.010775795904583552, "frac_reward_zero_std": 0.0, "grad_norm": 0.8906928300857544, "kl": 0.05218782275915146, "learning_rate": 5.386740331491714e-07, "loss": -0.07337739318609238, "memory(GiB)": 90.94, "reward": 0.8237500190734863, "reward_std": 0.12114269286394119, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7796875238418579, "rewards/RMReward/std": 0.21055203676223755, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 702, "train_speed(iter/s)": 0.016899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 152.78125, "completions/min_length": 108.0, "epoch": 0.010791146041199767, "frac_reward_zero_std": 0.0, "grad_norm": 1.9674255847930908, "kl": 0.069559246301651, "learning_rate": 5.394413750767342e-07, "loss": 0.039429403841495514, "memory(GiB)": 90.94, "reward": 0.5619975924491882, "reward_std": 0.1278371959924698, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9921875, "rewards/PlanningActionSetORM/std": 0.03125, "rewards/RMReward/mean": 0.8812500238418579, "rewards/RMReward/std": 0.08139409869909286, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.22055773437023163, "rewards/VisualPerceptionAccuracy/std": 0.18763720989227295, "step": 703, "train_speed(iter/s)": 0.016897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/mean_length": 316.09375, "completions/min_length": 136.0, "epoch": 0.010806496177815983, "frac_reward_zero_std": 0.0, "grad_norm": 1.909892201423645, "kl": 0.03669392317533493, "learning_rate": 5.402087170042972e-07, "loss": -0.07090280950069427, "memory(GiB)": 90.94, "reward": 0.34408116340637207, "reward_std": 0.1548791378736496, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.34408116340637207, "rewards/VisualPerceptionAccuracy/std": 0.1541069746017456, "step": 704, "train_speed(iter/s)": 0.01691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/mean_length": 118.3125, "completions/min_length": 94.0, "epoch": 0.010821846314432199, "frac_reward_zero_std": 0.0, "grad_norm": 2.4528732299804688, "kl": 0.07727180421352386, "learning_rate": 5.409760589318601e-07, "loss": 0.056053757667541504, "memory(GiB)": 90.94, "reward": 0.6914311051368713, "reward_std": 0.1533920019865036, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8812499642372131, "rewards/RMReward/std": 0.07719022780656815, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.47786223888397217, "rewards/VisualPerceptionAccuracy/std": 0.24503186345100403, "step": 705, "train_speed(iter/s)": 0.016918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/mean_length": 77.3125, "completions/min_length": 8.0, "epoch": 0.010837196451048414, "frac_reward_zero_std": 0.0, "grad_norm": 42.62831115722656, "kl": 0.49039024114608765, "learning_rate": 5.417434008594231e-07, "loss": 0.010393805801868439, "memory(GiB)": 90.94, "reward": 0.6273288726806641, "reward_std": 0.2944774329662323, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9201636910438538, "rewards/PlanningActionSetORM/std": 0.17184005677700043, "rewards/RMReward/mean": 0.7562500238418579, "rewards/RMReward/std": 0.10626226663589478, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 706, "train_speed(iter/s)": 0.016924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/mean_length": 153.96875, "completions/min_length": 126.0, "epoch": 0.01085254658766463, "frac_reward_zero_std": 0.0, "grad_norm": 1.6558232307434082, "kl": 0.08353269100189209, "learning_rate": 5.425107427869859e-07, "loss": 0.016223933547735214, "memory(GiB)": 90.94, "reward": 0.934749960899353, "reward_std": 0.05161329731345177, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9184374809265137, "rewards/RMReward/std": 0.0787445604801178, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 707, "train_speed(iter/s)": 0.016911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/mean_length": 125.0625, "completions/min_length": 93.0, "epoch": 0.010867896724280846, "frac_reward_zero_std": 0.0, "grad_norm": 1.6238579750061035, "kl": 0.10310997813940048, "learning_rate": 5.432780847145488e-07, "loss": -0.007348958402872086, "memory(GiB)": 90.94, "reward": 0.8684478998184204, "reward_std": 0.05591537430882454, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9872395992279053, "rewards/PlanningActionSetORM/std": 0.034714944660663605, "rewards/RMReward/mean": 0.8387500047683716, "rewards/RMReward/std": 0.10557828098535538, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 708, "train_speed(iter/s)": 0.016893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 228.15625, "completions/min_length": 8.0, "epoch": 0.010883246860897061, "frac_reward_zero_std": 0.0, "grad_norm": 30.96575927734375, "kl": 0.6543907523155212, "learning_rate": 5.440454266421118e-07, "loss": 0.005178097635507584, "memory(GiB)": 90.94, "reward": 0.7684236764907837, "reward_std": 0.20789937674999237, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.955486536026001, "rewards/PlanningActionSetORM/std": 0.0018518194556236267, "rewards/RMReward/mean": 0.5806249976158142, "rewards/RMReward/std": 0.11433685570955276, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 709, "train_speed(iter/s)": 0.016891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/mean_length": 256.3125, "completions/min_length": 13.0, "epoch": 0.010898596997513279, "frac_reward_zero_std": 0.0, "grad_norm": 27.103586196899414, "kl": 0.1287725865840912, "learning_rate": 5.448127685696747e-07, "loss": 0.04445244371891022, "memory(GiB)": 90.94, "reward": 0.4361836910247803, "reward_std": 0.3535264730453491, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.2879924178123474, "rewards/VisualPerceptionAccuracy/std": 0.22032277286052704, "step": 710, "train_speed(iter/s)": 0.016904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/mean_length": 99.625, "completions/min_length": 69.0, "epoch": 0.010913947134129494, "frac_reward_zero_std": 0.0, "grad_norm": 2.4897396564483643, "kl": 0.10168014466762543, "learning_rate": 5.455801104972376e-07, "loss": -0.009629884734749794, "memory(GiB)": 90.94, "reward": 0.8519999980926514, "reward_std": 0.05469802767038345, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.06318176537752151, "rewards/RMReward/mean": 0.8212499618530273, "rewards/RMReward/std": 0.12481599301099777, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 711, "train_speed(iter/s)": 0.016894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/mean_length": 15.09375, "completions/min_length": 14.0, "epoch": 0.01092929727074571, "frac_reward_zero_std": 1.0, "grad_norm": 0.004546253941953182, "kl": 0.26107239723205566, "learning_rate": 5.463474524248005e-07, "loss": 0.0002610197407193482, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 712, "train_speed(iter/s)": 0.016866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/mean_length": 110.78125, "completions/min_length": 97.0, "epoch": 0.010944647407361925, "frac_reward_zero_std": 0.0, "grad_norm": 1.2614942789077759, "kl": 0.1000937819480896, "learning_rate": 5.471147943523635e-07, "loss": 0.010103223845362663, "memory(GiB)": 90.94, "reward": 0.9085000157356262, "reward_std": 0.06692825257778168, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.9012500047683716, "rewards/RMReward/std": 0.0852983370423317, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 713, "train_speed(iter/s)": 0.01685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/mean_length": 449.3125, "completions/min_length": 194.0, "epoch": 0.010959997543978141, "frac_reward_zero_std": 0.0, "grad_norm": 0.6856794953346252, "kl": 0.03619564324617386, "learning_rate": 5.478821362799264e-07, "loss": -0.0013080313801765442, "memory(GiB)": 90.94, "reward": 0.5335520505905151, "reward_std": 0.10839538276195526, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8888888955116272, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9356250166893005, "rewards/RMReward/std": 0.07023472338914871, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.14082640409469604, "rewards/VisualPerceptionAccuracy/std": 0.16060301661491394, "step": 714, "train_speed(iter/s)": 0.016846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/mean_length": 122.34375, "completions/min_length": 114.0, "epoch": 0.010975347680594357, "frac_reward_zero_std": 0.0, "grad_norm": 1.2879810333251953, "kl": 0.10068473219871521, "learning_rate": 5.486494782074893e-07, "loss": 0.004969527013599873, "memory(GiB)": 90.94, "reward": 0.8818750381469727, "reward_std": 0.04670906811952591, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.01767767407000065, "rewards/RMReward/mean": 0.8531250357627869, "rewards/RMReward/std": 0.0841824933886528, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 715, "train_speed(iter/s)": 0.01682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/mean_length": 56.71875, "completions/min_length": 8.0, "epoch": 0.010990697817210572, "frac_reward_zero_std": 0.0, "grad_norm": 22.76175308227539, "kl": 0.5187423229217529, "learning_rate": 5.494168201350522e-07, "loss": 0.002282470464706421, "memory(GiB)": 90.94, "reward": 0.8916249871253967, "reward_std": 0.18522164225578308, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8774999976158142, "rewards/RMReward/std": 0.05744561553001404, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 716, "train_speed(iter/s)": 0.016821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 5.0, "completions/min_length": 2.0, "epoch": 0.01100604795382679, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017785695381462574, "kl": 0.228515625, "learning_rate": 5.501841620626151e-07, "loss": 0.00022761523723602295, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 1.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 717, "train_speed(iter/s)": 0.016804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/mean_length": 281.21875, "completions/min_length": 106.0, "epoch": 0.011021398090443005, "frac_reward_zero_std": 0.0, "grad_norm": 1.9098111391067505, "kl": 0.0819406509399414, "learning_rate": 5.509515039901781e-07, "loss": -0.0043350085616111755, "memory(GiB)": 90.94, "reward": 0.8253210783004761, "reward_std": 0.10964188724756241, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9703553318977356, "rewards/PlanningActionSetORM/std": 0.09825875610113144, "rewards/RMReward/mean": 0.7890625, "rewards/RMReward/std": 0.18244501948356628, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 718, "train_speed(iter/s)": 0.016784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/mean_length": 80.375, "completions/min_length": 14.0, "epoch": 0.011036748227059221, "frac_reward_zero_std": 0.0, "grad_norm": 3.6629974842071533, "kl": 0.14397816359996796, "learning_rate": 5.517188459177409e-07, "loss": 0.000504322350025177, "memory(GiB)": 90.94, "reward": 0.8517968654632568, "reward_std": 0.16034632921218872, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.96484375, "rewards/PlanningActionSetORM/std": 0.04770106449723244, "rewards/RMReward/mean": 0.7124999761581421, "rewards/RMReward/std": 0.09574271738529205, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 719, "train_speed(iter/s)": 0.016771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/mean_length": 125.90625, "completions/min_length": 80.0, "epoch": 0.011052098363675437, "frac_reward_zero_std": 0.0, "grad_norm": 2.2145488262176514, "kl": 0.055258315056562424, "learning_rate": 5.524861878453039e-07, "loss": 0.0025990568101406097, "memory(GiB)": 90.94, "reward": 0.6498695015907288, "reward_std": 0.10754693299531937, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9399999976158142, "rewards/RMReward/std": 0.03162277489900589, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3477391004562378, "rewards/VisualPerceptionAccuracy/std": 0.18979564309120178, "step": 720, "train_speed(iter/s)": 0.016772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 481.71875, "completions/min_length": 86.0, "epoch": 0.011067448500291652, "frac_reward_zero_std": 0.0, "grad_norm": 2.064136028289795, "kl": 0.06756404042243958, "learning_rate": 5.532535297728668e-07, "loss": 0.10444328188896179, "memory(GiB)": 90.94, "reward": 0.4151371419429779, "reward_std": 0.040807969868183136, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.7687499523162842, "rewards/RMReward/std": 0.04787135869264603, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.020274249836802483, "rewards/VisualPerceptionAccuracy/std": 0.036597415804862976, "step": 721, "train_speed(iter/s)": 0.016765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/mean_length": 105.5, "completions/min_length": 2.0, "epoch": 0.011082798636907868, "frac_reward_zero_std": 0.5, "grad_norm": 0.015259871259331703, "kl": 0.030393755063414574, "learning_rate": 5.540208717004298e-07, "loss": 2.9772520065307617e-05, "memory(GiB)": 90.94, "reward": 0.48463886976242065, "reward_std": 0.014928722754120827, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8888888955116272, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9893749952316284, "rewards/RMReward/std": 0.0373217947781086, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 722, "train_speed(iter/s)": 0.016726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 197.09375, "completions/min_length": 110.0, "epoch": 0.011098148773524084, "frac_reward_zero_std": 0.0, "grad_norm": 1.3992277383804321, "kl": 0.06762278079986572, "learning_rate": 5.547882136279926e-07, "loss": 0.03370478004217148, "memory(GiB)": 90.94, "reward": 0.9126826524734497, "reward_std": 0.0798262357711792, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9459134340286255, "rewards/PlanningActionSetORM/std": 0.0742843747138977, "rewards/RMReward/mean": 0.9043750166893005, "rewards/RMReward/std": 0.0933234840631485, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 723, "train_speed(iter/s)": 0.016708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/mean_length": 147.96875, "completions/min_length": 2.0, "epoch": 0.011113498910140301, "frac_reward_zero_std": 0.0, "grad_norm": 42.188621520996094, "kl": 0.03134293854236603, "learning_rate": 5.555555555555555e-07, "loss": -0.07403925061225891, "memory(GiB)": 90.94, "reward": 0.5299049615859985, "reward_std": 0.3437151312828064, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5299049615859985, "rewards/VisualPerceptionAccuracy/std": 0.41797125339508057, "step": 724, "train_speed(iter/s)": 0.016721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 52.59375, "completions/min_length": 8.0, "epoch": 0.011128849046756517, "frac_reward_zero_std": 0.0, "grad_norm": 25.80755615234375, "kl": 0.45820093154907227, "learning_rate": 5.563228974831185e-07, "loss": -0.012782204896211624, "memory(GiB)": 90.94, "reward": 0.7796354293823242, "reward_std": 0.2365104854106903, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8494791388511658, "rewards/PlanningActionSetORM/std": 0.051860637962818146, "rewards/RMReward/mean": 0.7093750238418579, "rewards/RMReward/std": 0.10834936052560806, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 725, "train_speed(iter/s)": 0.016724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/mean_length": 210.0, "completions/min_length": 100.0, "epoch": 0.011144199183372732, "frac_reward_zero_std": 0.0, "grad_norm": 1.3736644983291626, "kl": 0.06189563497900963, "learning_rate": 5.570902394106815e-07, "loss": 0.039511680603027344, "memory(GiB)": 90.94, "reward": 0.893089771270752, "reward_std": 0.08864504098892212, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9066989421844482, "rewards/PlanningActionSetORM/std": 0.029924508184194565, "rewards/RMReward/mean": 0.8896875381469727, "rewards/RMReward/std": 0.11860708147287369, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 726, "train_speed(iter/s)": 0.016694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/mean_length": 123.25, "completions/min_length": 93.0, "epoch": 0.011159549319988948, "frac_reward_zero_std": 0.0, "grad_norm": 2.2179694175720215, "kl": 0.08263559639453888, "learning_rate": 5.578575813382444e-07, "loss": 0.05956895649433136, "memory(GiB)": 90.94, "reward": 0.6845456957817078, "reward_std": 0.09648381173610687, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9750000238418579, "rewards/RMReward/std": 0.06582806259393692, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4140913188457489, "rewards/VisualPerceptionAccuracy/std": 0.14030516147613525, "step": 727, "train_speed(iter/s)": 0.01669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1912.0, "completions/mean_length": 522.5625, "completions/min_length": 13.0, "epoch": 0.011174899456605164, "frac_reward_zero_std": 0.0, "grad_norm": 5.439716339111328, "kl": 0.1002650037407875, "learning_rate": 5.586249232658073e-07, "loss": -0.007420103996992111, "memory(GiB)": 90.94, "reward": 0.6396914720535278, "reward_std": 0.2120998501777649, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.33875787258148193, "rewards/VisualPerceptionAccuracy/std": 0.1866997331380844, "step": 728, "train_speed(iter/s)": 0.016696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/mean_length": 152.5625, "completions/min_length": 77.0, "epoch": 0.01119024959322138, "frac_reward_zero_std": 0.0, "grad_norm": 2.190979242324829, "kl": 0.07602135837078094, "learning_rate": 5.593922651933702e-07, "loss": -0.048093847930431366, "memory(GiB)": 90.94, "reward": 0.5058797001838684, "reward_std": 0.027653893455863, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9962500333786011, "rewards/RMReward/std": 0.012583060190081596, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.014759454876184464, "rewards/VisualPerceptionAccuracy/std": 0.0452413484454155, "step": 729, "train_speed(iter/s)": 0.016684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/mean_length": 183.71875, "completions/min_length": 13.0, "epoch": 0.011205599729837595, "frac_reward_zero_std": 0.0, "grad_norm": 5.388022422790527, "kl": 0.1272808164358139, "learning_rate": 5.601596071209332e-07, "loss": 0.058962322771549225, "memory(GiB)": 90.94, "reward": 0.20823480188846588, "reward_std": 0.19010861217975616, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.30709460377693176, "rewards/VisualPerceptionAccuracy/std": 0.14271722733974457, "step": 730, "train_speed(iter/s)": 0.016698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/mean_length": 127.71875, "completions/min_length": 13.0, "epoch": 0.01122094986645381, "frac_reward_zero_std": 0.0, "grad_norm": 8.930718421936035, "kl": 0.16419295966625214, "learning_rate": 5.609269490484961e-07, "loss": -0.001333490014076233, "memory(GiB)": 90.94, "reward": 0.2535172700881958, "reward_std": 0.30258291959762573, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.33828452229499817, "rewards/VisualPerceptionAccuracy/std": 0.28067904710769653, "step": 731, "train_speed(iter/s)": 0.016715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/mean_length": 160.6875, "completions/min_length": 156.0, "epoch": 0.011236300003070028, "frac_reward_zero_std": 0.0, "grad_norm": 0.7986122965812683, "kl": 0.08834192901849747, "learning_rate": 5.61694290976059e-07, "loss": -0.0013305023312568665, "memory(GiB)": 90.94, "reward": 0.8186964392662048, "reward_std": 0.09848912060260773, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9709821343421936, "rewards/PlanningActionSetORM/std": 0.035642217844724655, "rewards/RMReward/mean": 0.7806249856948853, "rewards/RMReward/std": 0.27031567692756653, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 732, "train_speed(iter/s)": 0.016694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/mean_length": 140.03125, "completions/min_length": 13.0, "epoch": 0.011251650139686244, "frac_reward_zero_std": 0.0, "grad_norm": 10.859370231628418, "kl": 0.11573168635368347, "learning_rate": 5.624616329036219e-07, "loss": -0.018341831862926483, "memory(GiB)": 90.94, "reward": 0.7970576286315918, "reward_std": 0.24381646513938904, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9230769276618958, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8087500333786011, "rewards/RMReward/std": 0.07847505807876587, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 733, "train_speed(iter/s)": 0.016685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/mean_length": 110.84375, "completions/min_length": 92.0, "epoch": 0.01126700027630246, "frac_reward_zero_std": 0.0, "grad_norm": 2.727053165435791, "kl": 0.10110676288604736, "learning_rate": 5.632289748311849e-07, "loss": 0.01097363606095314, "memory(GiB)": 90.94, "reward": 0.3933585584163666, "reward_std": 0.02787686511874199, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7250000238418579, "rewards/RMReward/std": 0.057735033333301544, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.006717143580317497, "rewards/VisualPerceptionAccuracy/std": 0.009565705433487892, "step": 734, "train_speed(iter/s)": 0.016692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/mean_length": 142.90625, "completions/min_length": 95.0, "epoch": 0.011282350412918675, "frac_reward_zero_std": 0.0, "grad_norm": 0.6494303345680237, "kl": 0.08224593102931976, "learning_rate": 5.639963167587478e-07, "loss": 0.01560157723724842, "memory(GiB)": 90.94, "reward": 0.9232500195503235, "reward_std": 0.10943738371133804, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9040625095367432, "rewards/RMReward/std": 0.21199297904968262, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 735, "train_speed(iter/s)": 0.016629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/mean_length": 64.0625, "completions/min_length": 13.0, "epoch": 0.01129770054953489, "frac_reward_zero_std": 0.0, "grad_norm": 7.3769025802612305, "kl": 0.13826121389865875, "learning_rate": 5.647636586863106e-07, "loss": 0.014612168073654175, "memory(GiB)": 90.94, "reward": 0.8893749713897705, "reward_std": 0.1885429322719574, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8718750476837158, "rewards/RMReward/std": 0.06574887782335281, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 736, "train_speed(iter/s)": 0.016629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 250.84375, "completions/min_length": 133.0, "epoch": 0.011313050686151106, "frac_reward_zero_std": 0.0, "grad_norm": 1.196504831314087, "kl": 0.0486183725297451, "learning_rate": 5.655310006138736e-07, "loss": -0.0031020119786262512, "memory(GiB)": 90.94, "reward": 0.4960615336894989, "reward_std": 0.049465298652648926, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9524999856948853, "rewards/RMReward/std": 0.055677637457847595, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.030123062431812286, "rewards/VisualPerceptionAccuracy/std": 0.05438845977187157, "step": 737, "train_speed(iter/s)": 0.01661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 196.5625, "completions/min_length": 89.0, "epoch": 0.011328400822767322, "frac_reward_zero_std": 0.0, "grad_norm": 1.498199701309204, "kl": 0.0666193813085556, "learning_rate": 5.662983425414365e-07, "loss": -0.01779079996049404, "memory(GiB)": 90.94, "reward": 0.7996217012405396, "reward_std": 0.0964362770318985, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9106084108352661, "rewards/PlanningActionSetORM/std": 0.07054568082094193, "rewards/RMReward/mean": 0.7718750238418579, "rewards/RMReward/std": 0.13496564328670502, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 738, "train_speed(iter/s)": 0.016601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 5.0, "completions/min_length": 2.0, "epoch": 0.01134375095938354, "frac_reward_zero_std": 0.0, "grad_norm": 82.78768157958984, "kl": 0.30029296875, "learning_rate": 5.670656844689995e-07, "loss": 0.0003014206886291504, "memory(GiB)": 90.94, "reward": 0.38749998807907104, "reward_std": 0.46889573335647583, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": 0.25, "rewards/VisualPerceptionAccuracy/std": 0.44721361994743347, "step": 739, "train_speed(iter/s)": 0.016621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 183.34375, "completions/min_length": 107.0, "epoch": 0.011359101095999755, "frac_reward_zero_std": 0.0, "grad_norm": 1.6745718717575073, "kl": 0.07227885723114014, "learning_rate": 5.678330263965623e-07, "loss": -0.000632312148809433, "memory(GiB)": 90.94, "reward": 0.9295576810836792, "reward_std": 0.03556394204497337, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9615384340286255, "rewards/PlanningActionSetORM/std": 0.03907695785164833, "rewards/RMReward/mean": 0.9215624928474426, "rewards/RMReward/std": 0.08390121906995773, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 740, "train_speed(iter/s)": 0.016602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 102.4375, "completions/min_length": 86.0, "epoch": 0.01137445123261597, "frac_reward_zero_std": 0.0, "grad_norm": 2.415585517883301, "kl": 0.08440542221069336, "learning_rate": 5.686003683241253e-07, "loss": -0.0199158675968647, "memory(GiB)": 90.94, "reward": 0.8067187070846558, "reward_std": 0.08142437040805817, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8960937261581421, "rewards/PlanningActionSetORM/std": 0.05240558460354805, "rewards/RMReward/mean": 0.7843749523162842, "rewards/RMReward/std": 0.12727762758731842, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 741, "train_speed(iter/s)": 0.016605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/mean_length": 57.15625, "completions/min_length": 8.0, "epoch": 0.011389801369232186, "frac_reward_zero_std": 0.0, "grad_norm": 53.02656173706055, "kl": 0.6285262703895569, "learning_rate": 5.693677102516882e-07, "loss": 0.002420559525489807, "memory(GiB)": 90.94, "reward": 0.5368750095367432, "reward_std": 0.18844769895076752, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8812500238418579, "rewards/RMReward/std": 0.06551080197095871, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 742, "train_speed(iter/s)": 0.016607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.5, "completions/min_length": 8.0, "epoch": 0.011405151505848402, "frac_reward_zero_std": 1.0, "grad_norm": 4.88403020426631e-05, "kl": 0.8404947519302368, "learning_rate": 5.701350521792512e-07, "loss": 0.0008407963905483484, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 743, "train_speed(iter/s)": 0.016595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/mean_length": 107.0, "completions/min_length": 101.0, "epoch": 0.011420501642464618, "frac_reward_zero_std": 0.0, "grad_norm": 1.4685876369476318, "kl": 0.09461850672960281, "learning_rate": 5.70902394106814e-07, "loss": -0.0007867813110351562, "memory(GiB)": 90.94, "reward": 0.9632500410079956, "reward_std": 0.02273278869688511, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.9696875214576721, "rewards/RMReward/std": 0.0351480171084404, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 744, "train_speed(iter/s)": 0.016583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 232.875, "completions/min_length": 138.0, "epoch": 0.011435851779080833, "frac_reward_zero_std": 0.0, "grad_norm": 1.429543375968933, "kl": 0.04555128514766693, "learning_rate": 5.716697360343769e-07, "loss": 0.011514425277709961, "memory(GiB)": 90.94, "reward": 0.6110386848449707, "reward_std": 0.14681176841259003, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9981249570846558, "rewards/RMReward/std": 0.0040311249904334545, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.22357742488384247, "rewards/VisualPerceptionAccuracy/std": 0.29039865732192993, "step": 745, "train_speed(iter/s)": 0.016582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 155.9375, "completions/min_length": 86.0, "epoch": 0.01145120191569705, "frac_reward_zero_std": 0.0, "grad_norm": 2.2059459686279297, "kl": 0.08804452419281006, "learning_rate": 5.724370779619399e-07, "loss": -0.05820825323462486, "memory(GiB)": 90.94, "reward": 0.7487499713897705, "reward_std": 0.09295308589935303, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9124999642372131, "rewards/PlanningActionSetORM/std": 0.15644744038581848, "rewards/RMReward/mean": 0.707812488079071, "rewards/RMReward/std": 0.12320280820131302, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 746, "train_speed(iter/s)": 0.016577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/mean_length": 65.375, "completions/min_length": 8.0, "epoch": 0.011466552052313266, "frac_reward_zero_std": 0.0, "grad_norm": 25.03678321838379, "kl": 0.3627682328224182, "learning_rate": 5.732044198895028e-07, "loss": -0.018503714352846146, "memory(GiB)": 90.94, "reward": 0.523187518119812, "reward_std": 0.1485716998577118, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9212499856948853, "rewards/RMReward/std": 0.07455422729253769, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 747, "train_speed(iter/s)": 0.016577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.5, "completions/min_length": 8.0, "epoch": 0.011481902188929482, "frac_reward_zero_std": 1.0, "grad_norm": 1.8996637663803995e-05, "kl": 0.9518229365348816, "learning_rate": 5.739717618170657e-07, "loss": 0.0009513530530966818, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 748, "train_speed(iter/s)": 0.016565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 186.78125, "completions/min_length": 106.0, "epoch": 0.011497252325545698, "frac_reward_zero_std": 0.0, "grad_norm": 1.118830680847168, "kl": 0.07310517132282257, "learning_rate": 5.747391037446286e-07, "loss": -0.08148396015167236, "memory(GiB)": 90.94, "reward": 0.8786388635635376, "reward_std": 0.08228729665279388, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.956944465637207, "rewards/PlanningActionSetORM/std": 0.04652392864227295, "rewards/RMReward/mean": 0.8590624928474426, "rewards/RMReward/std": 0.16091188788414001, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 749, "train_speed(iter/s)": 0.016535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/mean_length": 85.40625, "completions/min_length": 69.0, "epoch": 0.011512602462161913, "frac_reward_zero_std": 0.0, "grad_norm": 1.4098490476608276, "kl": 0.11817610263824463, "learning_rate": 5.755064456721916e-07, "loss": -0.0004579015076160431, "memory(GiB)": 90.94, "reward": 0.824999988079071, "reward_std": 0.07659415155649185, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.1463259607553482, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 750, "train_speed(iter/s)": 0.016517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/mean_length": 154.15625, "completions/min_length": 14.0, "epoch": 0.011527952598778129, "frac_reward_zero_std": 0.0, "grad_norm": 8.254073143005371, "kl": 0.14777159690856934, "learning_rate": 5.762737875997545e-07, "loss": -0.025971882045269012, "memory(GiB)": 90.94, "reward": 0.7254170775413513, "reward_std": 0.2369268536567688, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.5102091431617737, "rewards/VisualPerceptionAccuracy/std": 0.23635374009609222, "step": 751, "train_speed(iter/s)": 0.016533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 123.5625, "completions/min_length": 14.0, "epoch": 0.011543302735394344, "frac_reward_zero_std": 0.0, "grad_norm": 14.082786560058594, "kl": 0.02811337262392044, "learning_rate": 5.770411295273175e-07, "loss": -0.012829571962356567, "memory(GiB)": 90.94, "reward": 0.8704702854156494, "reward_std": 0.29301896691322327, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9078283309936523, "rewards/PlanningActionSetORM/std": 0.0050505101680755615, "rewards/RMReward/mean": 0.921875, "rewards/RMReward/std": 0.25361964106559753, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 752, "train_speed(iter/s)": 0.016537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.3125, "completions/min_length": 8.0, "epoch": 0.01155865287201056, "frac_reward_zero_std": 0.0, "grad_norm": 111.1207275390625, "kl": 0.8899438977241516, "learning_rate": 5.778084714548803e-07, "loss": -0.02217058464884758, "memory(GiB)": 90.94, "reward": 0.46562498807907104, "reward_std": 0.4707540273666382, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.504016101360321, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 753, "train_speed(iter/s)": 0.016539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 239.5625, "completions/min_length": 129.0, "epoch": 0.011574003008626777, "frac_reward_zero_std": 0.0, "grad_norm": 1.7363215684890747, "kl": 0.07408386468887329, "learning_rate": 5.785758133824432e-07, "loss": -0.00841906201094389, "memory(GiB)": 90.94, "reward": 0.6016032695770264, "reward_std": 0.08360141515731812, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7999999523162842, "rewards/RMReward/std": 0.05163978412747383, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.36320650577545166, "rewards/VisualPerceptionAccuracy/std": 0.12589101493358612, "step": 754, "train_speed(iter/s)": 0.016544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 188.6875, "completions/min_length": 67.0, "epoch": 0.011589353145242993, "frac_reward_zero_std": 0.0, "grad_norm": 1.489295482635498, "kl": 0.0970931127667427, "learning_rate": 5.793431553100062e-07, "loss": -0.014029841870069504, "memory(GiB)": 90.94, "reward": 0.924708366394043, "reward_std": 0.05227117985486984, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9197916984558105, "rewards/PlanningActionSetORM/std": 0.0609714575111866, "rewards/RMReward/mean": 0.9259375333786011, "rewards/RMReward/std": 0.10956981778144836, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 755, "train_speed(iter/s)": 0.016541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/mean_length": 114.3125, "completions/min_length": 102.0, "epoch": 0.011604703281859209, "frac_reward_zero_std": 0.0, "grad_norm": 1.8809616565704346, "kl": 0.10663272440433502, "learning_rate": 5.80110497237569e-07, "loss": 0.059276286512613297, "memory(GiB)": 90.94, "reward": 0.8829166889190674, "reward_std": 0.09183105081319809, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8833333253860474, "rewards/PlanningActionSetORM/std": 0.028476862236857414, "rewards/RMReward/mean": 0.8828125, "rewards/RMReward/std": 0.16244570910930634, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 756, "train_speed(iter/s)": 0.016521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 345.09375, "completions/min_length": 122.0, "epoch": 0.011620053418475424, "frac_reward_zero_std": 0.0, "grad_norm": 1.0891003608703613, "kl": 0.05513135343790054, "learning_rate": 5.80877839165132e-07, "loss": -0.15711495280265808, "memory(GiB)": 90.94, "reward": 0.5760444402694702, "reward_std": 0.10146810859441757, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9800000190734863, "rewards/RMReward/std": 0.030331509187817574, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1680888831615448, "rewards/VisualPerceptionAccuracy/std": 0.17867101728916168, "step": 757, "train_speed(iter/s)": 0.016522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/mean_length": 58.3125, "completions/min_length": 8.0, "epoch": 0.01163540355509164, "frac_reward_zero_std": 0.0, "grad_norm": 41.66611862182617, "kl": 0.7026523351669312, "learning_rate": 5.816451810926949e-07, "loss": 0.026120122522115707, "memory(GiB)": 90.94, "reward": 0.6868749856948853, "reward_std": 0.2524999976158142, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9906250238418579, "rewards/RMReward/std": 0.03749999403953552, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 758, "train_speed(iter/s)": 0.01651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 128.25, "completions/min_length": 8.0, "epoch": 0.011650753691707856, "frac_reward_zero_std": 0.0, "grad_norm": 16.21059799194336, "kl": 0.7439602613449097, "learning_rate": 5.824125230202579e-07, "loss": 0.00258747860789299, "memory(GiB)": 90.94, "reward": 0.7340625524520874, "reward_std": 0.15955442190170288, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.40937501192092896, "rewards/RMReward/std": 0.10201103240251541, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 759, "train_speed(iter/s)": 0.01651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/mean_length": 68.59375, "completions/min_length": 13.0, "epoch": 0.011666103828324071, "frac_reward_zero_std": 0.0, "grad_norm": 9.233336448669434, "kl": 0.21446748077869415, "learning_rate": 5.831798649478208e-07, "loss": -0.012244362384080887, "memory(GiB)": 90.94, "reward": 0.6205752491950989, "reward_std": 0.20106080174446106, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.35990050435066223, "rewards/VisualPerceptionAccuracy/std": 0.07763480395078659, "step": 760, "train_speed(iter/s)": 0.016528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/mean_length": 202.3125, "completions/min_length": 138.0, "epoch": 0.011681453964940289, "frac_reward_zero_std": 0.0, "grad_norm": 1.5649653673171997, "kl": 0.06645673513412476, "learning_rate": 5.839472068753837e-07, "loss": -0.006860699504613876, "memory(GiB)": 90.94, "reward": 0.6231355667114258, "reward_std": 0.05700242891907692, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.996874988079071, "rewards/RMReward/std": 0.006020791828632355, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.24877110123634338, "rewards/VisualPerceptionAccuracy/std": 0.10918822139501572, "step": 761, "train_speed(iter/s)": 0.016535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 79.0, "completions/min_length": 8.0, "epoch": 0.011696804101556504, "frac_reward_zero_std": 0.0, "grad_norm": 58.18812561035156, "kl": 0.31281498074531555, "learning_rate": 5.847145488029467e-07, "loss": 0.031629446893930435, "memory(GiB)": 90.94, "reward": 0.3684464395046234, "reward_std": 0.3232981562614441, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.1525178998708725, "rewards/VisualPerceptionAccuracy/std": 0.15986618399620056, "step": 762, "train_speed(iter/s)": 0.016553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/mean_length": 112.8125, "completions/min_length": 101.0, "epoch": 0.01171215423817272, "frac_reward_zero_std": 0.0, "grad_norm": 1.066393494606018, "kl": 0.10404053330421448, "learning_rate": 5.854818907305096e-07, "loss": 0.0027270540595054626, "memory(GiB)": 90.94, "reward": 0.8787499666213989, "reward_std": 0.05284091830253601, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8484375476837158, "rewards/RMReward/std": 0.06535802781581879, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 763, "train_speed(iter/s)": 0.01653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/mean_length": 111.78125, "completions/min_length": 77.0, "epoch": 0.011727504374788936, "frac_reward_zero_std": 0.0, "grad_norm": 3.2841548919677734, "kl": 0.058670252561569214, "learning_rate": 5.862492326580725e-07, "loss": 0.027988338842988014, "memory(GiB)": 90.94, "reward": 0.3172915279865265, "reward_std": 0.18964883685112, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3172915279865265, "rewards/VisualPerceptionAccuracy/std": 0.19344203174114227, "step": 764, "train_speed(iter/s)": 0.016548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/mean_length": 62.65625, "completions/min_length": 2.0, "epoch": 0.011742854511405151, "frac_reward_zero_std": 0.0, "grad_norm": 65.56890106201172, "kl": 0.05890627205371857, "learning_rate": 5.870165745856354e-07, "loss": 0.03037305548787117, "memory(GiB)": 90.94, "reward": 0.5144791603088379, "reward_std": 0.1963082104921341, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9947916865348816, "rewards/PlanningActionSetORM/std": 0.020833328366279602, "rewards/RMReward/mean": 0.8812500238418579, "rewards/RMReward/std": 0.06291528046131134, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.125, "rewards/VisualPerceptionAccuracy/std": 0.3415650427341461, "step": 765, "train_speed(iter/s)": 0.016544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 182.0625, "completions/min_length": 90.0, "epoch": 0.011758204648021367, "frac_reward_zero_std": 0.0, "grad_norm": 1.2670422792434692, "kl": 0.06256899237632751, "learning_rate": 5.877839165131983e-07, "loss": -0.09984764456748962, "memory(GiB)": 90.94, "reward": 0.8448317646980286, "reward_std": 0.07234068959951401, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9554086327552795, "rewards/PlanningActionSetORM/std": 0.04416311904788017, "rewards/RMReward/mean": 0.8171875476837158, "rewards/RMReward/std": 0.09298748522996902, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 766, "train_speed(iter/s)": 0.016523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 127.125, "completions/min_length": 79.0, "epoch": 0.011773554784637583, "frac_reward_zero_std": 0.0, "grad_norm": 2.2668018341064453, "kl": 0.04710844159126282, "learning_rate": 5.885512584407613e-07, "loss": -0.011392777785658836, "memory(GiB)": 90.94, "reward": 0.7184423208236694, "reward_std": 0.08349347114562988, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7406250238418579, "rewards/RMReward/std": 0.10201103985309601, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6443846225738525, "rewards/VisualPerceptionAccuracy/std": 0.08537811785936356, "step": 767, "train_speed(iter/s)": 0.016492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/mean_length": 108.375, "completions/min_length": 72.0, "epoch": 0.011788904921253798, "frac_reward_zero_std": 0.0, "grad_norm": 2.3201043605804443, "kl": 0.10029622912406921, "learning_rate": 5.893186003683242e-07, "loss": -0.024775858968496323, "memory(GiB)": 90.94, "reward": 0.5079743266105652, "reward_std": 0.02591574937105179, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9881250262260437, "rewards/RMReward/std": 0.019397171214222908, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0254486296325922, "rewards/VisualPerceptionAccuracy/std": 0.036313772201538086, "step": 768, "train_speed(iter/s)": 0.016486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/mean_length": 225.6875, "completions/min_length": 108.0, "epoch": 0.011804255057870016, "frac_reward_zero_std": 0.0, "grad_norm": 1.3831443786621094, "kl": 0.04721491038799286, "learning_rate": 5.900859422958871e-07, "loss": 0.10347548127174377, "memory(GiB)": 90.94, "reward": 0.5295187830924988, "reward_std": 0.120096355676651, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7687499523162842, "rewards/RMReward/std": 0.09287088364362717, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.24403750896453857, "rewards/VisualPerceptionAccuracy/std": 0.16589601337909698, "step": 769, "train_speed(iter/s)": 0.016484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 115.71875, "completions/min_length": 101.0, "epoch": 0.011819605194486231, "frac_reward_zero_std": 0.0, "grad_norm": 2.317918062210083, "kl": 0.09837784618139267, "learning_rate": 5.9085328422345e-07, "loss": 0.030645744875073433, "memory(GiB)": 90.94, "reward": 0.8450000286102295, "reward_std": 0.07005490362644196, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8062499761581421, "rewards/RMReward/std": 0.08683503419160843, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 770, "train_speed(iter/s)": 0.016488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/mean_length": 172.5, "completions/min_length": 155.0, "epoch": 0.011834955331102447, "frac_reward_zero_std": 0.0, "grad_norm": 0.2695406973361969, "kl": 0.058242253959178925, "learning_rate": 5.91620626151013e-07, "loss": 5.6974589824676514e-05, "memory(GiB)": 90.94, "reward": 0.977388858795166, "reward_std": 0.029906228184700012, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.9856250286102295, "rewards/RMReward/std": 0.04543250799179077, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 771, "train_speed(iter/s)": 0.01647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 167.59375, "completions/min_length": 8.0, "epoch": 0.011850305467718663, "frac_reward_zero_std": 0.0, "grad_norm": 26.490474700927734, "kl": 0.3911944031715393, "learning_rate": 5.923879680785759e-07, "loss": -0.0540393590927124, "memory(GiB)": 90.94, "reward": 0.4367815852165222, "reward_std": 0.19633153080940247, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9484408497810364, "rewards/PlanningActionSetORM/std": 0.028115753084421158, "rewards/RMReward/mean": 0.7181249856948853, "rewards/RMReward/std": 0.19644230604171753, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 772, "train_speed(iter/s)": 0.016466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/mean_length": 138.59375, "completions/min_length": 102.0, "epoch": 0.011865655604334878, "frac_reward_zero_std": 0.0, "grad_norm": 1.9494116306304932, "kl": 0.09872715175151825, "learning_rate": 5.931553100061387e-07, "loss": 0.01742669567465782, "memory(GiB)": 90.94, "reward": 0.9279375076293945, "reward_std": 0.04888708144426346, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9921875, "rewards/PlanningActionSetORM/std": 0.03074183501303196, "rewards/RMReward/mean": 0.9118750095367432, "rewards/RMReward/std": 0.11732055246829987, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 773, "train_speed(iter/s)": 0.016463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 137.3125, "completions/min_length": 8.0, "epoch": 0.011881005740951094, "frac_reward_zero_std": 0.0, "grad_norm": 22.38896942138672, "kl": 0.563018262386322, "learning_rate": 5.939226519337017e-07, "loss": 0.009029172360897064, "memory(GiB)": 90.94, "reward": 0.9663125276565552, "reward_std": 0.12674999237060547, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9900000095367432, "rewards/RMReward/std": 0.020000005140900612, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 774, "train_speed(iter/s)": 0.016453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 102.0625, "completions/min_length": 78.0, "epoch": 0.01189635587756731, "frac_reward_zero_std": 0.0, "grad_norm": 2.3439340591430664, "kl": 0.1042470708489418, "learning_rate": 5.946899938612646e-07, "loss": 0.010901855304837227, "memory(GiB)": 90.94, "reward": 0.8138541579246521, "reward_std": 0.07474301755428314, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9505208134651184, "rewards/PlanningActionSetORM/std": 0.06561460345983505, "rewards/RMReward/mean": 0.7796875238418579, "rewards/RMReward/std": 0.11699143797159195, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 775, "train_speed(iter/s)": 0.016459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 168.375, "completions/min_length": 72.0, "epoch": 0.011911706014183527, "frac_reward_zero_std": 0.0, "grad_norm": 2.4093425273895264, "kl": 0.08924457430839539, "learning_rate": 5.954573357888276e-07, "loss": 0.01530434936285019, "memory(GiB)": 90.94, "reward": 0.5279443264007568, "reward_std": 0.16888204216957092, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.03415650874376297, "rewards/RMReward/mean": 0.7437499761581421, "rewards/RMReward/std": 0.09639330208301544, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.26338857412338257, "rewards/VisualPerceptionAccuracy/std": 0.2571415603160858, "step": 776, "train_speed(iter/s)": 0.016461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 138.5, "completions/min_length": 100.0, "epoch": 0.011927056150799743, "frac_reward_zero_std": 0.0, "grad_norm": 1.4947118759155273, "kl": 0.08158189058303833, "learning_rate": 5.962246777163904e-07, "loss": 0.056072767823934555, "memory(GiB)": 90.94, "reward": 0.9002499580383301, "reward_std": 0.04903451353311539, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8753124475479126, "rewards/RMReward/std": 0.08139842748641968, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 777, "train_speed(iter/s)": 0.016458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 12.25, "completions/min_length": 8.0, "epoch": 0.011942406287415958, "frac_reward_zero_std": 1.0, "grad_norm": 0.00021011955686844885, "kl": 0.534877598285675, "learning_rate": 5.969920196439534e-07, "loss": 0.0005340364878065884, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 778, "train_speed(iter/s)": 0.016441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/mean_length": 304.3125, "completions/min_length": 112.0, "epoch": 0.011957756424032174, "frac_reward_zero_std": 0.0, "grad_norm": 1.2665060758590698, "kl": 0.07372942566871643, "learning_rate": 5.977593615715163e-07, "loss": -0.061452023684978485, "memory(GiB)": 90.94, "reward": 0.5644311904907227, "reward_std": 0.14481939375400543, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9243749976158142, "rewards/RMReward/std": 0.1150633841753006, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.18936240673065186, "rewards/VisualPerceptionAccuracy/std": 0.19758811593055725, "step": 779, "train_speed(iter/s)": 0.016435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 290.375, "completions/min_length": 154.0, "epoch": 0.01197310656064839, "frac_reward_zero_std": 0.0, "grad_norm": 0.4586198329925537, "kl": 0.04747578129172325, "learning_rate": 5.985267034990793e-07, "loss": -0.001524588093161583, "memory(GiB)": 90.94, "reward": 0.8382500410079956, "reward_std": 0.12179352343082428, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7978125214576721, "rewards/RMReward/std": 0.19777363538742065, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 780, "train_speed(iter/s)": 0.01643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/mean_length": 112.78125, "completions/min_length": 105.0, "epoch": 0.011988456697264605, "frac_reward_zero_std": 0.0, "grad_norm": 1.6742323637008667, "kl": 0.1125965267419815, "learning_rate": 5.992940454266422e-07, "loss": 0.033168841153383255, "memory(GiB)": 90.94, "reward": 0.8550000190734863, "reward_std": 0.07388974726200104, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.8343750238418579, "rewards/RMReward/std": 0.11600715667009354, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 781, "train_speed(iter/s)": 0.016415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/mean_length": 378.90625, "completions/min_length": 82.0, "epoch": 0.01200380683388082, "frac_reward_zero_std": 0.0, "grad_norm": 1.319633960723877, "kl": 0.060789305716753006, "learning_rate": 6.00061387354205e-07, "loss": -0.08440998941659927, "memory(GiB)": 90.94, "reward": 0.4404691457748413, "reward_std": 0.0868183821439743, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9249999523162842, "rewards/PlanningActionSetORM/std": 0.16124515235424042, "rewards/RMReward/mean": 0.746874988079071, "rewards/RMReward/std": 0.09568830579519272, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09843823313713074, "rewards/VisualPerceptionAccuracy/std": 0.09639254957437515, "step": 782, "train_speed(iter/s)": 0.016421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/mean_length": 104.375, "completions/min_length": 96.0, "epoch": 0.012019156970497038, "frac_reward_zero_std": 0.0, "grad_norm": 2.16178560256958, "kl": 0.10270880162715912, "learning_rate": 6.00828729281768e-07, "loss": -0.011389240622520447, "memory(GiB)": 90.94, "reward": 0.8187500238418579, "reward_std": 0.05992849916219711, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.7890625, "rewards/RMReward/std": 0.08774447441101074, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 783, "train_speed(iter/s)": 0.016424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.012034507107113254, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010847497469512746, "kl": 0.8046875, "learning_rate": 6.015960712093309e-07, "loss": 0.0008046142756938934, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 784, "train_speed(iter/s)": 0.016436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 193.0625, "completions/min_length": 106.0, "epoch": 0.01204985724372947, "frac_reward_zero_std": 0.0, "grad_norm": 3.0630569458007812, "kl": 0.06946271657943726, "learning_rate": 6.023634131368939e-07, "loss": -0.0004491843283176422, "memory(GiB)": 90.94, "reward": 0.8422499895095825, "reward_std": 0.11036403477191925, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8028124570846558, "rewards/RMReward/std": 0.1832367330789566, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 785, "train_speed(iter/s)": 0.016434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/mean_length": 157.1875, "completions/min_length": 99.0, "epoch": 0.012065207380345685, "frac_reward_zero_std": 0.0, "grad_norm": 2.691588878631592, "kl": 0.1045052707195282, "learning_rate": 6.031307550644567e-07, "loss": 0.11956378817558289, "memory(GiB)": 90.94, "reward": 0.6659615635871887, "reward_std": 0.12878535687923431, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8781249523162842, "rewards/RMReward/std": 0.07951676100492477, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.42942315340042114, "rewards/VisualPerceptionAccuracy/std": 0.1939573436975479, "step": 786, "train_speed(iter/s)": 0.016414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/mean_length": 67.25, "completions/min_length": 14.0, "epoch": 0.0120805575169619, "frac_reward_zero_std": 0.0, "grad_norm": 20.467811584472656, "kl": 0.11663760244846344, "learning_rate": 6.038980969920197e-07, "loss": 0.011158913373947144, "memory(GiB)": 90.94, "reward": 0.5712500214576721, "reward_std": 0.24436387419700623, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9624999761581421, "rewards/PlanningActionSetORM/std": 0.12583057582378387, "rewards/RMReward/mean": 0.828125, "rewards/RMReward/std": 0.07951676100492477, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 787, "train_speed(iter/s)": 0.016418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.012095907653578116, "frac_reward_zero_std": 1.0, "grad_norm": 8.940585394157097e-05, "kl": 0.67578125, "learning_rate": 6.046654389195826e-07, "loss": 0.0006759911775588989, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 788, "train_speed(iter/s)": 0.016407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/mean_length": 71.375, "completions/min_length": 8.0, "epoch": 0.012111257790194332, "frac_reward_zero_std": 0.0, "grad_norm": 22.26667594909668, "kl": 0.4579291045665741, "learning_rate": 6.054327808471456e-07, "loss": -0.02596811205148697, "memory(GiB)": 90.94, "reward": 0.17184646427631378, "reward_std": 0.15241128206253052, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.23431792855262756, "rewards/VisualPerceptionAccuracy/std": 0.06732256710529327, "step": 789, "train_speed(iter/s)": 0.016425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/mean_length": 56.0, "completions/min_length": 8.0, "epoch": 0.012126607926810548, "frac_reward_zero_std": 0.0, "grad_norm": 32.27520751953125, "kl": 0.5532602071762085, "learning_rate": 6.062001227747084e-07, "loss": 0.000551614910364151, "memory(GiB)": 90.94, "reward": 0.9453125, "reward_std": 0.21875, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9375, "rewards/RMReward/std": 0.25, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 790, "train_speed(iter/s)": 0.016428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 234.90625, "completions/min_length": 164.0, "epoch": 0.012141958063426765, "frac_reward_zero_std": 0.0, "grad_norm": 0.9440630674362183, "kl": 0.05079951137304306, "learning_rate": 6.069674647022713e-07, "loss": 0.043728724122047424, "memory(GiB)": 90.94, "reward": 0.8494091033935547, "reward_std": 0.07617150247097015, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9545454978942871, "rewards/PlanningActionSetORM/std": 0.046181850135326385, "rewards/RMReward/mean": 0.8231250047683716, "rewards/RMReward/std": 0.208797425031662, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 791, "train_speed(iter/s)": 0.016405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/mean_length": 117.78125, "completions/min_length": 103.0, "epoch": 0.01215730820004298, "frac_reward_zero_std": 0.0, "grad_norm": 1.5200475454330444, "kl": 0.11806297302246094, "learning_rate": 6.077348066298343e-07, "loss": -0.012952517718076706, "memory(GiB)": 90.94, "reward": 0.8450000286102295, "reward_std": 0.06860017776489258, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8062500357627869, "rewards/RMReward/std": 0.08867301046848297, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 792, "train_speed(iter/s)": 0.01641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/mean_length": 106.53125, "completions/min_length": 105.0, "epoch": 0.012172658336659196, "frac_reward_zero_std": 0.0, "grad_norm": 0.8890234231948853, "kl": 0.11831538379192352, "learning_rate": 6.085021485573972e-07, "loss": -0.0013174116611480713, "memory(GiB)": 90.94, "reward": 0.9637500047683716, "reward_std": 0.01866261661052704, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.9703124761581421, "rewards/RMReward/std": 0.030741842463612556, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 793, "train_speed(iter/s)": 0.016388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 107.9375, "completions/min_length": 67.0, "epoch": 0.012188008473275412, "frac_reward_zero_std": 0.0, "grad_norm": 2.1862142086029053, "kl": 0.10983430594205856, "learning_rate": 6.092694904849601e-07, "loss": 0.08359171450138092, "memory(GiB)": 90.94, "reward": 0.9067500233650208, "reward_std": 0.07163029909133911, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8834375143051147, "rewards/RMReward/std": 0.12643012404441833, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 794, "train_speed(iter/s)": 0.016378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2049.0, "completions/mean_length": 680.9375, "completions/min_length": 150.0, "epoch": 0.012203358609891628, "frac_reward_zero_std": 0.0, "grad_norm": 0.6656572818756104, "kl": 0.025553904473781586, "learning_rate": 6.10036832412523e-07, "loss": -0.29022592306137085, "memory(GiB)": 90.94, "reward": 0.6486064195632935, "reward_std": 0.16571637988090515, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9230769276618958, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9118750095367432, "rewards/RMReward/std": 0.061015695333480835, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.38309741020202637, "rewards/VisualPerceptionAccuracy/std": 0.2826201915740967, "step": 795, "train_speed(iter/s)": 0.01637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 472.09375, "completions/min_length": 120.0, "epoch": 0.012218708746507843, "frac_reward_zero_std": 0.0, "grad_norm": 1.6930272579193115, "kl": 0.06961572170257568, "learning_rate": 6.10804174340086e-07, "loss": -0.08779796212911606, "memory(GiB)": 90.94, "reward": 0.5487725138664246, "reward_std": 0.091922827064991, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.871874988079071, "rewards/RMReward/std": 0.06574887782335281, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.20004507899284363, "rewards/VisualPerceptionAccuracy/std": 0.13124656677246094, "step": 796, "train_speed(iter/s)": 0.016358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/mean_length": 38.25, "completions/min_length": 8.0, "epoch": 0.012234058883124059, "frac_reward_zero_std": 0.0, "grad_norm": 31.83102035522461, "kl": 0.6311763525009155, "learning_rate": 6.11571516267649e-07, "loss": 0.002819061279296875, "memory(GiB)": 90.94, "reward": 0.590749979019165, "reward_std": 0.25554487109184265, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8675000071525574, "rewards/RMReward/std": 0.10779610276222229, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 797, "train_speed(iter/s)": 0.016356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/mean_length": 58.28125, "completions/min_length": 8.0, "epoch": 0.012249409019740276, "frac_reward_zero_std": 0.0, "grad_norm": 89.9682846069336, "kl": 0.3768978416919708, "learning_rate": 6.123388581952118e-07, "loss": 0.02026926726102829, "memory(GiB)": 90.94, "reward": 0.5953124761581421, "reward_std": 0.2880864441394806, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.6875, "rewards/RMReward/std": 0.11180340498685837, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 798, "train_speed(iter/s)": 0.016354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/mean_length": 61.65625, "completions/min_length": 14.0, "epoch": 0.012264759156356492, "frac_reward_zero_std": 0.0, "grad_norm": 7.302554130554199, "kl": 0.15382198989391327, "learning_rate": 6.131062001227748e-07, "loss": 0.008636362850666046, "memory(GiB)": 90.94, "reward": 0.9191250205039978, "reward_std": 0.17840920388698578, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9774999618530273, "rewards/RMReward/std": 0.04041452705860138, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 799, "train_speed(iter/s)": 0.016357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/mean_length": 345.375, "completions/min_length": 100.0, "epoch": 0.012280109292972708, "frac_reward_zero_std": 0.0, "grad_norm": 0.8100487589836121, "kl": 0.08192047476768494, "learning_rate": 6.138735420503377e-07, "loss": -0.07611434906721115, "memory(GiB)": 90.94, "reward": 0.7040714621543884, "reward_std": 0.07216080278158188, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.887499988079071, "rewards/RMReward/std": 0.05627313256263733, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.49814292788505554, "rewards/VisualPerceptionAccuracy/std": 0.09930310398340225, "step": 800, "train_speed(iter/s)": 0.016362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/mean_length": 116.21875, "completions/min_length": 98.0, "epoch": 0.012295459429588923, "frac_reward_zero_std": 0.0, "grad_norm": 2.0671956539154053, "kl": 0.12597984075546265, "learning_rate": 6.146408839779007e-07, "loss": 0.018215373158454895, "memory(GiB)": 90.94, "reward": 0.8524999618530273, "reward_std": 0.06252051889896393, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8156249523162842, "rewards/RMReward/std": 0.11807426810264587, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 801, "train_speed(iter/s)": 0.01633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/mean_length": 202.5, "completions/min_length": 166.0, "epoch": 0.012310809566205139, "frac_reward_zero_std": 0.0, "grad_norm": 0.0005215367418713868, "kl": 0.07076107710599899, "learning_rate": 6.154082259054636e-07, "loss": 7.086992263793945e-05, "memory(GiB)": 90.94, "reward": 0.9012500047683716, "reward_std": 0.040471553802490234, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8765624761581421, "rewards/RMReward/std": 0.133189395070076, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 802, "train_speed(iter/s)": 0.016318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/mean_length": 58.375, "completions/min_length": 8.0, "epoch": 0.012326159702821355, "frac_reward_zero_std": 0.0, "grad_norm": 44.21424102783203, "kl": 0.6291332244873047, "learning_rate": 6.161755678330264e-07, "loss": 0.0006736218929290771, "memory(GiB)": 90.94, "reward": 0.8610000014305115, "reward_std": 0.21903453767299652, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9493750333786011, "rewards/RMReward/std": 0.016520196571946144, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 803, "train_speed(iter/s)": 0.016321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2020.0, "completions/mean_length": 557.6875, "completions/min_length": 83.0, "epoch": 0.01234150983943757, "frac_reward_zero_std": 0.0, "grad_norm": 1.268349051475525, "kl": 0.04978014528751373, "learning_rate": 6.169429097605894e-07, "loss": -0.1529892086982727, "memory(GiB)": 90.94, "reward": 0.17739072442054749, "reward_std": 0.12801602482795715, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.17739072442054749, "rewards/VisualPerceptionAccuracy/std": 0.19222277402877808, "step": 804, "train_speed(iter/s)": 0.016323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/mean_length": 265.71875, "completions/min_length": 102.0, "epoch": 0.012356859976053788, "frac_reward_zero_std": 0.0, "grad_norm": 1.5230895280838013, "kl": 0.06673093140125275, "learning_rate": 6.177102516881523e-07, "loss": 0.009079881012439728, "memory(GiB)": 90.94, "reward": 0.8031222820281982, "reward_std": 0.07081723213195801, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9468612670898438, "rewards/PlanningActionSetORM/std": 0.07356841117143631, "rewards/RMReward/mean": 0.7671874761581421, "rewards/RMReward/std": 0.09034822881221771, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 805, "train_speed(iter/s)": 0.016287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/mean_length": 83.125, "completions/min_length": 8.0, "epoch": 0.012372210112670003, "frac_reward_zero_std": 0.0, "grad_norm": 28.515260696411133, "kl": 0.4575609266757965, "learning_rate": 6.184775936157153e-07, "loss": 0.005236785858869553, "memory(GiB)": 90.94, "reward": 0.8321875333786011, "reward_std": 0.2114369124174118, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8031250238418579, "rewards/RMReward/std": 0.04989573732018471, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 806, "train_speed(iter/s)": 0.016288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/mean_length": 314.5, "completions/min_length": 198.0, "epoch": 0.012387560249286219, "frac_reward_zero_std": 0.0, "grad_norm": 0.8164734840393066, "kl": 0.03757227212190628, "learning_rate": 6.192449355432781e-07, "loss": 0.0037283580750226974, "memory(GiB)": 90.94, "reward": 0.761246919631958, "reward_std": 0.10585005581378937, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9437342286109924, "rewards/PlanningActionSetORM/std": 0.055862896144390106, "rewards/RMReward/mean": 0.7156250476837158, "rewards/RMReward/std": 0.13645127415657043, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 807, "train_speed(iter/s)": 0.016253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 186.875, "completions/min_length": 102.0, "epoch": 0.012402910385902435, "frac_reward_zero_std": 0.0, "grad_norm": 1.3719042539596558, "kl": 0.10503330081701279, "learning_rate": 6.200122774708411e-07, "loss": -0.07340413331985474, "memory(GiB)": 90.94, "reward": 0.6167083978652954, "reward_std": 0.0697026401758194, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9156249761581421, "rewards/RMReward/std": 0.04366061091423035, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.30091679096221924, "rewards/VisualPerceptionAccuracy/std": 0.10447679460048676, "step": 808, "train_speed(iter/s)": 0.016242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/mean_length": 230.34375, "completions/min_length": 81.0, "epoch": 0.01241826052251865, "frac_reward_zero_std": 0.0, "grad_norm": 2.297229290008545, "kl": 0.07744991034269333, "learning_rate": 6.20779619398404e-07, "loss": 0.025358978658914566, "memory(GiB)": 90.94, "reward": 0.7069458365440369, "reward_std": 0.07321543246507645, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9921875, "rewards/PlanningActionSetORM/std": 0.03125, "rewards/RMReward/mean": 0.8400000333786011, "rewards/RMReward/std": 0.07302967458963394, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5434541702270508, "rewards/VisualPerceptionAccuracy/std": 0.08566581457853317, "step": 809, "train_speed(iter/s)": 0.016248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 123.0625, "completions/min_length": 95.0, "epoch": 0.012433610659134866, "frac_reward_zero_std": 0.0, "grad_norm": 2.39200758934021, "kl": 0.12271445244550705, "learning_rate": 6.21546961325967e-07, "loss": -0.0011704564094543457, "memory(GiB)": 90.94, "reward": 0.5945765972137451, "reward_std": 0.06368260830640793, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8843749761581421, "rewards/PlanningActionSetORM/std": 0.03145764395594597, "rewards/RMReward/mean": 0.753125011920929, "rewards/RMReward/std": 0.012500002980232239, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4097781181335449, "rewards/VisualPerceptionAccuracy/std": 0.11598173528909683, "step": 810, "train_speed(iter/s)": 0.016255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/mean_length": 142.125, "completions/min_length": 106.0, "epoch": 0.012448960795751082, "frac_reward_zero_std": 0.0, "grad_norm": 1.1797360181808472, "kl": 0.0902089774608612, "learning_rate": 6.223143032535298e-07, "loss": -0.017531774938106537, "memory(GiB)": 90.94, "reward": 0.875, "reward_std": 0.06343373656272888, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.84375, "rewards/RMReward/std": 0.11828750371932983, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 811, "train_speed(iter/s)": 0.01624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/mean_length": 160.6875, "completions/min_length": 98.0, "epoch": 0.012464310932367297, "frac_reward_zero_std": 0.0, "grad_norm": 2.0560495853424072, "kl": 0.09442120790481567, "learning_rate": 6.230816451810927e-07, "loss": 0.05028972029685974, "memory(GiB)": 90.94, "reward": 0.856429934501648, "reward_std": 0.13146522641181946, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9533997774124146, "rewards/PlanningActionSetORM/std": 0.05363380163908005, "rewards/RMReward/mean": 0.8321875333786011, "rewards/RMReward/std": 0.19063112139701843, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 812, "train_speed(iter/s)": 0.016217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 271.90625, "completions/min_length": 102.0, "epoch": 0.012479661068983515, "frac_reward_zero_std": 0.0, "grad_norm": 2.218872547149658, "kl": 0.1052217036485672, "learning_rate": 6.238489871086557e-07, "loss": -0.12773825228214264, "memory(GiB)": 90.94, "reward": 0.410847008228302, "reward_std": 0.23530957102775574, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.410847008228302, "rewards/VisualPerceptionAccuracy/std": 0.2658255100250244, "step": 813, "train_speed(iter/s)": 0.016202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1121.0, "completions/mean_length": 347.6875, "completions/min_length": 120.0, "epoch": 0.01249501120559973, "frac_reward_zero_std": 0.0, "grad_norm": 1.4764474630355835, "kl": 0.06536181271076202, "learning_rate": 6.246163290362186e-07, "loss": 0.009924419224262238, "memory(GiB)": 90.94, "reward": 0.6495445966720581, "reward_std": 0.0811368003487587, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.934374988079071, "rewards/RMReward/std": 0.030103983357548714, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.35158926248550415, "rewards/VisualPerceptionAccuracy/std": 0.13819041848182678, "step": 814, "train_speed(iter/s)": 0.016183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 170.6875, "completions/min_length": 109.0, "epoch": 0.012510361342215946, "frac_reward_zero_std": 0.0, "grad_norm": 1.2873551845550537, "kl": 0.08792547881603241, "learning_rate": 6.253836709637815e-07, "loss": 0.03712012246251106, "memory(GiB)": 90.94, "reward": 0.8812851309776306, "reward_std": 0.11852554976940155, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9464256763458252, "rewards/PlanningActionSetORM/std": 0.05519845709204674, "rewards/RMReward/mean": 0.8650000095367432, "rewards/RMReward/std": 0.17010432481765747, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 815, "train_speed(iter/s)": 0.016165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 216.3125, "completions/min_length": 8.0, "epoch": 0.012525711478832162, "frac_reward_zero_std": 0.0, "grad_norm": 20.031919479370117, "kl": 0.44151073694229126, "learning_rate": 6.261510128913445e-07, "loss": 0.014463081955909729, "memory(GiB)": 90.94, "reward": 0.5628892779350281, "reward_std": 0.2861105501651764, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9320180416107178, "rewards/PlanningActionSetORM/std": 0.0950583890080452, "rewards/RMReward/mean": 0.7406250238418579, "rewards/RMReward/std": 0.1474435031414032, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 816, "train_speed(iter/s)": 0.016154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/mean_length": 265.5, "completions/min_length": 109.0, "epoch": 0.012541061615448377, "frac_reward_zero_std": 0.0, "grad_norm": 1.3214049339294434, "kl": 0.07499167323112488, "learning_rate": 6.269183548189074e-07, "loss": -0.09188546240329742, "memory(GiB)": 90.94, "reward": 0.6748044490814209, "reward_std": 0.1161593645811081, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8968750238418579, "rewards/RMReward/std": 0.06700434535741806, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.43210890889167786, "rewards/VisualPerceptionAccuracy/std": 0.17871524393558502, "step": 817, "train_speed(iter/s)": 0.016134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/mean_length": 143.125, "completions/min_length": 126.0, "epoch": 0.012556411752064593, "frac_reward_zero_std": 0.0, "grad_norm": 1.541358470916748, "kl": 0.1077137440443039, "learning_rate": 6.276856967464704e-07, "loss": 0.0061184801161289215, "memory(GiB)": 90.94, "reward": 0.9175000190734863, "reward_std": 0.03441087529063225, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8968750238418579, "rewards/RMReward/std": 0.07718587666749954, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 818, "train_speed(iter/s)": 0.016135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/mean_length": 114.84375, "completions/min_length": 81.0, "epoch": 0.012571761888680809, "frac_reward_zero_std": 0.0, "grad_norm": 1.6645797491073608, "kl": 0.10913502424955368, "learning_rate": 6.284530386740332e-07, "loss": -0.03411467745900154, "memory(GiB)": 90.94, "reward": 0.84375, "reward_std": 0.0937928706407547, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8046875, "rewards/RMReward/std": 0.14775706827640533, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 819, "train_speed(iter/s)": 0.016139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/mean_length": 155.03125, "completions/min_length": 103.0, "epoch": 0.012587112025297026, "frac_reward_zero_std": 0.5, "grad_norm": 0.793282687664032, "kl": 0.0762275978922844, "learning_rate": 6.292203806015962e-07, "loss": 0.0017329250695183873, "memory(GiB)": 90.94, "reward": 0.9476388692855835, "reward_std": 0.019958283752202988, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.9484374523162842, "rewards/RMReward/std": 0.06284179538488388, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 820, "train_speed(iter/s)": 0.016119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2049.0, "completions/mean_length": 640.96875, "completions/min_length": 245.0, "epoch": 0.012602462161913242, "frac_reward_zero_std": 0.0, "grad_norm": 1.0535491704940796, "kl": 0.0676075667142868, "learning_rate": 6.299877225291591e-07, "loss": -0.10080472379922867, "memory(GiB)": 90.94, "reward": 0.4590115547180176, "reward_std": 0.22490566968917847, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4590115547180176, "rewards/VisualPerceptionAccuracy/std": 0.22301898896694183, "step": 821, "train_speed(iter/s)": 0.016125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2049.0, "completions/mean_length": 696.96875, "completions/min_length": 114.0, "epoch": 0.012617812298529457, "frac_reward_zero_std": 0.0, "grad_norm": 1.7382783889770508, "kl": 0.06261429935693741, "learning_rate": 6.30755064456722e-07, "loss": -0.13593822717666626, "memory(GiB)": 90.94, "reward": 0.6316408514976501, "reward_std": 0.14958110451698303, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8999999761581421, "rewards/PlanningActionSetORM/std": 0.17888543009757996, "rewards/RMReward/mean": 0.809374988079071, "rewards/RMReward/std": 0.058363091200590134, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.43578165769577026, "rewards/VisualPerceptionAccuracy/std": 0.2280285656452179, "step": 822, "train_speed(iter/s)": 0.016105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/mean_length": 74.6875, "completions/min_length": 13.0, "epoch": 0.012633162435145673, "frac_reward_zero_std": 0.0, "grad_norm": 12.825447082519531, "kl": 0.17081022262573242, "learning_rate": 6.31522406384285e-07, "loss": -0.030421875417232513, "memory(GiB)": 90.94, "reward": 0.39037272334098816, "reward_std": 0.292805552482605, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.4932454526424408, "rewards/VisualPerceptionAccuracy/std": 0.16075821220874786, "step": 823, "train_speed(iter/s)": 0.016121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 14.3125, "completions/min_length": 13.0, "epoch": 0.012648512571761888, "frac_reward_zero_std": 0.0, "grad_norm": 22.09183120727539, "kl": 0.3302072286605835, "learning_rate": 6.322897483118478e-07, "loss": -0.018282301723957062, "memory(GiB)": 90.94, "reward": 0.703125, "reward_std": 0.4348437190055847, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4709290862083435, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 824, "train_speed(iter/s)": 0.016137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/mean_length": 377.03125, "completions/min_length": 93.0, "epoch": 0.012663862708378104, "frac_reward_zero_std": 0.0, "grad_norm": 0.713340163230896, "kl": 0.042044222354888916, "learning_rate": 6.330570902394108e-07, "loss": -0.29042771458625793, "memory(GiB)": 90.94, "reward": 0.46383750438690186, "reward_std": 0.14526242017745972, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.734375, "rewards/RMReward/std": 0.20794129371643066, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1401749700307846, "rewards/VisualPerceptionAccuracy/std": 0.12417180836200714, "step": 825, "train_speed(iter/s)": 0.016095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/mean_length": 132.625, "completions/min_length": 86.0, "epoch": 0.01267921284499432, "frac_reward_zero_std": 0.0, "grad_norm": 2.3062376976013184, "kl": 0.11941765993833542, "learning_rate": 6.338244321669737e-07, "loss": -0.017776761204004288, "memory(GiB)": 90.94, "reward": 0.8033854365348816, "reward_std": 0.07154304534196854, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9731770753860474, "rewards/PlanningActionSetORM/std": 0.05370395630598068, "rewards/RMReward/mean": 0.7609374523162842, "rewards/RMReward/std": 0.09649867564439774, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 826, "train_speed(iter/s)": 0.016076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 156.90625, "completions/min_length": 108.0, "epoch": 0.012694562981610537, "frac_reward_zero_std": 0.0, "grad_norm": 0.5810644030570984, "kl": 0.07156138867139816, "learning_rate": 6.345917740945367e-07, "loss": 0.0005636289715766907, "memory(GiB)": 90.94, "reward": 0.9691388607025146, "reward_std": 0.023044554516673088, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.9753124713897705, "rewards/RMReward/std": 0.03282253071665764, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 827, "train_speed(iter/s)": 0.016076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/mean_length": 344.875, "completions/min_length": 87.0, "epoch": 0.012709913118226753, "frac_reward_zero_std": 0.0, "grad_norm": 1.2725895643234253, "kl": 0.043014053255319595, "learning_rate": 6.353591160220995e-07, "loss": -0.0855816975235939, "memory(GiB)": 90.94, "reward": 0.22666585445404053, "reward_std": 0.13549378514289856, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.22666585445404053, "rewards/VisualPerceptionAccuracy/std": 0.18986767530441284, "step": 828, "train_speed(iter/s)": 0.016089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/mean_length": 117.25, "completions/min_length": 108.0, "epoch": 0.012725263254842968, "frac_reward_zero_std": 0.0, "grad_norm": 1.3645042181015015, "kl": 0.12355975806713104, "learning_rate": 6.361264579496625e-07, "loss": -0.015021570026874542, "memory(GiB)": 90.94, "reward": 0.9249999523162842, "reward_std": 0.05703606829047203, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.90625, "rewards/RMReward/std": 0.07042496651411057, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 829, "train_speed(iter/s)": 0.016091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/mean_length": 120.15625, "completions/min_length": 89.0, "epoch": 0.012740613391459184, "frac_reward_zero_std": 0.0, "grad_norm": 1.4483157396316528, "kl": 0.10131765902042389, "learning_rate": 6.368937998772254e-07, "loss": 0.011496060527861118, "memory(GiB)": 90.94, "reward": 0.8752187490463257, "reward_std": 0.052570126950740814, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.99609375, "rewards/PlanningActionSetORM/std": 0.022097086533904076, "rewards/RMReward/mean": 0.8450000286102295, "rewards/RMReward/std": 0.06974607706069946, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 830, "train_speed(iter/s)": 0.016086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/mean_length": 114.03125, "completions/min_length": 99.0, "epoch": 0.0127559635280754, "frac_reward_zero_std": 0.0, "grad_norm": 3.1982953548431396, "kl": 0.1176474541425705, "learning_rate": 6.376611418047883e-07, "loss": -0.020638085901737213, "memory(GiB)": 90.94, "reward": 0.8987500667572021, "reward_std": 0.05629931390285492, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8734375238418579, "rewards/RMReward/std": 0.1282540112733841, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 831, "train_speed(iter/s)": 0.016086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/mean_length": 185.25, "completions/min_length": 144.0, "epoch": 0.012771313664691615, "frac_reward_zero_std": 0.0, "grad_norm": 1.8596872091293335, "kl": 0.06944257766008377, "learning_rate": 6.384284837323512e-07, "loss": -0.016908185556530952, "memory(GiB)": 90.94, "reward": 0.7284201383590698, "reward_std": 0.09126240015029907, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.96875, "rewards/PlanningActionSetORM/std": 0.125, "rewards/RMReward/mean": 1.0, "rewards/RMReward/std": 0.0, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4630902409553528, "rewards/VisualPerceptionAccuracy/std": 0.15752482414245605, "step": 832, "train_speed(iter/s)": 0.016083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/mean_length": 205.1875, "completions/min_length": 8.0, "epoch": 0.012786663801307831, "frac_reward_zero_std": 0.0, "grad_norm": 24.88359832763672, "kl": 0.6176271438598633, "learning_rate": 6.391958256599141e-07, "loss": 0.00150369293987751, "memory(GiB)": 90.94, "reward": 0.87491774559021, "reward_std": 0.1351301670074463, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9835525751113892, "rewards/PlanningActionSetORM/std": 0.013157904148101807, "rewards/RMReward/mean": 0.765625, "rewards/RMReward/std": 0.0396600216627121, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 833, "train_speed(iter/s)": 0.016084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/mean_length": 93.28125, "completions/min_length": 78.0, "epoch": 0.012802013937924047, "frac_reward_zero_std": 0.0, "grad_norm": 1.752859115600586, "kl": 0.11252487450838089, "learning_rate": 6.399631675874771e-07, "loss": 0.007785983383655548, "memory(GiB)": 90.94, "reward": 0.9587500095367432, "reward_std": 0.052808865904808044, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.948437511920929, "rewards/RMReward/std": 0.07457879185676575, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 834, "train_speed(iter/s)": 0.016074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 135.75, "completions/min_length": 8.0, "epoch": 0.012817364074540264, "frac_reward_zero_std": 0.5, "grad_norm": 0.3398272395133972, "kl": 0.634985089302063, "learning_rate": 6.4073050951504e-07, "loss": 0.001323852688074112, "memory(GiB)": 90.94, "reward": 0.4323076903820038, "reward_std": 0.01712697185575962, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9230769276618958, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7875000238418579, "rewards/RMReward/std": 0.042817454785108566, "rewards/SpatialReasoningORM/mean": 0.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 835, "train_speed(iter/s)": 0.016071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/mean_length": 116.25, "completions/min_length": 104.0, "epoch": 0.01283271421115648, "frac_reward_zero_std": 0.0, "grad_norm": 2.3395233154296875, "kl": 0.11974254995584488, "learning_rate": 6.414978514426029e-07, "loss": -0.014337407425045967, "memory(GiB)": 90.94, "reward": 0.8957499861717224, "reward_std": 0.060395874083042145, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8696874380111694, "rewards/RMReward/std": 0.08314325660467148, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 836, "train_speed(iter/s)": 0.016066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/mean_length": 285.09375, "completions/min_length": 100.0, "epoch": 0.012848064347772695, "frac_reward_zero_std": 0.0, "grad_norm": 1.32615327835083, "kl": 0.08280260860919952, "learning_rate": 6.422651933701658e-07, "loss": 0.011033955961465836, "memory(GiB)": 90.94, "reward": 0.6462374329566956, "reward_std": 0.15598925948143005, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.859375, "rewards/RMReward/std": 0.05836308002471924, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.40497490763664246, "rewards/VisualPerceptionAccuracy/std": 0.2652880847454071, "step": 837, "train_speed(iter/s)": 0.016063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 209.09375, "completions/min_length": 105.0, "epoch": 0.012863414484388911, "frac_reward_zero_std": 0.0, "grad_norm": 1.4142568111419678, "kl": 0.08514004945755005, "learning_rate": 6.430325352977288e-07, "loss": -0.004021987318992615, "memory(GiB)": 90.94, "reward": 0.9262691736221313, "reward_std": 0.05776994675397873, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9663461446762085, "rewards/PlanningActionSetORM/std": 0.03423883020877838, "rewards/RMReward/mean": 0.9162499904632568, "rewards/RMReward/std": 0.07477924227714539, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 838, "train_speed(iter/s)": 0.016032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 149.0, "completions/min_length": 2.0, "epoch": 0.012878764621005127, "frac_reward_zero_std": 0.0, "grad_norm": 89.20307922363281, "kl": 0.40027186274528503, "learning_rate": 6.437998772252917e-07, "loss": 0.00040079839527606964, "memory(GiB)": 90.94, "reward": 0.5088333487510681, "reward_std": 0.2123388797044754, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9333333373069763, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8824999928474426, "rewards/RMReward/std": 0.10389097779989243, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.125, "rewards/VisualPerceptionAccuracy/std": 0.3415650427341461, "step": 839, "train_speed(iter/s)": 0.016016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/mean_length": 151.09375, "completions/min_length": 106.0, "epoch": 0.012894114757621342, "frac_reward_zero_std": 0.0, "grad_norm": 1.53733229637146, "kl": 0.08350422978401184, "learning_rate": 6.445672191528545e-07, "loss": -0.01727653294801712, "memory(GiB)": 90.94, "reward": 0.8682500123977661, "reward_std": 0.05891717970371246, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.09837386757135391, "rewards/RMReward/mean": 0.8415625095367432, "rewards/RMReward/std": 0.08124472200870514, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 840, "train_speed(iter/s)": 0.01602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 199.53125, "completions/min_length": 13.0, "epoch": 0.012909464894237558, "frac_reward_zero_std": 0.0, "grad_norm": 10.185162544250488, "kl": 0.20886501669883728, "learning_rate": 6.453345610804175e-07, "loss": 0.008413918316364288, "memory(GiB)": 90.94, "reward": 0.9137993454933167, "reward_std": 0.15219147503376007, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9473684430122375, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.871874988079071, "rewards/RMReward/std": 0.08360372483730316, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 841, "train_speed(iter/s)": 0.016016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 120.6875, "completions/min_length": 67.0, "epoch": 0.012924815030853775, "frac_reward_zero_std": 0.0, "grad_norm": 2.592555522918701, "kl": 0.10460469126701355, "learning_rate": 6.461019030079804e-07, "loss": 0.01732484996318817, "memory(GiB)": 90.94, "reward": 0.5379302501678467, "reward_std": 0.11534159630537033, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9260416626930237, "rewards/PlanningActionSetORM/std": 0.115264393389225, "rewards/RMReward/mean": 0.7749999761581421, "rewards/RMReward/std": 0.06582807004451752, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2706522047519684, "rewards/VisualPerceptionAccuracy/std": 0.1624492108821869, "step": 842, "train_speed(iter/s)": 0.016023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/mean_length": 156.8125, "completions/min_length": 106.0, "epoch": 0.012940165167469991, "frac_reward_zero_std": 0.0, "grad_norm": 0.8493188619613647, "kl": 0.09136269241571426, "learning_rate": 6.468692449355434e-07, "loss": 0.00675351545214653, "memory(GiB)": 90.94, "reward": 0.8326388597488403, "reward_std": 0.10778573155403137, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.8046874403953552, "rewards/RMReward/std": 0.16380523145198822, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 843, "train_speed(iter/s)": 0.015978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 179.34375, "completions/min_length": 102.0, "epoch": 0.012955515304086207, "frac_reward_zero_std": 0.0, "grad_norm": 0.851453959941864, "kl": 0.11052010953426361, "learning_rate": 6.476365868631062e-07, "loss": -0.12023787200450897, "memory(GiB)": 90.94, "reward": 0.8646875023841858, "reward_std": 0.09069839119911194, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9671875238418579, "rewards/PlanningActionSetORM/std": 0.14332422614097595, "rewards/RMReward/mean": 0.8390624523162842, "rewards/RMReward/std": 0.1435350775718689, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 844, "train_speed(iter/s)": 0.015974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/mean_length": 54.53125, "completions/min_length": 8.0, "epoch": 0.012970865440702422, "frac_reward_zero_std": 0.0, "grad_norm": 27.447513580322266, "kl": 0.6432846188545227, "learning_rate": 6.484039287906692e-07, "loss": 0.0014224536716938019, "memory(GiB)": 90.94, "reward": 0.520937442779541, "reward_std": 0.1362142413854599, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9156249761581421, "rewards/RMReward/std": 0.04366061091423035, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 845, "train_speed(iter/s)": 0.015973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/mean_length": 171.09375, "completions/min_length": 102.0, "epoch": 0.012986215577318638, "frac_reward_zero_std": 0.0, "grad_norm": 1.6673234701156616, "kl": 0.0748337060213089, "learning_rate": 6.491712707182321e-07, "loss": -0.0004144236445426941, "memory(GiB)": 90.94, "reward": 0.9739999771118164, "reward_std": 0.03690027818083763, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9674999713897705, "rewards/RMReward/std": 0.0695144459605217, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 846, "train_speed(iter/s)": 0.015966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/mean_length": 313.6875, "completions/min_length": 106.0, "epoch": 0.013001565713934854, "frac_reward_zero_std": 0.0, "grad_norm": 1.3798271417617798, "kl": 0.11653854697942734, "learning_rate": 6.499386126457951e-07, "loss": 0.12902547419071198, "memory(GiB)": 90.94, "reward": 0.6681280136108398, "reward_std": 0.12457224726676941, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9859374761581421, "rewards/PlanningActionSetORM/std": 0.038696203380823135, "rewards/RMReward/mean": 0.871874988079071, "rewards/RMReward/std": 0.07520803809165955, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4415685534477234, "rewards/VisualPerceptionAccuracy/std": 0.18784011900424957, "step": 847, "train_speed(iter/s)": 0.015961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/mean_length": 144.03125, "completions/min_length": 66.0, "epoch": 0.01301691585055107, "frac_reward_zero_std": 0.0, "grad_norm": 1.9377878904342651, "kl": 0.12201578915119171, "learning_rate": 6.50705954573358e-07, "loss": 0.011473473161458969, "memory(GiB)": 90.94, "reward": 0.5672214031219482, "reward_std": 0.14463090896606445, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9791666269302368, "rewards/PlanningActionSetORM/std": 0.05692751333117485, "rewards/RMReward/mean": 0.6625000238418579, "rewards/RMReward/std": 0.10246951133012772, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4086094796657562, "rewards/VisualPerceptionAccuracy/std": 0.21034787595272064, "step": 848, "train_speed(iter/s)": 0.015964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/mean_length": 119.5625, "completions/min_length": 109.0, "epoch": 0.013032265987167287, "frac_reward_zero_std": 0.0, "grad_norm": 1.3935714960098267, "kl": 0.12031075358390808, "learning_rate": 6.514732965009208e-07, "loss": -0.004146132618188858, "memory(GiB)": 90.94, "reward": 0.9212499856948853, "reward_std": 0.05926584452390671, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9015624523162842, "rewards/RMReward/std": 0.0734894871711731, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 849, "train_speed(iter/s)": 0.015944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 101.28125, "completions/min_length": 86.0, "epoch": 0.013047616123783502, "frac_reward_zero_std": 0.0, "grad_norm": 1.8941874504089355, "kl": 0.11116837710142136, "learning_rate": 6.522406384284838e-07, "loss": 0.007666131481528282, "memory(GiB)": 90.94, "reward": 0.8762500286102295, "reward_std": 0.053235504776239395, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8453125357627869, "rewards/RMReward/std": 0.06881412118673325, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 850, "train_speed(iter/s)": 0.015936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 167.28125, "completions/min_length": 8.0, "epoch": 0.013062966260399718, "frac_reward_zero_std": 0.0, "grad_norm": 32.76248550415039, "kl": 0.4992152452468872, "learning_rate": 6.530079803560467e-07, "loss": -0.012249559164047241, "memory(GiB)": 90.94, "reward": 0.24923385679721832, "reward_std": 0.3155580163002014, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.21096771955490112, "rewards/VisualPerceptionAccuracy/std": 0.20626311004161835, "step": 851, "train_speed(iter/s)": 0.015951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/mean_length": 55.78125, "completions/min_length": 8.0, "epoch": 0.013078316397015934, "frac_reward_zero_std": 0.0, "grad_norm": 24.537538528442383, "kl": 0.4464787542819977, "learning_rate": 6.537753222836097e-07, "loss": 0.012144509702920914, "memory(GiB)": 90.94, "reward": 0.5728124976158142, "reward_std": 0.21510769426822662, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8968750238418579, "rewards/RMReward/std": 0.05907268449664116, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 852, "train_speed(iter/s)": 0.015945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 152.3125, "completions/min_length": 115.0, "epoch": 0.01309366653363215, "frac_reward_zero_std": 0.0, "grad_norm": 1.6206018924713135, "kl": 0.06981135159730911, "learning_rate": 6.545426642111725e-07, "loss": 0.0687360018491745, "memory(GiB)": 90.94, "reward": 0.7012326717376709, "reward_std": 0.1073881983757019, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8770833015441895, "rewards/PlanningActionSetORM/std": 0.1007150337100029, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.044253069907426834, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6020486950874329, "rewards/VisualPerceptionAccuracy/std": 0.17033958435058594, "step": 853, "train_speed(iter/s)": 0.015948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 214.09375, "completions/min_length": 155.0, "epoch": 0.013109016670248365, "frac_reward_zero_std": 0.0, "grad_norm": 0.3835532069206238, "kl": 0.05799085274338722, "learning_rate": 6.553100061387355e-07, "loss": 0.005910638719797134, "memory(GiB)": 90.94, "reward": 0.8817499876022339, "reward_std": 0.11018285155296326, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8521875143051147, "rewards/RMReward/std": 0.1471390426158905, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 854, "train_speed(iter/s)": 0.01594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/mean_length": 57.5625, "completions/min_length": 2.0, "epoch": 0.01312436680686458, "frac_reward_zero_std": 0.0, "grad_norm": 39.9505615234375, "kl": 0.06990550458431244, "learning_rate": 6.560773480662984e-07, "loss": -0.008703764528036118, "memory(GiB)": 90.94, "reward": 0.9162499904632568, "reward_std": 0.1501660943031311, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8687499761581421, "rewards/RMReward/std": 0.06291528046131134, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.9375, "rewards/VisualPerceptionAccuracy/std": 0.25, "step": 855, "train_speed(iter/s)": 0.015941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 194.4375, "completions/min_length": 105.0, "epoch": 0.013139716943480796, "frac_reward_zero_std": 0.0, "grad_norm": 1.3994815349578857, "kl": 0.08089881390333176, "learning_rate": 6.568446899938614e-07, "loss": 0.0054054465144872665, "memory(GiB)": 90.94, "reward": 0.9541249871253967, "reward_std": 0.035454198718070984, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.965624988079071, "rewards/PlanningActionSetORM/std": 0.05599179118871689, "rewards/RMReward/mean": 0.9512500166893005, "rewards/RMReward/std": 0.059986554086208344, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 856, "train_speed(iter/s)": 0.015929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/mean_length": 95.09375, "completions/min_length": 64.0, "epoch": 0.013155067080097014, "frac_reward_zero_std": 0.0, "grad_norm": 3.12225604057312, "kl": 0.09633377939462662, "learning_rate": 6.576120319214242e-07, "loss": -0.024945441633462906, "memory(GiB)": 90.94, "reward": 0.9292500019073486, "reward_std": 0.06217510625720024, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.9271875023841858, "rewards/RMReward/std": 0.08247127383947372, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 857, "train_speed(iter/s)": 0.015914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/mean_length": 11.125, "completions/min_length": 8.0, "epoch": 0.01317041721671323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039748200215399265, "kl": 0.6606565713882446, "learning_rate": 6.583793738489871e-07, "loss": 0.0006601496716029942, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 858, "train_speed(iter/s)": 0.015916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 160.78125, "completions/min_length": 81.0, "epoch": 0.013185767353329445, "frac_reward_zero_std": 0.0, "grad_norm": 2.015105962753296, "kl": 0.09624709188938141, "learning_rate": 6.591467157765501e-07, "loss": 0.012729529291391373, "memory(GiB)": 90.94, "reward": 0.49861159920692444, "reward_std": 0.15304991602897644, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9296875, "rewards/PlanningActionSetORM/std": 0.06404344737529755, "rewards/RMReward/mean": 0.846875011920929, "rewards/RMReward/std": 0.08260094374418259, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1337856650352478, "rewards/VisualPerceptionAccuracy/std": 0.23835721611976624, "step": 859, "train_speed(iter/s)": 0.015902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 170.78125, "completions/min_length": 85.0, "epoch": 0.01320111748994566, "frac_reward_zero_std": 0.0, "grad_norm": 2.6068882942199707, "kl": 0.10045913606882095, "learning_rate": 6.59914057704113e-07, "loss": -0.08144441992044449, "memory(GiB)": 90.94, "reward": 0.7837847471237183, "reward_std": 0.13849809765815735, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9689235687255859, "rewards/PlanningActionSetORM/std": 0.031104054301977158, "rewards/RMReward/mean": 0.737500011920929, "rewards/RMReward/std": 0.19176597893238068, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 860, "train_speed(iter/s)": 0.015887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/mean_length": 234.28125, "completions/min_length": 90.0, "epoch": 0.013216467626561876, "frac_reward_zero_std": 0.0, "grad_norm": 1.4518282413482666, "kl": 0.0717913806438446, "learning_rate": 6.606813996316759e-07, "loss": -0.005295161157846451, "memory(GiB)": 90.94, "reward": 0.801965594291687, "reward_std": 0.08065962046384811, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9935777187347412, "rewards/PlanningActionSetORM/std": 0.01728130131959915, "rewards/RMReward/mean": 0.7540625333786011, "rewards/RMReward/std": 0.1340644806623459, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 861, "train_speed(iter/s)": 0.015872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/mean_length": 150.28125, "completions/min_length": 101.0, "epoch": 0.013231817763178092, "frac_reward_zero_std": 0.0, "grad_norm": 1.0999547243118286, "kl": 0.08261606842279434, "learning_rate": 6.614487415592388e-07, "loss": -0.004598096013069153, "memory(GiB)": 90.94, "reward": 0.9388889074325562, "reward_std": 0.047687843441963196, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.9375, "rewards/RMReward/std": 0.07361626625061035, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 862, "train_speed(iter/s)": 0.015855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/mean_length": 66.71875, "completions/min_length": 2.0, "epoch": 0.013247167899794307, "frac_reward_zero_std": 0.0, "grad_norm": 45.57637405395508, "kl": 0.13734544813632965, "learning_rate": 6.622160834868018e-07, "loss": 0.00023586302995681763, "memory(GiB)": 90.94, "reward": 0.9199999570846558, "reward_std": 0.14918676018714905, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8781249523162842, "rewards/RMReward/std": 0.06046692654490471, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.9375, "rewards/VisualPerceptionAccuracy/std": 0.25, "step": 863, "train_speed(iter/s)": 0.015857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 156.5, "completions/min_length": 105.0, "epoch": 0.013262518036410525, "frac_reward_zero_std": 0.0, "grad_norm": 1.2582000494003296, "kl": 0.08601567149162292, "learning_rate": 6.629834254143647e-07, "loss": -0.006255025044083595, "memory(GiB)": 90.94, "reward": 0.9422500133514404, "reward_std": 0.05478803068399429, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9278125166893005, "rewards/RMReward/std": 0.07477856427431107, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 864, "train_speed(iter/s)": 0.015839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 173.84375, "completions/min_length": 70.0, "epoch": 0.01327786817302674, "frac_reward_zero_std": 0.0, "grad_norm": 1.5538665056228638, "kl": 0.0853523463010788, "learning_rate": 6.637507673419276e-07, "loss": -0.012981381267309189, "memory(GiB)": 90.94, "reward": 0.8497243523597717, "reward_std": 0.05243883281946182, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9511218070983887, "rewards/PlanningActionSetORM/std": 0.04878510907292366, "rewards/RMReward/mean": 0.8243749737739563, "rewards/RMReward/std": 0.09473008662462234, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 865, "train_speed(iter/s)": 0.015841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.5, "completions/min_length": 8.0, "epoch": 0.013293218309642956, "frac_reward_zero_std": 1.0, "grad_norm": 9.437170774617698e-06, "kl": 0.69140625, "learning_rate": 6.645181092694905e-07, "loss": 0.0006916556158103049, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 866, "train_speed(iter/s)": 0.015844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/mean_length": 405.8125, "completions/min_length": 194.0, "epoch": 0.013308568446259172, "frac_reward_zero_std": 0.0, "grad_norm": 1.697456955909729, "kl": 0.07211117446422577, "learning_rate": 6.652854511970534e-07, "loss": -0.05944700911641121, "memory(GiB)": 90.94, "reward": 0.45305800437927246, "reward_std": 0.10084346681833267, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.45305800437927246, "rewards/VisualPerceptionAccuracy/std": 0.26825881004333496, "step": 867, "train_speed(iter/s)": 0.015856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 61.34375, "completions/min_length": 8.0, "epoch": 0.013323918582875387, "frac_reward_zero_std": 0.0, "grad_norm": 19.18303108215332, "kl": 0.51530921459198, "learning_rate": 6.660527931246164e-07, "loss": -0.0012040957808494568, "memory(GiB)": 90.94, "reward": 0.9403125047683716, "reward_std": 0.13807183504104614, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9249999523162842, "rewards/RMReward/std": 0.04830458015203476, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 868, "train_speed(iter/s)": 0.015859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/mean_length": 240.9375, "completions/min_length": 79.0, "epoch": 0.013339268719491603, "frac_reward_zero_std": 0.0, "grad_norm": 1.3560453653335571, "kl": 0.05321550741791725, "learning_rate": 6.668201350521792e-07, "loss": 0.05011596530675888, "memory(GiB)": 90.94, "reward": 0.6449235677719116, "reward_std": 0.16379521787166595, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9993749856948853, "rewards/RMReward/std": 0.002499997615814209, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.290347158908844, "rewards/VisualPerceptionAccuracy/std": 0.32559046149253845, "step": 869, "train_speed(iter/s)": 0.015863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2049.0, "completions/mean_length": 861.65625, "completions/min_length": 300.0, "epoch": 0.013354618856107819, "frac_reward_zero_std": 0.0, "grad_norm": 1.0468549728393555, "kl": 0.05177421122789383, "learning_rate": 6.675874769797422e-07, "loss": -0.13350629806518555, "memory(GiB)": 90.94, "reward": 0.33483055233955383, "reward_std": 0.12417146563529968, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.33483055233955383, "rewards/VisualPerceptionAccuracy/std": 0.27729058265686035, "step": 870, "train_speed(iter/s)": 0.015864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/mean_length": 70.875, "completions/min_length": 8.0, "epoch": 0.013369968992724036, "frac_reward_zero_std": 0.0, "grad_norm": 51.89393997192383, "kl": 0.41306713223457336, "learning_rate": 6.683548189073051e-07, "loss": 0.02660531923174858, "memory(GiB)": 90.94, "reward": 0.8571875095367432, "reward_std": 0.1494569480419159, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9937499761581421, "rewards/PlanningActionSetORM/std": 0.025000005960464478, "rewards/RMReward/mean": 0.71875, "rewards/RMReward/std": 0.07719024270772934, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 871, "train_speed(iter/s)": 0.015868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 257.25, "completions/min_length": 109.0, "epoch": 0.013385319129340252, "frac_reward_zero_std": 0.0, "grad_norm": 0.2674546241760254, "kl": 0.0575408898293972, "learning_rate": 6.691221608348681e-07, "loss": -0.0003383159637451172, "memory(GiB)": 90.94, "reward": 0.8818535804748535, "reward_std": 0.03309750184416771, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9117677211761475, "rewards/PlanningActionSetORM/std": 0.03749947249889374, "rewards/RMReward/mean": 0.8743749856948853, "rewards/RMReward/std": 0.13797935843467712, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 872, "train_speed(iter/s)": 0.01584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/mean_length": 97.5, "completions/min_length": 8.0, "epoch": 0.013400669265956467, "frac_reward_zero_std": 0.0, "grad_norm": 28.067760467529297, "kl": 0.5484416484832764, "learning_rate": 6.698895027624309e-07, "loss": 0.0005485918372869492, "memory(GiB)": 90.94, "reward": 0.8498125076293945, "reward_std": 0.22990554571151733, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9956250190734863, "rewards/RMReward/std": 0.006291523110121489, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 873, "train_speed(iter/s)": 0.01584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/mean_length": 127.8125, "completions/min_length": 88.0, "epoch": 0.013416019402572683, "frac_reward_zero_std": 0.0, "grad_norm": 2.1181910037994385, "kl": 0.0906156599521637, "learning_rate": 6.706568446899938e-07, "loss": -0.040373362600803375, "memory(GiB)": 90.94, "reward": 0.768958330154419, "reward_std": 0.06706961989402771, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9822916984558105, "rewards/PlanningActionSetORM/std": 0.07572084665298462, "rewards/RMReward/mean": 0.715624988079071, "rewards/RMReward/std": 0.09283830970525742, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 874, "train_speed(iter/s)": 0.015831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/mean_length": 275.9375, "completions/min_length": 99.0, "epoch": 0.013431369539188899, "frac_reward_zero_std": 0.0, "grad_norm": 1.3774969577789307, "kl": 0.10580592602491379, "learning_rate": 6.714241866175568e-07, "loss": 0.05874714255332947, "memory(GiB)": 90.94, "reward": 0.5713374614715576, "reward_std": 0.14142166078090668, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9937499761581421, "rewards/PlanningActionSetORM/std": 0.025000005960464478, "rewards/RMReward/mean": 0.800000011920929, "rewards/RMReward/std": 0.05163978412747383, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3039249777793884, "rewards/VisualPerceptionAccuracy/std": 0.24253205955028534, "step": 875, "train_speed(iter/s)": 0.015833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/mean_length": 77.25, "completions/min_length": 8.0, "epoch": 0.013446719675805114, "frac_reward_zero_std": 0.0, "grad_norm": 39.972694396972656, "kl": 0.5490761995315552, "learning_rate": 6.721915285451197e-07, "loss": -0.004886026494204998, "memory(GiB)": 90.94, "reward": 0.7316145896911621, "reward_std": 0.23728857934474945, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.6413542032241821, "rewards/VisualPerceptionAccuracy/std": 0.09161989390850067, "step": 876, "train_speed(iter/s)": 0.015849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/mean_length": 122.34375, "completions/min_length": 99.0, "epoch": 0.01346206981242133, "frac_reward_zero_std": 0.0, "grad_norm": 2.159862756729126, "kl": 0.10558890551328659, "learning_rate": 6.729588704726826e-07, "loss": -0.012372568249702454, "memory(GiB)": 90.94, "reward": 0.8048437833786011, "reward_std": 0.05946136265993118, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.936718761920929, "rewards/PlanningActionSetORM/std": 0.0609019473195076, "rewards/RMReward/mean": 0.7718750238418579, "rewards/RMReward/std": 0.08125775307416916, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 877, "train_speed(iter/s)": 0.015847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/mean_length": 132.90625, "completions/min_length": 100.0, "epoch": 0.013477419949037546, "frac_reward_zero_std": 0.0, "grad_norm": 1.0711309909820557, "kl": 0.10078258812427521, "learning_rate": 6.737262124002455e-07, "loss": 0.0033194217830896378, "memory(GiB)": 90.94, "reward": 0.8964999914169312, "reward_std": 0.07569115608930588, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8706250190734863, "rewards/RMReward/std": 0.1031421571969986, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 878, "train_speed(iter/s)": 0.015831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/mean_length": 15.5625, "completions/min_length": 14.0, "epoch": 0.013492770085653763, "frac_reward_zero_std": 1.0, "grad_norm": 0.02036617323756218, "kl": 0.309370756149292, "learning_rate": 6.744935543278085e-07, "loss": 0.00030883229919709265, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 879, "train_speed(iter/s)": 0.01584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.013508120222269979, "frac_reward_zero_std": 1.0, "grad_norm": 2.125967512256466e-06, "kl": 1.009765625, "learning_rate": 6.752608962553714e-07, "loss": 0.0010116547346115112, "memory(GiB)": 90.94, "reward": 0.5249999761581421, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 880, "train_speed(iter/s)": 0.015838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/mean_length": 172.4375, "completions/min_length": 106.0, "epoch": 0.013523470358886194, "frac_reward_zero_std": 0.0, "grad_norm": 0.7862514853477478, "kl": 0.0759672075510025, "learning_rate": 6.760282381829344e-07, "loss": 0.012886876240372658, "memory(GiB)": 90.94, "reward": 0.9070000648498535, "reward_std": 0.023122485727071762, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8837499618530273, "rewards/RMReward/std": 0.12283611297607422, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 881, "train_speed(iter/s)": 0.015824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 173.40625, "completions/min_length": 86.0, "epoch": 0.01353882049550241, "frac_reward_zero_std": 0.0, "grad_norm": 1.7692304849624634, "kl": 0.08343550562858582, "learning_rate": 6.767955801104972e-07, "loss": 0.020748160779476166, "memory(GiB)": 90.94, "reward": 0.6359086632728577, "reward_std": 0.15233321487903595, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.875, "rewards/RMReward/std": 0.05477224662899971, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.37181735038757324, "rewards/VisualPerceptionAccuracy/std": 0.26084867119789124, "step": 882, "train_speed(iter/s)": 0.015829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/mean_length": 135.875, "completions/min_length": 79.0, "epoch": 0.013554170632118626, "frac_reward_zero_std": 0.0, "grad_norm": 2.320565938949585, "kl": 0.0801541656255722, "learning_rate": 6.775629220380601e-07, "loss": -0.017321255058050156, "memory(GiB)": 90.94, "reward": 0.6818163394927979, "reward_std": 0.14552751183509827, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.90625, "rewards/PlanningActionSetORM/std": 0.017873018980026245, "rewards/RMReward/mean": 0.8562500476837158, "rewards/RMReward/std": 0.0655108094215393, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4973827302455902, "rewards/VisualPerceptionAccuracy/std": 0.23681791126728058, "step": 883, "train_speed(iter/s)": 0.015824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 132.78125, "completions/min_length": 106.0, "epoch": 0.013569520768734841, "frac_reward_zero_std": 0.0, "grad_norm": 0.5610942840576172, "kl": 0.11418863385915756, "learning_rate": 6.783302639656231e-07, "loss": 0.00257042795419693, "memory(GiB)": 90.94, "reward": 0.8787500262260437, "reward_std": 0.041200462728738785, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.8640625476837158, "rewards/RMReward/std": 0.13633117079734802, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 884, "train_speed(iter/s)": 0.015805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 372.15625, "completions/min_length": 198.0, "epoch": 0.013584870905351057, "frac_reward_zero_std": 0.0, "grad_norm": 0.6530418992042542, "kl": 0.05001503974199295, "learning_rate": 6.79097605893186e-07, "loss": -0.08659734576940536, "memory(GiB)": 90.94, "reward": 0.5639392137527466, "reward_std": 0.10817579925060272, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8888888955116272, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7593749761581421, "rewards/RMReward/std": 0.027195287868380547, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.34260058403015137, "rewards/VisualPerceptionAccuracy/std": 0.19459538161754608, "step": 885, "train_speed(iter/s)": 0.0158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/mean_length": 56.65625, "completions/min_length": 13.0, "epoch": 0.013600221041967274, "frac_reward_zero_std": 0.0, "grad_norm": 12.025540351867676, "kl": 0.1759881228208542, "learning_rate": 6.79864947820749e-07, "loss": 0.007899470627307892, "memory(GiB)": 90.94, "reward": 0.8190624713897705, "reward_std": 0.24351345002651215, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.918749988079071, "rewards/RMReward/std": 0.04031128063797951, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 886, "train_speed(iter/s)": 0.01579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/mean_length": 106.09375, "completions/min_length": 101.0, "epoch": 0.01361557117858349, "frac_reward_zero_std": 0.0, "grad_norm": 1.3779560327529907, "kl": 0.10673440992832184, "learning_rate": 6.80632289748312e-07, "loss": -0.0011513140052556992, "memory(GiB)": 90.94, "reward": 0.8737499713897705, "reward_std": 0.0480068176984787, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8421875238418579, "rewards/RMReward/std": 0.071964792907238, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 887, "train_speed(iter/s)": 0.01578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 193.25, "completions/min_length": 106.0, "epoch": 0.013630921315199706, "frac_reward_zero_std": 0.0, "grad_norm": 1.2329015731811523, "kl": 0.11465984582901001, "learning_rate": 6.813996316758749e-07, "loss": 0.004634007811546326, "memory(GiB)": 90.94, "reward": 0.7752871513366699, "reward_std": 0.12013144046068192, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9156249761581421, "rewards/RMReward/std": 0.05072391405701637, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6180742979049683, "rewards/VisualPerceptionAccuracy/std": 0.1996837556362152, "step": 888, "train_speed(iter/s)": 0.015784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/mean_length": 177.875, "completions/min_length": 138.0, "epoch": 0.013646271451815921, "frac_reward_zero_std": 0.0, "grad_norm": 1.3322417736053467, "kl": 0.05700422078371048, "learning_rate": 6.821669736034378e-07, "loss": -0.01990591362118721, "memory(GiB)": 90.94, "reward": 0.9310416579246521, "reward_std": 0.06997233629226685, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9427083134651184, "rewards/PlanningActionSetORM/std": 0.05900471284985542, "rewards/RMReward/mean": 0.9281250238418579, "rewards/RMReward/std": 0.08570156246423721, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 889, "train_speed(iter/s)": 0.01578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/mean_length": 109.03125, "completions/min_length": 104.0, "epoch": 0.013661621588432137, "frac_reward_zero_std": 0.0, "grad_norm": 0.857208251953125, "kl": 0.11483171582221985, "learning_rate": 6.829343155310007e-07, "loss": -0.022798266261816025, "memory(GiB)": 90.94, "reward": 0.9122500419616699, "reward_std": 0.02367434650659561, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8903125524520874, "rewards/RMReward/std": 0.10287744551897049, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 890, "train_speed(iter/s)": 0.015772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/mean_length": 214.25, "completions/min_length": 111.0, "epoch": 0.013676971725048353, "frac_reward_zero_std": 0.0, "grad_norm": 1.482432246208191, "kl": 0.1015935093164444, "learning_rate": 6.837016574585636e-07, "loss": -0.0475507415831089, "memory(GiB)": 90.94, "reward": 0.677983283996582, "reward_std": 0.1283358782529831, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8468749523162842, "rewards/RMReward/std": 0.07846176624298096, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.47846662998199463, "rewards/VisualPerceptionAccuracy/std": 0.193902388215065, "step": 891, "train_speed(iter/s)": 0.015771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/mean_length": 201.5625, "completions/min_length": 122.0, "epoch": 0.013692321861664568, "frac_reward_zero_std": 0.0, "grad_norm": 1.0611636638641357, "kl": 0.05720309168100357, "learning_rate": 6.844689993861266e-07, "loss": -0.006385289132595062, "memory(GiB)": 90.94, "reward": 0.7082035541534424, "reward_std": 0.04830019548535347, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9172676205635071, "rewards/PlanningActionSetORM/std": 0.008101769722998142, "rewards/RMReward/mean": 0.6559374928474426, "rewards/RMReward/std": 0.16084171831607819, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 892, "train_speed(iter/s)": 0.01577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/mean_length": 161.1875, "completions/min_length": 112.0, "epoch": 0.013707671998280786, "frac_reward_zero_std": 0.0, "grad_norm": 0.8664038777351379, "kl": 0.09598290175199509, "learning_rate": 6.852363413136895e-07, "loss": -0.005672536790370941, "memory(GiB)": 90.94, "reward": 0.9447499513626099, "reward_std": 0.04081519693136215, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9309374690055847, "rewards/RMReward/std": 0.058879829943180084, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 893, "train_speed(iter/s)": 0.015768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/mean_length": 55.0, "completions/min_length": 8.0, "epoch": 0.013723022134897001, "frac_reward_zero_std": 0.0, "grad_norm": 36.34557342529297, "kl": 0.7210478186607361, "learning_rate": 6.860036832412524e-07, "loss": 0.0007224753499031067, "memory(GiB)": 90.94, "reward": 0.8338125348091125, "reward_std": 0.24099041521549225, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.986875057220459, "rewards/RMReward/std": 0.03400368615984917, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 894, "train_speed(iter/s)": 0.015765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 229.4375, "completions/min_length": 114.0, "epoch": 0.013738372271513217, "frac_reward_zero_std": 0.0, "grad_norm": 1.230101227760315, "kl": 0.07540535181760788, "learning_rate": 6.867710251688153e-07, "loss": 0.016736887395381927, "memory(GiB)": 90.94, "reward": 0.8292802572250366, "reward_std": 0.05022870749235153, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9651515483856201, "rewards/PlanningActionSetORM/std": 0.03587154299020767, "rewards/RMReward/mean": 0.7953125238418579, "rewards/RMReward/std": 0.09948202967643738, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 895, "train_speed(iter/s)": 0.015754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 94.65625, "completions/min_length": 8.0, "epoch": 0.013753722408129433, "frac_reward_zero_std": 0.0, "grad_norm": 32.405517578125, "kl": 0.6592245101928711, "learning_rate": 6.875383670963783e-07, "loss": 0.012038320302963257, "memory(GiB)": 90.94, "reward": 0.4060037434101105, "reward_std": 0.2552412748336792, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.6432574987411499, "rewards/VisualPerceptionAccuracy/std": 0.18599578738212585, "step": 896, "train_speed(iter/s)": 0.015769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/mean_length": 113.8125, "completions/min_length": 85.0, "epoch": 0.013769072544745648, "frac_reward_zero_std": 0.0, "grad_norm": 2.4860169887542725, "kl": 0.11414231359958649, "learning_rate": 6.883057090239412e-07, "loss": -0.06368384510278702, "memory(GiB)": 90.94, "reward": 0.7985937595367432, "reward_std": 0.06344722211360931, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.992968738079071, "rewards/PlanningActionSetORM/std": 0.027849232777953148, "rewards/RMReward/mean": 0.75, "rewards/RMReward/std": 0.07725115865468979, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 897, "train_speed(iter/s)": 0.015765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1824.0, "completions/mean_length": 384.0, "completions/min_length": 211.0, "epoch": 0.013784422681361864, "frac_reward_zero_std": 0.0, "grad_norm": 0.9379944801330566, "kl": 0.052066922187805176, "learning_rate": 6.89073050951504e-07, "loss": 0.1419280469417572, "memory(GiB)": 90.94, "reward": 0.6324171423912048, "reward_std": 0.18384654819965363, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9333333373069763, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.59375, "rewards/RMReward/std": 0.14244882762432098, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6031675934791565, "rewards/VisualPerceptionAccuracy/std": 0.2537340223789215, "step": 898, "train_speed(iter/s)": 0.015758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/mean_length": 104.71875, "completions/min_length": 72.0, "epoch": 0.01379977281797808, "frac_reward_zero_std": 0.0, "grad_norm": 2.235992908477783, "kl": 0.13034501671791077, "learning_rate": 6.89840392879067e-07, "loss": 0.0050859395414590836, "memory(GiB)": 90.94, "reward": 0.8131250143051147, "reward_std": 0.07279576361179352, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.01767767407000065, "rewards/RMReward/mean": 0.7671874761581421, "rewards/RMReward/std": 0.108218252658844, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 899, "train_speed(iter/s)": 0.015763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/mean_length": 114.375, "completions/min_length": 63.0, "epoch": 0.013815122954594295, "frac_reward_zero_std": 0.0, "grad_norm": 1.4233157634735107, "kl": 0.090582475066185, "learning_rate": 6.906077348066299e-07, "loss": 7.162988185882568e-05, "memory(GiB)": 90.94, "reward": 0.9087499976158142, "reward_std": 0.12276692688465118, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.885937511920929, "rewards/RMReward/std": 0.18588687479496002, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 900, "train_speed(iter/s)": 0.015764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 139.34375, "completions/min_length": 81.0, "epoch": 0.013830473091210512, "frac_reward_zero_std": 0.0, "grad_norm": 1.7483291625976562, "kl": 0.09404680877923965, "learning_rate": 6.913750767341929e-07, "loss": -0.009181037545204163, "memory(GiB)": 90.94, "reward": 0.800000011920929, "reward_std": 0.0684339851140976, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.75, "rewards/RMReward/std": 0.09158109873533249, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 901, "train_speed(iter/s)": 0.015734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 240.875, "completions/min_length": 192.0, "epoch": 0.013845823227826728, "frac_reward_zero_std": 0.0, "grad_norm": 0.9901946783065796, "kl": 0.048356957733631134, "learning_rate": 6.921424186617558e-07, "loss": -0.01503954827785492, "memory(GiB)": 90.94, "reward": 0.8129615783691406, "reward_std": 0.17765139043331146, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.973557710647583, "rewards/PlanningActionSetORM/std": 0.037119895219802856, "rewards/RMReward/mean": 0.7728124856948853, "rewards/RMReward/std": 0.24003002047538757, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 902, "train_speed(iter/s)": 0.015733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/mean_length": 117.71875, "completions/min_length": 103.0, "epoch": 0.013861173364442944, "frac_reward_zero_std": 0.0, "grad_norm": 1.7294689416885376, "kl": 0.1409139186143875, "learning_rate": 6.929097605893187e-07, "loss": -0.006354279816150665, "memory(GiB)": 90.94, "reward": 0.9272499680519104, "reward_std": 0.03520190715789795, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9090625047683716, "rewards/RMReward/std": 0.05969595909118652, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 903, "train_speed(iter/s)": 0.015724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/mean_length": 117.21875, "completions/min_length": 91.0, "epoch": 0.01387652350105916, "frac_reward_zero_std": 0.0, "grad_norm": 2.6984570026397705, "kl": 0.14035241305828094, "learning_rate": 6.936771025168816e-07, "loss": 0.02347872033715248, "memory(GiB)": 90.94, "reward": 0.510852575302124, "reward_std": 0.13894261419773102, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9755208492279053, "rewards/PlanningActionSetORM/std": 0.05404634773731232, "rewards/RMReward/mean": 0.762499988079071, "rewards/RMReward/std": 0.071879543364048, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.21660089492797852, "rewards/VisualPerceptionAccuracy/std": 0.22062212228775024, "step": 904, "train_speed(iter/s)": 0.015731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 266.25, "completions/min_length": 128.0, "epoch": 0.013891873637675375, "frac_reward_zero_std": 0.0, "grad_norm": 1.362176537513733, "kl": 0.05904548615217209, "learning_rate": 6.944444444444446e-07, "loss": 0.009763376787304878, "memory(GiB)": 90.94, "reward": 0.8384547233581543, "reward_std": 0.0805143266916275, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9072734117507935, "rewards/PlanningActionSetORM/std": 0.0705106183886528, "rewards/RMReward/mean": 0.8212499618530273, "rewards/RMReward/std": 0.1254347264766693, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 905, "train_speed(iter/s)": 0.015719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 161.8125, "completions/min_length": 108.0, "epoch": 0.01390722377429159, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875290870666504, "kl": 0.12545491755008698, "learning_rate": 6.952117863720075e-07, "loss": -0.10111586004495621, "memory(GiB)": 90.94, "reward": 0.34481069445610046, "reward_std": 0.1618582308292389, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.53125, "rewards/RMReward/std": 0.2212653011083603, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06462136656045914, "rewards/VisualPerceptionAccuracy/std": 0.14670422673225403, "step": 906, "train_speed(iter/s)": 0.015715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/mean_length": 124.0, "completions/min_length": 103.0, "epoch": 0.013922573910907806, "frac_reward_zero_std": 0.0, "grad_norm": 1.2599494457244873, "kl": 0.08586151152849197, "learning_rate": 6.959791282995703e-07, "loss": 0.004268910735845566, "memory(GiB)": 90.94, "reward": 0.9549999833106995, "reward_std": 0.02632993459701538, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.9593749642372131, "rewards/RMReward/std": 0.04825586825609207, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 907, "train_speed(iter/s)": 0.015702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 120.4375, "completions/min_length": 13.0, "epoch": 0.013937924047524024, "frac_reward_zero_std": 0.0, "grad_norm": 17.228862762451172, "kl": 0.16488440334796906, "learning_rate": 6.967464702271333e-07, "loss": -0.01384054496884346, "memory(GiB)": 90.94, "reward": 0.7703125476837158, "reward_std": 0.24986112117767334, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.796875, "rewards/RMReward/std": 0.05618051812052727, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 908, "train_speed(iter/s)": 0.0157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/mean_length": 188.3125, "completions/min_length": 168.0, "epoch": 0.01395327418414024, "frac_reward_zero_std": 0.0, "grad_norm": 0.393259197473526, "kl": 0.05489160120487213, "learning_rate": 6.975138121546962e-07, "loss": 0.00028622522950172424, "memory(GiB)": 90.94, "reward": 0.9965000152587891, "reward_std": 0.0069679152220487595, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9956250190734863, "rewards/RMReward/std": 0.009482582099735737, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 909, "train_speed(iter/s)": 0.015693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/mean_length": 114.34375, "completions/min_length": 102.0, "epoch": 0.013968624320756455, "frac_reward_zero_std": 0.0, "grad_norm": 1.7627898454666138, "kl": 0.1203865259885788, "learning_rate": 6.982811540822592e-07, "loss": 0.02191023901104927, "memory(GiB)": 90.94, "reward": 0.9262499809265137, "reward_std": 0.059334952384233475, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9078124761581421, "rewards/RMReward/std": 0.07802953571081161, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 910, "train_speed(iter/s)": 0.015677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 115.0625, "completions/min_length": 83.0, "epoch": 0.01398397445737267, "frac_reward_zero_std": 0.0, "grad_norm": 2.0167236328125, "kl": 0.14300280809402466, "learning_rate": 6.99048496009822e-07, "loss": 0.031090067699551582, "memory(GiB)": 90.94, "reward": 0.8872500061988831, "reward_std": 0.06352110207080841, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8590624928474426, "rewards/RMReward/std": 0.08317720144987106, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 911, "train_speed(iter/s)": 0.015668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 124.15625, "completions/min_length": 8.0, "epoch": 0.013999324593988886, "frac_reward_zero_std": 0.0, "grad_norm": 68.92786407470703, "kl": 0.5112285017967224, "learning_rate": 6.99815837937385e-07, "loss": -0.005944415926933289, "memory(GiB)": 90.94, "reward": 0.8634375333786011, "reward_std": 0.26335814595222473, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8812500238418579, "rewards/RMReward/std": 0.17969882488250732, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 912, "train_speed(iter/s)": 0.01567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/mean_length": 205.6875, "completions/min_length": 75.0, "epoch": 0.014014674730605102, "frac_reward_zero_std": 0.0, "grad_norm": 2.621506929397583, "kl": 0.18512143194675446, "learning_rate": 7.005831798649479e-07, "loss": 0.03223436325788498, "memory(GiB)": 90.94, "reward": 0.693713366985321, "reward_std": 0.13704177737236023, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.693713366985321, "rewards/VisualPerceptionAccuracy/std": 0.18925221264362335, "step": 913, "train_speed(iter/s)": 0.015683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 61.5625, "completions/min_length": 13.0, "epoch": 0.014030024867221318, "frac_reward_zero_std": 0.0, "grad_norm": 19.10027503967285, "kl": 0.19859181344509125, "learning_rate": 7.013505217925108e-07, "loss": 0.014992808923125267, "memory(GiB)": 90.94, "reward": 0.7096874713897705, "reward_std": 0.2625136077404022, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.793749988079071, "rewards/RMReward/std": 0.04787135869264603, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 914, "train_speed(iter/s)": 0.015682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/mean_length": 106.25, "completions/min_length": 100.0, "epoch": 0.014045375003837533, "frac_reward_zero_std": 0.0, "grad_norm": 1.6530067920684814, "kl": 0.12475774437189102, "learning_rate": 7.021178637200737e-07, "loss": 0.004295565187931061, "memory(GiB)": 90.94, "reward": 0.8637499809265137, "reward_std": 0.04022643342614174, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8296875357627869, "rewards/RMReward/std": 0.09988652169704437, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 915, "train_speed(iter/s)": 0.015679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/mean_length": 155.1875, "completions/min_length": 100.0, "epoch": 0.01406072514045375, "frac_reward_zero_std": 0.0, "grad_norm": 0.8431693315505981, "kl": 0.09903188794851303, "learning_rate": 7.028852056476366e-07, "loss": -0.011514462530612946, "memory(GiB)": 90.94, "reward": 0.9186388850212097, "reward_std": 0.07078807801008224, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.9121874570846558, "rewards/RMReward/std": 0.09128948301076889, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 916, "train_speed(iter/s)": 0.015675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/mean_length": 110.3125, "completions/min_length": 87.0, "epoch": 0.014076075277069966, "frac_reward_zero_std": 0.0, "grad_norm": 1.6619373559951782, "kl": 0.1416858732700348, "learning_rate": 7.036525475751996e-07, "loss": 0.012540940195322037, "memory(GiB)": 90.94, "reward": 0.9112499952316284, "reward_std": 0.04162073880434036, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8890625238418579, "rewards/RMReward/std": 0.06688261777162552, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 917, "train_speed(iter/s)": 0.015676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/mean_length": 144.5625, "completions/min_length": 90.0, "epoch": 0.014091425413686182, "frac_reward_zero_std": 0.0, "grad_norm": 2.055908203125, "kl": 0.1438940167427063, "learning_rate": 7.044198895027625e-07, "loss": -0.019979529082775116, "memory(GiB)": 90.94, "reward": 0.6750463843345642, "reward_std": 0.06722434610128403, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8843749761581421, "rewards/RMReward/std": 0.07899102568626404, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4425927996635437, "rewards/VisualPerceptionAccuracy/std": 0.0712558850646019, "step": 918, "train_speed(iter/s)": 0.015669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/mean_length": 510.84375, "completions/min_length": 77.0, "epoch": 0.014106775550302398, "frac_reward_zero_std": 0.0, "grad_norm": 0.7626867890357971, "kl": 0.058205388486385345, "learning_rate": 7.051872314303254e-07, "loss": 0.003975324332714081, "memory(GiB)": 90.94, "reward": 0.5765954256057739, "reward_std": 0.12615682184696198, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8343749642372131, "rewards/RMReward/std": 0.1179247573018074, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2856907844543457, "rewards/VisualPerceptionAccuracy/std": 0.15797387063503265, "step": 919, "train_speed(iter/s)": 0.015666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/mean_length": 130.65625, "completions/min_length": 112.0, "epoch": 0.014122125686918613, "frac_reward_zero_std": 0.0, "grad_norm": 1.3764615058898926, "kl": 0.09889379143714905, "learning_rate": 7.059545733578883e-07, "loss": -0.02300296165049076, "memory(GiB)": 90.94, "reward": 0.92249995470047, "reward_std": 0.04218476265668869, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9031249284744263, "rewards/RMReward/std": 0.05670735985040665, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 920, "train_speed(iter/s)": 0.015657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 367.65625, "completions/min_length": 93.0, "epoch": 0.014137475823534829, "frac_reward_zero_std": 0.0, "grad_norm": 0.9024666547775269, "kl": 0.08868097513914108, "learning_rate": 7.067219152854513e-07, "loss": -0.2045654058456421, "memory(GiB)": 90.94, "reward": 0.5505054593086243, "reward_std": 0.059171050786972046, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9375, "rewards/RMReward/std": 0.028867509216070175, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.15101096034049988, "rewards/VisualPerceptionAccuracy/std": 0.09524808079004288, "step": 921, "train_speed(iter/s)": 0.015649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/mean_length": 53.6875, "completions/min_length": 13.0, "epoch": 0.014152825960151045, "frac_reward_zero_std": 0.0, "grad_norm": 12.612726211547852, "kl": 0.3062455952167511, "learning_rate": 7.074892572130142e-07, "loss": 0.011318448930978775, "memory(GiB)": 90.94, "reward": 0.964062511920929, "reward_std": 0.14374999701976776, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.984375, "rewards/RMReward/std": 0.0625, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 922, "train_speed(iter/s)": 0.01564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/mean_length": 164.46875, "completions/min_length": 115.0, "epoch": 0.014168176096767262, "frac_reward_zero_std": 0.0, "grad_norm": 1.6218886375427246, "kl": 0.093462273478508, "learning_rate": 7.08256599140577e-07, "loss": 0.01822078600525856, "memory(GiB)": 90.94, "reward": 0.9101388454437256, "reward_std": 0.06456516683101654, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.901562511920929, "rewards/RMReward/std": 0.09712343662977219, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 923, "train_speed(iter/s)": 0.015641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/mean_length": 184.25, "completions/min_length": 127.0, "epoch": 0.014183526233383478, "frac_reward_zero_std": 0.0, "grad_norm": 1.6388112306594849, "kl": 0.11177732050418854, "learning_rate": 7.0902394106814e-07, "loss": -0.03464597836136818, "memory(GiB)": 90.94, "reward": 0.7684966325759888, "reward_std": 0.10229432582855225, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8187500238418579, "rewards/RMReward/std": 0.079320028424263, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.681993305683136, "rewards/VisualPerceptionAccuracy/std": 0.1411326378583908, "step": 924, "train_speed(iter/s)": 0.015641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 120.875, "completions/min_length": 100.0, "epoch": 0.014198876369999693, "frac_reward_zero_std": 0.0, "grad_norm": 2.2050979137420654, "kl": 0.163753479719162, "learning_rate": 7.097912829957029e-07, "loss": 0.0017648963257670403, "memory(GiB)": 90.94, "reward": 0.6509957313537598, "reward_std": 0.24543596804141998, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.846875011920929, "rewards/RMReward/std": 0.06446897983551025, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4244914650917053, "rewards/VisualPerceptionAccuracy/std": 0.43929678201675415, "step": 925, "train_speed(iter/s)": 0.015645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/mean_length": 317.71875, "completions/min_length": 99.0, "epoch": 0.014214226506615909, "frac_reward_zero_std": 0.0, "grad_norm": 1.1472792625427246, "kl": 0.11794564127922058, "learning_rate": 7.105586249232659e-07, "loss": -0.025252357125282288, "memory(GiB)": 90.94, "reward": 0.6633573174476624, "reward_std": 0.08456546068191528, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9087499380111694, "rewards/RMReward/std": 0.051234740763902664, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.39971470832824707, "rewards/VisualPerceptionAccuracy/std": 0.12814313173294067, "step": 926, "train_speed(iter/s)": 0.015646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/mean_length": 118.625, "completions/min_length": 105.0, "epoch": 0.014229576643232125, "frac_reward_zero_std": 0.0, "grad_norm": 1.2496237754821777, "kl": 0.12722009420394897, "learning_rate": 7.113259668508287e-07, "loss": 0.0005449140444397926, "memory(GiB)": 90.94, "reward": 0.8849999904632568, "reward_std": 0.060354869812726974, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8562500476837158, "rewards/RMReward/std": 0.07593502104282379, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 927, "train_speed(iter/s)": 0.015643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/mean_length": 64.375, "completions/min_length": 8.0, "epoch": 0.01424492677984834, "frac_reward_zero_std": 0.0, "grad_norm": 38.38037872314453, "kl": 0.5691236853599548, "learning_rate": 7.120933087783917e-07, "loss": 0.0017738137394189835, "memory(GiB)": 90.94, "reward": 0.3254576027393341, "reward_std": 0.23460838198661804, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.48216521739959717, "rewards/VisualPerceptionAccuracy/std": 0.14472998678684235, "step": 928, "train_speed(iter/s)": 0.015657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/mean_length": 156.09375, "completions/min_length": 100.0, "epoch": 0.014260276916464556, "frac_reward_zero_std": 0.0, "grad_norm": 1.951472520828247, "kl": 0.08334691822528839, "learning_rate": 7.128606507059546e-07, "loss": -0.03809378668665886, "memory(GiB)": 90.94, "reward": 0.9022777676582336, "reward_std": 0.08348961919546127, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9451388716697693, "rewards/PlanningActionSetORM/std": 0.055801425129175186, "rewards/RMReward/mean": 0.8915624618530273, "rewards/RMReward/std": 0.1101936399936676, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 929, "train_speed(iter/s)": 0.015654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/mean_length": 278.3125, "completions/min_length": 98.0, "epoch": 0.014275627053080773, "frac_reward_zero_std": 0.0, "grad_norm": 1.432620644569397, "kl": 0.14940351247787476, "learning_rate": 7.136279926335176e-07, "loss": -0.009709298610687256, "memory(GiB)": 90.94, "reward": 0.7833875417709351, "reward_std": 0.07140746712684631, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.828125, "rewards/RMReward/std": 0.04819664731621742, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7042751312255859, "rewards/VisualPerceptionAccuracy/std": 0.10425764322280884, "step": 930, "train_speed(iter/s)": 0.015661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/mean_length": 115.09375, "completions/min_length": 99.0, "epoch": 0.014290977189696989, "frac_reward_zero_std": 0.0, "grad_norm": 1.089811086654663, "kl": 0.09495414793491364, "learning_rate": 7.143953345610805e-07, "loss": -0.001028638333082199, "memory(GiB)": 90.94, "reward": 0.7825000286102295, "reward_std": 0.03253787010908127, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.7437499761581421, "rewards/RMReward/std": 0.0487753264605999, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 931, "train_speed(iter/s)": 0.015657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/mean_length": 196.15625, "completions/min_length": 102.0, "epoch": 0.014306327326313205, "frac_reward_zero_std": 0.0, "grad_norm": 1.9292292594909668, "kl": 0.10735579580068588, "learning_rate": 7.151626764886433e-07, "loss": -0.0021525025367736816, "memory(GiB)": 90.94, "reward": 0.8652521967887878, "reward_std": 0.10249738395214081, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9262611865997314, "rewards/PlanningActionSetORM/std": 0.13446767628192902, "rewards/RMReward/mean": 0.8500000238418579, "rewards/RMReward/std": 0.1508042812347412, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 932, "train_speed(iter/s)": 0.015646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 150.6875, "completions/min_length": 102.0, "epoch": 0.01432167746292942, "frac_reward_zero_std": 0.0, "grad_norm": 2.0138864517211914, "kl": 0.18801338970661163, "learning_rate": 7.159300184162063e-07, "loss": 0.040801823139190674, "memory(GiB)": 90.94, "reward": 0.6585797667503357, "reward_std": 0.1615610420703888, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8062499761581421, "rewards/RMReward/std": 0.16007810831069946, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4721595048904419, "rewards/VisualPerceptionAccuracy/std": 0.1950596421957016, "step": 933, "train_speed(iter/s)": 0.015641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/mean_length": 119.5, "completions/min_length": 90.0, "epoch": 0.014337027599545636, "frac_reward_zero_std": 0.0, "grad_norm": 0.7828933596611023, "kl": 0.1037663146853447, "learning_rate": 7.166973603437692e-07, "loss": 0.0007597021758556366, "memory(GiB)": 90.94, "reward": 0.9394999742507935, "reward_std": 0.04572426155209541, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9243749976158142, "rewards/RMReward/std": 0.0817524716258049, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 934, "train_speed(iter/s)": 0.015626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/mean_length": 150.6875, "completions/min_length": 112.0, "epoch": 0.014352377736161851, "frac_reward_zero_std": 0.0, "grad_norm": 1.4929102659225464, "kl": 0.09708437323570251, "learning_rate": 7.174647022713322e-07, "loss": 0.011621825397014618, "memory(GiB)": 90.94, "reward": 0.8925000429153442, "reward_std": 0.04207826405763626, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9624999761581421, "rewards/PlanningActionSetORM/std": 0.11845782399177551, "rewards/RMReward/mean": 0.875, "rewards/RMReward/std": 0.12115039676427841, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 935, "train_speed(iter/s)": 0.015626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 118.0, "completions/min_length": 8.0, "epoch": 0.014367727872778067, "frac_reward_zero_std": 0.5, "grad_norm": 8.659906598040834e-05, "kl": 0.4051535129547119, "learning_rate": 7.18232044198895e-07, "loss": 0.00040842965245246887, "memory(GiB)": 90.94, "reward": 0.5230000019073486, "reward_std": 0.005059646442532539, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9950000047683716, "rewards/RMReward/std": 0.012649113312363625, "rewards/SpatialReasoningORM/mean": 0.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 936, "train_speed(iter/s)": 0.015613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/mean_length": 124.8125, "completions/min_length": 105.0, "epoch": 0.014383078009394283, "frac_reward_zero_std": 0.0, "grad_norm": 2.0008809566497803, "kl": 0.16801831126213074, "learning_rate": 7.18999386126458e-07, "loss": 0.01735023967921734, "memory(GiB)": 90.94, "reward": 0.7284713387489319, "reward_std": 0.1178940013051033, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9631249904632568, "rewards/RMReward/std": 0.02056494727730751, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4864426553249359, "rewards/VisualPerceptionAccuracy/std": 0.21933606266975403, "step": 937, "train_speed(iter/s)": 0.015614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/mean_length": 55.8125, "completions/min_length": 2.0, "epoch": 0.0143984281460105, "frac_reward_zero_std": 0.0, "grad_norm": 192.58750915527344, "kl": 0.10102006793022156, "learning_rate": 7.197667280540209e-07, "loss": 0.01606135629117489, "memory(GiB)": 90.94, "reward": 0.6662499904632568, "reward_std": 0.2802569270133972, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8687499761581421, "rewards/RMReward/std": 0.06020796298980713, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4375, "rewards/VisualPerceptionAccuracy/std": 0.5123475790023804, "step": 938, "train_speed(iter/s)": 0.015619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 205.03125, "completions/min_length": 126.0, "epoch": 0.014413778282626716, "frac_reward_zero_std": 0.0, "grad_norm": 1.7222883701324463, "kl": 0.11839155852794647, "learning_rate": 7.205340699815839e-07, "loss": -0.1017165333032608, "memory(GiB)": 90.94, "reward": 0.7255916595458984, "reward_std": 0.14107546210289001, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7843749523162842, "rewards/RMReward/std": 0.0625, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6236832141876221, "rewards/VisualPerceptionAccuracy/std": 0.23215094208717346, "step": 939, "train_speed(iter/s)": 0.015619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/mean_length": 121.9375, "completions/min_length": 94.0, "epoch": 0.014429128419242931, "frac_reward_zero_std": 0.0, "grad_norm": 2.1704225540161133, "kl": 0.15825524926185608, "learning_rate": 7.213014119091467e-07, "loss": -0.004071585834026337, "memory(GiB)": 90.94, "reward": 0.8681249618530273, "reward_std": 0.05815871059894562, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.01767767407000065, "rewards/RMReward/mean": 0.8359375, "rewards/RMReward/std": 0.08541584759950638, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 940, "train_speed(iter/s)": 0.015621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/mean_length": 58.8125, "completions/min_length": 8.0, "epoch": 0.014444478555859147, "frac_reward_zero_std": 0.0, "grad_norm": 37.5047492980957, "kl": 0.46401193737983704, "learning_rate": 7.220687538367096e-07, "loss": -0.001067373901605606, "memory(GiB)": 90.94, "reward": 0.5678125023841858, "reward_std": 0.2204325944185257, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8843749761581421, "rewards/RMReward/std": 0.07238496094942093, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 941, "train_speed(iter/s)": 0.015622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/mean_length": 63.3125, "completions/min_length": 8.0, "epoch": 0.014459828692475363, "frac_reward_zero_std": 0.0, "grad_norm": 18.001787185668945, "kl": 0.43692925572395325, "learning_rate": 7.228360957642726e-07, "loss": 0.0012612231075763702, "memory(GiB)": 90.94, "reward": 0.5296874642372131, "reward_std": 0.1302970051765442, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9375, "rewards/RMReward/std": 0.028867509216070175, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 942, "train_speed(iter/s)": 0.015625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/mean_length": 189.5625, "completions/min_length": 119.0, "epoch": 0.014475178829091578, "frac_reward_zero_std": 0.0, "grad_norm": 1.0304698944091797, "kl": 0.05896314978599548, "learning_rate": 7.236034376918355e-07, "loss": 0.0009200386703014374, "memory(GiB)": 90.94, "reward": 0.9547500014305115, "reward_std": 0.018126964569091797, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9434374570846558, "rewards/RMReward/std": 0.06418769061565399, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 943, "train_speed(iter/s)": 0.015609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/mean_length": 117.25, "completions/min_length": 8.0, "epoch": 0.014490528965707794, "frac_reward_zero_std": 0.0, "grad_norm": 29.45670509338379, "kl": 0.5590760707855225, "learning_rate": 7.243707796193984e-07, "loss": 0.0018326044082641602, "memory(GiB)": 90.94, "reward": 0.9262840747833252, "reward_std": 0.16993321478366852, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9090909361839294, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9868749976158142, "rewards/RMReward/std": 0.019224554300308228, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 944, "train_speed(iter/s)": 0.015607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 127.15625, "completions/min_length": 90.0, "epoch": 0.014505879102324011, "frac_reward_zero_std": 0.0, "grad_norm": 1.5146483182907104, "kl": 0.12032702565193176, "learning_rate": 7.251381215469613e-07, "loss": 0.04040007293224335, "memory(GiB)": 90.94, "reward": 0.8285000324249268, "reward_std": 0.05267549306154251, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7856249809265137, "rewards/RMReward/std": 0.08556972444057465, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 945, "train_speed(iter/s)": 0.015611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/mean_length": 110.75, "completions/min_length": 101.0, "epoch": 0.014521229238940227, "frac_reward_zero_std": 0.0, "grad_norm": 1.560992956161499, "kl": 0.1464795470237732, "learning_rate": 7.259054634745243e-07, "loss": -0.002984359860420227, "memory(GiB)": 90.94, "reward": 0.8743749856948853, "reward_std": 0.049766287207603455, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9906250238418579, "rewards/PlanningActionSetORM/std": 0.029614463448524475, "rewards/RMReward/mean": 0.8453124761581421, "rewards/RMReward/std": 0.12271089106798172, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 946, "train_speed(iter/s)": 0.015604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 109.75, "completions/min_length": 105.0, "epoch": 0.014536579375556443, "frac_reward_zero_std": 0.0, "grad_norm": 0.18009671568870544, "kl": 0.12946078181266785, "learning_rate": 7.266728054020872e-07, "loss": 0.0008553769439458847, "memory(GiB)": 90.94, "reward": 0.8734999895095825, "reward_std": 0.04904608055949211, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8418750166893005, "rewards/RMReward/std": 0.17693515121936798, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 947, "train_speed(iter/s)": 0.015604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/mean_length": 120.9375, "completions/min_length": 98.0, "epoch": 0.014551929512172658, "frac_reward_zero_std": 0.0, "grad_norm": 2.164644479751587, "kl": 0.11107125133275986, "learning_rate": 7.274401473296501e-07, "loss": 0.05986110121011734, "memory(GiB)": 90.94, "reward": 0.8699999451637268, "reward_std": 0.03907374292612076, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.0707106739282608, "rewards/RMReward/mean": 0.840624988079071, "rewards/RMReward/std": 0.10273478180170059, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 948, "train_speed(iter/s)": 0.01559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 287.3125, "completions/min_length": 8.0, "epoch": 0.014567279648788874, "frac_reward_zero_std": 0.0, "grad_norm": 30.0937557220459, "kl": 0.4299609065055847, "learning_rate": 7.28207489257213e-07, "loss": -0.054474126547575, "memory(GiB)": 90.94, "reward": 0.2357202023267746, "reward_std": 0.2902560830116272, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": 0.12456539273262024, "rewards/VisualPerceptionAccuracy/std": 0.12573426961898804, "step": 949, "train_speed(iter/s)": 0.015595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 183.46875, "completions/min_length": 102.0, "epoch": 0.01458262978540509, "frac_reward_zero_std": 0.0, "grad_norm": 0.41036781668663025, "kl": 0.07679446041584015, "learning_rate": 7.289748311847759e-07, "loss": 0.011640295386314392, "memory(GiB)": 90.94, "reward": 0.9527499675750732, "reward_std": 0.056456007063388824, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9409374594688416, "rewards/RMReward/std": 0.08157163113355637, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 950, "train_speed(iter/s)": 0.015583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/mean_length": 156.53125, "completions/min_length": 112.0, "epoch": 0.014597979922021305, "frac_reward_zero_std": 0.0, "grad_norm": 0.6407934427261353, "kl": 0.09666426479816437, "learning_rate": 7.297421731123389e-07, "loss": 0.0017949864268302917, "memory(GiB)": 90.94, "reward": 0.8698889017105103, "reward_std": 0.039021141827106476, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.8512499928474426, "rewards/RMReward/std": 0.08319234848022461, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 951, "train_speed(iter/s)": 0.015571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/mean_length": 155.25, "completions/min_length": 88.0, "epoch": 0.014613330058637523, "frac_reward_zero_std": 0.0, "grad_norm": 2.318927526473999, "kl": 0.13557758927345276, "learning_rate": 7.305095150399017e-07, "loss": -0.04509638994932175, "memory(GiB)": 90.94, "reward": 0.6047863364219666, "reward_std": 0.1715211570262909, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.84375, "rewards/RMReward/std": 0.0793200209736824, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3345726430416107, "rewards/VisualPerceptionAccuracy/std": 0.2795863151550293, "step": 952, "train_speed(iter/s)": 0.015578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/mean_length": 135.125, "completions/min_length": 102.0, "epoch": 0.014628680195253738, "frac_reward_zero_std": 0.0, "grad_norm": 1.185038447380066, "kl": 0.11862577497959137, "learning_rate": 7.312768569674647e-07, "loss": 0.011174183338880539, "memory(GiB)": 90.94, "reward": 0.9647499918937683, "reward_std": 0.06118711084127426, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9559375047683716, "rewards/RMReward/std": 0.07703620195388794, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 953, "train_speed(iter/s)": 0.015567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 142.125, "completions/min_length": 8.0, "epoch": 0.014644030331869954, "frac_reward_zero_std": 0.0, "grad_norm": 46.12839126586914, "kl": 0.6035248041152954, "learning_rate": 7.320441988950276e-07, "loss": -0.005546145141124725, "memory(GiB)": 90.94, "reward": 0.43774843215942383, "reward_std": 0.3069223463535309, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": 0.35049691796302795, "rewards/VisualPerceptionAccuracy/std": 0.1232668086886406, "step": 954, "train_speed(iter/s)": 0.015567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/mean_length": 53.4375, "completions/min_length": 8.0, "epoch": 0.01465938046848617, "frac_reward_zero_std": 0.0, "grad_norm": 14.951556205749512, "kl": 0.3802618384361267, "learning_rate": 7.328115408225906e-07, "loss": -0.039458051323890686, "memory(GiB)": 90.94, "reward": 0.7796875238418579, "reward_std": 0.20406070351600647, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.84375, "rewards/PlanningActionSetORM/std": 0.055901702493429184, "rewards/RMReward/mean": 0.5625, "rewards/RMReward/std": 0.2053452432155609, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 955, "train_speed(iter/s)": 0.015553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/mean_length": 131.5625, "completions/min_length": 104.0, "epoch": 0.014674730605102385, "frac_reward_zero_std": 0.0, "grad_norm": 1.2112092971801758, "kl": 0.08664943277835846, "learning_rate": 7.335788827501535e-07, "loss": 0.006223045289516449, "memory(GiB)": 90.94, "reward": 0.9119791984558105, "reward_std": 0.030469922348856926, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9973958730697632, "rewards/PlanningActionSetORM/std": 0.014731387607753277, "rewards/RMReward/mean": 0.890625, "rewards/RMReward/std": 0.11319231241941452, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 956, "train_speed(iter/s)": 0.015545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 264.625, "completions/min_length": 122.0, "epoch": 0.014690080741718601, "frac_reward_zero_std": 0.0, "grad_norm": 1.342129111289978, "kl": 0.06769202649593353, "learning_rate": 7.343462246777165e-07, "loss": -0.00818992406129837, "memory(GiB)": 90.94, "reward": 0.8400000333786011, "reward_std": 0.048725374042987823, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.800000011920929, "rewards/RMReward/std": 0.06090712174773216, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 957, "train_speed(iter/s)": 0.015546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/mean_length": 107.21875, "completions/min_length": 73.0, "epoch": 0.014705430878334817, "frac_reward_zero_std": 0.0, "grad_norm": 1.6892038583755493, "kl": 0.1900276243686676, "learning_rate": 7.351135666052794e-07, "loss": -0.03075726330280304, "memory(GiB)": 90.94, "reward": 0.7161562442779541, "reward_std": 0.1003483384847641, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.92578125, "rewards/PlanningActionSetORM/std": 0.08315462619066238, "rewards/RMReward/mean": 0.6637499928474426, "rewards/RMReward/std": 0.22393473982810974, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 958, "train_speed(iter/s)": 0.015545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 151.96875, "completions/min_length": 101.0, "epoch": 0.014720781014951032, "frac_reward_zero_std": 0.0, "grad_norm": 1.6949481964111328, "kl": 0.1481599509716034, "learning_rate": 7.358809085328424e-07, "loss": -0.0426805354654789, "memory(GiB)": 90.94, "reward": 0.7523877024650574, "reward_std": 0.07941075414419174, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.887499988079071, "rewards/RMReward/std": 0.04999998211860657, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5947754383087158, "rewards/VisualPerceptionAccuracy/std": 0.11882152408361435, "step": 959, "train_speed(iter/s)": 0.015548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/mean_length": 114.09375, "completions/min_length": 90.0, "epoch": 0.01473613115156725, "frac_reward_zero_std": 0.0, "grad_norm": 2.025787115097046, "kl": 0.13843712210655212, "learning_rate": 7.366482504604053e-07, "loss": -0.014688961207866669, "memory(GiB)": 90.94, "reward": 0.9149999618530273, "reward_std": 0.03954201191663742, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8937499523162842, "rewards/RMReward/std": 0.06318175047636032, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 960, "train_speed(iter/s)": 0.015551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/mean_length": 101.9375, "completions/min_length": 91.0, "epoch": 0.014751481288183465, "frac_reward_zero_std": 0.0, "grad_norm": 1.3468854427337646, "kl": 0.15980073809623718, "learning_rate": 7.374155923879682e-07, "loss": 0.005400367081165314, "memory(GiB)": 90.94, "reward": 0.9075000286102295, "reward_std": 0.03486078605055809, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8843749761581421, "rewards/RMReward/std": 0.08747118711471558, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 961, "train_speed(iter/s)": 0.015545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/mean_length": 55.28125, "completions/min_length": 8.0, "epoch": 0.014766831424799681, "frac_reward_zero_std": 0.0, "grad_norm": 48.829803466796875, "kl": 0.347678005695343, "learning_rate": 7.381829343155311e-07, "loss": 0.01834068074822426, "memory(GiB)": 90.94, "reward": 0.6735937595367432, "reward_std": 0.3002474009990692, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.870312511920929, "rewards/PlanningActionSetORM/std": 0.01874999701976776, "rewards/RMReward/mean": 0.8843750357627869, "rewards/RMReward/std": 0.13870683312416077, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 962, "train_speed(iter/s)": 0.015529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 139.59375, "completions/min_length": 13.0, "epoch": 0.014782181561415897, "frac_reward_zero_std": 0.0, "grad_norm": 13.279778480529785, "kl": 0.09211206436157227, "learning_rate": 7.38950276243094e-07, "loss": -0.022863812744617462, "memory(GiB)": 90.94, "reward": 0.9291827082633972, "reward_std": 0.1772433966398239, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9230769276618958, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9906250238418579, "rewards/RMReward/std": 0.03749999403953552, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 963, "train_speed(iter/s)": 0.01553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 289.90625, "completions/min_length": 13.0, "epoch": 0.014797531698032112, "frac_reward_zero_std": 0.0, "grad_norm": 22.715816497802734, "kl": 0.14287903904914856, "learning_rate": 7.39717618170657e-07, "loss": -0.11714605987071991, "memory(GiB)": 90.94, "reward": 0.502117395401001, "reward_std": 0.35251420736312866, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.5386097431182861, "rewards/VisualPerceptionAccuracy/std": 0.2182982861995697, "step": 964, "train_speed(iter/s)": 0.015535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/mean_length": 112.65625, "completions/min_length": 99.0, "epoch": 0.014812881834648328, "frac_reward_zero_std": 0.0, "grad_norm": 1.9043545722961426, "kl": 0.14594897627830505, "learning_rate": 7.404849600982198e-07, "loss": -0.005904369056224823, "memory(GiB)": 90.94, "reward": 0.7679687738418579, "reward_std": 0.050372414290905, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.889843761920929, "rewards/PlanningActionSetORM/std": 0.03694992884993553, "rewards/RMReward/mean": 0.7374999523162842, "rewards/RMReward/std": 0.07184212654829025, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 965, "train_speed(iter/s)": 0.01553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 161.59375, "completions/min_length": 108.0, "epoch": 0.014828231971264543, "frac_reward_zero_std": 0.0, "grad_norm": 1.4940966367721558, "kl": 0.11580687016248703, "learning_rate": 7.412523020257828e-07, "loss": 0.017757102847099304, "memory(GiB)": 90.94, "reward": 0.8783035278320312, "reward_std": 0.0542084276676178, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9602678418159485, "rewards/PlanningActionSetORM/std": 0.12594522535800934, "rewards/RMReward/mean": 0.8578124642372131, "rewards/RMReward/std": 0.0833853930234909, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 966, "train_speed(iter/s)": 0.015518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/mean_length": 129.875, "completions/min_length": 90.0, "epoch": 0.014843582107880761, "frac_reward_zero_std": 0.0, "grad_norm": 1.576491117477417, "kl": 0.11134432256221771, "learning_rate": 7.420196439533457e-07, "loss": -0.0035111140459775925, "memory(GiB)": 90.94, "reward": 0.8521875143051147, "reward_std": 0.05506587773561478, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9921875, "rewards/PlanningActionSetORM/std": 0.03074183501303196, "rewards/RMReward/mean": 0.8171875476837158, "rewards/RMReward/std": 0.08945117145776749, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 967, "train_speed(iter/s)": 0.015505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/mean_length": 121.75, "completions/min_length": 104.0, "epoch": 0.014858932244496977, "frac_reward_zero_std": 0.0, "grad_norm": 1.8645356893539429, "kl": 0.127556711435318, "learning_rate": 7.427869858809087e-07, "loss": 0.011928454041481018, "memory(GiB)": 90.94, "reward": 0.8500000238418579, "reward_std": 0.028057891875505447, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8125, "rewards/RMReward/std": 0.07295601814985275, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 968, "train_speed(iter/s)": 0.015508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/mean_length": 59.6875, "completions/min_length": 8.0, "epoch": 0.014874282381113192, "frac_reward_zero_std": 0.0, "grad_norm": 30.605724334716797, "kl": 0.36512404680252075, "learning_rate": 7.435543278084715e-07, "loss": -0.00017729029059410095, "memory(GiB)": 90.94, "reward": 0.7075625061988831, "reward_std": 0.27704495191574097, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.93687504529953, "rewards/RMReward/std": 0.08419966697692871, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 969, "train_speed(iter/s)": 0.015505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/mean_length": 331.3125, "completions/min_length": 145.0, "epoch": 0.014889632517729408, "frac_reward_zero_std": 0.0, "grad_norm": 0.8956571817398071, "kl": 0.11156492680311203, "learning_rate": 7.443216697360345e-07, "loss": 0.012297466397285461, "memory(GiB)": 90.94, "reward": 0.6115838289260864, "reward_std": 0.09850583970546722, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.859375, "rewards/RMReward/std": 0.07576002925634384, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3356676399707794, "rewards/VisualPerceptionAccuracy/std": 0.13640369474887848, "step": 970, "train_speed(iter/s)": 0.015509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.5, "completions/min_length": 8.0, "epoch": 0.014904982654345623, "frac_reward_zero_std": 1.0, "grad_norm": 1.2409598639351316e-05, "kl": 0.8177083730697632, "learning_rate": 7.450890116635974e-07, "loss": 0.0008172128000296652, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 971, "train_speed(iter/s)": 0.015522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/mean_length": 321.9375, "completions/min_length": 145.0, "epoch": 0.014920332790961839, "frac_reward_zero_std": 0.0, "grad_norm": 0.9610666036605835, "kl": 0.09067404270172119, "learning_rate": 7.458563535911603e-07, "loss": 0.024569140747189522, "memory(GiB)": 90.94, "reward": 0.5949399471282959, "reward_std": 0.08032379299402237, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9375, "rewards/RMReward/std": 0.02236068621277809, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.23987996578216553, "rewards/VisualPerceptionAccuracy/std": 0.1427590250968933, "step": 972, "train_speed(iter/s)": 0.015524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/mean_length": 140.9375, "completions/min_length": 122.0, "epoch": 0.014935682927578055, "frac_reward_zero_std": 0.0, "grad_norm": 1.3832200765609741, "kl": 0.12207087874412537, "learning_rate": 7.466236955187232e-07, "loss": 0.012226805090904236, "memory(GiB)": 90.94, "reward": 0.8351041674613953, "reward_std": 0.06716414541006088, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9817708730697632, "rewards/PlanningActionSetORM/std": 0.035001110285520554, "rewards/RMReward/mean": 0.7984374761581421, "rewards/RMReward/std": 0.14836983382701874, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 973, "train_speed(iter/s)": 0.015511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 408.09375, "completions/min_length": 106.0, "epoch": 0.014951033064194272, "frac_reward_zero_std": 0.0, "grad_norm": 1.4502918720245361, "kl": 0.12430889904499054, "learning_rate": 7.473910374462861e-07, "loss": -0.06033812463283539, "memory(GiB)": 90.94, "reward": 0.6236739158630371, "reward_std": 0.13826125860214233, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9937499761581421, "rewards/PlanningActionSetORM/std": 0.025000005960464478, "rewards/RMReward/mean": 0.6843750476837158, "rewards/RMReward/std": 0.08107352256774902, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5010978579521179, "rewards/VisualPerceptionAccuracy/std": 0.2125564068555832, "step": 974, "train_speed(iter/s)": 0.015506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/mean_length": 110.625, "completions/min_length": 80.0, "epoch": 0.014966383200810488, "frac_reward_zero_std": 0.0, "grad_norm": 2.322338819503784, "kl": 0.16040775179862976, "learning_rate": 7.481583793738491e-07, "loss": -0.029873017221689224, "memory(GiB)": 90.94, "reward": 0.7962720394134521, "reward_std": 0.119886115193367, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8062499761581421, "rewards/RMReward/std": 0.04787136986851692, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7475440502166748, "rewards/VisualPerceptionAccuracy/std": 0.20147515833377838, "step": 975, "train_speed(iter/s)": 0.015514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/mean_length": 106.09375, "completions/min_length": 92.0, "epoch": 0.014981733337426703, "frac_reward_zero_std": 0.0, "grad_norm": 1.9634389877319336, "kl": 0.11785170435905457, "learning_rate": 7.48925721301412e-07, "loss": -0.03377607464790344, "memory(GiB)": 90.94, "reward": 0.8693749904632568, "reward_std": 0.043565861880779266, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.984375, "rewards/PlanningActionSetORM/std": 0.04200134426355362, "rewards/RMReward/mean": 0.840624988079071, "rewards/RMReward/std": 0.07343802601099014, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 976, "train_speed(iter/s)": 0.015514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 264.4375, "completions/min_length": 123.0, "epoch": 0.014997083474042919, "frac_reward_zero_std": 0.0, "grad_norm": 0.468097448348999, "kl": 0.0795922800898552, "learning_rate": 7.49693063228975e-07, "loss": 0.0007211193442344666, "memory(GiB)": 90.94, "reward": 0.8559868335723877, "reward_std": 0.06173539161682129, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9736841917037964, "rewards/PlanningActionSetORM/std": 0.026736857369542122, "rewards/RMReward/mean": 0.8265625238418579, "rewards/RMReward/std": 0.13258251547813416, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 977, "train_speed(iter/s)": 0.015509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 239.78125, "completions/min_length": 103.0, "epoch": 0.015012433610659135, "frac_reward_zero_std": 0.0, "grad_norm": 1.5022907257080078, "kl": 0.10580089688301086, "learning_rate": 7.504604051565378e-07, "loss": 0.011024482548236847, "memory(GiB)": 90.94, "reward": 0.7010507583618164, "reward_std": 0.08913911134004593, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9118750095367432, "rewards/RMReward/std": 0.05528334900736809, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4726014733314514, "rewards/VisualPerceptionAccuracy/std": 0.13405154645442963, "step": 978, "train_speed(iter/s)": 0.015513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/mean_length": 118.5625, "completions/min_length": 94.0, "epoch": 0.01502778374727535, "frac_reward_zero_std": 0.0, "grad_norm": 1.9408226013183594, "kl": 0.14264926314353943, "learning_rate": 7.512277470841008e-07, "loss": 0.0035346411168575287, "memory(GiB)": 90.94, "reward": 0.8949999809265137, "reward_std": 0.05899765342473984, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.8843750357627869, "rewards/RMReward/std": 0.08654431253671646, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 979, "train_speed(iter/s)": 0.015513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/mean_length": 133.0, "completions/min_length": 8.0, "epoch": 0.015043133883891566, "frac_reward_zero_std": 0.0, "grad_norm": 51.628448486328125, "kl": 0.5332334041595459, "learning_rate": 7.519950890116637e-07, "loss": 0.0005337372422218323, "memory(GiB)": 90.94, "reward": 0.7869374752044678, "reward_std": 0.2633233666419983, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9868749976158142, "rewards/RMReward/std": 0.049895722419023514, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 980, "train_speed(iter/s)": 0.015517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/mean_length": 117.84375, "completions/min_length": 106.0, "epoch": 0.015058484020507782, "frac_reward_zero_std": 0.0, "grad_norm": 2.1170413494110107, "kl": 0.15987777709960938, "learning_rate": 7.527624309392266e-07, "loss": -0.003629859536886215, "memory(GiB)": 90.94, "reward": 0.7765564918518066, "reward_std": 0.13142840564250946, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9706250429153442, "rewards/RMReward/std": 0.06265980005264282, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5766129493713379, "rewards/VisualPerceptionAccuracy/std": 0.21272894740104675, "step": 981, "train_speed(iter/s)": 0.015518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/mean_length": 96.34375, "completions/min_length": 69.0, "epoch": 0.015073834157123999, "frac_reward_zero_std": 0.0, "grad_norm": 1.6000083684921265, "kl": 0.17519250512123108, "learning_rate": 7.535297728667895e-07, "loss": -0.003197290003299713, "memory(GiB)": 90.94, "reward": 0.8525000214576721, "reward_std": 0.04960283637046814, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.831250011920929, "rewards/RMReward/std": 0.07698972523212433, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 982, "train_speed(iter/s)": 0.015511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.40625, "completions/min_length": 8.0, "epoch": 0.015089184293740215, "frac_reward_zero_std": 0.0, "grad_norm": 81.32865142822266, "kl": 1.078603744506836, "learning_rate": 7.542971147943524e-07, "loss": 0.016840122640132904, "memory(GiB)": 90.94, "reward": 0.40625, "reward_std": 0.4348437190055847, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.49186936020851135, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 983, "train_speed(iter/s)": 0.015513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/mean_length": 157.28125, "completions/min_length": 102.0, "epoch": 0.01510453443035643, "frac_reward_zero_std": 0.0, "grad_norm": 1.5907434225082397, "kl": 0.1230667382478714, "learning_rate": 7.550644567219154e-07, "loss": -0.010032661259174347, "memory(GiB)": 90.94, "reward": 0.8666250109672546, "reward_std": 0.037645820528268814, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.01767767407000065, "rewards/RMReward/mean": 0.8340624570846558, "rewards/RMReward/std": 0.17906314134597778, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 984, "train_speed(iter/s)": 0.015516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/mean_length": 107.3125, "completions/min_length": 89.0, "epoch": 0.015119884566972646, "frac_reward_zero_std": 0.0, "grad_norm": 1.955061674118042, "kl": 0.1782384216785431, "learning_rate": 7.558317986494783e-07, "loss": 0.005148295313119888, "memory(GiB)": 90.94, "reward": 0.8379687666893005, "reward_std": 0.06201374903321266, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.99609375, "rewards/PlanningActionSetORM/std": 0.022097086533904076, "rewards/RMReward/mean": 0.7984374761581421, "rewards/RMReward/std": 0.07980255037546158, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 985, "train_speed(iter/s)": 0.01552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/mean_length": 147.9375, "completions/min_length": 118.0, "epoch": 0.015135234703588862, "frac_reward_zero_std": 0.0, "grad_norm": 1.7269610166549683, "kl": 0.14002332091331482, "learning_rate": 7.565991405770412e-07, "loss": 0.012882303446531296, "memory(GiB)": 90.94, "reward": 0.7745199203491211, "reward_std": 0.07980799674987793, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.6312500238418579, "rewards/RMReward/std": 0.11236102879047394, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.8440397381782532, "rewards/VisualPerceptionAccuracy/std": 0.06972715258598328, "step": 986, "train_speed(iter/s)": 0.01552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/mean_length": 199.5, "completions/min_length": 152.0, "epoch": 0.015150584840205077, "frac_reward_zero_std": 0.0, "grad_norm": 0.0004662986902985722, "kl": 0.06069711595773697, "learning_rate": 7.573664825046041e-07, "loss": 6.081536412239075e-05, "memory(GiB)": 90.94, "reward": 0.9112499952316284, "reward_std": 0.13364022970199585, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8890625238418579, "rewards/RMReward/std": 0.18996365368366241, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 987, "train_speed(iter/s)": 0.015509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/mean_length": 123.53125, "completions/min_length": 110.0, "epoch": 0.015165934976821293, "frac_reward_zero_std": 0.0, "grad_norm": 1.6208240985870361, "kl": 0.18087512254714966, "learning_rate": 7.581338244321671e-07, "loss": 0.0058390796184539795, "memory(GiB)": 90.94, "reward": 0.8162500262260437, "reward_std": 0.1043223962187767, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.770312488079071, "rewards/RMReward/std": 0.15019309520721436, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 988, "train_speed(iter/s)": 0.015509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 163.9375, "completions/min_length": 8.0, "epoch": 0.01518128511343751, "frac_reward_zero_std": 0.0, "grad_norm": 103.33522033691406, "kl": 0.6886765956878662, "learning_rate": 7.5890116635973e-07, "loss": -0.029952526092529297, "memory(GiB)": 90.94, "reward": 0.6260830163955688, "reward_std": 0.2826540172100067, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.6084160804748535, "rewards/VisualPerceptionAccuracy/std": 0.0903080552816391, "step": 989, "train_speed(iter/s)": 0.015521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/mean_length": 93.5, "completions/min_length": 8.0, "epoch": 0.015196635250053726, "frac_reward_zero_std": 0.0, "grad_norm": 20.629486083984375, "kl": 0.5610105991363525, "learning_rate": 7.596685082872928e-07, "loss": 0.0005640313029289246, "memory(GiB)": 90.94, "reward": 0.9698125123977661, "reward_std": 0.12011625617742538, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9987499713897705, "rewards/RMReward/std": 0.0034156469628214836, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 990, "train_speed(iter/s)": 0.015518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/mean_length": 167.71875, "completions/min_length": 92.0, "epoch": 0.015211985386669942, "frac_reward_zero_std": 0.0, "grad_norm": 1.2484015226364136, "kl": 0.10662993043661118, "learning_rate": 7.604358502148558e-07, "loss": 0.05227525532245636, "memory(GiB)": 90.94, "reward": 0.7523794174194336, "reward_std": 0.07845987379550934, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9931468963623047, "rewards/PlanningActionSetORM/std": 0.03060940094292164, "rewards/RMReward/mean": 0.6921875476837158, "rewards/RMReward/std": 0.13802137970924377, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 991, "train_speed(iter/s)": 0.015519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/mean_length": 53.3125, "completions/min_length": 8.0, "epoch": 0.015227335523286157, "frac_reward_zero_std": 0.0, "grad_norm": 48.77431106567383, "kl": 0.7200738191604614, "learning_rate": 7.612031921424187e-07, "loss": -0.010040249675512314, "memory(GiB)": 90.94, "reward": 0.760937511920929, "reward_std": 0.2460414469242096, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.96875, "rewards/PlanningActionSetORM/std": 0.055901702493429184, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.04787136986851692, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 992, "train_speed(iter/s)": 0.015522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 151.9375, "completions/min_length": 127.0, "epoch": 0.015242685659902373, "frac_reward_zero_std": 0.0, "grad_norm": 2.106827735900879, "kl": 0.14036618173122406, "learning_rate": 7.619705340699817e-07, "loss": 0.07346686720848083, "memory(GiB)": 90.94, "reward": 0.8146189451217651, "reward_std": 0.11572806537151337, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8874999284744263, "rewards/RMReward/std": 0.07852812111377716, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7192379832267761, "rewards/VisualPerceptionAccuracy/std": 0.16863366961479187, "step": 993, "train_speed(iter/s)": 0.01552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/mean_length": 123.84375, "completions/min_length": 102.0, "epoch": 0.015258035796518589, "frac_reward_zero_std": 0.0, "grad_norm": 1.4551893472671509, "kl": 0.12433330714702606, "learning_rate": 7.627378759975445e-07, "loss": -0.01765509508550167, "memory(GiB)": 90.94, "reward": 0.9427499771118164, "reward_std": 0.03884084150195122, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9284375309944153, "rewards/RMReward/std": 0.06816059350967407, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 994, "train_speed(iter/s)": 0.015522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1884.0, "completions/mean_length": 296.5, "completions/min_length": 162.0, "epoch": 0.015273385933134804, "frac_reward_zero_std": 0.0, "grad_norm": 1.0129081010818481, "kl": 0.10227860510349274, "learning_rate": 7.635052179251075e-07, "loss": 0.0017900541424751282, "memory(GiB)": 90.94, "reward": 0.5835635662078857, "reward_std": 0.08161450177431107, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9500000476837158, "rewards/RMReward/std": 0.025819895789027214, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2071271687746048, "rewards/VisualPerceptionAccuracy/std": 0.14257307350635529, "step": 995, "train_speed(iter/s)": 0.015516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/mean_length": 122.59375, "completions/min_length": 107.0, "epoch": 0.015288736069751022, "frac_reward_zero_std": 0.0, "grad_norm": 1.3909235000610352, "kl": 0.13015511631965637, "learning_rate": 7.642725598526704e-07, "loss": -0.02727035991847515, "memory(GiB)": 90.94, "reward": 0.9182499647140503, "reward_std": 0.089841827750206, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8978124856948853, "rewards/RMReward/std": 0.12294069677591324, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 996, "train_speed(iter/s)": 0.015501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/mean_length": 11.875, "completions/min_length": 9.0, "epoch": 0.015304086206367237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009333066991530359, "kl": 0.4191828966140747, "learning_rate": 7.650399017802334e-07, "loss": 0.0004193430067971349, "memory(GiB)": 90.94, "reward": 0.5249999761581421, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 997, "train_speed(iter/s)": 0.015492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/mean_length": 113.3125, "completions/min_length": 94.0, "epoch": 0.015319436342983453, "frac_reward_zero_std": 0.0, "grad_norm": 1.2406524419784546, "kl": 0.11762101203203201, "learning_rate": 7.658072437077962e-07, "loss": -0.01016131043434143, "memory(GiB)": 90.94, "reward": 0.8425000309944153, "reward_std": 0.07612987607717514, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.8187500238418579, "rewards/RMReward/std": 0.14070673286914825, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 998, "train_speed(iter/s)": 0.015481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 135.5, "completions/min_length": 8.0, "epoch": 0.015334786479599669, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003465648624114692, "kl": 0.43044498562812805, "learning_rate": 7.665745856353591e-07, "loss": 0.00043089533573947847, "memory(GiB)": 90.94, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 1.0, "rewards/RMReward/std": 0.0, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 999, "train_speed(iter/s)": 0.015475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/mean_length": 225.8125, "completions/min_length": 91.0, "epoch": 0.015350136616215884, "frac_reward_zero_std": 0.0, "grad_norm": 1.5104966163635254, "kl": 0.14533330500125885, "learning_rate": 7.673419275629221e-07, "loss": -0.005463648587465286, "memory(GiB)": 90.94, "reward": 0.6617385149002075, "reward_std": 0.07101516425609589, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.893750011920929, "rewards/RMReward/std": 0.04787134379148483, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4084770083427429, "rewards/VisualPerceptionAccuracy/std": 0.10373327881097794, "step": 1000, "train_speed(iter/s)": 0.015477 } ], "logging_steps": 1, "max_steps": 65146, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }