{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.023872045834328, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/mean_length": 26.828125, "completions/min_length": 2.0, "epoch": 3.4102922620468576e-05, "frac_reward_zero_std": 0.0, "grad_norm": 20.063051223754883, "kl": 0.0, "learning_rate": 1.7041581458759375e-09, "loss": -0.13861791789531708, "memory(GiB)": 66.29, "reward": 0.6236249804496765, "reward_std": 0.2238796353340149, "rewards/MathAnswerFormat/mean": 0.3958333432674408, "rewards/MathAnswerFormat/std": 0.49420398473739624, "rewards/PlanningActionSetORM/mean": 0.5062500238418579, "rewards/PlanningActionSetORM/std": 0.3230712115764618, "rewards/RMReward/mean": 0.5274999737739563, "rewards/RMReward/std": 0.20502033829689026, "rewards/SpatialReasoningORM/mean": 0.6708332896232605, "rewards/SpatialReasoningORM/std": 0.33641308546066284, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 1, "train_speed(iter/s)": 0.004438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/mean_length": 58.421875, "completions/min_length": 8.0, "epoch": 6.820584524093715e-05, "frac_reward_zero_std": 0.0, "grad_norm": 9.05689525604248, "kl": 0.0, "learning_rate": 3.408316291751875e-09, "loss": -0.18926993012428284, "memory(GiB)": 66.35, "reward": 0.6684344410896301, "reward_std": 0.19405728578567505, "rewards/MathAnswerFormat/mean": 0.53125, "rewards/MathAnswerFormat/std": 0.507007360458374, "rewards/PlanningActionSetORM/mean": 0.7109071016311646, "rewards/PlanningActionSetORM/std": 0.3097461760044098, "rewards/RMReward/mean": 0.49531251192092896, "rewards/RMReward/std": 0.17522306740283966, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.20280292630195618, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 2, "train_speed(iter/s)": 0.005412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 110.453125, "completions/min_length": 2.0, "epoch": 0.00010230876786140572, "frac_reward_zero_std": 0.0, "grad_norm": 39.6663703918457, "kl": 2.6281841201125644e-06, "learning_rate": 5.112474437627813e-09, "loss": -0.02219673991203308, "memory(GiB)": 68.54, "reward": 0.5144725441932678, "reward_std": 0.15965893864631653, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8608586192131042, "rewards/PlanningActionSetORM/std": 0.14656837284564972, "rewards/RMReward/mean": 0.5383333563804626, "rewards/RMReward/std": 0.1377684324979782, "rewards/SpatialReasoningORM/mean": 0.26250001788139343, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 3, "train_speed(iter/s)": 0.006143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/mean_length": 65.15625, "completions/min_length": 8.0, "epoch": 0.0001364116904818743, "frac_reward_zero_std": 0.0, "grad_norm": 8.300585746765137, "kl": 4.8126228648470715e-05, "learning_rate": 6.81663258350375e-09, "loss": -0.08794452995061874, "memory(GiB)": 68.54, "reward": 0.6354743242263794, "reward_std": 0.22318699955940247, "rewards/MathAnswerFormat/mean": 0.4375, "rewards/MathAnswerFormat/std": 0.504016101360321, "rewards/PlanningActionSetORM/mean": 0.7844308018684387, "rewards/PlanningActionSetORM/std": 0.20548413693904877, "rewards/RMReward/mean": 0.5859375, "rewards/RMReward/std": 0.2008989453315735, "rewards/SpatialReasoningORM/mean": 0.65625, "rewards/SpatialReasoningORM/std": 0.3416633903980255, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 4, "train_speed(iter/s)": 0.006089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/mean_length": 36.125, "completions/min_length": 2.0, "epoch": 0.00017051461310234288, "frac_reward_zero_std": 0.0, "grad_norm": 29.673837661743164, "kl": 0.0002050812036031857, "learning_rate": 8.520790729379687e-09, "loss": -0.07947923243045807, "memory(GiB)": 68.54, "reward": 0.6489322781562805, "reward_std": 0.26808950304985046, "rewards/MathAnswerFormat/mean": 0.46875, "rewards/MathAnswerFormat/std": 0.507007360458374, "rewards/PlanningActionSetORM/mean": 0.8755208253860474, "rewards/PlanningActionSetORM/std": 0.15121327340602875, "rewards/RMReward/mean": 0.706250011920929, "rewards/RMReward/std": 0.1631716936826706, "rewards/SpatialReasoningORM/mean": 0.7875000238418579, "rewards/SpatialReasoningORM/std": 0.20280292630195618, "rewards/VisualPerceptionAccuracy/mean": 0.3125, "rewards/VisualPerceptionAccuracy/std": 0.4787135720252991, "step": 5, "train_speed(iter/s)": 0.006253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/mean_length": 187.75, "completions/min_length": 2.0, "epoch": 0.00020461753572281145, "frac_reward_zero_std": 0.0, "grad_norm": 23.503047943115234, "kl": 8.261055336333811e-05, "learning_rate": 1.0224948875255626e-08, "loss": -0.05376787483692169, "memory(GiB)": 68.54, "reward": 0.3737810552120209, "reward_std": 0.13833655416965485, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8755952715873718, "rewards/PlanningActionSetORM/std": 0.1544969230890274, "rewards/RMReward/mean": 0.5140625238418579, "rewards/RMReward/std": 0.1657020002603531, "rewards/SpatialReasoningORM/mean": 0.3375000059604645, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": 0.0017611411167308688, "rewards/VisualPerceptionAccuracy/std": 0.007044564932584763, "step": 6, "train_speed(iter/s)": 0.006307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/mean_length": 93.296875, "completions/min_length": 2.0, "epoch": 0.00023872045834328001, "frac_reward_zero_std": 0.0, "grad_norm": 16.071739196777344, "kl": 1.211746348417364e-05, "learning_rate": 1.1929107021131563e-08, "loss": -0.058940090239048004, "memory(GiB)": 68.54, "reward": 0.671814501285553, "reward_std": 0.15495885908603668, "rewards/MathAnswerFormat/mean": 0.34375, "rewards/MathAnswerFormat/std": 0.4825586974620819, "rewards/PlanningActionSetORM/mean": 0.836895227432251, "rewards/PlanningActionSetORM/std": 0.17385931313037872, "rewards/RMReward/mean": 0.5953124761581421, "rewards/RMReward/std": 0.14775708317756653, "rewards/SpatialReasoningORM/mean": 0.71875, "rewards/SpatialReasoningORM/std": 0.23201431334018707, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 7, "train_speed(iter/s)": 0.006578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/mean_length": 68.15625, "completions/min_length": 2.0, "epoch": 0.0002728233809637486, "frac_reward_zero_std": 0.0, "grad_norm": 22.155200958251953, "kl": 2.587931703601498e-06, "learning_rate": 1.36332651670075e-08, "loss": -0.05756515637040138, "memory(GiB)": 68.54, "reward": 0.5792311429977417, "reward_std": 0.22289851307868958, "rewards/MathAnswerFormat/mean": 0.4375, "rewards/MathAnswerFormat/std": 0.5123475790023804, "rewards/PlanningActionSetORM/mean": 0.8782489895820618, "rewards/PlanningActionSetORM/std": 0.11272280663251877, "rewards/RMReward/mean": 0.520312488079071, "rewards/RMReward/std": 0.10840439051389694, "rewards/SpatialReasoningORM/mean": 0.7749999761581421, "rewards/SpatialReasoningORM/std": 0.20493900775909424, "rewards/VisualPerceptionAccuracy/mean": 0.375, "rewards/VisualPerceptionAccuracy/std": 0.5, "step": 8, "train_speed(iter/s)": 0.006722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 145.671875, "completions/min_length": 2.0, "epoch": 0.0003069263035842172, "frac_reward_zero_std": 0.0, "grad_norm": 12.718220710754395, "kl": 0.00011807317059719935, "learning_rate": 1.533742331288344e-08, "loss": -0.027544397860765457, "memory(GiB)": 68.55, "reward": 0.7593376040458679, "reward_std": 0.1785666048526764, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.849750816822052, "rewards/PlanningActionSetORM/std": 0.17555762827396393, "rewards/RMReward/mean": 0.6885417103767395, "rewards/RMReward/std": 0.1831287145614624, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.875, "rewards/VisualPerceptionAccuracy/std": 0.3415650427341461, "step": 9, "train_speed(iter/s)": 0.005669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/mean_length": 50.3125, "completions/min_length": 2.0, "epoch": 0.00034102922620468575, "frac_reward_zero_std": 0.0, "grad_norm": 22.606090545654297, "kl": -3.891171672876226e-06, "learning_rate": 1.7041581458759374e-08, "loss": -0.11562611162662506, "memory(GiB)": 68.55, "reward": 0.5762760639190674, "reward_std": 0.23152200877666473, "rewards/MathAnswerFormat/mean": 0.1875, "rewards/MathAnswerFormat/std": 0.3965577781200409, "rewards/PlanningActionSetORM/mean": 0.7690104246139526, "rewards/PlanningActionSetORM/std": 0.32269468903541565, "rewards/RMReward/mean": 0.6578124761581421, "rewards/RMReward/std": 0.15663260221481323, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.3661041557788849, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 10, "train_speed(iter/s)": 0.005964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/mean_length": 93.328125, "completions/min_length": 9.0, "epoch": 0.0003751321488251543, "frac_reward_zero_std": 0.0, "grad_norm": 3.86909818649292, "kl": -1.1304642612230964e-05, "learning_rate": 1.8745739604635313e-08, "loss": -0.020192787051200867, "memory(GiB)": 68.55, "reward": 0.7348595261573792, "reward_std": 0.15131428837776184, "rewards/MathAnswerFormat/mean": 0.6875, "rewards/MathAnswerFormat/std": 0.4787135720252991, "rewards/PlanningActionSetORM/mean": 0.8688554167747498, "rewards/PlanningActionSetORM/std": 0.13436836004257202, "rewards/RMReward/mean": 0.6468749642372131, "rewards/RMReward/std": 0.16288824379444122, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.1914854198694229, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 11, "train_speed(iter/s)": 0.006127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 155.640625, "completions/min_length": 2.0, "epoch": 0.0004092350714456229, "frac_reward_zero_std": 0.0, "grad_norm": 36.265995025634766, "kl": 0.00019093291484750807, "learning_rate": 2.044989775051125e-08, "loss": -0.10986940562725067, "memory(GiB)": 68.55, "reward": 0.6181696653366089, "reward_std": 0.17744769155979156, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8544642925262451, "rewards/PlanningActionSetORM/std": 0.1708446890115738, "rewards/RMReward/mean": 0.6385416388511658, "rewards/RMReward/std": 0.18341894447803497, "rewards/SpatialReasoningORM/mean": 0.45000001788139343, "rewards/SpatialReasoningORM/std": 0.2683281898498535, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 12, "train_speed(iter/s)": 0.005486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 41.1875, "completions/min_length": 2.0, "epoch": 0.00044333799406609146, "frac_reward_zero_std": 0.0, "grad_norm": 39.33011245727539, "kl": 0.0005200150771997869, "learning_rate": 2.2154055896387187e-08, "loss": -0.06964020431041718, "memory(GiB)": 68.65, "reward": 0.4416908621788025, "reward_std": 0.20740115642547607, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8963169455528259, "rewards/PlanningActionSetORM/std": 0.10666309297084808, "rewards/RMReward/mean": 0.5593750476837158, "rewards/RMReward/std": 0.15939338505268097, "rewards/SpatialReasoningORM/mean": 0.4000000059604645, "rewards/SpatialReasoningORM/std": 0.285835862159729, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 13, "train_speed(iter/s)": 0.005605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/mean_length": 91.0, "completions/min_length": 8.0, "epoch": 0.00047744091668656003, "frac_reward_zero_std": 0.0, "grad_norm": 3.691892385482788, "kl": 1.709748357825447e-05, "learning_rate": 2.3858214042263125e-08, "loss": -0.05931015685200691, "memory(GiB)": 68.65, "reward": 0.6910992860794067, "reward_std": 0.15497468411922455, "rewards/MathAnswerFormat/mean": 0.25, "rewards/MathAnswerFormat/std": 0.44721361994743347, "rewards/PlanningActionSetORM/mean": 0.9083704948425293, "rewards/PlanningActionSetORM/std": 0.12250368297100067, "rewards/RMReward/mean": 0.6572916507720947, "rewards/RMReward/std": 0.20909878611564636, "rewards/SpatialReasoningORM/mean": 0.6625000238418579, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 14, "train_speed(iter/s)": 0.005545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/mean_length": 64.515625, "completions/min_length": 2.0, "epoch": 0.0005115438393070287, "frac_reward_zero_std": 0.0, "grad_norm": 39.9661750793457, "kl": 0.0001323507312918082, "learning_rate": 2.5562372188139064e-08, "loss": -0.018108254298567772, "memory(GiB)": 68.65, "reward": 0.5031376481056213, "reward_std": 0.15747655928134918, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8888764977455139, "rewards/PlanningActionSetORM/std": 0.10438663512468338, "rewards/RMReward/mean": 0.6793749928474426, "rewards/RMReward/std": 0.15429207682609558, "rewards/SpatialReasoningORM/mean": 0.30000001192092896, "rewards/SpatialReasoningORM/std": 0.3048003017902374, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 15, "train_speed(iter/s)": 0.005663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 72.6875, "completions/min_length": 2.0, "epoch": 0.0005456467619274972, "frac_reward_zero_std": 0.0, "grad_norm": 21.334291458129883, "kl": 4.856162831856636e-06, "learning_rate": 2.7266530334015e-08, "loss": -0.05801338702440262, "memory(GiB)": 68.65, "reward": 0.5717964172363281, "reward_std": 0.14748349785804749, "rewards/MathAnswerFormat/mean": 0.25, "rewards/MathAnswerFormat/std": 0.4399413466453552, "rewards/PlanningActionSetORM/mean": 0.8707770109176636, "rewards/PlanningActionSetORM/std": 0.12123830616474152, "rewards/RMReward/mean": 0.6543750166893005, "rewards/RMReward/std": 0.1442430168390274, "rewards/SpatialReasoningORM/mean": 0.45625001192092896, "rewards/SpatialReasoningORM/std": 0.41343367099761963, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 16, "train_speed(iter/s)": 0.005695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 69.171875, "completions/min_length": 2.0, "epoch": 0.0005797496845479658, "frac_reward_zero_std": 0.0, "grad_norm": 27.202594757080078, "kl": 0.0001259972050320357, "learning_rate": 2.8970688479890938e-08, "loss": -0.16257841885089874, "memory(GiB)": 68.65, "reward": 0.6907548308372498, "reward_std": 0.1723363995552063, "rewards/MathAnswerFormat/mean": 0.5208333134651184, "rewards/MathAnswerFormat/std": 0.5048523545265198, "rewards/PlanningActionSetORM/mean": 0.855096697807312, "rewards/PlanningActionSetORM/std": 0.16528281569480896, "rewards/RMReward/mean": 0.39625000953674316, "rewards/RMReward/std": 0.11982626467943192, "rewards/SpatialReasoningORM/mean": 0.7708333134651184, "rewards/SpatialReasoningORM/std": 0.2797858715057373, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 17, "train_speed(iter/s)": 0.005499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 139.578125, "completions/min_length": 2.0, "epoch": 0.0006138526071684344, "frac_reward_zero_std": 0.0, "grad_norm": 25.27290916442871, "kl": 5.247867011348717e-05, "learning_rate": 3.067484662576688e-08, "loss": -0.014648456126451492, "memory(GiB)": 68.65, "reward": 0.38945016264915466, "reward_std": 0.1308077722787857, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8359158039093018, "rewards/PlanningActionSetORM/std": 0.16447778046131134, "rewards/RMReward/mean": 0.6015625, "rewards/RMReward/std": 0.14395584166049957, "rewards/SpatialReasoningORM/mean": 0.26250001788139343, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": 0.011559405364096165, "rewards/VisualPerceptionAccuracy/std": 0.04623761773109436, "step": 18, "train_speed(iter/s)": 0.005375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 63.484375, "completions/min_length": 2.0, "epoch": 0.0006479555297889029, "frac_reward_zero_std": 0.0, "grad_norm": 38.95516586303711, "kl": 1.0542782547418028e-05, "learning_rate": 3.237900477164281e-08, "loss": -0.08472032845020294, "memory(GiB)": 68.65, "reward": 0.586377739906311, "reward_std": 0.1911381334066391, "rewards/MathAnswerFormat/mean": 0.25, "rewards/MathAnswerFormat/std": 0.4399413466453552, "rewards/PlanningActionSetORM/mean": 0.7997147440910339, "rewards/PlanningActionSetORM/std": 0.14762941002845764, "rewards/RMReward/mean": 0.5750000476837158, "rewards/RMReward/std": 0.16114509105682373, "rewards/SpatialReasoningORM/mean": 0.5687500238418579, "rewards/SpatialReasoningORM/std": 0.3486725986003876, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 19, "train_speed(iter/s)": 0.00549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/mean_length": 153.65625, "completions/min_length": 2.0, "epoch": 0.0006820584524093715, "frac_reward_zero_std": 0.75, "grad_norm": 0.9650750160217285, "kl": 4.165345671935938e-05, "learning_rate": 3.408316291751875e-08, "loss": -0.06465031206607819, "memory(GiB)": 68.65, "reward": 0.4756249785423279, "reward_std": 0.06809001415967941, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.6625000238418579, "rewards/PlanningActionSetORM/std": 0.44403454661369324, "rewards/RMReward/mean": 0.7875000238418579, "rewards/RMReward/std": 0.24664413928985596, "rewards/SpatialReasoningORM/mean": 0.6000000238418579, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 20, "train_speed(iter/s)": 0.005543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 181.171875, "completions/min_length": 2.0, "epoch": 0.0007161613750298401, "frac_reward_zero_std": 0.0, "grad_norm": 43.26586151123047, "kl": 5.4887499572942033e-05, "learning_rate": 3.5787321063394687e-08, "loss": 0.002734757959842682, "memory(GiB)": 68.65, "reward": 0.42339715361595154, "reward_std": 0.14538529515266418, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8929510116577148, "rewards/PlanningActionSetORM/std": 0.14817026257514954, "rewards/RMReward/mean": 0.6265624761581421, "rewards/RMReward/std": 0.1844715029001236, "rewards/SpatialReasoningORM/mean": 0.3375000059604645, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": 0.013283206149935722, "rewards/VisualPerceptionAccuracy/std": 0.05313282459974289, "step": 21, "train_speed(iter/s)": 0.005537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 132.125, "completions/min_length": 8.0, "epoch": 0.0007502642976503086, "frac_reward_zero_std": 0.0, "grad_norm": 6.479933261871338, "kl": 0.00032640888821333647, "learning_rate": 3.7491479209270625e-08, "loss": -0.11561333388090134, "memory(GiB)": 68.65, "reward": 0.5758824944496155, "reward_std": 0.20553477108478546, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5163977742195129, "rewards/PlanningActionSetORM/mean": 0.7610915303230286, "rewards/PlanningActionSetORM/std": 0.24600206315517426, "rewards/RMReward/mean": 0.6156249642372131, "rewards/RMReward/std": 0.17629648745059967, "rewards/SpatialReasoningORM/mean": 0.36250001192092896, "rewards/SpatialReasoningORM/std": 0.4455333948135376, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 22, "train_speed(iter/s)": 0.005317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 73.5625, "completions/min_length": 9.0, "epoch": 0.0007843672202707772, "frac_reward_zero_std": 0.0, "grad_norm": 6.909884452819824, "kl": -1.0920312888629269e-05, "learning_rate": 3.9195637355146564e-08, "loss": -0.08729790896177292, "memory(GiB)": 68.67, "reward": 0.6949377059936523, "reward_std": 0.1481037437915802, "rewards/MathAnswerFormat/mean": 0.53125, "rewards/MathAnswerFormat/std": 0.507007360458374, "rewards/PlanningActionSetORM/mean": 0.8621899485588074, "rewards/PlanningActionSetORM/std": 0.12682366371154785, "rewards/RMReward/mean": 0.5237500071525574, "rewards/RMReward/std": 0.13881409168243408, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.20280292630195618, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 23, "train_speed(iter/s)": 0.005123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 115.21875, "completions/min_length": 8.0, "epoch": 0.0008184701428912458, "frac_reward_zero_std": 0.0, "grad_norm": 2.6207058429718018, "kl": -5.712464917451143e-05, "learning_rate": 4.08997955010225e-08, "loss": -0.05897172540426254, "memory(GiB)": 68.67, "reward": 0.43716204166412354, "reward_std": 0.09584798663854599, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8850381970405579, "rewards/PlanningActionSetORM/std": 0.12269546836614609, "rewards/RMReward/mean": 0.5060416460037231, "rewards/RMReward/std": 0.1514713615179062, "rewards/SpatialReasoningORM/mean": 0.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 24, "train_speed(iter/s)": 0.005151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/mean_length": 31.046875, "completions/min_length": 2.0, "epoch": 0.0008525730655117144, "frac_reward_zero_std": 0.0, "grad_norm": 16.188669204711914, "kl": 5.460055490402738e-06, "learning_rate": 4.260395364689844e-08, "loss": -0.08883854746818542, "memory(GiB)": 68.67, "reward": 0.6820275783538818, "reward_std": 0.24428164958953857, "rewards/MathAnswerFormat/mean": 0.34375, "rewards/MathAnswerFormat/std": 0.4825586974620819, "rewards/PlanningActionSetORM/mean": 0.7749255895614624, "rewards/PlanningActionSetORM/std": 0.12764175236225128, "rewards/RMReward/mean": 0.484375, "rewards/RMReward/std": 0.1274346262216568, "rewards/SpatialReasoningORM/mean": 0.737500011920929, "rewards/SpatialReasoningORM/std": 0.1930234581232071, "rewards/VisualPerceptionAccuracy/mean": 0.75, "rewards/VisualPerceptionAccuracy/std": 0.44721361994743347, "step": 25, "train_speed(iter/s)": 0.005225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/mean_length": 63.421875, "completions/min_length": 9.0, "epoch": 0.0008866759881321829, "frac_reward_zero_std": 0.0, "grad_norm": 7.626407146453857, "kl": 2.19928624574095e-05, "learning_rate": 4.4308111792774374e-08, "loss": -0.08258946239948273, "memory(GiB)": 68.67, "reward": 0.5852637887001038, "reward_std": 0.12151254713535309, "rewards/MathAnswerFormat/mean": 0.59375, "rewards/MathAnswerFormat/std": 0.49899089336395264, "rewards/PlanningActionSetORM/mean": 0.830450177192688, "rewards/PlanningActionSetORM/std": 0.22677037119865417, "rewards/RMReward/mean": 0.6840624809265137, "rewards/RMReward/std": 0.14266720414161682, "rewards/SpatialReasoningORM/mean": 0.44999998807907104, "rewards/SpatialReasoningORM/std": 0.47383132576942444, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 26, "train_speed(iter/s)": 0.005288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 97.8125, "completions/min_length": 9.0, "epoch": 0.0009207789107526515, "frac_reward_zero_std": 0.0, "grad_norm": 5.152341365814209, "kl": 2.6905370759777725e-05, "learning_rate": 4.601226993865031e-08, "loss": -0.04503066465258598, "memory(GiB)": 68.67, "reward": 0.6747475862503052, "reward_std": 0.1522926390171051, "rewards/MathAnswerFormat/mean": 0.3125, "rewards/MathAnswerFormat/std": 0.4787135720252991, "rewards/PlanningActionSetORM/mean": 0.8585255742073059, "rewards/PlanningActionSetORM/std": 0.13386449217796326, "rewards/RMReward/mean": 0.6164583563804626, "rewards/RMReward/std": 0.20390328764915466, "rewards/SpatialReasoningORM/mean": 0.7250000238418579, "rewards/SpatialReasoningORM/std": 0.1914854198694229, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 27, "train_speed(iter/s)": 0.005302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/mean_length": 105.09375, "completions/min_length": 9.0, "epoch": 0.0009548818333731201, "frac_reward_zero_std": 0.0, "grad_norm": 4.177663326263428, "kl": 1.2371407137834467e-05, "learning_rate": 4.771642808452625e-08, "loss": -0.06320246309041977, "memory(GiB)": 68.67, "reward": 0.6863918304443359, "reward_std": 0.12247981876134872, "rewards/MathAnswerFormat/mean": 0.5625, "rewards/MathAnswerFormat/std": 0.5123475790023804, "rewards/PlanningActionSetORM/mean": 0.8478204607963562, "rewards/PlanningActionSetORM/std": 0.13347113132476807, "rewards/RMReward/mean": 0.59375, "rewards/RMReward/std": 0.11878149211406708, "rewards/SpatialReasoningORM/mean": 0.8250000476837158, "rewards/SpatialReasoningORM/std": 0.20493900775909424, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 28, "train_speed(iter/s)": 0.005374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/mean_length": 89.65625, "completions/min_length": 9.0, "epoch": 0.0009889847559935887, "frac_reward_zero_std": 0.0, "grad_norm": 2.8320486545562744, "kl": 0.0002103090810123831, "learning_rate": 4.942058623040219e-08, "loss": -0.12539486587047577, "memory(GiB)": 68.67, "reward": 0.5166665315628052, "reward_std": 0.19167160987854004, "rewards/MathAnswerFormat/mean": 0.25, "rewards/MathAnswerFormat/std": 0.44721361994743347, "rewards/PlanningActionSetORM/mean": 0.8163183331489563, "rewards/PlanningActionSetORM/std": 0.27849307656288147, "rewards/RMReward/mean": 0.6270833611488342, "rewards/RMReward/std": 0.21035893261432648, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 29, "train_speed(iter/s)": 0.005421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 47.828125, "completions/min_length": 8.0, "epoch": 0.0010230876786140573, "frac_reward_zero_std": 0.0, "grad_norm": 7.569765090942383, "kl": 9.105460048886016e-05, "learning_rate": 5.112474437627813e-08, "loss": -0.1225392296910286, "memory(GiB)": 68.67, "reward": 0.6032736897468567, "reward_std": 0.23438116908073425, "rewards/MathAnswerFormat/mean": 0.5625, "rewards/MathAnswerFormat/std": 0.5013279914855957, "rewards/PlanningActionSetORM/mean": 0.8904739618301392, "rewards/PlanningActionSetORM/std": 0.09289407730102539, "rewards/RMReward/mean": 0.5062500238418579, "rewards/RMReward/std": 0.08539126068353653, "rewards/SpatialReasoningORM/mean": 0.612500011920929, "rewards/SpatialReasoningORM/std": 0.4134056568145752, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 30, "train_speed(iter/s)": 0.005416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 9.65625, "completions/min_length": 2.0, "epoch": 0.0010571906012345259, "frac_reward_zero_std": 0.0, "grad_norm": 30.954381942749023, "kl": 0.000676010618917644, "learning_rate": 5.282890252215406e-08, "loss": -0.09231703728437424, "memory(GiB)": 68.67, "reward": 0.4854687452316284, "reward_std": 0.21883749961853027, "rewards/MathAnswerFormat/mean": 0.328125, "rewards/MathAnswerFormat/std": 0.4732423722743988, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.4937499761581421, "rewards/SpatialReasoningORM/std": 0.3935491740703583, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 31, "train_speed(iter/s)": 0.005483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 141.421875, "completions/min_length": 2.0, "epoch": 0.0010912935238549944, "frac_reward_zero_std": 0.0, "grad_norm": 9.96425724029541, "kl": 4.3487812945386395e-05, "learning_rate": 5.453306066803e-08, "loss": -0.053737565875053406, "memory(GiB)": 68.67, "reward": 0.5973647236824036, "reward_std": 0.14357122778892517, "rewards/MathAnswerFormat/mean": 0.5625, "rewards/MathAnswerFormat/std": 0.5123475790023804, "rewards/PlanningActionSetORM/mean": 0.9272321462631226, "rewards/PlanningActionSetORM/std": 0.07550269365310669, "rewards/RMReward/mean": 0.5637500286102295, "rewards/RMReward/std": 0.11188536137342453, "rewards/SpatialReasoningORM/mean": 0.8250000476837158, "rewards/SpatialReasoningORM/std": 0.20493900775909424, "rewards/VisualPerceptionAccuracy/mean": 0.47056880593299866, "rewards/VisualPerceptionAccuracy/std": 0.5053113102912903, "step": 32, "train_speed(iter/s)": 0.005399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 88.9375, "completions/min_length": 2.0, "epoch": 0.001125396446475463, "frac_reward_zero_std": 0.0, "grad_norm": 10.565755844116211, "kl": 0.00011231462121941149, "learning_rate": 5.623721881390594e-08, "loss": 0.0017009321600198746, "memory(GiB)": 68.67, "reward": 0.3717986047267914, "reward_std": 0.11078410595655441, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8707961440086365, "rewards/PlanningActionSetORM/std": 0.1457003802061081, "rewards/RMReward/mean": 0.6684374809265137, "rewards/RMReward/std": 0.18554382026195526, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.03468797728419304, "rewards/VisualPerceptionAccuracy/std": 0.17621742188930511, "step": 33, "train_speed(iter/s)": 0.005408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 77.703125, "completions/min_length": 9.0, "epoch": 0.0011594993690959316, "frac_reward_zero_std": 0.0, "grad_norm": 7.551508903503418, "kl": 4.695549068856053e-05, "learning_rate": 5.7941376959781877e-08, "loss": -0.10251830518245697, "memory(GiB)": 68.67, "reward": 0.6570513844490051, "reward_std": 0.15789690613746643, "rewards/MathAnswerFormat/mean": 0.28125, "rewards/MathAnswerFormat/std": 0.45680341124534607, "rewards/PlanningActionSetORM/mean": 0.8533262014389038, "rewards/PlanningActionSetORM/std": 0.1640489548444748, "rewards/RMReward/mean": 0.5656249523162842, "rewards/RMReward/std": 0.15525083243846893, "rewards/SpatialReasoningORM/mean": 0.7125000357627869, "rewards/SpatialReasoningORM/std": 0.182721346616745, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 34, "train_speed(iter/s)": 0.005411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 104.140625, "completions/min_length": 9.0, "epoch": 0.0011936022917164002, "frac_reward_zero_std": 0.0, "grad_norm": 3.882620334625244, "kl": -5.416199201135896e-06, "learning_rate": 5.964553510565782e-08, "loss": -0.05294916033744812, "memory(GiB)": 68.67, "reward": 0.7172465920448303, "reward_std": 0.14197131991386414, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5163977742195129, "rewards/PlanningActionSetORM/mean": 0.8666439056396484, "rewards/PlanningActionSetORM/std": 0.13084599375724792, "rewards/RMReward/mean": 0.6516666412353516, "rewards/RMReward/std": 0.15717361867427826, "rewards/SpatialReasoningORM/mean": 0.800000011920929, "rewards/SpatialReasoningORM/std": 0.20655910670757294, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 35, "train_speed(iter/s)": 0.005467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 102.578125, "completions/min_length": 2.0, "epoch": 0.0012277052143368687, "frac_reward_zero_std": 0.0, "grad_norm": 38.00226974487305, "kl": 4.46073645434808e-05, "learning_rate": 6.134969325153375e-08, "loss": -0.06881725788116455, "memory(GiB)": 68.67, "reward": 0.3951609134674072, "reward_std": 0.2324889898300171, "rewards/MathAnswerFormat/mean": 0.4375, "rewards/MathAnswerFormat/std": 0.5123475790023804, "rewards/PlanningActionSetORM/mean": 0.5562499761581421, "rewards/PlanningActionSetORM/std": 0.36142081022262573, "rewards/RMReward/mean": 0.6187499761581421, "rewards/RMReward/std": 0.1691892445087433, "rewards/SpatialReasoningORM/mean": 0.737500011920929, "rewards/SpatialReasoningORM/std": 0.2801785171031952, "rewards/VisualPerceptionAccuracy/mean": 0.12594681978225708, "rewards/VisualPerceptionAccuracy/std": 0.3356606960296631, "step": 36, "train_speed(iter/s)": 0.005549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/mean_length": 67.453125, "completions/min_length": 2.0, "epoch": 0.0012618081369573373, "frac_reward_zero_std": 0.75, "grad_norm": 0.4919441044330597, "kl": 4.490598803386092e-05, "learning_rate": 6.305385139740968e-08, "loss": -0.002701921621337533, "memory(GiB)": 68.67, "reward": 0.26313021779060364, "reward_std": 0.02121981419622898, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8276041746139526, "rewards/PlanningActionSetORM/std": 0.10277260839939117, "rewards/RMReward/mean": 0.39625000953674316, "rewards/RMReward/std": 0.10500000417232513, "rewards/SpatialReasoningORM/mean": 0.30000001192092896, "rewards/SpatialReasoningORM/std": 0.3048003017902374, "rewards/VisualPerceptionAccuracy/mean": 0.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 37, "train_speed(iter/s)": 0.005495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/mean_length": 78.171875, "completions/min_length": 9.0, "epoch": 0.0012959110595778059, "frac_reward_zero_std": 0.0, "grad_norm": 3.9196760654449463, "kl": 7.642435230081901e-05, "learning_rate": 6.475800954328562e-08, "loss": -0.07189090549945831, "memory(GiB)": 68.67, "reward": 0.7619583606719971, "reward_std": 0.16114789247512817, "rewards/MathAnswerFormat/mean": 0.1875, "rewards/MathAnswerFormat/std": 0.40311288833618164, "rewards/PlanningActionSetORM/mean": 0.8128471970558167, "rewards/PlanningActionSetORM/std": 0.2468893826007843, "rewards/RMReward/mean": 0.7956250309944153, "rewards/RMReward/std": 0.18737725913524628, "rewards/SpatialReasoningORM/mean": 0.6750000715255737, "rewards/SpatialReasoningORM/std": 0.16124515235424042, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 38, "train_speed(iter/s)": 0.005493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/mean_length": 126.25, "completions/min_length": 2.0, "epoch": 0.0013300139821982744, "frac_reward_zero_std": 0.5, "grad_norm": 7.1325860023498535, "kl": 0.00022385016200132668, "learning_rate": 6.646216768916156e-08, "loss": -0.07945459336042404, "memory(GiB)": 68.67, "reward": 0.5753124952316284, "reward_std": 0.09841199964284897, "rewards/MathAnswerFormat/mean": 0.4583333432674408, "rewards/MathAnswerFormat/std": 0.5035336017608643, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.7833333611488342, "rewards/SpatialReasoningORM/std": 0.20141342282295227, "rewards/VisualPerceptionAccuracy/mean": 0.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 39, "train_speed(iter/s)": 0.005514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 166.140625, "completions/min_length": 8.0, "epoch": 0.001364116904818743, "frac_reward_zero_std": 0.0, "grad_norm": 4.42421293258667, "kl": 0.0002096628595609218, "learning_rate": 6.81663258350375e-08, "loss": -0.025179019197821617, "memory(GiB)": 68.67, "reward": 0.46634265780448914, "reward_std": 0.1878824084997177, "rewards/MathAnswerFormat/mean": 0.6875, "rewards/MathAnswerFormat/std": 0.4787135720252991, "rewards/PlanningActionSetORM/mean": 0.7613281011581421, "rewards/PlanningActionSetORM/std": 0.25344452261924744, "rewards/RMReward/mean": 0.6337499618530273, "rewards/RMReward/std": 0.16091972589492798, "rewards/SpatialReasoningORM/mean": 0.5249999761581421, "rewards/SpatialReasoningORM/std": 0.44944408535957336, "rewards/VisualPerceptionAccuracy/mean": 0.013714337721467018, "rewards/VisualPerceptionAccuracy/std": 0.01819988153874874, "step": 40, "train_speed(iter/s)": 0.005489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/mean_length": 61.609375, "completions/min_length": 8.0, "epoch": 0.0013982198274392116, "frac_reward_zero_std": 0.0, "grad_norm": 6.496490478515625, "kl": 0.00029191517387516797, "learning_rate": 6.987048398091343e-08, "loss": -0.07995790243148804, "memory(GiB)": 68.67, "reward": 0.5299270749092102, "reward_std": 0.16617657244205475, "rewards/MathAnswerFormat/mean": 0.4375, "rewards/MathAnswerFormat/std": 0.504016101360321, "rewards/PlanningActionSetORM/mean": 0.8755208253860474, "rewards/PlanningActionSetORM/std": 0.16312001645565033, "rewards/RMReward/mean": 0.7074999809265137, "rewards/RMReward/std": 0.13317464292049408, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.42786040902137756, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 41, "train_speed(iter/s)": 0.005502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/mean_length": 50.15625, "completions/min_length": 2.0, "epoch": 0.0014323227500596801, "frac_reward_zero_std": 0.0, "grad_norm": 24.455642700195312, "kl": 0.00017499896057415754, "learning_rate": 7.157464212678937e-08, "loss": -0.0593477264046669, "memory(GiB)": 68.67, "reward": 0.6199531555175781, "reward_std": 0.1754092574119568, "rewards/MathAnswerFormat/mean": 0.21875, "rewards/MathAnswerFormat/std": 0.420013427734375, "rewards/PlanningActionSetORM/mean": 0.7476562261581421, "rewards/PlanningActionSetORM/std": 0.3050297796726227, "rewards/RMReward/mean": 0.5996874570846558, "rewards/RMReward/std": 0.14887717366218567, "rewards/SpatialReasoningORM/mean": 0.6312500238418579, "rewards/SpatialReasoningORM/std": 0.26449888944625854, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 42, "train_speed(iter/s)": 0.005557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 100.765625, "completions/min_length": 3.0, "epoch": 0.0014664256726801487, "frac_reward_zero_std": 0.0, "grad_norm": 23.60268783569336, "kl": 6.7500368459150195e-06, "learning_rate": 7.327880027266531e-08, "loss": -0.039834242314100266, "memory(GiB)": 68.67, "reward": 0.6325029134750366, "reward_std": 0.13371261954307556, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8664779663085938, "rewards/PlanningActionSetORM/std": 0.13981792330741882, "rewards/RMReward/mean": 0.6445833444595337, "rewards/RMReward/std": 0.17315542697906494, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 43, "train_speed(iter/s)": 0.005565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/mean_length": 117.40625, "completions/min_length": 2.0, "epoch": 0.0015005285953006173, "frac_reward_zero_std": 0.0, "grad_norm": 18.85192108154297, "kl": 0.00010136073979083449, "learning_rate": 7.498295841854125e-08, "loss": 0.01493091695010662, "memory(GiB)": 68.67, "reward": 0.49546119570732117, "reward_std": 0.1455981731414795, "rewards/MathAnswerFormat/mean": 0.375, "rewards/MathAnswerFormat/std": 0.49186936020851135, "rewards/PlanningActionSetORM/mean": 0.9125000238418579, "rewards/PlanningActionSetORM/std": 0.07232898473739624, "rewards/RMReward/mean": 0.7281249761581421, "rewards/RMReward/std": 0.11250001192092896, "rewards/SpatialReasoningORM/mean": 0.6187499761581421, "rewards/SpatialReasoningORM/std": 0.3779720962047577, "rewards/VisualPerceptionAccuracy/mean": 0.003719785949215293, "rewards/VisualPerceptionAccuracy/std": 0.0024010210763663054, "step": 44, "train_speed(iter/s)": 0.005593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 148.40625, "completions/min_length": 2.0, "epoch": 0.0015346315179210859, "frac_reward_zero_std": 0.5, "grad_norm": 4.4581780433654785, "kl": 0.000307003065245226, "learning_rate": 7.668711656441719e-08, "loss": -0.13862568140029907, "memory(GiB)": 68.67, "reward": 0.4944791793823242, "reward_std": 0.08712536096572876, "rewards/MathAnswerFormat/mean": 0.28125, "rewards/MathAnswerFormat/std": 0.45680341124534607, "rewards/PlanningActionSetORM/mean": 0.43020832538604736, "rewards/PlanningActionSetORM/std": 0.2207232415676117, "rewards/RMReward/mean": 0.637499988079071, "rewards/RMReward/std": 0.12179218232631683, "rewards/SpatialReasoningORM/mean": 0.7125000357627869, "rewards/SpatialReasoningORM/std": 0.182721346616745, "rewards/VisualPerceptionAccuracy/mean": 0.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 45, "train_speed(iter/s)": 0.005494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/mean_length": 32.421875, "completions/min_length": 3.0, "epoch": 0.0015687344405415544, "frac_reward_zero_std": 0.0, "grad_norm": 41.68775939941406, "kl": 0.0017544105648994446, "learning_rate": 7.839127471029313e-08, "loss": -0.13427715003490448, "memory(GiB)": 68.67, "reward": 0.6492648720741272, "reward_std": 0.20847390592098236, "rewards/MathAnswerFormat/mean": 0.4583333432674408, "rewards/MathAnswerFormat/std": 0.5035336017608643, "rewards/PlanningActionSetORM/mean": 0.5096726417541504, "rewards/PlanningActionSetORM/std": 0.2521880567073822, "rewards/RMReward/mean": 0.643125057220459, "rewards/RMReward/std": 0.169773131608963, "rewards/SpatialReasoningORM/mean": 0.6708333492279053, "rewards/SpatialReasoningORM/std": 0.3724321126937866, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 46, "train_speed(iter/s)": 0.00549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 100.953125, "completions/min_length": 8.0, "epoch": 0.001602837363162023, "frac_reward_zero_std": 0.0, "grad_norm": 8.36641788482666, "kl": 0.0006791218183934689, "learning_rate": 8.009543285616907e-08, "loss": -0.12381693720817566, "memory(GiB)": 68.67, "reward": 0.6795242428779602, "reward_std": 0.15107494592666626, "rewards/MathAnswerFormat/mean": 0.53125, "rewards/MathAnswerFormat/std": 0.507007360458374, "rewards/PlanningActionSetORM/mean": 0.885554850101471, "rewards/PlanningActionSetORM/std": 0.13559801876544952, "rewards/RMReward/mean": 0.4793750047683716, "rewards/RMReward/std": 0.11763633042573929, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.20280292630195618, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 47, "train_speed(iter/s)": 0.005526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/mean_length": 143.34375, "completions/min_length": 97.0, "epoch": 0.0016369402857824916, "frac_reward_zero_std": 0.0, "grad_norm": 0.7724863886833191, "kl": 2.302710709045641e-05, "learning_rate": 8.1799591002045e-08, "loss": -0.02028258517384529, "memory(GiB)": 68.67, "reward": 0.658230185508728, "reward_std": 0.08633055537939072, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8574008345603943, "rewards/PlanningActionSetORM/std": 0.14830802381038666, "rewards/RMReward/mean": 0.6084374785423279, "rewards/RMReward/std": 0.1333359330892563, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 48, "train_speed(iter/s)": 0.005531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/mean_length": 168.390625, "completions/min_length": 9.0, "epoch": 0.0016710432084029601, "frac_reward_zero_std": 0.0, "grad_norm": 3.832322835922241, "kl": 0.0006214072345755994, "learning_rate": 8.350374914792094e-08, "loss": -0.036578599363565445, "memory(GiB)": 68.67, "reward": 0.522361695766449, "reward_std": 0.10454296320676804, "rewards/MathAnswerFormat/mean": 0.6875, "rewards/MathAnswerFormat/std": 0.4787135720252991, "rewards/PlanningActionSetORM/mean": 0.8335751295089722, "rewards/PlanningActionSetORM/std": 0.14028802514076233, "rewards/RMReward/mean": 0.546875, "rewards/RMReward/std": 0.10155047476291656, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.1914854198694229, "rewards/VisualPerceptionAccuracy/mean": 0.015391725115478039, "rewards/VisualPerceptionAccuracy/std": 0.024609284475445747, "step": 49, "train_speed(iter/s)": 0.005566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 110.75, "completions/min_length": 8.0, "epoch": 0.0017051461310234287, "frac_reward_zero_std": 0.0, "grad_norm": 3.178027629852295, "kl": 0.00037259692908264697, "learning_rate": 8.520790729379688e-08, "loss": -0.06121521443128586, "memory(GiB)": 68.67, "reward": 0.732346773147583, "reward_std": 0.13225039839744568, "rewards/MathAnswerFormat/mean": 0.8125, "rewards/MathAnswerFormat/std": 0.40311288833618164, "rewards/PlanningActionSetORM/mean": 0.8833536505699158, "rewards/PlanningActionSetORM/std": 0.1356755644083023, "rewards/RMReward/mean": 0.6166666150093079, "rewards/RMReward/std": 0.16188165545463562, "rewards/SpatialReasoningORM/mean": 0.9249999523162842, "rewards/SpatialReasoningORM/std": 0.16124515235424042, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 50, "train_speed(iter/s)": 0.005549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 96.109375, "completions/min_length": 9.0, "epoch": 0.0017392490536438973, "frac_reward_zero_std": 0.0, "grad_norm": 4.166775703430176, "kl": 0.0007065112004056573, "learning_rate": 8.691206543967282e-08, "loss": -0.04587006941437721, "memory(GiB)": 68.67, "reward": 0.7669281959533691, "reward_std": 0.14050763845443726, "rewards/MathAnswerFormat/mean": 0.4375, "rewards/MathAnswerFormat/std": 0.5123475790023804, "rewards/PlanningActionSetORM/mean": 0.880146324634552, "rewards/PlanningActionSetORM/std": 0.13762159645557404, "rewards/RMReward/mean": 0.7422916889190674, "rewards/RMReward/std": 0.12590506672859192, "rewards/SpatialReasoningORM/mean": 0.7749999761581421, "rewards/SpatialReasoningORM/std": 0.20493900775909424, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 51, "train_speed(iter/s)": 0.005541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/mean_length": 24.515625, "completions/min_length": 2.0, "epoch": 0.0017733519762643658, "frac_reward_zero_std": 0.0, "grad_norm": 78.09367370605469, "kl": -6.198720257089008e-07, "learning_rate": 8.861622358554875e-08, "loss": 0.008589555509388447, "memory(GiB)": 68.67, "reward": 0.47695308923721313, "reward_std": 0.3449239432811737, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9546874761581421, "rewards/PlanningActionSetORM/std": 0.06469077616930008, "rewards/RMReward/mean": 0.762499988079071, "rewards/RMReward/std": 0.23839743435382843, "rewards/SpatialReasoningORM/mean": 0.11250000447034836, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": 0.5, "rewards/VisualPerceptionAccuracy/std": 0.5080004930496216, "step": 52, "train_speed(iter/s)": 0.00555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/mean_length": 124.03125, "completions/min_length": 2.0, "epoch": 0.0018074548988848344, "frac_reward_zero_std": 0.0, "grad_norm": 35.587154388427734, "kl": 0.00010703909356379882, "learning_rate": 9.032038173142469e-08, "loss": -0.007575869560241699, "memory(GiB)": 68.67, "reward": 0.5533853769302368, "reward_std": 0.1794787347316742, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.6788192391395569, "rewards/PlanningActionSetORM/std": 0.2881951928138733, "rewards/RMReward/mean": 0.6041666865348816, "rewards/RMReward/std": 0.1761990189552307, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 53, "train_speed(iter/s)": 0.005535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/mean_length": 112.328125, "completions/min_length": 9.0, "epoch": 0.001841557821505303, "frac_reward_zero_std": 0.0, "grad_norm": 3.5506439208984375, "kl": 0.00028721781563945115, "learning_rate": 9.202453987730062e-08, "loss": -0.061832357197999954, "memory(GiB)": 68.67, "reward": 0.6645126938819885, "reward_std": 0.1364256888628006, "rewards/MathAnswerFormat/mean": 0.6875, "rewards/MathAnswerFormat/std": 0.4787135720252991, "rewards/PlanningActionSetORM/mean": 0.8182096481323242, "rewards/PlanningActionSetORM/std": 0.14101412892341614, "rewards/RMReward/mean": 0.5422916412353516, "rewards/RMReward/std": 0.1513659805059433, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.1914854198694229, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 54, "train_speed(iter/s)": 0.005541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/mean_length": 178.96875, "completions/min_length": 9.0, "epoch": 0.0018756607441257715, "frac_reward_zero_std": 0.0, "grad_norm": 4.192141056060791, "kl": 0.001116206869482994, "learning_rate": 9.372869802317656e-08, "loss": 0.027419671416282654, "memory(GiB)": 68.67, "reward": 0.363248735666275, "reward_std": 0.12347867339849472, "rewards/MathAnswerFormat/mean": 0.5625, "rewards/MathAnswerFormat/std": 0.5123475790023804, "rewards/PlanningActionSetORM/mean": 0.8854955434799194, "rewards/PlanningActionSetORM/std": 0.13063351809978485, "rewards/RMReward/mean": 0.5859375, "rewards/RMReward/std": 0.12587395310401917, "rewards/SpatialReasoningORM/mean": 0.13750000298023224, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": 0.0025467220693826675, "rewards/VisualPerceptionAccuracy/std": 0.009133521467447281, "step": 55, "train_speed(iter/s)": 0.005514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 95.265625, "completions/min_length": 2.0, "epoch": 0.0019097636667462401, "frac_reward_zero_std": 0.0, "grad_norm": 50.17286682128906, "kl": 0.00011780315253417939, "learning_rate": 9.54328561690525e-08, "loss": -0.1055094376206398, "memory(GiB)": 68.67, "reward": 0.5722017288208008, "reward_std": 0.1834663450717926, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7469697594642639, "rewards/PlanningActionSetORM/std": 0.2780715525150299, "rewards/RMReward/mean": 0.6333333253860474, "rewards/RMReward/std": 0.178150475025177, "rewards/SpatialReasoningORM/mean": 0.3375000059604645, "rewards/SpatialReasoningORM/std": 0.30740854144096375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 56, "train_speed(iter/s)": 0.005507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/mean_length": 32.359375, "completions/min_length": 2.0, "epoch": 0.0019438665893667087, "frac_reward_zero_std": 0.0, "grad_norm": 36.9224739074707, "kl": 0.0015258875209838152, "learning_rate": 9.713701431492844e-08, "loss": -0.028460180386900902, "memory(GiB)": 68.67, "reward": 0.659067690372467, "reward_std": 0.25763171911239624, "rewards/MathAnswerFormat/mean": 0.5416666865348816, "rewards/MathAnswerFormat/std": 0.5035336017608643, "rewards/PlanningActionSetORM/mean": 0.8619791269302368, "rewards/PlanningActionSetORM/std": 0.13608171045780182, "rewards/RMReward/mean": 0.7962499856948853, "rewards/RMReward/std": 0.11206397414207458, "rewards/SpatialReasoningORM/mean": 0.6124999523162842, "rewards/SpatialReasoningORM/std": 0.4134056568145752, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 57, "train_speed(iter/s)": 0.005518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1406.0, "completions/mean_length": 163.96875, "completions/min_length": 2.0, "epoch": 0.0019779695119871775, "frac_reward_zero_std": 0.0, "grad_norm": 24.362775802612305, "kl": 0.0019091550493612885, "learning_rate": 9.884117246080438e-08, "loss": -0.02619495429098606, "memory(GiB)": 68.67, "reward": 0.3872469663619995, "reward_std": 0.12319914996623993, "rewards/MathAnswerFormat/mean": 0.25, "rewards/MathAnswerFormat/std": 0.4399413466453552, "rewards/PlanningActionSetORM/mean": 0.8197916746139526, "rewards/PlanningActionSetORM/std": 0.09432385861873627, "rewards/RMReward/mean": 0.6906249523162842, "rewards/RMReward/std": 0.1186293289065361, "rewards/SpatialReasoningORM/mean": 0.41875001788139343, "rewards/SpatialReasoningORM/std": 0.4261133372783661, "rewards/VisualPerceptionAccuracy/mean": 0.011904563754796982, "rewards/VisualPerceptionAccuracy/std": 0.022776784375309944, "step": 58, "train_speed(iter/s)": 0.005519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 83.4375, "completions/min_length": 2.0, "epoch": 0.002012072434607646, "frac_reward_zero_std": 0.0, "grad_norm": 35.933197021484375, "kl": 0.0013419091701507568, "learning_rate": 1.0054533060668032e-07, "loss": -0.05951740965247154, "memory(GiB)": 68.67, "reward": 0.5475798845291138, "reward_std": 0.18054842948913574, "rewards/MathAnswerFormat/mean": 0.40625, "rewards/MathAnswerFormat/std": 0.49899089336395264, "rewards/PlanningActionSetORM/mean": 0.8255208134651184, "rewards/PlanningActionSetORM/std": 0.2951062321662903, "rewards/RMReward/mean": 0.8743749856948853, "rewards/RMReward/std": 0.2125392109155655, "rewards/SpatialReasoningORM/mean": 0.6687500476837158, "rewards/SpatialReasoningORM/std": 0.3468173146247864, "rewards/VisualPerceptionAccuracy/mean": 0.014465476386249065, "rewards/VisualPerceptionAccuracy/std": 0.05786190554499626, "step": 59, "train_speed(iter/s)": 0.005556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 138.71875, "completions/min_length": 8.0, "epoch": 0.0020461753572281146, "frac_reward_zero_std": 0.0, "grad_norm": 2.0890979766845703, "kl": 0.0006845601601526141, "learning_rate": 1.0224948875255626e-07, "loss": -0.04431188851594925, "memory(GiB)": 68.67, "reward": 0.5503275394439697, "reward_std": 0.1635483205318451, "rewards/MathAnswerFormat/mean": 0.6875, "rewards/MathAnswerFormat/std": 0.4787135720252991, "rewards/PlanningActionSetORM/mean": 0.909288227558136, "rewards/PlanningActionSetORM/std": 0.12445541471242905, "rewards/RMReward/mean": 0.6496874690055847, "rewards/RMReward/std": 0.17442040145397186, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.3464101552963257, "rewards/VisualPerceptionAccuracy/mean": 0.05121990665793419, "rewards/VisualPerceptionAccuracy/std": 0.04015451297163963, "step": 60, "train_speed(iter/s)": 0.00556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1914.0, "completions/mean_length": 217.25, "completions/min_length": 2.0, "epoch": 0.002080278279848583, "frac_reward_zero_std": 0.0, "grad_norm": 90.54625701904297, "kl": 0.004063948057591915, "learning_rate": 1.0395364689843218e-07, "loss": -0.11637486517429352, "memory(GiB)": 68.67, "reward": 0.4424682855606079, "reward_std": 0.11801905184984207, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8537776470184326, "rewards/PlanningActionSetORM/std": 0.15096169710159302, "rewards/RMReward/mean": 0.5778124928474426, "rewards/RMReward/std": 0.1709882915019989, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": 0.0051119364798069, "rewards/VisualPerceptionAccuracy/std": 0.02044774778187275, "step": 61, "train_speed(iter/s)": 0.005514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/mean_length": 54.78125, "completions/min_length": 2.0, "epoch": 0.0021143812024690518, "frac_reward_zero_std": 0.0, "grad_norm": 30.551162719726562, "kl": 0.0011387966806069016, "learning_rate": 1.0565780504430812e-07, "loss": -0.03014410100877285, "memory(GiB)": 68.67, "reward": 0.40347546339035034, "reward_std": 0.18358425796031952, "rewards/MathAnswerFormat/mean": 0.375, "rewards/MathAnswerFormat/std": 0.49186936020851135, "rewards/PlanningActionSetORM/mean": 0.7822544574737549, "rewards/PlanningActionSetORM/std": 0.27028781175613403, "rewards/RMReward/mean": 0.46312499046325684, "rewards/RMReward/std": 0.14802870154380798, "rewards/SpatialReasoningORM/mean": 0.2750000059604645, "rewards/SpatialReasoningORM/std": 0.3242858350276947, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 62, "train_speed(iter/s)": 0.005534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/mean_length": 25.796875, "completions/min_length": 2.0, "epoch": 0.0021484841250895203, "frac_reward_zero_std": 0.0, "grad_norm": 54.61528396606445, "kl": 0.003846371080726385, "learning_rate": 1.0736196319018406e-07, "loss": -0.09756014496088028, "memory(GiB)": 68.67, "reward": 0.4019322991371155, "reward_std": 0.2954092025756836, "rewards/MathAnswerFormat/mean": 0.3541666567325592, "rewards/MathAnswerFormat/std": 0.4833211302757263, "rewards/PlanningActionSetORM/mean": 0.5161458253860474, "rewards/PlanningActionSetORM/std": 0.3598928451538086, "rewards/RMReward/mean": 0.611875057220459, "rewards/RMReward/std": 0.16529646515846252, "rewards/SpatialReasoningORM/mean": 0.3375000059604645, "rewards/SpatialReasoningORM/std": 0.3762214481830597, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 63, "train_speed(iter/s)": 0.005591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 178.796875, "completions/min_length": 2.0, "epoch": 0.002182587047709989, "frac_reward_zero_std": 0.0, "grad_norm": 11.510490417480469, "kl": 2.0723346096929163e-05, "learning_rate": 1.0906612133606e-07, "loss": -0.10594518482685089, "memory(GiB)": 68.67, "reward": 0.4676835238933563, "reward_std": 0.16463546454906464, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7574735283851624, "rewards/PlanningActionSetORM/std": 0.2462390661239624, "rewards/RMReward/mean": 0.5604166388511658, "rewards/RMReward/std": 0.19838666915893555, "rewards/SpatialReasoningORM/mean": 0.07500000298023224, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 64, "train_speed(iter/s)": 0.005562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/mean_length": 75.890625, "completions/min_length": 2.0, "epoch": 0.0022166899703304575, "frac_reward_zero_std": 0.0, "grad_norm": 18.722631454467773, "kl": 1.9837987565551884e-05, "learning_rate": 1.1077027948193594e-07, "loss": -0.006188581697642803, "memory(GiB)": 68.67, "reward": 0.5744043588638306, "reward_std": 0.12934575974941254, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8822792172431946, "rewards/PlanningActionSetORM/std": 0.15278613567352295, "rewards/RMReward/mean": 0.7070832848548889, "rewards/RMReward/std": 0.12021183967590332, "rewards/SpatialReasoningORM/mean": 0.07500000298023224, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 65, "train_speed(iter/s)": 0.005598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 113.234375, "completions/min_length": 3.0, "epoch": 0.002250792892950926, "frac_reward_zero_std": 0.0, "grad_norm": 28.13227081298828, "kl": 4.0386668842984363e-05, "learning_rate": 1.1247443762781188e-07, "loss": -0.052513256669044495, "memory(GiB)": 68.67, "reward": 0.5326682329177856, "reward_std": 0.16179049015045166, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9052882194519043, "rewards/PlanningActionSetORM/std": 0.1031593531370163, "rewards/RMReward/mean": 0.5427083373069763, "rewards/RMReward/std": 0.1447630524635315, "rewards/SpatialReasoningORM/mean": 0.30000001192092896, "rewards/SpatialReasoningORM/std": 0.3098386824131012, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 66, "train_speed(iter/s)": 0.005597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 216.046875, "completions/min_length": 79.0, "epoch": 0.0022848958155713946, "frac_reward_zero_std": 0.0, "grad_norm": 0.6997715830802917, "kl": 2.1825264411745593e-05, "learning_rate": 1.1417859577368781e-07, "loss": -0.1061745211482048, "memory(GiB)": 68.67, "reward": 0.6801037788391113, "reward_std": 0.13881012797355652, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8698939085006714, "rewards/PlanningActionSetORM/std": 0.17307044565677643, "rewards/RMReward/mean": 0.6326562166213989, "rewards/RMReward/std": 0.19247134029865265, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 67, "train_speed(iter/s)": 0.005546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/mean_length": 44.0625, "completions/min_length": 2.0, "epoch": 0.002318998738191863, "frac_reward_zero_std": 0.0, "grad_norm": 53.182594299316406, "kl": 0.0004429807886481285, "learning_rate": 1.1588275391956375e-07, "loss": 0.050283994525671005, "memory(GiB)": 68.67, "reward": 0.583104133605957, "reward_std": 0.19847321510314941, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.6901041269302368, "rewards/PlanningActionSetORM/std": 0.2880881726741791, "rewards/RMReward/mean": 0.6840624809265137, "rewards/RMReward/std": 0.2584677040576935, "rewards/SpatialReasoningORM/mean": 0.5062500238418579, "rewards/SpatialReasoningORM/std": 0.22134123742580414, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 68, "train_speed(iter/s)": 0.005567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 71.421875, "completions/min_length": 2.0, "epoch": 0.0023531016608123317, "frac_reward_zero_std": 0.0, "grad_norm": 57.924259185791016, "kl": 0.0021750028245151043, "learning_rate": 1.1758691206543969e-07, "loss": -0.020249154418706894, "memory(GiB)": 68.67, "reward": 0.49542489647865295, "reward_std": 0.15716144442558289, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8789366483688354, "rewards/PlanningActionSetORM/std": 0.13685888051986694, "rewards/RMReward/mean": 0.5512499809265137, "rewards/RMReward/std": 0.12870946526527405, "rewards/SpatialReasoningORM/mean": 0.39375001192092896, "rewards/SpatialReasoningORM/std": 0.2895352244377136, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 69, "train_speed(iter/s)": 0.005571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/mean_length": 115.9375, "completions/min_length": 2.0, "epoch": 0.0023872045834328003, "frac_reward_zero_std": 0.0, "grad_norm": 10.051409721374512, "kl": 3.2915788324316964e-05, "learning_rate": 1.1929107021131563e-07, "loss": -0.043433330953121185, "memory(GiB)": 68.67, "reward": 0.633508563041687, "reward_std": 0.13038431107997894, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8460988998413086, "rewards/PlanningActionSetORM/std": 0.15472358465194702, "rewards/RMReward/mean": 0.6216667294502258, "rewards/RMReward/std": 0.16321678459644318, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 70, "train_speed(iter/s)": 0.00557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 175.65625, "completions/min_length": 13.0, "epoch": 0.002421307506053269, "frac_reward_zero_std": 0.0, "grad_norm": 1.1608566045761108, "kl": 1.1969318620685954e-05, "learning_rate": 1.2099522835719156e-07, "loss": -0.19580112397670746, "memory(GiB)": 68.67, "reward": 0.6579445600509644, "reward_std": 0.1841476559638977, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.7834728360176086, "rewards/PlanningActionSetORM/std": 0.25494107604026794, "rewards/RMReward/mean": 0.6265624761581421, "rewards/RMReward/std": 0.2314496785402298, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 71, "train_speed(iter/s)": 0.005504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/mean_length": 52.203125, "completions/min_length": 2.0, "epoch": 0.0024554104286737375, "frac_reward_zero_std": 0.0, "grad_norm": 58.026065826416016, "kl": 0.0035122002009302378, "learning_rate": 1.226993865030675e-07, "loss": -0.039586830884218216, "memory(GiB)": 68.67, "reward": 0.6721405982971191, "reward_std": 0.1922767162322998, "rewards/MathAnswerFormat/mean": 0.34375, "rewards/MathAnswerFormat/std": 0.4825586974620819, "rewards/PlanningActionSetORM/mean": 0.87890625, "rewards/PlanningActionSetORM/std": 0.12210783362388611, "rewards/RMReward/mean": 0.6746875047683716, "rewards/RMReward/std": 0.1776728332042694, "rewards/SpatialReasoningORM/mean": 0.643750011920929, "rewards/SpatialReasoningORM/std": 0.33595073223114014, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 72, "train_speed(iter/s)": 0.005529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/mean_length": 142.25, "completions/min_length": 9.0, "epoch": 0.002489513351294206, "frac_reward_zero_std": 0.0, "grad_norm": 4.054316997528076, "kl": 0.0023867820855230093, "learning_rate": 1.2440354464894343e-07, "loss": -0.08877871930599213, "memory(GiB)": 68.67, "reward": 0.5476449131965637, "reward_std": 0.15011060237884521, "rewards/MathAnswerFormat/mean": 0.625, "rewards/MathAnswerFormat/std": 0.5, "rewards/PlanningActionSetORM/mean": 0.7599405646324158, "rewards/PlanningActionSetORM/std": 0.2813683748245239, "rewards/RMReward/mean": 0.6371874809265137, "rewards/RMReward/std": 0.1743972897529602, "rewards/SpatialReasoningORM/mean": 0.8500000238418579, "rewards/SpatialReasoningORM/std": 0.19999998807907104, "rewards/VisualPerceptionAccuracy/mean": 0.028353329747915268, "rewards/VisualPerceptionAccuracy/std": 0.07811301201581955, "step": 73, "train_speed(iter/s)": 0.005485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/mean_length": 63.421875, "completions/min_length": 9.0, "epoch": 0.0025236162739146746, "frac_reward_zero_std": 0.0, "grad_norm": 6.676919937133789, "kl": 0.005999286193400621, "learning_rate": 1.2610770279481936e-07, "loss": -0.14257261157035828, "memory(GiB)": 68.67, "reward": 0.7560081481933594, "reward_std": 0.16296766698360443, "rewards/MathAnswerFormat/mean": 0.59375, "rewards/MathAnswerFormat/std": 0.49899089336395264, "rewards/PlanningActionSetORM/mean": 0.7460193037986755, "rewards/PlanningActionSetORM/std": 0.29043614864349365, "rewards/RMReward/mean": 0.671875, "rewards/RMReward/std": 0.15394672751426697, "rewards/SpatialReasoningORM/mean": 0.8374999761581421, "rewards/SpatialReasoningORM/std": 0.1995963603258133, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 74, "train_speed(iter/s)": 0.005492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 147.625, "completions/min_length": 2.0, "epoch": 0.002557719196535143, "frac_reward_zero_std": 0.0, "grad_norm": 22.166528701782227, "kl": 0.00010788359213620424, "learning_rate": 1.278118609406953e-07, "loss": -0.12798818945884705, "memory(GiB)": 68.67, "reward": 0.5557536482810974, "reward_std": 0.18010571599006653, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7706494331359863, "rewards/PlanningActionSetORM/std": 0.24777741730213165, "rewards/RMReward/mean": 0.5406250357627869, "rewards/RMReward/std": 0.18061839044094086, "rewards/SpatialReasoningORM/mean": 0.48750001192092896, "rewards/SpatialReasoningORM/std": 0.24186775088310242, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 75, "train_speed(iter/s)": 0.005439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 52.046875, "completions/min_length": 2.0, "epoch": 0.0025918221191556117, "frac_reward_zero_std": 0.0, "grad_norm": 10.722772598266602, "kl": 0.0043738363310694695, "learning_rate": 1.2951601908657124e-07, "loss": -0.03638050705194473, "memory(GiB)": 68.67, "reward": 0.5948455929756165, "reward_std": 0.15430252254009247, "rewards/MathAnswerFormat/mean": 0.5625, "rewards/MathAnswerFormat/std": 0.5123475790023804, "rewards/PlanningActionSetORM/mean": 0.5546875, "rewards/PlanningActionSetORM/std": 0.14894595742225647, "rewards/RMReward/mean": 0.637499988079071, "rewards/RMReward/std": 0.1454876959323883, "rewards/SpatialReasoningORM/mean": 0.8250000476837158, "rewards/SpatialReasoningORM/std": 0.20493900775909424, "rewards/VisualPerceptionAccuracy/mean": 0.4732849895954132, "rewards/VisualPerceptionAccuracy/std": 0.5029146075248718, "step": 76, "train_speed(iter/s)": 0.005462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/mean_length": 37.453125, "completions/min_length": 8.0, "epoch": 0.0026259250417760803, "frac_reward_zero_std": 0.0, "grad_norm": 7.3149261474609375, "kl": 0.005905167665332556, "learning_rate": 1.312201772324472e-07, "loss": -0.09401912242174149, "memory(GiB)": 68.67, "reward": 0.6411562561988831, "reward_std": 0.1968861222267151, "rewards/MathAnswerFormat/mean": 0.8333333134651184, "rewards/MathAnswerFormat/std": 0.3766217827796936, "rewards/PlanningActionSetORM/mean": 0.921875, "rewards/PlanningActionSetORM/std": 0.06810089200735092, "rewards/RMReward/mean": 0.4737499952316284, "rewards/RMReward/std": 0.13519738614559174, "rewards/SpatialReasoningORM/mean": 0.6583333015441895, "rewards/SpatialReasoningORM/std": 0.44618159532546997, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 77, "train_speed(iter/s)": 0.005483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 120.34375, "completions/min_length": 2.0, "epoch": 0.002660027964396549, "frac_reward_zero_std": 0.0, "grad_norm": 34.587154388427734, "kl": 0.0007761975866742432, "learning_rate": 1.3292433537832311e-07, "loss": -0.013287756592035294, "memory(GiB)": 69.16, "reward": 0.5114080905914307, "reward_std": 0.18066167831420898, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.774803638458252, "rewards/PlanningActionSetORM/std": 0.19927214086055756, "rewards/RMReward/mean": 0.5695833563804626, "rewards/RMReward/std": 0.2294856160879135, "rewards/SpatialReasoningORM/mean": 0.22500000894069672, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 78, "train_speed(iter/s)": 0.005428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 123.9375, "completions/min_length": 9.0, "epoch": 0.0026941308870170174, "frac_reward_zero_std": 0.0, "grad_norm": 3.4362494945526123, "kl": 0.004080138169229031, "learning_rate": 1.3462849352419907e-07, "loss": -0.08507783710956573, "memory(GiB)": 69.16, "reward": 0.6942434906959534, "reward_std": 0.1557389795780182, "rewards/MathAnswerFormat/mean": 0.8125, "rewards/MathAnswerFormat/std": 0.40311288833618164, "rewards/PlanningActionSetORM/mean": 0.7926650047302246, "rewards/PlanningActionSetORM/std": 0.2170020341873169, "rewards/RMReward/mean": 0.5758333206176758, "rewards/RMReward/std": 0.17268632352352142, "rewards/SpatialReasoningORM/mean": 0.9249999523162842, "rewards/SpatialReasoningORM/std": 0.16124515235424042, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 79, "train_speed(iter/s)": 0.005406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 133.921875, "completions/min_length": 2.0, "epoch": 0.002728233809637486, "frac_reward_zero_std": 0.0, "grad_norm": 8.58715534210205, "kl": 0.0035675051622092724, "learning_rate": 1.36332651670075e-07, "loss": -0.09914574027061462, "memory(GiB)": 69.16, "reward": 0.5005258321762085, "reward_std": 0.10239540785551071, "rewards/MathAnswerFormat/mean": 0.4375, "rewards/MathAnswerFormat/std": 0.504016101360321, "rewards/PlanningActionSetORM/mean": 0.8576140999794006, "rewards/PlanningActionSetORM/std": 0.15969280898571014, "rewards/RMReward/mean": 0.43437498807907104, "rewards/RMReward/std": 0.11360567808151245, "rewards/SpatialReasoningORM/mean": 0.7562500238418579, "rewards/SpatialReasoningORM/std": 0.24221757054328918, "rewards/VisualPerceptionAccuracy/mean": 0.0024556657299399376, "rewards/VisualPerceptionAccuracy/std": 0.003017341485247016, "step": 80, "train_speed(iter/s)": 0.005351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 42.796875, "completions/min_length": 2.0, "epoch": 0.0027623367322579546, "frac_reward_zero_std": 0.0, "grad_norm": 25.35330581665039, "kl": 0.007309362292289734, "learning_rate": 1.3803680981595094e-07, "loss": -0.1033204197883606, "memory(GiB)": 69.16, "reward": 0.6349624395370483, "reward_std": 0.16633577644824982, "rewards/MathAnswerFormat/mean": 0.5416666865348816, "rewards/MathAnswerFormat/std": 0.5035336017608643, "rewards/PlanningActionSetORM/mean": 0.870498538017273, "rewards/PlanningActionSetORM/std": 0.11689376085996628, "rewards/RMReward/mean": 0.4806249737739563, "rewards/RMReward/std": 0.09095557779073715, "rewards/SpatialReasoningORM/mean": 0.6666666865348816, "rewards/SpatialReasoningORM/std": 0.4193882644176483, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 81, "train_speed(iter/s)": 0.005378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/mean_length": 58.21875, "completions/min_length": 9.0, "epoch": 0.002796439654878423, "frac_reward_zero_std": 0.0, "grad_norm": 3.4418442249298096, "kl": 0.009212848730385303, "learning_rate": 1.3974096796182687e-07, "loss": -0.051449067890644073, "memory(GiB)": 69.16, "reward": 0.8564114570617676, "reward_std": 0.17515352368354797, "rewards/MathAnswerFormat/mean": 0.875, "rewards/MathAnswerFormat/std": 0.33601075410842896, "rewards/PlanningActionSetORM/mean": 0.8325520753860474, "rewards/PlanningActionSetORM/std": 0.16430257260799408, "rewards/RMReward/mean": 0.7871875166893005, "rewards/RMReward/std": 0.1582002341747284, "rewards/SpatialReasoningORM/mean": 0.918749988079071, "rewards/SpatialReasoningORM/std": 0.21468281745910645, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 82, "train_speed(iter/s)": 0.005377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 88.34375, "completions/min_length": 3.0, "epoch": 0.0028305425774988917, "frac_reward_zero_std": 0.0, "grad_norm": 16.0701904296875, "kl": 0.00646199518814683, "learning_rate": 1.414451261077028e-07, "loss": -0.06429173797369003, "memory(GiB)": 69.16, "reward": 0.632198691368103, "reward_std": 0.17995207011699677, "rewards/MathAnswerFormat/mean": 0.3125, "rewards/MathAnswerFormat/std": 0.4709290862083435, "rewards/PlanningActionSetORM/mean": 0.8922990560531616, "rewards/PlanningActionSetORM/std": 0.12942376732826233, "rewards/RMReward/mean": 0.543749988079071, "rewards/RMReward/std": 0.18083587288856506, "rewards/SpatialReasoningORM/mean": 0.6687500476837158, "rewards/SpatialReasoningORM/std": 0.2856091260910034, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 83, "train_speed(iter/s)": 0.005336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/mean_length": 189.90625, "completions/min_length": 8.0, "epoch": 0.0028646455001193603, "frac_reward_zero_std": 0.0, "grad_norm": 3.024365186691284, "kl": 0.00415208050981164, "learning_rate": 1.4314928425357875e-07, "loss": -0.004529934376478195, "memory(GiB)": 69.16, "reward": 0.578330397605896, "reward_std": 0.12156155705451965, "rewards/MathAnswerFormat/mean": 0.875, "rewards/MathAnswerFormat/std": 0.3415650427341461, "rewards/PlanningActionSetORM/mean": 0.7560707330703735, "rewards/PlanningActionSetORM/std": 0.23099148273468018, "rewards/RMReward/mean": 0.6496875286102295, "rewards/RMReward/std": 0.14234179258346558, "rewards/SpatialReasoningORM/mean": 0.949999988079071, "rewards/SpatialReasoningORM/std": 0.1366260051727295, "rewards/VisualPerceptionAccuracy/mean": 0.025143321603536606, "rewards/VisualPerceptionAccuracy/std": 0.06870760768651962, "step": 84, "train_speed(iter/s)": 0.005302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/mean_length": 84.5, "completions/min_length": 2.0, "epoch": 0.002898748422739829, "frac_reward_zero_std": 0.0, "grad_norm": 67.87237548828125, "kl": 0.0010671747149899602, "learning_rate": 1.4485344239945467e-07, "loss": -0.04536234214901924, "memory(GiB)": 69.16, "reward": 0.5412287712097168, "reward_std": 0.19282278418540955, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9044749736785889, "rewards/PlanningActionSetORM/std": 0.06632761657238007, "rewards/RMReward/mean": 0.5703125, "rewards/RMReward/std": 0.1740686595439911, "rewards/SpatialReasoningORM/mean": 0.46875, "rewards/SpatialReasoningORM/std": 0.2520080804824829, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 85, "train_speed(iter/s)": 0.005314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/mean_length": 92.765625, "completions/min_length": 9.0, "epoch": 0.0029328513453602974, "frac_reward_zero_std": 0.0, "grad_norm": 3.9155492782592773, "kl": 0.005194452591240406, "learning_rate": 1.4655760054533062e-07, "loss": -0.0659988671541214, "memory(GiB)": 69.16, "reward": 0.19600160419940948, "reward_std": 0.04807785525918007, "rewards/MathAnswerFormat/mean": 0.5625, "rewards/MathAnswerFormat/std": 0.5123475790023804, "rewards/PlanningActionSetORM/mean": 0.9125000238418579, "rewards/PlanningActionSetORM/std": 0.12568332254886627, "rewards/RMReward/mean": 0.6968749761581421, "rewards/RMReward/std": 0.1687885820865631, "rewards/SpatialReasoningORM/mean": 0.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.00794071052223444, "rewards/VisualPerceptionAccuracy/std": 0.01391301304101944, "step": 86, "train_speed(iter/s)": 0.005353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 81.109375, "completions/min_length": 2.0, "epoch": 0.002966954267980766, "frac_reward_zero_std": 0.0, "grad_norm": 10.706153869628906, "kl": 0.00010433288116473705, "learning_rate": 1.4826175869120655e-07, "loss": -0.05895226448774338, "memory(GiB)": 69.16, "reward": 0.5567179918289185, "reward_std": 0.126101553440094, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8704116940498352, "rewards/PlanningActionSetORM/std": 0.1827528029680252, "rewards/RMReward/mean": 0.6954166889190674, "rewards/RMReward/std": 0.1802593618631363, "rewards/SpatialReasoningORM/mean": 0.03750000149011612, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 87, "train_speed(iter/s)": 0.005352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/mean_length": 32.15625, "completions/min_length": 2.0, "epoch": 0.0030010571906012346, "frac_reward_zero_std": 0.0, "grad_norm": 50.73446273803711, "kl": 0.006288472563028336, "learning_rate": 1.499659168370825e-07, "loss": -0.013900049030780792, "memory(GiB)": 69.16, "reward": 0.5188281536102295, "reward_std": 0.2280990034341812, "rewards/MathAnswerFormat/mean": 0.2708333432674408, "rewards/MathAnswerFormat/std": 0.449092835187912, "rewards/PlanningActionSetORM/mean": 0.801562488079071, "rewards/PlanningActionSetORM/std": 0.0760858878493309, "rewards/RMReward/mean": 0.7250000238418579, "rewards/RMReward/std": 0.10327956825494766, "rewards/SpatialReasoningORM/mean": 0.4541666507720947, "rewards/SpatialReasoningORM/std": 0.40157049894332886, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 88, "train_speed(iter/s)": 0.005363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 101.109375, "completions/min_length": 2.0, "epoch": 0.003035160113221703, "frac_reward_zero_std": 0.0, "grad_norm": 29.250837326049805, "kl": 6.518045847769827e-05, "learning_rate": 1.5167007498295843e-07, "loss": -0.0837792158126831, "memory(GiB)": 69.16, "reward": 0.3795134127140045, "reward_std": 0.1432286947965622, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8166480660438538, "rewards/PlanningActionSetORM/std": 0.20406189560890198, "rewards/RMReward/mean": 0.6512500047683716, "rewards/RMReward/std": 0.16901516914367676, "rewards/SpatialReasoningORM/mean": 0.15000000596046448, "rewards/SpatialReasoningORM/std": 0.2683281898498535, "rewards/VisualPerceptionAccuracy/mean": 0.006894384510815144, "rewards/VisualPerceptionAccuracy/std": 0.027577538043260574, "step": 89, "train_speed(iter/s)": 0.005336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 131.1875, "completions/min_length": 12.0, "epoch": 0.0030692630358421717, "frac_reward_zero_std": 0.0, "grad_norm": 1.9094507694244385, "kl": 0.003599894931539893, "learning_rate": 1.5337423312883438e-07, "loss": -0.0344340056180954, "memory(GiB)": 69.16, "reward": 0.7254934310913086, "reward_std": 0.10894228518009186, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8455811142921448, "rewards/PlanningActionSetORM/std": 0.1258738487958908, "rewards/RMReward/mean": 0.5922916531562805, "rewards/RMReward/std": 0.18758532404899597, "rewards/SpatialReasoningORM/mean": 0.9750000238418579, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 90, "train_speed(iter/s)": 0.00531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/mean_length": 27.5, "completions/min_length": 2.0, "epoch": 0.0031033659584626403, "frac_reward_zero_std": 0.0, "grad_norm": 46.75975036621094, "kl": 0.002642372390255332, "learning_rate": 1.550783912747103e-07, "loss": 0.04760921001434326, "memory(GiB)": 69.16, "reward": 0.41874998807907104, "reward_std": 0.21180346608161926, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.796875, "rewards/PlanningActionSetORM/std": 0.2061300277709961, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.10626225918531418, "rewards/SpatialReasoningORM/mean": 0.3125000298023224, "rewards/SpatialReasoningORM/std": 0.3029114007949829, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 91, "train_speed(iter/s)": 0.005343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/mean_length": 105.15625, "completions/min_length": 2.0, "epoch": 0.003137468881083109, "frac_reward_zero_std": 0.0, "grad_norm": 62.263004302978516, "kl": 0.004167382139712572, "learning_rate": 1.5678254942058626e-07, "loss": -0.04961085319519043, "memory(GiB)": 69.16, "reward": 0.5235680937767029, "reward_std": 0.13758976757526398, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9144120812416077, "rewards/PlanningActionSetORM/std": 0.10771683603525162, "rewards/RMReward/mean": 0.5697917342185974, "rewards/RMReward/std": 0.173738494515419, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.28722816705703735, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 92, "train_speed(iter/s)": 0.00535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 155.5, "completions/min_length": 2.0, "epoch": 0.0031715718037035774, "frac_reward_zero_std": 0.0, "grad_norm": 12.930011749267578, "kl": -0.00020220344595145434, "learning_rate": 1.5848670756646218e-07, "loss": 0.01462763175368309, "memory(GiB)": 69.16, "reward": 0.5797370672225952, "reward_std": 0.12390126287937164, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8834554553031921, "rewards/PlanningActionSetORM/std": 0.1340387761592865, "rewards/RMReward/mean": 0.5227083563804626, "rewards/RMReward/std": 0.13763564825057983, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.15000000596046448, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 93, "train_speed(iter/s)": 0.005334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/mean_length": 77.640625, "completions/min_length": 2.0, "epoch": 0.003205674726324046, "frac_reward_zero_std": 0.0, "grad_norm": 12.711181640625, "kl": 0.010648422874510288, "learning_rate": 1.6019086571233813e-07, "loss": -0.06259234994649887, "memory(GiB)": 69.16, "reward": 0.6766130924224854, "reward_std": 0.14182700216770172, "rewards/MathAnswerFormat/mean": 0.4375, "rewards/MathAnswerFormat/std": 0.504016101360321, "rewards/PlanningActionSetORM/mean": 0.8911306262016296, "rewards/PlanningActionSetORM/std": 0.11612848937511444, "rewards/RMReward/mean": 0.565625011920929, "rewards/RMReward/std": 0.1510380655527115, "rewards/SpatialReasoningORM/mean": 0.737500011920929, "rewards/SpatialReasoningORM/std": 0.2756224274635315, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 94, "train_speed(iter/s)": 0.005327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 114.09375, "completions/min_length": 9.0, "epoch": 0.0032397776489445146, "frac_reward_zero_std": 0.0, "grad_norm": 1.9142701625823975, "kl": 0.011115633882582188, "learning_rate": 1.6189502385821406e-07, "loss": -0.10447518527507782, "memory(GiB)": 69.16, "reward": 0.7553759813308716, "reward_std": 0.12846116721630096, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8572984337806702, "rewards/PlanningActionSetORM/std": 0.16535571217536926, "rewards/RMReward/mean": 0.6391666531562805, "rewards/RMReward/std": 0.17262472212314606, "rewards/SpatialReasoningORM/mean": 0.9750000238418579, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 95, "train_speed(iter/s)": 0.005308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/mean_length": 177.8125, "completions/min_length": 9.0, "epoch": 0.003273880571564983, "frac_reward_zero_std": 0.0, "grad_norm": 2.6554102897644043, "kl": 0.024167906492948532, "learning_rate": 1.6359918200409e-07, "loss": 0.002300931140780449, "memory(GiB)": 69.16, "reward": 0.48681455850601196, "reward_std": 0.15246452391147614, "rewards/MathAnswerFormat/mean": 0.75, "rewards/MathAnswerFormat/std": 0.44721361994743347, "rewards/PlanningActionSetORM/mean": 0.8442987203598022, "rewards/PlanningActionSetORM/std": 0.13042715191841125, "rewards/RMReward/mean": 0.5062500238418579, "rewards/RMReward/std": 0.15332339704036713, "rewards/SpatialReasoningORM/mean": 0.7999999523162842, "rewards/SpatialReasoningORM/std": 0.35023802518844604, "rewards/VisualPerceptionAccuracy/mean": 0.002038793871179223, "rewards/VisualPerceptionAccuracy/std": 0.008155175484716892, "step": 96, "train_speed(iter/s)": 0.005294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 146.640625, "completions/min_length": 63.0, "epoch": 0.0033079834941854517, "frac_reward_zero_std": 0.0, "grad_norm": 0.8775780200958252, "kl": 6.542081973748282e-05, "learning_rate": 1.6530334014996594e-07, "loss": -0.005208496004343033, "memory(GiB)": 69.16, "reward": 0.7031875848770142, "reward_std": 0.1271270364522934, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.847187876701355, "rewards/PlanningActionSetORM/std": 0.16559448838233948, "rewards/RMReward/mean": 0.6671874523162842, "rewards/RMReward/std": 0.16359521448612213, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 97, "train_speed(iter/s)": 0.005278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/mean_length": 113.734375, "completions/min_length": 2.0, "epoch": 0.0033420864168059203, "frac_reward_zero_std": 0.0, "grad_norm": 16.580354690551758, "kl": 0.02884897217154503, "learning_rate": 1.670074982958419e-07, "loss": -0.15014538168907166, "memory(GiB)": 69.16, "reward": 0.47109633684158325, "reward_std": 0.16981688141822815, "rewards/MathAnswerFormat/mean": 0.6875, "rewards/MathAnswerFormat/std": 0.4684174358844757, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6250000596046448, "rewards/SpatialReasoningORM/std": 0.4034426212310791, "rewards/VisualPerceptionAccuracy/mean": 1.0420777471153997e-05, "rewards/VisualPerceptionAccuracy/std": 4.1683113522594795e-05, "step": 98, "train_speed(iter/s)": 0.0053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/mean_length": 142.796875, "completions/min_length": 74.0, "epoch": 0.003376189339426389, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515201807022095, "kl": 1.921286093420349e-05, "learning_rate": 1.6871165644171781e-07, "loss": -0.048162661492824554, "memory(GiB)": 69.16, "reward": 0.6885551810264587, "reward_std": 0.13082799315452576, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8990257978439331, "rewards/PlanningActionSetORM/std": 0.1369183361530304, "rewards/RMReward/mean": 0.6359374523162842, "rewards/RMReward/std": 0.21866635978221893, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 99, "train_speed(iter/s)": 0.005301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 91.25, "completions/min_length": 3.0, "epoch": 0.0034102922620468574, "frac_reward_zero_std": 0.0, "grad_norm": 19.142072677612305, "kl": 0.002431114437058568, "learning_rate": 1.7041581458759377e-07, "loss": -0.061217743903398514, "memory(GiB)": 69.16, "reward": 0.6068400144577026, "reward_std": 0.14967861771583557, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7643499374389648, "rewards/PlanningActionSetORM/std": 0.16660964488983154, "rewards/RMReward/mean": 0.612500011920929, "rewards/RMReward/std": 0.183494433760643, "rewards/SpatialReasoningORM/mean": 0.5250000357627869, "rewards/SpatialReasoningORM/std": 0.20493903756141663, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 100, "train_speed(iter/s)": 0.005306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 116.46875, "completions/min_length": 2.0, "epoch": 0.003444395184667326, "frac_reward_zero_std": 0.0, "grad_norm": 45.21478271484375, "kl": 0.0034053055569529533, "learning_rate": 1.7211997273346966e-07, "loss": -0.1418212503194809, "memory(GiB)": 69.16, "reward": 0.5443620085716248, "reward_std": 0.18216723203659058, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7076823115348816, "rewards/PlanningActionSetORM/std": 0.2686212658882141, "rewards/RMReward/mean": 0.671875, "rewards/RMReward/std": 0.12309025228023529, "rewards/SpatialReasoningORM/mean": 0.43125003576278687, "rewards/SpatialReasoningORM/std": 0.2740820646286011, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 101, "train_speed(iter/s)": 0.005205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 120.40625, "completions/min_length": 2.0, "epoch": 0.0034784981072877945, "frac_reward_zero_std": 0.0, "grad_norm": 27.79353141784668, "kl": -9.184223017655313e-05, "learning_rate": 1.7382413087934564e-07, "loss": -0.07328879088163376, "memory(GiB)": 69.16, "reward": 0.3565567433834076, "reward_std": 0.12572768330574036, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9234374761581421, "rewards/PlanningActionSetORM/std": 0.1696733981370926, "rewards/RMReward/mean": 0.7718750238418579, "rewards/RMReward/std": 0.04819665849208832, "rewards/SpatialReasoningORM/mean": 0.3187500238418579, "rewards/SpatialReasoningORM/std": 0.30420443415641785, "rewards/VisualPerceptionAccuracy/mean": 0.018414434045553207, "rewards/VisualPerceptionAccuracy/std": 0.026708804070949554, "step": 102, "train_speed(iter/s)": 0.005199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 171.953125, "completions/min_length": 54.0, "epoch": 0.003512601029908263, "frac_reward_zero_std": 0.0, "grad_norm": 0.7606267929077148, "kl": 6.572043639607728e-05, "learning_rate": 1.7552828902522154e-07, "loss": -0.04922802001237869, "memory(GiB)": 69.16, "reward": 0.6348370313644409, "reward_std": 0.12937507033348083, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8723099231719971, "rewards/PlanningActionSetORM/std": 0.13326551020145416, "rewards/RMReward/mean": 0.5754687786102295, "rewards/RMReward/std": 0.15056432783603668, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 103, "train_speed(iter/s)": 0.005199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 116.125, "completions/min_length": 2.0, "epoch": 0.0035467039525287317, "frac_reward_zero_std": 0.0, "grad_norm": 60.87458419799805, "kl": 0.0010178301017731428, "learning_rate": 1.772324471710975e-07, "loss": 0.033247314393520355, "memory(GiB)": 69.16, "reward": 0.3249780833721161, "reward_std": 0.16022366285324097, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8489583730697632, "rewards/PlanningActionSetORM/std": 0.14141710102558136, "rewards/RMReward/mean": 0.38062500953674316, "rewards/RMReward/std": 0.12572291493415833, "rewards/SpatialReasoningORM/mean": 0.4312500059604645, "rewards/SpatialReasoningORM/std": 0.2740820646286011, "rewards/VisualPerceptionAccuracy/mean": 0.006245708093047142, "rewards/VisualPerceptionAccuracy/std": 0.024982832372188568, "step": 104, "train_speed(iter/s)": 0.00522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/mean_length": 2.4375, "completions/min_length": 2.0, "epoch": 0.0035808068751492002, "frac_reward_zero_std": 0.0, "grad_norm": 78.9616470336914, "kl": 0.004677635617554188, "learning_rate": 1.7893660531697342e-07, "loss": 0.06740260124206543, "memory(GiB)": 69.16, "reward": 0.4168750047683716, "reward_std": 0.29495927691459656, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.30000004172325134, "rewards/SpatialReasoningORM/std": 0.3031747043132782, "rewards/VisualPerceptionAccuracy/mean": 0.8125, "rewards/VisualPerceptionAccuracy/std": 0.40311288833618164, "step": 105, "train_speed(iter/s)": 0.00524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/mean_length": 79.515625, "completions/min_length": 12.0, "epoch": 0.003614909797769669, "frac_reward_zero_std": 0.0, "grad_norm": 2.7227671146392822, "kl": 0.0788290873169899, "learning_rate": 1.8064076346284937e-07, "loss": -0.03951828554272652, "memory(GiB)": 69.16, "reward": 0.7873151302337646, "reward_std": 0.12350036948919296, "rewards/MathAnswerFormat/mean": 0.875, "rewards/MathAnswerFormat/std": 0.3415650427341461, "rewards/PlanningActionSetORM/mean": 0.8316840529441833, "rewards/PlanningActionSetORM/std": 0.17890511453151703, "rewards/RMReward/mean": 0.7099999785423279, "rewards/RMReward/std": 0.23805059492588043, "rewards/SpatialReasoningORM/mean": 0.949999988079071, "rewards/SpatialReasoningORM/std": 0.1366260051727295, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 106, "train_speed(iter/s)": 0.005239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 139.953125, "completions/min_length": 71.0, "epoch": 0.0036490127203901374, "frac_reward_zero_std": 0.0, "grad_norm": 0.7752564549446106, "kl": 0.00010719095007516444, "learning_rate": 1.823449216087253e-07, "loss": 0.014544110745191574, "memory(GiB)": 69.16, "reward": 0.5876930952072144, "reward_std": 0.11243686079978943, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8203403949737549, "rewards/PlanningActionSetORM/std": 0.15611377358436584, "rewards/RMReward/mean": 0.5295312404632568, "rewards/RMReward/std": 0.15141060948371887, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 107, "train_speed(iter/s)": 0.005236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/mean_length": 73.09375, "completions/min_length": 2.0, "epoch": 0.003683115643010606, "frac_reward_zero_std": 0.0, "grad_norm": 31.004894256591797, "kl": 0.007711970247328281, "learning_rate": 1.8404907975460125e-07, "loss": 0.010995730757713318, "memory(GiB)": 69.16, "reward": 0.631895899772644, "reward_std": 0.20450976490974426, "rewards/MathAnswerFormat/mean": 0.5, "rewards/MathAnswerFormat/std": 0.5080004930496216, "rewards/PlanningActionSetORM/mean": 0.9136470556259155, "rewards/PlanningActionSetORM/std": 0.13293135166168213, "rewards/RMReward/mean": 0.5853124856948853, "rewards/RMReward/std": 0.18048295378684998, "rewards/SpatialReasoningORM/mean": 0.6187499761581421, "rewards/SpatialReasoningORM/std": 0.4261133372783661, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 108, "train_speed(iter/s)": 0.00524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/mean_length": 33.046875, "completions/min_length": 2.0, "epoch": 0.0037172185656310745, "frac_reward_zero_std": 0.0, "grad_norm": 16.36119842529297, "kl": 0.047457676380872726, "learning_rate": 1.8575323790047717e-07, "loss": -0.16238468885421753, "memory(GiB)": 69.16, "reward": 0.8257421851158142, "reward_std": 0.1247640922665596, "rewards/MathAnswerFormat/mean": 0.6458333134651184, "rewards/MathAnswerFormat/std": 0.48332110047340393, "rewards/PlanningActionSetORM/mean": 0.7367187738418579, "rewards/PlanningActionSetORM/std": 0.25172653794288635, "rewards/RMReward/mean": 0.765625, "rewards/RMReward/std": 0.1640312522649765, "rewards/SpatialReasoningORM/mean": 0.8583333492279053, "rewards/SpatialReasoningORM/std": 0.19332842528820038, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 109, "train_speed(iter/s)": 0.005267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/mean_length": 33.6875, "completions/min_length": 2.0, "epoch": 0.003751321488251543, "frac_reward_zero_std": 0.75, "grad_norm": 0.35988351702690125, "kl": 0.0063239759765565395, "learning_rate": 1.8745739604635313e-07, "loss": -0.005326041020452976, "memory(GiB)": 69.16, "reward": 0.73200523853302, "reward_std": 0.018875332549214363, "rewards/MathAnswerFormat/mean": 0.3333333432674408, "rewards/MathAnswerFormat/std": 0.47639307379722595, "rewards/PlanningActionSetORM/mean": 0.8651041984558105, "rewards/PlanningActionSetORM/std": 0.15576034784317017, "rewards/RMReward/mean": 0.768750011920929, "rewards/RMReward/std": 0.08341663330793381, "rewards/SpatialReasoningORM/mean": 0.7333333492279053, "rewards/SpatialReasoningORM/std": 0.19055722653865814, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 110, "train_speed(iter/s)": 0.005236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 120.546875, "completions/min_length": 2.0, "epoch": 0.0037854244108720117, "frac_reward_zero_std": 0.0, "grad_norm": 10.482210159301758, "kl": 0.008879944682121277, "learning_rate": 1.8916155419222905e-07, "loss": -0.19530907273292542, "memory(GiB)": 69.16, "reward": 0.6299331784248352, "reward_std": 0.164458230137825, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.761429488658905, "rewards/PlanningActionSetORM/std": 0.24894694983959198, "rewards/RMReward/mean": 0.6108333468437195, "rewards/RMReward/std": 0.23489095270633698, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 111, "train_speed(iter/s)": 0.005203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 101.59375, "completions/min_length": 3.0, "epoch": 0.0038195273334924802, "frac_reward_zero_std": 0.0, "grad_norm": 32.30810546875, "kl": 0.0004714427632279694, "learning_rate": 1.90865712338105e-07, "loss": -0.03729706257581711, "memory(GiB)": 69.16, "reward": 0.6056398749351501, "reward_std": 0.14725950360298157, "rewards/MathAnswerFormat/mean": 0.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9313492178916931, "rewards/PlanningActionSetORM/std": 0.107616126537323, "rewards/RMReward/mean": 0.628125011920929, "rewards/RMReward/std": 0.1783214509487152, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.30000001192092896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 112, "train_speed(iter/s)": 0.005199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 56.59375, "completions/min_length": 2.0, "epoch": 0.003853630256112949, "frac_reward_zero_std": 0.0, "grad_norm": 26.997880935668945, "kl": 0.052741143852472305, "learning_rate": 1.9256987048398093e-07, "loss": -0.13854235410690308, "memory(GiB)": 69.16, "reward": 0.5013817548751831, "reward_std": 0.2038370668888092, "rewards/MathAnswerFormat/mean": 0.3958333432674408, "rewards/MathAnswerFormat/std": 0.49420398473739624, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.675000011920929, "rewards/SpatialReasoningORM/std": 0.3185739517211914, "rewards/VisualPerceptionAccuracy/mean": 0.022402027621865273, "rewards/VisualPerceptionAccuracy/std": 0.08960811793804169, "step": 113, "train_speed(iter/s)": 0.005229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 117.59375, "completions/min_length": 2.0, "epoch": 0.0038877331787334174, "frac_reward_zero_std": 0.0, "grad_norm": 17.565256118774414, "kl": 0.020434806123375893, "learning_rate": 1.9427402862985688e-07, "loss": -0.10322876274585724, "memory(GiB)": 69.16, "reward": 0.6112024784088135, "reward_std": 0.10430684685707092, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.8965578079223633, "rewards/PlanningActionSetORM/std": 0.08249711990356445, "rewards/RMReward/mean": 0.5458333492279053, "rewards/RMReward/std": 0.13080492615699768, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 114, "train_speed(iter/s)": 0.005241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 102.125, "completions/min_length": 2.0, "epoch": 0.003921836101353886, "frac_reward_zero_std": 0.0, "grad_norm": 10.991354942321777, "kl": 0.009472067467868328, "learning_rate": 1.959781867757328e-07, "loss": -0.21679843962192535, "memory(GiB)": 69.16, "reward": 0.6101297736167908, "reward_std": 0.1302555799484253, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.7527401447296143, "rewards/PlanningActionSetORM/std": 0.279987633228302, "rewards/RMReward/mean": 0.5800000429153442, "rewards/RMReward/std": 0.1611395627260208, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.10000000149011612, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 115, "train_speed(iter/s)": 0.005233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/mean_length": 76.46875, "completions/min_length": 14.0, "epoch": 0.003955939023974355, "frac_reward_zero_std": 0.0, "grad_norm": 3.1411468982696533, "kl": 0.0031458779703825712, "learning_rate": 1.9768234492160876e-07, "loss": 0.01717730611562729, "memory(GiB)": 69.16, "reward": 0.7645739316940308, "reward_std": 0.2511516809463501, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8582397699356079, "rewards/PlanningActionSetORM/std": 0.16336974501609802, "rewards/RMReward/mean": 0.7437499761581421, "rewards/RMReward/std": 0.12620387971401215, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.4399413466453552, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 116, "train_speed(iter/s)": 0.005257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/mean_length": 57.484375, "completions/min_length": 3.0, "epoch": 0.003990041946594823, "frac_reward_zero_std": 0.0, "grad_norm": 21.553314208984375, "kl": 0.021735820919275284, "learning_rate": 1.9938650306748468e-07, "loss": -0.12503188848495483, "memory(GiB)": 69.16, "reward": 0.6599404811859131, "reward_std": 0.11836305260658264, "rewards/MathAnswerFormat/mean": 0.0625, "rewards/MathAnswerFormat/std": 0.24593468010425568, "rewards/PlanningActionSetORM/mean": 0.921279788017273, "rewards/PlanningActionSetORM/std": 0.13992731273174286, "rewards/RMReward/mean": 0.6734374761581421, "rewards/RMReward/std": 0.14479371905326843, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.09837386757135391, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 117, "train_speed(iter/s)": 0.005271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 43.125, "completions/min_length": 2.0, "epoch": 0.004024144869215292, "frac_reward_zero_std": 0.0, "grad_norm": 16.62257957458496, "kl": 0.038156572729349136, "learning_rate": 2.0109066121336064e-07, "loss": -0.12877893447875977, "memory(GiB)": 69.16, "reward": 0.7655185461044312, "reward_std": 0.2323218584060669, "rewards/MathAnswerFormat/mean": 0.7291666865348816, "rewards/MathAnswerFormat/std": 0.449092835187912, "rewards/PlanningActionSetORM/mean": 0.9472470283508301, "rewards/PlanningActionSetORM/std": 0.05130886659026146, "rewards/RMReward/mean": 0.5743749737739563, "rewards/RMReward/std": 0.16665208339691162, "rewards/SpatialReasoningORM/mean": 0.8083333969116211, "rewards/SpatialReasoningORM/std": 0.3030577003955841, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 118, "train_speed(iter/s)": 0.00526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 175.9375, "completions/min_length": 61.0, "epoch": 0.00405824779183576, "frac_reward_zero_std": 0.0, "grad_norm": 1.0319796800613403, "kl": 0.00021911971271038055, "learning_rate": 2.0279481935923654e-07, "loss": -0.05050245299935341, "memory(GiB)": 69.16, "reward": 0.5123500823974609, "reward_std": 0.11285276710987091, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8435516357421875, "rewards/PlanningActionSetORM/std": 0.15871970355510712, "rewards/RMReward/mean": 0.5852082967758179, "rewards/RMReward/std": 0.17963306605815887, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.13876929879188538, "rewards/VisualPerceptionAccuracy/std": 0.11202280968427658, "step": 119, "train_speed(iter/s)": 0.00527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 135.34375, "completions/min_length": 14.0, "epoch": 0.004092350714456229, "frac_reward_zero_std": 0.0, "grad_norm": 4.340455055236816, "kl": 0.002895558485761285, "learning_rate": 2.0449897750511251e-07, "loss": -0.040266841650009155, "memory(GiB)": 69.16, "reward": 0.7490403652191162, "reward_std": 0.19255289435386658, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7706859707832336, "rewards/PlanningActionSetORM/std": 0.27320557832717896, "rewards/RMReward/mean": 0.6885417103767395, "rewards/RMReward/std": 0.17571724951267242, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 120, "train_speed(iter/s)": 0.00525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/mean_length": 96.90625, "completions/min_length": 44.0, "epoch": 0.004126453637076697, "frac_reward_zero_std": 0.0, "grad_norm": 1.0265387296676636, "kl": 0.0005865697748959064, "learning_rate": 2.062031356509884e-07, "loss": -0.009743457660079002, "memory(GiB)": 69.16, "reward": 0.660363495349884, "reward_std": 0.1343478560447693, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.825567364692688, "rewards/PlanningActionSetORM/std": 0.22142425179481506, "rewards/RMReward/mean": 0.6190625429153442, "rewards/RMReward/std": 0.18795238435268402, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 121, "train_speed(iter/s)": 0.005267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/mean_length": 67.921875, "completions/min_length": 2.0, "epoch": 0.004160556559697166, "frac_reward_zero_std": 0.0, "grad_norm": 38.12855529785156, "kl": 0.06100451201200485, "learning_rate": 2.0790729379686436e-07, "loss": -0.1651797741651535, "memory(GiB)": 69.16, "reward": 0.6707040667533875, "reward_std": 0.12164171785116196, "rewards/MathAnswerFormat/mean": 0.09375, "rewards/MathAnswerFormat/std": 0.2961445748806, "rewards/PlanningActionSetORM/mean": 0.8992278575897217, "rewards/PlanningActionSetORM/std": 0.10244543105363846, "rewards/RMReward/mean": 0.6890625357627869, "rewards/RMReward/std": 0.15225951373577118, "rewards/SpatialReasoningORM/mean": 0.6375000476837158, "rewards/SpatialReasoningORM/std": 0.11845782399177551, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 122, "train_speed(iter/s)": 0.005269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 130.9375, "completions/min_length": 2.0, "epoch": 0.0041946594823176345, "frac_reward_zero_std": 0.0, "grad_norm": 26.89638328552246, "kl": 0.06704875826835632, "learning_rate": 2.096114519427403e-07, "loss": -0.18676453828811646, "memory(GiB)": 69.16, "reward": 0.62687087059021, "reward_std": 0.11894381046295166, "rewards/MathAnswerFormat/mean": 0.25, "rewards/MathAnswerFormat/std": 0.44721361994743347, "rewards/PlanningActionSetORM/mean": 0.9208054542541504, "rewards/PlanningActionSetORM/std": 0.11873335391283035, "rewards/RMReward/mean": 0.5322917103767395, "rewards/RMReward/std": 0.1449466496706009, "rewards/SpatialReasoningORM/mean": 0.7000000476837158, "rewards/SpatialReasoningORM/std": 0.17888543009757996, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 123, "train_speed(iter/s)": 0.005262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/mean_length": 70.921875, "completions/min_length": 2.0, "epoch": 0.0042287624049381035, "frac_reward_zero_std": 0.0, "grad_norm": 32.0610466003418, "kl": 0.1968420147895813, "learning_rate": 2.1131561008861624e-07, "loss": -0.10161139816045761, "memory(GiB)": 69.16, "reward": 0.43683549761772156, "reward_std": 0.19088459014892578, "rewards/MathAnswerFormat/mean": 0.71875, "rewards/MathAnswerFormat/std": 0.45680341124534607, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.38749998807907104, "rewards/SpatialReasoningORM/std": 0.4187154173851013, "rewards/VisualPerceptionAccuracy/mean": 0.46960845589637756, "rewards/VisualPerceptionAccuracy/std": 0.5061938166618347, "step": 124, "train_speed(iter/s)": 0.005293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 7.25, "completions/min_length": 2.0, "epoch": 0.004262865327558572, "frac_reward_zero_std": 0.0, "grad_norm": 37.33481216430664, "kl": 0.41766485571861267, "learning_rate": 2.1301976823449217e-07, "loss": -0.18553240597248077, "memory(GiB)": 69.16, "reward": 0.8301562666893005, "reward_std": 0.28766965866088867, "rewards/MathAnswerFormat/mean": 0.7083333134651184, "rewards/MathAnswerFormat/std": 0.4593396484851837, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8208333849906921, "rewards/SpatialReasoningORM/std": 0.28054529428482056, "rewards/VisualPerceptionAccuracy/mean": 0.875, "rewards/VisualPerceptionAccuracy/std": 0.3415650427341461, "step": 125, "train_speed(iter/s)": 0.005324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2049.0, "completions/mean_length": 512.78125, "completions/min_length": 113.0, "epoch": 0.004296968250179041, "frac_reward_zero_std": 0.0, "grad_norm": 0.6742759943008423, "kl": 0.00023160793352872133, "learning_rate": 2.1472392638036812e-07, "loss": -0.0057653337717056274, "memory(GiB)": 69.16, "reward": 0.36737364530563354, "reward_std": 0.06333071738481522, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.894444465637207, "rewards/PlanningActionSetORM/std": 0.10401555895805359, "rewards/RMReward/mean": 0.6859375238418579, "rewards/RMReward/std": 0.17562532424926758, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0071083311922848225, "rewards/VisualPerceptionAccuracy/std": 0.014754912815988064, "step": 126, "train_speed(iter/s)": 0.005306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/mean_length": 31.84375, "completions/min_length": 2.0, "epoch": 0.004331071172799509, "frac_reward_zero_std": 0.0, "grad_norm": 32.760860443115234, "kl": 0.4890006482601166, "learning_rate": 2.1642808452624404e-07, "loss": -0.13343648612499237, "memory(GiB)": 69.16, "reward": 0.6139062643051147, "reward_std": 0.25430336594581604, "rewards/MathAnswerFormat/mean": 0.3958333432674408, "rewards/MathAnswerFormat/std": 0.49420398473739624, "rewards/PlanningActionSetORM/mean": 0.9468749761581421, "rewards/PlanningActionSetORM/std": 0.08055795729160309, "rewards/RMReward/mean": 0.6656249761581421, "rewards/RMReward/std": 0.16095419228076935, "rewards/SpatialReasoningORM/mean": 0.5875000357627869, "rewards/SpatialReasoningORM/std": 0.3311006724834442, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 127, "train_speed(iter/s)": 0.005325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 5.53125, "completions/min_length": 2.0, "epoch": 0.004365174095419978, "frac_reward_zero_std": 0.0, "grad_norm": 76.98832702636719, "kl": 0.9943448305130005, "learning_rate": 2.1813224267212e-07, "loss": -0.2829389274120331, "memory(GiB)": 69.16, "reward": 0.8253124952316284, "reward_std": 0.18526458740234375, "rewards/MathAnswerFormat/mean": 0.59375, "rewards/MathAnswerFormat/std": 0.49501484632492065, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8374999761581421, "rewards/SpatialReasoningORM/std": 0.19800592958927155, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 128, "train_speed(iter/s)": 0.005353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/mean_length": 29.546875, "completions/min_length": 2.0, "epoch": 0.004399277018040446, "frac_reward_zero_std": 0.0, "grad_norm": 32.115928649902344, "kl": 0.3497599959373474, "learning_rate": 2.1983640081799592e-07, "loss": -0.1166493147611618, "memory(GiB)": 69.16, "reward": 0.8131250143051147, "reward_std": 0.2596621513366699, "rewards/MathAnswerFormat/mean": 0.75, "rewards/MathAnswerFormat/std": 0.4399413466453552, "rewards/PlanningActionSetORM/mean": 0.840624988079071, "rewards/PlanningActionSetORM/std": 0.08459462225437164, "rewards/RMReward/mean": 0.753125011920929, "rewards/RMReward/std": 0.10718948394060135, "rewards/SpatialReasoningORM/mean": 0.8062500357627869, "rewards/SpatialReasoningORM/std": 0.31514203548431396, "rewards/VisualPerceptionAccuracy/mean": 0.875, "rewards/VisualPerceptionAccuracy/std": 0.3415650427341461, "step": 129, "train_speed(iter/s)": 0.005358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 64.328125, "completions/min_length": 2.0, "epoch": 0.004433379940660915, "frac_reward_zero_std": 0.0, "grad_norm": 10.153584480285645, "kl": 0.44268369674682617, "learning_rate": 2.2154055896387187e-07, "loss": -0.0806940495967865, "memory(GiB)": 69.16, "reward": 0.8610537052154541, "reward_std": 0.13532119989395142, "rewards/MathAnswerFormat/mean": 0.875, "rewards/MathAnswerFormat/std": 0.33601075410842896, "rewards/PlanningActionSetORM/mean": 0.910537600517273, "rewards/PlanningActionSetORM/std": 0.12662091851234436, "rewards/RMReward/mean": 0.7421875, "rewards/RMReward/std": 0.19096924364566803, "rewards/SpatialReasoningORM/mean": 0.949999988079071, "rewards/SpatialReasoningORM/std": 0.1344042867422104, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 130, "train_speed(iter/s)": 0.005361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1254.0, "completions/mean_length": 142.671875, "completions/min_length": 7.0, "epoch": 0.004467482863281383, "frac_reward_zero_std": 0.0, "grad_norm": 8.875357627868652, "kl": 0.5031766295433044, "learning_rate": 2.232447171097478e-07, "loss": -0.005861423909664154, "memory(GiB)": 69.16, "reward": 0.513052761554718, "reward_std": 0.23844686150550842, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6666666865348816, "rewards/SpatialReasoningORM/std": 0.47639307379722595, "rewards/VisualPerceptionAccuracy/mean": 0.0022108578123152256, "rewards/VisualPerceptionAccuracy/std": 0.008843431249260902, "step": 131, "train_speed(iter/s)": 0.005371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 66.28125, "completions/min_length": 8.0, "epoch": 0.004501585785901852, "frac_reward_zero_std": 0.0, "grad_norm": 6.515994071960449, "kl": 0.4732876121997833, "learning_rate": 2.2494887525562375e-07, "loss": -0.02341190166771412, "memory(GiB)": 69.16, "reward": 0.47732698917388916, "reward_std": 0.27203497290611267, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8152901530265808, "rewards/PlanningActionSetORM/std": 0.15461130440235138, "rewards/RMReward/mean": 0.6593749523162842, "rewards/RMReward/std": 0.1113833487033844, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.48924607038497925, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 132, "train_speed(iter/s)": 0.005373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/mean_length": 52.140625, "completions/min_length": 8.0, "epoch": 0.00453568870852232, "frac_reward_zero_std": 0.0, "grad_norm": 9.0066556930542, "kl": 0.7943549156188965, "learning_rate": 2.2665303340149968e-07, "loss": 0.005387982353568077, "memory(GiB)": 69.16, "reward": 0.7215364575386047, "reward_std": 0.2947700619697571, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8934895992279053, "rewards/PlanningActionSetORM/std": 0.13394908607006073, "rewards/RMReward/mean": 0.7015625238418579, "rewards/RMReward/std": 0.15318362414836884, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4709290862083435, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 133, "train_speed(iter/s)": 0.005383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 131.5, "completions/min_length": 14.0, "epoch": 0.004569791631142789, "frac_reward_zero_std": 0.0, "grad_norm": 2.1427829265594482, "kl": 0.008661498315632343, "learning_rate": 2.2835719154737563e-07, "loss": -0.029709462076425552, "memory(GiB)": 69.16, "reward": 0.5716249942779541, "reward_std": 0.23296552896499634, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8989583849906921, "rewards/PlanningActionSetORM/std": 0.1143217533826828, "rewards/RMReward/mean": 0.533958375453949, "rewards/RMReward/std": 0.17967748641967773, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 134, "train_speed(iter/s)": 0.00537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/mean_length": 119.9375, "completions/min_length": 58.0, "epoch": 0.004603894553763257, "frac_reward_zero_std": 0.0, "grad_norm": 0.9368076920509338, "kl": 0.0010836783330887556, "learning_rate": 2.3006134969325155e-07, "loss": -0.039234984666109085, "memory(GiB)": 69.16, "reward": 0.6462823748588562, "reward_std": 0.12697425484657288, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8926618099212646, "rewards/PlanningActionSetORM/std": 0.16140371561050415, "rewards/RMReward/mean": 0.5846875309944153, "rewards/RMReward/std": 0.16708490252494812, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 135, "train_speed(iter/s)": 0.005364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/mean_length": 88.953125, "completions/min_length": 8.0, "epoch": 0.004637997476383726, "frac_reward_zero_std": 0.0, "grad_norm": 6.1064677238464355, "kl": 0.2023729383945465, "learning_rate": 2.317655078391275e-07, "loss": -0.014356574043631554, "memory(GiB)": 69.16, "reward": 0.6680647730827332, "reward_std": 0.14593517780303955, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9128720164299011, "rewards/PlanningActionSetORM/std": 0.15537682175636292, "rewards/RMReward/mean": 0.7531249523162842, "rewards/RMReward/std": 0.10403325408697128, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.24593468010425568, "rewards/VisualPerceptionAccuracy/mean": 0.0059346589259803295, "rewards/VisualPerceptionAccuracy/std": 0.0037030752282589674, "step": 136, "train_speed(iter/s)": 0.005375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/mean_length": 96.71875, "completions/min_length": 8.0, "epoch": 0.0046721003990041945, "frac_reward_zero_std": 0.0, "grad_norm": 7.259604454040527, "kl": 0.2421349287033081, "learning_rate": 2.334696659850034e-07, "loss": -0.03547379747033119, "memory(GiB)": 69.16, "reward": 0.5248070359230042, "reward_std": 0.2057066559791565, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8243386149406433, "rewards/PlanningActionSetORM/std": 0.2035295069217682, "rewards/RMReward/mean": 0.5735417008399963, "rewards/RMReward/std": 0.18640483915805817, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 137, "train_speed(iter/s)": 0.005377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 104.953125, "completions/min_length": 8.0, "epoch": 0.0047062033216246635, "frac_reward_zero_std": 0.0, "grad_norm": 9.974244117736816, "kl": 0.31472063064575195, "learning_rate": 2.3517382413087938e-07, "loss": -0.06052904203534126, "memory(GiB)": 69.16, "reward": 0.5241013169288635, "reward_std": 0.21367821097373962, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8591145873069763, "rewards/PlanningActionSetORM/std": 0.22138534486293793, "rewards/RMReward/mean": 0.6812500357627869, "rewards/RMReward/std": 0.174017995595932, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.01900942251086235, "rewards/VisualPerceptionAccuracy/std": 0.05413059517741203, "step": 138, "train_speed(iter/s)": 0.005387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 134.09375, "completions/min_length": 82.0, "epoch": 0.004740306244245132, "frac_reward_zero_std": 0.0, "grad_norm": 0.8185073733329773, "kl": 0.0008047579322010279, "learning_rate": 2.3687798227675528e-07, "loss": -0.059652287513017654, "memory(GiB)": 69.16, "reward": 0.6524960398674011, "reward_std": 0.10716114938259125, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9043551683425903, "rewards/PlanningActionSetORM/std": 0.10273490101099014, "rewards/RMReward/mean": 0.5895312428474426, "rewards/RMReward/std": 0.13567502796649933, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 139, "train_speed(iter/s)": 0.005386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1557.0, "completions/mean_length": 237.234375, "completions/min_length": 14.0, "epoch": 0.004774409166865601, "frac_reward_zero_std": 0.0, "grad_norm": 1.6034201383590698, "kl": 0.003892803331837058, "learning_rate": 2.3858214042263126e-07, "loss": 0.029412558302283287, "memory(GiB)": 69.16, "reward": 0.18848051130771637, "reward_std": 0.10301587730646133, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8580357432365417, "rewards/PlanningActionSetORM/std": 0.15591542422771454, "rewards/RMReward/mean": 0.565000057220459, "rewards/RMReward/std": 0.13165612518787384, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.010469954460859299, "rewards/VisualPerceptionAccuracy/std": 0.03507794439792633, "step": 140, "train_speed(iter/s)": 0.005373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 101.53125, "completions/min_length": 8.0, "epoch": 0.004808512089486069, "frac_reward_zero_std": 0.0, "grad_norm": 4.370182991027832, "kl": 0.343711793422699, "learning_rate": 2.402862985685072e-07, "loss": -0.03196563944220543, "memory(GiB)": 69.16, "reward": 0.5140881538391113, "reward_std": 0.10101315379142761, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8795883059501648, "rewards/PlanningActionSetORM/std": 0.1445714384317398, "rewards/RMReward/mean": 0.4703124761581421, "rewards/RMReward/std": 0.10614913702011108, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.011392267420887947, "rewards/VisualPerceptionAccuracy/std": 0.004966686945408583, "step": 141, "train_speed(iter/s)": 0.005373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2049.0, "completions/mean_length": 152.6875, "completions/min_length": 2.0, "epoch": 0.004842615012106538, "frac_reward_zero_std": 0.0, "grad_norm": 5.352238655090332, "kl": 0.7586576342582703, "learning_rate": 2.419904567143831e-07, "loss": -0.12925222516059875, "memory(GiB)": 69.16, "reward": 0.5711859464645386, "reward_std": 0.22117546200752258, "rewards/MathAnswerFormat/mean": 0.96875, "rewards/MathAnswerFormat/std": 0.1767766922712326, "rewards/PlanningActionSetORM/mean": 0.8243589401245117, "rewards/PlanningActionSetORM/std": 0.19736874103546143, "rewards/RMReward/mean": 0.7531249523162842, "rewards/RMReward/std": 0.09240294992923737, "rewards/SpatialReasoningORM/mean": 0.34375, "rewards/SpatialReasoningORM/std": 0.4825586974620819, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 142, "train_speed(iter/s)": 0.005363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/mean_length": 184.09375, "completions/min_length": 43.0, "epoch": 0.004876717934727006, "frac_reward_zero_std": 0.0, "grad_norm": 0.9414759278297424, "kl": 0.0010313601233065128, "learning_rate": 2.4369461486025904e-07, "loss": 0.0340191051363945, "memory(GiB)": 69.16, "reward": 0.5306472778320312, "reward_std": 0.09722454845905304, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8789398074150085, "rewards/PlanningActionSetORM/std": 0.14232927560806274, "rewards/RMReward/mean": 0.6631249785423279, "rewards/RMReward/std": 0.1890868991613388, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.003725299146026373, "rewards/VisualPerceptionAccuracy/std": 0.0029209202621132135, "step": 143, "train_speed(iter/s)": 0.005368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 105.828125, "completions/min_length": 8.0, "epoch": 0.004910820857347475, "frac_reward_zero_std": 0.0, "grad_norm": 5.1842942237854, "kl": 0.2346501350402832, "learning_rate": 2.45398773006135e-07, "loss": -0.10160716623067856, "memory(GiB)": 69.16, "reward": 0.7390414476394653, "reward_std": 0.19341591000556946, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8279845118522644, "rewards/PlanningActionSetORM/std": 0.18336187303066254, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.10550032556056976, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 144, "train_speed(iter/s)": 0.005348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/mean_length": 147.71875, "completions/min_length": 79.0, "epoch": 0.004944923779967943, "frac_reward_zero_std": 0.0, "grad_norm": 0.6366856098175049, "kl": 0.0008406902197748423, "learning_rate": 2.471029311520109e-07, "loss": -0.03146018460392952, "memory(GiB)": 69.16, "reward": 0.5249054431915283, "reward_std": 0.0852465033531189, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8550276160240173, "rewards/PlanningActionSetORM/std": 0.1521150767803192, "rewards/RMReward/mean": 0.6610416769981384, "rewards/RMReward/std": 0.14752192795276642, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.00010500478674657643, "rewards/VisualPerceptionAccuracy/std": 0.00018203452054876834, "step": 145, "train_speed(iter/s)": 0.005354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 100.390625, "completions/min_length": 8.0, "epoch": 0.004979026702588412, "frac_reward_zero_std": 0.0, "grad_norm": 7.159071445465088, "kl": 0.7256119251251221, "learning_rate": 2.4880708929788687e-07, "loss": -0.0532202422618866, "memory(GiB)": 69.16, "reward": 0.416553795337677, "reward_std": 0.23362240195274353, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8811631798744202, "rewards/PlanningActionSetORM/std": 0.11300291866064072, "rewards/RMReward/mean": 0.535937488079071, "rewards/RMReward/std": 0.13983537256717682, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.3965577781200409, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 146, "train_speed(iter/s)": 0.005336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/mean_length": 96.5, "completions/min_length": 8.0, "epoch": 0.00501312962520888, "frac_reward_zero_std": 0.0, "grad_norm": 3.021448850631714, "kl": 0.38370054960250854, "learning_rate": 2.505112474437628e-07, "loss": 0.006049616727977991, "memory(GiB)": 69.16, "reward": 0.5516406297683716, "reward_std": 0.1256677657365799, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.932812511920929, "rewards/PlanningActionSetORM/std": 0.09137455374002457, "rewards/RMReward/mean": 0.640625, "rewards/RMReward/std": 0.1763266623020172, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 147, "train_speed(iter/s)": 0.005348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 113.15625, "completions/min_length": 8.0, "epoch": 0.005047232547829349, "frac_reward_zero_std": 0.0, "grad_norm": 2.9424078464508057, "kl": 0.2589932680130005, "learning_rate": 2.522154055896387e-07, "loss": -0.03844868391752243, "memory(GiB)": 69.16, "reward": 0.5489835143089294, "reward_std": 0.15323090553283691, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8909319043159485, "rewards/PlanningActionSetORM/std": 0.15649163722991943, "rewards/RMReward/mean": 0.6466667056083679, "rewards/RMReward/std": 0.13799989223480225, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 148, "train_speed(iter/s)": 0.005325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/mean_length": 63.921875, "completions/min_length": 8.0, "epoch": 0.005081335470449817, "frac_reward_zero_std": 0.0, "grad_norm": 5.487532615661621, "kl": 0.29126477241516113, "learning_rate": 2.5391956373551464e-07, "loss": -0.03464755415916443, "memory(GiB)": 69.16, "reward": 0.6872689723968506, "reward_std": 0.26916101574897766, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8051897287368774, "rewards/PlanningActionSetORM/std": 0.19840727746486664, "rewards/RMReward/mean": 0.5637500286102295, "rewards/RMReward/std": 0.18426401913166046, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.4399413466453552, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 149, "train_speed(iter/s)": 0.005326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/mean_length": 57.328125, "completions/min_length": 8.0, "epoch": 0.005115438393070286, "frac_reward_zero_std": 0.0, "grad_norm": 8.303274154663086, "kl": 0.4815227687358856, "learning_rate": 2.556237218813906e-07, "loss": 0.025063227862119675, "memory(GiB)": 69.16, "reward": 0.5271406173706055, "reward_std": 0.2551981508731842, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8960937261581421, "rewards/PlanningActionSetORM/std": 0.09916732460260391, "rewards/RMReward/mean": 0.6231250166893005, "rewards/RMReward/std": 0.2014053463935852, "rewards/SpatialReasoningORM/mean": 0.34375, "rewards/SpatialReasoningORM/std": 0.4825586974620819, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 150, "train_speed(iter/s)": 0.005327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 10.078125, "completions/min_length": 8.0, "epoch": 0.0051495413156907545, "frac_reward_zero_std": 1.0, "grad_norm": 0.009486901573836803, "kl": 0.6615129709243774, "learning_rate": 2.5732788002726655e-07, "loss": 0.0006611973512917757, "memory(GiB)": 69.16, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 151, "train_speed(iter/s)": 0.005332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/mean_length": 62.453125, "completions/min_length": 8.0, "epoch": 0.0051836442383112235, "frac_reward_zero_std": 0.0, "grad_norm": 8.212359428405762, "kl": 0.24312163889408112, "learning_rate": 2.5903203817314247e-07, "loss": -0.0012879259884357452, "memory(GiB)": 69.16, "reward": 0.6780424118041992, "reward_std": 0.24048474431037903, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9032366275787354, "rewards/PlanningActionSetORM/std": 0.08579596877098083, "rewards/RMReward/mean": 0.627500057220459, "rewards/RMReward/std": 0.17551766335964203, "rewards/SpatialReasoningORM/mean": 0.65625, "rewards/SpatialReasoningORM/std": 0.4825586974620819, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 152, "train_speed(iter/s)": 0.005329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 127.46875, "completions/min_length": 25.0, "epoch": 0.005217747160931692, "frac_reward_zero_std": 0.0, "grad_norm": 1.0101615190505981, "kl": 0.0022312719374895096, "learning_rate": 2.607361963190184e-07, "loss": -0.0697772428393364, "memory(GiB)": 69.16, "reward": 0.6737657189369202, "reward_std": 0.12375812232494354, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8913285136222839, "rewards/PlanningActionSetORM/std": 0.17431963980197906, "rewards/RMReward/mean": 0.6193749904632568, "rewards/RMReward/std": 0.17528095841407776, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 153, "train_speed(iter/s)": 0.005318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/mean_length": 177.9375, "completions/min_length": 7.0, "epoch": 0.005251850083552161, "frac_reward_zero_std": 0.0, "grad_norm": 9.339456558227539, "kl": 0.38825371861457825, "learning_rate": 2.624403544648944e-07, "loss": 0.10622972249984741, "memory(GiB)": 69.16, "reward": 0.369717001914978, "reward_std": 0.2013726681470871, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9452381134033203, "rewards/PlanningActionSetORM/std": 0.12958157062530518, "rewards/RMReward/mean": 0.640625, "rewards/RMReward/std": 0.16453850269317627, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": 0.03709772229194641, "rewards/VisualPerceptionAccuracy/std": 0.11360425502061844, "step": 154, "train_speed(iter/s)": 0.005332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/mean_length": 54.078125, "completions/min_length": 8.0, "epoch": 0.005285953006172629, "frac_reward_zero_std": 0.0, "grad_norm": 5.31643009185791, "kl": 0.39909905195236206, "learning_rate": 2.641445126107703e-07, "loss": -0.01131357904523611, "memory(GiB)": 69.16, "reward": 0.5794531106948853, "reward_std": 0.18950524926185608, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9476562142372131, "rewards/PlanningActionSetORM/std": 0.1260659545660019, "rewards/RMReward/mean": 0.6296875476837158, "rewards/RMReward/std": 0.19545620679855347, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.504016101360321, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 155, "train_speed(iter/s)": 0.005345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 77.140625, "completions/min_length": 8.0, "epoch": 0.005320055928793098, "frac_reward_zero_std": 0.0, "grad_norm": 6.044063091278076, "kl": 0.7644968032836914, "learning_rate": 2.6584867075664623e-07, "loss": 0.02617832086980343, "memory(GiB)": 69.16, "reward": 0.5345771312713623, "reward_std": 0.2980148196220398, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8551462888717651, "rewards/PlanningActionSetORM/std": 0.14545178413391113, "rewards/RMReward/mean": 0.5406249761581421, "rewards/RMReward/std": 0.12790967524051666, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.504016101360321, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 156, "train_speed(iter/s)": 0.005349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 62.328125, "completions/min_length": 2.0, "epoch": 0.005354158851413566, "frac_reward_zero_std": 0.0, "grad_norm": 14.920748710632324, "kl": 0.3274642825126648, "learning_rate": 2.6755282890252215e-07, "loss": -0.05287901684641838, "memory(GiB)": 69.16, "reward": 0.6767336130142212, "reward_std": 0.27088695764541626, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7938988208770752, "rewards/PlanningActionSetORM/std": 0.16171729564666748, "rewards/RMReward/mean": 0.659375011920929, "rewards/RMReward/std": 0.1352640688419342, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.75, "rewards/VisualPerceptionAccuracy/std": 0.44721361994743347, "step": 157, "train_speed(iter/s)": 0.005362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/mean_length": 32.984375, "completions/min_length": 8.0, "epoch": 0.005388261774034035, "frac_reward_zero_std": 0.0, "grad_norm": 14.450737953186035, "kl": 0.6688479781150818, "learning_rate": 2.6925698704839813e-07, "loss": 0.014951595105230808, "memory(GiB)": 69.16, "reward": 0.739270806312561, "reward_std": 0.35702162981033325, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9885416626930237, "rewards/PlanningActionSetORM/std": 0.03145764395594597, "rewards/RMReward/mean": 0.8125, "rewards/RMReward/std": 0.12449900060892105, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4684174358844757, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 158, "train_speed(iter/s)": 0.005372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/mean_length": 32.234375, "completions/min_length": 8.0, "epoch": 0.005422364696654503, "frac_reward_zero_std": 0.0, "grad_norm": 11.264158248901367, "kl": 0.5433053970336914, "learning_rate": 2.7096114519427406e-07, "loss": -0.015675444155931473, "memory(GiB)": 69.16, "reward": 0.6331771016120911, "reward_std": 0.36348676681518555, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8979166746139526, "rewards/PlanningActionSetORM/std": 0.15087707340717316, "rewards/RMReward/mean": 0.75, "rewards/RMReward/std": 0.09831921756267548, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5013279914855957, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 159, "train_speed(iter/s)": 0.005393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 77.59375, "completions/min_length": 8.0, "epoch": 0.005456467619274972, "frac_reward_zero_std": 0.0, "grad_norm": 9.525080680847168, "kl": 0.773806095123291, "learning_rate": 2.7266530334015e-07, "loss": -0.035107869654893875, "memory(GiB)": 69.16, "reward": 0.6909326314926147, "reward_std": 0.24938036501407623, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8905763626098633, "rewards/PlanningActionSetORM/std": 0.10669633746147156, "rewards/RMReward/mean": 0.699999988079071, "rewards/RMReward/std": 0.1391216665506363, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.49186936020851135, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 160, "train_speed(iter/s)": 0.005389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/mean_length": 163.515625, "completions/min_length": 8.0, "epoch": 0.00549057054189544, "frac_reward_zero_std": 0.0, "grad_norm": 5.959665298461914, "kl": 0.16288502514362335, "learning_rate": 2.743694614860259e-07, "loss": -0.015840960666537285, "memory(GiB)": 69.16, "reward": 0.34404075145721436, "reward_std": 0.12564274668693542, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7923098802566528, "rewards/PlanningActionSetORM/std": 0.18883275985717773, "rewards/RMReward/mean": 0.5828125476837158, "rewards/RMReward/std": 0.19740630686283112, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.017364056780934334, "rewards/VisualPerceptionAccuracy/std": 0.06945622712373734, "step": 161, "train_speed(iter/s)": 0.005394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/mean_length": 53.046875, "completions/min_length": 8.0, "epoch": 0.005524673464515909, "frac_reward_zero_std": 0.0, "grad_norm": 5.3821539878845215, "kl": 0.25692489743232727, "learning_rate": 2.760736196319019e-07, "loss": -0.004991693422198296, "memory(GiB)": 69.16, "reward": 0.6537500023841858, "reward_std": 0.29901644587516785, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9125000238418579, "rewards/PlanningActionSetORM/std": 0.1976451873779297, "rewards/RMReward/mean": 0.75, "rewards/RMReward/std": 0.1513381153345108, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 162, "train_speed(iter/s)": 0.005401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 171.96875, "completions/min_length": 23.0, "epoch": 0.005558776387136377, "frac_reward_zero_std": 0.0, "grad_norm": 0.8352375626564026, "kl": 0.0031698201783001423, "learning_rate": 2.7777777777777776e-07, "loss": -0.04118116572499275, "memory(GiB)": 69.16, "reward": 0.54547119140625, "reward_std": 0.10209926962852478, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8860599994659424, "rewards/PlanningActionSetORM/std": 0.1896575391292572, "rewards/RMReward/mean": 0.6831250190734863, "rewards/RMReward/std": 0.13956095278263092, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.01074867881834507, "rewards/VisualPerceptionAccuracy/std": 0.04299471527338028, "step": 163, "train_speed(iter/s)": 0.005407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 194.171875, "completions/min_length": 8.0, "epoch": 0.005592879309756846, "frac_reward_zero_std": 0.0, "grad_norm": 5.227523326873779, "kl": 0.2582387626171112, "learning_rate": 2.7948193592365374e-07, "loss": -0.03739166632294655, "memory(GiB)": 69.16, "reward": 0.40810760855674744, "reward_std": 0.19092680513858795, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.7542100548744202, "rewards/PlanningActionSetORM/std": 0.23057393729686737, "rewards/RMReward/mean": 0.5687500238418579, "rewards/RMReward/std": 0.1688433140516281, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.014496375806629658, "rewards/VisualPerceptionAccuracy/std": 0.05798550322651863, "step": 164, "train_speed(iter/s)": 0.005385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 86.3125, "completions/min_length": 8.0, "epoch": 0.005626982232377314, "frac_reward_zero_std": 0.0, "grad_norm": 9.17982006072998, "kl": 0.6845124363899231, "learning_rate": 2.8118609406952966e-07, "loss": -0.01027902215719223, "memory(GiB)": 69.16, "reward": 0.7658953666687012, "reward_std": 0.24702323973178864, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9183283448219299, "rewards/PlanningActionSetORM/std": 0.12106188386678696, "rewards/RMReward/mean": 0.6578124761581421, "rewards/RMReward/std": 0.1251511126756668, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.3965577781200409, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 165, "train_speed(iter/s)": 0.005392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 117.625, "completions/min_length": 8.0, "epoch": 0.005661085154997783, "frac_reward_zero_std": 0.0, "grad_norm": 6.496171951293945, "kl": 0.32884830236434937, "learning_rate": 2.828902522154056e-07, "loss": 0.025014858692884445, "memory(GiB)": 69.16, "reward": 0.5298558473587036, "reward_std": 0.21835190057754517, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9440103769302368, "rewards/PlanningActionSetORM/std": 0.10302838683128357, "rewards/RMReward/mean": 0.8106249570846558, "rewards/RMReward/std": 0.13969869911670685, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.03856915608048439, "rewards/VisualPerceptionAccuracy/std": 0.15427663922309875, "step": 166, "train_speed(iter/s)": 0.005409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/mean_length": 137.34375, "completions/min_length": 8.0, "epoch": 0.005695188077618252, "frac_reward_zero_std": 0.0, "grad_norm": 6.8118414878845215, "kl": 0.6561443209648132, "learning_rate": 2.845944103612815e-07, "loss": -0.0450773648917675, "memory(GiB)": 69.16, "reward": 0.45078423619270325, "reward_std": 0.20576249063014984, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.84375, "rewards/SpatialReasoningORM/std": 0.3689020276069641, "rewards/VisualPerceptionAccuracy/mean": 0.0500059500336647, "rewards/VisualPerceptionAccuracy/std": 0.10891973227262497, "step": 167, "train_speed(iter/s)": 0.005419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/mean_length": 32.03125, "completions/min_length": 8.0, "epoch": 0.005729291000238721, "frac_reward_zero_std": 0.75, "grad_norm": 0.39817655086517334, "kl": 0.7261168956756592, "learning_rate": 2.862985685071575e-07, "loss": -0.004017374478280544, "memory(GiB)": 69.16, "reward": 0.9446145296096802, "reward_std": 0.028771156445145607, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9072916507720947, "rewards/PlanningActionSetORM/std": 0.06984606385231018, "rewards/RMReward/mean": 0.7462499737739563, "rewards/RMReward/std": 0.12971122562885284, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 168, "train_speed(iter/s)": 0.0054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/mean_length": 133.0, "completions/min_length": 8.0, "epoch": 0.005763393922859189, "frac_reward_zero_std": 0.0, "grad_norm": 4.992537498474121, "kl": 0.2561548948287964, "learning_rate": 2.880027266530334e-07, "loss": 0.04888743907213211, "memory(GiB)": 69.16, "reward": 0.5586552619934082, "reward_std": 0.13205133378505707, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9191405773162842, "rewards/PlanningActionSetORM/std": 0.10496660321950912, "rewards/RMReward/mean": 0.5653125047683716, "rewards/RMReward/std": 0.13302884995937347, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.021839719265699387, "rewards/VisualPerceptionAccuracy/std": 0.07752823829650879, "step": 169, "train_speed(iter/s)": 0.005393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/mean_length": 59.203125, "completions/min_length": 8.0, "epoch": 0.005797496845479658, "frac_reward_zero_std": 0.0, "grad_norm": 10.75657844543457, "kl": 0.7205661535263062, "learning_rate": 2.8970688479890934e-07, "loss": -0.026254402473568916, "memory(GiB)": 69.16, "reward": 0.623471200466156, "reward_std": 0.21868914365768433, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8656498193740845, "rewards/PlanningActionSetORM/std": 0.12481005489826202, "rewards/RMReward/mean": 0.5746875405311584, "rewards/RMReward/std": 0.13697148859500885, "rewards/SpatialReasoningORM/mean": 0.59375, "rewards/SpatialReasoningORM/std": 0.49899089336395264, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 170, "train_speed(iter/s)": 0.005401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/mean_length": 58.921875, "completions/min_length": 8.0, "epoch": 0.005831599768100126, "frac_reward_zero_std": 0.0, "grad_norm": 10.29723072052002, "kl": 0.5884418487548828, "learning_rate": 2.9141104294478527e-07, "loss": -0.024332638829946518, "memory(GiB)": 69.16, "reward": 0.6289867162704468, "reward_std": 0.2412482053041458, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8523674607276917, "rewards/PlanningActionSetORM/std": 0.13972978293895721, "rewards/RMReward/mean": 0.703125, "rewards/RMReward/std": 0.11495966464281082, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 171, "train_speed(iter/s)": 0.005415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/mean_length": 33.9375, "completions/min_length": 8.0, "epoch": 0.005865702690720595, "frac_reward_zero_std": 0.0, "grad_norm": 10.570205688476562, "kl": 0.9806642532348633, "learning_rate": 2.9311520109066125e-07, "loss": -0.004584586247801781, "memory(GiB)": 69.16, "reward": 0.7765215635299683, "reward_std": 0.2728031873703003, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9054315090179443, "rewards/PlanningActionSetORM/std": 0.029183166101574898, "rewards/RMReward/mean": 0.796875, "rewards/RMReward/std": 0.05313112214207649, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.4375949800014496, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 172, "train_speed(iter/s)": 0.005425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 136.28125, "completions/min_length": 14.0, "epoch": 0.005899805613341063, "frac_reward_zero_std": 0.0, "grad_norm": 1.9697086811065674, "kl": 0.00868192594498396, "learning_rate": 2.9481935923653717e-07, "loss": -0.08463746309280396, "memory(GiB)": 69.16, "reward": 0.5250206589698792, "reward_std": 0.14120149612426758, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8936793804168701, "rewards/PlanningActionSetORM/std": 0.12370304018259048, "rewards/RMReward/mean": 0.606041669845581, "rewards/RMReward/std": 0.21189860999584198, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 173, "train_speed(iter/s)": 0.0054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 83.65625, "completions/min_length": 8.0, "epoch": 0.005933908535961532, "frac_reward_zero_std": 0.0, "grad_norm": 9.342227935791016, "kl": 0.6897100806236267, "learning_rate": 2.965235173824131e-07, "loss": -0.021669160574674606, "memory(GiB)": 69.16, "reward": 0.648094654083252, "reward_std": 0.25460970401763916, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.8541666865348816, "rewards/SpatialReasoningORM/std": 0.3566739559173584, "rewards/VisualPerceptionAccuracy/mean": 0.008003435097634792, "rewards/VisualPerceptionAccuracy/std": 0.031599074602127075, "step": 174, "train_speed(iter/s)": 0.005418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 120.21875, "completions/min_length": 60.0, "epoch": 0.005968011458582, "frac_reward_zero_std": 0.0, "grad_norm": 1.3045780658721924, "kl": 0.002908664057031274, "learning_rate": 2.982276755282891e-07, "loss": -0.03146177530288696, "memory(GiB)": 69.16, "reward": 0.6934580206871033, "reward_std": 0.12987008690834045, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9122902154922485, "rewards/PlanningActionSetORM/std": 0.12420065701007843, "rewards/RMReward/mean": 0.6387500166893005, "rewards/RMReward/std": 0.1605101078748703, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 175, "train_speed(iter/s)": 0.005429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/mean_length": 33.859375, "completions/min_length": 8.0, "epoch": 0.006002114381202469, "frac_reward_zero_std": 0.0, "grad_norm": 8.787640571594238, "kl": 0.7979426383972168, "learning_rate": 2.99931833674165e-07, "loss": -0.003080928698182106, "memory(GiB)": 69.16, "reward": 0.6104166507720947, "reward_std": 0.2979092001914978, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8677083253860474, "rewards/PlanningActionSetORM/std": 0.15008871257305145, "rewards/RMReward/mean": 0.6437500715255737, "rewards/RMReward/std": 0.17308476567268372, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5013279914855957, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 176, "train_speed(iter/s)": 0.00544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 70.46875, "completions/min_length": 8.0, "epoch": 0.006036217303822937, "frac_reward_zero_std": 0.0, "grad_norm": 6.595123767852783, "kl": 0.5447622537612915, "learning_rate": 3.0163599182004093e-07, "loss": -0.020993687212467194, "memory(GiB)": 69.16, "reward": 0.8263901472091675, "reward_std": 0.22140565514564514, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8201512694358826, "rewards/PlanningActionSetORM/std": 0.20069177448749542, "rewards/RMReward/mean": 0.7593750357627869, "rewards/RMReward/std": 0.1526314914226532, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.33601075410842896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 177, "train_speed(iter/s)": 0.005446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 92.953125, "completions/min_length": 8.0, "epoch": 0.006070320226443406, "frac_reward_zero_std": 0.0, "grad_norm": 4.250853538513184, "kl": 0.32738515734672546, "learning_rate": 3.0334014996591685e-07, "loss": -0.01740461029112339, "memory(GiB)": 69.16, "reward": 0.719871997833252, "reward_std": 0.20087051391601562, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9126880764961243, "rewards/PlanningActionSetORM/std": 0.11436323821544647, "rewards/RMReward/mean": 0.6291666626930237, "rewards/RMReward/std": 0.17284642159938812, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 178, "train_speed(iter/s)": 0.005446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 231.234375, "completions/min_length": 67.0, "epoch": 0.006104423149063874, "frac_reward_zero_std": 0.0, "grad_norm": 0.6782126426696777, "kl": 0.0022236136719584465, "learning_rate": 3.0504430811179283e-07, "loss": -0.04174081236124039, "memory(GiB)": 69.16, "reward": 0.5721921920776367, "reward_std": 0.13438573479652405, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9415088295936584, "rewards/PlanningActionSetORM/std": 0.09138629585504532, "rewards/RMReward/mean": 0.684374988079071, "rewards/RMReward/std": 0.16604159772396088, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08136340230703354, "rewards/VisualPerceptionAccuracy/std": 0.12601442635059357, "step": 179, "train_speed(iter/s)": 0.005432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 80.0, "completions/min_length": 7.0, "epoch": 0.006138526071684343, "frac_reward_zero_std": 0.0, "grad_norm": 9.893946647644043, "kl": 0.6328333020210266, "learning_rate": 3.0674846625766876e-07, "loss": -0.0012863297015428543, "memory(GiB)": 69.16, "reward": 0.7517447471618652, "reward_std": 0.24061219394207, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9455729126930237, "rewards/PlanningActionSetORM/std": 0.09773382544517517, "rewards/RMReward/mean": 0.6156250238418579, "rewards/RMReward/std": 0.12727762758731842, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.3965577781200409, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 180, "train_speed(iter/s)": 0.005427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/mean_length": 208.953125, "completions/min_length": 8.0, "epoch": 0.0061726289943048115, "frac_reward_zero_std": 1.0, "grad_norm": 0.005839524790644646, "kl": 0.2766643762588501, "learning_rate": 3.084526244035447e-07, "loss": 0.0002762989606708288, "memory(GiB)": 69.16, "reward": 0.5, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 181, "train_speed(iter/s)": 0.005426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 96.9375, "completions/min_length": 8.0, "epoch": 0.0062067319169252805, "frac_reward_zero_std": 0.0, "grad_norm": 4.141970634460449, "kl": 0.3159032166004181, "learning_rate": 3.101567825494206e-07, "loss": -0.023114584386348724, "memory(GiB)": 69.16, "reward": 0.5934720039367676, "reward_std": 0.21021583676338196, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8503450155258179, "rewards/PlanningActionSetORM/std": 0.1511581689119339, "rewards/RMReward/mean": 0.5406249761581421, "rewards/RMReward/std": 0.12406911700963974, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.504016101360321, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 182, "train_speed(iter/s)": 0.005432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/mean_length": 29.234375, "completions/min_length": 8.0, "epoch": 0.006240834839545749, "frac_reward_zero_std": 0.75, "grad_norm": 0.5139399766921997, "kl": 0.4568496346473694, "learning_rate": 3.118609406952966e-07, "loss": -0.03710724413394928, "memory(GiB)": 69.16, "reward": 0.9378385543823242, "reward_std": 0.018710896372795105, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9192708134651184, "rewards/PlanningActionSetORM/std": 0.08801930397748947, "rewards/RMReward/mean": 0.7093750238418579, "rewards/RMReward/std": 0.09868933260440826, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 183, "train_speed(iter/s)": 0.005429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 77.203125, "completions/min_length": 8.0, "epoch": 0.006274937762166218, "frac_reward_zero_std": 0.0, "grad_norm": 10.735126495361328, "kl": 0.7268499732017517, "learning_rate": 3.135650988411725e-07, "loss": -0.01967575214803219, "memory(GiB)": 69.16, "reward": 0.6629718542098999, "reward_std": 0.23567987978458405, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9187809824943542, "rewards/PlanningActionSetORM/std": 0.1315264105796814, "rewards/RMReward/mean": 0.5859375, "rewards/RMReward/std": 0.17562532424926758, "rewards/SpatialReasoningORM/mean": 0.65625, "rewards/SpatialReasoningORM/std": 0.4825586974620819, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 184, "train_speed(iter/s)": 0.005432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/mean_length": 69.671875, "completions/min_length": 7.0, "epoch": 0.006309040684786686, "frac_reward_zero_std": 0.0, "grad_norm": 7.553243637084961, "kl": 0.5884474515914917, "learning_rate": 3.1526925698704844e-07, "loss": -0.01368972659111023, "memory(GiB)": 69.16, "reward": 0.41484248638153076, "reward_std": 0.2143564224243164, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9343625903129578, "rewards/PlanningActionSetORM/std": 0.12234511971473694, "rewards/RMReward/mean": 0.629687488079071, "rewards/RMReward/std": 0.1674569547176361, "rewards/SpatialReasoningORM/mean": 0.09375, "rewards/SpatialReasoningORM/std": 0.2961445748806, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 185, "train_speed(iter/s)": 0.005431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 93.453125, "completions/min_length": 2.0, "epoch": 0.006343143607407155, "frac_reward_zero_std": 0.0, "grad_norm": 13.155986785888672, "kl": 0.00496454443782568, "learning_rate": 3.1697341513292436e-07, "loss": 0.03413597494363785, "memory(GiB)": 69.16, "reward": 0.5092209577560425, "reward_std": 0.17658819258213043, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8682291507720947, "rewards/PlanningActionSetORM/std": 0.0878465324640274, "rewards/RMReward/mean": 0.7699999809265137, "rewards/RMReward/std": 0.09834108501672745, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2287960946559906, "rewards/VisualPerceptionAccuracy/std": 0.416459858417511, "step": 186, "train_speed(iter/s)": 0.005434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 146.25, "completions/min_length": 60.0, "epoch": 0.006377246530027623, "frac_reward_zero_std": 0.0, "grad_norm": 0.6790571212768555, "kl": 0.0037014279514551163, "learning_rate": 3.186775732788003e-07, "loss": -0.03819122910499573, "memory(GiB)": 69.16, "reward": 0.7378286123275757, "reward_std": 0.08800974488258362, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8610180616378784, "rewards/PlanningActionSetORM/std": 0.15536223351955414, "rewards/RMReward/mean": 0.70703125, "rewards/RMReward/std": 0.13565455377101898, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 187, "train_speed(iter/s)": 0.005408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 143.265625, "completions/min_length": 8.0, "epoch": 0.006411349452648092, "frac_reward_zero_std": 0.0, "grad_norm": 7.292414665222168, "kl": 0.4159780442714691, "learning_rate": 3.2038173142467627e-07, "loss": -0.042184263467788696, "memory(GiB)": 69.16, "reward": 0.6117194294929504, "reward_std": 0.2227741926908493, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9212546348571777, "rewards/PlanningActionSetORM/std": 0.13552436232566833, "rewards/RMReward/mean": 0.5952083468437195, "rewards/RMReward/std": 0.16245444118976593, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 188, "train_speed(iter/s)": 0.005387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 149.9375, "completions/min_length": 8.0, "epoch": 0.00644545237526856, "frac_reward_zero_std": 0.0, "grad_norm": 4.064563751220703, "kl": 0.2596573531627655, "learning_rate": 3.220858895705522e-07, "loss": -0.04734981060028076, "memory(GiB)": 69.16, "reward": 0.7989641427993774, "reward_std": 0.1335853785276413, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9287192225456238, "rewards/PlanningActionSetORM/std": 0.11347241699695587, "rewards/RMReward/mean": 0.7074999809265137, "rewards/RMReward/std": 0.20362146198749542, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 189, "train_speed(iter/s)": 0.00537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/mean_length": 137.390625, "completions/min_length": 80.0, "epoch": 0.006479555297889029, "frac_reward_zero_std": 0.0, "grad_norm": 0.9883583784103394, "kl": 0.0026094880886375904, "learning_rate": 3.237900477164281e-07, "loss": 0.001934293657541275, "memory(GiB)": 69.16, "reward": 0.2963825762271881, "reward_std": 0.06225347891449928, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9278955459594727, "rewards/PlanningActionSetORM/std": 0.0915476381778717, "rewards/RMReward/mean": 0.504687488079071, "rewards/RMReward/std": 0.13341625034809113, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0034360310528427362, "rewards/VisualPerceptionAccuracy/std": 0.01633668877184391, "step": 190, "train_speed(iter/s)": 0.005357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/mean_length": 57.140625, "completions/min_length": 8.0, "epoch": 0.006513658220509497, "frac_reward_zero_std": 0.0, "grad_norm": 7.046691417694092, "kl": 0.5943520069122314, "learning_rate": 3.2549420586230404e-07, "loss": 0.05762716382741928, "memory(GiB)": 69.16, "reward": 0.7211681604385376, "reward_std": 0.2630351185798645, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8866815567016602, "rewards/PlanningActionSetORM/std": 0.13928832113742828, "rewards/RMReward/mean": 0.776562511920929, "rewards/RMReward/std": 0.11707756668329239, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.49186936020851135, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 191, "train_speed(iter/s)": 0.005355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.359375, "completions/min_length": 8.0, "epoch": 0.006547761143129966, "frac_reward_zero_std": 0.0, "grad_norm": 13.365184783935547, "kl": 1.1321314573287964, "learning_rate": 3.2719836400818e-07, "loss": -0.012077726423740387, "memory(GiB)": 69.16, "reward": 0.7328125238418579, "reward_std": 0.3621150851249695, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.71875, "rewards/SpatialReasoningORM/std": 0.4531635046005249, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 192, "train_speed(iter/s)": 0.005375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/mean_length": 79.75, "completions/min_length": 8.0, "epoch": 0.006581864065750434, "frac_reward_zero_std": 0.0, "grad_norm": 5.330570697784424, "kl": 0.3362552523612976, "learning_rate": 3.2890252215405595e-07, "loss": -0.007189333438873291, "memory(GiB)": 69.16, "reward": 0.7413281202316284, "reward_std": 0.20359861850738525, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9067708849906921, "rewards/PlanningActionSetORM/std": 0.13381624221801758, "rewards/RMReward/mean": 0.7406250834465027, "rewards/RMReward/std": 0.15425050258636475, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 193, "train_speed(iter/s)": 0.005381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 111.046875, "completions/min_length": 8.0, "epoch": 0.006615966988370903, "frac_reward_zero_std": 0.0, "grad_norm": 3.2301511764526367, "kl": 0.3223208487033844, "learning_rate": 3.3060668029993187e-07, "loss": -0.05367133393883705, "memory(GiB)": 69.16, "reward": 0.7663881182670593, "reward_std": 0.19739612936973572, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8741939663887024, "rewards/PlanningActionSetORM/std": 0.15127263963222504, "rewards/RMReward/mean": 0.5587500333786011, "rewards/RMReward/std": 0.14024747908115387, "rewards/SpatialReasoningORM/mean": 0.90625, "rewards/SpatialReasoningORM/std": 0.2961445748806, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 194, "train_speed(iter/s)": 0.005367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/mean_length": 62.578125, "completions/min_length": 8.0, "epoch": 0.006650069910991372, "frac_reward_zero_std": 0.0, "grad_norm": 7.363446235656738, "kl": 0.6280087828636169, "learning_rate": 3.323108384458078e-07, "loss": 0.01756516471505165, "memory(GiB)": 69.16, "reward": 0.6510208249092102, "reward_std": 0.20789392292499542, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9427083730697632, "rewards/PlanningActionSetORM/std": 0.11683455109596252, "rewards/RMReward/mean": 0.7356249690055847, "rewards/RMReward/std": 0.13210302591323853, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 195, "train_speed(iter/s)": 0.00537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/mean_length": 106.34375, "completions/min_length": 2.0, "epoch": 0.0066841728336118405, "frac_reward_zero_std": 0.0, "grad_norm": 19.627811431884766, "kl": 0.34482523798942566, "learning_rate": 3.340149965916838e-07, "loss": -0.008155794814229012, "memory(GiB)": 69.16, "reward": 0.5155869126319885, "reward_std": 0.23781563341617584, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9921875, "rewards/PlanningActionSetORM/std": 0.03125, "rewards/RMReward/mean": 0.8031249642372131, "rewards/RMReward/std": 0.07630804926156998, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.40758004784584045, "rewards/VisualPerceptionAccuracy/std": 0.4979287385940552, "step": 196, "train_speed(iter/s)": 0.005377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 200.515625, "completions/min_length": 58.0, "epoch": 0.0067182757562323095, "frac_reward_zero_std": 0.0, "grad_norm": 0.805159866809845, "kl": 0.005264517851173878, "learning_rate": 3.357191547375597e-07, "loss": -0.10656531900167465, "memory(GiB)": 69.16, "reward": 0.6983689069747925, "reward_std": 0.13819634914398193, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.864344596862793, "rewards/PlanningActionSetORM/std": 0.17729230225086212, "rewards/RMReward/mean": 0.65687495470047, "rewards/RMReward/std": 0.16702699661254883, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 197, "train_speed(iter/s)": 0.005367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/mean_length": 107.84375, "completions/min_length": 26.0, "epoch": 0.006752378678852778, "frac_reward_zero_std": 0.0, "grad_norm": 0.9216268658638, "kl": 0.007541703525930643, "learning_rate": 3.3742331288343563e-07, "loss": -0.11128046363592148, "memory(GiB)": 69.16, "reward": 0.7505241632461548, "reward_std": 0.15313345193862915, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9219959378242493, "rewards/PlanningActionSetORM/std": 0.13260847330093384, "rewards/RMReward/mean": 0.7076562643051147, "rewards/RMReward/std": 0.21056795120239258, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 198, "train_speed(iter/s)": 0.005367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/mean_length": 130.65625, "completions/min_length": 8.0, "epoch": 0.006786481601473247, "frac_reward_zero_std": 0.0, "grad_norm": 3.73274827003479, "kl": 0.8365906476974487, "learning_rate": 3.391274710293115e-07, "loss": 0.028320278972387314, "memory(GiB)": 69.16, "reward": 0.4442098140716553, "reward_std": 0.16519922018051147, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9729787111282349, "rewards/PlanningActionSetORM/std": 0.04532875865697861, "rewards/RMReward/mean": 0.6124999523162842, "rewards/RMReward/std": 0.11902381479740143, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": 0.04224354773759842, "rewards/VisualPerceptionAccuracy/std": 0.08608395606279373, "step": 199, "train_speed(iter/s)": 0.005373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/mean_length": 186.96875, "completions/min_length": 88.0, "epoch": 0.006820584524093715, "frac_reward_zero_std": 0.0, "grad_norm": 0.8313043117523193, "kl": 0.0034105582162737846, "learning_rate": 3.4083162917518753e-07, "loss": 0.028468186035752296, "memory(GiB)": 69.16, "reward": 0.3549802005290985, "reward_std": 0.0895305871963501, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9312466382980347, "rewards/PlanningActionSetORM/std": 0.12462181597948074, "rewards/RMReward/mean": 0.643750011920929, "rewards/RMReward/std": 0.1763199120759964, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.008711084723472595, "rewards/VisualPerceptionAccuracy/std": 0.03642583265900612, "step": 200, "train_speed(iter/s)": 0.005367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 58.84375, "completions/min_length": 8.0, "epoch": 0.006854687446714184, "frac_reward_zero_std": 0.0, "grad_norm": 6.178959846496582, "kl": 0.5953184962272644, "learning_rate": 3.4253578732106346e-07, "loss": -0.002688649110496044, "memory(GiB)": 69.16, "reward": 0.61265629529953, "reward_std": 0.2434464395046234, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9609375, "rewards/PlanningActionSetORM/std": 0.08536338061094284, "rewards/RMReward/mean": 0.5609375238418579, "rewards/RMReward/std": 0.1389676034450531, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.504016101360321, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 201, "train_speed(iter/s)": 0.005326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/mean_length": 34.4375, "completions/min_length": 8.0, "epoch": 0.006888790369334652, "frac_reward_zero_std": 0.0, "grad_norm": 5.743222236633301, "kl": 0.4876578152179718, "learning_rate": 3.4423994546693933e-07, "loss": 0.12108352780342102, "memory(GiB)": 69.16, "reward": 0.8668750524520874, "reward_std": 0.2318515032529831, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.949999988079071, "rewards/PlanningActionSetORM/std": 0.105277419090271, "rewards/RMReward/mean": 0.643750011920929, "rewards/RMReward/std": 0.14361406862735748, "rewards/SpatialReasoningORM/mean": 0.9166666865348816, "rewards/SpatialReasoningORM/std": 0.2793101966381073, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 202, "train_speed(iter/s)": 0.00533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 147.90625, "completions/min_length": 40.0, "epoch": 0.006922893291955121, "frac_reward_zero_std": 0.0, "grad_norm": 0.6916298270225525, "kl": 0.0042004939168691635, "learning_rate": 3.4594410361281526e-07, "loss": -0.04386992007493973, "memory(GiB)": 69.28, "reward": 0.7617881298065186, "reward_std": 0.13580533862113953, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9370653629302979, "rewards/PlanningActionSetORM/std": 0.11971775442361832, "rewards/RMReward/mean": 0.7179687023162842, "rewards/RMReward/std": 0.1755642145872116, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 203, "train_speed(iter/s)": 0.005319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 162.28125, "completions/min_length": 81.0, "epoch": 0.006956996214575589, "frac_reward_zero_std": 0.0, "grad_norm": 0.7131001353263855, "kl": 0.005141849163919687, "learning_rate": 3.476482617586913e-07, "loss": 0.023694273084402084, "memory(GiB)": 69.28, "reward": 0.5580418109893799, "reward_std": 0.13409948348999023, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9622602462768555, "rewards/PlanningActionSetORM/std": 0.09106422960758209, "rewards/RMReward/mean": 0.6677083969116211, "rewards/RMReward/std": 0.19446861743927002, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.052311044186353683, "rewards/VisualPerceptionAccuracy/std": 0.17688602209091187, "step": 204, "train_speed(iter/s)": 0.005321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 142.59375, "completions/min_length": 83.0, "epoch": 0.006991099137196058, "frac_reward_zero_std": 0.0, "grad_norm": 0.9544890522956848, "kl": 0.007013922557234764, "learning_rate": 3.4935241990456716e-07, "loss": 0.0027881572023034096, "memory(GiB)": 69.28, "reward": 0.6670235395431519, "reward_std": 0.1188739687204361, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9463675022125244, "rewards/PlanningActionSetORM/std": 0.11277472227811813, "rewards/RMReward/mean": 0.5971875190734863, "rewards/RMReward/std": 0.17266784608364105, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 205, "train_speed(iter/s)": 0.005308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 87.84375, "completions/min_length": 14.0, "epoch": 0.007025202059816526, "frac_reward_zero_std": 0.0, "grad_norm": 1.3848472833633423, "kl": 0.009910574182868004, "learning_rate": 3.510565780504431e-07, "loss": -0.0063152071088552475, "memory(GiB)": 69.28, "reward": 0.6065675020217896, "reward_std": 0.12346380949020386, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9698247313499451, "rewards/PlanningActionSetORM/std": 0.1046280711889267, "rewards/RMReward/mean": 0.7229166626930237, "rewards/RMReward/std": 0.18304699659347534, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 206, "train_speed(iter/s)": 0.005295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/mean_length": 104.609375, "completions/min_length": 7.0, "epoch": 0.007059304982436995, "frac_reward_zero_std": 0.0, "grad_norm": 8.785104751586914, "kl": 0.43634742498397827, "learning_rate": 3.52760736196319e-07, "loss": 0.09845670312643051, "memory(GiB)": 69.28, "reward": 0.6928083300590515, "reward_std": 0.24180279672145844, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.92620849609375, "rewards/PlanningActionSetORM/std": 0.13820505142211914, "rewards/RMReward/mean": 0.6215624809265137, "rewards/RMReward/std": 0.2001791000366211, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4709290862083435, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 207, "train_speed(iter/s)": 0.005272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 69.34375, "completions/min_length": 8.0, "epoch": 0.007093407905057463, "frac_reward_zero_std": 0.0, "grad_norm": 5.974292755126953, "kl": 0.6290959119796753, "learning_rate": 3.54464894342195e-07, "loss": -0.005746757611632347, "memory(GiB)": 69.28, "reward": 0.6350666284561157, "reward_std": 0.22671952843666077, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.967540979385376, "rewards/PlanningActionSetORM/std": 0.07999540865421295, "rewards/RMReward/mean": 0.6153124570846558, "rewards/RMReward/std": 0.1887648105621338, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.504016101360321, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 208, "train_speed(iter/s)": 0.005272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 122.1875, "completions/min_length": 8.0, "epoch": 0.007127510827677932, "frac_reward_zero_std": 0.0, "grad_norm": 4.909434795379639, "kl": 0.4153672754764557, "learning_rate": 3.561690524880709e-07, "loss": -0.01072942279279232, "memory(GiB)": 69.28, "reward": 0.5610967874526978, "reward_std": 0.14652635157108307, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.867559552192688, "rewards/PlanningActionSetORM/std": 0.18980994820594788, "rewards/RMReward/mean": 0.590624988079071, "rewards/RMReward/std": 0.1829143464565277, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.011738221161067486, "rewards/VisualPerceptionAccuracy/std": 0.04695288464426994, "step": 209, "train_speed(iter/s)": 0.005271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 130.109375, "completions/min_length": 8.0, "epoch": 0.0071616137502984005, "frac_reward_zero_std": 0.0, "grad_norm": 9.89221477508545, "kl": 0.3129643499851227, "learning_rate": 3.5787321063394684e-07, "loss": -0.04067559540271759, "memory(GiB)": 69.28, "reward": 0.647314727306366, "reward_std": 0.20717862248420715, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9398065209388733, "rewards/PlanningActionSetORM/std": 0.09055726230144501, "rewards/RMReward/mean": 0.6004166603088379, "rewards/RMReward/std": 0.15428569912910461, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 210, "train_speed(iter/s)": 0.005256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/mean_length": 107.0625, "completions/min_length": 8.0, "epoch": 0.0071957166729188695, "frac_reward_zero_std": 0.0, "grad_norm": 12.331053733825684, "kl": 0.38998791575431824, "learning_rate": 3.5957736877982276e-07, "loss": -0.01946946606040001, "memory(GiB)": 69.28, "reward": 0.699527382850647, "reward_std": 0.19501237571239471, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9551823735237122, "rewards/PlanningActionSetORM/std": 0.07945264875888824, "rewards/RMReward/mean": 0.7083333134651184, "rewards/RMReward/std": 0.16800877451896667, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 211, "train_speed(iter/s)": 0.005255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 90.390625, "completions/min_length": 8.0, "epoch": 0.007229819595539338, "frac_reward_zero_std": 0.0, "grad_norm": 4.3919267654418945, "kl": 0.683586061000824, "learning_rate": 3.6128152692569874e-07, "loss": 0.007636832073330879, "memory(GiB)": 69.28, "reward": 0.5907508134841919, "reward_std": 0.2686261534690857, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9228203296661377, "rewards/PlanningActionSetORM/std": 0.1330711394548416, "rewards/RMReward/mean": 0.5528125166893005, "rewards/RMReward/std": 0.1597222238779068, "rewards/SpatialReasoningORM/mean": 0.53125, "rewards/SpatialReasoningORM/std": 0.507007360458374, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 212, "train_speed(iter/s)": 0.005253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/mean_length": 91.78125, "completions/min_length": 14.0, "epoch": 0.007263922518159807, "frac_reward_zero_std": 0.0, "grad_norm": 3.6273927688598633, "kl": 0.009324407204985619, "learning_rate": 3.6298568507157467e-07, "loss": -0.0013618804514408112, "memory(GiB)": 69.28, "reward": 0.723766565322876, "reward_std": 0.1944366842508316, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9178187847137451, "rewards/PlanningActionSetORM/std": 0.08353325724601746, "rewards/RMReward/mean": 0.7333332896232605, "rewards/RMReward/std": 0.12348371744155884, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 213, "train_speed(iter/s)": 0.005253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 109.765625, "completions/min_length": 8.0, "epoch": 0.007298025440780275, "frac_reward_zero_std": 0.0, "grad_norm": 7.059411525726318, "kl": 0.25508370995521545, "learning_rate": 3.646898432174506e-07, "loss": -0.057797763496637344, "memory(GiB)": 69.28, "reward": 0.6610580682754517, "reward_std": 0.20696191489696503, "rewards/MathAnswerFormat/mean": 0.9375, "rewards/MathAnswerFormat/std": 0.25, "rewards/PlanningActionSetORM/mean": 0.9214285016059875, "rewards/PlanningActionSetORM/std": 0.10846532881259918, "rewards/RMReward/mean": 0.7529166340827942, "rewards/RMReward/std": 0.18114244937896729, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 214, "train_speed(iter/s)": 0.005234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 102.296875, "completions/min_length": 2.0, "epoch": 0.007332128363400744, "frac_reward_zero_std": 0.0, "grad_norm": 15.478890419006348, "kl": 0.010225590318441391, "learning_rate": 3.663940013633265e-07, "loss": 0.048641737550497055, "memory(GiB)": 69.28, "reward": 0.5291926860809326, "reward_std": 0.23411825299263, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.879873514175415, "rewards/PlanningActionSetORM/std": 0.12385344505310059, "rewards/RMReward/mean": 0.6062500476837158, "rewards/RMReward/std": 0.13400870561599731, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.2575855553150177, "rewards/VisualPerceptionAccuracy/std": 0.4374730587005615, "step": 215, "train_speed(iter/s)": 0.005246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 69.140625, "completions/min_length": 8.0, "epoch": 0.007366231286021212, "frac_reward_zero_std": 0.0, "grad_norm": 7.610004901885986, "kl": 0.6099122762680054, "learning_rate": 3.680981595092025e-07, "loss": -0.009249292314052582, "memory(GiB)": 69.28, "reward": 0.6112061738967896, "reward_std": 0.22109943628311157, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9633116722106934, "rewards/PlanningActionSetORM/std": 0.082268126308918, "rewards/RMReward/mean": 0.6309375166893005, "rewards/RMReward/std": 0.1444089114665985, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 216, "train_speed(iter/s)": 0.00525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 203.484375, "completions/min_length": 8.0, "epoch": 0.007400334208641681, "frac_reward_zero_std": 0.0, "grad_norm": 6.776693344116211, "kl": 0.23712581396102905, "learning_rate": 3.698023176550784e-07, "loss": -0.0435311421751976, "memory(GiB)": 69.28, "reward": 0.6123303174972534, "reward_std": 0.1071690246462822, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9288039207458496, "rewards/PlanningActionSetORM/std": 0.1528744399547577, "rewards/RMReward/mean": 0.7106249928474426, "rewards/RMReward/std": 0.24083103239536285, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.00017476726497989148, "rewards/VisualPerceptionAccuracy/std": 0.0006990690599195659, "step": 217, "train_speed(iter/s)": 0.005231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 91.6875, "completions/min_length": 8.0, "epoch": 0.007434437131262149, "frac_reward_zero_std": 0.0, "grad_norm": 8.828535079956055, "kl": 0.30984270572662354, "learning_rate": 3.7150647580095435e-07, "loss": -0.029144639149308205, "memory(GiB)": 69.28, "reward": 0.8500424027442932, "reward_std": 0.19928483664989471, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8907365798950195, "rewards/PlanningActionSetORM/std": 0.15290799736976624, "rewards/RMReward/mean": 0.7637499570846558, "rewards/RMReward/std": 0.156220480799675, "rewards/SpatialReasoningORM/mean": 0.90625, "rewards/SpatialReasoningORM/std": 0.2961445748806, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 218, "train_speed(iter/s)": 0.005223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 128.953125, "completions/min_length": 8.0, "epoch": 0.007468540053882618, "frac_reward_zero_std": 0.0, "grad_norm": 4.050131797790527, "kl": 0.30734479427337646, "learning_rate": 3.732106339468303e-07, "loss": -0.026400156319141388, "memory(GiB)": 69.28, "reward": 0.582898736000061, "reward_std": 0.10456105321645737, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9313616752624512, "rewards/PlanningActionSetORM/std": 0.11135704815387726, "rewards/RMReward/mean": 0.6324999928474426, "rewards/RMReward/std": 0.19599208235740662, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.006425259634852409, "rewards/VisualPerceptionAccuracy/std": 0.025701040402054787, "step": 219, "train_speed(iter/s)": 0.005227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 99.734375, "completions/min_length": 8.0, "epoch": 0.007502642976503086, "frac_reward_zero_std": 0.0, "grad_norm": 5.6640801429748535, "kl": 0.7184401154518127, "learning_rate": 3.7491479209270625e-07, "loss": 0.02098626084625721, "memory(GiB)": 69.28, "reward": 0.6265893578529358, "reward_std": 0.18242597579956055, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9663194417953491, "rewards/PlanningActionSetORM/std": 0.10062497854232788, "rewards/RMReward/mean": 0.6656249761581421, "rewards/RMReward/std": 0.09437292814254761, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.33601075410842896, "rewards/VisualPerceptionAccuracy/mean": 0.01809350773692131, "rewards/VisualPerceptionAccuracy/std": 0.03240010514855385, "step": 220, "train_speed(iter/s)": 0.005228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 125.953125, "completions/min_length": 8.0, "epoch": 0.007536745899123555, "frac_reward_zero_std": 0.0, "grad_norm": 8.172256469726562, "kl": 0.273044228553772, "learning_rate": 3.766189502385822e-07, "loss": -0.05021955072879791, "memory(GiB)": 69.28, "reward": 0.6816059350967407, "reward_std": 0.2170472890138626, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9273726940155029, "rewards/PlanningActionSetORM/std": 0.15070025622844696, "rewards/RMReward/mean": 0.6854166984558105, "rewards/RMReward/std": 0.17164979875087738, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 221, "train_speed(iter/s)": 0.005208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 121.203125, "completions/min_length": 58.0, "epoch": 0.007570848821744023, "frac_reward_zero_std": 0.0, "grad_norm": 0.7865909337997437, "kl": 0.0083763487637043, "learning_rate": 3.783231083844581e-07, "loss": 0.02216080017387867, "memory(GiB)": 69.28, "reward": 0.5965223908424377, "reward_std": 0.07238127291202545, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9123263359069824, "rewards/PlanningActionSetORM/std": 0.10919582843780518, "rewards/RMReward/mean": 0.7658333778381348, "rewards/RMReward/std": 0.13195475935935974, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0006936726858839393, "rewards/VisualPerceptionAccuracy/std": 0.0009304965496994555, "step": 222, "train_speed(iter/s)": 0.005211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 120.078125, "completions/min_length": 75.0, "epoch": 0.007604951744364492, "frac_reward_zero_std": 0.0, "grad_norm": 0.6727388501167297, "kl": 0.01016615703701973, "learning_rate": 3.8002726653033403e-07, "loss": 0.03958651050925255, "memory(GiB)": 69.28, "reward": 0.7591222524642944, "reward_std": 0.10822977870702744, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9868613481521606, "rewards/PlanningActionSetORM/std": 0.05512888729572296, "rewards/RMReward/mean": 0.7021874785423279, "rewards/RMReward/std": 0.1601732075214386, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 223, "train_speed(iter/s)": 0.005211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/mean_length": 92.125, "completions/min_length": 8.0, "epoch": 0.0076390546669849605, "frac_reward_zero_std": 0.0, "grad_norm": 5.771301746368408, "kl": 0.2199469804763794, "learning_rate": 3.8173142467621e-07, "loss": -0.0263049453496933, "memory(GiB)": 69.28, "reward": 0.6507604122161865, "reward_std": 0.1311582624912262, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9663193821907043, "rewards/PlanningActionSetORM/std": 0.07781286537647247, "rewards/RMReward/mean": 0.7727083563804626, "rewards/RMReward/std": 0.10771790891885757, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 224, "train_speed(iter/s)": 0.005207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/mean_length": 60.609375, "completions/min_length": 7.0, "epoch": 0.0076731575896054295, "frac_reward_zero_std": 0.0, "grad_norm": 6.7911458015441895, "kl": 0.5972501039505005, "learning_rate": 3.8343558282208593e-07, "loss": 0.021959567442536354, "memory(GiB)": 69.28, "reward": 0.5583720207214355, "reward_std": 0.20144358277320862, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9084077477455139, "rewards/PlanningActionSetORM/std": 0.09061753004789352, "rewards/RMReward/mean": 0.5496875047683716, "rewards/RMReward/std": 0.1498490422964096, "rewards/SpatialReasoningORM/mean": 0.46875, "rewards/SpatialReasoningORM/std": 0.507007360458374, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 225, "train_speed(iter/s)": 0.005215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/mean_length": 90.078125, "completions/min_length": 8.0, "epoch": 0.007707260512225898, "frac_reward_zero_std": 0.5, "grad_norm": 5.823115348815918, "kl": 0.32558977603912354, "learning_rate": 3.8513974096796186e-07, "loss": 0.0026064671110361814, "memory(GiB)": 69.28, "reward": 0.7107812762260437, "reward_std": 0.07527834922075272, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.04472137242555618, "rewards/RMReward/mean": 0.8843749761581421, "rewards/RMReward/std": 0.08310385048389435, "rewards/SpatialReasoningORM/mean": 0.96875, "rewards/SpatialReasoningORM/std": 0.1767766922712326, "rewards/VisualPerceptionAccuracy/mean": 0.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 226, "train_speed(iter/s)": 0.005206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 48.03125, "completions/min_length": 8.0, "epoch": 0.007741363434846367, "frac_reward_zero_std": 0.0, "grad_norm": 11.55721664428711, "kl": 0.7346694469451904, "learning_rate": 3.868438991138378e-07, "loss": 0.013985388912260532, "memory(GiB)": 69.28, "reward": 0.5069271326065063, "reward_std": 0.28285855054855347, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9697916507720947, "rewards/PlanningActionSetORM/std": 0.10077821463346481, "rewards/RMReward/mean": 0.768750011920929, "rewards/RMReward/std": 0.10626225918531418, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.48924607038497925, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 227, "train_speed(iter/s)": 0.005216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 172.921875, "completions/min_length": 60.0, "epoch": 0.007775466357466835, "frac_reward_zero_std": 0.0, "grad_norm": 0.7958896160125732, "kl": 0.009277289733290672, "learning_rate": 3.8854805725971376e-07, "loss": -0.043639495968818665, "memory(GiB)": 69.28, "reward": 0.7938258647918701, "reward_std": 0.08940669894218445, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9503793716430664, "rewards/PlanningActionSetORM/std": 0.10037101805210114, "rewards/RMReward/mean": 0.7546874284744263, "rewards/RMReward/std": 0.1376442313194275, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 228, "train_speed(iter/s)": 0.005208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.125, "completions/min_length": 7.0, "epoch": 0.007809569280087304, "frac_reward_zero_std": 1.0, "grad_norm": 0.04397943615913391, "kl": 0.8870994448661804, "learning_rate": 3.902522154055897e-07, "loss": 0.0008879292290657759, "memory(GiB)": 69.28, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 229, "train_speed(iter/s)": 0.005206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/mean_length": 92.15625, "completions/min_length": 7.0, "epoch": 0.007843672202707772, "frac_reward_zero_std": 0.0, "grad_norm": 6.178118705749512, "kl": 0.4663146436214447, "learning_rate": 3.919563735514656e-07, "loss": -0.11059492081403732, "memory(GiB)": 69.28, "reward": 0.4272496998310089, "reward_std": 0.09344589710235596, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9739583134651184, "rewards/PlanningActionSetORM/std": 0.0641581118106842, "rewards/RMReward/mean": 0.7562500238418579, "rewards/RMReward/std": 0.0913606807589531, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 4.047831680509262e-05, "rewards/VisualPerceptionAccuracy/std": 0.00012068884825566784, "step": 230, "train_speed(iter/s)": 0.005206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 73.703125, "completions/min_length": 8.0, "epoch": 0.007877775125328241, "frac_reward_zero_std": 0.0, "grad_norm": 12.345176696777344, "kl": 0.5990958213806152, "learning_rate": 3.9366053169734154e-07, "loss": -0.030909733846783638, "memory(GiB)": 69.28, "reward": 0.7607812881469727, "reward_std": 0.2579383850097656, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9593750238418579, "rewards/PlanningActionSetORM/std": 0.0682523176074028, "rewards/RMReward/mean": 0.671875, "rewards/RMReward/std": 0.143649160861969, "rewards/SpatialReasoningORM/mean": 0.78125, "rewards/SpatialReasoningORM/std": 0.420013427734375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 231, "train_speed(iter/s)": 0.005208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 109.546875, "completions/min_length": 2.0, "epoch": 0.00791187804794871, "frac_reward_zero_std": 0.0, "grad_norm": 1.8764485120773315, "kl": 0.008845668286085129, "learning_rate": 3.953646898432175e-07, "loss": -0.012351909652352333, "memory(GiB)": 69.28, "reward": 0.7758600115776062, "reward_std": 0.13251034915447235, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9015665650367737, "rewards/PlanningActionSetORM/std": 0.14499592781066895, "rewards/RMReward/mean": 0.6770833134651184, "rewards/RMReward/std": 0.11982183158397675, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.9375, "rewards/VisualPerceptionAccuracy/std": 0.25, "step": 232, "train_speed(iter/s)": 0.005207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 104.96875, "completions/min_length": 8.0, "epoch": 0.007945980970569177, "frac_reward_zero_std": 0.75, "grad_norm": 0.29413503408432007, "kl": 0.4485989511013031, "learning_rate": 3.9706884798909344e-07, "loss": -0.03107903152704239, "memory(GiB)": 69.28, "reward": 0.5018163919448853, "reward_std": 0.007265475578606129, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.003632738022133708, "rewards/VisualPerceptionAccuracy/std": 0.020549871027469635, "step": 233, "train_speed(iter/s)": 0.005214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 98.109375, "completions/min_length": 8.0, "epoch": 0.007980083893189646, "frac_reward_zero_std": 0.0, "grad_norm": 4.762686729431152, "kl": 0.382493257522583, "learning_rate": 3.9877300613496937e-07, "loss": -0.012126797810196877, "memory(GiB)": 69.28, "reward": 0.6235941052436829, "reward_std": 0.2162722647190094, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9676339626312256, "rewards/PlanningActionSetORM/std": 0.09807214140892029, "rewards/RMReward/mean": 0.640625, "rewards/RMReward/std": 0.17341546714305878, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.33601075410842896, "rewards/VisualPerceptionAccuracy/mean": 0.0258496031165123, "rewards/VisualPerceptionAccuracy/std": 0.1033984124660492, "step": 234, "train_speed(iter/s)": 0.005223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 188.125, "completions/min_length": 56.0, "epoch": 0.008014186815810115, "frac_reward_zero_std": 0.0, "grad_norm": 0.7816046476364136, "kl": 0.016193676739931107, "learning_rate": 4.0047716428084524e-07, "loss": -0.11705546081066132, "memory(GiB)": 69.45, "reward": 0.651739239692688, "reward_std": 0.1307329684495926, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9655709266662598, "rewards/PlanningActionSetORM/std": 0.10397889465093613, "rewards/RMReward/mean": 0.5732812285423279, "rewards/RMReward/std": 0.20802971720695496, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 235, "train_speed(iter/s)": 0.0052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/mean_length": 168.234375, "completions/min_length": 14.0, "epoch": 0.008048289738430584, "frac_reward_zero_std": 0.0, "grad_norm": 3.4439544677734375, "kl": 0.005977318622171879, "learning_rate": 4.0218132242672127e-07, "loss": -0.014115801081061363, "memory(GiB)": 69.45, "reward": 0.21736353635787964, "reward_std": 0.13396082818508148, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.637499988079071, "rewards/RMReward/std": 0.14776107668876648, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.025039581581950188, "rewards/VisualPerceptionAccuracy/std": 0.11015941947698593, "step": 236, "train_speed(iter/s)": 0.005207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/mean_length": 94.390625, "completions/min_length": 8.0, "epoch": 0.008082392661051051, "frac_reward_zero_std": 0.0, "grad_norm": 7.721702575683594, "kl": 0.44148311018943787, "learning_rate": 4.038854805725972e-07, "loss": -0.029559914022684097, "memory(GiB)": 69.45, "reward": 0.6364398002624512, "reward_std": 0.19366376101970673, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9356398582458496, "rewards/PlanningActionSetORM/std": 0.10196968168020248, "rewards/RMReward/mean": 0.6822916865348816, "rewards/RMReward/std": 0.1987968236207962, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 237, "train_speed(iter/s)": 0.0052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 107.71875, "completions/min_length": 8.0, "epoch": 0.00811649558367152, "frac_reward_zero_std": 0.0, "grad_norm": 3.547671318054199, "kl": 0.29646924138069153, "learning_rate": 4.0558963871847307e-07, "loss": -0.0006960779428482056, "memory(GiB)": 69.45, "reward": 0.7995052337646484, "reward_std": 0.1359790563583374, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9581596851348877, "rewards/PlanningActionSetORM/std": 0.06414765864610672, "rewards/RMReward/mean": 0.7010416984558105, "rewards/RMReward/std": 0.15827849507331848, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 238, "train_speed(iter/s)": 0.005198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/mean_length": 57.984375, "completions/min_length": 8.0, "epoch": 0.00815059850629199, "frac_reward_zero_std": 0.0, "grad_norm": 3.829479455947876, "kl": 0.2986622154712677, "learning_rate": 4.07293796864349e-07, "loss": 0.0041629960760474205, "memory(GiB)": 69.45, "reward": 0.5340312719345093, "reward_std": 0.2093784213066101, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.984375, "rewards/PlanningActionSetORM/std": 0.0883883461356163, "rewards/RMReward/mean": 0.8409374952316284, "rewards/RMReward/std": 0.1013253852725029, "rewards/SpatialReasoningORM/mean": 0.15625, "rewards/SpatialReasoningORM/std": 0.3689020276069641, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 239, "train_speed(iter/s)": 0.005197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/mean_length": 233.78125, "completions/min_length": 81.0, "epoch": 0.008184701428912458, "frac_reward_zero_std": 0.0, "grad_norm": 0.7020107507705688, "kl": 0.013734212145209312, "learning_rate": 4.0899795501022503e-07, "loss": -0.02836659364402294, "memory(GiB)": 69.45, "reward": 0.5589096546173096, "reward_std": 0.11687156558036804, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9831597208976746, "rewards/PlanningActionSetORM/std": 0.06208168342709541, "rewards/RMReward/mean": 0.6552083492279053, "rewards/RMReward/std": 0.1831287145614624, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07324280589818954, "rewards/VisualPerceptionAccuracy/std": 0.2018054574728012, "step": 240, "train_speed(iter/s)": 0.005194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 146.078125, "completions/min_length": 70.0, "epoch": 0.008218804351532926, "frac_reward_zero_std": 0.0, "grad_norm": 0.767279326915741, "kl": 0.014979475177824497, "learning_rate": 4.107021131561009e-07, "loss": -0.013839060440659523, "memory(GiB)": 69.45, "reward": 0.7523542046546936, "reward_std": 0.08791854232549667, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9973958134651184, "rewards/PlanningActionSetORM/std": 0.020833337679505348, "rewards/RMReward/mean": 0.6910937428474426, "rewards/RMReward/std": 0.1348499208688736, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 241, "train_speed(iter/s)": 0.005191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 192.265625, "completions/min_length": 8.0, "epoch": 0.008252907274153395, "frac_reward_zero_std": 0.0, "grad_norm": 8.428912162780762, "kl": 0.14362700283527374, "learning_rate": 4.124062713019768e-07, "loss": -0.006913911551237106, "memory(GiB)": 69.45, "reward": 0.2996704876422882, "reward_std": 0.15823884308338165, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7124999761581421, "rewards/RMReward/std": 0.11618950217962265, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.01121596060693264, "rewards/VisualPerceptionAccuracy/std": 0.03367912769317627, "step": 242, "train_speed(iter/s)": 0.005195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 113.875, "completions/min_length": 14.0, "epoch": 0.008287010196773864, "frac_reward_zero_std": 0.0, "grad_norm": 2.018444061279297, "kl": 0.016467489302158356, "learning_rate": 4.1411042944785275e-07, "loss": -0.03564276173710823, "memory(GiB)": 69.45, "reward": 0.7458925247192383, "reward_std": 0.1321468949317932, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9724082946777344, "rewards/PlanningActionSetORM/std": 0.06942732632160187, "rewards/RMReward/mean": 0.6081250309944153, "rewards/RMReward/std": 0.14432021975517273, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 243, "train_speed(iter/s)": 0.005195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/mean_length": 167.640625, "completions/min_length": 14.0, "epoch": 0.008321113119394333, "frac_reward_zero_std": 0.0, "grad_norm": 6.608267784118652, "kl": 0.006615589372813702, "learning_rate": 4.1581458759372873e-07, "loss": -0.007036179304122925, "memory(GiB)": 69.45, "reward": 0.5565733313560486, "reward_std": 0.17555129528045654, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.614062488079071, "rewards/RMReward/std": 0.20526708662509918, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.02191835269331932, "rewards/VisualPerceptionAccuracy/std": 0.052395548671483994, "step": 244, "train_speed(iter/s)": 0.005197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 12.171875, "completions/min_length": 8.0, "epoch": 0.0083552160420148, "frac_reward_zero_std": 0.5, "grad_norm": 4.188677787780762, "kl": 0.45059841871261597, "learning_rate": 4.1751874573960465e-07, "loss": 0.020232608541846275, "memory(GiB)": 69.45, "reward": 0.940625011920929, "reward_std": 0.1551143079996109, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.24397502839565277, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 245, "train_speed(iter/s)": 0.005198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 141.46875, "completions/min_length": 8.0, "epoch": 0.008389318964635269, "frac_reward_zero_std": 0.0, "grad_norm": 10.431602478027344, "kl": 0.37717893719673157, "learning_rate": 4.192229038854806e-07, "loss": 0.03874437138438225, "memory(GiB)": 69.45, "reward": 0.5519782304763794, "reward_std": 0.20502831041812897, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.95703125, "rewards/PlanningActionSetORM/std": 0.101182721555233, "rewards/RMReward/mean": 0.8040624856948853, "rewards/RMReward/std": 0.1523124873638153, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.07297544181346893, "rewards/VisualPerceptionAccuracy/std": 0.1035553365945816, "step": 246, "train_speed(iter/s)": 0.005195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/mean_length": 134.9375, "completions/min_length": 15.0, "epoch": 0.008423421887255738, "frac_reward_zero_std": 0.0, "grad_norm": 2.202167510986328, "kl": 0.013605264015495777, "learning_rate": 4.209270620313565e-07, "loss": -0.049352049827575684, "memory(GiB)": 69.45, "reward": 0.77053302526474, "reward_std": 0.15863949060440063, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9712616801261902, "rewards/PlanningActionSetORM/std": 0.06588948518037796, "rewards/RMReward/mean": 0.6989583373069763, "rewards/RMReward/std": 0.10890635848045349, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 247, "train_speed(iter/s)": 0.005197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/mean_length": 75.15625, "completions/min_length": 7.0, "epoch": 0.008457524809876207, "frac_reward_zero_std": 0.0, "grad_norm": 4.52064847946167, "kl": 0.5355660319328308, "learning_rate": 4.226312201772325e-07, "loss": -0.05371341109275818, "memory(GiB)": 69.45, "reward": 0.7977864742279053, "reward_std": 0.1273173838853836, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9467013478279114, "rewards/PlanningActionSetORM/std": 0.06495451182126999, "rewards/RMReward/mean": 0.7010416984558105, "rewards/RMReward/std": 0.14122924208641052, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 248, "train_speed(iter/s)": 0.005196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/mean_length": 30.46875, "completions/min_length": 8.0, "epoch": 0.008491627732496674, "frac_reward_zero_std": 0.0, "grad_norm": 7.662796974182129, "kl": 0.27601659297943115, "learning_rate": 4.243353783231084e-07, "loss": -0.020969191566109657, "memory(GiB)": 69.45, "reward": 0.6765625476837158, "reward_std": 0.293831467628479, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.71875, "rewards/RMReward/std": 0.07274384051561356, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.48924607038497925, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 249, "train_speed(iter/s)": 0.005208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 98.125, "completions/min_length": 14.0, "epoch": 0.008525730655117143, "frac_reward_zero_std": 0.0, "grad_norm": 5.607728004455566, "kl": 0.016202952712774277, "learning_rate": 4.2603953646898434e-07, "loss": -0.01952899806201458, "memory(GiB)": 69.45, "reward": 0.6907082796096802, "reward_std": 0.21184471249580383, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9909722208976746, "rewards/PlanningActionSetORM/std": 0.03621552884578705, "rewards/RMReward/mean": 0.7341666221618652, "rewards/RMReward/std": 0.16441817581653595, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 250, "train_speed(iter/s)": 0.005207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/mean_length": 106.8125, "completions/min_length": 8.0, "epoch": 0.008559833577737612, "frac_reward_zero_std": 0.0, "grad_norm": 8.809422492980957, "kl": 0.37440425157546997, "learning_rate": 4.2774369461486026e-07, "loss": -0.00542757473886013, "memory(GiB)": 69.45, "reward": 0.616969883441925, "reward_std": 0.2100895643234253, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9141740798950195, "rewards/PlanningActionSetORM/std": 0.10066714137792587, "rewards/RMReward/mean": 0.6552082896232605, "rewards/RMReward/std": 0.15096835792064667, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 251, "train_speed(iter/s)": 0.005209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/mean_length": 97.40625, "completions/min_length": 14.0, "epoch": 0.008593936500358081, "frac_reward_zero_std": 0.0, "grad_norm": 3.467480421066284, "kl": 0.005570763722062111, "learning_rate": 4.2944785276073624e-07, "loss": 0.03706555813550949, "memory(GiB)": 69.45, "reward": 0.5958994626998901, "reward_std": 0.1637095808982849, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9791666865348816, "rewards/PlanningActionSetORM/std": 0.08980264514684677, "rewards/RMReward/mean": 0.8390624523162842, "rewards/RMReward/std": 0.1097903922200203, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.005681083537638187, "rewards/VisualPerceptionAccuracy/std": 0.02272433415055275, "step": 252, "train_speed(iter/s)": 0.005213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 144.703125, "completions/min_length": 14.0, "epoch": 0.008628039422978549, "frac_reward_zero_std": 0.0, "grad_norm": 3.0703794956207275, "kl": 0.009776528924703598, "learning_rate": 4.3115201090661216e-07, "loss": -0.060745470225811005, "memory(GiB)": 69.45, "reward": 0.42115265130996704, "reward_std": 0.21985137462615967, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9473812580108643, "rewards/PlanningActionSetORM/std": 0.12169716507196426, "rewards/RMReward/mean": 0.5268750190734863, "rewards/RMReward/std": 0.15516524016857147, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": 0.023634394630789757, "rewards/VisualPerceptionAccuracy/std": 0.09453757852315903, "step": 253, "train_speed(iter/s)": 0.005204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/mean_length": 112.859375, "completions/min_length": 8.0, "epoch": 0.008662142345599018, "frac_reward_zero_std": 0.0, "grad_norm": 2.6432182788848877, "kl": 0.17648418247699738, "learning_rate": 4.328561690524881e-07, "loss": 0.006441940553486347, "memory(GiB)": 69.45, "reward": 0.7703300714492798, "reward_std": 0.16533416509628296, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.985325813293457, "rewards/PlanningActionSetORM/std": 0.03738659620285034, "rewards/RMReward/mean": 0.6456250548362732, "rewards/RMReward/std": 0.18025875091552734, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 254, "train_speed(iter/s)": 0.005201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/mean_length": 239.984375, "completions/min_length": 90.0, "epoch": 0.008696245268219487, "frac_reward_zero_std": 0.0, "grad_norm": 0.9172453284263611, "kl": 0.0034908954985439777, "learning_rate": 4.34560327198364e-07, "loss": 0.0013046814128756523, "memory(GiB)": 69.45, "reward": 0.24957413971424103, "reward_std": 0.04077339544892311, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.04472137242555618, "rewards/RMReward/mean": 0.9437500238418579, "rewards/RMReward/std": 0.06946222484111786, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.016098862513899803, "rewards/VisualPerceptionAccuracy/std": 0.035675931721925735, "step": 255, "train_speed(iter/s)": 0.005207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/mean_length": 115.359375, "completions/min_length": 70.0, "epoch": 0.008730348190839956, "frac_reward_zero_std": 0.0, "grad_norm": 0.7970722317695618, "kl": 0.013088076375424862, "learning_rate": 4.3626448534424e-07, "loss": -0.018110891804099083, "memory(GiB)": 69.45, "reward": 0.7612820863723755, "reward_std": 0.07595863938331604, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9657856822013855, "rewards/PlanningActionSetORM/std": 0.06054890528321266, "rewards/RMReward/mean": 0.7101562023162842, "rewards/RMReward/std": 0.11757459491491318, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 256, "train_speed(iter/s)": 0.005214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 139.34375, "completions/min_length": 8.0, "epoch": 0.008764451113460423, "frac_reward_zero_std": 0.0, "grad_norm": 19.372608184814453, "kl": 0.3039901554584503, "learning_rate": 4.379686434901159e-07, "loss": -0.033451154828071594, "memory(GiB)": 69.45, "reward": 0.49427056312561035, "reward_std": 0.18795418739318848, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.984375, "rewards/PlanningActionSetORM/std": 0.07233155518770218, "rewards/RMReward/mean": 0.6828124523162842, "rewards/RMReward/std": 0.151129812002182, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.02520722523331642, "rewards/VisualPerceptionAccuracy/std": 0.0689982995390892, "step": 257, "train_speed(iter/s)": 0.005213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 166.65625, "completions/min_length": 2.0, "epoch": 0.008798554036080892, "frac_reward_zero_std": 0.0, "grad_norm": 20.759654998779297, "kl": 0.010196135379374027, "learning_rate": 4.3967280163599184e-07, "loss": -0.043307747691869736, "memory(GiB)": 69.45, "reward": 0.7135398387908936, "reward_std": 0.2025669515132904, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9027653336524963, "rewards/PlanningActionSetORM/std": 0.13575328886508942, "rewards/RMReward/mean": 0.703125, "rewards/RMReward/std": 0.1326514631509781, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.625, "rewards/VisualPerceptionAccuracy/std": 0.5, "step": 258, "train_speed(iter/s)": 0.005203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 118.53125, "completions/min_length": 8.0, "epoch": 0.00883265695870136, "frac_reward_zero_std": 0.0, "grad_norm": 5.909709453582764, "kl": 0.24784284830093384, "learning_rate": 4.413769597818678e-07, "loss": -0.011513087898492813, "memory(GiB)": 69.45, "reward": 0.5980952978134155, "reward_std": 0.2011723816394806, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9767857193946838, "rewards/PlanningActionSetORM/std": 0.05566542595624924, "rewards/RMReward/mean": 0.690625011920929, "rewards/RMReward/std": 0.13566963374614716, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.3965577781200409, "rewards/VisualPerceptionAccuracy/mean": 0.0007739979191683233, "rewards/VisualPerceptionAccuracy/std": 0.0006076318677514791, "step": 259, "train_speed(iter/s)": 0.005214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 120.90625, "completions/min_length": 14.0, "epoch": 0.00886675988132183, "frac_reward_zero_std": 0.0, "grad_norm": 3.6878528594970703, "kl": 0.013099666684865952, "learning_rate": 4.4308111792774375e-07, "loss": -0.007689995691180229, "memory(GiB)": 69.45, "reward": 0.662905216217041, "reward_std": 0.1140565499663353, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.984375, "rewards/PlanningActionSetORM/std": 0.0883883461356163, "rewards/RMReward/mean": 0.8578125238418579, "rewards/RMReward/std": 0.10294798016548157, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.004120801575481892, "rewards/VisualPerceptionAccuracy/std": 0.0030074268579483032, "step": 260, "train_speed(iter/s)": 0.005207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/mean_length": 93.53125, "completions/min_length": 2.0, "epoch": 0.008900862803942297, "frac_reward_zero_std": 0.0, "grad_norm": 14.481356620788574, "kl": 0.020065637305378914, "learning_rate": 4.447852760736197e-07, "loss": 0.029144249856472015, "memory(GiB)": 69.45, "reward": 0.6090587377548218, "reward_std": 0.2056732177734375, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9562252163887024, "rewards/PlanningActionSetORM/std": 0.09602298587560654, "rewards/RMReward/mean": 0.6458333134651184, "rewards/RMReward/std": 0.1797851324081421, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3125, "rewards/VisualPerceptionAccuracy/std": 0.4787135720252991, "step": 261, "train_speed(iter/s)": 0.00521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 112.03125, "completions/min_length": 2.0, "epoch": 0.008934965726562766, "frac_reward_zero_std": 0.0, "grad_norm": 31.373058319091797, "kl": 0.21746891736984253, "learning_rate": 4.464894342194956e-07, "loss": -0.039564698934555054, "memory(GiB)": 69.45, "reward": 0.7932589054107666, "reward_std": 0.20859292149543762, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9810267686843872, "rewards/PlanningActionSetORM/std": 0.08887434005737305, "rewards/RMReward/mean": 0.6421874761581421, "rewards/RMReward/std": 0.12385564297437668, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.8125, "rewards/VisualPerceptionAccuracy/std": 0.40311288833618164, "step": 262, "train_speed(iter/s)": 0.0052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/mean_length": 148.640625, "completions/min_length": 88.0, "epoch": 0.008969068649183235, "frac_reward_zero_std": 0.0, "grad_norm": 0.8605144023895264, "kl": 0.015582485124468803, "learning_rate": 4.481935923653716e-07, "loss": -0.01972213014960289, "memory(GiB)": 69.45, "reward": 0.7574460506439209, "reward_std": 0.1062883734703064, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9153553247451782, "rewards/PlanningActionSetORM/std": 0.15831561386585236, "rewards/RMReward/mean": 0.7179687023162842, "rewards/RMReward/std": 0.1695856750011444, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 263, "train_speed(iter/s)": 0.0052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/mean_length": 165.390625, "completions/min_length": 75.0, "epoch": 0.009003171571803704, "frac_reward_zero_std": 0.0, "grad_norm": 0.6847345232963562, "kl": 0.012185968458652496, "learning_rate": 4.498977505112475e-07, "loss": 0.009360896423459053, "memory(GiB)": 69.45, "reward": 0.5301496982574463, "reward_std": 0.10567256808280945, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9279882311820984, "rewards/PlanningActionSetORM/std": 0.11048923432826996, "rewards/RMReward/mean": 0.6416666507720947, "rewards/RMReward/std": 0.1595650166273117, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.023805761709809303, "rewards/VisualPerceptionAccuracy/std": 0.05249713733792305, "step": 264, "train_speed(iter/s)": 0.005204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 148.859375, "completions/min_length": 94.0, "epoch": 0.009037274494424171, "frac_reward_zero_std": 0.0, "grad_norm": 0.9441695213317871, "kl": 0.011199315078556538, "learning_rate": 4.5160190865712343e-07, "loss": -0.004530476406216621, "memory(GiB)": 69.45, "reward": 0.4364486038684845, "reward_std": 0.06695067137479782, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.01767767407000065, "rewards/RMReward/mean": 0.8278124928474426, "rewards/RMReward/std": 0.1554957628250122, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.011272164061665535, "rewards/VisualPerceptionAccuracy/std": 0.03535296395421028, "step": 265, "train_speed(iter/s)": 0.0052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/mean_length": 91.34375, "completions/min_length": 8.0, "epoch": 0.00907137741704464, "frac_reward_zero_std": 0.0, "grad_norm": 17.19652557373047, "kl": 0.404803991317749, "learning_rate": 4.5330606680299935e-07, "loss": 0.013049734756350517, "memory(GiB)": 69.45, "reward": 0.4147527515888214, "reward_std": 0.2836609482765198, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8993750214576721, "rewards/RMReward/std": 0.19027940928936005, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4709290862083435, "rewards/VisualPerceptionAccuracy/mean": 0.04576103016734123, "rewards/VisualPerceptionAccuracy/std": 0.1127329096198082, "step": 266, "train_speed(iter/s)": 0.005203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/mean_length": 314.28125, "completions/min_length": 126.0, "epoch": 0.00910548033966511, "frac_reward_zero_std": 0.0, "grad_norm": 0.6303542852401733, "kl": 0.00939914584159851, "learning_rate": 4.5501022494887533e-07, "loss": -0.02964402362704277, "memory(GiB)": 69.45, "reward": 0.3514721095561981, "reward_std": 0.0870799571275711, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.958268404006958, "rewards/PlanningActionSetORM/std": 0.1111559346318245, "rewards/RMReward/mean": 0.609375, "rewards/RMReward/std": 0.12210353463888168, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.023790523409843445, "rewards/VisualPerceptionAccuracy/std": 0.07274416834115982, "step": 267, "train_speed(iter/s)": 0.005197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/mean_length": 62.890625, "completions/min_length": 8.0, "epoch": 0.009139583262285578, "frac_reward_zero_std": 0.0, "grad_norm": 8.261279106140137, "kl": 0.7197884321212769, "learning_rate": 4.5671438309475126e-07, "loss": -0.01629083976149559, "memory(GiB)": 69.45, "reward": 0.2977340817451477, "reward_std": 0.172930046916008, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8643749952316284, "rewards/RMReward/std": 0.0889170914888382, "rewards/SpatialReasoningORM/mean": 0.09375, "rewards/SpatialReasoningORM/std": 0.2961445748806, "rewards/VisualPerceptionAccuracy/mean": 0.021311333402991295, "rewards/VisualPerceptionAccuracy/std": 0.058599818497896194, "step": 268, "train_speed(iter/s)": 0.005204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/mean_length": 193.546875, "completions/min_length": 7.0, "epoch": 0.009173686184906046, "frac_reward_zero_std": 0.0, "grad_norm": 12.936355590820312, "kl": 0.758910596370697, "learning_rate": 4.584185412406272e-07, "loss": -0.038592398166656494, "memory(GiB)": 69.45, "reward": 0.3666754961013794, "reward_std": 0.243648499250412, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9323217868804932, "rewards/PlanningActionSetORM/std": 0.09648168087005615, "rewards/RMReward/mean": 0.65625, "rewards/RMReward/std": 0.11672617495059967, "rewards/SpatialReasoningORM/mean": 0.34375, "rewards/SpatialReasoningORM/std": 0.4825586974620819, "rewards/VisualPerceptionAccuracy/mean": 0.0021125776693224907, "rewards/VisualPerceptionAccuracy/std": 0.008450310677289963, "step": 269, "train_speed(iter/s)": 0.005202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 109.375, "completions/min_length": 8.0, "epoch": 0.009207789107526515, "frac_reward_zero_std": 0.0, "grad_norm": 4.922996520996094, "kl": 0.38717496395111084, "learning_rate": 4.601226993865031e-07, "loss": -0.0013753995299339294, "memory(GiB)": 69.45, "reward": 0.5938913822174072, "reward_std": 0.13284902274608612, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9769840836524963, "rewards/PlanningActionSetORM/std": 0.05036294460296631, "rewards/RMReward/mean": 0.699999988079071, "rewards/RMReward/std": 0.12757976353168488, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 270, "train_speed(iter/s)": 0.005204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/mean_length": 318.4375, "completions/min_length": 88.0, "epoch": 0.009241892030146984, "frac_reward_zero_std": 0.0, "grad_norm": 0.6065545678138733, "kl": 0.005273071117699146, "learning_rate": 4.618268575323791e-07, "loss": -0.042036183178424835, "memory(GiB)": 69.45, "reward": 0.4031004309654236, "reward_std": 0.06412216275930405, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9861979484558105, "rewards/PlanningActionSetORM/std": 0.07085654884576797, "rewards/RMReward/mean": 0.745312511920929, "rewards/RMReward/std": 0.1278604120016098, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.01271129958331585, "rewards/VisualPerceptionAccuracy/std": 0.031256042420864105, "step": 271, "train_speed(iter/s)": 0.005195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/mean_length": 178.5625, "completions/min_length": 8.0, "epoch": 0.009275994952767453, "frac_reward_zero_std": 0.0, "grad_norm": 14.693676948547363, "kl": 0.5320155620574951, "learning_rate": 4.63531015678255e-07, "loss": -0.031642891466617584, "memory(GiB)": 69.45, "reward": 0.5322496891021729, "reward_std": 0.2085070013999939, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9973958134651184, "rewards/PlanningActionSetORM/std": 0.010416671633720398, "rewards/RMReward/mean": 0.5743750333786011, "rewards/RMReward/std": 0.11153288185596466, "rewards/SpatialReasoningORM/mean": 0.71875, "rewards/SpatialReasoningORM/std": 0.45680341124534607, "rewards/VisualPerceptionAccuracy/mean": 0.004394604358822107, "rewards/VisualPerceptionAccuracy/std": 0.01757841743528843, "step": 272, "train_speed(iter/s)": 0.005198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 145.453125, "completions/min_length": 2.0, "epoch": 0.00931009787538792, "frac_reward_zero_std": 0.0, "grad_norm": 21.13184356689453, "kl": 0.010165249928832054, "learning_rate": 4.6523517382413094e-07, "loss": -0.01739722117781639, "memory(GiB)": 69.45, "reward": 0.4744797646999359, "reward_std": 0.2126973420381546, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9268056154251099, "rewards/PlanningActionSetORM/std": 0.11353453248739243, "rewards/RMReward/mean": 0.7200000286102295, "rewards/RMReward/std": 0.15332865715026855, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.18759839236736298, "rewards/VisualPerceptionAccuracy/std": 0.36828088760375977, "step": 273, "train_speed(iter/s)": 0.005199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 117.359375, "completions/min_length": 8.0, "epoch": 0.009344200798008389, "frac_reward_zero_std": 0.0, "grad_norm": 5.516790866851807, "kl": 0.4534129500389099, "learning_rate": 4.669393319700068e-07, "loss": -0.019991273060441017, "memory(GiB)": 69.45, "reward": 0.42925041913986206, "reward_std": 0.12215966731309891, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.969531238079071, "rewards/PlanningActionSetORM/std": 0.053782109171152115, "rewards/RMReward/mean": 0.7437500357627869, "rewards/RMReward/std": 0.133047416806221, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.0298140961676836, "rewards/VisualPerceptionAccuracy/std": 0.05688825994729996, "step": 274, "train_speed(iter/s)": 0.005203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 139.453125, "completions/min_length": 75.0, "epoch": 0.009378303720628858, "frac_reward_zero_std": 0.0, "grad_norm": 0.6982995271682739, "kl": 0.019752763211727142, "learning_rate": 4.6864349011588284e-07, "loss": -0.04482017457485199, "memory(GiB)": 69.45, "reward": 0.8162127733230591, "reward_std": 0.06759783625602722, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9529390335083008, "rewards/PlanningActionSetORM/std": 0.08940417319536209, "rewards/RMReward/mean": 0.7820312976837158, "rewards/RMReward/std": 0.1193125769495964, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 275, "train_speed(iter/s)": 0.005195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2051.0, "completions/mean_length": 135.109375, "completions/min_length": 8.0, "epoch": 0.009412406643249327, "frac_reward_zero_std": 0.0, "grad_norm": 5.421580791473389, "kl": 0.25496265292167664, "learning_rate": 4.7034764826175877e-07, "loss": -0.056675996631383896, "memory(GiB)": 69.45, "reward": 0.8437315225601196, "reward_std": 0.13886059820652008, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9830018877983093, "rewards/PlanningActionSetORM/std": 0.07521434873342514, "rewards/RMReward/mean": 0.7685416340827942, "rewards/RMReward/std": 0.19094377756118774, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 276, "train_speed(iter/s)": 0.00519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 149.515625, "completions/min_length": 86.0, "epoch": 0.009446509565869794, "frac_reward_zero_std": 0.0, "grad_norm": 0.5908917784690857, "kl": 0.014932196587324142, "learning_rate": 4.7205180640763464e-07, "loss": -0.05440901964902878, "memory(GiB)": 69.45, "reward": 0.8489375114440918, "reward_std": 0.0777469202876091, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9703124761581421, "rewards/PlanningActionSetORM/std": 0.11081184446811676, "rewards/RMReward/mean": 0.8185937404632568, "rewards/RMReward/std": 0.11608517915010452, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 277, "train_speed(iter/s)": 0.005174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/mean_length": 172.890625, "completions/min_length": 78.0, "epoch": 0.009480612488490263, "frac_reward_zero_std": 0.0, "grad_norm": 0.7913100123405457, "kl": 0.014683262445032597, "learning_rate": 4.7375596455351057e-07, "loss": -0.004205229692161083, "memory(GiB)": 69.45, "reward": 0.5487061738967896, "reward_std": 0.09738843142986298, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9774305820465088, "rewards/PlanningActionSetORM/std": 0.047823745757341385, "rewards/RMReward/mean": 0.6664583086967468, "rewards/RMReward/std": 0.22097934782505035, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.008866138756275177, "rewards/VisualPerceptionAccuracy/std": 0.024226905778050423, "step": 278, "train_speed(iter/s)": 0.00517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 105.71875, "completions/min_length": 9.0, "epoch": 0.009514715411110732, "frac_reward_zero_std": 0.0, "grad_norm": 17.114398956298828, "kl": 0.33925575017929077, "learning_rate": 4.754601226993866e-07, "loss": -0.04290539771318436, "memory(GiB)": 69.45, "reward": 0.7233266830444336, "reward_std": 0.17429575324058533, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9157196879386902, "rewards/PlanningActionSetORM/std": 0.16380539536476135, "rewards/RMReward/mean": 0.6341666579246521, "rewards/RMReward/std": 0.15183743834495544, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 279, "train_speed(iter/s)": 0.00516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/mean_length": 85.375, "completions/min_length": 7.0, "epoch": 0.009548818333731201, "frac_reward_zero_std": 0.0, "grad_norm": 2.8160758018493652, "kl": 0.42298686504364014, "learning_rate": 4.771642808452625e-07, "loss": -0.002180814743041992, "memory(GiB)": 69.45, "reward": 0.5878341197967529, "reward_std": 0.16151557862758636, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9591021537780762, "rewards/PlanningActionSetORM/std": 0.09132225811481476, "rewards/RMReward/mean": 0.6943750381469727, "rewards/RMReward/std": 0.18008162081241608, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 280, "train_speed(iter/s)": 0.005169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 106.59375, "completions/min_length": 8.0, "epoch": 0.009582921256351669, "frac_reward_zero_std": 0.0, "grad_norm": 8.783860206604004, "kl": 0.401263028383255, "learning_rate": 4.788684389911384e-07, "loss": -0.09688197076320648, "memory(GiB)": 69.45, "reward": 0.7838467359542847, "reward_std": 0.21349653601646423, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9134672284126282, "rewards/PlanningActionSetORM/std": 0.15755584836006165, "rewards/RMReward/mean": 0.6296875476837158, "rewards/RMReward/std": 0.15338091552257538, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.33601075410842896, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 281, "train_speed(iter/s)": 0.005158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 137.59375, "completions/min_length": 76.0, "epoch": 0.009617024178972138, "frac_reward_zero_std": 0.0, "grad_norm": 0.6103310585021973, "kl": 0.017279986292123795, "learning_rate": 4.805725971370144e-07, "loss": -0.029757466167211533, "memory(GiB)": 69.45, "reward": 0.8557395935058594, "reward_std": 0.07069511711597443, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9986979365348816, "rewards/PlanningActionSetORM/std": 0.010416664183139801, "rewards/RMReward/mean": 0.8200000524520874, "rewards/RMReward/std": 0.11244751513004303, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 282, "train_speed(iter/s)": 0.005151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/mean_length": 151.109375, "completions/min_length": 87.0, "epoch": 0.009651127101592607, "frac_reward_zero_std": 0.0, "grad_norm": 0.6311525106430054, "kl": 0.02068886160850525, "learning_rate": 4.822767552828904e-07, "loss": -0.04379233717918396, "memory(GiB)": 69.45, "reward": 0.6927265524864197, "reward_std": 0.12397636473178864, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9892578125, "rewards/PlanningActionSetORM/std": 0.0645061805844307, "rewards/RMReward/mean": 0.618593692779541, "rewards/RMReward/std": 0.17089742422103882, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 283, "train_speed(iter/s)": 0.005143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/mean_length": 102.671875, "completions/min_length": 67.0, "epoch": 0.009685230024213076, "frac_reward_zero_std": 0.0, "grad_norm": 0.6638708114624023, "kl": 0.033237095922231674, "learning_rate": 4.839809134287662e-07, "loss": -0.023485654965043068, "memory(GiB)": 69.45, "reward": 0.8509374856948853, "reward_std": 0.09717650711536407, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9859374761581421, "rewards/PlanningActionSetORM/std": 0.07097324728965759, "rewards/RMReward/mean": 0.817187488079071, "rewards/RMReward/std": 0.13680560886859894, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 284, "train_speed(iter/s)": 0.005141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 112.125, "completions/min_length": 85.0, "epoch": 0.009719332946833543, "frac_reward_zero_std": 0.0, "grad_norm": 0.7554171085357666, "kl": 0.01794593781232834, "learning_rate": 4.856850715746422e-07, "loss": 0.018737278878688812, "memory(GiB)": 69.45, "reward": 0.8599531054496765, "reward_std": 0.08089378476142883, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9722656011581421, "rewards/PlanningActionSetORM/std": 0.0669809877872467, "rewards/RMReward/mean": 0.8318750262260437, "rewards/RMReward/std": 0.14630773663520813, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 285, "train_speed(iter/s)": 0.005142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 229.265625, "completions/min_length": 77.0, "epoch": 0.009753435869454012, "frac_reward_zero_std": 0.0, "grad_norm": 0.6778825521469116, "kl": 0.018707606941461563, "learning_rate": 4.873892297205181e-07, "loss": 0.012279221788048744, "memory(GiB)": 69.45, "reward": 0.3582942485809326, "reward_std": 0.06344248354434967, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9731422066688538, "rewards/PlanningActionSetORM/std": 0.07542593032121658, "rewards/RMReward/mean": 0.6431249976158142, "rewards/RMReward/std": 0.13556185364723206, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.007460075430572033, "rewards/VisualPerceptionAccuracy/std": 0.0299623291939497, "step": 286, "train_speed(iter/s)": 0.005138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 204.09375, "completions/min_length": 8.0, "epoch": 0.00978753879207448, "frac_reward_zero_std": 0.0, "grad_norm": 3.963052749633789, "kl": 0.2243940532207489, "learning_rate": 4.890933878663941e-07, "loss": 0.014764038845896721, "memory(GiB)": 69.45, "reward": 0.47622761130332947, "reward_std": 0.1193600594997406, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9688801765441895, "rewards/PlanningActionSetORM/std": 0.10183736681938171, "rewards/RMReward/mean": 0.8353124856948853, "rewards/RMReward/std": 0.10235876590013504, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.012108426541090012, "rewards/VisualPerceptionAccuracy/std": 0.01891861855983734, "step": 287, "train_speed(iter/s)": 0.005139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 107.875, "completions/min_length": 14.0, "epoch": 0.00982164171469495, "frac_reward_zero_std": 0.0, "grad_norm": 1.9948744773864746, "kl": 0.020528389140963554, "learning_rate": 4.9079754601227e-07, "loss": 0.026449164375662804, "memory(GiB)": 69.45, "reward": 0.8206785917282104, "reward_std": 0.12791869044303894, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9876487851142883, "rewards/PlanningActionSetORM/std": 0.03699789568781853, "rewards/RMReward/mean": 0.7289583086967468, "rewards/RMReward/std": 0.1270657330751419, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 288, "train_speed(iter/s)": 0.005135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 134.03125, "completions/min_length": 78.0, "epoch": 0.009855744637315417, "frac_reward_zero_std": 0.0, "grad_norm": 0.6236565113067627, "kl": 0.01856260374188423, "learning_rate": 4.925017041581459e-07, "loss": -0.020101157948374748, "memory(GiB)": 69.45, "reward": 0.777706503868103, "reward_std": 0.09594586491584778, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9854073524475098, "rewards/PlanningActionSetORM/std": 0.0707532986998558, "rewards/RMReward/mean": 0.7257813215255737, "rewards/RMReward/std": 0.18082530796527863, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 289, "train_speed(iter/s)": 0.00513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/mean_length": 156.453125, "completions/min_length": 92.0, "epoch": 0.009889847559935886, "frac_reward_zero_std": 0.0, "grad_norm": 0.6469151973724365, "kl": 0.01785050332546234, "learning_rate": 4.942058623040218e-07, "loss": -0.021824443712830544, "memory(GiB)": 69.45, "reward": 0.6106370687484741, "reward_std": 0.09975982457399368, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9941874146461487, "rewards/PlanningActionSetORM/std": 0.01994098164141178, "rewards/RMReward/mean": 0.7185416221618652, "rewards/RMReward/std": 0.1489393562078476, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.12153582274913788, "rewards/VisualPerceptionAccuracy/std": 0.14011235535144806, "step": 290, "train_speed(iter/s)": 0.005127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2049.0, "completions/mean_length": 203.28125, "completions/min_length": 71.0, "epoch": 0.009923950482556355, "frac_reward_zero_std": 0.0, "grad_norm": 0.7494269013404846, "kl": 0.023647356778383255, "learning_rate": 4.959100204498979e-07, "loss": -0.12180398404598236, "memory(GiB)": 69.45, "reward": 0.7416232824325562, "reward_std": 0.11197495460510254, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9674913287162781, "rewards/PlanningActionSetORM/std": 0.11690963804721832, "rewards/RMReward/mean": 0.6851562261581421, "rewards/RMReward/std": 0.2045668512582779, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 291, "train_speed(iter/s)": 0.005119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/mean_length": 130.9375, "completions/min_length": 61.0, "epoch": 0.009958053405176824, "frac_reward_zero_std": 0.0, "grad_norm": 0.7751319408416748, "kl": 0.02698349580168724, "learning_rate": 4.976141785957737e-07, "loss": 0.020711002871394157, "memory(GiB)": 69.45, "reward": 0.7195613980293274, "reward_std": 0.10294980555772781, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.972806990146637, "rewards/PlanningActionSetORM/std": 0.06509751081466675, "rewards/RMReward/mean": 0.65625, "rewards/RMReward/std": 0.1421211212873459, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 292, "train_speed(iter/s)": 0.005117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/mean_length": 171.9375, "completions/min_length": 8.0, "epoch": 0.009992156327797293, "frac_reward_zero_std": 0.0, "grad_norm": 6.1333136558532715, "kl": 0.8427349925041199, "learning_rate": 4.993183367416496e-07, "loss": -0.0014631012454628944, "memory(GiB)": 69.45, "reward": 0.4936929941177368, "reward_std": 0.1502593457698822, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8962500095367432, "rewards/RMReward/std": 0.122576504945755, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": 0.00777203356847167, "rewards/VisualPerceptionAccuracy/std": 0.027976175770163536, "step": 293, "train_speed(iter/s)": 0.005116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 152.9375, "completions/min_length": 2.0, "epoch": 0.01002625925041776, "frac_reward_zero_std": 0.0, "grad_norm": 26.929536819458008, "kl": 0.03417845815420151, "learning_rate": 5.010224948875256e-07, "loss": -0.01546501275151968, "memory(GiB)": 69.45, "reward": 0.7320535778999329, "reward_std": 0.19704760611057281, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9970238208770752, "rewards/PlanningActionSetORM/std": 0.020619653165340424, "rewards/RMReward/mean": 0.762499988079071, "rewards/RMReward/std": 0.12398524582386017, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5, "rewards/VisualPerceptionAccuracy/std": 0.5163977742195129, "step": 294, "train_speed(iter/s)": 0.005119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 114.328125, "completions/min_length": 9.0, "epoch": 0.01006036217303823, "frac_reward_zero_std": 0.0, "grad_norm": 15.137662887573242, "kl": 0.22810253500938416, "learning_rate": 5.027266530334016e-07, "loss": -0.014866366982460022, "memory(GiB)": 69.45, "reward": 0.7162756323814392, "reward_std": 0.1611046940088272, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9935041069984436, "rewards/PlanningActionSetORM/std": 0.01972215250134468, "rewards/RMReward/mean": 0.8256250023841858, "rewards/RMReward/std": 0.1511080414056778, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 295, "train_speed(iter/s)": 0.005118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/mean_length": 112.953125, "completions/min_length": 8.0, "epoch": 0.010094465095658698, "frac_reward_zero_std": 0.0, "grad_norm": 18.31632423400879, "kl": 0.4110388159751892, "learning_rate": 5.044308111792774e-07, "loss": -0.003462608903646469, "memory(GiB)": 69.45, "reward": 0.5731271505355835, "reward_std": 0.13349726796150208, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9585565328598022, "rewards/PlanningActionSetORM/std": 0.1050945296883583, "rewards/RMReward/mean": 0.637499988079071, "rewards/RMReward/std": 0.11845783144235611, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.00783588644117117, "rewards/VisualPerceptionAccuracy/std": 0.03134354576468468, "step": 296, "train_speed(iter/s)": 0.005122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/mean_length": 159.265625, "completions/min_length": 14.0, "epoch": 0.010128568018279167, "frac_reward_zero_std": 0.0, "grad_norm": 2.0003418922424316, "kl": 0.015071345493197441, "learning_rate": 5.061349693251534e-07, "loss": -0.004519270732998848, "memory(GiB)": 69.45, "reward": 0.6222495436668396, "reward_std": 0.12551239132881165, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9320963621139526, "rewards/PlanningActionSetORM/std": 0.1456420123577118, "rewards/RMReward/mean": 0.71875, "rewards/RMReward/std": 0.12362143397331238, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.025534499436616898, "rewards/VisualPerceptionAccuracy/std": 0.10213800519704819, "step": 297, "train_speed(iter/s)": 0.005124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 113.515625, "completions/min_length": 2.0, "epoch": 0.010162670940899635, "frac_reward_zero_std": 0.0, "grad_norm": 22.371912002563477, "kl": 0.01831671968102455, "learning_rate": 5.078391274710293e-07, "loss": 0.005460579879581928, "memory(GiB)": 69.45, "reward": 0.648675262928009, "reward_std": 0.17643070220947266, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9628348350524902, "rewards/PlanningActionSetORM/std": 0.07472366839647293, "rewards/RMReward/mean": 0.7102083563804626, "rewards/RMReward/std": 0.1394592821598053, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3125, "rewards/VisualPerceptionAccuracy/std": 0.4787135720252991, "step": 298, "train_speed(iter/s)": 0.00513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/mean_length": 24.84375, "completions/min_length": 8.0, "epoch": 0.010196773863520104, "frac_reward_zero_std": 0.75, "grad_norm": 0.433591365814209, "kl": 0.6204030513763428, "learning_rate": 5.095432856169053e-07, "loss": -0.01752602681517601, "memory(GiB)": 69.45, "reward": 0.9426041841506958, "reward_std": 0.013729781843721867, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9895833134651184, "rewards/PlanningActionSetORM/std": 0.041666675359010696, "rewards/RMReward/mean": 0.7156250476837158, "rewards/RMReward/std": 0.06511208415031433, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 299, "train_speed(iter/s)": 0.005132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/mean_length": 209.09375, "completions/min_length": 85.0, "epoch": 0.010230876786140573, "frac_reward_zero_std": 0.0, "grad_norm": 0.7598996758460999, "kl": 0.013141377829015255, "learning_rate": 5.112474437627812e-07, "loss": 0.0039025340229272842, "memory(GiB)": 69.45, "reward": 0.5405741930007935, "reward_std": 0.0846746638417244, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9489872455596924, "rewards/PlanningActionSetORM/std": 0.10809201002120972, "rewards/RMReward/mean": 0.653124988079071, "rewards/RMReward/std": 0.12816472351551056, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.02540449984371662, "rewards/VisualPerceptionAccuracy/std": 0.046632133424282074, "step": 300, "train_speed(iter/s)": 0.005128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 132.78125, "completions/min_length": 2.0, "epoch": 0.010264979708761042, "frac_reward_zero_std": 0.0, "grad_norm": 16.758092880249023, "kl": 0.339070200920105, "learning_rate": 5.129516019086571e-07, "loss": -0.009073756635189056, "memory(GiB)": 69.45, "reward": 0.6289036273956299, "reward_std": 0.1707572489976883, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.996874988079071, "rewards/PlanningActionSetORM/std": 0.012500002980232239, "rewards/RMReward/mean": 0.6181250214576721, "rewards/RMReward/std": 0.11600107699632645, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.4405573010444641, "rewards/VisualPerceptionAccuracy/std": 0.5013470649719238, "step": 301, "train_speed(iter/s)": 0.005108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/mean_length": 76.6875, "completions/min_length": 8.0, "epoch": 0.010299082631381509, "frac_reward_zero_std": 0.0, "grad_norm": 12.155729293823242, "kl": 0.7027010321617126, "learning_rate": 5.146557600545331e-07, "loss": -0.005418134853243828, "memory(GiB)": 69.45, "reward": 0.6473697423934937, "reward_std": 0.25285136699676514, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9973958134651184, "rewards/PlanningActionSetORM/std": 0.010416671633720398, "rewards/RMReward/mean": 0.7218749523162842, "rewards/RMReward/std": 0.0815858468413353, "rewards/SpatialReasoningORM/mean": 0.5833333134651184, "rewards/SpatialReasoningORM/std": 0.49822381138801575, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 302, "train_speed(iter/s)": 0.005107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 176.84375, "completions/min_length": 8.0, "epoch": 0.010333185554001978, "frac_reward_zero_std": 0.0, "grad_norm": 10.561141967773438, "kl": 0.3022620677947998, "learning_rate": 5.163599182004091e-07, "loss": 0.08193507045507431, "memory(GiB)": 69.45, "reward": 0.8030000329017639, "reward_std": 0.1738283932209015, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9479166865348816, "rewards/PlanningActionSetORM/std": 0.1487387865781784, "rewards/RMReward/mean": 0.7341666221618652, "rewards/RMReward/std": 0.18711508810520172, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 303, "train_speed(iter/s)": 0.005099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 121.4375, "completions/min_length": 8.0, "epoch": 0.010367288476622447, "frac_reward_zero_std": 0.0, "grad_norm": 3.3404972553253174, "kl": 0.31335487961769104, "learning_rate": 5.180640763462849e-07, "loss": 0.06348075717687607, "memory(GiB)": 69.45, "reward": 0.6784668564796448, "reward_std": 0.1582738757133484, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.971875011920929, "rewards/PlanningActionSetORM/std": 0.1113969162106514, "rewards/RMReward/mean": 0.8050000071525574, "rewards/RMReward/std": 0.1318112015724182, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.1558675318956375, "rewards/VisualPerceptionAccuracy/std": 0.10864797979593277, "step": 304, "train_speed(iter/s)": 0.005101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/mean_length": 192.703125, "completions/min_length": 101.0, "epoch": 0.010401391399242916, "frac_reward_zero_std": 0.0, "grad_norm": 0.717376708984375, "kl": 0.012645927257835865, "learning_rate": 5.197682344921609e-07, "loss": 0.017094755545258522, "memory(GiB)": 69.45, "reward": 0.6322231888771057, "reward_std": 0.0854329913854599, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9453744888305664, "rewards/PlanningActionSetORM/std": 0.06262645125389099, "rewards/RMReward/mean": 0.7887499928474426, "rewards/RMReward/std": 0.16297271847724915, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06866809725761414, "rewards/VisualPerceptionAccuracy/std": 0.12122991681098938, "step": 305, "train_speed(iter/s)": 0.005094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/mean_length": 89.0, "completions/min_length": 8.0, "epoch": 0.010435494321863383, "frac_reward_zero_std": 0.0, "grad_norm": 5.604002475738525, "kl": 0.40334969758987427, "learning_rate": 5.214723926380368e-07, "loss": -0.015530163422226906, "memory(GiB)": 69.45, "reward": 0.8828125, "reward_std": 0.12409119307994843, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9708333015441895, "rewards/PlanningActionSetORM/std": 0.048743072897195816, "rewards/RMReward/mean": 0.8614583015441895, "rewards/RMReward/std": 0.07454798370599747, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 306, "train_speed(iter/s)": 0.005095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/mean_length": 167.25, "completions/min_length": 94.0, "epoch": 0.010469597244483852, "frac_reward_zero_std": 0.0, "grad_norm": 0.46629539132118225, "kl": 0.015873238444328308, "learning_rate": 5.231765507839128e-07, "loss": -0.04955955594778061, "memory(GiB)": 69.45, "reward": 0.8001934289932251, "reward_std": 0.08454757183790207, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.986592173576355, "rewards/PlanningActionSetORM/std": 0.02901742234826088, "rewards/RMReward/mean": 0.7535938024520874, "rewards/RMReward/std": 0.1434757560491562, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 307, "train_speed(iter/s)": 0.005094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/mean_length": 112.578125, "completions/min_length": 14.0, "epoch": 0.010503700167104321, "frac_reward_zero_std": 0.0, "grad_norm": 4.188554763793945, "kl": 0.018884867429733276, "learning_rate": 5.248807089297888e-07, "loss": 0.063402459025383, "memory(GiB)": 69.45, "reward": 0.7995203733444214, "reward_std": 0.17846840620040894, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9509692192077637, "rewards/PlanningActionSetORM/std": 0.08610907196998596, "rewards/RMReward/mean": 0.7770833969116211, "rewards/RMReward/std": 0.1246093213558197, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 308, "train_speed(iter/s)": 0.005096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 10.78125, "completions/min_length": 8.0, "epoch": 0.01053780308972479, "frac_reward_zero_std": 1.0, "grad_norm": 0.01614847034215927, "kl": 0.8277525901794434, "learning_rate": 5.265848670756646e-07, "loss": 0.0008286094525828958, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 309, "train_speed(iter/s)": 0.005101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/mean_length": 127.3125, "completions/min_length": 8.0, "epoch": 0.010571906012345257, "frac_reward_zero_std": 0.75, "grad_norm": 0.26590561866760254, "kl": 0.5401639342308044, "learning_rate": 5.282890252215406e-07, "loss": 0.005359509959816933, "memory(GiB)": 69.45, "reward": 0.5150753259658813, "reward_std": 0.006813235580921173, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.6666666865348816, "rewards/SpatialReasoningORM/std": 0.47639307379722595, "rewards/VisualPerceptionAccuracy/mean": 0.010301481932401657, "rewards/VisualPerceptionAccuracy/std": 0.027252938598394394, "step": 310, "train_speed(iter/s)": 0.005106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 168.640625, "completions/min_length": 92.0, "epoch": 0.010606008934965726, "frac_reward_zero_std": 0.0, "grad_norm": 0.6736661195755005, "kl": 0.02276102639734745, "learning_rate": 5.299931833674166e-07, "loss": 0.036374494433403015, "memory(GiB)": 69.45, "reward": 0.8221534490585327, "reward_std": 0.1034855842590332, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9701421856880188, "rewards/PlanningActionSetORM/std": 0.09053152799606323, "rewards/RMReward/mean": 0.78515625, "rewards/RMReward/std": 0.13878145813941956, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 311, "train_speed(iter/s)": 0.005103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 104.53125, "completions/min_length": 9.0, "epoch": 0.010640111857586195, "frac_reward_zero_std": 0.0, "grad_norm": 5.7104387283325195, "kl": 0.34911012649536133, "learning_rate": 5.316973415132925e-07, "loss": 0.007825497537851334, "memory(GiB)": 69.45, "reward": 0.7907055020332336, "reward_std": 0.15796296298503876, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9963700175285339, "rewards/PlanningActionSetORM/std": 0.01761067844927311, "rewards/RMReward/mean": 0.7510417103767395, "rewards/RMReward/std": 0.10987883806228638, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 312, "train_speed(iter/s)": 0.005105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/mean_length": 134.453125, "completions/min_length": 8.0, "epoch": 0.010674214780206664, "frac_reward_zero_std": 0.75, "grad_norm": 0.2909960448741913, "kl": 0.23307445645332336, "learning_rate": 5.334014996591684e-07, "loss": -0.012703338637948036, "memory(GiB)": 69.45, "reward": 0.7570604085922241, "reward_std": 0.018021944910287857, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.028241701424121857, "rewards/VisualPerceptionAccuracy/std": 0.07208778709173203, "step": 313, "train_speed(iter/s)": 0.005105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/mean_length": 173.578125, "completions/min_length": 2.0, "epoch": 0.010708317702827132, "frac_reward_zero_std": 0.0, "grad_norm": 9.390215873718262, "kl": 0.026575513184070587, "learning_rate": 5.351056578050443e-07, "loss": 0.01112254336476326, "memory(GiB)": 69.45, "reward": 0.6778175234794617, "reward_std": 0.15473559498786926, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.09837386757135391, "rewards/RMReward/mean": 0.817187488079071, "rewards/RMReward/std": 0.1604476422071457, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5068849921226501, "rewards/VisualPerceptionAccuracy/std": 0.478324830532074, "step": 314, "train_speed(iter/s)": 0.005108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 195.875, "completions/min_length": 84.0, "epoch": 0.0107424206254476, "frac_reward_zero_std": 0.0, "grad_norm": 0.7216103672981262, "kl": 0.014508618041872978, "learning_rate": 5.368098159509203e-07, "loss": 0.01418386958539486, "memory(GiB)": 69.45, "reward": 0.5497620701789856, "reward_std": 0.08343185484409332, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9610979557037354, "rewards/PlanningActionSetORM/std": 0.05457461252808571, "rewards/RMReward/mean": 0.6552082896232605, "rewards/RMReward/std": 0.1092720553278923, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0498894564807415, "rewards/VisualPerceptionAccuracy/std": 0.11400047689676285, "step": 315, "train_speed(iter/s)": 0.005103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 149.78125, "completions/min_length": 14.0, "epoch": 0.01077652354806807, "frac_reward_zero_std": 0.0, "grad_norm": 2.7340800762176514, "kl": 0.024334348738193512, "learning_rate": 5.385139740967963e-07, "loss": -0.0051831696182489395, "memory(GiB)": 69.45, "reward": 0.6588646173477173, "reward_std": 0.1835300326347351, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9920138716697693, "rewards/PlanningActionSetORM/std": 0.03079705499112606, "rewards/RMReward/mean": 0.6808333396911621, "rewards/RMReward/std": 0.13803844153881073, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 316, "train_speed(iter/s)": 0.0051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 168.1875, "completions/min_length": 74.0, "epoch": 0.010810626470688539, "frac_reward_zero_std": 0.0, "grad_norm": 0.5311713218688965, "kl": 0.014291655272245407, "learning_rate": 5.402181322426721e-07, "loss": 0.008663852699100971, "memory(GiB)": 69.45, "reward": 0.8295651078224182, "reward_std": 0.06848076730966568, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9697005748748779, "rewards/PlanningActionSetORM/std": 0.06339983642101288, "rewards/RMReward/mean": 0.7945312261581421, "rewards/RMReward/std": 0.10355475544929504, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 317, "train_speed(iter/s)": 0.005101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 160.484375, "completions/min_length": 88.0, "epoch": 0.010844729393309006, "frac_reward_zero_std": 0.0, "grad_norm": 0.7482367753982544, "kl": 0.019716572016477585, "learning_rate": 5.419222903885481e-07, "loss": 0.010258251801133156, "memory(GiB)": 69.45, "reward": 0.6024327278137207, "reward_std": 0.08494926989078522, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9866319298744202, "rewards/PlanningActionSetORM/std": 0.03318849205970764, "rewards/RMReward/mean": 0.7531249523162842, "rewards/RMReward/std": 0.17906561493873596, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.010251584462821484, "rewards/VisualPerceptionAccuracy/std": 0.04100634157657623, "step": 318, "train_speed(iter/s)": 0.005102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/mean_length": 97.625, "completions/min_length": 8.0, "epoch": 0.010878832315929475, "frac_reward_zero_std": 0.0, "grad_norm": 6.344542026519775, "kl": 0.3606337010860443, "learning_rate": 5.436264485344241e-07, "loss": 0.00864484068006277, "memory(GiB)": 69.45, "reward": 0.713687539100647, "reward_std": 0.13207080960273743, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8691666722297668, "rewards/RMReward/std": 0.11527641862630844, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 319, "train_speed(iter/s)": 0.005102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 119.234375, "completions/min_length": 8.0, "epoch": 0.010912935238549944, "frac_reward_zero_std": 0.0, "grad_norm": 10.822799682617188, "kl": 0.27904656529426575, "learning_rate": 5.453306066803e-07, "loss": 0.019557472318410873, "memory(GiB)": 69.45, "reward": 0.5712681412696838, "reward_std": 0.21057280898094177, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9198975563049316, "rewards/PlanningActionSetORM/std": 0.1255006194114685, "rewards/RMReward/mean": 0.7390625476837158, "rewards/RMReward/std": 0.14904770255088806, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.09086336940526962, "rewards/VisualPerceptionAccuracy/std": 0.13665561378002167, "step": 320, "train_speed(iter/s)": 0.005105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 139.28125, "completions/min_length": 9.0, "epoch": 0.010947038161170413, "frac_reward_zero_std": 0.0, "grad_norm": 5.619868755340576, "kl": 0.32595232129096985, "learning_rate": 5.470347648261759e-07, "loss": -0.005989586468786001, "memory(GiB)": 69.45, "reward": 0.6410937309265137, "reward_std": 0.1790773868560791, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9645833373069763, "rewards/PlanningActionSetORM/std": 0.11937706917524338, "rewards/RMReward/mean": 0.7322916388511658, "rewards/RMReward/std": 0.19337143003940582, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 321, "train_speed(iter/s)": 0.005098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 45.703125, "completions/min_length": 8.0, "epoch": 0.01098114108379088, "frac_reward_zero_std": 1.0, "grad_norm": 0.004418172407895327, "kl": 0.5668042302131653, "learning_rate": 5.487389229720518e-07, "loss": 0.0005676334840245545, "memory(GiB)": 69.45, "reward": 0.75, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 322, "train_speed(iter/s)": 0.005097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 69.703125, "completions/min_length": 2.0, "epoch": 0.01101524400641135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032301025930792093, "kl": 0.20589211583137512, "learning_rate": 5.504430811179278e-07, "loss": 0.0002058693062281236, "memory(GiB)": 69.45, "reward": 0.512499988079071, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": 0.5, "rewards/VisualPerceptionAccuracy/std": 0.5080004930496216, "step": 323, "train_speed(iter/s)": 0.005096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 236.421875, "completions/min_length": 8.0, "epoch": 0.011049346929031818, "frac_reward_zero_std": 0.0, "grad_norm": 9.496509552001953, "kl": 0.3409801423549652, "learning_rate": 5.521472392638038e-07, "loss": 0.025314174592494965, "memory(GiB)": 69.45, "reward": 0.45975276827812195, "reward_std": 0.20478428900241852, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9677039384841919, "rewards/PlanningActionSetORM/std": 0.028670258820056915, "rewards/RMReward/mean": 0.7062499523162842, "rewards/RMReward/std": 0.14930395781993866, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": 0.03047027625143528, "rewards/VisualPerceptionAccuracy/std": 0.05124426260590553, "step": 324, "train_speed(iter/s)": 0.005088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 242.0625, "completions/min_length": 106.0, "epoch": 0.011083449851652287, "frac_reward_zero_std": 0.0, "grad_norm": 0.6532318592071533, "kl": 0.013563262298703194, "learning_rate": 5.538513974096796e-07, "loss": 0.01643209159374237, "memory(GiB)": 69.45, "reward": 0.4704831540584564, "reward_std": 0.06779948621988297, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9977678656578064, "rewards/PlanningActionSetORM/std": 0.012626911513507366, "rewards/RMReward/mean": 0.8849999904632568, "rewards/RMReward/std": 0.07720938324928284, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.03341274708509445, "rewards/VisualPerceptionAccuracy/std": 0.07702160626649857, "step": 325, "train_speed(iter/s)": 0.00508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 90.078125, "completions/min_length": 8.0, "epoch": 0.011117552774272755, "frac_reward_zero_std": 0.0, "grad_norm": 3.7975430488586426, "kl": 0.44096556305885315, "learning_rate": 5.555555555555555e-07, "loss": 0.0209006629884243, "memory(GiB)": 69.45, "reward": 0.8748764991760254, "reward_std": 0.12390050292015076, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9612599015235901, "rewards/PlanningActionSetORM/std": 0.09384459257125854, "rewards/RMReward/mean": 0.8506249785423279, "rewards/RMReward/std": 0.09803181886672974, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 326, "train_speed(iter/s)": 0.005083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/mean_length": 138.953125, "completions/min_length": 8.0, "epoch": 0.011151655696893224, "frac_reward_zero_std": 0.0, "grad_norm": 17.59566879272461, "kl": 0.3287653625011444, "learning_rate": 5.572597137014316e-07, "loss": 0.03109089285135269, "memory(GiB)": 69.45, "reward": 0.4059596657752991, "reward_std": 0.29184216260910034, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.6468750238418579, "rewards/RMReward/std": 0.1687885820865631, "rewards/SpatialReasoningORM/mean": 0.34375, "rewards/SpatialReasoningORM/std": 0.4825586974620819, "rewards/VisualPerceptionAccuracy/mean": 0.15821371972560883, "rewards/VisualPerceptionAccuracy/std": 0.10434918850660324, "step": 327, "train_speed(iter/s)": 0.005084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/mean_length": 107.78125, "completions/min_length": 8.0, "epoch": 0.011185758619513693, "frac_reward_zero_std": 0.0, "grad_norm": 14.927371978759766, "kl": 0.39518600702285767, "learning_rate": 5.589638718473075e-07, "loss": -0.006560611538589001, "memory(GiB)": 69.45, "reward": 0.6992325186729431, "reward_std": 0.16911616921424866, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.09837386757135391, "rewards/RMReward/mean": 0.8418750166893005, "rewards/RMReward/std": 0.11811523139476776, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.17868006229400635, "rewards/VisualPerceptionAccuracy/std": 0.1502297818660736, "step": 328, "train_speed(iter/s)": 0.005085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 168.390625, "completions/min_length": 8.0, "epoch": 0.011219861542134162, "frac_reward_zero_std": 0.0, "grad_norm": 11.402052879333496, "kl": 0.45280641317367554, "learning_rate": 5.606680299931833e-07, "loss": 0.055666107684373856, "memory(GiB)": 69.45, "reward": 0.49811702966690063, "reward_std": 0.10998254269361496, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8187500238418579, "rewards/RMReward/std": 0.08539126068353653, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.09842151403427124, "rewards/VisualPerceptionAccuracy/std": 0.12740246951580048, "step": 329, "train_speed(iter/s)": 0.005088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/mean_length": 87.296875, "completions/min_length": 8.0, "epoch": 0.011253964464754629, "frac_reward_zero_std": 0.0, "grad_norm": 7.271234512329102, "kl": 0.37396112084388733, "learning_rate": 5.623721881390593e-07, "loss": -0.01864844188094139, "memory(GiB)": 69.45, "reward": 0.7260313034057617, "reward_std": 0.1361151486635208, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8649999499320984, "rewards/RMReward/std": 0.1037796288728714, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 330, "train_speed(iter/s)": 0.005093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/mean_length": 113.671875, "completions/min_length": 14.0, "epoch": 0.011288067387375098, "frac_reward_zero_std": 0.0, "grad_norm": 1.8074911832809448, "kl": 0.019478194415569305, "learning_rate": 5.640763462849353e-07, "loss": -0.025816068053245544, "memory(GiB)": 69.45, "reward": 0.7971235513687134, "reward_std": 0.128288134932518, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9714488983154297, "rewards/PlanningActionSetORM/std": 0.09877581149339676, "rewards/RMReward/mean": 0.6937499642372131, "rewards/RMReward/std": 0.13433247804641724, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 331, "train_speed(iter/s)": 0.00509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/mean_length": 171.078125, "completions/min_length": 67.0, "epoch": 0.011322170309995567, "frac_reward_zero_std": 0.0, "grad_norm": 0.7276086807250977, "kl": 0.011846652254462242, "learning_rate": 5.657805044308112e-07, "loss": -0.01557854562997818, "memory(GiB)": 69.45, "reward": 0.8748303651809692, "reward_std": 0.07669711112976074, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.993526816368103, "rewards/PlanningActionSetORM/std": 0.02029844932258129, "rewards/RMReward/mean": 0.8451562523841858, "rewards/RMReward/std": 0.1312696486711502, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 332, "train_speed(iter/s)": 0.005091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 187.21875, "completions/min_length": 76.0, "epoch": 0.011356273232616036, "frac_reward_zero_std": 0.0, "grad_norm": 0.832055926322937, "kl": 0.02434718981385231, "learning_rate": 5.674846625766872e-07, "loss": -0.01274992898106575, "memory(GiB)": 69.45, "reward": 0.5982101559638977, "reward_std": 0.09288178384304047, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9929983019828796, "rewards/PlanningActionSetORM/std": 0.02592165395617485, "rewards/RMReward/mean": 0.7197916507720947, "rewards/RMReward/std": 0.15254563093185425, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06954161822795868, "rewards/VisualPerceptionAccuracy/std": 0.059369046241045, "step": 333, "train_speed(iter/s)": 0.005089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 75.875, "completions/min_length": 2.0, "epoch": 0.011390376155236503, "frac_reward_zero_std": 0.0, "grad_norm": 19.991744995117188, "kl": 0.44512853026390076, "learning_rate": 5.69188820722563e-07, "loss": -0.01989518105983734, "memory(GiB)": 69.45, "reward": 0.8512187004089355, "reward_std": 0.16111350059509277, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9606249928474426, "rewards/PlanningActionSetORM/std": 0.09604896605014801, "rewards/RMReward/mean": 0.7140624523162842, "rewards/RMReward/std": 0.10018882900476456, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.9375, "rewards/VisualPerceptionAccuracy/std": 0.25, "step": 334, "train_speed(iter/s)": 0.005095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/mean_length": 183.78125, "completions/min_length": 8.0, "epoch": 0.011424479077856972, "frac_reward_zero_std": 0.0, "grad_norm": 21.54302406311035, "kl": 0.2377747893333435, "learning_rate": 5.70892978868439e-07, "loss": 0.01159317884594202, "memory(GiB)": 69.45, "reward": 0.5171235799789429, "reward_std": 0.19855868816375732, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9593355655670166, "rewards/PlanningActionSetORM/std": 0.06053202599287033, "rewards/RMReward/mean": 0.6499999761581421, "rewards/RMReward/std": 0.20517498254776, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.0010101805673912168, "rewards/VisualPerceptionAccuracy/std": 0.004040722269564867, "step": 335, "train_speed(iter/s)": 0.005096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 92.0, "completions/min_length": 9.0, "epoch": 0.011458582000477441, "frac_reward_zero_std": 0.0, "grad_norm": 10.584470748901367, "kl": 0.21970613300800323, "learning_rate": 5.72597137014315e-07, "loss": -0.00857007596641779, "memory(GiB)": 69.45, "reward": 0.7404516935348511, "reward_std": 0.17409035563468933, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9990530014038086, "rewards/PlanningActionSetORM/std": 0.006560800597071648, "rewards/RMReward/mean": 0.8397917151451111, "rewards/RMReward/std": 0.1671983003616333, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 336, "train_speed(iter/s)": 0.005092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 125.59375, "completions/min_length": 8.0, "epoch": 0.01149268492309791, "frac_reward_zero_std": 0.0, "grad_norm": 3.7749059200286865, "kl": 0.31464067101478577, "learning_rate": 5.743012951601909e-07, "loss": 0.016609571874141693, "memory(GiB)": 69.45, "reward": 0.8709701299667358, "reward_std": 0.12138283252716064, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9787593483924866, "rewards/PlanningActionSetORM/std": 0.038640957325696945, "rewards/RMReward/mean": 0.8149999976158142, "rewards/RMReward/std": 0.1271788775920868, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 337, "train_speed(iter/s)": 0.005089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 194.046875, "completions/min_length": 97.0, "epoch": 0.011526787845718377, "frac_reward_zero_std": 0.0, "grad_norm": 0.5158781409263611, "kl": 0.02667395956814289, "learning_rate": 5.760054533060668e-07, "loss": 0.003787318244576454, "memory(GiB)": 69.45, "reward": 0.7856249809265137, "reward_std": 0.0926208421587944, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.981249988079071, "rewards/PlanningActionSetORM/std": 0.08521680533885956, "rewards/RMReward/mean": 0.7367187738418579, "rewards/RMReward/std": 0.12125082314014435, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 338, "train_speed(iter/s)": 0.005087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 198.8125, "completions/min_length": 94.0, "epoch": 0.011560890768338846, "frac_reward_zero_std": 0.0, "grad_norm": 0.7284677028656006, "kl": 0.02287740260362625, "learning_rate": 5.777096114519428e-07, "loss": 0.04897058755159378, "memory(GiB)": 69.45, "reward": 0.6530143022537231, "reward_std": 0.10224732011556625, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9710968136787415, "rewards/PlanningActionSetORM/std": 0.03686441853642464, "rewards/RMReward/mean": 0.7966666221618652, "rewards/RMReward/std": 0.09763878583908081, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.11739914119243622, "rewards/VisualPerceptionAccuracy/std": 0.18407177925109863, "step": 339, "train_speed(iter/s)": 0.005087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 142.1875, "completions/min_length": 67.0, "epoch": 0.011594993690959315, "frac_reward_zero_std": 0.0, "grad_norm": 0.9043580293655396, "kl": 0.036876481026411057, "learning_rate": 5.794137695978187e-07, "loss": 0.013967192731797695, "memory(GiB)": 69.45, "reward": 0.5955851078033447, "reward_std": 0.08501339703798294, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9340153336524963, "rewards/PlanningActionSetORM/std": 0.06962540745735168, "rewards/RMReward/mean": 0.7258333563804626, "rewards/RMReward/std": 0.09670823812484741, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07993112504482269, "rewards/VisualPerceptionAccuracy/std": 0.11802732199430466, "step": 340, "train_speed(iter/s)": 0.005081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/mean_length": 213.640625, "completions/min_length": 8.0, "epoch": 0.011629096613579784, "frac_reward_zero_std": 0.0, "grad_norm": 4.069617748260498, "kl": 0.20584522187709808, "learning_rate": 5.811179277436947e-07, "loss": -0.0007933145388960838, "memory(GiB)": 69.45, "reward": 0.454424113035202, "reward_std": 0.1184605062007904, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9584201574325562, "rewards/PlanningActionSetORM/std": 0.09077916294336319, "rewards/RMReward/mean": 0.7778124809265137, "rewards/RMReward/std": 0.07627347111701965, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.021078310906887054, "rewards/VisualPerceptionAccuracy/std": 0.03394928574562073, "step": 341, "train_speed(iter/s)": 0.005081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/mean_length": 114.5625, "completions/min_length": 14.0, "epoch": 0.011663199536200252, "frac_reward_zero_std": 0.0, "grad_norm": 3.402348518371582, "kl": 0.038482315838336945, "learning_rate": 5.828220858895705e-07, "loss": -0.019283728674054146, "memory(GiB)": 69.45, "reward": 0.7726492881774902, "reward_std": 0.17149314284324646, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9901618957519531, "rewards/PlanningActionSetORM/std": 0.024683251976966858, "rewards/RMReward/mean": 0.7225000262260437, "rewards/RMReward/std": 0.19887448847293854, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 342, "train_speed(iter/s)": 0.00508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 157.140625, "completions/min_length": 98.0, "epoch": 0.01169730245882072, "frac_reward_zero_std": 0.0, "grad_norm": 0.644148051738739, "kl": 0.025477029383182526, "learning_rate": 5.845262440354465e-07, "loss": 0.0170297771692276, "memory(GiB)": 69.45, "reward": 0.7150592803955078, "reward_std": 0.06478558480739594, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8331249356269836, "rewards/RMReward/std": 0.14115265011787415, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.26073718070983887, "rewards/VisualPerceptionAccuracy/std": 0.065870001912117, "step": 343, "train_speed(iter/s)": 0.005079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 152.625, "completions/min_length": 8.0, "epoch": 0.01173140538144119, "frac_reward_zero_std": 0.0, "grad_norm": 12.32762622833252, "kl": 0.3823719322681427, "learning_rate": 5.862304021813225e-07, "loss": 0.012756852433085442, "memory(GiB)": 69.45, "reward": 0.6566254496574402, "reward_std": 0.21770885586738586, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9629195332527161, "rewards/PlanningActionSetORM/std": 0.10173175483942032, "rewards/RMReward/mean": 0.5854166746139526, "rewards/RMReward/std": 0.16820654273033142, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 344, "train_speed(iter/s)": 0.00508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 160.015625, "completions/min_length": 14.0, "epoch": 0.011765508304061659, "frac_reward_zero_std": 0.0, "grad_norm": 3.6685729026794434, "kl": 0.0399780198931694, "learning_rate": 5.879345603271984e-07, "loss": 0.009862866252660751, "memory(GiB)": 69.45, "reward": 0.5278619527816772, "reward_std": 0.17446881532669067, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9543898701667786, "rewards/PlanningActionSetORM/std": 0.060228124260902405, "rewards/RMReward/mean": 0.8187500238418579, "rewards/RMReward/std": 0.09898679703474045, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.13219186663627625, "rewards/VisualPerceptionAccuracy/std": 0.180985689163208, "step": 345, "train_speed(iter/s)": 0.005082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/mean_length": 227.84375, "completions/min_length": 91.0, "epoch": 0.011799611226682126, "frac_reward_zero_std": 0.0, "grad_norm": 0.5437374114990234, "kl": 0.020887240767478943, "learning_rate": 5.896387184730743e-07, "loss": -0.005031399428844452, "memory(GiB)": 69.45, "reward": 0.5875885486602783, "reward_std": 0.0762166976928711, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9979166984558105, "rewards/PlanningActionSetORM/std": 0.014433760195970535, "rewards/RMReward/mean": 0.7262499928474426, "rewards/RMReward/std": 0.17219020426273346, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.008604242466390133, "rewards/VisualPerceptionAccuracy/std": 0.03269953280687332, "step": 346, "train_speed(iter/s)": 0.005077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 89.65625, "completions/min_length": 8.0, "epoch": 0.011833714149302595, "frac_reward_zero_std": 0.75, "grad_norm": 0.3370281457901001, "kl": 0.8896948099136353, "learning_rate": 5.913428766189503e-07, "loss": 0.014332905411720276, "memory(GiB)": 69.45, "reward": 0.7851793169975281, "reward_std": 0.03934251144528389, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.14071714878082275, "rewards/VisualPerceptionAccuracy/std": 0.15737004578113556, "step": 347, "train_speed(iter/s)": 0.005078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 160.328125, "completions/min_length": 97.0, "epoch": 0.011867817071923064, "frac_reward_zero_std": 0.0, "grad_norm": 0.7789832353591919, "kl": 0.03747539967298508, "learning_rate": 5.930470347648262e-07, "loss": 0.0011560851708054543, "memory(GiB)": 69.45, "reward": 0.6304270029067993, "reward_std": 0.11054781079292297, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9787773489952087, "rewards/PlanningActionSetORM/std": 0.04230834171175957, "rewards/RMReward/mean": 0.7102084159851074, "rewards/RMReward/std": 0.158724844455719, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2299416959285736, "rewards/VisualPerceptionAccuracy/std": 0.13627181947231293, "step": 348, "train_speed(iter/s)": 0.005077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 101.984375, "completions/min_length": 8.0, "epoch": 0.011901919994543533, "frac_reward_zero_std": 0.0, "grad_norm": 5.218850135803223, "kl": 0.42452043294906616, "learning_rate": 5.947511929107022e-07, "loss": 0.009625047445297241, "memory(GiB)": 69.45, "reward": 0.8157812356948853, "reward_std": 0.21118369698524475, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7250000238418579, "rewards/RMReward/std": 0.1361924707889557, "rewards/SpatialReasoningORM/mean": 0.84375, "rewards/SpatialReasoningORM/std": 0.3689020276069641, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 349, "train_speed(iter/s)": 0.005076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/mean_length": 153.0625, "completions/min_length": 8.0, "epoch": 0.011936022917164, "frac_reward_zero_std": 0.0, "grad_norm": 7.751214027404785, "kl": 0.3400135934352875, "learning_rate": 5.964553510565782e-07, "loss": 0.015347901731729507, "memory(GiB)": 69.45, "reward": 0.6965791583061218, "reward_std": 0.15413478016853333, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9468749761581421, "rewards/RMReward/std": 0.04321271553635597, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.10881669819355011, "rewards/VisualPerceptionAccuracy/std": 0.12611031532287598, "step": 350, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 90.46875, "completions/min_length": 8.0, "epoch": 0.01197012583978447, "frac_reward_zero_std": 0.0, "grad_norm": 4.740235328674316, "kl": 0.5260579586029053, "learning_rate": 5.98159509202454e-07, "loss": -0.0064430152997374535, "memory(GiB)": 69.45, "reward": 0.5002332925796509, "reward_std": 0.15749779343605042, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8828125, "rewards/PlanningActionSetORM/std": 0.03125, "rewards/RMReward/mean": 0.8656250238418579, "rewards/RMReward/std": 0.07238496094942093, "rewards/SpatialReasoningORM/mean": 0.53125, "rewards/SpatialReasoningORM/std": 0.507007360458374, "rewards/VisualPerceptionAccuracy/mean": 0.02249578759074211, "rewards/VisualPerceptionAccuracy/std": 0.012470102868974209, "step": 351, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 151.0, "completions/min_length": 8.0, "epoch": 0.012004228762404938, "frac_reward_zero_std": 0.0, "grad_norm": 8.218010902404785, "kl": 0.37744736671447754, "learning_rate": 5.9986366734833e-07, "loss": -0.023528533056378365, "memory(GiB)": 69.45, "reward": 0.7751015424728394, "reward_std": 0.16984108090400696, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9621351361274719, "rewards/PlanningActionSetORM/std": 0.09842412173748016, "rewards/RMReward/mean": 0.7583333849906921, "rewards/RMReward/std": 0.12937383353710175, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 352, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 153.671875, "completions/min_length": 8.0, "epoch": 0.012038331685025407, "frac_reward_zero_std": 0.0, "grad_norm": 5.325697422027588, "kl": 0.44640570878982544, "learning_rate": 6.015678254942059e-07, "loss": 0.01600956730544567, "memory(GiB)": 69.45, "reward": 0.6353265643119812, "reward_std": 0.18671967089176178, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9194072484970093, "rewards/PlanningActionSetORM/std": 0.049149058759212494, "rewards/RMReward/mean": 0.7903125286102295, "rewards/RMReward/std": 0.09556232392787933, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.14654326438903809, "rewards/VisualPerceptionAccuracy/std": 0.17802794277668, "step": 353, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 130.75, "completions/min_length": 15.0, "epoch": 0.012072434607645875, "frac_reward_zero_std": 0.0, "grad_norm": 1.238945484161377, "kl": 0.04019720107316971, "learning_rate": 6.032719836400819e-07, "loss": 0.007259652018547058, "memory(GiB)": 69.45, "reward": 0.7023721933364868, "reward_std": 0.08907061815261841, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9933035969734192, "rewards/PlanningActionSetORM/std": 0.02115318924188614, "rewards/RMReward/mean": 0.8712500333786011, "rewards/RMReward/std": 0.11774083226919174, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.07754234969615936, "rewards/VisualPerceptionAccuracy/std": 0.05577481538057327, "step": 354, "train_speed(iter/s)": 0.005075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 158.734375, "completions/min_length": 8.0, "epoch": 0.012106537530266344, "frac_reward_zero_std": 0.0, "grad_norm": 2.831192970275879, "kl": 0.2567981481552124, "learning_rate": 6.049761417859578e-07, "loss": 0.016651369631290436, "memory(GiB)": 69.45, "reward": 0.6727585196495056, "reward_std": 0.1625654399394989, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9352678656578064, "rewards/PlanningActionSetORM/std": 0.07381989806890488, "rewards/RMReward/mean": 0.78125, "rewards/RMReward/std": 0.12556324899196625, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.12630200386047363, "rewards/VisualPerceptionAccuracy/std": 0.21667882800102234, "step": 355, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 136.46875, "completions/min_length": 8.0, "epoch": 0.012140640452886813, "frac_reward_zero_std": 0.0, "grad_norm": 8.542367935180664, "kl": 0.5502690672874451, "learning_rate": 6.066802999318337e-07, "loss": -0.004108712077140808, "memory(GiB)": 69.45, "reward": 0.3596891164779663, "reward_std": 0.2284948080778122, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.65625, "rewards/SpatialReasoningORM/std": 0.4825586974620819, "rewards/VisualPerceptionAccuracy/mean": 0.04594072699546814, "rewards/VisualPerceptionAccuracy/std": 0.061299506574869156, "step": 356, "train_speed(iter/s)": 0.005082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 140.9375, "completions/min_length": 8.0, "epoch": 0.012174743375507282, "frac_reward_zero_std": 0.0, "grad_norm": 4.989202976226807, "kl": 0.45698651671409607, "learning_rate": 6.083844580777097e-07, "loss": -0.014668038114905357, "memory(GiB)": 69.45, "reward": 0.555292546749115, "reward_std": 0.1692790389060974, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8807291984558105, "rewards/PlanningActionSetORM/std": 0.07917864620685577, "rewards/RMReward/mean": 0.682812511920929, "rewards/RMReward/std": 0.1558583825826645, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.013878396712243557, "rewards/VisualPerceptionAccuracy/std": 0.055513590574264526, "step": 357, "train_speed(iter/s)": 0.005084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/mean_length": 80.453125, "completions/min_length": 8.0, "epoch": 0.012208846298127749, "frac_reward_zero_std": 0.0, "grad_norm": 17.95388412475586, "kl": 0.6925598382949829, "learning_rate": 6.100886162235857e-07, "loss": 0.0140468655154109, "memory(GiB)": 69.45, "reward": 0.5794062614440918, "reward_std": 0.23254112899303436, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8762500286102295, "rewards/RMReward/std": 0.07881172746419907, "rewards/SpatialReasoningORM/mean": 0.21875, "rewards/SpatialReasoningORM/std": 0.420013427734375, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 358, "train_speed(iter/s)": 0.005082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/mean_length": 164.5, "completions/min_length": 8.0, "epoch": 0.012242949220748218, "frac_reward_zero_std": 0.0, "grad_norm": 6.661040782928467, "kl": 0.4370995759963989, "learning_rate": 6.117927743694615e-07, "loss": 0.05313628911972046, "memory(GiB)": 69.45, "reward": 0.6942723989486694, "reward_std": 0.15628166496753693, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9576016664505005, "rewards/PlanningActionSetORM/std": 0.069029800593853, "rewards/RMReward/mean": 0.8028125166893005, "rewards/RMReward/std": 0.13390497863292694, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.22829878330230713, "rewards/VisualPerceptionAccuracy/std": 0.17193765938282013, "step": 359, "train_speed(iter/s)": 0.00508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/mean_length": 160.09375, "completions/min_length": 100.0, "epoch": 0.012277052143368687, "frac_reward_zero_std": 0.0, "grad_norm": 0.5567981004714966, "kl": 0.0420532301068306, "learning_rate": 6.134969325153375e-07, "loss": -0.03117343783378601, "memory(GiB)": 69.45, "reward": 0.8296797275543213, "reward_std": 0.0695021003484726, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9990234375, "rewards/PlanningActionSetORM/std": 0.0078125, "rewards/RMReward/mean": 0.7873437404632568, "rewards/RMReward/std": 0.1878892183303833, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 360, "train_speed(iter/s)": 0.005078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 144.4375, "completions/min_length": 14.0, "epoch": 0.012311155065989156, "frac_reward_zero_std": 0.0, "grad_norm": 1.9748889207839966, "kl": 0.03918726369738579, "learning_rate": 6.152010906612134e-07, "loss": -0.008471069857478142, "memory(GiB)": 69.45, "reward": 0.6605000495910645, "reward_std": 0.1695147156715393, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.731041669845581, "rewards/RMReward/std": 0.19747598469257355, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 361, "train_speed(iter/s)": 0.005079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/mean_length": 137.859375, "completions/min_length": 82.0, "epoch": 0.012345257988609623, "frac_reward_zero_std": 0.0, "grad_norm": 0.7393240332603455, "kl": 0.04636208713054657, "learning_rate": 6.169052488070894e-07, "loss": -0.013126589357852936, "memory(GiB)": 69.45, "reward": 0.8384676575660706, "reward_std": 0.0809173658490181, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9754633903503418, "rewards/PlanningActionSetORM/std": 0.04358070343732834, "rewards/RMReward/mean": 0.8042187690734863, "rewards/RMReward/std": 0.13872210681438446, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 362, "train_speed(iter/s)": 0.005076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 58.1875, "completions/min_length": 8.0, "epoch": 0.012379360911230092, "frac_reward_zero_std": 0.75, "grad_norm": 0.41576147079467773, "kl": 0.7385034561157227, "learning_rate": 6.186094069529653e-07, "loss": 0.005442816764116287, "memory(GiB)": 69.45, "reward": 0.7629979848861694, "reward_std": 0.009176250547170639, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.051992036402225494, "rewards/VisualPerceptionAccuracy/std": 0.036705002188682556, "step": 363, "train_speed(iter/s)": 0.005074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/mean_length": 182.015625, "completions/min_length": 101.0, "epoch": 0.012413463833850561, "frac_reward_zero_std": 0.0, "grad_norm": 0.7501329183578491, "kl": 0.04746862128376961, "learning_rate": 6.203135650988412e-07, "loss": -0.010105805471539497, "memory(GiB)": 69.45, "reward": 0.8498685359954834, "reward_std": 0.08406589925289154, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9662174582481384, "rewards/PlanningActionSetORM/std": 0.048347823321819305, "rewards/RMReward/mean": 0.8207812905311584, "rewards/RMReward/std": 0.1322612315416336, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 364, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/mean_length": 238.21875, "completions/min_length": 99.0, "epoch": 0.01244756675647103, "frac_reward_zero_std": 0.0, "grad_norm": 0.6486160755157471, "kl": 0.031049486249685287, "learning_rate": 6.220177232447171e-07, "loss": 0.006272182799875736, "memory(GiB)": 69.45, "reward": 0.4258996546268463, "reward_std": 0.08893337100744247, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7421875, "rewards/RMReward/std": 0.1251511126756668, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.058049384504556656, "rewards/VisualPerceptionAccuracy/std": 0.15588800609111786, "step": 365, "train_speed(iter/s)": 0.005076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 166.125, "completions/min_length": 69.0, "epoch": 0.012481669679091497, "frac_reward_zero_std": 0.0, "grad_norm": 0.5566408038139343, "kl": 0.04413505643606186, "learning_rate": 6.237218813905932e-07, "loss": 0.009820721112191677, "memory(GiB)": 69.45, "reward": 0.8370580077171326, "reward_std": 0.06865938752889633, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9934151768684387, "rewards/PlanningActionSetORM/std": 0.02630016580224037, "rewards/RMReward/mean": 0.7979687452316284, "rewards/RMReward/std": 0.10861582309007645, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 366, "train_speed(iter/s)": 0.005067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 154.1875, "completions/min_length": 94.0, "epoch": 0.012515772601711966, "frac_reward_zero_std": 0.0, "grad_norm": 0.8446842432022095, "kl": 0.04465162754058838, "learning_rate": 6.25426039536469e-07, "loss": 0.008183088153600693, "memory(GiB)": 69.45, "reward": 0.452908456325531, "reward_std": 0.060197848826646805, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8628125190734863, "rewards/RMReward/std": 0.1251640021800995, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.015566974878311157, "rewards/VisualPerceptionAccuracy/std": 0.04359227791428566, "step": 367, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/mean_length": 124.953125, "completions/min_length": 8.0, "epoch": 0.012549875524332435, "frac_reward_zero_std": 0.0, "grad_norm": 9.200446128845215, "kl": 0.3129236400127411, "learning_rate": 6.27130197682345e-07, "loss": -0.034285034984350204, "memory(GiB)": 69.45, "reward": 0.6586930751800537, "reward_std": 0.15098237991333008, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9562499523162842, "rewards/PlanningActionSetORM/std": 0.12164861708879471, "rewards/RMReward/mean": 0.8331249952316284, "rewards/RMReward/std": 0.1288018524646759, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.09739722311496735, "rewards/VisualPerceptionAccuracy/std": 0.052130162715911865, "step": 368, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 239.625, "completions/min_length": 100.0, "epoch": 0.012583978446952904, "frac_reward_zero_std": 0.0, "grad_norm": 0.7604706287384033, "kl": 0.044850531965494156, "learning_rate": 6.288343558282209e-07, "loss": 0.005872165784239769, "memory(GiB)": 69.45, "reward": 0.6142931580543518, "reward_std": 0.08623957633972168, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9629951119422913, "rewards/PlanningActionSetORM/std": 0.04055073484778404, "rewards/RMReward/mean": 0.7552083134651184, "rewards/RMReward/std": 0.08005289733409882, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06687554717063904, "rewards/VisualPerceptionAccuracy/std": 0.15118442475795746, "step": 369, "train_speed(iter/s)": 0.005067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 151.609375, "completions/min_length": 15.0, "epoch": 0.012618081369573372, "frac_reward_zero_std": 0.0, "grad_norm": 3.7308502197265625, "kl": 0.026280954480171204, "learning_rate": 6.305385139740969e-07, "loss": -0.006071016192436218, "memory(GiB)": 69.45, "reward": 0.35149478912353516, "reward_std": 0.15159741044044495, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.7906249761581421, "rewards/RMReward/std": 0.06884463876485825, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.14548955857753754, "rewards/VisualPerceptionAccuracy/std": 0.08539096266031265, "step": 370, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/mean_length": 61.859375, "completions/min_length": 8.0, "epoch": 0.01265218429219384, "frac_reward_zero_std": 0.0, "grad_norm": 9.811009407043457, "kl": 0.7794462442398071, "learning_rate": 6.322426721199727e-07, "loss": 0.016253536567091942, "memory(GiB)": 69.45, "reward": 0.9035347700119019, "reward_std": 0.14859038591384888, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9347222447395325, "rewards/PlanningActionSetORM/std": 0.07511772215366364, "rewards/RMReward/mean": 0.8493750095367432, "rewards/RMReward/std": 0.07918689399957657, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.24593468010425568, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 371, "train_speed(iter/s)": 0.005075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 163.734375, "completions/min_length": 14.0, "epoch": 0.01268628721481431, "frac_reward_zero_std": 0.0, "grad_norm": 2.55371356010437, "kl": 0.030613936483860016, "learning_rate": 6.339468302658487e-07, "loss": -0.040734272450208664, "memory(GiB)": 69.45, "reward": 0.8573213815689087, "reward_std": 0.1266857534646988, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9686012268066406, "rewards/PlanningActionSetORM/std": 0.03544701263308525, "rewards/RMReward/mean": 0.7947916984558105, "rewards/RMReward/std": 0.15752054750919342, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 372, "train_speed(iter/s)": 0.005077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 11.546875, "completions/min_length": 8.0, "epoch": 0.012720390137434779, "frac_reward_zero_std": 1.0, "grad_norm": 0.012921950779855251, "kl": 0.551144540309906, "learning_rate": 6.356509884117247e-07, "loss": 0.0005513461655937135, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 373, "train_speed(iter/s)": 0.005079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 146.328125, "completions/min_length": 8.0, "epoch": 0.012754493060055246, "frac_reward_zero_std": 0.0, "grad_norm": 9.659248352050781, "kl": 0.4792747497558594, "learning_rate": 6.373551465576006e-07, "loss": 0.017458105459809303, "memory(GiB)": 69.45, "reward": 0.6698307991027832, "reward_std": 0.13711509108543396, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9591382741928101, "rewards/PlanningActionSetORM/std": 0.0518825501203537, "rewards/RMReward/mean": 0.8218750357627869, "rewards/RMReward/std": 0.09749896824359894, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.09941770136356354, "rewards/VisualPerceptionAccuracy/std": 0.10343368351459503, "step": 374, "train_speed(iter/s)": 0.00508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 147.515625, "completions/min_length": 82.0, "epoch": 0.012788595982675715, "frac_reward_zero_std": 0.0, "grad_norm": 0.7157317399978638, "kl": 0.03561176359653473, "learning_rate": 6.390593047034766e-07, "loss": 0.01544945128262043, "memory(GiB)": 69.45, "reward": 0.6554405093193054, "reward_std": 0.058163680136203766, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8191666603088379, "rewards/RMReward/std": 0.12658919394016266, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.055762115865945816, "rewards/VisualPerceptionAccuracy/std": 0.020027877762913704, "step": 375, "train_speed(iter/s)": 0.005075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/mean_length": 158.46875, "completions/min_length": 8.0, "epoch": 0.012822698905296184, "frac_reward_zero_std": 0.0, "grad_norm": 5.956635475158691, "kl": 0.509128212928772, "learning_rate": 6.407634628493525e-07, "loss": 0.01021464355289936, "memory(GiB)": 69.45, "reward": 0.49763453006744385, "reward_std": 0.15669015049934387, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.887499988079071, "rewards/RMReward/std": 0.06952216476202011, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.1293315589427948, "rewards/VisualPerceptionAccuracy/std": 0.1384287178516388, "step": 376, "train_speed(iter/s)": 0.005078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 218.71875, "completions/min_length": 8.0, "epoch": 0.012856801827916653, "frac_reward_zero_std": 0.0, "grad_norm": 24.470989227294922, "kl": 0.4733067452907562, "learning_rate": 6.424676209952284e-07, "loss": 0.00700244028121233, "memory(GiB)": 69.45, "reward": 0.391300231218338, "reward_std": 0.21714499592781067, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9535484313964844, "rewards/PlanningActionSetORM/std": 0.025853564962744713, "rewards/RMReward/mean": 0.6343749761581421, "rewards/RMReward/std": 0.13505400717258453, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": 0.1709955930709839, "rewards/VisualPerceptionAccuracy/std": 0.1353432685136795, "step": 377, "train_speed(iter/s)": 0.00508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 135.0, "completions/min_length": 83.0, "epoch": 0.01289090475053712, "frac_reward_zero_std": 0.0, "grad_norm": 0.5935525894165039, "kl": 0.031892817467451096, "learning_rate": 6.441717791411044e-07, "loss": -0.000324321910738945, "memory(GiB)": 69.45, "reward": 0.8401406407356262, "reward_std": 0.05629011243581772, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9800781011581421, "rewards/PlanningActionSetORM/std": 0.043645139783620834, "rewards/RMReward/mean": 0.8051562309265137, "rewards/RMReward/std": 0.17754939198493958, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 378, "train_speed(iter/s)": 0.00508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 122.671875, "completions/min_length": 8.0, "epoch": 0.01292500767315759, "frac_reward_zero_std": 0.0, "grad_norm": 17.57994270324707, "kl": 0.30433890223503113, "learning_rate": 6.458759372869803e-07, "loss": 0.019965248182415962, "memory(GiB)": 69.45, "reward": 0.8484687805175781, "reward_std": 0.1506139039993286, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9958333373069763, "rewards/PlanningActionSetORM/std": 0.020194098353385925, "rewards/RMReward/mean": 0.8227083683013916, "rewards/RMReward/std": 0.0969039723277092, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 379, "train_speed(iter/s)": 0.005078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 150.953125, "completions/min_length": 2.0, "epoch": 0.012959110595778058, "frac_reward_zero_std": 0.0, "grad_norm": 26.267677307128906, "kl": 0.058306824415922165, "learning_rate": 6.475800954328562e-07, "loss": -0.08998194336891174, "memory(GiB)": 69.45, "reward": 0.6615546941757202, "reward_std": 0.16036555171012878, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9195311665534973, "rewards/PlanningActionSetORM/std": 0.1547541320323944, "rewards/RMReward/mean": 0.7945833206176758, "rewards/RMReward/std": 0.11347430944442749, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1875, "rewards/VisualPerceptionAccuracy/std": 0.40311288833618164, "step": 380, "train_speed(iter/s)": 0.005075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 10.140625, "completions/min_length": 8.0, "epoch": 0.012993213518398527, "frac_reward_zero_std": 0.0, "grad_norm": 19.446125030517578, "kl": 1.248046875, "learning_rate": 6.492842535787322e-07, "loss": -0.012898769229650497, "memory(GiB)": 69.45, "reward": 0.5546875, "reward_std": 0.3984794020652771, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.53125, "rewards/SpatialReasoningORM/std": 0.5029674172401428, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 381, "train_speed(iter/s)": 0.005087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 185.90625, "completions/min_length": 93.0, "epoch": 0.013027316441018994, "frac_reward_zero_std": 0.0, "grad_norm": 0.8045307397842407, "kl": 0.03674007207155228, "learning_rate": 6.509884117246081e-07, "loss": -0.027425279840826988, "memory(GiB)": 69.45, "reward": 0.6987142562866211, "reward_std": 0.05904275178909302, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9528409838676453, "rewards/PlanningActionSetORM/std": 0.09533991664648056, "rewards/RMReward/mean": 0.888124942779541, "rewards/RMReward/std": 0.07465256005525589, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09165240824222565, "rewards/VisualPerceptionAccuracy/std": 0.03591272607445717, "step": 382, "train_speed(iter/s)": 0.005081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 118.5625, "completions/min_length": 8.0, "epoch": 0.013061419363639463, "frac_reward_zero_std": 0.0, "grad_norm": 15.82660961151123, "kl": 0.6536732912063599, "learning_rate": 6.526925698704841e-07, "loss": -0.025367939844727516, "memory(GiB)": 69.45, "reward": 0.43086010217666626, "reward_std": 0.21726742386817932, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7875000238418579, "rewards/RMReward/std": 0.06454972177743912, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4709290862083435, "rewards/VisualPerceptionAccuracy/mean": 0.1996903270483017, "rewards/VisualPerceptionAccuracy/std": 0.09319975972175598, "step": 383, "train_speed(iter/s)": 0.005086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/mean_length": 61.546875, "completions/min_length": 8.0, "epoch": 0.013095522286259932, "frac_reward_zero_std": 0.0, "grad_norm": 15.874649047851562, "kl": 0.6622673869132996, "learning_rate": 6.5439672801636e-07, "loss": 0.02555866539478302, "memory(GiB)": 69.45, "reward": 0.6270483732223511, "reward_std": 0.219709575176239, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9379836320877075, "rewards/PlanningActionSetORM/std": 0.09038233757019043, "rewards/RMReward/mean": 0.8253124952316284, "rewards/RMReward/std": 0.14628703892230988, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.49186936020851135, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 384, "train_speed(iter/s)": 0.00509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/mean_length": 194.796875, "completions/min_length": 99.0, "epoch": 0.013129625208880401, "frac_reward_zero_std": 0.0, "grad_norm": 0.4566393494606018, "kl": 0.03595701977610588, "learning_rate": 6.561008861622359e-07, "loss": -0.005258901044726372, "memory(GiB)": 69.45, "reward": 0.7959516048431396, "reward_std": 0.07821876555681229, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.984133243560791, "rewards/PlanningActionSetORM/std": 0.02954111620783806, "rewards/RMReward/mean": 0.7489062547683716, "rewards/RMReward/std": 0.19911006093025208, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 385, "train_speed(iter/s)": 0.005085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/mean_length": 105.625, "completions/min_length": 14.0, "epoch": 0.013163728131500869, "frac_reward_zero_std": 0.0, "grad_norm": 4.487667560577393, "kl": 0.031046777963638306, "learning_rate": 6.578050443081119e-07, "loss": 0.005256105214357376, "memory(GiB)": 69.45, "reward": 0.5059080123901367, "reward_std": 0.11236575990915298, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.0707106739282608, "rewards/RMReward/mean": 0.8543750047683716, "rewards/RMReward/std": 0.13124556839466095, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.15225696563720703, "rewards/VisualPerceptionAccuracy/std": 0.016327714547514915, "step": 386, "train_speed(iter/s)": 0.005086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 162.15625, "completions/min_length": 14.0, "epoch": 0.013197831054121338, "frac_reward_zero_std": 0.0, "grad_norm": 4.531765460968018, "kl": 0.03441854566335678, "learning_rate": 6.595092024539878e-07, "loss": 0.07167820632457733, "memory(GiB)": 69.45, "reward": 0.6373130083084106, "reward_std": 0.13606326282024384, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9009374976158142, "rewards/RMReward/std": 0.06688262522220612, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": 0.004626926500350237, "rewards/VisualPerceptionAccuracy/std": 0.0185077041387558, "step": 387, "train_speed(iter/s)": 0.00509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 95.453125, "completions/min_length": 8.0, "epoch": 0.013231933976741807, "frac_reward_zero_std": 0.0, "grad_norm": 4.618619441986084, "kl": 0.38650721311569214, "learning_rate": 6.612133605998637e-07, "loss": -0.02400306798517704, "memory(GiB)": 69.45, "reward": 0.8769062757492065, "reward_std": 0.10076236724853516, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8195834159851074, "rewards/RMReward/std": 0.1980324238538742, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 388, "train_speed(iter/s)": 0.005088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/mean_length": 238.75, "completions/min_length": 83.0, "epoch": 0.013266036899362276, "frac_reward_zero_std": 0.0, "grad_norm": 1.1008973121643066, "kl": 0.0476832240819931, "learning_rate": 6.629175187457397e-07, "loss": -0.04925195872783661, "memory(GiB)": 69.45, "reward": 0.22579318284988403, "reward_std": 0.056768715381622314, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9930555820465088, "rewards/PlanningActionSetORM/std": 0.0277777761220932, "rewards/RMReward/mean": 0.6281249523162842, "rewards/RMReward/std": 0.13658788800239563, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06735385209321976, "rewards/VisualPerceptionAccuracy/std": 0.0644267201423645, "step": 389, "train_speed(iter/s)": 0.005085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/mean_length": 115.90625, "completions/min_length": 8.0, "epoch": 0.013300139821982745, "frac_reward_zero_std": 0.0, "grad_norm": 14.165087699890137, "kl": 0.4094739556312561, "learning_rate": 6.646216768916156e-07, "loss": -0.039736054837703705, "memory(GiB)": 69.45, "reward": 0.710031270980835, "reward_std": 0.19956724345684052, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7393749356269836, "rewards/RMReward/std": 0.20549272000789642, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 390, "train_speed(iter/s)": 0.005086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/mean_length": 225.4375, "completions/min_length": 101.0, "epoch": 0.013334242744603212, "frac_reward_zero_std": 0.0, "grad_norm": 0.8408118486404419, "kl": 0.03779301047325134, "learning_rate": 6.663258350374916e-07, "loss": -0.10180748999118805, "memory(GiB)": 69.45, "reward": 0.5173502564430237, "reward_std": 0.08492738753557205, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9704522490501404, "rewards/PlanningActionSetORM/std": 0.041626401245594025, "rewards/RMReward/mean": 0.6164583563804626, "rewards/RMReward/std": 0.1363738477230072, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0076295603066682816, "rewards/VisualPerceptionAccuracy/std": 0.030518243089318275, "step": 391, "train_speed(iter/s)": 0.005082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 213.09375, "completions/min_length": 66.0, "epoch": 0.013368345667223681, "frac_reward_zero_std": 0.0, "grad_norm": 0.7673869729042053, "kl": 0.03622743487358093, "learning_rate": 6.680299931833676e-07, "loss": 0.0027324333786964417, "memory(GiB)": 69.45, "reward": 0.6853574514389038, "reward_std": 0.07024219632148743, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9549617171287537, "rewards/PlanningActionSetORM/std": 0.04146895930171013, "rewards/RMReward/mean": 0.8268750309944153, "rewards/RMReward/std": 0.10225450247526169, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.18395274877548218, "rewards/VisualPerceptionAccuracy/std": 0.09872056543827057, "step": 392, "train_speed(iter/s)": 0.005081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/mean_length": 34.5, "completions/min_length": 8.0, "epoch": 0.01340244858984415, "frac_reward_zero_std": 0.0, "grad_norm": 9.134946823120117, "kl": 0.6916907429695129, "learning_rate": 6.697341513292434e-07, "loss": -0.00922984816133976, "memory(GiB)": 69.45, "reward": 0.6698437333106995, "reward_std": 0.25214898586273193, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7593749761581421, "rewards/RMReward/std": 0.027195289731025696, "rewards/SpatialReasoningORM/mean": 0.6041666865348816, "rewards/SpatialReasoningORM/std": 0.49420401453971863, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 393, "train_speed(iter/s)": 0.005089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/mean_length": 95.78125, "completions/min_length": 8.0, "epoch": 0.013436551512464619, "frac_reward_zero_std": 0.0, "grad_norm": 19.257781982421875, "kl": 0.4254208505153656, "learning_rate": 6.714383094751194e-07, "loss": 0.0182246845215559, "memory(GiB)": 69.45, "reward": 0.7644270658493042, "reward_std": 0.1684466004371643, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.996180534362793, "rewards/PlanningActionSetORM/std": 0.01859090104699135, "rewards/RMReward/mean": 0.8062500357627869, "rewards/RMReward/std": 0.09654951840639114, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 394, "train_speed(iter/s)": 0.005084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1719.0, "completions/mean_length": 116.578125, "completions/min_length": 8.0, "epoch": 0.013470654435085086, "frac_reward_zero_std": 0.75, "grad_norm": 0.5603694319725037, "kl": 0.5723918676376343, "learning_rate": 6.731424676209952e-07, "loss": 0.011168370023369789, "memory(GiB)": 69.45, "reward": 0.7550222277641296, "reward_std": 0.005029338877648115, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.02008889988064766, "rewards/VisualPerceptionAccuracy/std": 0.02011735364794731, "step": 395, "train_speed(iter/s)": 0.005079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/mean_length": 184.09375, "completions/min_length": 80.0, "epoch": 0.013504757357705555, "frac_reward_zero_std": 0.0, "grad_norm": 0.6024854183197021, "kl": 0.04404645785689354, "learning_rate": 6.748466257668713e-07, "loss": 0.02266225777566433, "memory(GiB)": 69.45, "reward": 0.7827638983726501, "reward_std": 0.06494900584220886, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9325693845748901, "rewards/PlanningActionSetORM/std": 0.07920264452695847, "rewards/RMReward/mean": 0.745312511920929, "rewards/RMReward/std": 0.08485223352909088, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 396, "train_speed(iter/s)": 0.005079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 268.84375, "completions/min_length": 95.0, "epoch": 0.013538860280326024, "frac_reward_zero_std": 0.0, "grad_norm": 0.681078314781189, "kl": 0.03552109748125076, "learning_rate": 6.765507839127472e-07, "loss": -0.025065673515200615, "memory(GiB)": 69.45, "reward": 0.7016306519508362, "reward_std": 0.08950787782669067, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8708333373069763, "rewards/RMReward/std": 0.11960632354021072, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.11652280390262604, "rewards/VisualPerceptionAccuracy/std": 0.09265892952680588, "step": 397, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 133.6875, "completions/min_length": 8.0, "epoch": 0.013572963202946493, "frac_reward_zero_std": 0.0, "grad_norm": 27.08705711364746, "kl": 0.7938302755355835, "learning_rate": 6.78254942058623e-07, "loss": -0.060368478298187256, "memory(GiB)": 69.45, "reward": 0.6606249809265137, "reward_std": 0.26957422494888306, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.984375, "rewards/PlanningActionSetORM/std": 0.0883883461356163, "rewards/RMReward/mean": 0.823437511920929, "rewards/RMReward/std": 0.06348021328449249, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.504016101360321, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 398, "train_speed(iter/s)": 0.005068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 297.546875, "completions/min_length": 113.0, "epoch": 0.01360706612556696, "frac_reward_zero_std": 0.0, "grad_norm": 0.5315088033676147, "kl": 0.030005773529410362, "learning_rate": 6.799591002044991e-07, "loss": 0.03264927864074707, "memory(GiB)": 69.45, "reward": 0.6167799234390259, "reward_std": 0.08445829153060913, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9450898170471191, "rewards/PlanningActionSetORM/std": 0.023344991728663445, "rewards/RMReward/mean": 0.7583333849906921, "rewards/RMReward/std": 0.11124696582555771, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08006580919027328, "rewards/VisualPerceptionAccuracy/std": 0.1127348244190216, "step": 399, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/mean_length": 65.40625, "completions/min_length": 8.0, "epoch": 0.01364116904818743, "frac_reward_zero_std": 0.0, "grad_norm": 11.610371589660645, "kl": 0.6342613697052002, "learning_rate": 6.816632583503751e-07, "loss": -0.049727242439985275, "memory(GiB)": 69.45, "reward": 0.8129062652587891, "reward_std": 0.2251531481742859, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9765625, "rewards/PlanningActionSetORM/std": 0.046055786311626434, "rewards/RMReward/mean": 0.8350000381469727, "rewards/RMReward/std": 0.14721062779426575, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.4399413466453552, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 400, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 166.96875, "completions/min_length": 9.0, "epoch": 0.013675271970807899, "frac_reward_zero_std": 0.5, "grad_norm": 0.37758198380470276, "kl": 0.30909669399261475, "learning_rate": 6.833674164962508e-07, "loss": -0.07977592945098877, "memory(GiB)": 69.45, "reward": 0.8806250095367432, "reward_std": 0.052209462970495224, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.956250011920929, "rewards/PlanningActionSetORM/std": 0.13897666335105896, "rewards/RMReward/mean": 0.7125000357627869, "rewards/RMReward/std": 0.142557293176651, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 401, "train_speed(iter/s)": 0.00505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 132.4375, "completions/min_length": 99.0, "epoch": 0.013709374893428368, "frac_reward_zero_std": 0.0, "grad_norm": 0.6352049708366394, "kl": 0.04021678492426872, "learning_rate": 6.850715746421269e-07, "loss": 1.3333279639482498e-05, "memory(GiB)": 69.45, "reward": 0.7383752465248108, "reward_std": 0.05523773282766342, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9583333134651184, "rewards/PlanningActionSetORM/std": 0.059549134224653244, "rewards/RMReward/mean": 0.9652083516120911, "rewards/RMReward/std": 0.06024734303355217, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06200101599097252, "rewards/VisualPerceptionAccuracy/std": 0.1344725340604782, "step": 402, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/mean_length": 132.359375, "completions/min_length": 8.0, "epoch": 0.013743477816048835, "frac_reward_zero_std": 0.0, "grad_norm": 3.5272088050842285, "kl": 0.358977735042572, "learning_rate": 6.867757327880027e-07, "loss": 0.0014690179377794266, "memory(GiB)": 69.45, "reward": 0.7023107409477234, "reward_std": 0.11284612864255905, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9545454978942871, "rewards/PlanningActionSetORM/std": 0.046181850135326385, "rewards/RMReward/mean": 0.859375, "rewards/RMReward/std": 0.11103435605764389, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.11179979890584946, "rewards/VisualPerceptionAccuracy/std": 0.04409690573811531, "step": 403, "train_speed(iter/s)": 0.005053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 223.6875, "completions/min_length": 86.0, "epoch": 0.013777580738669304, "frac_reward_zero_std": 0.0, "grad_norm": 0.6145146489143372, "kl": 0.03418319672346115, "learning_rate": 6.884798909338787e-07, "loss": 0.0009155753068625927, "memory(GiB)": 69.45, "reward": 0.6488364934921265, "reward_std": 0.060847483575344086, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9093623757362366, "rewards/PlanningActionSetORM/std": 0.027891265228390694, "rewards/RMReward/mean": 0.7995832562446594, "rewards/RMReward/std": 0.09646867215633392, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.13072845339775085, "rewards/VisualPerceptionAccuracy/std": 0.0513584241271019, "step": 404, "train_speed(iter/s)": 0.005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 197.4375, "completions/min_length": 80.0, "epoch": 0.013811683661289773, "frac_reward_zero_std": 0.0, "grad_norm": 0.6212000846862793, "kl": 0.038112521171569824, "learning_rate": 6.901840490797547e-07, "loss": 0.00205293670296669, "memory(GiB)": 69.45, "reward": 0.6927293539047241, "reward_std": 0.0658067911863327, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9805334210395813, "rewards/PlanningActionSetORM/std": 0.02812463417649269, "rewards/RMReward/mean": 0.8395833969116211, "rewards/RMReward/std": 0.14012089371681213, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1675974577665329, "rewards/VisualPerceptionAccuracy/std": 0.09106248617172241, "step": 405, "train_speed(iter/s)": 0.00505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/mean_length": 199.203125, "completions/min_length": 96.0, "epoch": 0.013845786583910242, "frac_reward_zero_std": 0.0, "grad_norm": 0.5164350271224976, "kl": 0.039434902369976044, "learning_rate": 6.918882072256305e-07, "loss": 0.018996145576238632, "memory(GiB)": 69.45, "reward": 0.8367530107498169, "reward_std": 0.08232679963111877, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.951890230178833, "rewards/PlanningActionSetORM/std": 0.04819236323237419, "rewards/RMReward/mean": 0.8079687356948853, "rewards/RMReward/std": 0.1559831202030182, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 406, "train_speed(iter/s)": 0.005049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/mean_length": 146.4375, "completions/min_length": 8.0, "epoch": 0.01387988950653071, "frac_reward_zero_std": 0.0, "grad_norm": 11.072874069213867, "kl": 0.3886912763118744, "learning_rate": 6.935923653715065e-07, "loss": -0.03319701924920082, "memory(GiB)": 69.45, "reward": 0.432765394449234, "reward_std": 0.22094173729419708, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9090909361839294, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9156249761581421, "rewards/RMReward/std": 0.09542667865753174, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.49186936020851135, "rewards/VisualPerceptionAccuracy/mean": 0.0042433543130755424, "rewards/VisualPerceptionAccuracy/std": 0.007938865572214127, "step": 407, "train_speed(iter/s)": 0.005047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 134.453125, "completions/min_length": 2.0, "epoch": 0.013913992429151178, "frac_reward_zero_std": 0.0, "grad_norm": 61.622615814208984, "kl": 0.08432707190513611, "learning_rate": 6.952965235173826e-07, "loss": -0.024414600804448128, "memory(GiB)": 69.45, "reward": 0.7986249923706055, "reward_std": 0.17699924111366272, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7685416340827942, "rewards/RMReward/std": 0.16907745599746704, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.75, "rewards/VisualPerceptionAccuracy/std": 0.44721361994743347, "step": 408, "train_speed(iter/s)": 0.005046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 168.890625, "completions/min_length": 107.0, "epoch": 0.013948095351771647, "frac_reward_zero_std": 0.0, "grad_norm": 0.7621544003486633, "kl": 0.04054006561636925, "learning_rate": 6.970006816632583e-07, "loss": 0.021077683195471764, "memory(GiB)": 69.45, "reward": 0.6761959195137024, "reward_std": 0.07682199031114578, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9985119700431824, "rewards/PlanningActionSetORM/std": 0.010309829376637936, "rewards/RMReward/mean": 0.8212500214576721, "rewards/RMReward/std": 0.10509620606899261, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.13467645645141602, "rewards/VisualPerceptionAccuracy/std": 0.06163617596030235, "step": 409, "train_speed(iter/s)": 0.005049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 128.703125, "completions/min_length": 8.0, "epoch": 0.013982198274392116, "frac_reward_zero_std": 0.0, "grad_norm": 10.55576229095459, "kl": 0.4405179023742676, "learning_rate": 6.987048398091343e-07, "loss": -0.01190480962395668, "memory(GiB)": 69.45, "reward": 0.8774532675743103, "reward_std": 0.1602354347705841, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9736467003822327, "rewards/PlanningActionSetORM/std": 0.037971798330545425, "rewards/RMReward/mean": 0.9260416626930237, "rewards/RMReward/std": 0.10395865887403488, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 410, "train_speed(iter/s)": 0.005045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 152.609375, "completions/min_length": 8.0, "epoch": 0.014016301197012583, "frac_reward_zero_std": 0.0, "grad_norm": 5.466578006744385, "kl": 0.27482831478118896, "learning_rate": 7.004089979550102e-07, "loss": -0.0009460793808102608, "memory(GiB)": 69.45, "reward": 0.8952326774597168, "reward_std": 0.11381669342517853, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9405092597007751, "rewards/PlanningActionSetORM/std": 0.11041124910116196, "rewards/RMReward/mean": 0.8650000095367432, "rewards/RMReward/std": 0.08666393905878067, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 411, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/mean_length": 105.21875, "completions/min_length": 14.0, "epoch": 0.014050404119633052, "frac_reward_zero_std": 0.0, "grad_norm": 2.5375237464904785, "kl": 0.061204709112644196, "learning_rate": 7.021131561008862e-07, "loss": 0.008641698397696018, "memory(GiB)": 69.45, "reward": 0.6800290942192078, "reward_std": 0.08661012351512909, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9977678656578064, "rewards/PlanningActionSetORM/std": 0.012626911513507366, "rewards/RMReward/mean": 0.859375, "rewards/RMReward/std": 0.07873750478029251, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.005384059157222509, "rewards/VisualPerceptionAccuracy/std": 0.021536236628890038, "step": 412, "train_speed(iter/s)": 0.005046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 170.921875, "completions/min_length": 79.0, "epoch": 0.014084507042253521, "frac_reward_zero_std": 0.0, "grad_norm": 0.8075844645500183, "kl": 0.04478128254413605, "learning_rate": 7.038173142467621e-07, "loss": -0.031775109469890594, "memory(GiB)": 69.45, "reward": 0.4665907919406891, "reward_std": 0.07531797140836716, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8371874690055847, "rewards/RMReward/std": 0.08832616358995438, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06343156099319458, "rewards/VisualPerceptionAccuracy/std": 0.16101567447185516, "step": 413, "train_speed(iter/s)": 0.005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/mean_length": 45.078125, "completions/min_length": 2.0, "epoch": 0.01411860996487399, "frac_reward_zero_std": 0.0, "grad_norm": 38.18312072753906, "kl": 0.22790570557117462, "learning_rate": 7.05521472392638e-07, "loss": 0.0032078996300697327, "memory(GiB)": 69.45, "reward": 0.8592812418937683, "reward_std": 0.20478928089141846, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9196875095367432, "rewards/RMReward/std": 0.06382860988378525, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.625, "rewards/VisualPerceptionAccuracy/std": 0.5, "step": 414, "train_speed(iter/s)": 0.005045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/mean_length": 181.0625, "completions/min_length": 87.0, "epoch": 0.014152712887494458, "frac_reward_zero_std": 0.0, "grad_norm": 0.6936892867088318, "kl": 0.04427436739206314, "learning_rate": 7.07225630538514e-07, "loss": -0.0014942698180675507, "memory(GiB)": 69.45, "reward": 0.6314427256584167, "reward_std": 0.06300897151231766, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9648036956787109, "rewards/PlanningActionSetORM/std": 0.04340188950300217, "rewards/RMReward/mean": 0.7829167246818542, "rewards/RMReward/std": 0.1401512622833252, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06788872182369232, "rewards/VisualPerceptionAccuracy/std": 0.05436552315950394, "step": 415, "train_speed(iter/s)": 0.005046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/mean_length": 251.015625, "completions/min_length": 15.0, "epoch": 0.014186815810114927, "frac_reward_zero_std": 0.0, "grad_norm": 8.840636253356934, "kl": 0.03403105214238167, "learning_rate": 7.0892978868439e-07, "loss": -0.0041390061378479, "memory(GiB)": 69.45, "reward": 0.2504805624485016, "reward_std": 0.13782823085784912, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.040224067866802216, "rewards/VisualPerceptionAccuracy/std": 0.10322099924087524, "step": 416, "train_speed(iter/s)": 0.005047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/mean_length": 125.09375, "completions/min_length": 2.0, "epoch": 0.014220918732735396, "frac_reward_zero_std": 0.0, "grad_norm": 12.812562942504883, "kl": 0.03167351335287094, "learning_rate": 7.106339468302658e-07, "loss": 0.007599828764796257, "memory(GiB)": 69.45, "reward": 0.6779974102973938, "reward_std": 0.11297501623630524, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9429687261581421, "rewards/PlanningActionSetORM/std": 0.06196850538253784, "rewards/RMReward/mean": 0.8590624928474426, "rewards/RMReward/std": 0.08317720144987106, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4801510274410248, "rewards/VisualPerceptionAccuracy/std": 0.4992693066596985, "step": 417, "train_speed(iter/s)": 0.005047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 87.9375, "completions/min_length": 8.0, "epoch": 0.014255021655355865, "frac_reward_zero_std": 0.0, "grad_norm": 20.943195343017578, "kl": 0.4982563257217407, "learning_rate": 7.123381049761418e-07, "loss": 0.0022310595959424973, "memory(GiB)": 69.45, "reward": 0.699874997138977, "reward_std": 0.2653246521949768, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.8590624928474426, "rewards/RMReward/std": 0.08509422838687897, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 418, "train_speed(iter/s)": 0.005048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/mean_length": 186.21875, "completions/min_length": 107.0, "epoch": 0.014289124577976332, "frac_reward_zero_std": 0.0, "grad_norm": 0.7007050514221191, "kl": 0.03478127717971802, "learning_rate": 7.140422631220177e-07, "loss": -0.018193669617176056, "memory(GiB)": 69.45, "reward": 0.4156625270843506, "reward_std": 0.08134239912033081, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8902901411056519, "rewards/PlanningActionSetORM/std": 0.03697976469993591, "rewards/RMReward/mean": 0.7165625095367432, "rewards/RMReward/std": 0.16334426403045654, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08001705259084702, "rewards/VisualPerceptionAccuracy/std": 0.072938472032547, "step": 419, "train_speed(iter/s)": 0.005048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/mean_length": 207.78125, "completions/min_length": 93.0, "epoch": 0.014323227500596801, "frac_reward_zero_std": 0.0, "grad_norm": 0.6745275855064392, "kl": 0.03780486434698105, "learning_rate": 7.157464212678937e-07, "loss": 0.011940972879529, "memory(GiB)": 69.45, "reward": 0.6428709030151367, "reward_std": 0.04257950559258461, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9127232432365417, "rewards/PlanningActionSetORM/std": 0.08009038120508194, "rewards/RMReward/mean": 0.75, "rewards/RMReward/std": 0.07579368352890015, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.22384969890117645, "rewards/VisualPerceptionAccuracy/std": 0.03178897127509117, "step": 420, "train_speed(iter/s)": 0.005045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/mean_length": 98.328125, "completions/min_length": 14.0, "epoch": 0.01435733042321727, "frac_reward_zero_std": 0.0, "grad_norm": 4.236474514007568, "kl": 0.04158113896846771, "learning_rate": 7.174505794137697e-07, "loss": -0.0027131661772727966, "memory(GiB)": 69.45, "reward": 0.8828749656677246, "reward_std": 0.14963862299919128, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9037500023841858, "rewards/RMReward/std": 0.08489354699850082, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 421, "train_speed(iter/s)": 0.005045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 149.53125, "completions/min_length": 99.0, "epoch": 0.014391433345837739, "frac_reward_zero_std": 0.0, "grad_norm": 0.8415261507034302, "kl": 0.05067490041255951, "learning_rate": 7.191547375596455e-07, "loss": 0.007452432531863451, "memory(GiB)": 69.45, "reward": 0.6953496336936951, "reward_std": 0.04383312910795212, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9689235687255859, "rewards/PlanningActionSetORM/std": 0.052154798060655594, "rewards/RMReward/mean": 0.8897916674613953, "rewards/RMReward/std": 0.1020948588848114, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.06454435735940933, "rewards/VisualPerceptionAccuracy/std": 0.042162418365478516, "step": 422, "train_speed(iter/s)": 0.005045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/mean_length": 146.765625, "completions/min_length": 76.0, "epoch": 0.014425536268458206, "frac_reward_zero_std": 0.0, "grad_norm": 1.0652965307235718, "kl": 0.05851977318525314, "learning_rate": 7.208588957055215e-07, "loss": 0.016408901661634445, "memory(GiB)": 69.45, "reward": 0.6553103923797607, "reward_std": 0.06949518620967865, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9583333134651184, "rewards/PlanningActionSetORM/std": 0.059549134224653244, "rewards/RMReward/mean": 0.8243749737739563, "rewards/RMReward/std": 0.10815486311912537, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0677414983510971, "rewards/VisualPerceptionAccuracy/std": 0.09497284889221191, "step": 423, "train_speed(iter/s)": 0.005046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/mean_length": 62.625, "completions/min_length": 14.0, "epoch": 0.014459639191078675, "frac_reward_zero_std": 0.5, "grad_norm": 0.9419928789138794, "kl": 0.043806180357933044, "learning_rate": 7.225630538513975e-07, "loss": -0.018159398809075356, "memory(GiB)": 69.45, "reward": 0.7330045700073242, "reward_std": 0.02932886965572834, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.890625, "rewards/RMReward/std": 0.04905352741479874, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.019518166780471802, "rewards/VisualPerceptionAccuracy/std": 0.07807266712188721, "step": 424, "train_speed(iter/s)": 0.005044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 183.03125, "completions/min_length": 14.0, "epoch": 0.014493742113699144, "frac_reward_zero_std": 0.0, "grad_norm": 1.0422261953353882, "kl": 0.03192015737295151, "learning_rate": 7.242672119972734e-07, "loss": 0.03859039396047592, "memory(GiB)": 69.45, "reward": 0.649376392364502, "reward_std": 0.10766948759555817, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9493844509124756, "rewards/PlanningActionSetORM/std": 0.09281457960605621, "rewards/RMReward/mean": 0.799375057220459, "rewards/RMReward/std": 0.11092275381088257, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 425, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 220.25, "completions/min_length": 99.0, "epoch": 0.014527845036319613, "frac_reward_zero_std": 0.0, "grad_norm": 0.6939671635627747, "kl": 0.04797326400876045, "learning_rate": 7.259713701431493e-07, "loss": 0.011101841926574707, "memory(GiB)": 69.45, "reward": 0.6531656384468079, "reward_std": 0.07564224302768707, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9552083611488342, "rewards/PlanningActionSetORM/std": 0.03731998801231384, "rewards/RMReward/mean": 0.8174999356269836, "rewards/RMReward/std": 0.1270700991153717, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07753767818212509, "rewards/VisualPerceptionAccuracy/std": 0.03200438618659973, "step": 426, "train_speed(iter/s)": 0.005044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/mean_length": 256.1875, "completions/min_length": 89.0, "epoch": 0.01456194795894008, "frac_reward_zero_std": 0.0, "grad_norm": 0.8990611433982849, "kl": 0.029941482469439507, "learning_rate": 7.276755282890252e-07, "loss": 0.023374423384666443, "memory(GiB)": 69.45, "reward": 0.30646729469299316, "reward_std": 0.02702564001083374, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.98375004529953, "rewards/RMReward/std": 0.0236290842294693, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.07962311059236526, "rewards/VisualPerceptionAccuracy/std": 0.06881367415189743, "step": 427, "train_speed(iter/s)": 0.005044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 223.984375, "completions/min_length": 109.0, "epoch": 0.01459605088156055, "frac_reward_zero_std": 0.0, "grad_norm": 0.5898703336715698, "kl": 0.043767571449279785, "learning_rate": 7.293796864349012e-07, "loss": -0.02250717394053936, "memory(GiB)": 69.45, "reward": 0.8240771293640137, "reward_std": 0.0680391788482666, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9860104918479919, "rewards/PlanningActionSetORM/std": 0.02727869153022766, "rewards/RMReward/mean": 0.7835937738418579, "rewards/RMReward/std": 0.0963849350810051, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 428, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/mean_length": 157.0625, "completions/min_length": 8.0, "epoch": 0.014630153804181019, "frac_reward_zero_std": 0.0, "grad_norm": 3.8803493976593018, "kl": 0.3570232093334198, "learning_rate": 7.310838445807772e-07, "loss": 0.020918337628245354, "memory(GiB)": 69.45, "reward": 0.8378044366836548, "reward_std": 0.1386333853006363, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9482796788215637, "rewards/PlanningActionSetORM/std": 0.047458868473768234, "rewards/RMReward/mean": 0.7920833230018616, "rewards/RMReward/std": 0.16733147203922272, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 429, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/mean_length": 83.84375, "completions/min_length": 8.0, "epoch": 0.014664256726801488, "frac_reward_zero_std": 0.0, "grad_norm": 16.061634063720703, "kl": 0.464065819978714, "learning_rate": 7.32788002726653e-07, "loss": 0.005761181935667992, "memory(GiB)": 69.45, "reward": 0.8231006860733032, "reward_std": 0.2433740198612213, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.9056249856948853, "rewards/RMReward/std": 0.0714791789650917, "rewards/SpatialReasoningORM/mean": 0.71875, "rewards/SpatialReasoningORM/std": 0.45680341124534607, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 430, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/mean_length": 119.21875, "completions/min_length": 98.0, "epoch": 0.014698359649421955, "frac_reward_zero_std": 0.0, "grad_norm": 0.691938579082489, "kl": 0.05693651735782623, "learning_rate": 7.34492160872529e-07, "loss": 0.022340357303619385, "memory(GiB)": 69.45, "reward": 0.8623697757720947, "reward_std": 0.05560533329844475, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9930989742279053, "rewards/PlanningActionSetORM/std": 0.05520833283662796, "rewards/RMReward/mean": 0.8296874761581421, "rewards/RMReward/std": 0.10849575698375702, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 431, "train_speed(iter/s)": 0.005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 200.296875, "completions/min_length": 54.0, "epoch": 0.014732462572042424, "frac_reward_zero_std": 0.0, "grad_norm": 0.8920049071311951, "kl": 0.03566576912999153, "learning_rate": 7.36196319018405e-07, "loss": -0.02785690687596798, "memory(GiB)": 69.45, "reward": 0.5776665210723877, "reward_std": 0.08720333129167557, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9714808464050293, "rewards/PlanningActionSetORM/std": 0.051762573421001434, "rewards/RMReward/mean": 0.7131249904632568, "rewards/RMReward/std": 0.24156542122364044, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.01627757027745247, "rewards/VisualPerceptionAccuracy/std": 0.06511028110980988, "step": 432, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 194.65625, "completions/min_length": 79.0, "epoch": 0.014766565494662893, "frac_reward_zero_std": 0.0, "grad_norm": 0.4664268493652344, "kl": 0.040001846849918365, "learning_rate": 7.379004771642809e-07, "loss": 0.020993495360016823, "memory(GiB)": 69.45, "reward": 0.8071032166481018, "reward_std": 0.09851079434156418, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9773911237716675, "rewards/PlanningActionSetORM/std": 0.03948013857007027, "rewards/RMReward/mean": 0.7645312547683716, "rewards/RMReward/std": 0.14387968182563782, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 433, "train_speed(iter/s)": 0.005039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/mean_length": 104.09375, "completions/min_length": 8.0, "epoch": 0.014800668417283362, "frac_reward_zero_std": 0.0, "grad_norm": 14.279929161071777, "kl": 0.3675253391265869, "learning_rate": 7.396046353101568e-07, "loss": 0.00028868112713098526, "memory(GiB)": 69.45, "reward": 0.8049051761627197, "reward_std": 0.15317906439304352, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9202008843421936, "rewards/PlanningActionSetORM/std": 0.06874677538871765, "rewards/RMReward/mean": 0.793749988079071, "rewards/RMReward/std": 0.10346394032239914, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 434, "train_speed(iter/s)": 0.005042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 5.25, "completions/min_length": 2.0, "epoch": 0.014834771339903829, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011124114971607924, "kl": 0.3778211772441864, "learning_rate": 7.413087934560328e-07, "loss": 0.0003774641372729093, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 1.0, "rewards/VisualPerceptionAccuracy/std": 0.0, "step": 435, "train_speed(iter/s)": 0.005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/mean_length": 129.984375, "completions/min_length": 93.0, "epoch": 0.014868874262524298, "frac_reward_zero_std": 0.0, "grad_norm": 0.7611417770385742, "kl": 0.06758832931518555, "learning_rate": 7.430129516019087e-07, "loss": -0.03155793622136116, "memory(GiB)": 69.45, "reward": 0.862500011920929, "reward_std": 0.050089336931705475, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.828125, "rewards/RMReward/std": 0.09123273938894272, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 436, "train_speed(iter/s)": 0.00504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 185.25, "completions/min_length": 95.0, "epoch": 0.014902977185144767, "frac_reward_zero_std": 0.0, "grad_norm": 0.7698606252670288, "kl": 0.043112292885780334, "learning_rate": 7.447171097477847e-07, "loss": 0.03787289187312126, "memory(GiB)": 69.45, "reward": 0.6898930668830872, "reward_std": 0.03775346279144287, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8993750214576721, "rewards/RMReward/std": 0.10840048640966415, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0010720863938331604, "rewards/VisualPerceptionAccuracy/std": 0.004288345575332642, "step": 437, "train_speed(iter/s)": 0.005034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 125.109375, "completions/min_length": 94.0, "epoch": 0.014937080107765236, "frac_reward_zero_std": 0.0, "grad_norm": 0.8167955279350281, "kl": 0.06321346759796143, "learning_rate": 7.464212678936605e-07, "loss": 0.01843256503343582, "memory(GiB)": 69.45, "reward": 0.8153437376022339, "reward_std": 0.08749585598707199, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.96484375, "rewards/PlanningActionSetORM/std": 0.05664543807506561, "rewards/RMReward/mean": 0.77796870470047, "rewards/RMReward/std": 0.14502523839473724, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 438, "train_speed(iter/s)": 0.005035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 140.96875, "completions/min_length": 105.0, "epoch": 0.014971183030385703, "frac_reward_zero_std": 0.0, "grad_norm": 0.5753097534179688, "kl": 0.04744359105825424, "learning_rate": 7.481254260395365e-07, "loss": -0.013742068782448769, "memory(GiB)": 69.45, "reward": 0.8848749399185181, "reward_std": 0.058986566960811615, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.85609370470047, "rewards/RMReward/std": 0.11736007779836655, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 439, "train_speed(iter/s)": 0.005031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/mean_length": 132.1875, "completions/min_length": 14.0, "epoch": 0.015005285953006172, "frac_reward_zero_std": 0.0, "grad_norm": 2.094315767288208, "kl": 0.053872451186180115, "learning_rate": 7.498295841854125e-07, "loss": -0.021245576441287994, "memory(GiB)": 69.45, "reward": 0.6927499771118164, "reward_std": 0.1136334091424942, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9552083015441895, "rewards/PlanningActionSetORM/std": 0.07377763092517853, "rewards/RMReward/mean": 0.8702083230018616, "rewards/RMReward/std": 0.12504449486732483, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 440, "train_speed(iter/s)": 0.005033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 202.421875, "completions/min_length": 106.0, "epoch": 0.015039388875626641, "frac_reward_zero_std": 0.0, "grad_norm": 0.7963932752609253, "kl": 0.05384797602891922, "learning_rate": 7.515337423312884e-07, "loss": 0.014133075252175331, "memory(GiB)": 69.45, "reward": 0.7032894492149353, "reward_std": 0.05517955124378204, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9659143090248108, "rewards/PlanningActionSetORM/std": 0.04885994270443916, "rewards/RMReward/mean": 0.8089583516120911, "rewards/RMReward/std": 0.11676926910877228, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2921091914176941, "rewards/VisualPerceptionAccuracy/std": 0.020372651517391205, "step": 441, "train_speed(iter/s)": 0.005032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 162.203125, "completions/min_length": 95.0, "epoch": 0.01507349179824711, "frac_reward_zero_std": 0.0, "grad_norm": 0.8064142465591431, "kl": 0.05606180429458618, "learning_rate": 7.532379004771644e-07, "loss": 0.038296110928058624, "memory(GiB)": 69.45, "reward": 0.6833665370941162, "reward_std": 0.11327415704727173, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9583333134651184, "rewards/PlanningActionSetORM/std": 0.059549134224653244, "rewards/RMReward/mean": 0.8797916769981384, "rewards/RMReward/std": 0.14904646575450897, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.04696618765592575, "rewards/VisualPerceptionAccuracy/std": 0.1878647655248642, "step": 442, "train_speed(iter/s)": 0.005032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/mean_length": 172.046875, "completions/min_length": 91.0, "epoch": 0.015107594720867578, "frac_reward_zero_std": 0.0, "grad_norm": 0.7149699926376343, "kl": 0.05271387845277786, "learning_rate": 7.549420586230403e-07, "loss": -0.014760434627532959, "memory(GiB)": 69.45, "reward": 0.74125736951828, "reward_std": 0.05883089825510979, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9760403037071228, "rewards/PlanningActionSetORM/std": 0.03495002165436745, "rewards/RMReward/mean": 0.8697916865348816, "rewards/RMReward/std": 0.1565382033586502, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.29190534353256226, "rewards/VisualPerceptionAccuracy/std": 0.13217833638191223, "step": 443, "train_speed(iter/s)": 0.005032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/mean_length": 207.40625, "completions/min_length": 87.0, "epoch": 0.015141697643488047, "frac_reward_zero_std": 0.0, "grad_norm": 0.4661935269832611, "kl": 0.054887138307094574, "learning_rate": 7.566462167689162e-07, "loss": 0.048581939190626144, "memory(GiB)": 69.45, "reward": 0.6752816438674927, "reward_std": 0.08713225275278091, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9629629254341125, "rewards/PlanningActionSetORM/std": 0.052932560443878174, "rewards/RMReward/mean": 0.8302083015441895, "rewards/RMReward/std": 0.14686061441898346, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.13084876537322998, "rewards/VisualPerceptionAccuracy/std": 0.08131299167871475, "step": 444, "train_speed(iter/s)": 0.005028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 141.921875, "completions/min_length": 77.0, "epoch": 0.015175800566108516, "frac_reward_zero_std": 0.0, "grad_norm": 0.8607478737831116, "kl": 0.05631104111671448, "learning_rate": 7.583503749147922e-07, "loss": 0.039095424115657806, "memory(GiB)": 69.45, "reward": 0.6936668157577515, "reward_std": 0.06153220310807228, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9670138359069824, "rewards/PlanningActionSetORM/std": 0.04720180481672287, "rewards/RMReward/mean": 0.8145833015441895, "rewards/RMReward/std": 0.08809950947761536, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.23945893347263336, "rewards/VisualPerceptionAccuracy/std": 0.06529693305492401, "step": 445, "train_speed(iter/s)": 0.005025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/mean_length": 174.53125, "completions/min_length": 8.0, "epoch": 0.015209903488728985, "frac_reward_zero_std": 0.0, "grad_norm": 14.680859565734863, "kl": 0.440962553024292, "learning_rate": 7.600545330606681e-07, "loss": 0.042885057628154755, "memory(GiB)": 69.45, "reward": 0.6069192290306091, "reward_std": 0.16058462858200073, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.9031250476837158, "rewards/RMReward/std": 0.0996263176202774, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.1392739862203598, "rewards/VisualPerceptionAccuracy/std": 0.04860931262373924, "step": 446, "train_speed(iter/s)": 0.005024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/mean_length": 110.59375, "completions/min_length": 64.0, "epoch": 0.015244006411349452, "frac_reward_zero_std": 0.0, "grad_norm": 0.8606698513031006, "kl": 0.0499580092728138, "learning_rate": 7.61758691206544e-07, "loss": -0.023004040122032166, "memory(GiB)": 69.45, "reward": 0.8331249952316284, "reward_std": 0.07655565440654755, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.05000000074505806, "rewards/RMReward/mean": 0.797656238079071, "rewards/RMReward/std": 0.12580840289592743, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 447, "train_speed(iter/s)": 0.005026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/mean_length": 199.828125, "completions/min_length": 104.0, "epoch": 0.015278109333969921, "frac_reward_zero_std": 0.0, "grad_norm": 0.9148076176643372, "kl": 0.05013105645775795, "learning_rate": 7.6346284935242e-07, "loss": 0.06426423043012619, "memory(GiB)": 69.45, "reward": 0.7398909330368042, "reward_std": 0.047731027007102966, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.940625011920929, "rewards/RMReward/std": 0.06592730432748795, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.10206375271081924, "rewards/VisualPerceptionAccuracy/std": 0.04179216921329498, "step": 448, "train_speed(iter/s)": 0.005027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/mean_length": 302.875, "completions/min_length": 108.0, "epoch": 0.01531221225659039, "frac_reward_zero_std": 0.0, "grad_norm": 0.8494085073471069, "kl": 0.04338288679718971, "learning_rate": 7.651670074982959e-07, "loss": -0.030489057302474976, "memory(GiB)": 69.45, "reward": 0.30419033765792847, "reward_std": 0.11845714598894119, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.765625, "rewards/RMReward/std": 0.13750000298023224, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.13475380837917328, "rewards/VisualPerceptionAccuracy/std": 0.1336483508348465, "step": 449, "train_speed(iter/s)": 0.005024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 205.34375, "completions/min_length": 73.0, "epoch": 0.015346315179210859, "frac_reward_zero_std": 0.0, "grad_norm": 0.9938344359397888, "kl": 0.04384661465883255, "learning_rate": 7.668711656441719e-07, "loss": 0.014071893878281116, "memory(GiB)": 69.45, "reward": 0.451774924993515, "reward_std": 0.038790248334407806, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.925896167755127, "rewards/PlanningActionSetORM/std": 0.07353174686431885, "rewards/RMReward/mean": 0.8959375023841858, "rewards/RMReward/std": 0.1127045527100563, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0016206181608140469, "rewards/VisualPerceptionAccuracy/std": 0.005968030542135239, "step": 450, "train_speed(iter/s)": 0.005025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/mean_length": 94.609375, "completions/min_length": 7.0, "epoch": 0.015380418101831326, "frac_reward_zero_std": 0.5, "grad_norm": 0.5027854442596436, "kl": 0.5831714868545532, "learning_rate": 7.685753237900478e-07, "loss": 0.026442214846611023, "memory(GiB)": 69.45, "reward": 0.7515955567359924, "reward_std": 0.009072287008166313, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.953125, "rewards/RMReward/std": 0.022126534953713417, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.043882086873054504, "rewards/VisualPerceptionAccuracy/std": 0.018587898463010788, "step": 451, "train_speed(iter/s)": 0.005025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 130.171875, "completions/min_length": 14.0, "epoch": 0.015414521024451795, "frac_reward_zero_std": 0.0, "grad_norm": 3.287388801574707, "kl": 0.0489577054977417, "learning_rate": 7.702794819359237e-07, "loss": 0.003683464601635933, "memory(GiB)": 69.45, "reward": 0.786148190498352, "reward_std": 0.09116114675998688, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9553124904632568, "rewards/RMReward/std": 0.08754665404558182, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.2754679024219513, "rewards/VisualPerceptionAccuracy/std": 0.01914285495877266, "step": 452, "train_speed(iter/s)": 0.005025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 190.9375, "completions/min_length": 84.0, "epoch": 0.015448623947072264, "frac_reward_zero_std": 0.0, "grad_norm": 1.0348045825958252, "kl": 0.04571273922920227, "learning_rate": 7.719836400817997e-07, "loss": 0.001085999421775341, "memory(GiB)": 69.45, "reward": 0.3190673589706421, "reward_std": 0.06541706621646881, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8218749761581421, "rewards/RMReward/std": 0.2078210413455963, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1395898461341858, "rewards/VisualPerceptionAccuracy/std": 0.11975064128637314, "step": 453, "train_speed(iter/s)": 0.005023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 156.234375, "completions/min_length": 8.0, "epoch": 0.015482726869692733, "frac_reward_zero_std": 0.0, "grad_norm": 6.914935111999512, "kl": 0.3511953055858612, "learning_rate": 7.736877982276756e-07, "loss": -0.01577034592628479, "memory(GiB)": 69.45, "reward": 0.4923301339149475, "reward_std": 0.14803537726402283, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9440104365348816, "rewards/PlanningActionSetORM/std": 0.05693672224879265, "rewards/RMReward/mean": 0.8537499904632568, "rewards/RMReward/std": 0.13599690794944763, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.11634136736392975, "rewards/VisualPerceptionAccuracy/std": 0.15403306484222412, "step": 454, "train_speed(iter/s)": 0.005024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/mean_length": 64.765625, "completions/min_length": 8.0, "epoch": 0.0155168297923132, "frac_reward_zero_std": 0.75, "grad_norm": 0.16706065833568573, "kl": 0.5917527079582214, "learning_rate": 7.753919563735515e-07, "loss": 0.000882202759385109, "memory(GiB)": 69.45, "reward": 0.9827499985694885, "reward_std": 0.01853105239570141, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9137499928474426, "rewards/RMReward/std": 0.09265527874231339, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 455, "train_speed(iter/s)": 0.005024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 225.796875, "completions/min_length": 105.0, "epoch": 0.01555093271493367, "frac_reward_zero_std": 0.0, "grad_norm": 0.7761321067810059, "kl": 0.050979554653167725, "learning_rate": 7.770961145194275e-07, "loss": 0.0012627458199858665, "memory(GiB)": 69.45, "reward": 0.6622604131698608, "reward_std": 0.06746048480272293, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9825739860534668, "rewards/PlanningActionSetORM/std": 0.025223786011338234, "rewards/RMReward/mean": 0.8520833849906921, "rewards/RMReward/std": 0.12401026487350464, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.014497235417366028, "rewards/VisualPerceptionAccuracy/std": 0.04198335111141205, "step": 456, "train_speed(iter/s)": 0.005023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/mean_length": 122.5625, "completions/min_length": 2.0, "epoch": 0.015585035637554139, "frac_reward_zero_std": 0.0, "grad_norm": 26.319202423095703, "kl": 0.06685999035835266, "learning_rate": 7.788002726653034e-07, "loss": 0.008896986953914165, "memory(GiB)": 69.45, "reward": 0.7213178277015686, "reward_std": 0.12308712303638458, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.9518749713897705, "rewards/RMReward/std": 0.06462984532117844, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.49224674701690674, "rewards/VisualPerceptionAccuracy/std": 0.4577540457248688, "step": 457, "train_speed(iter/s)": 0.005024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 177.109375, "completions/min_length": 98.0, "epoch": 0.015619138560174608, "frac_reward_zero_std": 0.0, "grad_norm": 0.8443417549133301, "kl": 0.07291465997695923, "learning_rate": 7.805044308111794e-07, "loss": 0.03598856180906296, "memory(GiB)": 69.45, "reward": 0.8402143120765686, "reward_std": 0.06191517785191536, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8985716700553894, "rewards/PlanningActionSetORM/std": 0.010520804673433304, "rewards/RMReward/mean": 0.8256250023841858, "rewards/RMReward/std": 0.08651772886514664, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 458, "train_speed(iter/s)": 0.005023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 226.828125, "completions/min_length": 8.0, "epoch": 0.015653241482795077, "frac_reward_zero_std": 0.0, "grad_norm": 15.461403846740723, "kl": 0.4289487898349762, "learning_rate": 7.822085889570554e-07, "loss": 0.0034823459573090076, "memory(GiB)": 69.45, "reward": 0.458617240190506, "reward_std": 0.15306922793388367, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9230769276618958, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8218750357627869, "rewards/RMReward/std": 0.09303896874189377, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.17430178821086884, "rewards/VisualPerceptionAccuracy/std": 0.06112981587648392, "step": 459, "train_speed(iter/s)": 0.005022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 131.9375, "completions/min_length": 2.0, "epoch": 0.015687344405415544, "frac_reward_zero_std": 0.0, "grad_norm": 33.96323013305664, "kl": 0.1820928454399109, "learning_rate": 7.839127471029312e-07, "loss": 0.17095966637134552, "memory(GiB)": 69.45, "reward": 0.6415903568267822, "reward_std": 0.23231816291809082, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9937500357627869, "rewards/RMReward/std": 0.012583059258759022, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.3747432827949524, "rewards/VisualPerceptionAccuracy/std": 0.4620184004306793, "step": 460, "train_speed(iter/s)": 0.005023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/mean_length": 181.890625, "completions/min_length": 65.0, "epoch": 0.01572144732803601, "frac_reward_zero_std": 0.0, "grad_norm": 1.1256475448608398, "kl": 0.07774332910776138, "learning_rate": 7.856169052488072e-07, "loss": -0.003442003857344389, "memory(GiB)": 69.45, "reward": 0.7028300762176514, "reward_std": 0.10843097418546677, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9826650619506836, "rewards/PlanningActionSetORM/std": 0.025500642135739326, "rewards/RMReward/mean": 0.8291667103767395, "rewards/RMReward/std": 0.14602293074131012, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.23172113299369812, "rewards/VisualPerceptionAccuracy/std": 0.2829115092754364, "step": 461, "train_speed(iter/s)": 0.005021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 168.21875, "completions/min_length": 90.0, "epoch": 0.015755550250656482, "frac_reward_zero_std": 0.0, "grad_norm": 1.2260208129882812, "kl": 0.06625794619321823, "learning_rate": 7.873210633946831e-07, "loss": -0.062286972999572754, "memory(GiB)": 69.45, "reward": 0.6300667524337769, "reward_std": 0.07579515129327774, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9973958134651184, "rewards/PlanningActionSetORM/std": 0.018042195588350296, "rewards/RMReward/mean": 0.7614583373069763, "rewards/RMReward/std": 0.09743496775627136, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09432929754257202, "rewards/VisualPerceptionAccuracy/std": 0.07635091245174408, "step": 462, "train_speed(iter/s)": 0.005022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/mean_length": 300.109375, "completions/min_length": 81.0, "epoch": 0.01578965317327695, "frac_reward_zero_std": 0.0, "grad_norm": 0.8956395983695984, "kl": 0.0510689951479435, "learning_rate": 7.890252215405591e-07, "loss": -0.00011445581912994385, "memory(GiB)": 69.45, "reward": 0.5024944543838501, "reward_std": 0.05433247610926628, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9325000047683716, "rewards/RMReward/std": 0.07400958240032196, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.05898893624544144, "rewards/VisualPerceptionAccuracy/std": 0.08387827128171921, "step": 463, "train_speed(iter/s)": 0.005021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/mean_length": 83.171875, "completions/min_length": 8.0, "epoch": 0.01582375609589742, "frac_reward_zero_std": 0.0, "grad_norm": 22.129119873046875, "kl": 0.3978221118450165, "learning_rate": 7.90729379686435e-07, "loss": 0.008343778550624847, "memory(GiB)": 69.45, "reward": 0.737375020980835, "reward_std": 0.14783424139022827, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8591666221618652, "rewards/RMReward/std": 0.13711050152778625, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 464, "train_speed(iter/s)": 0.005021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 131.828125, "completions/min_length": 8.0, "epoch": 0.015857859018517887, "frac_reward_zero_std": 0.0, "grad_norm": 7.467848777770996, "kl": 0.4826483726501465, "learning_rate": 7.924335378323109e-07, "loss": 0.026713358238339424, "memory(GiB)": 69.45, "reward": 0.776451587677002, "reward_std": 0.09684351086616516, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.09837386757135391, "rewards/RMReward/mean": 0.9021875262260437, "rewards/RMReward/std": 0.08563742786645889, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.3316812515258789, "rewards/VisualPerceptionAccuracy/std": 0.0765872448682785, "step": 465, "train_speed(iter/s)": 0.005025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 177.40625, "completions/min_length": 106.0, "epoch": 0.015891961941138354, "frac_reward_zero_std": 0.0, "grad_norm": 0.7495957612991333, "kl": 0.044982410967350006, "learning_rate": 7.941376959781869e-07, "loss": -0.011586697772145271, "memory(GiB)": 69.45, "reward": 0.7190877795219421, "reward_std": 0.05253206193447113, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9651041030883789, "rewards/PlanningActionSetORM/std": 0.050196319818496704, "rewards/RMReward/mean": 0.9027083516120911, "rewards/RMReward/std": 0.07739754766225815, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.130788654088974, "rewards/VisualPerceptionAccuracy/std": 0.11788396537303925, "step": 466, "train_speed(iter/s)": 0.005025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/mean_length": 177.421875, "completions/min_length": 78.0, "epoch": 0.015926064863758825, "frac_reward_zero_std": 0.0, "grad_norm": 1.2379062175750732, "kl": 0.05701562762260437, "learning_rate": 7.958418541240629e-07, "loss": -0.025868257507681847, "memory(GiB)": 69.45, "reward": 0.35969462990760803, "reward_std": 0.1398368775844574, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8868749737739563, "rewards/RMReward/std": 0.0549810491502285, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.17642618715763092, "rewards/VisualPerceptionAccuracy/std": 0.25710636377334595, "step": 467, "train_speed(iter/s)": 0.00503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 163.65625, "completions/min_length": 8.0, "epoch": 0.015960167786379292, "frac_reward_zero_std": 0.0, "grad_norm": 18.928979873657227, "kl": 0.3700048625469208, "learning_rate": 7.975460122699387e-07, "loss": 0.0002848440781235695, "memory(GiB)": 69.45, "reward": 0.7602475881576538, "reward_std": 0.15943998098373413, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9743589758872986, "rewards/PlanningActionSetORM/std": 0.036645617336034775, "rewards/RMReward/mean": 0.8789582848548889, "rewards/RMReward/std": 0.11576277017593384, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 468, "train_speed(iter/s)": 0.005029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 134.4375, "completions/min_length": 8.0, "epoch": 0.01599427070899976, "frac_reward_zero_std": 0.0, "grad_norm": 7.437051296234131, "kl": 0.3925018608570099, "learning_rate": 7.992501704158147e-07, "loss": 0.005776241421699524, "memory(GiB)": 69.45, "reward": 0.5188594460487366, "reward_std": 0.0872909277677536, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.9350000023841858, "rewards/RMReward/std": 0.06026821583509445, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.09506291151046753, "rewards/VisualPerceptionAccuracy/std": 0.02807527966797352, "step": 469, "train_speed(iter/s)": 0.005032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 13.84375, "completions/min_length": 8.0, "epoch": 0.01602837363162023, "frac_reward_zero_std": 1.0, "grad_norm": 0.007386913523077965, "kl": 0.361492395401001, "learning_rate": 8.009543285616905e-07, "loss": 0.00036140409065410495, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 470, "train_speed(iter/s)": 0.005034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/mean_length": 228.703125, "completions/min_length": 86.0, "epoch": 0.016062476554240698, "frac_reward_zero_std": 0.0, "grad_norm": 0.874294638633728, "kl": 0.05658312141895294, "learning_rate": 8.026584867075666e-07, "loss": 0.03812384977936745, "memory(GiB)": 69.45, "reward": 0.5964798927307129, "reward_std": 0.08799099922180176, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9521808624267578, "rewards/PlanningActionSetORM/std": 0.043173231184482574, "rewards/RMReward/mean": 0.7491666674613953, "rewards/RMReward/std": 0.14986282587051392, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.016611149534583092, "rewards/VisualPerceptionAccuracy/std": 0.04550636187195778, "step": 471, "train_speed(iter/s)": 0.005034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 185.03125, "completions/min_length": 47.0, "epoch": 0.01609657947686117, "frac_reward_zero_std": 0.0, "grad_norm": 1.2031264305114746, "kl": 0.06295810639858246, "learning_rate": 8.043626448534425e-07, "loss": -0.03776673972606659, "memory(GiB)": 69.45, "reward": 0.2690560519695282, "reward_std": 0.09958939254283905, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.690625011920929, "rewards/RMReward/std": 0.12002604454755783, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.10790807008743286, "rewards/VisualPerceptionAccuracy/std": 0.19418074190616608, "step": 472, "train_speed(iter/s)": 0.005036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 49.203125, "completions/min_length": 8.0, "epoch": 0.016130682399481636, "frac_reward_zero_std": 0.75, "grad_norm": 0.4868250787258148, "kl": 0.5965997576713562, "learning_rate": 8.060668029993183e-07, "loss": 0.007402762770652771, "memory(GiB)": 69.45, "reward": 0.7557831406593323, "reward_std": 0.0013807284412905574, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.02313261106610298, "rewards/VisualPerceptionAccuracy/std": 0.0055229137651622295, "step": 473, "train_speed(iter/s)": 0.005036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/mean_length": 83.25, "completions/min_length": 8.0, "epoch": 0.016164785322102103, "frac_reward_zero_std": 0.0, "grad_norm": 18.490568161010742, "kl": 0.28458940982818604, "learning_rate": 8.077709611451944e-07, "loss": 0.01210244745016098, "memory(GiB)": 69.45, "reward": 0.7940624952316284, "reward_std": 0.1546224057674408, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9041666388511658, "rewards/RMReward/std": 0.05910081788897514, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 474, "train_speed(iter/s)": 0.005035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/mean_length": 152.46875, "completions/min_length": 8.0, "epoch": 0.016198888244722574, "frac_reward_zero_std": 0.0, "grad_norm": 20.4107723236084, "kl": 0.48543137311935425, "learning_rate": 8.094751192910704e-07, "loss": -0.0003826860338449478, "memory(GiB)": 69.45, "reward": 0.8293749690055847, "reward_std": 0.16964150965213776, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8145833015441895, "rewards/RMReward/std": 0.1180327758193016, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 475, "train_speed(iter/s)": 0.005034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/mean_length": 160.3125, "completions/min_length": 8.0, "epoch": 0.01623299116734304, "frac_reward_zero_std": 0.0, "grad_norm": 17.405485153198242, "kl": 0.4019298851490021, "learning_rate": 8.111792774369461e-07, "loss": 0.004853741265833378, "memory(GiB)": 69.45, "reward": 0.5836825370788574, "reward_std": 0.1490301489830017, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.862618088722229, "rewards/PlanningActionSetORM/std": 0.15720270574092865, "rewards/RMReward/mean": 0.707812488079071, "rewards/RMReward/std": 0.13386882841587067, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.03530794754624367, "rewards/VisualPerceptionAccuracy/std": 0.009358947165310383, "step": 476, "train_speed(iter/s)": 0.005033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 248.03125, "completions/min_length": 131.0, "epoch": 0.016267094089963508, "frac_reward_zero_std": 0.0, "grad_norm": 0.6943798065185547, "kl": 0.04967735335230827, "learning_rate": 8.128834355828222e-07, "loss": 0.008208339102566242, "memory(GiB)": 69.45, "reward": 0.4859434962272644, "reward_std": 0.09875724464654922, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.8728125095367432, "rewards/RMReward/std": 0.13737712800502777, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08474808186292648, "rewards/VisualPerceptionAccuracy/std": 0.1323343813419342, "step": 477, "train_speed(iter/s)": 0.005027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/mean_length": 187.453125, "completions/min_length": 120.0, "epoch": 0.01630119701258398, "frac_reward_zero_std": 0.0, "grad_norm": 0.7056512832641602, "kl": 0.05602441355586052, "learning_rate": 8.14587593728698e-07, "loss": 0.022460713982582092, "memory(GiB)": 69.45, "reward": 0.8919155597686768, "reward_std": 0.06778877228498459, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.927702784538269, "rewards/PlanningActionSetORM/std": 0.12986347079277039, "rewards/RMReward/mean": 0.8829687237739563, "rewards/RMReward/std": 0.14056213200092316, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 478, "train_speed(iter/s)": 0.005027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 233.59375, "completions/min_length": 8.0, "epoch": 0.016335299935204446, "frac_reward_zero_std": 0.0, "grad_norm": 19.123376846313477, "kl": 0.2282969355583191, "learning_rate": 8.16291751874574e-07, "loss": -0.005895646288990974, "memory(GiB)": 69.45, "reward": 0.5087651014328003, "reward_std": 0.16752934455871582, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8890625238418579, "rewards/PlanningActionSetORM/std": 0.04827762395143509, "rewards/RMReward/mean": 0.9018750190734863, "rewards/RMReward/std": 0.08510532975196838, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.12724897265434265, "rewards/VisualPerceptionAccuracy/std": 0.14288578927516937, "step": 479, "train_speed(iter/s)": 0.005026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/mean_length": 182.6875, "completions/min_length": 70.0, "epoch": 0.016369402857824917, "frac_reward_zero_std": 0.0, "grad_norm": 1.0912435054779053, "kl": 0.05769793316721916, "learning_rate": 8.179959100204501e-07, "loss": -0.028464453294873238, "memory(GiB)": 69.45, "reward": 0.277522474527359, "reward_std": 0.07051200419664383, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8312499523162842, "rewards/RMReward/std": 0.09287086874246597, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.08169662207365036, "rewards/VisualPerceptionAccuracy/std": 0.11979042738676071, "step": 480, "train_speed(iter/s)": 0.005027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/mean_length": 215.65625, "completions/min_length": 105.0, "epoch": 0.016403505780445384, "frac_reward_zero_std": 0.0, "grad_norm": 0.6067813038825989, "kl": 0.06231726333498955, "learning_rate": 8.197000681663258e-07, "loss": 0.015073377639055252, "memory(GiB)": 69.45, "reward": 0.9379861354827881, "reward_std": 0.03830903023481369, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9555555582046509, "rewards/PlanningActionSetORM/std": 0.04751310870051384, "rewards/RMReward/mean": 0.93359375, "rewards/RMReward/std": 0.0893782302737236, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 481, "train_speed(iter/s)": 0.005027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 134.765625, "completions/min_length": 8.0, "epoch": 0.01643760870306585, "frac_reward_zero_std": 0.0, "grad_norm": 8.935067176818848, "kl": 0.27728474140167236, "learning_rate": 8.214042263122018e-07, "loss": -0.02269895374774933, "memory(GiB)": 69.45, "reward": 0.6810556054115295, "reward_std": 0.12355129420757294, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.801562488079071, "rewards/RMReward/std": 0.061545126140117645, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.16047222912311554, "rewards/VisualPerceptionAccuracy/std": 0.10721348226070404, "step": 482, "train_speed(iter/s)": 0.005029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/mean_length": 207.53125, "completions/min_length": 153.0, "epoch": 0.016471711625686322, "frac_reward_zero_std": 0.0, "grad_norm": 0.4453306496143341, "kl": 0.049899905920028687, "learning_rate": 8.231083844580779e-07, "loss": -0.008084120228886604, "memory(GiB)": 69.45, "reward": 0.7744154334068298, "reward_std": 0.04421297460794449, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9452083110809326, "rewards/RMReward/std": 0.08137094974517822, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.22916170954704285, "rewards/VisualPerceptionAccuracy/std": 0.11097472161054611, "step": 483, "train_speed(iter/s)": 0.005028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 128.59375, "completions/min_length": 8.0, "epoch": 0.01650581454830679, "frac_reward_zero_std": 0.0, "grad_norm": 24.18337631225586, "kl": 0.42161568999290466, "learning_rate": 8.248125426039537e-07, "loss": -0.003732403740286827, "memory(GiB)": 69.45, "reward": 0.7494062781333923, "reward_std": 0.13491180539131165, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9039583206176758, "rewards/RMReward/std": 0.10672572255134583, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 484, "train_speed(iter/s)": 0.005027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/mean_length": 194.90625, "completions/min_length": 95.0, "epoch": 0.01653991747092726, "frac_reward_zero_std": 0.0, "grad_norm": 0.4619964063167572, "kl": 0.04899623990058899, "learning_rate": 8.265167007498296e-07, "loss": -0.025695130228996277, "memory(GiB)": 69.45, "reward": 0.8723750114440918, "reward_std": 0.05040191113948822, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8404687643051147, "rewards/RMReward/std": 0.11712738871574402, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 485, "train_speed(iter/s)": 0.005026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/mean_length": 99.296875, "completions/min_length": 2.0, "epoch": 0.016574020393547727, "frac_reward_zero_std": 0.0, "grad_norm": 40.77225875854492, "kl": 0.34843766689300537, "learning_rate": 8.282208588957055e-07, "loss": 0.0008930857293307781, "memory(GiB)": 69.45, "reward": 0.9053750038146973, "reward_std": 0.13786116242408752, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9464583396911621, "rewards/RMReward/std": 0.08418544381856918, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.75, "rewards/VisualPerceptionAccuracy/std": 0.44721361994743347, "step": 486, "train_speed(iter/s)": 0.005025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 209.703125, "completions/min_length": 86.0, "epoch": 0.016608123316168195, "frac_reward_zero_std": 0.0, "grad_norm": 0.7575241327285767, "kl": 0.045007482171058655, "learning_rate": 8.299250170415815e-07, "loss": -0.060132913291454315, "memory(GiB)": 69.45, "reward": 0.6843506097793579, "reward_std": 0.06987655162811279, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8514583706855774, "rewards/RMReward/std": 0.15202169120311737, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09390249103307724, "rewards/VisualPerceptionAccuracy/std": 0.08864695578813553, "step": 487, "train_speed(iter/s)": 0.005025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/mean_length": 71.453125, "completions/min_length": 7.0, "epoch": 0.016642226238788665, "frac_reward_zero_std": 0.0, "grad_norm": 14.06814193725586, "kl": 0.7870094180107117, "learning_rate": 8.316291751874575e-07, "loss": 0.04195467382669449, "memory(GiB)": 69.45, "reward": 0.6898359060287476, "reward_std": 0.19577345252037048, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7775000333786011, "rewards/RMReward/std": 0.21063396334648132, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.24593468010425568, "rewards/VisualPerceptionAccuracy/mean": 0.05609370768070221, "rewards/VisualPerceptionAccuracy/std": 0.13958662748336792, "step": 488, "train_speed(iter/s)": 0.005028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 114.8125, "completions/min_length": 8.0, "epoch": 0.016676329161409133, "frac_reward_zero_std": 0.0, "grad_norm": 31.88422966003418, "kl": 0.40188413858413696, "learning_rate": 8.333333333333333e-07, "loss": -0.039707332849502563, "memory(GiB)": 69.45, "reward": 0.6031675338745117, "reward_std": 0.18324771523475647, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.8687499761581421, "rewards/RMReward/std": 0.0656849667429924, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.1820453703403473, "rewards/VisualPerceptionAccuracy/std": 0.14007499814033508, "step": 489, "train_speed(iter/s)": 0.005029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/mean_length": 97.40625, "completions/min_length": 9.0, "epoch": 0.0167104320840296, "frac_reward_zero_std": 0.0, "grad_norm": 21.532251358032227, "kl": 0.3684527575969696, "learning_rate": 8.350374914792093e-07, "loss": -0.007165046408772469, "memory(GiB)": 69.45, "reward": 0.6670312881469727, "reward_std": 0.15541554987430573, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7666666507720947, "rewards/RMReward/std": 0.13461266458034515, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 490, "train_speed(iter/s)": 0.00503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/mean_length": 110.140625, "completions/min_length": 2.0, "epoch": 0.01674453500665007, "frac_reward_zero_std": 0.0, "grad_norm": 25.512001037597656, "kl": 0.5248591899871826, "learning_rate": 8.367416496250853e-07, "loss": 0.011254504323005676, "memory(GiB)": 69.45, "reward": 0.5289103388786316, "reward_std": 0.22929814457893372, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9103535413742065, "rewards/PlanningActionSetORM/std": 0.03571246191859245, "rewards/RMReward/mean": 0.8759375214576721, "rewards/RMReward/std": 0.18257759511470795, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.0625, "rewards/VisualPerceptionAccuracy/std": 0.25, "step": 491, "train_speed(iter/s)": 0.005034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/mean_length": 179.09375, "completions/min_length": 83.0, "epoch": 0.016778637929270538, "frac_reward_zero_std": 0.0, "grad_norm": 1.1666429042816162, "kl": 0.06198413670063019, "learning_rate": 8.384458077709612e-07, "loss": -0.009013542905449867, "memory(GiB)": 69.45, "reward": 0.5372896790504456, "reward_std": 0.10371556133031845, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9128124713897705, "rewards/RMReward/std": 0.16192859411239624, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.14432938396930695, "rewards/VisualPerceptionAccuracy/std": 0.15344589948654175, "step": 492, "train_speed(iter/s)": 0.005035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 11.796875, "completions/min_length": 8.0, "epoch": 0.01681274085189101, "frac_reward_zero_std": 1.0, "grad_norm": 0.003098775865510106, "kl": 0.49033260345458984, "learning_rate": 8.401499659168371e-07, "loss": 0.0004909048439003527, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 493, "train_speed(iter/s)": 0.00504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 168.3125, "completions/min_length": 88.0, "epoch": 0.016846843774511476, "frac_reward_zero_std": 0.0, "grad_norm": 0.9198002815246582, "kl": 0.06126157566905022, "learning_rate": 8.41854124062713e-07, "loss": -0.01037684828042984, "memory(GiB)": 69.45, "reward": 0.7207396626472473, "reward_std": 0.08087243139743805, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9696969985961914, "rewards/PlanningActionSetORM/std": 0.04330844804644585, "rewards/RMReward/mean": 0.87562495470047, "rewards/RMReward/std": 0.1031608134508133, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.19964049756526947, "rewards/VisualPerceptionAccuracy/std": 0.16730090975761414, "step": 494, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/mean_length": 164.453125, "completions/min_length": 96.0, "epoch": 0.016880946697131943, "frac_reward_zero_std": 0.0, "grad_norm": 1.06869637966156, "kl": 0.057991985231637955, "learning_rate": 8.43558282208589e-07, "loss": 0.0011355876922607422, "memory(GiB)": 69.45, "reward": 0.5701261758804321, "reward_std": 0.07407797873020172, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.987500011920929, "rewards/PlanningActionSetORM/std": 0.0707106739282608, "rewards/RMReward/mean": 0.9131249785423279, "rewards/RMReward/std": 0.06869461387395859, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.21225234866142273, "rewards/VisualPerceptionAccuracy/std": 0.11681176722049713, "step": 495, "train_speed(iter/s)": 0.005038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/mean_length": 220.390625, "completions/min_length": 173.0, "epoch": 0.016915049619752414, "frac_reward_zero_std": 0.0, "grad_norm": 0.599562406539917, "kl": 0.042947642505168915, "learning_rate": 8.45262440354465e-07, "loss": -0.008707656525075436, "memory(GiB)": 69.45, "reward": 0.7989351749420166, "reward_std": 0.05559055134654045, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9954545497894287, "rewards/PlanningActionSetORM/std": 0.031491827219724655, "rewards/RMReward/mean": 0.9866666793823242, "rewards/RMReward/std": 0.01837358996272087, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.23046790063381195, "rewards/VisualPerceptionAccuracy/std": 0.17883563041687012, "step": 496, "train_speed(iter/s)": 0.005038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/mean_length": 128.34375, "completions/min_length": 99.0, "epoch": 0.01694915254237288, "frac_reward_zero_std": 0.0, "grad_norm": 0.7788242697715759, "kl": 0.06373219937086105, "learning_rate": 8.469665985003408e-07, "loss": -0.0021589193493127823, "memory(GiB)": 69.45, "reward": 0.7609462141990662, "reward_std": 0.08466154336929321, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9609375, "rewards/PlanningActionSetORM/std": 0.058552179485559464, "rewards/RMReward/mean": 0.9191667437553406, "rewards/RMReward/std": 0.10809439420700073, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.2612222135066986, "rewards/VisualPerceptionAccuracy/std": 0.16009069979190826, "step": 497, "train_speed(iter/s)": 0.005039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 152.46875, "completions/min_length": 91.0, "epoch": 0.01698325546499335, "frac_reward_zero_std": 0.0, "grad_norm": 0.8363815546035767, "kl": 0.07109637558460236, "learning_rate": 8.486707566462168e-07, "loss": 0.007536187767982483, "memory(GiB)": 69.45, "reward": 0.8998750448226929, "reward_std": 0.0468837171792984, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8748437762260437, "rewards/RMReward/std": 0.10177772492170334, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 498, "train_speed(iter/s)": 0.005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/mean_length": 8.5, "completions/min_length": 8.0, "epoch": 0.01701735838761382, "frac_reward_zero_std": 1.0, "grad_norm": 0.002161572687327862, "kl": 1.0720486640930176, "learning_rate": 8.503749147920928e-07, "loss": 0.0010705087333917618, "memory(GiB)": 69.45, "reward": 0.762499988079071, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.4364357888698578, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 499, "train_speed(iter/s)": 0.005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 133.6875, "completions/min_length": 22.0, "epoch": 0.017051461310234287, "frac_reward_zero_std": 0.0, "grad_norm": 0.7897449135780334, "kl": 0.07028098404407501, "learning_rate": 8.520790729379687e-07, "loss": -0.08416132628917694, "memory(GiB)": 69.45, "reward": 0.7698967456817627, "reward_std": 0.09179411083459854, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9443750381469727, "rewards/RMReward/std": 0.07748454809188843, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.21308720111846924, "rewards/VisualPerceptionAccuracy/std": 0.26454591751098633, "step": 500, "train_speed(iter/s)": 0.005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/mean_length": 144.359375, "completions/min_length": 2.0, "epoch": 0.017085564232854757, "frac_reward_zero_std": 0.0, "grad_norm": 25.025346755981445, "kl": 0.11684056371450424, "learning_rate": 8.537832310838446e-07, "loss": -0.018236564472317696, "memory(GiB)": 69.45, "reward": 0.627972424030304, "reward_std": 0.15347884595394135, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9098830223083496, "rewards/PlanningActionSetORM/std": 0.033444203436374664, "rewards/RMReward/mean": 0.7681249976158142, "rewards/RMReward/std": 0.14308665692806244, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4594682455062866, "rewards/VisualPerceptionAccuracy/std": 0.48971620202064514, "step": 501, "train_speed(iter/s)": 0.005031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 65.046875, "completions/min_length": 7.0, "epoch": 0.017119667155475225, "frac_reward_zero_std": 0.0, "grad_norm": 23.490915298461914, "kl": 0.4820176064968109, "learning_rate": 8.554873892297205e-07, "loss": 0.00035982392728328705, "memory(GiB)": 69.45, "reward": 0.7767502069473267, "reward_std": 0.21270206570625305, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9968750476837158, "rewards/RMReward/std": 0.00478713121265173, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.33601075410842896, "rewards/VisualPerceptionAccuracy/mean": 0.3470008671283722, "rewards/VisualPerceptionAccuracy/std": 0.19800497591495514, "step": 502, "train_speed(iter/s)": 0.005035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 180.90625, "completions/min_length": 94.0, "epoch": 0.017153770078095692, "frac_reward_zero_std": 0.0, "grad_norm": 1.052729845046997, "kl": 0.06889432668685913, "learning_rate": 8.571915473755965e-07, "loss": -0.046139977872371674, "memory(GiB)": 69.45, "reward": 0.516074538230896, "reward_std": 0.11036641895771027, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8674479126930237, "rewards/PlanningActionSetORM/std": 0.19131113588809967, "rewards/RMReward/mean": 0.8581249713897705, "rewards/RMReward/std": 0.10648147761821747, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.17215949296951294, "rewards/VisualPerceptionAccuracy/std": 0.15083624422550201, "step": 503, "train_speed(iter/s)": 0.005035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1382.0, "completions/mean_length": 311.484375, "completions/min_length": 107.0, "epoch": 0.017187873000716163, "frac_reward_zero_std": 0.0, "grad_norm": 0.5314013957977295, "kl": 0.055388666689395905, "learning_rate": 8.588957055214725e-07, "loss": 0.09653706103563309, "memory(GiB)": 69.45, "reward": 0.6525136232376099, "reward_std": 0.06768975406885147, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7822917103767395, "rewards/RMReward/std": 0.07955300062894821, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.13255442678928375, "rewards/VisualPerceptionAccuracy/std": 0.10635261982679367, "step": 504, "train_speed(iter/s)": 0.005033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 97.4375, "completions/min_length": 8.0, "epoch": 0.01722197592333663, "frac_reward_zero_std": 0.0, "grad_norm": 16.540924072265625, "kl": 0.45483148097991943, "learning_rate": 8.605998636673484e-07, "loss": 0.03340821713209152, "memory(GiB)": 69.45, "reward": 0.44072145223617554, "reward_std": 0.19870509207248688, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.984375, "rewards/RMReward/std": 0.020966248586773872, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.25644293427467346, "rewards/VisualPerceptionAccuracy/std": 0.19618019461631775, "step": 505, "train_speed(iter/s)": 0.005037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/mean_length": 331.765625, "completions/min_length": 185.0, "epoch": 0.017256078845957097, "frac_reward_zero_std": 0.0, "grad_norm": 0.5789417028427124, "kl": 0.03744082152843475, "learning_rate": 8.623040218132243e-07, "loss": -0.038212455809116364, "memory(GiB)": 69.45, "reward": 0.5005488395690918, "reward_std": 0.10316850244998932, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9425618052482605, "rewards/PlanningActionSetORM/std": 0.00679151201620698, "rewards/RMReward/mean": 0.7328125238418579, "rewards/RMReward/std": 0.08854502439498901, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.22633525729179382, "rewards/VisualPerceptionAccuracy/std": 0.1421392858028412, "step": 506, "train_speed(iter/s)": 0.005037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/mean_length": 138.0, "completions/min_length": 89.0, "epoch": 0.017290181768577568, "frac_reward_zero_std": 0.0, "grad_norm": 0.9972098469734192, "kl": 0.08043741434812546, "learning_rate": 8.640081799591003e-07, "loss": -0.042721010744571686, "memory(GiB)": 69.45, "reward": 0.7436482906341553, "reward_std": 0.09840922057628632, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8447916507720947, "rewards/RMReward/std": 0.07013632357120514, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3470933139324188, "rewards/VisualPerceptionAccuracy/std": 0.23796869814395905, "step": 507, "train_speed(iter/s)": 0.005038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/mean_length": 176.296875, "completions/min_length": 104.0, "epoch": 0.017324284691198035, "frac_reward_zero_std": 0.0, "grad_norm": 0.7908661961555481, "kl": 0.06706710159778595, "learning_rate": 8.657123381049762e-07, "loss": 0.004863538779318333, "memory(GiB)": 69.45, "reward": 0.9405694007873535, "reward_std": 0.04874143376946449, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9722222089767456, "rewards/PlanningActionSetORM/std": 0.04849286377429962, "rewards/RMReward/mean": 0.9326562881469727, "rewards/RMReward/std": 0.07213836163282394, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 508, "train_speed(iter/s)": 0.005035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 145.25, "completions/min_length": 8.0, "epoch": 0.017358387613818506, "frac_reward_zero_std": 0.0, "grad_norm": 5.111431121826172, "kl": 0.30663034319877625, "learning_rate": 8.674164962508522e-07, "loss": -0.057138994336128235, "memory(GiB)": 69.45, "reward": 0.6623725891113281, "reward_std": 0.12778860330581665, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9701922535896301, "rewards/PlanningActionSetORM/std": 0.03729124739766121, "rewards/RMReward/mean": 0.815833330154419, "rewards/RMReward/std": 0.15487584471702576, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 509, "train_speed(iter/s)": 0.00503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 113.0625, "completions/min_length": 13.0, "epoch": 0.017392490536438973, "frac_reward_zero_std": 0.0, "grad_norm": 2.693096160888672, "kl": 0.09483636915683746, "learning_rate": 8.69120654396728e-07, "loss": -0.006491988897323608, "memory(GiB)": 69.45, "reward": 0.9476562738418579, "reward_std": 0.08045939356088638, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9916666150093079, "rewards/PlanningActionSetORM/std": 0.05773502588272095, "rewards/RMReward/mean": 0.9395833611488342, "rewards/RMReward/std": 0.09741244465112686, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 510, "train_speed(iter/s)": 0.00503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 210.46875, "completions/min_length": 100.0, "epoch": 0.01742659345905944, "frac_reward_zero_std": 0.0, "grad_norm": 0.5170531868934631, "kl": 0.08512803912162781, "learning_rate": 8.70824812542604e-07, "loss": -0.011734424158930779, "memory(GiB)": 69.45, "reward": 0.7047532796859741, "reward_std": 0.08405604958534241, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8858333230018616, "rewards/RMReward/std": 0.08838583528995514, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.09301315248012543, "rewards/VisualPerceptionAccuracy/std": 0.13360102474689484, "step": 511, "train_speed(iter/s)": 0.00503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 188.984375, "completions/min_length": 87.0, "epoch": 0.01746069638167991, "frac_reward_zero_std": 0.0, "grad_norm": 0.9263290166854858, "kl": 0.0880756676197052, "learning_rate": 8.7252897068848e-07, "loss": -0.008462395519018173, "memory(GiB)": 69.45, "reward": 0.6620980501174927, "reward_std": 0.0915641188621521, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9696969985961914, "rewards/PlanningActionSetORM/std": 0.04330844804644585, "rewards/RMReward/mean": 0.815000057220459, "rewards/RMReward/std": 0.18651336431503296, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.11057387292385101, "rewards/VisualPerceptionAccuracy/std": 0.12221712619066238, "step": 512, "train_speed(iter/s)": 0.005031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/mean_length": 92.140625, "completions/min_length": 8.0, "epoch": 0.01749479930430038, "frac_reward_zero_std": 0.0, "grad_norm": 7.835670471191406, "kl": 0.47521814703941345, "learning_rate": 8.742331288343559e-07, "loss": -0.006883531808853149, "memory(GiB)": 69.45, "reward": 0.7047960758209229, "reward_std": 0.17046692967414856, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7890625, "rewards/RMReward/std": 0.053482554852962494, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": 0.4535589814186096, "rewards/VisualPerceptionAccuracy/std": 0.15943516790866852, "step": 513, "train_speed(iter/s)": 0.005032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 181.59375, "completions/min_length": 91.0, "epoch": 0.017528902226920846, "frac_reward_zero_std": 0.0, "grad_norm": 1.2213289737701416, "kl": 0.0753176212310791, "learning_rate": 8.759372869802318e-07, "loss": -0.018025003373622894, "memory(GiB)": 69.45, "reward": 0.688427209854126, "reward_std": 0.08582393825054169, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9978956580162048, "rewards/PlanningActionSetORM/std": 0.010252116248011589, "rewards/RMReward/mean": 0.8162500262260437, "rewards/RMReward/std": 0.13679654896259308, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1959713250398636, "rewards/VisualPerceptionAccuracy/std": 0.1388930082321167, "step": 514, "train_speed(iter/s)": 0.005032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 111.84375, "completions/min_length": 7.0, "epoch": 0.017563005149541316, "frac_reward_zero_std": 0.0, "grad_norm": 11.08240795135498, "kl": 0.6299185752868652, "learning_rate": 8.776414451261078e-07, "loss": -0.018811596557497978, "memory(GiB)": 69.45, "reward": 0.595880389213562, "reward_std": 0.2671208381652832, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.84375, "rewards/SpatialReasoningORM/std": 0.3689020276069641, "rewards/VisualPerceptionAccuracy/mean": 0.34019824862480164, "rewards/VisualPerceptionAccuracy/std": 0.1923154890537262, "step": 515, "train_speed(iter/s)": 0.005039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 181.71875, "completions/min_length": 60.0, "epoch": 0.017597108072161784, "frac_reward_zero_std": 0.0, "grad_norm": 1.4405020475387573, "kl": 0.06430782377719879, "learning_rate": 8.793456032719837e-07, "loss": -0.025325117632746696, "memory(GiB)": 69.45, "reward": 0.7309859395027161, "reward_std": 0.08367156982421875, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9741029739379883, "rewards/PlanningActionSetORM/std": 0.03927690535783768, "rewards/RMReward/mean": 0.8227083086967468, "rewards/RMReward/std": 0.1885751634836197, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.36498183012008667, "rewards/VisualPerceptionAccuracy/std": 0.17342689633369446, "step": 516, "train_speed(iter/s)": 0.005037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 189.859375, "completions/min_length": 101.0, "epoch": 0.017631210994782254, "frac_reward_zero_std": 0.0, "grad_norm": 0.6672518253326416, "kl": 0.07274185121059418, "learning_rate": 8.810497614178597e-07, "loss": 0.012060733512043953, "memory(GiB)": 69.45, "reward": 0.8277187347412109, "reward_std": 0.09676086157560349, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9315349459648132, "rewards/PlanningActionSetORM/std": 0.05110739916563034, "rewards/RMReward/mean": 0.9543750286102295, "rewards/RMReward/std": 0.09097360074520111, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.461453914642334, "rewards/VisualPerceptionAccuracy/std": 0.18939591944217682, "step": 517, "train_speed(iter/s)": 0.005038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 182.625, "completions/min_length": 13.0, "epoch": 0.01766531391740272, "frac_reward_zero_std": 0.0, "grad_norm": 6.876402378082275, "kl": 0.14590710401535034, "learning_rate": 8.827539195637356e-07, "loss": -0.021521558985114098, "memory(GiB)": 69.45, "reward": 0.6859112977981567, "reward_std": 0.2072879523038864, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9618589878082275, "rewards/PlanningActionSetORM/std": 0.03879234194755554, "rewards/RMReward/mean": 0.8709374666213989, "rewards/RMReward/std": 0.12708543241024017, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": 0.4404016137123108, "rewards/VisualPerceptionAccuracy/std": 0.22749850153923035, "step": 518, "train_speed(iter/s)": 0.005038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/mean_length": 227.421875, "completions/min_length": 101.0, "epoch": 0.01769941684002319, "frac_reward_zero_std": 0.0, "grad_norm": 0.5438879728317261, "kl": 0.05424296855926514, "learning_rate": 8.844580777096115e-07, "loss": -0.04162745177745819, "memory(GiB)": 69.45, "reward": 0.914898157119751, "reward_std": 0.08914303034543991, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9632408618927002, "rewards/PlanningActionSetORM/std": 0.03976597636938095, "rewards/RMReward/mean": 0.9028124809265137, "rewards/RMReward/std": 0.17168140411376953, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 519, "train_speed(iter/s)": 0.005035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 137.078125, "completions/min_length": 12.0, "epoch": 0.01773351976264366, "frac_reward_zero_std": 0.0, "grad_norm": 2.690225124359131, "kl": 0.09246928989887238, "learning_rate": 8.861622358554875e-07, "loss": -0.04389742761850357, "memory(GiB)": 69.45, "reward": 0.8446753621101379, "reward_std": 0.11704252660274506, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9101274013519287, "rewards/PlanningActionSetORM/std": 0.01537414826452732, "rewards/RMReward/mean": 0.7883333563804626, "rewards/RMReward/std": 0.12816765904426575, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 520, "train_speed(iter/s)": 0.005036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/mean_length": 262.296875, "completions/min_length": 112.0, "epoch": 0.017767622685264127, "frac_reward_zero_std": 0.0, "grad_norm": 0.5817930102348328, "kl": 0.04091445356607437, "learning_rate": 8.878663940013634e-07, "loss": -0.03676566854119301, "memory(GiB)": 69.45, "reward": 0.6095355153083801, "reward_std": 0.10956694185733795, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9160839319229126, "rewards/PlanningActionSetORM/std": 0.007104890421032906, "rewards/RMReward/mean": 0.9603124856948853, "rewards/RMReward/std": 0.0492841899394989, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.26760420203208923, "rewards/VisualPerceptionAccuracy/std": 0.1898663341999054, "step": 521, "train_speed(iter/s)": 0.005036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/mean_length": 188.90625, "completions/min_length": 7.0, "epoch": 0.017801725607884594, "frac_reward_zero_std": 0.0, "grad_norm": 18.76716423034668, "kl": 0.4840041399002075, "learning_rate": 8.895705521472393e-07, "loss": 0.04620695114135742, "memory(GiB)": 69.45, "reward": 0.2570064961910248, "reward_std": 0.2246623933315277, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.26663365960121155, "rewards/VisualPerceptionAccuracy/std": 0.23078157007694244, "step": 522, "train_speed(iter/s)": 0.005037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 226.421875, "completions/min_length": 86.0, "epoch": 0.017835828530505065, "frac_reward_zero_std": 0.0, "grad_norm": 0.8708788752555847, "kl": 0.08970597386360168, "learning_rate": 8.912747102931153e-07, "loss": -0.004332134500145912, "memory(GiB)": 69.45, "reward": 0.567304253578186, "reward_std": 0.16528701782226562, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9686461687088013, "rewards/PlanningActionSetORM/std": 0.0342826284468174, "rewards/RMReward/mean": 0.7653124928474426, "rewards/RMReward/std": 0.18902099132537842, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3286292254924774, "rewards/VisualPerceptionAccuracy/std": 0.19395682215690613, "step": 523, "train_speed(iter/s)": 0.005037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 155.359375, "completions/min_length": 92.0, "epoch": 0.017869931453125532, "frac_reward_zero_std": 0.0, "grad_norm": 0.8947824239730835, "kl": 0.06068240478634834, "learning_rate": 8.929788684389912e-07, "loss": -0.015471774153411388, "memory(GiB)": 69.45, "reward": 0.6343576908111572, "reward_std": 0.08497849106788635, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9228125214576721, "rewards/RMReward/std": 0.0852031484246254, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3304654061794281, "rewards/VisualPerceptionAccuracy/std": 0.13923132419586182, "step": 524, "train_speed(iter/s)": 0.005037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 139.953125, "completions/min_length": 87.0, "epoch": 0.017904034375746003, "frac_reward_zero_std": 0.0, "grad_norm": 1.2316838502883911, "kl": 0.09580281376838684, "learning_rate": 8.946830265848672e-07, "loss": 0.011453285813331604, "memory(GiB)": 69.45, "reward": 0.67460697889328, "reward_std": 0.12152823805809021, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9371874928474426, "rewards/RMReward/std": 0.05043259635567665, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3994640111923218, "rewards/VisualPerceptionAccuracy/std": 0.24215832352638245, "step": 525, "train_speed(iter/s)": 0.005036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/mean_length": 39.109375, "completions/min_length": 8.0, "epoch": 0.01793813729836647, "frac_reward_zero_std": 0.0, "grad_norm": 19.24477767944336, "kl": 1.0262088775634766, "learning_rate": 8.963871847307432e-07, "loss": -0.006555935367941856, "memory(GiB)": 69.45, "reward": 0.7956562638282776, "reward_std": 0.28446388244628906, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8999999761581421, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9681249856948853, "rewards/RMReward/std": 0.03833514451980591, "rewards/SpatialReasoningORM/mean": 0.7291666865348816, "rewards/SpatialReasoningORM/std": 0.449092835187912, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 526, "train_speed(iter/s)": 0.005042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 196.921875, "completions/min_length": 98.0, "epoch": 0.017972240220986938, "frac_reward_zero_std": 0.0, "grad_norm": 0.23297828435897827, "kl": 0.06380652636289597, "learning_rate": 8.98091342876619e-07, "loss": 0.0007841649348847568, "memory(GiB)": 69.45, "reward": 0.8890288472175598, "reward_std": 0.048552438616752625, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9495192170143127, "rewards/PlanningActionSetORM/std": 0.053686752915382385, "rewards/RMReward/mean": 0.8739062547683716, "rewards/RMReward/std": 0.13440778851509094, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 527, "train_speed(iter/s)": 0.005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/mean_length": 161.96875, "completions/min_length": 78.0, "epoch": 0.018006343143607408, "frac_reward_zero_std": 0.0, "grad_norm": 1.0906813144683838, "kl": 0.07536664605140686, "learning_rate": 8.99795501022495e-07, "loss": -0.02834160439670086, "memory(GiB)": 69.45, "reward": 0.5863839387893677, "reward_std": 0.13769878447055817, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9457961320877075, "rewards/PlanningActionSetORM/std": 0.05693444982171059, "rewards/RMReward/mean": 0.8400000333786011, "rewards/RMReward/std": 0.13452424108982086, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3116087317466736, "rewards/VisualPerceptionAccuracy/std": 0.23469278216362, "step": 528, "train_speed(iter/s)": 0.005045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 128.234375, "completions/min_length": 8.0, "epoch": 0.018040446066227876, "frac_reward_zero_std": 0.0, "grad_norm": 27.454957962036133, "kl": 0.9667730927467346, "learning_rate": 9.014996591683709e-07, "loss": 0.011352570727467537, "memory(GiB)": 69.45, "reward": 0.5467187762260437, "reward_std": 0.1979755461215973, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8687499761581421, "rewards/RMReward/std": 0.12848371267318726, "rewards/SpatialReasoningORM/mean": 0.15625, "rewards/SpatialReasoningORM/std": 0.3689020276069641, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 529, "train_speed(iter/s)": 0.005047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/mean_length": 224.109375, "completions/min_length": 80.0, "epoch": 0.018074548988848343, "frac_reward_zero_std": 0.0, "grad_norm": 0.6423393487930298, "kl": 0.0633222758769989, "learning_rate": 9.032038173142469e-07, "loss": -0.0012853629887104034, "memory(GiB)": 69.45, "reward": 0.7277907729148865, "reward_std": 0.07663661986589432, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9743589758872986, "rewards/PlanningActionSetORM/std": 0.036645617336034775, "rewards/RMReward/mean": 0.8987500071525574, "rewards/RMReward/std": 0.1112334132194519, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.16954770684242249, "rewards/VisualPerceptionAccuracy/std": 0.20288729667663574, "step": 530, "train_speed(iter/s)": 0.005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/mean_length": 141.984375, "completions/min_length": 8.0, "epoch": 0.018108651911468814, "frac_reward_zero_std": 0.0, "grad_norm": 12.267720222473145, "kl": 0.29743343591690063, "learning_rate": 9.049079754601228e-07, "loss": 0.00892700720578432, "memory(GiB)": 69.45, "reward": 0.8668854236602783, "reward_std": 0.16166946291923523, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9777777791023254, "rewards/PlanningActionSetORM/std": 0.031759534031152725, "rewards/RMReward/mean": 0.8579166531562805, "rewards/RMReward/std": 0.11950176954269409, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 531, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/mean_length": 143.4375, "completions/min_length": 67.0, "epoch": 0.01814275483408928, "frac_reward_zero_std": 0.0, "grad_norm": 1.144412875175476, "kl": 0.06796813011169434, "learning_rate": 9.066121336059987e-07, "loss": 0.007289940491318703, "memory(GiB)": 69.45, "reward": 0.9704999923706055, "reward_std": 0.07065418362617493, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9631249904632568, "rewards/RMReward/std": 0.12865452468395233, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 532, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/mean_length": 201.453125, "completions/min_length": 84.0, "epoch": 0.01817685775670975, "frac_reward_zero_std": 0.0, "grad_norm": 0.9556103348731995, "kl": 0.07171131670475006, "learning_rate": 9.083162917518747e-07, "loss": -0.03599182516336441, "memory(GiB)": 69.45, "reward": 0.7790621519088745, "reward_std": 0.0978582501411438, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9561107754707336, "rewards/PlanningActionSetORM/std": 0.036650072783231735, "rewards/RMReward/mean": 0.8320832848548889, "rewards/RMReward/std": 0.1939067244529724, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5455820560455322, "rewards/VisualPerceptionAccuracy/std": 0.23420444130897522, "step": 533, "train_speed(iter/s)": 0.005041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/mean_length": 177.1875, "completions/min_length": 14.0, "epoch": 0.01821096067933022, "frac_reward_zero_std": 0.0, "grad_norm": 9.445131301879883, "kl": 0.07122763991355896, "learning_rate": 9.100204498977507e-07, "loss": -0.029710188508033752, "memory(GiB)": 69.45, "reward": 0.37053191661834717, "reward_std": 0.1612531989812851, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8796875476837158, "rewards/PlanningActionSetORM/std": 0.01007781270891428, "rewards/RMReward/mean": 0.9137499928474426, "rewards/RMReward/std": 0.08114801347255707, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.17353259027004242, "rewards/VisualPerceptionAccuracy/std": 0.13847826421260834, "step": 534, "train_speed(iter/s)": 0.005042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 216.59375, "completions/min_length": 119.0, "epoch": 0.018245063601950686, "frac_reward_zero_std": 0.0, "grad_norm": 0.692345142364502, "kl": 0.07105313241481781, "learning_rate": 9.117246080436265e-07, "loss": -0.012786141596734524, "memory(GiB)": 69.45, "reward": 0.6359082460403442, "reward_std": 0.10473054647445679, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8678125143051147, "rewards/RMReward/std": 0.08059193193912506, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.37756654620170593, "rewards/VisualPerceptionAccuracy/std": 0.17514565587043762, "step": 535, "train_speed(iter/s)": 0.005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 65.625, "completions/min_length": 8.0, "epoch": 0.018279166524571157, "frac_reward_zero_std": 0.0, "grad_norm": 46.65381622314453, "kl": 0.4027332067489624, "learning_rate": 9.134287661895025e-07, "loss": 0.0126499617472291, "memory(GiB)": 69.45, "reward": 0.612106442451477, "reward_std": 0.298786997795105, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7749999761581421, "rewards/RMReward/std": 0.04082484170794487, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": 0.578425943851471, "rewards/VisualPerceptionAccuracy/std": 0.21248812973499298, "step": 536, "train_speed(iter/s)": 0.005044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/mean_length": 119.203125, "completions/min_length": 15.0, "epoch": 0.018313269447191624, "frac_reward_zero_std": 0.0, "grad_norm": 10.220913887023926, "kl": 0.061037350445985794, "learning_rate": 9.151329243353784e-07, "loss": -0.009929646737873554, "memory(GiB)": 69.45, "reward": 0.7221124172210693, "reward_std": 0.20320649445056915, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.9203125238418579, "rewards/RMReward/std": 0.0689312070608139, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.6319218873977661, "rewards/VisualPerceptionAccuracy/std": 0.2501765489578247, "step": 537, "train_speed(iter/s)": 0.005045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/mean_length": 102.796875, "completions/min_length": 88.0, "epoch": 0.01834737236981209, "frac_reward_zero_std": 0.0, "grad_norm": 1.4532033205032349, "kl": 0.08874626457691193, "learning_rate": 9.168370824812544e-07, "loss": 0.006834576837718487, "memory(GiB)": 69.45, "reward": 0.9297499656677246, "reward_std": 0.05067639425396919, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.96875, "rewards/PlanningActionSetORM/std": 0.054554473608732224, "rewards/RMReward/mean": 0.9200000166893005, "rewards/RMReward/std": 0.11081229895353317, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 538, "train_speed(iter/s)": 0.005044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1449.0, "completions/mean_length": 252.515625, "completions/min_length": 98.0, "epoch": 0.018381475292432562, "frac_reward_zero_std": 0.0, "grad_norm": 0.5588110685348511, "kl": 0.042644716799259186, "learning_rate": 9.185412406271303e-07, "loss": 0.026659224182367325, "memory(GiB)": 69.45, "reward": 0.7106016874313354, "reward_std": 0.09492629766464233, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8535416722297668, "rewards/RMReward/std": 0.16667574644088745, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1939067244529724, "rewards/VisualPerceptionAccuracy/std": 0.15160520374774933, "step": 539, "train_speed(iter/s)": 0.005042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 161.125, "completions/min_length": 71.0, "epoch": 0.01841557821505303, "frac_reward_zero_std": 0.0, "grad_norm": 1.203675389289856, "kl": 0.07213421911001205, "learning_rate": 9.202453987730062e-07, "loss": -0.003507504239678383, "memory(GiB)": 69.45, "reward": 0.5049945116043091, "reward_std": 0.20646178722381592, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8125, "rewards/RMReward/std": 0.12583057582378387, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3899926245212555, "rewards/VisualPerceptionAccuracy/std": 0.252645343542099, "step": 540, "train_speed(iter/s)": 0.005045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/mean_length": 8.0, "completions/min_length": 8.0, "epoch": 0.0184496811376735, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011251702380832285, "kl": 0.92578125, "learning_rate": 9.219495569188822e-07, "loss": 0.0009271809831261635, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 541, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 10.28125, "completions/min_length": 8.0, "epoch": 0.018483784060293967, "frac_reward_zero_std": 1.0, "grad_norm": 0.00044344921479932964, "kl": 0.6479012966156006, "learning_rate": 9.236537150647582e-07, "loss": 0.0006481154705397785, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 542, "train_speed(iter/s)": 0.005055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 272.546875, "completions/min_length": 99.0, "epoch": 0.018517886982914435, "frac_reward_zero_std": 0.0, "grad_norm": 0.5041850209236145, "kl": 0.06751353293657303, "learning_rate": 9.25357873210634e-07, "loss": -0.013320568948984146, "memory(GiB)": 69.45, "reward": 0.729346513748169, "reward_std": 0.0943823978304863, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8708333373069763, "rewards/RMReward/std": 0.17117656767368317, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.22738611698150635, "rewards/VisualPerceptionAccuracy/std": 0.15243124961853027, "step": 543, "train_speed(iter/s)": 0.005054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 145.75, "completions/min_length": 2.0, "epoch": 0.018551989905534905, "frac_reward_zero_std": 0.0, "grad_norm": 62.344661712646484, "kl": 0.0797518640756607, "learning_rate": 9.2706203135651e-07, "loss": -0.007158294320106506, "memory(GiB)": 69.45, "reward": 0.7493211030960083, "reward_std": 0.16384784877300262, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9829737544059753, "rewards/PlanningActionSetORM/std": 0.03187479078769684, "rewards/RMReward/mean": 0.7947916984558105, "rewards/RMReward/std": 0.12301166355609894, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5, "rewards/VisualPerceptionAccuracy/std": 0.5163977742195129, "step": 544, "train_speed(iter/s)": 0.005054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/mean_length": 288.890625, "completions/min_length": 192.0, "epoch": 0.018586092828155373, "frac_reward_zero_std": 0.0, "grad_norm": 0.36278942227363586, "kl": 0.05202200636267662, "learning_rate": 9.287661895023858e-07, "loss": -0.008755063638091087, "memory(GiB)": 69.45, "reward": 0.8200932741165161, "reward_std": 0.06266143918037415, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9474999308586121, "rewards/RMReward/std": 0.06459365040063858, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4063730835914612, "rewards/VisualPerceptionAccuracy/std": 0.17265407741069794, "step": 545, "train_speed(iter/s)": 0.005055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 183.28125, "completions/min_length": 109.0, "epoch": 0.01862019575077584, "frac_reward_zero_std": 0.0, "grad_norm": 0.559700608253479, "kl": 0.056542739272117615, "learning_rate": 9.304703476482619e-07, "loss": -0.004987402819097042, "memory(GiB)": 69.45, "reward": 0.8658945560455322, "reward_std": 0.0589890256524086, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9683333039283752, "rewards/RMReward/std": 0.053607452660799026, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5395783185958862, "rewards/VisualPerceptionAccuracy/std": 0.15822799503803253, "step": 546, "train_speed(iter/s)": 0.005055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/mean_length": 144.453125, "completions/min_length": 8.0, "epoch": 0.01865429867339631, "frac_reward_zero_std": 0.0, "grad_norm": 51.44295120239258, "kl": 1.0541201829910278, "learning_rate": 9.321745057941379e-07, "loss": -0.0037098415195941925, "memory(GiB)": 69.45, "reward": 0.33655378222465515, "reward_std": 0.24492914974689484, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.15625, "rewards/SpatialReasoningORM/std": 0.3689020276069641, "rewards/VisualPerceptionAccuracy/mean": 0.47467005252838135, "rewards/VisualPerceptionAccuracy/std": 0.19165082275867462, "step": 547, "train_speed(iter/s)": 0.005062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/mean_length": 129.328125, "completions/min_length": 8.0, "epoch": 0.018688401596016778, "frac_reward_zero_std": 0.0, "grad_norm": 36.584068298339844, "kl": 0.49440091848373413, "learning_rate": 9.338786639400136e-07, "loss": 0.00788768008351326, "memory(GiB)": 69.45, "reward": 0.8798333406448364, "reward_std": 0.14017046988010406, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9930555820465088, "rewards/PlanningActionSetORM/std": 0.023275842890143394, "rewards/RMReward/mean": 0.9004166722297668, "rewards/RMReward/std": 0.08953683078289032, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 548, "train_speed(iter/s)": 0.005058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/mean_length": 99.71875, "completions/min_length": 9.0, "epoch": 0.01872250451863725, "frac_reward_zero_std": 0.0, "grad_norm": 14.793909072875977, "kl": 0.304597944021225, "learning_rate": 9.355828220858897e-07, "loss": -0.01532004401087761, "memory(GiB)": 69.45, "reward": 0.6667544841766357, "reward_std": 0.18667051196098328, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9175000190734863, "rewards/RMReward/std": 0.08805301785469055, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.43838396668434143, "rewards/VisualPerceptionAccuracy/std": 0.18612627685070038, "step": 549, "train_speed(iter/s)": 0.005059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 209.9375, "completions/min_length": 98.0, "epoch": 0.018756607441257716, "frac_reward_zero_std": 0.0, "grad_norm": 0.9187057018280029, "kl": 0.08939192444086075, "learning_rate": 9.372869802317657e-07, "loss": -0.01892612688243389, "memory(GiB)": 69.45, "reward": 0.6155439019203186, "reward_std": 0.12545594573020935, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9003125429153442, "rewards/RMReward/std": 0.09416814893484116, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3108378052711487, "rewards/VisualPerceptionAccuracy/std": 0.23014158010482788, "step": 550, "train_speed(iter/s)": 0.005062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 157.90625, "completions/min_length": 99.0, "epoch": 0.018790710363878183, "frac_reward_zero_std": 0.0, "grad_norm": 0.7574045062065125, "kl": 0.08030415326356888, "learning_rate": 9.389911383776415e-07, "loss": 0.0028751734644174576, "memory(GiB)": 69.45, "reward": 0.9099999666213989, "reward_std": 0.06853172183036804, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8875000476837158, "rewards/RMReward/std": 0.10488088428974152, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 551, "train_speed(iter/s)": 0.005059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/mean_length": 141.203125, "completions/min_length": 97.0, "epoch": 0.018824813286498654, "frac_reward_zero_std": 0.0, "grad_norm": 0.9449818134307861, "kl": 0.08533893525600433, "learning_rate": 9.406952965235175e-07, "loss": -0.01932985708117485, "memory(GiB)": 69.45, "reward": 0.7856884002685547, "reward_std": 0.06734622269868851, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8956249356269836, "rewards/RMReward/std": 0.10770144313573837, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.39325350522994995, "rewards/VisualPerceptionAccuracy/std": 0.16040877997875214, "step": 552, "train_speed(iter/s)": 0.005057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/mean_length": 180.3125, "completions/min_length": 9.0, "epoch": 0.01885891620911912, "frac_reward_zero_std": 0.0, "grad_norm": 20.171039581298828, "kl": 0.22990846633911133, "learning_rate": 9.423994546693933e-07, "loss": 0.019855637103319168, "memory(GiB)": 69.45, "reward": 0.7948500514030457, "reward_std": 0.16228285431861877, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.981249988079071, "rewards/RMReward/std": 0.046402864158153534, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5163977742195129, "rewards/VisualPerceptionAccuracy/mean": 0.7066226005554199, "rewards/VisualPerceptionAccuracy/std": 0.0954136848449707, "step": 553, "train_speed(iter/s)": 0.00506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/mean_length": 47.796875, "completions/min_length": 8.0, "epoch": 0.01889301913173959, "frac_reward_zero_std": 0.75, "grad_norm": 0.2939930856227875, "kl": 0.6741414070129395, "learning_rate": 9.441036128152693e-07, "loss": 0.000800715759396553, "memory(GiB)": 69.45, "reward": 0.9761250019073486, "reward_std": 0.029758751392364502, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8806250095367432, "rewards/RMReward/std": 0.1487937718629837, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 554, "train_speed(iter/s)": 0.005058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 160.90625, "completions/min_length": 9.0, "epoch": 0.01892712205436006, "frac_reward_zero_std": 0.0, "grad_norm": 18.157135009765625, "kl": 0.2580203115940094, "learning_rate": 9.458077709611454e-07, "loss": 0.0028847772628068924, "memory(GiB)": 69.45, "reward": 0.6692368984222412, "reward_std": 0.1342194676399231, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.875, "rewards/PlanningActionSetORM/std": 0.1270001232624054, "rewards/RMReward/mean": 0.6453125476837158, "rewards/RMReward/std": 0.1381673663854599, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.3538224399089813, "rewards/VisualPerceptionAccuracy/std": 0.13045679032802582, "step": 555, "train_speed(iter/s)": 0.00506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/mean_length": 76.125, "completions/min_length": 8.0, "epoch": 0.018961224976980526, "frac_reward_zero_std": 0.75, "grad_norm": 0.07970462739467621, "kl": 0.49881449341773987, "learning_rate": 9.475119291070211e-07, "loss": 0.0007580660749226809, "memory(GiB)": 69.45, "reward": 0.7619999647140503, "reward_std": 0.0008944290457293391, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9975000023841858, "rewards/RMReward/std": 0.0044721318408846855, "rewards/SpatialReasoningORM/mean": 0.6666666865348816, "rewards/SpatialReasoningORM/std": 0.47639307379722595, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 556, "train_speed(iter/s)": 0.005059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 96.671875, "completions/min_length": 8.0, "epoch": 0.018995327899600997, "frac_reward_zero_std": 0.0, "grad_norm": 15.058279991149902, "kl": 0.39824414253234863, "learning_rate": 9.492160872528971e-07, "loss": 0.04577939957380295, "memory(GiB)": 69.45, "reward": 0.7091169357299805, "reward_std": 0.2877240777015686, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.75, "rewards/SpatialReasoningORM/std": 0.4399413466453552, "rewards/VisualPerceptionAccuracy/mean": 0.6557338237762451, "rewards/VisualPerceptionAccuracy/std": 0.23187987506389618, "step": 557, "train_speed(iter/s)": 0.005066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/mean_length": 86.5625, "completions/min_length": 2.0, "epoch": 0.019029430822221464, "frac_reward_zero_std": 0.0, "grad_norm": 46.0914421081543, "kl": 0.27371934056282043, "learning_rate": 9.509202453987732e-07, "loss": 0.001107212621718645, "memory(GiB)": 69.45, "reward": 0.7930468320846558, "reward_std": 0.1716383546590805, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9661458134651184, "rewards/PlanningActionSetORM/std": 0.0561366081237793, "rewards/RMReward/mean": 0.8979166150093079, "rewards/RMReward/std": 0.07477506995201111, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4375, "rewards/VisualPerceptionAccuracy/std": 0.5123475790023804, "step": 558, "train_speed(iter/s)": 0.005067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 287.828125, "completions/min_length": 118.0, "epoch": 0.019063533744841932, "frac_reward_zero_std": 0.0, "grad_norm": 0.633152186870575, "kl": 0.045867063105106354, "learning_rate": 9.52624403544649e-07, "loss": -0.010757895186543465, "memory(GiB)": 69.45, "reward": 0.5866279602050781, "reward_std": 0.0998605489730835, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8656250238418579, "rewards/RMReward/std": 0.08370213210582733, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.28075599670410156, "rewards/VisualPerceptionAccuracy/std": 0.16565020382404327, "step": 559, "train_speed(iter/s)": 0.005064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 153.96875, "completions/min_length": 8.0, "epoch": 0.019097636667462402, "frac_reward_zero_std": 0.0, "grad_norm": 9.48554515838623, "kl": 0.3239016532897949, "learning_rate": 9.54328561690525e-07, "loss": 0.03131520748138428, "memory(GiB)": 69.45, "reward": 0.5256257057189941, "reward_std": 0.17381760478019714, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.9993749856948853, "rewards/RMReward/std": 0.002499997615814209, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.4993138313293457, "rewards/VisualPerceptionAccuracy/std": 0.22873076796531677, "step": 560, "train_speed(iter/s)": 0.005068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 163.75, "completions/min_length": 7.0, "epoch": 0.01913173959008287, "frac_reward_zero_std": 0.0, "grad_norm": 12.306177139282227, "kl": 0.4355328679084778, "learning_rate": 9.560327198364008e-07, "loss": -0.01617269776761532, "memory(GiB)": 69.45, "reward": 0.644644558429718, "reward_std": 0.1777346134185791, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8815624713897705, "rewards/RMReward/std": 0.1318872720003128, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": 0.4212031662464142, "rewards/VisualPerceptionAccuracy/std": 0.1834731549024582, "step": 561, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 135.703125, "completions/min_length": 97.0, "epoch": 0.019165842512703337, "frac_reward_zero_std": 0.0, "grad_norm": 0.9093096256256104, "kl": 0.10007504373788834, "learning_rate": 9.577368779822768e-07, "loss": 0.02019672840833664, "memory(GiB)": 69.45, "reward": 0.7089378237724304, "reward_std": 0.057943351566791534, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9162499904632568, "rewards/RMReward/std": 0.10753771662712097, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.036751266568899155, "rewards/VisualPerceptionAccuracy/std": 0.04185263067483902, "step": 562, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 192.078125, "completions/min_length": 105.0, "epoch": 0.019199945435323808, "frac_reward_zero_std": 0.0, "grad_norm": 1.7467347383499146, "kl": 0.07339198142290115, "learning_rate": 9.594410361281528e-07, "loss": -0.0001916736364364624, "memory(GiB)": 69.45, "reward": 0.9119445085525513, "reward_std": 0.08148644864559174, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9722222089767456, "rewards/PlanningActionSetORM/std": 0.04849286377429962, "rewards/RMReward/mean": 0.8968750238418579, "rewards/RMReward/std": 0.1369958370923996, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 563, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 134.90625, "completions/min_length": 14.0, "epoch": 0.019234048357944275, "frac_reward_zero_std": 0.0, "grad_norm": 2.374384880065918, "kl": 0.06840870529413223, "learning_rate": 9.611451942740287e-07, "loss": 0.0010516326874494553, "memory(GiB)": 69.45, "reward": 0.5853750705718994, "reward_std": 0.13704952597618103, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9390624761581421, "rewards/RMReward/std": 0.043504487723112106, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.21087533235549927, "rewards/VisualPerceptionAccuracy/std": 0.11183837801218033, "step": 564, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/mean_length": 130.390625, "completions/min_length": 8.0, "epoch": 0.019268151280564746, "frac_reward_zero_std": 0.0, "grad_norm": 17.008996963500977, "kl": 0.7021955251693726, "learning_rate": 9.628493524199047e-07, "loss": 0.010382598266005516, "memory(GiB)": 69.45, "reward": 0.5595766305923462, "reward_std": 0.23390847444534302, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9098290205001831, "rewards/PlanningActionSetORM/std": 0.02151315100491047, "rewards/RMReward/mean": 0.7749999761581421, "rewards/RMReward/std": 0.17367005348205566, "rewards/SpatialReasoningORM/mean": 0.28125, "rewards/SpatialReasoningORM/std": 0.45680341124534607, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 565, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/mean_length": 8.5, "completions/min_length": 8.0, "epoch": 0.019302254203185213, "frac_reward_zero_std": 1.0, "grad_norm": 5.2708703151438385e-05, "kl": 1.304296851158142, "learning_rate": 9.645535105657807e-07, "loss": 0.001305215759202838, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 566, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/mean_length": 214.4375, "completions/min_length": 14.0, "epoch": 0.01933635712580568, "frac_reward_zero_std": 0.0, "grad_norm": 9.969717025756836, "kl": 0.05859646201133728, "learning_rate": 9.662576687116565e-07, "loss": -0.020782489329576492, "memory(GiB)": 69.45, "reward": 0.7881770730018616, "reward_std": 0.16293998062610626, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9555556178092957, "rewards/PlanningActionSetORM/std": 0.031759534031152725, "rewards/RMReward/mean": 0.9302083849906921, "rewards/RMReward/std": 0.14177431166172028, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 567, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 140.765625, "completions/min_length": 14.0, "epoch": 0.01937046004842615, "frac_reward_zero_std": 0.0, "grad_norm": 14.158466339111328, "kl": 0.061703942716121674, "learning_rate": 9.679618268575324e-07, "loss": 0.013664056546986103, "memory(GiB)": 69.45, "reward": 0.6009237766265869, "reward_std": 0.17090725898742676, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8615624904632568, "rewards/RMReward/std": 0.13995641469955444, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.3376948833465576, "rewards/VisualPerceptionAccuracy/std": 0.209686279296875, "step": 568, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/mean_length": 145.65625, "completions/min_length": 8.0, "epoch": 0.01940456297104662, "frac_reward_zero_std": 0.0, "grad_norm": 24.941720962524414, "kl": 0.5424534678459167, "learning_rate": 9.696659850034084e-07, "loss": 0.002549818716943264, "memory(GiB)": 69.45, "reward": 0.6860748529434204, "reward_std": 0.29236090183258057, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9090909361839294, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.75, "rewards/RMReward/std": 0.06582805514335632, "rewards/SpatialReasoningORM/mean": 0.71875, "rewards/SpatialReasoningORM/std": 0.45680341124534607, "rewards/VisualPerceptionAccuracy/mean": 0.49685636162757874, "rewards/VisualPerceptionAccuracy/std": 0.23715028166770935, "step": 569, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 196.40625, "completions/min_length": 159.0, "epoch": 0.019438665893667086, "frac_reward_zero_std": 0.0, "grad_norm": 0.578796923160553, "kl": 0.08008027821779251, "learning_rate": 9.713701431492844e-07, "loss": -0.011226577684283257, "memory(GiB)": 69.45, "reward": 0.8746659159660339, "reward_std": 0.07300561666488647, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9629629254341125, "rewards/PlanningActionSetORM/std": 0.052932560443878174, "rewards/RMReward/mean": 0.96875, "rewards/RMReward/std": 0.05963095650076866, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5958858728408813, "rewards/VisualPerceptionAccuracy/std": 0.2061050981283188, "step": 570, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/mean_length": 191.765625, "completions/min_length": 100.0, "epoch": 0.019472768816287556, "frac_reward_zero_std": 0.0, "grad_norm": 0.4916422963142395, "kl": 0.08505360037088394, "learning_rate": 9.730743012951604e-07, "loss": -0.00029043108224868774, "memory(GiB)": 69.45, "reward": 0.8821250200271606, "reward_std": 0.05709223449230194, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8526562452316284, "rewards/RMReward/std": 0.18042491376399994, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 571, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/mean_length": 205.921875, "completions/min_length": 87.0, "epoch": 0.019506871738908024, "frac_reward_zero_std": 0.0, "grad_norm": 1.6124522686004639, "kl": 0.09229462593793869, "learning_rate": 9.747784594410362e-07, "loss": 0.006091265007853508, "memory(GiB)": 69.45, "reward": 0.6477372646331787, "reward_std": 0.146388977766037, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9463541507720947, "rewards/PlanningActionSetORM/std": 0.10617023706436157, "rewards/RMReward/mean": 0.7853125333786011, "rewards/RMReward/std": 0.15737737715244293, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4779536724090576, "rewards/VisualPerceptionAccuracy/std": 0.18329881131649017, "step": 572, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 181.5625, "completions/min_length": 89.0, "epoch": 0.019540974661528494, "frac_reward_zero_std": 0.0, "grad_norm": 0.7715350389480591, "kl": 0.08043035119771957, "learning_rate": 9.764826175869121e-07, "loss": -0.02815844491124153, "memory(GiB)": 69.45, "reward": 0.7917987108230591, "reward_std": 0.05351916328072548, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8820833563804626, "rewards/RMReward/std": 0.11394209414720535, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.45019498467445374, "rewards/VisualPerceptionAccuracy/std": 0.08562951534986496, "step": 573, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 120.8125, "completions/min_length": 8.0, "epoch": 0.01957507758414896, "frac_reward_zero_std": 0.0, "grad_norm": 6.437344551086426, "kl": 0.32105040550231934, "learning_rate": 9.781867757327881e-07, "loss": 0.010588712990283966, "memory(GiB)": 69.45, "reward": 0.8060764074325562, "reward_std": 0.17737440764904022, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9259259104728699, "rewards/PlanningActionSetORM/std": 0.052932560443878174, "rewards/RMReward/mean": 0.84375, "rewards/RMReward/std": 0.1422557681798935, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 574, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 140.21875, "completions/min_length": 8.0, "epoch": 0.01960918050676943, "frac_reward_zero_std": 0.5, "grad_norm": 0.5259078145027161, "kl": 0.5687436461448669, "learning_rate": 9.79890933878664e-07, "loss": -0.006701434031128883, "memory(GiB)": 69.45, "reward": 0.850287675857544, "reward_std": 0.03804095834493637, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9812500476837158, "rewards/RMReward/std": 0.02217356488108635, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.4161507189273834, "rewards/VisualPerceptionAccuracy/std": 0.13442498445510864, "step": 575, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 172.1875, "completions/min_length": 11.0, "epoch": 0.0196432834293899, "frac_reward_zero_std": 0.0, "grad_norm": 14.50450611114502, "kl": 0.20074084401130676, "learning_rate": 9.8159509202454e-07, "loss": 0.01961521804332733, "memory(GiB)": 69.45, "reward": 0.7650281190872192, "reward_std": 0.12415122240781784, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9615384340286255, "rewards/PlanningActionSetORM/std": 0.03907695785164833, "rewards/RMReward/mean": 0.9678125381469727, "rewards/RMReward/std": 0.0596080906689167, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.18637210130691528, "rewards/VisualPerceptionAccuracy/std": 0.16294333338737488, "step": 576, "train_speed(iter/s)": 0.005067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/mean_length": 175.203125, "completions/min_length": 88.0, "epoch": 0.019677386352010367, "frac_reward_zero_std": 0.0, "grad_norm": 0.6864320039749146, "kl": 0.07831063121557236, "learning_rate": 9.832992501704158e-07, "loss": 0.011336009949445724, "memory(GiB)": 69.45, "reward": 0.9213749766349792, "reward_std": 0.06131048873066902, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9017187356948853, "rewards/RMReward/std": 0.11476068198680878, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 577, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 154.96875, "completions/min_length": 8.0, "epoch": 0.019711489274630834, "frac_reward_zero_std": 0.0, "grad_norm": 9.915817260742188, "kl": 0.5420858860015869, "learning_rate": 9.850034083162918e-07, "loss": 0.004638134501874447, "memory(GiB)": 69.45, "reward": 0.8322411775588989, "reward_std": 0.15452061593532562, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9413993954658508, "rewards/PlanningActionSetORM/std": 0.05195165425539017, "rewards/RMReward/mean": 0.85875004529953, "rewards/RMReward/std": 0.13222523033618927, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 578, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/mean_length": 129.921875, "completions/min_length": 84.0, "epoch": 0.019745592197251305, "frac_reward_zero_std": 0.0, "grad_norm": 0.6848955154418945, "kl": 0.06669937819242477, "learning_rate": 9.867075664621678e-07, "loss": 0.00908304750919342, "memory(GiB)": 69.45, "reward": 0.9614999890327454, "reward_std": 0.03717753663659096, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9518750309944153, "rewards/RMReward/std": 0.055000003427267075, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 579, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/mean_length": 118.5, "completions/min_length": 8.0, "epoch": 0.019779695119871772, "frac_reward_zero_std": 0.0, "grad_norm": 9.116412162780762, "kl": 0.5385851860046387, "learning_rate": 9.884117246080436e-07, "loss": -0.044942885637283325, "memory(GiB)": 69.45, "reward": 0.6229464411735535, "reward_std": 0.18186837434768677, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9925000071525574, "rewards/RMReward/std": 0.01238278578966856, "rewards/SpatialReasoningORM/mean": 0.53125, "rewards/SpatialReasoningORM/std": 0.507007360458374, "rewards/VisualPerceptionAccuracy/mean": 0.3884108066558838, "rewards/VisualPerceptionAccuracy/std": 0.1555805504322052, "step": 580, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 253.625, "completions/min_length": 112.0, "epoch": 0.019813798042492243, "frac_reward_zero_std": 0.0, "grad_norm": 0.538908839225769, "kl": 0.06482309103012085, "learning_rate": 9.901158827539197e-07, "loss": -0.014766210690140724, "memory(GiB)": 69.45, "reward": 0.7413525581359863, "reward_std": 0.08218131959438324, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9812502861022949, "rewards/PlanningActionSetORM/std": 0.027844736352562904, "rewards/RMReward/mean": 0.7760416865348816, "rewards/RMReward/std": 0.11012062430381775, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5141600966453552, "rewards/VisualPerceptionAccuracy/std": 0.09870514273643494, "step": 581, "train_speed(iter/s)": 0.005075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/mean_length": 157.359375, "completions/min_length": 107.0, "epoch": 0.01984790096511271, "frac_reward_zero_std": 0.0, "grad_norm": 0.7029281258583069, "kl": 0.06895479559898376, "learning_rate": 9.918200408997957e-07, "loss": 0.017686648294329643, "memory(GiB)": 69.45, "reward": 0.8921718597412109, "reward_std": 0.024325886741280556, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9621093273162842, "rewards/PlanningActionSetORM/std": 0.058190107345581055, "rewards/RMReward/mean": 0.8746874928474426, "rewards/RMReward/std": 0.10824675858020782, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 582, "train_speed(iter/s)": 0.005074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/mean_length": 275.828125, "completions/min_length": 162.0, "epoch": 0.019882003887733177, "frac_reward_zero_std": 0.0, "grad_norm": 0.8430898189544678, "kl": 0.045328106731176376, "learning_rate": 9.935241990456715e-07, "loss": 0.012833267450332642, "memory(GiB)": 69.45, "reward": 0.8348972201347351, "reward_std": 0.09757710248231888, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9834144711494446, "rewards/PlanningActionSetORM/std": 0.023778876289725304, "rewards/RMReward/mean": 0.8914583325386047, "rewards/RMReward/std": 0.12612034380435944, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.610040009021759, "rewards/VisualPerceptionAccuracy/std": 0.21264442801475525, "step": 583, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 205.4375, "completions/min_length": 109.0, "epoch": 0.019916106810353648, "frac_reward_zero_std": 0.0, "grad_norm": 0.7325937747955322, "kl": 0.06598600000143051, "learning_rate": 9.952283571915475e-07, "loss": -0.03155674785375595, "memory(GiB)": 69.45, "reward": 0.6656622886657715, "reward_std": 0.20429235696792603, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8387500047683716, "rewards/RMReward/std": 0.13743619620800018, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.46032461524009705, "rewards/VisualPerceptionAccuracy/std": 0.32895681262016296, "step": 584, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/mean_length": 217.953125, "completions/min_length": 77.0, "epoch": 0.019950209732974115, "frac_reward_zero_std": 0.0, "grad_norm": 1.1169146299362183, "kl": 0.0690944492816925, "learning_rate": 9.969325153374232e-07, "loss": 0.020604852586984634, "memory(GiB)": 69.45, "reward": 0.7283851504325867, "reward_std": 0.124467633664608, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9793750047683716, "rewards/RMReward/std": 0.023796016350388527, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6433468461036682, "rewards/VisualPerceptionAccuracy/std": 0.24443164467811584, "step": 585, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 187.859375, "completions/min_length": 102.0, "epoch": 0.019984312655594586, "frac_reward_zero_std": 0.0, "grad_norm": 0.3249812722206116, "kl": 0.06101921945810318, "learning_rate": 9.986366734832992e-07, "loss": -0.002162558026611805, "memory(GiB)": 69.45, "reward": 0.9395288228988647, "reward_std": 0.0436226949095726, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9495192170143127, "rewards/PlanningActionSetORM/std": 0.053686752915382385, "rewards/RMReward/mean": 0.9370312690734863, "rewards/RMReward/std": 0.07761687785387039, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 586, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "epoch": 0.020018415578215053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022287829779088497, "kl": 0.8907352089881897, "learning_rate": 1.0003408316291754e-06, "loss": 0.0008914034115150571, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 587, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/mean_length": 163.40625, "completions/min_length": 14.0, "epoch": 0.02005251850083552, "frac_reward_zero_std": 0.0, "grad_norm": 4.40418815612793, "kl": 0.177895188331604, "learning_rate": 1.0020449897750512e-06, "loss": 0.01961541548371315, "memory(GiB)": 69.45, "reward": 0.5905665755271912, "reward_std": 0.19744659960269928, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.908750057220459, "rewards/RMReward/std": 0.1287345290184021, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.3395163118839264, "rewards/VisualPerceptionAccuracy/std": 0.2830416262149811, "step": 588, "train_speed(iter/s)": 0.005073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/mean_length": 132.65625, "completions/min_length": 8.0, "epoch": 0.02008662142345599, "frac_reward_zero_std": 0.0, "grad_norm": 21.201642990112305, "kl": 0.35731783509254456, "learning_rate": 1.0037491479209271e-06, "loss": 0.006269130855798721, "memory(GiB)": 69.45, "reward": 0.7665659189224243, "reward_std": 0.14119583368301392, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.09837386757135391, "rewards/RMReward/mean": 0.901562511920929, "rewards/RMReward/std": 0.10631994158029556, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": 0.4118887484073639, "rewards/VisualPerceptionAccuracy/std": 0.09271524101495743, "step": 589, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/mean_length": 180.140625, "completions/min_length": 106.0, "epoch": 0.02012072434607646, "frac_reward_zero_std": 0.0, "grad_norm": 0.6318662762641907, "kl": 0.07060574740171432, "learning_rate": 1.0054533060668031e-06, "loss": -0.0010040197521448135, "memory(GiB)": 69.45, "reward": 0.9648482799530029, "reward_std": 0.02718336321413517, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9529914259910583, "rewards/PlanningActionSetORM/std": 0.0489213801920414, "rewards/RMReward/mean": 0.9678125381469727, "rewards/RMReward/std": 0.038892146199941635, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 590, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/mean_length": 165.1875, "completions/min_length": 104.0, "epoch": 0.020154827268696926, "frac_reward_zero_std": 0.0, "grad_norm": 0.9086765646934509, "kl": 0.06333747506141663, "learning_rate": 1.007157464212679e-06, "loss": -0.00032998621463775635, "memory(GiB)": 69.45, "reward": 0.834690272808075, "reward_std": 0.13925784826278687, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8466666340827942, "rewards/RMReward/std": 0.09554620087146759, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7067610621452332, "rewards/VisualPerceptionAccuracy/std": 0.36168843507766724, "step": 591, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 118.859375, "completions/min_length": 8.0, "epoch": 0.020188930191317397, "frac_reward_zero_std": 0.0, "grad_norm": 7.784846782684326, "kl": 0.4836201071739197, "learning_rate": 1.0088616223585549e-06, "loss": -7.016398012638092e-05, "memory(GiB)": 69.45, "reward": 0.9139687418937683, "reward_std": 0.12101395428180695, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9308333396911621, "rewards/RMReward/std": 0.06111840903759003, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 592, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/mean_length": 204.984375, "completions/min_length": 14.0, "epoch": 0.020223033113937864, "frac_reward_zero_std": 0.0, "grad_norm": 4.435153961181641, "kl": 0.04758039116859436, "learning_rate": 1.0105657805044309e-06, "loss": -0.04342396557331085, "memory(GiB)": 69.45, "reward": 0.4402186870574951, "reward_std": 0.19307266175746918, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.824999988079071, "rewards/RMReward/std": 0.06324555724859238, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.3066873550415039, "rewards/VisualPerceptionAccuracy/std": 0.1640065759420395, "step": 593, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/mean_length": 129.171875, "completions/min_length": 8.0, "epoch": 0.020257136036558335, "frac_reward_zero_std": 0.0, "grad_norm": 23.631427764892578, "kl": 0.41114214062690735, "learning_rate": 1.0122699386503068e-06, "loss": -0.012110486626625061, "memory(GiB)": 69.45, "reward": 0.6116719245910645, "reward_std": 0.17577987909317017, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9545454978942871, "rewards/PlanningActionSetORM/std": 0.046181850135326385, "rewards/RMReward/mean": 0.9196875095367432, "rewards/RMReward/std": 0.06630741059780121, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.30586951971054077, "rewards/VisualPerceptionAccuracy/std": 0.18544621765613556, "step": 594, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/mean_length": 245.40625, "completions/min_length": 133.0, "epoch": 0.020291238959178802, "frac_reward_zero_std": 0.0, "grad_norm": 0.781380832195282, "kl": 0.06579278409481049, "learning_rate": 1.0139740967961828e-06, "loss": 0.019043099135160446, "memory(GiB)": 69.45, "reward": 0.8469815254211426, "reward_std": 0.05922040343284607, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9608333706855774, "rewards/RMReward/std": 0.06750360876321793, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.48192620277404785, "rewards/VisualPerceptionAccuracy/std": 0.1654459536075592, "step": 595, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/mean_length": 137.59375, "completions/min_length": 91.0, "epoch": 0.02032534188179927, "frac_reward_zero_std": 0.0, "grad_norm": 0.9161847233772278, "kl": 0.09804404526948929, "learning_rate": 1.0156782549420586e-06, "loss": 0.026949280872941017, "memory(GiB)": 69.45, "reward": 0.8717014193534851, "reward_std": 0.07383416593074799, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9208333492279053, "rewards/RMReward/std": 0.07397047430276871, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6768057346343994, "rewards/VisualPerceptionAccuracy/std": 0.12414327263832092, "step": 596, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 154.640625, "completions/min_length": 95.0, "epoch": 0.02035944480441974, "frac_reward_zero_std": 0.0, "grad_norm": 0.8805145621299744, "kl": 0.09851274639368057, "learning_rate": 1.0173824130879346e-06, "loss": -0.0020819595083594322, "memory(GiB)": 69.45, "reward": 0.7816742658615112, "reward_std": 0.09940771758556366, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8214583396911621, "rewards/RMReward/std": 0.1489393711090088, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5551971197128296, "rewards/VisualPerceptionAccuracy/std": 0.19911837577819824, "step": 597, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/mean_length": 111.953125, "completions/min_length": 8.0, "epoch": 0.020393547727040207, "frac_reward_zero_std": 0.75, "grad_norm": 0.1666620671749115, "kl": 0.27688533067703247, "learning_rate": 1.0190865712338105e-06, "loss": 0.038779690861701965, "memory(GiB)": 69.45, "reward": 0.898617684841156, "reward_std": 0.03418973088264465, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.984853982925415, "rewards/PlanningActionSetORM/std": 0.02086213231086731, "rewards/RMReward/mean": 0.49687498807907104, "rewards/RMReward/std": 0.17075201869010925, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 598, "train_speed(iter/s)": 0.005066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/mean_length": 130.40625, "completions/min_length": 104.0, "epoch": 0.020427650649660675, "frac_reward_zero_std": 0.0, "grad_norm": 0.8478360176086426, "kl": 0.0810132771730423, "learning_rate": 1.0207907293796865e-06, "loss": 0.00993831641972065, "memory(GiB)": 69.45, "reward": 0.80628502368927, "reward_std": 0.11972029507160187, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8952083587646484, "rewards/RMReward/std": 0.08097778260707855, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.47664010524749756, "rewards/VisualPerceptionAccuracy/std": 0.38299617171287537, "step": 599, "train_speed(iter/s)": 0.005065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/mean_length": 253.84375, "completions/min_length": 173.0, "epoch": 0.020461753572281145, "frac_reward_zero_std": 0.0, "grad_norm": 0.5284507274627686, "kl": 0.06979107111692429, "learning_rate": 1.0224948875255625e-06, "loss": -0.02551485225558281, "memory(GiB)": 69.45, "reward": 0.6894459128379822, "reward_std": 0.09663314372301102, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9495315551757812, "rewards/PlanningActionSetORM/std": 0.052865199744701385, "rewards/RMReward/mean": 0.6677083373069763, "rewards/RMReward/std": 0.18435902893543243, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5855648517608643, "rewards/VisualPerceptionAccuracy/std": 0.12789112329483032, "step": 600, "train_speed(iter/s)": 0.005064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 227.578125, "completions/min_length": 107.0, "epoch": 0.020495856494901613, "frac_reward_zero_std": 0.0, "grad_norm": 0.4753248393535614, "kl": 0.05230030417442322, "learning_rate": 1.0241990456714383e-06, "loss": -0.012041433714330196, "memory(GiB)": 69.45, "reward": 0.5930845737457275, "reward_std": 0.054562464356422424, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9967592358589172, "rewards/PlanningActionSetORM/std": 0.01636180654168129, "rewards/RMReward/mean": 0.7239583134651184, "rewards/RMReward/std": 0.07365068793296814, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.0367826372385025, "rewards/VisualPerceptionAccuracy/std": 0.07530242204666138, "step": 601, "train_speed(iter/s)": 0.005056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 158.1875, "completions/min_length": 2.0, "epoch": 0.020529959417522083, "frac_reward_zero_std": 0.0, "grad_norm": 99.28543853759766, "kl": 0.3547406792640686, "learning_rate": 1.0259032038173142e-06, "loss": 0.0018779914826154709, "memory(GiB)": 69.45, "reward": 0.7160288095474243, "reward_std": 0.18306365609169006, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9615384340286255, "rewards/PlanningActionSetORM/std": 0.03907695785164833, "rewards/RMReward/mean": 0.8465625047683716, "rewards/RMReward/std": 0.1477489024400711, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5625, "rewards/VisualPerceptionAccuracy/std": 0.504016101360321, "step": 602, "train_speed(iter/s)": 0.005058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/mean_length": 184.03125, "completions/min_length": 103.0, "epoch": 0.02056406234014255, "frac_reward_zero_std": 0.0, "grad_norm": 0.9790856242179871, "kl": 0.10084421932697296, "learning_rate": 1.0276073619631902e-06, "loss": 0.00413073506206274, "memory(GiB)": 69.45, "reward": 0.747917890548706, "reward_std": 0.1509975790977478, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.8888888955116272, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9806250333786011, "rewards/RMReward/std": 0.03837859630584717, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6764645576477051, "rewards/VisualPerceptionAccuracy/std": 0.27329596877098083, "step": 603, "train_speed(iter/s)": 0.005061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/mean_length": 9.203125, "completions/min_length": 8.0, "epoch": 0.020598165262763018, "frac_reward_zero_std": 0.75, "grad_norm": 13.86905574798584, "kl": 1.0166609287261963, "learning_rate": 1.0293115201090662e-06, "loss": -0.004336539190262556, "memory(GiB)": 69.45, "reward": 0.985156238079071, "reward_std": 0.05937499925494194, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.984375, "rewards/SpatialReasoningORM/std": 0.125, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 604, "train_speed(iter/s)": 0.005065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/mean_length": 148.875, "completions/min_length": 8.0, "epoch": 0.02063226818538349, "frac_reward_zero_std": 0.0, "grad_norm": 21.44233512878418, "kl": 0.3471256196498871, "learning_rate": 1.0310156782549422e-06, "loss": -0.0018490049988031387, "memory(GiB)": 69.45, "reward": 0.9200311899185181, "reward_std": 0.08853253722190857, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8914583325386047, "rewards/RMReward/std": 0.096711665391922, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 605, "train_speed(iter/s)": 0.005065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 113.75, "completions/min_length": 8.0, "epoch": 0.020666371108003956, "frac_reward_zero_std": 0.0, "grad_norm": 10.509926795959473, "kl": 0.22615915536880493, "learning_rate": 1.0327198364008181e-06, "loss": -0.005951210856437683, "memory(GiB)": 69.45, "reward": 0.5116132497787476, "reward_std": 0.23815029859542847, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.784375011920929, "rewards/RMReward/std": 0.05390964448451996, "rewards/SpatialReasoningORM/mean": 0.375, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": 0.4063514471054077, "rewards/VisualPerceptionAccuracy/std": 0.38156893849372864, "step": 606, "train_speed(iter/s)": 0.005067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 191.78125, "completions/min_length": 93.0, "epoch": 0.020700474030624423, "frac_reward_zero_std": 0.0, "grad_norm": 0.722585916519165, "kl": 0.056500766426324844, "learning_rate": 1.034423994546694e-06, "loss": -0.004074467346072197, "memory(GiB)": 69.45, "reward": 0.9262917041778564, "reward_std": 0.05316333845257759, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9520833492279053, "rewards/PlanningActionSetORM/std": 0.052578993141651154, "rewards/RMReward/mean": 0.9198437333106995, "rewards/RMReward/std": 0.09800375998020172, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 607, "train_speed(iter/s)": 0.005065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/mean_length": 93.328125, "completions/min_length": 8.0, "epoch": 0.020734576953244894, "frac_reward_zero_std": 0.0, "grad_norm": 13.837789535522461, "kl": 0.3804766535758972, "learning_rate": 1.0361281526925699e-06, "loss": -0.007672186940908432, "memory(GiB)": 69.45, "reward": 0.38167327642440796, "reward_std": 0.27300363779067993, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.28125, "rewards/SpatialReasoningORM/std": 0.45680341124534607, "rewards/VisualPerceptionAccuracy/mean": 0.4461590647697449, "rewards/VisualPerceptionAccuracy/std": 0.18188458681106567, "step": 608, "train_speed(iter/s)": 0.005068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/mean_length": 89.4375, "completions/min_length": 8.0, "epoch": 0.02076867987586536, "frac_reward_zero_std": 0.0, "grad_norm": 12.633415222167969, "kl": 0.5537222027778625, "learning_rate": 1.0378323108384459e-06, "loss": -0.02495698817074299, "memory(GiB)": 69.45, "reward": 0.6953464150428772, "reward_std": 0.16143546998500824, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8015625476837158, "rewards/RMReward/std": 0.05886613577604294, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": 0.6332605481147766, "rewards/VisualPerceptionAccuracy/std": 0.07958198338747025, "step": 609, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 77.125, "completions/min_length": 8.0, "epoch": 0.020802782798485832, "frac_reward_zero_std": 0.75, "grad_norm": 0.35545289516448975, "kl": 0.8676943778991699, "learning_rate": 1.0395364689843218e-06, "loss": -0.002611083909869194, "memory(GiB)": 69.45, "reward": 0.9480429887771606, "reward_std": 0.01854492537677288, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.7921720743179321, "rewards/VisualPerceptionAccuracy/std": 0.07417970150709152, "step": 610, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 10.625, "completions/min_length": 8.0, "epoch": 0.0208368857211063, "frac_reward_zero_std": 1.0, "grad_norm": 0.01112569309771061, "kl": 0.7939261198043823, "learning_rate": 1.0412406271301978e-06, "loss": 0.0007940603536553681, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 611, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 176.4375, "completions/min_length": 77.0, "epoch": 0.020870988643726766, "frac_reward_zero_std": 0.0, "grad_norm": 0.7735050320625305, "kl": 0.09691034257411957, "learning_rate": 1.0429447852760736e-06, "loss": -0.0008714459836483002, "memory(GiB)": 69.45, "reward": 0.7367771863937378, "reward_std": 0.09907714277505875, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9239583015441895, "rewards/RMReward/std": 0.10826954990625381, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.1296086311340332, "rewards/VisualPerceptionAccuracy/std": 0.2399129718542099, "step": 612, "train_speed(iter/s)": 0.005068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/mean_length": 264.453125, "completions/min_length": 88.0, "epoch": 0.020905091566347237, "frac_reward_zero_std": 0.0, "grad_norm": 1.113572359085083, "kl": 0.09925737231969833, "learning_rate": 1.0446489434219496e-06, "loss": 0.0026038959622383118, "memory(GiB)": 69.45, "reward": 0.4037373661994934, "reward_std": 0.1613774299621582, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4037373661994934, "rewards/VisualPerceptionAccuracy/std": 0.28352317214012146, "step": 613, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 165.65625, "completions/min_length": 100.0, "epoch": 0.020939194488967704, "frac_reward_zero_std": 0.0, "grad_norm": 0.8552205562591553, "kl": 0.0969882383942604, "learning_rate": 1.0463531015678256e-06, "loss": -0.029169317334890366, "memory(GiB)": 69.45, "reward": 0.8248602151870728, "reward_std": 0.10558507591485977, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9418009519577026, "rewards/PlanningActionSetORM/std": 0.05235295742750168, "rewards/RMReward/mean": 0.7956249713897705, "rewards/RMReward/std": 0.1800694465637207, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 614, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/mean_length": 137.234375, "completions/min_length": 85.0, "epoch": 0.02097329741158817, "frac_reward_zero_std": 0.0, "grad_norm": 1.0584965944290161, "kl": 0.12244784832000732, "learning_rate": 1.0480572597137015e-06, "loss": 0.01703382097184658, "memory(GiB)": 69.45, "reward": 0.9057499766349792, "reward_std": 0.05335640162229538, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0629940778017044, "rewards/RMReward/mean": 0.8978124856948853, "rewards/RMReward/std": 0.09842359274625778, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 615, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/mean_length": 89.203125, "completions/min_length": 2.0, "epoch": 0.021007400334208642, "frac_reward_zero_std": 0.0, "grad_norm": 32.58427429199219, "kl": 0.15722700953483582, "learning_rate": 1.0497614178595775e-06, "loss": 0.023406527936458588, "memory(GiB)": 69.45, "reward": 0.8022395372390747, "reward_std": 0.16459932923316956, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9899305701255798, "rewards/PlanningActionSetORM/std": 0.0587235726416111, "rewards/RMReward/mean": 0.9072916507720947, "rewards/RMReward/std": 0.061876311898231506, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4375, "rewards/VisualPerceptionAccuracy/std": 0.5123475790023804, "step": 616, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/mean_length": 218.21875, "completions/min_length": 87.0, "epoch": 0.02104150325682911, "frac_reward_zero_std": 0.0, "grad_norm": 0.8972752094268799, "kl": 0.08560322970151901, "learning_rate": 1.0514655760054535e-06, "loss": -0.0026321299374103546, "memory(GiB)": 69.45, "reward": 0.8512511253356934, "reward_std": 0.07745207846164703, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9795058369636536, "rewards/PlanningActionSetORM/std": 0.03812548145651817, "rewards/RMReward/mean": 0.8854166865348816, "rewards/RMReward/std": 0.15178313851356506, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6923012733459473, "rewards/VisualPerceptionAccuracy/std": 0.14769014716148376, "step": 617, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/mean_length": 145.21875, "completions/min_length": 99.0, "epoch": 0.02107560617944958, "frac_reward_zero_std": 0.0, "grad_norm": 0.9484493732452393, "kl": 0.1115007996559143, "learning_rate": 1.0531697341513293e-06, "loss": 0.011710233055055141, "memory(GiB)": 69.45, "reward": 0.7761962413787842, "reward_std": 0.0535687655210495, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9212962985038757, "rewards/PlanningActionSetORM/std": 0.05653200298547745, "rewards/RMReward/mean": 0.9393749833106995, "rewards/RMReward/std": 0.08056730031967163, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.297507107257843, "rewards/VisualPerceptionAccuracy/std": 0.06753293424844742, "step": 618, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/mean_length": 153.140625, "completions/min_length": 7.0, "epoch": 0.021109709102070048, "frac_reward_zero_std": 0.0, "grad_norm": 23.225505828857422, "kl": 0.90361487865448, "learning_rate": 1.0548738922972052e-06, "loss": 0.014329876750707626, "memory(GiB)": 69.45, "reward": 0.8822959065437317, "reward_std": 0.19437800347805023, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.996874988079071, "rewards/RMReward/std": 0.00478713121265173, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.24593468010425568, "rewards/VisualPerceptionAccuracy/mean": 0.6504336595535278, "rewards/VisualPerceptionAccuracy/std": 0.2986822724342346, "step": 619, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 197.765625, "completions/min_length": 103.0, "epoch": 0.021143812024690515, "frac_reward_zero_std": 0.0, "grad_norm": 0.5628483891487122, "kl": 0.09146958589553833, "learning_rate": 1.0565780504430812e-06, "loss": -0.0014934558421373367, "memory(GiB)": 69.45, "reward": 0.8109441995620728, "reward_std": 0.05458664521574974, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9629629254341125, "rewards/PlanningActionSetORM/std": 0.052932560443878174, "rewards/RMReward/mean": 0.9424999356269836, "rewards/RMReward/std": 0.11316528916358948, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4039990305900574, "rewards/VisualPerceptionAccuracy/std": 0.08968968689441681, "step": 620, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/mean_length": 33.5, "completions/min_length": 8.0, "epoch": 0.021177914947310986, "frac_reward_zero_std": 0.5, "grad_norm": 13.710577964782715, "kl": 0.48843294382095337, "learning_rate": 1.0582822085889572e-06, "loss": 0.0004882382636424154, "memory(GiB)": 69.45, "reward": 0.9323437809944153, "reward_std": 0.10588409006595612, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8843749761581421, "rewards/RMReward/std": 0.050723910331726074, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.24462303519248962, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 621, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 195.625, "completions/min_length": 14.0, "epoch": 0.021212017869931453, "frac_reward_zero_std": 0.0, "grad_norm": 12.112092018127441, "kl": 0.07662732899188995, "learning_rate": 1.0599863667348332e-06, "loss": 0.010002277791500092, "memory(GiB)": 69.45, "reward": 0.8965592384338379, "reward_std": 0.12008270621299744, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9472696781158447, "rewards/PlanningActionSetORM/std": 0.04607592895627022, "rewards/RMReward/mean": 0.9149999618530273, "rewards/RMReward/std": 0.1293189972639084, "rewards/SpatialReasoningORM/mean": 0.8125, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 622, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/mean_length": 144.265625, "completions/min_length": 82.0, "epoch": 0.02124612079255192, "frac_reward_zero_std": 0.0, "grad_norm": 0.6297332644462585, "kl": 0.089866504073143, "learning_rate": 1.061690524880709e-06, "loss": -0.008786613121628761, "memory(GiB)": 69.45, "reward": 0.8857246041297913, "reward_std": 0.07929383218288422, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.07939885556697845, "rewards/RMReward/mean": 0.9856249690055847, "rewards/RMReward/std": 0.02960062399506569, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6107316613197327, "rewards/VisualPerceptionAccuracy/std": 0.2773778736591339, "step": 623, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/mean_length": 234.671875, "completions/min_length": 8.0, "epoch": 0.02128022371517239, "frac_reward_zero_std": 0.0, "grad_norm": 27.945144653320312, "kl": 0.324948251247406, "learning_rate": 1.063394683026585e-06, "loss": 0.0014665925409644842, "memory(GiB)": 69.45, "reward": 0.8043022751808167, "reward_std": 0.14750231802463531, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9826404452323914, "rewards/PlanningActionSetORM/std": 0.024856556206941605, "rewards/RMReward/mean": 0.9008333086967468, "rewards/RMReward/std": 0.1527455598115921, "rewards/SpatialReasoningORM/mean": 0.4375, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 624, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2049.0, "completions/mean_length": 207.578125, "completions/min_length": 124.0, "epoch": 0.021314326637792858, "frac_reward_zero_std": 0.0, "grad_norm": 0.8949015140533447, "kl": 0.07912659645080566, "learning_rate": 1.0650988411724609e-06, "loss": -0.04310821741819382, "memory(GiB)": 69.45, "reward": 0.8746249675750732, "reward_std": 0.05369473621249199, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.921875, "rewards/PlanningActionSetORM/std": 0.15883378684520721, "rewards/RMReward/mean": 0.8628125190734863, "rewards/RMReward/std": 0.10337892174720764, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 625, "train_speed(iter/s)": 0.005065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 197.0, "completions/min_length": 107.0, "epoch": 0.02134842956041333, "frac_reward_zero_std": 0.0, "grad_norm": 0.9152810573577881, "kl": 0.05743296444416046, "learning_rate": 1.0668029993183369e-06, "loss": 0.010381028987467289, "memory(GiB)": 69.45, "reward": 0.8985791802406311, "reward_std": 0.06138690188527107, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9777777791023254, "rewards/PlanningActionSetORM/std": 0.031759534031152725, "rewards/RMReward/mean": 0.9204166531562805, "rewards/RMReward/std": 0.12727852165699005, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7986501455307007, "rewards/VisualPerceptionAccuracy/std": 0.1204228326678276, "step": 626, "train_speed(iter/s)": 0.005066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/mean_length": 198.5, "completions/min_length": 14.0, "epoch": 0.021382532483033796, "frac_reward_zero_std": 0.0, "grad_norm": 5.816657543182373, "kl": 0.07460163533687592, "learning_rate": 1.0685071574642128e-06, "loss": -0.0325903557240963, "memory(GiB)": 69.45, "reward": 0.5700090527534485, "reward_std": 0.15128281712532043, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8604369163513184, "rewards/PlanningActionSetORM/std": 0.023605410009622574, "rewards/RMReward/mean": 0.8837499618530273, "rewards/RMReward/std": 0.0713559091091156, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.5567244291305542, "rewards/VisualPerceptionAccuracy/std": 0.06170700863003731, "step": 627, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/mean_length": 179.546875, "completions/min_length": 70.0, "epoch": 0.021416635405654263, "frac_reward_zero_std": 0.0, "grad_norm": 0.824273943901062, "kl": 0.08272988349199295, "learning_rate": 1.0702113156100886e-06, "loss": 0.008530900813639164, "memory(GiB)": 69.45, "reward": 0.7324375510215759, "reward_std": 0.06650421023368835, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9710085391998291, "rewards/PlanningActionSetORM/std": 0.05349228158593178, "rewards/RMReward/mean": 0.8433333039283752, "rewards/RMReward/std": 0.1377684324979782, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.3231450319290161, "rewards/VisualPerceptionAccuracy/std": 0.09076780080795288, "step": 628, "train_speed(iter/s)": 0.005071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/mean_length": 265.578125, "completions/min_length": 106.0, "epoch": 0.021450738328274734, "frac_reward_zero_std": 0.0, "grad_norm": 0.6099980473518372, "kl": 0.0744352787733078, "learning_rate": 1.0719154737559646e-06, "loss": -0.024636365473270416, "memory(GiB)": 69.45, "reward": 0.7283816337585449, "reward_std": 0.12482352554798126, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9732604026794434, "rewards/PlanningActionSetORM/std": 0.02724376507103443, "rewards/RMReward/mean": 0.9353125095367432, "rewards/RMReward/std": 0.08007996529340744, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5138612985610962, "rewards/VisualPerceptionAccuracy/std": 0.23080827295780182, "step": 629, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 295.484375, "completions/min_length": 106.0, "epoch": 0.0214848412508952, "frac_reward_zero_std": 0.0, "grad_norm": 0.5781903266906738, "kl": 0.066648930311203, "learning_rate": 1.0736196319018406e-06, "loss": 0.009944068267941475, "memory(GiB)": 69.45, "reward": 0.8429242372512817, "reward_std": 0.06183529645204544, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9669743180274963, "rewards/PlanningActionSetORM/std": 0.023931795731186867, "rewards/RMReward/mean": 0.8168749809265137, "rewards/RMReward/std": 0.1335466504096985, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.8310121893882751, "rewards/VisualPerceptionAccuracy/std": 0.07469596713781357, "step": 630, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/mean_length": 129.765625, "completions/min_length": 8.0, "epoch": 0.02151894417351567, "frac_reward_zero_std": 0.0, "grad_norm": 3.369205951690674, "kl": 0.4130963683128357, "learning_rate": 1.0753237900477165e-06, "loss": 0.0060180798172950745, "memory(GiB)": 69.45, "reward": 0.8832812309265137, "reward_std": 0.11038023233413696, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9583333134651184, "rewards/PlanningActionSetORM/std": 0.059549134224653244, "rewards/RMReward/mean": 0.840624988079071, "rewards/RMReward/std": 0.2467353343963623, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 631, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/mean_length": 202.734375, "completions/min_length": 91.0, "epoch": 0.02155304709613614, "frac_reward_zero_std": 0.0, "grad_norm": 0.9316927194595337, "kl": 0.10935217887163162, "learning_rate": 1.0770279481935925e-06, "loss": 0.0010366933420300484, "memory(GiB)": 69.45, "reward": 0.7178343534469604, "reward_std": 0.08146975189447403, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.76583331823349, "rewards/RMReward/std": 0.1817975789308548, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4333375096321106, "rewards/VisualPerceptionAccuracy/std": 0.11835327744483948, "step": 632, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 166.421875, "completions/min_length": 14.0, "epoch": 0.021587150018756607, "frac_reward_zero_std": 0.0, "grad_norm": 9.232906341552734, "kl": 0.0636255294084549, "learning_rate": 1.0787321063394685e-06, "loss": 0.022913863882422447, "memory(GiB)": 69.45, "reward": 0.7508436441421509, "reward_std": 0.15116049349308014, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9814574122428894, "rewards/PlanningActionSetORM/std": 0.026587937027215958, "rewards/RMReward/mean": 0.8862500190734863, "rewards/RMReward/std": 0.12347150593996048, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 633, "train_speed(iter/s)": 0.005068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 122.15625, "completions/min_length": 9.0, "epoch": 0.021621252941377078, "frac_reward_zero_std": 0.0, "grad_norm": 4.634146213531494, "kl": 0.3875715136528015, "learning_rate": 1.0804362644853443e-06, "loss": -0.007271189242601395, "memory(GiB)": 69.45, "reward": 0.9060961604118347, "reward_std": 0.11024437099695206, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9215624928474426, "rewards/RMReward/std": 0.050232309848070145, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.8686346411705017, "rewards/VisualPerceptionAccuracy/std": 0.053534265607595444, "step": 634, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 180.578125, "completions/min_length": 8.0, "epoch": 0.021655355863997545, "frac_reward_zero_std": 0.0, "grad_norm": 7.020723819732666, "kl": 0.35148847103118896, "learning_rate": 1.0821404226312203e-06, "loss": -0.02180350385606289, "memory(GiB)": 69.45, "reward": 0.6388580799102783, "reward_std": 0.10130465030670166, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9590624570846558, "rewards/RMReward/std": 0.05721263960003853, "rewards/SpatialReasoningORM/mean": 0.0625, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.5115573406219482, "rewards/VisualPerceptionAccuracy/std": 0.08608399331569672, "step": 635, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/mean_length": 164.578125, "completions/min_length": 89.0, "epoch": 0.021689458786618012, "frac_reward_zero_std": 0.0, "grad_norm": 0.7922321557998657, "kl": 0.0843743234872818, "learning_rate": 1.0838445807770962e-06, "loss": -0.0033129937946796417, "memory(GiB)": 69.45, "reward": 0.8484668135643005, "reward_std": 0.0759945809841156, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9629629254341125, "rewards/PlanningActionSetORM/std": 0.052932560443878174, "rewards/RMReward/mean": 0.9591667056083679, "rewards/RMReward/std": 0.05775345116853714, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5140894651412964, "rewards/VisualPerceptionAccuracy/std": 0.17364172637462616, "step": 636, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/mean_length": 192.625, "completions/min_length": 108.0, "epoch": 0.021723561709238483, "frac_reward_zero_std": 0.0, "grad_norm": 0.7650750875473022, "kl": 0.09412381052970886, "learning_rate": 1.0855487389229722e-06, "loss": 0.0044282469898462296, "memory(GiB)": 69.45, "reward": 0.760411262512207, "reward_std": 0.0883994922041893, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9446874856948853, "rewards/RMReward/std": 0.06344844400882721, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5650725960731506, "rewards/VisualPerceptionAccuracy/std": 0.1699746698141098, "step": 637, "train_speed(iter/s)": 0.005072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 248.078125, "completions/min_length": 68.0, "epoch": 0.02175766463185895, "frac_reward_zero_std": 0.0, "grad_norm": 1.0502572059631348, "kl": 0.11494625359773636, "learning_rate": 1.0872528970688482e-06, "loss": -0.0634336993098259, "memory(GiB)": 69.45, "reward": 0.7599565386772156, "reward_std": 0.0810132771730423, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9025000333786011, "rewards/RMReward/std": 0.1101318821310997, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5979130864143372, "rewards/VisualPerceptionAccuracy/std": 0.13488124310970306, "step": 638, "train_speed(iter/s)": 0.00507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/mean_length": 132.875, "completions/min_length": 81.0, "epoch": 0.021791767554479417, "frac_reward_zero_std": 0.0, "grad_norm": 1.2431211471557617, "kl": 0.11839979887008667, "learning_rate": 1.088957055214724e-06, "loss": -0.013201842084527016, "memory(GiB)": 69.45, "reward": 0.7772490978240967, "reward_std": 0.1450939178466797, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8731250166893005, "rewards/RMReward/std": 0.10632990300655365, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6559982895851135, "rewards/VisualPerceptionAccuracy/std": 0.3009926676750183, "step": 639, "train_speed(iter/s)": 0.005069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/mean_length": 313.15625, "completions/min_length": 157.0, "epoch": 0.021825870477099888, "frac_reward_zero_std": 0.0, "grad_norm": 0.6811613440513611, "kl": 0.07065164297819138, "learning_rate": 1.0906612133606e-06, "loss": -0.007655586116015911, "memory(GiB)": 69.45, "reward": 0.6586880683898926, "reward_std": 0.09300946444272995, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9746631979942322, "rewards/PlanningActionSetORM/std": 0.025762883946299553, "rewards/RMReward/mean": 0.8218749761581421, "rewards/RMReward/std": 0.06342063844203949, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.464943528175354, "rewards/VisualPerceptionAccuracy/std": 0.3298482596874237, "step": 640, "train_speed(iter/s)": 0.005063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/mean_length": 186.609375, "completions/min_length": 104.0, "epoch": 0.021859973399720355, "frac_reward_zero_std": 0.0, "grad_norm": 0.576653003692627, "kl": 0.09444063901901245, "learning_rate": 1.092365371506476e-06, "loss": 0.006231339648365974, "memory(GiB)": 69.45, "reward": 0.8220658302307129, "reward_std": 0.055785346776247025, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9583333134651184, "rewards/PlanningActionSetORM/std": 0.059549134224653244, "rewards/RMReward/mean": 0.940625011920929, "rewards/RMReward/std": 0.07498669624328613, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.4557632505893707, "rewards/VisualPerceptionAccuracy/std": 0.08465660363435745, "step": 641, "train_speed(iter/s)": 0.005061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/mean_length": 63.546875, "completions/min_length": 8.0, "epoch": 0.021894076322340826, "frac_reward_zero_std": 0.0, "grad_norm": 33.95769119262695, "kl": 0.649439811706543, "learning_rate": 1.0940695296523519e-06, "loss": 0.04846624284982681, "memory(GiB)": 69.45, "reward": 0.8036875128746033, "reward_std": 0.24118542671203613, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.8959375619888306, "rewards/RMReward/std": 0.19292083382606506, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4709290862083435, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 642, "train_speed(iter/s)": 0.005065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 10.171875, "completions/min_length": 8.0, "epoch": 0.021928179244961293, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019419704331085086, "kl": 0.8770633339881897, "learning_rate": 1.0957736877982279e-06, "loss": 0.0008789491257630289, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 643, "train_speed(iter/s)": 0.005062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/mean_length": 85.4375, "completions/min_length": 9.0, "epoch": 0.02196228216758176, "frac_reward_zero_std": 0.0, "grad_norm": 20.499914169311523, "kl": 0.470020592212677, "learning_rate": 1.0974778459441036e-06, "loss": -0.004915645346045494, "memory(GiB)": 69.45, "reward": 0.7546719312667847, "reward_std": 0.11985612660646439, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9192708134651184, "rewards/PlanningActionSetORM/std": 0.06041513755917549, "rewards/RMReward/mean": 0.9329166412353516, "rewards/RMReward/std": 0.10809684544801712, "rewards/SpatialReasoningORM/mean": 0.1875, "rewards/SpatialReasoningORM/std": 0.40311288833618164, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 644, "train_speed(iter/s)": 0.005061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/mean_length": 159.59375, "completions/min_length": 102.0, "epoch": 0.02199638509020223, "frac_reward_zero_std": 0.0, "grad_norm": 0.7674838304519653, "kl": 0.08681795746088028, "learning_rate": 1.0991820040899796e-06, "loss": -0.002674211747944355, "memory(GiB)": 69.45, "reward": 0.877750039100647, "reward_std": 0.0617101714015007, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.96875, "rewards/PlanningActionSetORM/std": 0.054554473608732224, "rewards/RMReward/mean": 0.8550000190734863, "rewards/RMReward/std": 0.10780640691518784, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 645, "train_speed(iter/s)": 0.00506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 226.109375, "completions/min_length": 162.0, "epoch": 0.0220304880128227, "frac_reward_zero_std": 0.0, "grad_norm": 0.41219404339790344, "kl": 0.06585557758808136, "learning_rate": 1.1008861622358556e-06, "loss": 0.002311393618583679, "memory(GiB)": 69.45, "reward": 0.9518144130706787, "reward_std": 0.02764277532696724, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9321969747543335, "rewards/PlanningActionSetORM/std": 0.04268967732787132, "rewards/RMReward/mean": 0.9567187428474426, "rewards/RMReward/std": 0.06967166811227798, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 646, "train_speed(iter/s)": 0.005055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/mean_length": 106.640625, "completions/min_length": 88.0, "epoch": 0.022064590935443166, "frac_reward_zero_std": 0.0, "grad_norm": 2.0740652084350586, "kl": 0.15267354249954224, "learning_rate": 1.1025903203817316e-06, "loss": -0.001928909681737423, "memory(GiB)": 69.45, "reward": 0.8862500190734863, "reward_std": 0.03277144581079483, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0629940778017044, "rewards/RMReward/mean": 0.8734375238418579, "rewards/RMReward/std": 0.1292741447687149, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 647, "train_speed(iter/s)": 0.005054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/mean_length": 67.328125, "completions/min_length": 8.0, "epoch": 0.022098693858063637, "frac_reward_zero_std": 0.75, "grad_norm": 0.44697096943855286, "kl": 0.4780297875404358, "learning_rate": 1.1042944785276075e-06, "loss": 0.00726472120732069, "memory(GiB)": 69.45, "reward": 0.9798750281333923, "reward_std": 0.011954776011407375, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.949999988079071, "rewards/PlanningActionSetORM/std": 0.05080006271600723, "rewards/RMReward/mean": 0.9621875286102295, "rewards/RMReward/std": 0.05661040171980858, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 648, "train_speed(iter/s)": 0.005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 152.09375, "completions/min_length": 8.0, "epoch": 0.022132796780684104, "frac_reward_zero_std": 0.0, "grad_norm": 9.218460083007812, "kl": 0.3745150566101074, "learning_rate": 1.1059986366734835e-06, "loss": -0.021513212472200394, "memory(GiB)": 69.45, "reward": 0.704805850982666, "reward_std": 0.12216944992542267, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7906249761581421, "rewards/RMReward/std": 0.0611521378159523, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.5527366995811462, "rewards/VisualPerceptionAccuracy/std": 0.2989661693572998, "step": 649, "train_speed(iter/s)": 0.005053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 221.796875, "completions/min_length": 74.0, "epoch": 0.022166899703304575, "frac_reward_zero_std": 0.0, "grad_norm": 1.249754548072815, "kl": 0.1357862651348114, "learning_rate": 1.1077027948193593e-06, "loss": 0.04058995842933655, "memory(GiB)": 69.45, "reward": 0.6456972360610962, "reward_std": 0.18628185987472534, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.949999988079071, "rewards/PlanningActionSetORM/std": 0.06927039474248886, "rewards/RMReward/mean": 0.667187511920929, "rewards/RMReward/std": 0.20024555921554565, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.567644476890564, "rewards/VisualPerceptionAccuracy/std": 0.24314342439174652, "step": 650, "train_speed(iter/s)": 0.005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/mean_length": 141.25, "completions/min_length": 115.0, "epoch": 0.022201002625925042, "frac_reward_zero_std": 0.0, "grad_norm": 0.6761037707328796, "kl": 0.09360653907060623, "learning_rate": 1.1094069529652353e-06, "loss": -0.001005365513265133, "memory(GiB)": 69.45, "reward": 0.8796250224113464, "reward_std": 0.04885067045688629, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8495312333106995, "rewards/RMReward/std": 0.08787490427494049, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 651, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/mean_length": 154.671875, "completions/min_length": 101.0, "epoch": 0.02223510554854551, "frac_reward_zero_std": 0.0, "grad_norm": 0.9424182176589966, "kl": 0.12248295545578003, "learning_rate": 1.111111111111111e-06, "loss": 0.004313424229621887, "memory(GiB)": 69.45, "reward": 0.9048750400543213, "reward_std": 0.029949437826871872, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.11268723756074905, "rewards/RMReward/mean": 0.8967187404632568, "rewards/RMReward/std": 0.10430502891540527, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 652, "train_speed(iter/s)": 0.005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2049.0, "completions/mean_length": 204.375, "completions/min_length": 109.0, "epoch": 0.02226920847116598, "frac_reward_zero_std": 0.0, "grad_norm": 0.7239243388175964, "kl": 0.10000248998403549, "learning_rate": 1.1128152692569872e-06, "loss": -0.024395691230893135, "memory(GiB)": 69.45, "reward": 0.8309179544448853, "reward_std": 0.09791272133588791, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9334374666213989, "rewards/RMReward/std": 0.07694192230701447, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7150859236717224, "rewards/VisualPerceptionAccuracy/std": 0.22710393369197845, "step": 653, "train_speed(iter/s)": 0.00505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/mean_length": 118.34375, "completions/min_length": 103.0, "epoch": 0.022303311393786447, "frac_reward_zero_std": 0.0, "grad_norm": 0.8008916974067688, "kl": 0.11965703964233398, "learning_rate": 1.1145194274028632e-06, "loss": -0.007884303107857704, "memory(GiB)": 69.45, "reward": 0.9011250734329224, "reward_std": 0.04068359360098839, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0629940778017044, "rewards/RMReward/mean": 0.8920312523841858, "rewards/RMReward/std": 0.09694016724824905, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 654, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/mean_length": 156.921875, "completions/min_length": 8.0, "epoch": 0.022337414316406914, "frac_reward_zero_std": 0.0, "grad_norm": 11.886199951171875, "kl": 0.3538549244403839, "learning_rate": 1.116223585548739e-06, "loss": 0.016604585573077202, "memory(GiB)": 69.45, "reward": 0.6433441042900085, "reward_std": 0.16317786276340485, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7746875286102295, "rewards/RMReward/std": 0.24399873614311218, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": 0.587001383304596, "rewards/VisualPerceptionAccuracy/std": 0.09687134623527527, "step": 655, "train_speed(iter/s)": 0.00505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/mean_length": 94.5625, "completions/min_length": 8.0, "epoch": 0.022371517239027385, "frac_reward_zero_std": 0.0, "grad_norm": 28.972763061523438, "kl": 0.2724098563194275, "learning_rate": 1.117927743694615e-06, "loss": 0.002332506701350212, "memory(GiB)": 69.45, "reward": 0.7724375128746033, "reward_std": 0.16445966064929962, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.950624942779541, "rewards/RMReward/std": 0.0567358024418354, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.504016101360321, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 656, "train_speed(iter/s)": 0.005054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/mean_length": 217.765625, "completions/min_length": 105.0, "epoch": 0.022405620161647852, "frac_reward_zero_std": 0.0, "grad_norm": 1.4163240194320679, "kl": 0.10432842373847961, "learning_rate": 1.119631901840491e-06, "loss": 0.012681882828474045, "memory(GiB)": 69.45, "reward": 0.644245982170105, "reward_std": 0.02725367061793804, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9615384340286255, "rewards/PlanningActionSetORM/std": 0.03907695785164833, "rewards/RMReward/mean": 0.9512499570846558, "rewards/RMReward/std": 0.05723465234041214, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.335184246301651, "rewards/VisualPerceptionAccuracy/std": 0.34205976128578186, "step": 657, "train_speed(iter/s)": 0.005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/mean_length": 62.265625, "completions/min_length": 9.0, "epoch": 0.022439723084268323, "frac_reward_zero_std": 0.75, "grad_norm": 0.43651342391967773, "kl": 0.19509956240653992, "learning_rate": 1.1213360599863667e-06, "loss": -0.020812008529901505, "memory(GiB)": 69.45, "reward": 0.9612500071525574, "reward_std": 0.016683317720890045, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9750000238418579, "rewards/PlanningActionSetORM/std": 0.10000000149011612, "rewards/RMReward/mean": 0.8125, "rewards/RMReward/std": 0.07416198402643204, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 658, "train_speed(iter/s)": 0.005049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 162.578125, "completions/min_length": 88.0, "epoch": 0.02247382600688879, "frac_reward_zero_std": 0.0, "grad_norm": 0.9978545904159546, "kl": 0.08770737797021866, "learning_rate": 1.1230402181322429e-06, "loss": 0.009045792743563652, "memory(GiB)": 69.45, "reward": 0.8541932106018066, "reward_std": 0.059642244130373, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9002082943916321, "rewards/RMReward/std": 0.10292507708072662, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6562726497650146, "rewards/VisualPerceptionAccuracy/std": 0.06404475122690201, "step": 659, "train_speed(iter/s)": 0.005049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/mean_length": 130.046875, "completions/min_length": 15.0, "epoch": 0.022507928929509258, "frac_reward_zero_std": 0.0, "grad_norm": 2.473977565765381, "kl": 0.09331853687763214, "learning_rate": 1.1247443762781187e-06, "loss": 0.0037454813718795776, "memory(GiB)": 69.45, "reward": 0.3315076231956482, "reward_std": 0.15219949185848236, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.3857601583003998, "rewards/VisualPerceptionAccuracy/std": 0.27847421169281006, "step": 660, "train_speed(iter/s)": 0.005054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/mean_length": 231.625, "completions/min_length": 101.0, "epoch": 0.02254203185212973, "frac_reward_zero_std": 0.0, "grad_norm": 0.8194020390510559, "kl": 0.09983966499567032, "learning_rate": 1.1264485344239946e-06, "loss": -0.02089320495724678, "memory(GiB)": 69.45, "reward": 0.7375810146331787, "reward_std": 0.0667455866932869, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8556249737739563, "rewards/RMReward/std": 0.09280283004045486, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.29682400822639465, "rewards/VisualPerceptionAccuracy/std": 0.09590733051300049, "step": 661, "train_speed(iter/s)": 0.005054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 135.4375, "completions/min_length": 8.0, "epoch": 0.022576134774750196, "frac_reward_zero_std": 0.0, "grad_norm": 11.233111381530762, "kl": 0.466935932636261, "learning_rate": 1.1281526925698706e-06, "loss": 0.0018504168838262558, "memory(GiB)": 69.45, "reward": 0.730543851852417, "reward_std": 0.14991196990013123, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9937500357627869, "rewards/RMReward/std": 0.014973096549510956, "rewards/SpatialReasoningORM/mean": 0.25, "rewards/SpatialReasoningORM/std": 0.44721361994743347, "rewards/VisualPerceptionAccuracy/mean": 0.6446753740310669, "rewards/VisualPerceptionAccuracy/std": 0.15544559061527252, "step": 662, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 193.375, "completions/min_length": 129.0, "epoch": 0.022610237697370663, "frac_reward_zero_std": 0.0, "grad_norm": 0.70367431640625, "kl": 0.08101464807987213, "learning_rate": 1.1298568507157464e-06, "loss": 0.026699593290686607, "memory(GiB)": 69.45, "reward": 0.8345203399658203, "reward_std": 0.04432705044746399, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8827083110809326, "rewards/RMReward/std": 0.09999445080757141, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6195816397666931, "rewards/VisualPerceptionAccuracy/std": 0.04996377229690552, "step": 663, "train_speed(iter/s)": 0.00505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/mean_length": 106.78125, "completions/min_length": 8.0, "epoch": 0.022644340619991134, "frac_reward_zero_std": 0.5, "grad_norm": 0.667919397354126, "kl": 0.5756195187568665, "learning_rate": 1.1315610088616224e-06, "loss": -0.01271122694015503, "memory(GiB)": 69.45, "reward": 0.6962722539901733, "reward_std": 0.028825365006923676, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9861111044883728, "rewards/PlanningActionSetORM/std": 0.03795166686177254, "rewards/RMReward/mean": 0.890625, "rewards/RMReward/std": 0.06637957692146301, "rewards/SpatialReasoningORM/mean": 0.5, "rewards/SpatialReasoningORM/std": 0.5080004930496216, "rewards/VisualPerceptionAccuracy/mean": 0.8253666758537292, "rewards/VisualPerceptionAccuracy/std": 0.06552585959434509, "step": 664, "train_speed(iter/s)": 0.005048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 180.09375, "completions/min_length": 100.0, "epoch": 0.0226784435426116, "frac_reward_zero_std": 0.0, "grad_norm": 1.010596513748169, "kl": 0.11775590479373932, "learning_rate": 1.1332651670074985e-06, "loss": -0.007849869318306446, "memory(GiB)": 69.45, "reward": 0.833105742931366, "reward_std": 0.05040639638900757, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9987499713897705, "rewards/RMReward/std": 0.0034156469628214836, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7778077125549316, "rewards/VisualPerceptionAccuracy/std": 0.0834205150604248, "step": 665, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/mean_length": 102.578125, "completions/min_length": 8.0, "epoch": 0.022712546465232072, "frac_reward_zero_std": 0.0, "grad_norm": 12.00684928894043, "kl": 0.4902247488498688, "learning_rate": 1.1349693251533743e-06, "loss": -0.004618937149643898, "memory(GiB)": 69.45, "reward": 0.6475470066070557, "reward_std": 0.1217680275440216, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.14756080508232117, "rewards/RMReward/mean": 0.8912500143051147, "rewards/RMReward/std": 0.11350002884864807, "rewards/SpatialReasoningORM/mean": 0.125, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.6204380989074707, "rewards/VisualPerceptionAccuracy/std": 0.10008706152439117, "step": 666, "train_speed(iter/s)": 0.005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/mean_length": 84.328125, "completions/min_length": 8.0, "epoch": 0.02274664938785254, "frac_reward_zero_std": 0.75, "grad_norm": 0.2125938981771469, "kl": 0.27453112602233887, "learning_rate": 1.1366734832992503e-06, "loss": 0.004186577163636684, "memory(GiB)": 69.45, "reward": 0.7018623948097229, "reward_std": 0.013905012048780918, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9247481822967529, "rewards/PlanningActionSetORM/std": 0.0031735682860016823, "rewards/RMReward/mean": 0.7156250476837158, "rewards/RMReward/std": 0.0700446292757988, "rewards/SpatialReasoningORM/mean": 0.6666666865348816, "rewards/SpatialReasoningORM/std": 0.47639307379722595, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 667, "train_speed(iter/s)": 0.005053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/mean_length": 146.53125, "completions/min_length": 91.0, "epoch": 0.022780752310473006, "frac_reward_zero_std": 0.0, "grad_norm": 1.2476099729537964, "kl": 0.12591052055358887, "learning_rate": 1.138377641445126e-06, "loss": 0.022286424413323402, "memory(GiB)": 69.45, "reward": 0.7281593084335327, "reward_std": 0.07058458775281906, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9056249856948853, "rewards/RMReward/std": 0.10162986069917679, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5318186283111572, "rewards/VisualPerceptionAccuracy/std": 0.15983642637729645, "step": 668, "train_speed(iter/s)": 0.005054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/mean_length": 134.5625, "completions/min_length": 102.0, "epoch": 0.022814855233093477, "frac_reward_zero_std": 0.0, "grad_norm": 0.8328254222869873, "kl": 0.092910036444664, "learning_rate": 1.140081799591002e-06, "loss": 0.00054890476167202, "memory(GiB)": 69.45, "reward": 0.9047499895095825, "reward_std": 0.04346913844347, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8809374570846558, "rewards/RMReward/std": 0.09863704442977905, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 669, "train_speed(iter/s)": 0.005049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/mean_length": 68.5, "completions/min_length": 8.0, "epoch": 0.022848958155713944, "frac_reward_zero_std": 0.5, "grad_norm": 25.067270278930664, "kl": 0.6547112464904785, "learning_rate": 1.141785957736878e-06, "loss": -0.005024529527872801, "memory(GiB)": 69.45, "reward": 0.7763437628746033, "reward_std": 0.061904821544885635, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9950000047683716, "rewards/RMReward/std": 0.012649113312363625, "rewards/SpatialReasoningORM/mean": 0.6875, "rewards/SpatialReasoningORM/std": 0.4684174358844757, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 670, "train_speed(iter/s)": 0.005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/mean_length": 120.421875, "completions/min_length": 14.0, "epoch": 0.02288306107833441, "frac_reward_zero_std": 0.0, "grad_norm": 13.454248428344727, "kl": 0.07314649224281311, "learning_rate": 1.143490115882754e-06, "loss": 0.00624474510550499, "memory(GiB)": 69.45, "reward": 0.8867670297622681, "reward_std": 0.13527324795722961, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9696969985961914, "rewards/PlanningActionSetORM/std": 0.04330844804644585, "rewards/RMReward/mean": 0.9672916531562805, "rewards/RMReward/std": 0.0416029691696167, "rewards/SpatialReasoningORM/mean": 0.625, "rewards/SpatialReasoningORM/std": 0.5, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 671, "train_speed(iter/s)": 0.00505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 132.328125, "completions/min_length": 8.0, "epoch": 0.022917164000954882, "frac_reward_zero_std": 0.0, "grad_norm": 19.047618865966797, "kl": 0.3325996398925781, "learning_rate": 1.14519427402863e-06, "loss": -0.04993920773267746, "memory(GiB)": 69.45, "reward": 0.7156842947006226, "reward_std": 0.16162621974945068, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9375, "rewards/PlanningActionSetORM/std": 0.0635000616312027, "rewards/RMReward/mean": 0.7421875, "rewards/RMReward/std": 0.14318349957466125, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.3596119284629822, "rewards/VisualPerceptionAccuracy/std": 0.22092205286026, "step": 672, "train_speed(iter/s)": 0.005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/mean_length": 206.8125, "completions/min_length": 92.0, "epoch": 0.02295126692357535, "frac_reward_zero_std": 0.0, "grad_norm": 0.8595623970031738, "kl": 0.09030047804117203, "learning_rate": 1.146898432174506e-06, "loss": -0.00019137514755129814, "memory(GiB)": 69.45, "reward": 0.7988680601119995, "reward_std": 0.08483607321977615, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8421875238418579, "rewards/RMReward/std": 0.12385564297437668, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.723986029624939, "rewards/VisualPerceptionAccuracy/std": 0.1013011559844017, "step": 673, "train_speed(iter/s)": 0.005053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 211.765625, "completions/min_length": 95.0, "epoch": 0.02298536984619582, "frac_reward_zero_std": 0.0, "grad_norm": 0.9923982620239258, "kl": 0.10039536654949188, "learning_rate": 1.1486025903203817e-06, "loss": 0.004601847846060991, "memory(GiB)": 69.45, "reward": 0.8089588284492493, "reward_std": 0.07020868360996246, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9373219013214111, "rewards/PlanningActionSetORM/std": 0.04695754498243332, "rewards/RMReward/mean": 0.9037500023841858, "rewards/RMReward/std": 0.08997930586338043, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5044422149658203, "rewards/VisualPerceptionAccuracy/std": 0.08230265229940414, "step": 674, "train_speed(iter/s)": 0.005052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/mean_length": 222.796875, "completions/min_length": 132.0, "epoch": 0.023019472768816288, "frac_reward_zero_std": 0.0, "grad_norm": 0.5022323131561279, "kl": 0.09227496385574341, "learning_rate": 1.1503067484662577e-06, "loss": -0.03405272215604782, "memory(GiB)": 69.45, "reward": 0.8271827101707458, "reward_std": 0.06864678859710693, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9615384340286255, "rewards/PlanningActionSetORM/std": 0.03876558318734169, "rewards/RMReward/mean": 0.7935937643051147, "rewards/RMReward/std": 0.19491444528102875, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 675, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/mean_length": 167.0, "completions/min_length": 8.0, "epoch": 0.023053575691436755, "frac_reward_zero_std": 0.0, "grad_norm": 13.813305854797363, "kl": 0.28518909215927124, "learning_rate": 1.1520109066121337e-06, "loss": 0.0073455385863780975, "memory(GiB)": 69.45, "reward": 0.7800583243370056, "reward_std": 0.1567598283290863, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9472639560699463, "rewards/PlanningActionSetORM/std": 0.03894476220011711, "rewards/RMReward/mean": 0.9187500476837158, "rewards/RMReward/std": 0.078702911734581, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 676, "train_speed(iter/s)": 0.005048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 139.09375, "completions/min_length": 91.0, "epoch": 0.023087678614057226, "frac_reward_zero_std": 0.0, "grad_norm": 0.4850768446922302, "kl": 0.12082359939813614, "learning_rate": 1.1537150647580097e-06, "loss": 0.0010160142555832863, "memory(GiB)": 69.45, "reward": 0.9152499437332153, "reward_std": 0.04383638873696327, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8940625190734863, "rewards/RMReward/std": 0.07863353937864304, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 677, "train_speed(iter/s)": 0.005049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 182.5625, "completions/min_length": 9.0, "epoch": 0.023121781536677693, "frac_reward_zero_std": 0.0, "grad_norm": 19.23619842529297, "kl": 0.2684555947780609, "learning_rate": 1.1554192229038856e-06, "loss": 0.0005954131484031677, "memory(GiB)": 69.45, "reward": 0.8375135660171509, "reward_std": 0.16772685945034027, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.8861321806907654, "rewards/PlanningActionSetORM/std": 0.10225893557071686, "rewards/RMReward/mean": 0.9308333396911621, "rewards/RMReward/std": 0.07559221982955933, "rewards/SpatialReasoningORM/mean": 0.5625, "rewards/SpatialReasoningORM/std": 0.5123475790023804, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 678, "train_speed(iter/s)": 0.005047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/mean_length": 184.09375, "completions/min_length": 96.0, "epoch": 0.023155884459298164, "frac_reward_zero_std": 0.0, "grad_norm": 0.9859139323234558, "kl": 0.12552127242088318, "learning_rate": 1.1571233810497614e-06, "loss": -0.004666522145271301, "memory(GiB)": 69.45, "reward": 0.8613878488540649, "reward_std": 0.05404355376958847, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9747360348701477, "rewards/PlanningActionSetORM/std": 0.03619708865880966, "rewards/RMReward/mean": 0.862708330154419, "rewards/RMReward/std": 0.132355734705925, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7902098298072815, "rewards/VisualPerceptionAccuracy/std": 0.08863520622253418, "step": 679, "train_speed(iter/s)": 0.005047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/mean_length": 96.4375, "completions/min_length": 15.0, "epoch": 0.02318998738191863, "frac_reward_zero_std": 0.5, "grad_norm": 0.5255076289176941, "kl": 0.07732966542243958, "learning_rate": 1.1588275391956374e-06, "loss": 0.0011795995524153113, "memory(GiB)": 69.45, "reward": 0.7695292234420776, "reward_std": 0.014944653958082199, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.925000011920929, "rewards/RMReward/std": 0.036514829844236374, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": 0.13811691105365753, "rewards/VisualPerceptionAccuracy/std": 0.030566750094294548, "step": 680, "train_speed(iter/s)": 0.005048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/mean_length": 223.734375, "completions/min_length": 113.0, "epoch": 0.023224090304539098, "frac_reward_zero_std": 0.0, "grad_norm": 0.6321706771850586, "kl": 0.11338325589895248, "learning_rate": 1.1605316973415134e-06, "loss": -0.018057381734251976, "memory(GiB)": 69.45, "reward": 0.8863613605499268, "reward_std": 0.054743532091379166, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9743589758872986, "rewards/PlanningActionSetORM/std": 0.036645617336034775, "rewards/RMReward/mean": 0.9347917437553406, "rewards/RMReward/std": 0.08862999081611633, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7173301577568054, "rewards/VisualPerceptionAccuracy/std": 0.09915585815906525, "step": 681, "train_speed(iter/s)": 0.005049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/mean_length": 185.109375, "completions/min_length": 71.0, "epoch": 0.02325819322715957, "frac_reward_zero_std": 0.0, "grad_norm": 0.868769645690918, "kl": 0.10714240372180939, "learning_rate": 1.1622358554873893e-06, "loss": 0.004546355456113815, "memory(GiB)": 69.45, "reward": 0.8267320394515991, "reward_std": 0.076424740254879, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9803921580314636, "rewards/PlanningActionSetORM/std": 0.028023120015859604, "rewards/RMReward/mean": 0.8060416579246521, "rewards/RMReward/std": 0.09777829051017761, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.7841929197311401, "rewards/VisualPerceptionAccuracy/std": 0.1110343411564827, "step": 682, "train_speed(iter/s)": 0.005048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/mean_length": 196.03125, "completions/min_length": 44.0, "epoch": 0.023292296149780036, "frac_reward_zero_std": 0.0, "grad_norm": 0.7137486338615417, "kl": 0.11729224771261215, "learning_rate": 1.1639400136332653e-06, "loss": -0.06602177768945694, "memory(GiB)": 69.45, "reward": 0.6963150501251221, "reward_std": 0.09564469754695892, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9444444179534912, "rewards/PlanningActionSetORM/std": 0.05644449591636658, "rewards/RMReward/mean": 0.8546874523162842, "rewards/RMReward/std": 0.0806819349527359, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5199911594390869, "rewards/VisualPerceptionAccuracy/std": 0.1754814237356186, "step": 683, "train_speed(iter/s)": 0.00505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 123.921875, "completions/min_length": 46.0, "epoch": 0.023326399072400503, "frac_reward_zero_std": 0.0, "grad_norm": 1.5952470302581787, "kl": 0.1341085135936737, "learning_rate": 1.165644171779141e-06, "loss": -0.0058679524809122086, "memory(GiB)": 69.45, "reward": 0.7545016407966614, "reward_std": 0.08752109110355377, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9921875, "rewards/PlanningActionSetORM/std": 0.04419417306780815, "rewards/RMReward/mean": 0.856249988079071, "rewards/RMReward/std": 0.07156092673540115, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6255658864974976, "rewards/VisualPerceptionAccuracy/std": 0.11978192627429962, "step": 684, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 211.234375, "completions/min_length": 101.0, "epoch": 0.023360501995020974, "frac_reward_zero_std": 0.0, "grad_norm": 0.7788161635398865, "kl": 0.11719442903995514, "learning_rate": 1.167348329925017e-06, "loss": -0.0016699896659702063, "memory(GiB)": 69.45, "reward": 0.8125417828559875, "reward_std": 0.05919170379638672, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8449999690055847, "rewards/RMReward/std": 0.1322634369134903, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.6221672296524048, "rewards/VisualPerceptionAccuracy/std": 0.09874623268842697, "step": 685, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 237.234375, "completions/min_length": 80.0, "epoch": 0.02339460491764144, "frac_reward_zero_std": 0.0, "grad_norm": 0.827048659324646, "kl": 0.11809559166431427, "learning_rate": 1.169052488070893e-06, "loss": 0.007814411073923111, "memory(GiB)": 69.45, "reward": 0.8393903374671936, "reward_std": 0.1009911373257637, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9749375581741333, "rewards/PlanningActionSetORM/std": 0.030537918210029602, "rewards/RMReward/mean": 0.8075000047683716, "rewards/RMReward/std": 0.21249668300151825, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.8377931714057922, "rewards/VisualPerceptionAccuracy/std": 0.15572971105575562, "step": 686, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 179.453125, "completions/min_length": 116.0, "epoch": 0.023428707840261912, "frac_reward_zero_std": 0.0, "grad_norm": 1.0666964054107666, "kl": 0.09616216272115707, "learning_rate": 1.170756646216769e-06, "loss": -0.018742796033620834, "memory(GiB)": 69.45, "reward": 0.9459444284439087, "reward_std": 0.02886185422539711, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9722222089767456, "rewards/PlanningActionSetORM/std": 0.04849286377429962, "rewards/RMReward/mean": 0.9393750429153442, "rewards/RMReward/std": 0.08717570453882217, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 687, "train_speed(iter/s)": 0.00505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/mean_length": 164.859375, "completions/min_length": 103.0, "epoch": 0.02346281076288238, "frac_reward_zero_std": 0.0, "grad_norm": 0.8524720072746277, "kl": 0.09430783987045288, "learning_rate": 1.172460804362645e-06, "loss": 0.0011424394324421883, "memory(GiB)": 69.45, "reward": 0.9731249809265137, "reward_std": 0.041312478482723236, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9664062261581421, "rewards/RMReward/std": 0.07415914535522461, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 688, "train_speed(iter/s)": 0.005051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/mean_length": 93.0625, "completions/min_length": 8.0, "epoch": 0.023496913685502847, "frac_reward_zero_std": 0.0, "grad_norm": 12.488261222839355, "kl": 0.3335815668106079, "learning_rate": 1.174164962508521e-06, "loss": 0.001511240378022194, "memory(GiB)": 69.45, "reward": 0.8730380535125732, "reward_std": 0.12648817896842957, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9424999952316284, "rewards/RMReward/std": 0.030900469049811363, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": 0.6435272693634033, "rewards/VisualPerceptionAccuracy/std": 0.2246461659669876, "step": 689, "train_speed(iter/s)": 0.005053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/mean_length": 128.34375, "completions/min_length": 8.0, "epoch": 0.023531016608123317, "frac_reward_zero_std": 0.0, "grad_norm": 15.325297355651855, "kl": 0.47302526235580444, "learning_rate": 1.1758691206543967e-06, "loss": 0.009465496987104416, "memory(GiB)": 69.45, "reward": 0.8133145570755005, "reward_std": 0.1554785668849945, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.7593749761581421, "rewards/RMReward/std": 0.049053557217121124, "rewards/SpatialReasoningORM/mean": 0.875, "rewards/SpatialReasoningORM/std": 0.3415650427341461, "rewards/VisualPerceptionAccuracy/mean": 0.7822540998458862, "rewards/VisualPerceptionAccuracy/std": 0.1285267174243927, "step": 690, "train_speed(iter/s)": 0.005057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/mean_length": 167.25, "completions/min_length": 106.0, "epoch": 0.023565119530743785, "frac_reward_zero_std": 0.0, "grad_norm": 0.7782865166664124, "kl": 0.1445641964673996, "learning_rate": 1.1775732788002727e-06, "loss": 0.02077208086848259, "memory(GiB)": 69.45, "reward": 0.9204038381576538, "reward_std": 0.05924852564930916, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9432691931724548, "rewards/PlanningActionSetORM/std": 0.1160663589835167, "rewards/RMReward/mean": 0.9146875143051147, "rewards/RMReward/std": 0.09535179287195206, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 691, "train_speed(iter/s)": 0.005058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/mean_length": 139.625, "completions/min_length": 48.0, "epoch": 0.023599222453364252, "frac_reward_zero_std": 0.0, "grad_norm": 1.0865684747695923, "kl": 0.14957883954048157, "learning_rate": 1.1792774369461487e-06, "loss": -0.0038185808807611465, "memory(GiB)": 69.45, "reward": 0.7761685848236084, "reward_std": 0.10531902313232422, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.9778125286102295, "rewards/RMReward/std": 0.024981854483485222, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.5700871348381042, "rewards/VisualPerceptionAccuracy/std": 0.30394572019577026, "step": 692, "train_speed(iter/s)": 0.005059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 144.3125, "completions/min_length": 2.0, "epoch": 0.023633325375984723, "frac_reward_zero_std": 0.0, "grad_norm": 25.491262435913086, "kl": 0.06976071000099182, "learning_rate": 1.1809815950920247e-06, "loss": -0.007900556549429893, "memory(GiB)": 69.45, "reward": 0.6174232959747314, "reward_std": 0.24441412091255188, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9545454978942871, "rewards/PlanningActionSetORM/std": 0.046181850135326385, "rewards/RMReward/mean": 0.9318749904632568, "rewards/RMReward/std": 0.047276630997657776, "rewards/SpatialReasoningORM/mean": 0.3125, "rewards/SpatialReasoningORM/std": 0.4787135720252991, "rewards/VisualPerceptionAccuracy/mean": 0.25, "rewards/VisualPerceptionAccuracy/std": 0.44721361994743347, "step": 693, "train_speed(iter/s)": 0.005059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 164.21875, "completions/min_length": 80.0, "epoch": 0.02366742829860519, "frac_reward_zero_std": 0.0, "grad_norm": 0.7353054285049438, "kl": 0.09685954451560974, "learning_rate": 1.1826857532379006e-06, "loss": 0.009473904967308044, "memory(GiB)": 69.45, "reward": 0.9469444751739502, "reward_std": 0.04249691218137741, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9743589758872986, "rewards/PlanningActionSetORM/std": 0.036645617336034775, "rewards/RMReward/mean": 0.95291668176651, "rewards/RMReward/std": 0.07844115048646927, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": 0.9161624908447266, "rewards/VisualPerceptionAccuracy/std": 0.027537576854228973, "step": 694, "train_speed(iter/s)": 0.005062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/mean_length": 122.375, "completions/min_length": 102.0, "epoch": 0.02370153122122566, "frac_reward_zero_std": 0.0, "grad_norm": 0.40060874819755554, "kl": 0.139485165476799, "learning_rate": 1.1843899113837764e-06, "loss": -0.0010497132316231728, "memory(GiB)": 69.45, "reward": 0.9358749985694885, "reward_std": 0.04013659060001373, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.96875, "rewards/PlanningActionSetORM/std": 0.054554473608732224, "rewards/RMReward/mean": 0.9276562929153442, "rewards/RMReward/std": 0.07197314500808716, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 695, "train_speed(iter/s)": 0.005062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 156.390625, "completions/min_length": 72.0, "epoch": 0.023735634143846128, "frac_reward_zero_std": 0.0, "grad_norm": 1.0732706785202026, "kl": 0.11717651784420013, "learning_rate": 1.1860940695296524e-06, "loss": 0.00723198102787137, "memory(GiB)": 69.45, "reward": 0.8704999685287476, "reward_std": 0.06637756526470184, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 1.0, "rewards/PlanningActionSetORM/std": 0.0, "rewards/RMReward/mean": 0.8381249904632568, "rewards/RMReward/std": 0.09640975296497345, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 696, "train_speed(iter/s)": 0.005063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 195.21875, "completions/min_length": 117.0, "epoch": 0.023769737066466595, "frac_reward_zero_std": 0.0, "grad_norm": 0.5202093124389648, "kl": 0.10174838453531265, "learning_rate": 1.1877982276755284e-06, "loss": -0.0032127508893609047, "memory(GiB)": 69.45, "reward": 0.9164174795150757, "reward_std": 0.05446375161409378, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9370874166488647, "rewards/PlanningActionSetORM/std": 0.040588054805994034, "rewards/RMReward/mean": 0.9112499952316284, "rewards/RMReward/std": 0.1053113266825676, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 697, "train_speed(iter/s)": 0.005063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/mean_length": 170.078125, "completions/min_length": 15.0, "epoch": 0.023803839989087066, "frac_reward_zero_std": 0.0, "grad_norm": 1.789932131767273, "kl": 0.0985354632139206, "learning_rate": 1.1895023858214044e-06, "loss": -0.035264208912849426, "memory(GiB)": 69.45, "reward": 0.8440268039703369, "reward_std": 0.13231414556503296, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": 0.9841367602348328, "rewards/PlanningActionSetORM/std": 0.026308318600058556, "rewards/RMReward/mean": 0.768750011920929, "rewards/RMReward/std": 0.1420312374830246, "rewards/SpatialReasoningORM/mean": 0.9375, "rewards/SpatialReasoningORM/std": 0.25, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 698, "train_speed(iter/s)": 0.005063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/mean_length": 11.71875, "completions/min_length": 7.0, "epoch": 0.023837942911707533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039466856978833675, "kl": 0.6382347941398621, "learning_rate": 1.1912065439672803e-06, "loss": 0.0006370305782184005, "memory(GiB)": 69.45, "reward": 1.0, "reward_std": 0.0, "rewards/MathAnswerFormat/mean": 1.0, "rewards/MathAnswerFormat/std": 0.0, "rewards/PlanningActionSetORM/mean": NaN, "rewards/PlanningActionSetORM/std": NaN, "rewards/RMReward/mean": NaN, "rewards/RMReward/std": NaN, "rewards/SpatialReasoningORM/mean": 1.0, "rewards/SpatialReasoningORM/std": 0.0, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 699, "train_speed(iter/s)": 0.005066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/mean_length": 243.375, "completions/min_length": 91.0, "epoch": 0.023872045834328, "frac_reward_zero_std": 0.0, "grad_norm": 0.7319353222846985, "kl": 0.1041514128446579, "learning_rate": 1.1929107021131563e-06, "loss": 0.003512608353048563, "memory(GiB)": 69.45, "reward": 0.9016183614730835, "reward_std": 0.06009867042303085, "rewards/MathAnswerFormat/mean": NaN, "rewards/MathAnswerFormat/std": NaN, "rewards/PlanningActionSetORM/mean": 0.9868420958518982, "rewards/PlanningActionSetORM/std": 0.02297029457986355, "rewards/RMReward/mean": 0.8803125023841858, "rewards/RMReward/std": 0.0913560688495636, "rewards/SpatialReasoningORM/mean": NaN, "rewards/SpatialReasoningORM/std": NaN, "rewards/VisualPerceptionAccuracy/mean": NaN, "rewards/VisualPerceptionAccuracy/std": NaN, "step": 700, "train_speed(iter/s)": 0.005065 } ], "logging_steps": 1, "max_steps": 29323, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }